镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-08 23:46:48 +00:00
follow master
这个提交包含在:
@@ -1,5 +1,7 @@
|
||||
from toolbox import update_ui, get_conf, trimmed_format_exc
|
||||
from toolbox import update_ui, get_conf, trimmed_format_exc, get_log_folder
|
||||
import threading
|
||||
import os
|
||||
import logging
|
||||
|
||||
def input_clipping(inputs, history, max_token_limit):
|
||||
import numpy as np
|
||||
@@ -469,14 +471,16 @@ def read_and_clean_pdf_text(fp):
|
||||
'- ', '') for t in text_areas['blocks'] if 'lines' in t]
|
||||
|
||||
############################## <第 2 步,获取正文主字体> ##################################
|
||||
fsize_statiscs = {}
|
||||
for span in meta_span:
|
||||
if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
|
||||
fsize_statiscs[span[1]] += span[2]
|
||||
main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
|
||||
if REMOVE_FOOT_NOTE:
|
||||
give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
|
||||
|
||||
try:
|
||||
fsize_statiscs = {}
|
||||
for span in meta_span:
|
||||
if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
|
||||
fsize_statiscs[span[1]] += span[2]
|
||||
main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
|
||||
if REMOVE_FOOT_NOTE:
|
||||
give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
|
||||
except:
|
||||
raise RuntimeError(f'抱歉, 我们暂时无法解析此PDF文档: {fp}。')
|
||||
############################## <第 3 步,切分和重新整合> ##################################
|
||||
mega_sec = []
|
||||
sec = []
|
||||
@@ -703,49 +707,96 @@ class knowledge_archive_interface():
|
||||
)
|
||||
self.threadLock.release()
|
||||
return resp, prompt
|
||||
|
||||
@Singleton
|
||||
class nougat_interface():
|
||||
def __init__(self):
|
||||
self.threadLock = threading.Lock()
|
||||
|
||||
def try_install_deps(deps):
|
||||
def nougat_with_timeout(self, command, cwd, timeout=3600):
|
||||
import subprocess
|
||||
logging.info(f'正在执行命令 {command}')
|
||||
process = subprocess.Popen(command, shell=True, cwd=cwd)
|
||||
try:
|
||||
stdout, stderr = process.communicate(timeout=timeout)
|
||||
except subprocess.TimeoutExpired:
|
||||
process.kill()
|
||||
stdout, stderr = process.communicate()
|
||||
print("Process timed out!")
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def NOUGAT_parse_pdf(self, fp, chatbot, history):
|
||||
from toolbox import update_ui_lastest_msg
|
||||
|
||||
yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在排队, 等待线程锁...",
|
||||
chatbot=chatbot, history=history, delay=0)
|
||||
self.threadLock.acquire()
|
||||
import glob, threading, os
|
||||
from toolbox import get_log_folder, gen_time_str
|
||||
dst = os.path.join(get_log_folder(plugin_name='nougat'), gen_time_str())
|
||||
os.makedirs(dst)
|
||||
|
||||
yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在加载NOUGAT... (提示:首次运行需要花费较长时间下载NOUGAT参数)",
|
||||
chatbot=chatbot, history=history, delay=0)
|
||||
self.nougat_with_timeout(f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}"', os.getcwd(), timeout=3600)
|
||||
res = glob.glob(os.path.join(dst,'*.mmd'))
|
||||
if len(res) == 0:
|
||||
self.threadLock.release()
|
||||
raise RuntimeError("Nougat解析论文失败。")
|
||||
self.threadLock.release()
|
||||
return res[0]
|
||||
|
||||
|
||||
|
||||
|
||||
def try_install_deps(deps, reload_m=[]):
|
||||
import subprocess, sys, importlib
|
||||
for dep in deps:
|
||||
import subprocess, sys
|
||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--user', dep])
|
||||
import site
|
||||
importlib.reload(site)
|
||||
for m in reload_m:
|
||||
importlib.reload(__import__(m))
|
||||
|
||||
|
||||
class construct_html():
|
||||
def __init__(self) -> None:
|
||||
self.css = """
|
||||
HTML_CSS = """
|
||||
.row {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.column {
|
||||
flex: 1;
|
||||
padding: 10px;
|
||||
}
|
||||
|
||||
.table-header {
|
||||
font-weight: bold;
|
||||
border-bottom: 1px solid black;
|
||||
}
|
||||
|
||||
.table-row {
|
||||
border-bottom: 1px solid lightgray;
|
||||
}
|
||||
|
||||
.table-cell {
|
||||
padding: 5px;
|
||||
}
|
||||
"""
|
||||
self.html_string = f'<!DOCTYPE html><head><meta charset="utf-8"><title>翻译结果</title><style>{self.css}</style></head>'
|
||||
"""
|
||||
|
||||
|
||||
def add_row(self, a, b):
|
||||
tmp = """
|
||||
TABLE_CSS = """
|
||||
<div class="row table-row">
|
||||
<div class="column table-cell">REPLACE_A</div>
|
||||
<div class="column table-cell">REPLACE_B</div>
|
||||
</div>
|
||||
"""
|
||||
"""
|
||||
|
||||
class construct_html():
|
||||
def __init__(self) -> None:
|
||||
self.css = HTML_CSS
|
||||
self.html_string = f'<!DOCTYPE html><head><meta charset="utf-8"><title>翻译结果</title><style>{self.css}</style></head>'
|
||||
|
||||
|
||||
def add_row(self, a, b):
|
||||
tmp = TABLE_CSS
|
||||
from toolbox import markdown_convertion
|
||||
tmp = tmp.replace('REPLACE_A', markdown_convertion(a))
|
||||
tmp = tmp.replace('REPLACE_B', markdown_convertion(b))
|
||||
@@ -753,6 +804,6 @@ class construct_html():
|
||||
|
||||
|
||||
def save_file(self, file_name):
|
||||
with open(f'./gpt_log/{file_name}', 'w', encoding='utf8') as f:
|
||||
with open(os.path.join(get_log_folder(), file_name), 'w', encoding='utf8') as f:
|
||||
f.write(self.html_string.encode('utf-8', 'ignore').decode())
|
||||
|
||||
return os.path.join(get_log_folder(), file_name)
|
||||
|
||||
在新工单中引用
屏蔽一个用户