镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-06 14:36:48 +00:00
revise
这个提交包含在:
@@ -104,6 +104,8 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
|
|||||||
z_decoded = z_decoded[len("data: "):]
|
z_decoded = z_decoded[len("data: "):]
|
||||||
decoded_json = json.loads(z_decoded)
|
decoded_json = json.loads(z_decoded)
|
||||||
res_json.append(decoded_json)
|
res_json.append(decoded_json)
|
||||||
|
if 'limit exceeded' in decoded_json.get('status', ''):
|
||||||
|
raise RuntimeError("Doc2x API 页数受限,请联系 Doc2x 方面,并更换新的 API 秘钥。")
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
|
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
|
||||||
uuid = res_json[0]['uuid']
|
uuid = res_json[0]['uuid']
|
||||||
@@ -159,8 +161,8 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
|
|||||||
file_name = '在线预览翻译(原文)' + gen_time_str() + '.html'
|
file_name = '在线预览翻译(原文)' + gen_time_str() + '.html'
|
||||||
preview_fp = os.path.join(ex_folder, file_name)
|
preview_fp = os.path.join(ex_folder, file_name)
|
||||||
from shared_utils.advanced_markdown_format import markdown_convertion_for_file
|
from shared_utils.advanced_markdown_format import markdown_convertion_for_file
|
||||||
# with open(generated_fp, "r", encoding="utf-8") as f:
|
with open(generated_fp, "r", encoding="utf-8") as f:
|
||||||
# md = f.read()
|
md = f.read()
|
||||||
# # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染
|
# # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染
|
||||||
# md = re.sub(r'^<table>', r'.<table>', md, flags=re.MULTILINE)
|
# md = re.sub(r'^<table>', r'.<table>', md, flags=re.MULTILINE)
|
||||||
html = markdown_convertion_for_file(md)
|
html = markdown_convertion_for_file(md)
|
||||||
|
|||||||
@@ -218,13 +218,29 @@ def fix_dollar_sticking_bug(txt):
|
|||||||
single_stack_height = 0
|
single_stack_height = 0
|
||||||
double_stack_height = 0
|
double_stack_height = 0
|
||||||
while True:
|
while True:
|
||||||
index = txt.find('$')
|
|
||||||
if index == -1:
|
|
||||||
txt_result += txt
|
|
||||||
return txt_result
|
|
||||||
# still has $
|
|
||||||
# how many dollar
|
|
||||||
while True:
|
while True:
|
||||||
|
# still has $
|
||||||
|
# how many dollar
|
||||||
|
index = txt.find('$')
|
||||||
|
|
||||||
|
if index == -1:
|
||||||
|
txt_result += txt
|
||||||
|
return txt_result
|
||||||
|
|
||||||
|
if single_stack_height > 0:
|
||||||
|
if txt[:(index+1)].find('\n') > 0 or txt[:(index+1)].find('<td>') > 0 or txt[:(index+1)].find('</td>') > 0:
|
||||||
|
print('公式之中出现了异常换行 unexpect new line in equation')
|
||||||
|
single_stack_height = 0
|
||||||
|
txt_result += ' $'
|
||||||
|
continue
|
||||||
|
|
||||||
|
if double_stack_height > 0:
|
||||||
|
if txt[:(index+1)].find('\n\n') > 0:
|
||||||
|
print('公式之中出现了异常换行 unexpect new line in equation')
|
||||||
|
double_stack_height = 0
|
||||||
|
txt_result += '$$'
|
||||||
|
continue
|
||||||
|
|
||||||
is_double = (txt[index+1] == '$')
|
is_double = (txt[index+1] == '$')
|
||||||
if is_double:
|
if is_double:
|
||||||
if single_stack_height != 0:
|
if single_stack_height != 0:
|
||||||
@@ -239,11 +255,15 @@ def fix_dollar_sticking_bug(txt):
|
|||||||
txt = txt[(index+2):]
|
txt = txt[(index+2):]
|
||||||
else:
|
else:
|
||||||
if double_stack_height != 0:
|
if double_stack_height != 0:
|
||||||
|
print(txt[:(index)])
|
||||||
print('Fatal')
|
print('Fatal')
|
||||||
if single_stack_height == 0:
|
if single_stack_height == 0:
|
||||||
single_stack_height = 1
|
single_stack_height = 1
|
||||||
else:
|
else:
|
||||||
single_stack_height = 0
|
single_stack_height = 0
|
||||||
|
print(txt[:(index)])
|
||||||
|
import time
|
||||||
|
# time.sleep(0.1)
|
||||||
txt_result += txt[:(index+1)]
|
txt_result += txt[:(index+1)]
|
||||||
txt = txt[(index+1):]
|
txt = txt[(index+1):]
|
||||||
break
|
break
|
||||||
@@ -274,9 +294,10 @@ def markdown_convertion_for_file(txt):
|
|||||||
|
|
||||||
find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'
|
find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'
|
||||||
txt = fix_markdown_indent(txt)
|
txt = fix_markdown_indent(txt)
|
||||||
|
convert_stage_1 = fix_dollar_sticking_bug(txt)
|
||||||
# convert everything to html format
|
# convert everything to html format
|
||||||
convert_stage_1 = markdown.markdown(
|
convert_stage_2 = markdown.markdown(
|
||||||
text=txt,
|
text=convert_stage_1,
|
||||||
extensions=[
|
extensions=[
|
||||||
"sane_lists",
|
"sane_lists",
|
||||||
"tables",
|
"tables",
|
||||||
@@ -288,16 +309,15 @@ def markdown_convertion_for_file(txt):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
convert_stage_1 = fix_dollar_sticking_bug(convert_stage_1)
|
|
||||||
def repl_fn(match):
|
def repl_fn(match):
|
||||||
content = match.group(2)
|
content = match.group(2)
|
||||||
return f'<script type="math/tex">{content}</script>'
|
return f'<script type="math/tex">{content}</script>'
|
||||||
|
|
||||||
pattern = "|".join([pattern for pattern, property in mathpatterns.items() if not property["allow_multi_lines"]])
|
pattern = "|".join([pattern for pattern, property in mathpatterns.items() if not property["allow_multi_lines"]])
|
||||||
pattern = re.compile(pattern, flags=re.ASCII)
|
pattern = re.compile(pattern, flags=re.ASCII)
|
||||||
convert_stage_2 = pattern.sub(repl_fn, convert_stage_1)
|
convert_stage_3 = pattern.sub(repl_fn, convert_stage_2)
|
||||||
|
|
||||||
convert_stage_4 = markdown_bug_hunt(convert_stage_2)
|
convert_stage_4 = markdown_bug_hunt(convert_stage_3)
|
||||||
|
|
||||||
# 2. convert to rendered equation
|
# 2. convert to rendered equation
|
||||||
convert_stage_5, n = re.subn(
|
convert_stage_5, n = re.subn(
|
||||||
|
|||||||
在新工单中引用
屏蔽一个用户