这个提交包含在:
binary-husky
2024-06-07 16:50:27 +00:00
父节点 85dbe4a4bf
当前提交 612caa2f5f
共有 2 个文件被更改,包括 35 次插入和 13 次删除

查看文件

@@ -104,6 +104,8 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
z_decoded = z_decoded[len("data: "):]
decoded_json = json.loads(z_decoded)
res_json.append(decoded_json)
if 'limit exceeded' in decoded_json.get('status', ''):
raise RuntimeError("Doc2x API 页数受限,请联系 Doc2x 方面,并更换新的 API 秘钥。")
else:
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
uuid = res_json[0]['uuid']
@@ -159,8 +161,8 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
file_name = '在线预览翻译(原文)' + gen_time_str() + '.html'
preview_fp = os.path.join(ex_folder, file_name)
from shared_utils.advanced_markdown_format import markdown_convertion_for_file
# with open(generated_fp, "r", encoding="utf-8") as f:
# md = f.read()
with open(generated_fp, "r", encoding="utf-8") as f:
md = f.read()
# # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染
# md = re.sub(r'^<table>', r'.<table>', md, flags=re.MULTILINE)
html = markdown_convertion_for_file(md)

查看文件

@@ -218,13 +218,29 @@ def fix_dollar_sticking_bug(txt):
single_stack_height = 0
double_stack_height = 0
while True:
index = txt.find('$')
if index == -1:
txt_result += txt
return txt_result
# still has $
# how many dollar
while True:
# still has $
# how many dollar
index = txt.find('$')
if index == -1:
txt_result += txt
return txt_result
if single_stack_height > 0:
if txt[:(index+1)].find('\n') > 0 or txt[:(index+1)].find('<td>') > 0 or txt[:(index+1)].find('</td>') > 0:
print('公式之中出现了异常换行 unexpect new line in equation')
single_stack_height = 0
txt_result += ' $'
continue
if double_stack_height > 0:
if txt[:(index+1)].find('\n\n') > 0:
print('公式之中出现了异常换行 unexpect new line in equation')
double_stack_height = 0
txt_result += '$$'
continue
is_double = (txt[index+1] == '$')
if is_double:
if single_stack_height != 0:
@@ -239,11 +255,15 @@ def fix_dollar_sticking_bug(txt):
txt = txt[(index+2):]
else:
if double_stack_height != 0:
print(txt[:(index)])
print('Fatal')
if single_stack_height == 0:
single_stack_height = 1
else:
single_stack_height = 0
print(txt[:(index)])
import time
# time.sleep(0.1)
txt_result += txt[:(index+1)]
txt = txt[(index+1):]
break
@@ -274,9 +294,10 @@ def markdown_convertion_for_file(txt):
find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'
txt = fix_markdown_indent(txt)
convert_stage_1 = fix_dollar_sticking_bug(txt)
# convert everything to html format
convert_stage_1 = markdown.markdown(
text=txt,
convert_stage_2 = markdown.markdown(
text=convert_stage_1,
extensions=[
"sane_lists",
"tables",
@@ -288,16 +309,15 @@ def markdown_convertion_for_file(txt):
)
convert_stage_1 = fix_dollar_sticking_bug(convert_stage_1)
def repl_fn(match):
content = match.group(2)
return f'<script type="math/tex">{content}</script>'
pattern = "|".join([pattern for pattern, property in mathpatterns.items() if not property["allow_multi_lines"]])
pattern = re.compile(pattern, flags=re.ASCII)
convert_stage_2 = pattern.sub(repl_fn, convert_stage_1)
convert_stage_3 = pattern.sub(repl_fn, convert_stage_2)
convert_stage_4 = markdown_bug_hunt(convert_stage_2)
convert_stage_4 = markdown_bug_hunt(convert_stage_3)
# 2. convert to rendered equation
convert_stage_5, n = re.subn(