镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-06 06:26:47 +00:00
pdf processing improvement
这个提交包含在:
@@ -159,10 +159,10 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
|
|||||||
file_name = '在线预览翻译(原文)' + gen_time_str() + '.html'
|
file_name = '在线预览翻译(原文)' + gen_time_str() + '.html'
|
||||||
preview_fp = os.path.join(ex_folder, file_name)
|
preview_fp = os.path.join(ex_folder, file_name)
|
||||||
from shared_utils.advanced_markdown_format import markdown_convertion_for_file
|
from shared_utils.advanced_markdown_format import markdown_convertion_for_file
|
||||||
with open(generated_fp, "r", encoding="utf-8") as f:
|
# with open(generated_fp, "r", encoding="utf-8") as f:
|
||||||
md = f.read()
|
# md = f.read()
|
||||||
# Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染
|
# # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染
|
||||||
md = re.sub(r'^<table>', r'😃<table>', md, flags=re.MULTILINE)
|
# md = re.sub(r'^<table>', r'.<table>', md, flags=re.MULTILINE)
|
||||||
html = markdown_convertion_for_file(md)
|
html = markdown_convertion_for_file(md)
|
||||||
with open(preview_fp, "w", encoding="utf-8") as f: f.write(html)
|
with open(preview_fp, "w", encoding="utf-8") as f: f.write(html)
|
||||||
chatbot.append([None, f"生成在线预览:{generate_file_link([preview_fp])}"])
|
chatbot.append([None, f"生成在线预览:{generate_file_link([preview_fp])}"])
|
||||||
@@ -182,7 +182,7 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
|
|||||||
with open(generated_fp, 'r', encoding='utf8') as f: content = f.read()
|
with open(generated_fp, 'r', encoding='utf8') as f: content = f.read()
|
||||||
content = content.replace('```markdown', '\n').replace('```', '\n')
|
content = content.replace('```markdown', '\n').replace('```', '\n')
|
||||||
# Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染
|
# Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染
|
||||||
content = re.sub(r'^<table>', r'😃<table>', content, flags=re.MULTILINE)
|
# content = re.sub(r'^<table>', r'.<table>', content, flags=re.MULTILINE)
|
||||||
with open(generated_fp, 'w', encoding='utf8') as f: f.write(content)
|
with open(generated_fp, 'w', encoding='utf8') as f: f.write(content)
|
||||||
# 生成在线预览html
|
# 生成在线预览html
|
||||||
file_name = '在线预览翻译' + gen_time_str() + '.html'
|
file_name = '在线预览翻译' + gen_time_str() + '.html'
|
||||||
|
|||||||
@@ -46,6 +46,16 @@ code_highlight_configs_block_mermaid = {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Regex patterns that recognise math delimiters in text, mapped to per-pattern options.
# Each active pattern has three capture groups: opening delimiter, formula body,
# closing delimiter.
# "allow_multi_lines" is read by the code that applies these patterns: when True the
# pattern is searched with re.DOTALL in addition to re.ASCII; the file-conversion path
# OR-joins only the patterns where it is False.
# NOTE(review): once several patterns are OR-joined into one regex, group numbers are
# counted across all alternatives, so group(2) is the formula body only for the first
# pattern -- confirm that consumers of the joined regex account for this.
mathpatterns = {
    r"(?<!\\|\$)(\$)([^\$]+)(\$)": {"allow_multi_lines": False}, # $...$
    r"(?<!\\)(\$\$)([^\$]+)(\$\$)": {"allow_multi_lines": True}, # $$...$$
    r"(?<!\\)(\\\[)(.+?)(\\\])": {"allow_multi_lines": False}, # \[...\]
    r'(?<!\\)(\\\()(.+?)(\\\))': {'allow_multi_lines': False}, # \(...\)
    # r'(?<!\\)(\\begin{([a-z]+?\*?)})(.+?)(\\end{\2})': {'allow_multi_lines': True}, # \begin...\end
    # r'(?<!\\)(\$`)([^`]+)(`\$)': {'allow_multi_lines': False}, # $`...`$
}
|
||||||
|
|
||||||
def tex2mathml_catch_exception(content, *args, **kwargs):
|
def tex2mathml_catch_exception(content, *args, **kwargs):
|
||||||
try:
|
try:
|
||||||
content = tex2mathml(content, *args, **kwargs)
|
content = tex2mathml(content, *args, **kwargs)
|
||||||
@@ -96,14 +106,7 @@ def is_equation(txt):
|
|||||||
return False
|
return False
|
||||||
if "$" not in txt and "\\[" not in txt:
|
if "$" not in txt and "\\[" not in txt:
|
||||||
return False
|
return False
|
||||||
mathpatterns = {
|
|
||||||
r"(?<!\\|\$)(\$)([^\$]+)(\$)": {"allow_multi_lines": False}, # $...$
|
|
||||||
r"(?<!\\)(\$\$)([^\$]+)(\$\$)": {"allow_multi_lines": True}, # $$...$$
|
|
||||||
r"(?<!\\)(\\\[)(.+?)(\\\])": {"allow_multi_lines": False}, # \[...\]
|
|
||||||
# r'(?<!\\)(\\\()(.+?)(\\\))': {'allow_multi_lines': False}, # \(...\)
|
|
||||||
# r'(?<!\\)(\\begin{([a-z]+?\*?)})(.+?)(\\end{\2})': {'allow_multi_lines': True}, # \begin...\end
|
|
||||||
# r'(?<!\\)(\$`)([^`]+)(`\$)': {'allow_multi_lines': False}, # $`...`$
|
|
||||||
}
|
|
||||||
matches = []
|
matches = []
|
||||||
for pattern, property in mathpatterns.items():
|
for pattern, property in mathpatterns.items():
|
||||||
flags = re.ASCII | re.DOTALL if property["allow_multi_lines"] else re.ASCII
|
flags = re.ASCII | re.DOTALL if property["allow_multi_lines"] else re.ASCII
|
||||||
@@ -207,6 +210,45 @@ def fix_code_segment_indent(txt):
|
|||||||
return txt
|
return txt
|
||||||
|
|
||||||
|
|
||||||
|
def fix_dollar_sticking_bug(txt):
    """
    Repair non-standard "sticking" dollar delimiters in formula markup.

    The text is scanned left to right while tracking whether an inline
    ``$...$`` formula or a display ``$$...$$`` formula is currently open.
    When a ``$$`` is met while an inline formula is open (for example
    ``$a$$b$``), the first ``$`` is taken as the closing delimiter of that
    inline formula and a single space is inserted after it, so the result
    becomes ``$a$ $b$``.

    Args:
        txt: The text (str) to scan. It may contain no dollar signs at all.

    Returns:
        The text with a space inserted between sticking inline formulas;
        every other character is left unchanged.
    """
    pieces = []  # collected output fragments, joined once at the end
    single_open = False  # inside an inline $...$ formula
    double_open = False  # inside a display $$...$$ formula
    while True:
        index = txt.find('$')
        if index == -1:
            # no dollar left: the remainder is copied through unchanged
            pieces.append(txt)
            return "".join(pieces)

        # Slice instead of indexing: a "$" at the very end of the text yields ""
        # here rather than raising IndexError.
        is_double = txt[index + 1:index + 2] == '$'

        if is_double and single_open:
            # "$$" while an inline formula is open: the first "$" closes that
            # formula, so pad with a space to detach it from the next one.
            txt = txt[:index + 1] + " " + txt[index + 1:]
            is_double = False

        if is_double:
            double_open = not double_open
            width = 2
        else:
            if double_open:
                # a lone "$" inside a display formula; diagnostic kept as-is
                print('Fatal')
            single_open = not single_open
            width = 1

        # emit everything up to and including the delimiter, then keep scanning
        pieces.append(txt[:index + width])
        txt = txt[index + width:]
|
||||||
|
|
||||||
|
|
||||||
def markdown_convertion_for_file(txt):
|
def markdown_convertion_for_file(txt):
|
||||||
"""
|
"""
|
||||||
将Markdown格式的文本转换为HTML格式。如果包含数学公式,则先将公式转换为HTML格式。
|
将Markdown格式的文本转换为HTML格式。如果包含数学公式,则先将公式转换为HTML格式。
|
||||||
@@ -233,7 +275,6 @@ def markdown_convertion_for_file(txt):
|
|||||||
find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'
|
find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'
|
||||||
txt = fix_markdown_indent(txt)
|
txt = fix_markdown_indent(txt)
|
||||||
# convert everything to html format
|
# convert everything to html format
|
||||||
split = markdown.markdown(text="---")
|
|
||||||
convert_stage_1 = markdown.markdown(
|
convert_stage_1 = markdown.markdown(
|
||||||
text=txt,
|
text=txt,
|
||||||
extensions=[
|
extensions=[
|
||||||
@@ -245,14 +286,25 @@ def markdown_convertion_for_file(txt):
|
|||||||
],
|
],
|
||||||
extension_configs={**markdown_extension_configs, **code_highlight_configs},
|
extension_configs={**markdown_extension_configs, **code_highlight_configs},
|
||||||
)
|
)
|
||||||
convert_stage_1 = markdown_bug_hunt(convert_stage_1)
|
|
||||||
|
|
||||||
|
convert_stage_1 = fix_dollar_sticking_bug(convert_stage_1)
|
||||||
|
def repl_fn(match):
|
||||||
|
content = match.group(2)
|
||||||
|
return f'<script type="math/tex">{content}</script>'
|
||||||
|
|
||||||
|
pattern = "|".join([pattern for pattern, property in mathpatterns.items() if not property["allow_multi_lines"]])
|
||||||
|
pattern = re.compile(pattern, flags=re.ASCII)
|
||||||
|
convert_stage_2 = pattern.sub(repl_fn, convert_stage_1)
|
||||||
|
|
||||||
|
convert_stage_4 = markdown_bug_hunt(convert_stage_2)
|
||||||
|
|
||||||
# 2. convert to rendered equation
|
# 2. convert to rendered equation
|
||||||
convert_stage_2_2, n = re.subn(
|
convert_stage_5, n = re.subn(
|
||||||
find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL
|
find_equation_pattern, replace_math_render, convert_stage_4, flags=re.DOTALL
|
||||||
)
|
)
|
||||||
# cat them together
|
# cat them together
|
||||||
return pre + convert_stage_2_2 + suf
|
return pre + convert_stage_5 + suf
|
||||||
|
|
||||||
@lru_cache(maxsize=128) # 使用 lru缓存 加快转换速度
|
@lru_cache(maxsize=128) # 使用 lru缓存 加快转换速度
|
||||||
def markdown_convertion(txt):
|
def markdown_convertion(txt):
|
||||||
|
|||||||
在新工单中引用
屏蔽一个用户