pdf processing improvement

2025-12-06 06:26:47 +00:00 · 2024-06-07 15:53:08 +00:00
--- a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py
+++ b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py
@@ -159,10 +159,10 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
            file_name = '在线预览翻译（原文）' + gen_time_str() + '.html'
            preview_fp = os.path.join(ex_folder, file_name)
            from shared_utils.advanced_markdown_format import markdown_convertion_for_file
-            with open(generated_fp, "r", encoding="utf-8") as f:
+            # with open(generated_fp, "r", encoding="utf-8") as f:
-                md = f.read()
+            #     md = f.read()
-                # Markdown中使用不标准的表格，需要在表格前加上一个emoji，以便公式渲染
+            #     # Markdown中使用不标准的表格，需要在表格前加上一个emoji，以便公式渲染
-                md = re.sub(r'^<table>', r'😃<table>', md, flags=re.MULTILINE)
+            #     md = re.sub(r'^<table>', r'.<table>', md, flags=re.MULTILINE)
            html = markdown_convertion_for_file(md)
            with open(preview_fp, "w", encoding="utf-8") as f: f.write(html)
            chatbot.append([None, f"生成在线预览：{generate_file_link([preview_fp])}"])
@@ -182,7 +182,7 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha
            with open(generated_fp, 'r', encoding='utf8') as f: content = f.read()
            content = content.replace('```markdown', '\n').replace('```', '\n')
            # Markdown中使用不标准的表格，需要在表格前加上一个emoji，以便公式渲染
-            content = re.sub(r'^<table>', r'😃<table>', content, flags=re.MULTILINE)
+            # content = re.sub(r'^<table>', r'.<table>', content, flags=re.MULTILINE)
            with open(generated_fp, 'w', encoding='utf8') as f: f.write(content)
            # 生成在线预览html
            file_name = '在线预览翻译' + gen_time_str() + '.html'
--- a/shared_utils/advanced_markdown_format.py
+++ b/shared_utils/advanced_markdown_format.py
@@ -46,6 +46,16 @@ code_highlight_configs_block_mermaid = {
    },
 }
 # Regex patterns used to locate math formulas in text, shared at module level.
 # Every active pattern has three capture groups: opening delimiter, formula
 # body, closing delimiter. The (?<!\\) look-behind skips delimiters that are
 # escaped with a backslash. The per-pattern "allow_multi_lines" flag is read
 # by consumers: is_equation adds re.DOTALL for patterns where it is True, and
 # markdown_convertion_for_file only uses the patterns where it is False.
 mathpatterns = {
    r"(?<!\\|\$)(\$)([^\$]+)(\$)": {"allow_multi_lines": False},  #  $...$
    r"(?<!\\)(\$\$)([^\$]+)(\$\$)": {"allow_multi_lines": True},  # $$...$$
    r"(?<!\\)(\\\[)(.+?)(\\\])": {"allow_multi_lines": False},  # \[...\]
    r'(?<!\\)(\\\()(.+?)(\\\))': {'allow_multi_lines': False},                       # \(...\)
    # r'(?<!\\)(\\begin{([a-z]+?\*?)})(.+?)(\\end{\2})': {'allow_multi_lines': True},  # \begin...\end
    # r'(?<!\\)(\$`)([^`]+)(`\$)': {'allow_multi_lines': False},                       # $`...`$
 }
 def tex2mathml_catch_exception(content, *args, **kwargs):
    try:
        content = tex2mathml(content, *args, **kwargs)
@@ -96,14 +106,7 @@ def is_equation(txt):
        return False
    if "$" not in txt and "\\[" not in txt:
        return False
-    mathpatterns = {
+
        r"(?<!\\|\$)(\$)([^\$]+)(\$)": {"allow_multi_lines": False},  #  $...$
        r"(?<!\\)(\$\$)([^\$]+)(\$\$)": {"allow_multi_lines": True},  # $$...$$
        r"(?<!\\)(\\\[)(.+?)(\\\])": {"allow_multi_lines": False},  # \[...\]
        # r'(?<!\\)(\\\()(.+?)(\\\))': {'allow_multi_lines': False},                       # \(...\)
        # r'(?<!\\)(\\begin{([a-z]+?\*?)})(.+?)(\\end{\2})': {'allow_multi_lines': True},  # \begin...\end
        # r'(?<!\\)(\$`)([^`]+)(`\$)': {'allow_multi_lines': False},                       # $`...`$
    }
    matches = []
    for pattern, property in mathpatterns.items():
        flags = re.ASCII | re.DOTALL if property["allow_multi_lines"] else re.ASCII
@@ -207,6 +210,45 @@ def fix_code_segment_indent(txt):
        return txt
 def fix_dollar_sticking_bug(txt):
    """
    Repair non-standard dollar delimiters where two inline formulas touch.

    The text is scanned left to right while tracking whether an inline
    formula (single dollar) or a display formula (double dollar) is
    currently open. When a double dollar is met while an inline formula is
    open, it is read as "close this formula, then open the next one", and a
    space is inserted between the two dollars, so ``$a$$b$`` becomes
    ``$a$ $b$``. All other characters are copied through unchanged.

    A dollar sign that is the very last character of ``txt`` is treated as
    a single dollar; looking one character past it used to raise
    IndexError.

    Args:
        txt: The text to scan.

    Returns:
        The text with a space inserted between sticking dollar signs.
    """
    pieces = []
    single_open = False
    double_open = False
    pos = 0
    while True:
        index = txt.find('$', pos)
        if index == -1:
            # no dollar left: copy the tail and finish
            pieces.append(txt[pos:])
            return "".join(pieces)
        # startswith is safe at the end of the string, unlike txt[index + 1]
        is_double = txt.startswith('$$', index)
        if is_double and not single_open:
            # a genuine "$$": toggle the display-formula state
            double_open = not double_open
            pieces.append(txt[pos:index + 2])
            pos = index + 2
            continue
        # A single "$", or the first half of a "$$" that is really the end
        # of one inline formula followed by the start of another.
        if double_open:
            # a single dollar inside an open display formula: the input is
            # unbalanced; this is reported but processing continues
            print('Fatal')
        single_open = not single_open
        pieces.append(txt[pos:index + 1])
        pos = index + 1
        if is_double:
            # add a padding between the two sticking dollars
            pieces.append(" ")
 def markdown_convertion_for_file(txt):
    """
    将Markdown格式的文本转换为HTML格式。如果包含数学公式，则先将公式转换为HTML格式。
@@ -233,7 +275,6 @@ def markdown_convertion_for_file(txt):
    find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'
    txt = fix_markdown_indent(txt)
    # convert everything to html format
    split = markdown.markdown(text="---")
    convert_stage_1 = markdown.markdown(
        text=txt,
        extensions=[
@@ -245,14 +286,25 @@ def markdown_convertion_for_file(txt):
        ],
        extension_configs={**markdown_extension_configs, **code_highlight_configs},
    )
-    convert_stage_1 = markdown_bug_hunt(convert_stage_1)
+
    convert_stage_1 = fix_dollar_sticking_bug(convert_stage_1)
    def repl_fn(match):
        content = match.group(2)
        return f'<script type="math/tex">{content}</script>'
    pattern = "|".join([pattern for pattern, property in mathpatterns.items() if not property["allow_multi_lines"]])
    pattern = re.compile(pattern, flags=re.ASCII)
    convert_stage_2 = pattern.sub(repl_fn, convert_stage_1)
    convert_stage_4 = markdown_bug_hunt(convert_stage_2)
    # 2. convert to rendered equation
-    convert_stage_2_2, n = re.subn(
+    convert_stage_5, n = re.subn(
-        find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL
+        find_equation_pattern, replace_math_render, convert_stage_4, flags=re.DOTALL
    )
    # cat them together
-    return pre + convert_stage_2_2 + suf
+    return pre + convert_stage_5 + suf
@lru_cache(maxsize=128)  # 使用 lru缓存 加快转换速度
 def markdown_convertion(txt):