doc2x latex conversion

2025-12-06 06:26:47 +00:00 · 2024-05-21 12:24:50 +00:00
--- a/crazy_functional.py
+++ b/crazy_functional.py
@@ -41,6 +41,7 @@ def get_crazy_functions():
    from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF
    from crazy_functions.Latex_Function import PDF翻译中文并重新编译PDF
    from crazy_functions.Latex_Function_Wrap import Arxiv_Localize
+    from crazy_functions.Latex_Function_Wrap import PDF_Localize


    function_plugins = {
@@ -333,7 +334,9 @@ def get_crazy_functions():
                            r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: "
                            r'If the term "agent" is used in this section, it should be translated to "智能体". ',
            "Info": "PDF翻译中文，并重新编译PDF | 输入参数为路径",
-            "Function": HotReload(PDF翻译中文并重新编译PDF)
+            "Function": None,
+            "Class": PDF_Localize
+
        }
    }

--- a/crazy_functions/Latex_Function.py
+++ b/crazy_functions/Latex_Function.py
@@ -158,65 +158,72 @@ def arxiv_download(chatbot, history, txt, allow_cache=True):
    return extract_dst, arxiv_id


-def pdf2tex_project(pdf_file_path):
-    # Mathpix API credentials
-    app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY')
-    headers = {"app_id": app_id, "app_key": app_key}
+def pdf2tex_project(pdf_file_path, plugin_kwargs):
+    """Convert a PDF into an unpacked LaTeX project and return its directory.
+
+    Mathpix is used when plugin_kwargs["method"] == "MATHPIX", DOC2X for any other or missing value; None means Mathpix failed.
+    """
+    if plugin_kwargs.get("method", "") == "MATHPIX":
+        # Mathpix API credentials
+        app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY')
+        headers = {"app_id": app_id, "app_key": app_key}

-    # Step 1: Send PDF file for processing
-    options = {
-        "conversion_formats": {"tex.zip": True},
-        "math_inline_delimiters": ["$", "$"],
-        "rm_spaces": True
-    }
+        # Step 1: Send PDF file for processing
+        options = {
+            "conversion_formats": {"tex.zip": True},
+            "math_inline_delimiters": ["$", "$"],
+            "rm_spaces": True
+        }

-    response = requests.post(url="https://api.mathpix.com/v3/pdf",
-                             headers=headers,
-                             data={"options_json": json.dumps(options)},
-                             files={"file": open(pdf_file_path, "rb")})
+        with open(pdf_file_path, "rb") as pdf_file:  # close the upload handle once the request is done
+            response = requests.post(url="https://api.mathpix.com/v3/pdf", headers=headers,
+                                     data={"options_json": json.dumps(options)},
+                                     files={"file": pdf_file})

-    if response.ok:
-        pdf_id = response.json()["pdf_id"]
-        print(f"PDF processing initiated. PDF ID: {pdf_id}")
+        if not response.ok:
+            print(f"Error sending PDF for processing. Status code: {response.status_code}")
+            return None
+        pdf_id = response.json()["pdf_id"]
+        print(f"PDF processing initiated. PDF ID: {pdf_id}")

-        # Step 2: Check processing status
-        while True:
-            conversion_response = requests.get(f"https://api.mathpix.com/v3/pdf/{pdf_id}", headers=headers)
-            conversion_data = conversion_response.json()
+        # Step 2: Check processing status
+        while True:
+            conversion_response = requests.get(f"https://api.mathpix.com/v3/pdf/{pdf_id}", headers=headers)
+            conversion_data = conversion_response.json()

-            if conversion_data["status"] == "completed":
-                print("PDF processing completed.")
-                break
-            elif conversion_data["status"] == "error":
-                print("Error occurred during processing.")
-            else:
-                print(f"Processing status: {conversion_data['status']}")
-                time.sleep(5)  # wait for a few seconds before checking again
+            if conversion_data["status"] == "completed":
+                print("PDF processing completed.")
+                break
+            if conversion_data["status"] == "error":
+                print("Error occurred during processing.")
+                return None  # stop polling instead of re-querying in a tight loop; the caller reports the failure
+            print(f"Processing status: {conversion_data['status']}")
+            time.sleep(5)  # wait for a few seconds before checking again

-        # Step 3: Save results to local files
-        output_dir = os.path.join(os.path.dirname(pdf_file_path), 'mathpix_output')
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
+        # Step 3: Save results to local files
+        output_dir = os.path.join(os.path.dirname(pdf_file_path), 'mathpix_output')
+        os.makedirs(output_dir, exist_ok=True)

-        url = f"https://api.mathpix.com/v3/pdf/{pdf_id}.tex"
-        response = requests.get(url, headers=headers)
-        file_name_wo_dot = '_'.join(os.path.basename(pdf_file_path).split('.')[:-1])
-        output_name = f"{file_name_wo_dot}.tex.zip"
-        output_path = os.path.join(output_dir, output_name)
-        with open(output_path, "wb") as output_file:
-            output_file.write(response.content)
-        print(f"tex.zip file saved at: {output_path}")
+        url = f"https://api.mathpix.com/v3/pdf/{pdf_id}.tex"
+        response = requests.get(url, headers=headers)
+        file_name_wo_dot = '_'.join(os.path.basename(pdf_file_path).split('.')[:-1])
+        output_path = os.path.join(output_dir, f"{file_name_wo_dot}.tex.zip")
+        with open(output_path, "wb") as output_file:
+            output_file.write(response.content)
+        print(f"tex.zip file saved at: {output_path}")

-        import zipfile
-        unzip_dir = os.path.join(output_dir, file_name_wo_dot)
-        with zipfile.ZipFile(output_path, 'r') as zip_ref:
-            zip_ref.extractall(unzip_dir)
+        import zipfile
+        unzip_dir = os.path.join(output_dir, file_name_wo_dot)
+        with zipfile.ZipFile(output_path, 'r') as zip_ref:
+            zip_ref.extractall(unzip_dir)

+        return unzip_dir
+    else:
+        from crazy_functions.pdf_fns.parse_pdf_via_doc2x import 解析PDF_DOC2X_转Latex
+        unzip_dir = 解析PDF_DOC2X_转Latex(pdf_file_path)
        return unzip_dir

-    else:
-        print(f"Error sending PDF for processing. Status code: {response.status_code}")
-        return None
+


 # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= 插件主程序1 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
@@ -437,11 +444,20 @@ def PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, h
        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"不支持同时处理多个pdf文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)  # 刷新界面
        return
-    app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY')
-    if len(app_id) == 0 or len(app_key) == 0:
-        report_exception(chatbot, history, a="缺失 MATHPIX_APPID 和 MATHPIX_APPKEY。", b=f"请配置 MATHPIX_APPID 和 MATHPIX_APPKEY")
-        yield from update_ui(chatbot=chatbot, history=history)  # 刷新界面
-        return
+
+    if plugin_kwargs.get("method", "") == 'MATHPIX':
+        app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY')
+        if len(app_id) == 0 or len(app_key) == 0:
+            report_exception(chatbot, history, a="缺失 MATHPIX_APPID 和 MATHPIX_APPKEY。", b=f"请配置 MATHPIX_APPID 和 MATHPIX_APPKEY")
+            yield from update_ui(chatbot=chatbot, history=history)  # 刷新界面
+            return
+    if plugin_kwargs.get("method", "") == 'DOC2X':
+        app_id, app_key = "", ""
+        DOC2X_API_KEY = get_conf('DOC2X_API_KEY')
+        if len(DOC2X_API_KEY) == 0:
+            report_exception(chatbot, history, a="缺失 DOC2X_API_KEY。", b=f"请配置 DOC2X_API_KEY")
+            yield from update_ui(chatbot=chatbot, history=history)  # 刷新界面
+            return

    hash_tag = map_file_to_sha256(file_manifest[0])

@@ -486,7 +502,7 @@ def PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, h
        # <-------------- convert pdf into tex ------------->
        chatbot.append([f"解析项目: {txt}", "正在将PDF转换为tex项目，请耐心等待..."])
        yield from update_ui(chatbot=chatbot, history=history)
-        project_folder = pdf2tex_project(file_manifest[0])
+        project_folder = pdf2tex_project(file_manifest[0], plugin_kwargs)
        if project_folder is None:
            report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"PDF转换为tex项目失败")
            yield from update_ui(chatbot=chatbot, history=history)
--- a/crazy_functions/Latex_Function_Wrap.py
+++ b/crazy_functions/Latex_Function_Wrap.py
@@ -1,5 +1,5 @@

-from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF
+from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF, PDF翻译中文并重新编译PDF
 from crazy_functions.plugin_template.plugin_class_template import GptAcademicPluginTemplate, ArgProperty


@@ -36,4 +36,38 @@ class Arxiv_Localize(GptAcademicPluginTemplate):
        advanced_arg = plugin_kwargs["advanced_arg"]

        if allow_cache == "从头执行": plugin_kwargs["advanced_arg"] = "--no-cache " + plugin_kwargs["advanced_arg"]
-        yield from Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
+        yield from Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
+
+
+
+class PDF_Localize(GptAcademicPluginTemplate):
+    """Menu-driven plugin wrapper around PDF翻译中文并重新编译PDF."""
+
+    def __init__(self):
+        """Note: `execute` runs in a different thread, so be very careful when defining and using class variables."""
+
+    def define_arg_selection_menu(self):
+        """Return the plugin's secondary option menu as a dict of JSON-serialised ArgProperty entries."""
+        # Main input, synchronised automatically from the input box.
+        main_input = ArgProperty(
+            title="PDF文件路径", description="未指定路径，请上传文件后，再点击该插件",
+            default_value="", type="string")
+        # Advanced argument area, synchronised automatically.
+        hint = (r"如果有必要, 请在此处给出自定义翻译命令, 解决部分词汇翻译不准确的问题。 "
+                r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: "
+                r'If the term "agent" is used in this section, it should be translated to "智能体". ')
+        advanced_arg = ArgProperty(title="额外的翻译提示词", description=hint, default_value="", type="string")
+        # Drop-down that selects which service converts the PDF.
+        method = ArgProperty(title="采用哪种方法执行转换", options=["MATHPIX", "DOC2X"], default_value="DOC2X",
+                             description="无", type="dropdown")
+        return {
+            "main_input": main_input.model_dump_json(),
+            "advanced_arg": advanced_arg.model_dump_json(),
+            "method": method.model_dump_json(),
+        }
+
+    def execute(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
+        """Run the plugin by delegating to PDF翻译中文并重新编译PDF."""
+        # NOTE(review): declared without `self`, as in the original; presumably invoked through the class — confirm.
+        translation = PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
+        yield from translation
--- a/crazy_functions/PDF_Translate_Wrap.py
+++ b/crazy_functions/PDF_Translate_Wrap.py
@@ -15,7 +15,7 @@ class PDF_Tran(GptAcademicPluginTemplate):
        """
        gui_definition = {
            "main_input":
-                ArgProperty(title="PDF文件路径", description="请上传文件后，再点击该插件", default_value="", type="string").model_dump_json(), # 主输入，自动从输入框同步
+                ArgProperty(title="PDF文件路径", description="未指定路径，请上传文件后，再点击该插件", default_value="", type="string").model_dump_json(), # 主输入，自动从输入框同步
            "additional_prompt":
                ArgProperty(title="额外提示词", description="例如：对专有名词、翻译语气等方面的要求", default_value="", type="string").model_dump_json(), # 高级参数输入区，自动同步
            "pdf_parse_method":
--- a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py
+++ b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py
@@ -1,4 +1,4 @@
-from toolbox import get_log_folder, gen_time_str
+from toolbox import get_log_folder, gen_time_str, get_conf
 from toolbox import update_ui, promote_file_to_downloadzone
 from toolbox import promote_file_to_downloadzone, extract_archive
 from toolbox import generate_file_link, zip_folder
@@ -6,24 +6,75 @@ from crazy_functions.crazy_utils import get_files_from_everything
 from shared_utils.colorful import *
 import os

+def refresh_key(doc2x_api_key):
+    """Trade a DOC2X key for a refreshed token via the token/refresh endpoint.
+
+    Returns the `data.token` string from the JSON reply; raises RuntimeError
+    when the endpoint answers with any status other than 200.
+    """
+    import requests, json
+    auth_header = {"Authorization": "Bearer " + doc2x_api_key}
+    reply = requests.post("https://api.doc2x.noedgeai.com/api/token/refresh", headers=auth_header)
+    # Guard clause: bail out first so the success path reads straight through.
+    if reply.status_code != 200:
+        failure = "[ERROR] status code: %d, body: %s" % (reply.status_code, reply.text)
+        raise RuntimeError(failure)
+    payload = json.loads(reply.content.decode("utf-8"))
+    return payload['data']['token']
+
+def 解析PDF_DOC2X_转Latex(pdf_file_path):
+    """Convert a PDF to a LaTeX project through the DOC2X API.
+
+    Uploads pdf_file_path, downloads the exported LaTeX zip into the
+    "pdf_ocr_latex" log folder, extracts it and returns the extraction directory.
+    Raises RuntimeError when a request fails or the reply stream is malformed.
+    """
+    import requests, json, os, zipfile
+    doc2x_api_key = get_conf('DOC2X_API_KEY')
+    latex_dir = get_log_folder(plugin_name="pdf_ocr_latex")
+    if doc2x_api_key.startswith('sk-'):
+        url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
+    else:
+        # Keys without the 'sk-' prefix are refreshed first and use the platform endpoint.
+        doc2x_api_key = refresh_key(doc2x_api_key)
+        url = "https://api.doc2x.noedgeai.com/api/platform/pdf"
+    auth_header = {"Authorization": "Bearer " + doc2x_api_key}
+
+    # Upload inside `with` so the PDF handle is closed even if the request raises.
+    with open(pdf_file_path, "rb") as pdf_file:
+        res = requests.post(url, files={"file": pdf_file}, data={"ocr": "1"}, headers=auth_header)
+    if res.status_code != 200:
+        raise RuntimeError("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
+
+    # The reply body is expected to be a sequence of "data: <json>" lines; collect every JSON event.
+    res_json = []
+    for z_decoded in res.content.decode("utf-8").split('\n'):
+        if len(z_decoded) == 0: continue
+        if not z_decoded.startswith("data: "):
+            raise RuntimeError("[ERROR] unexpected line in DOC2X reply: %s" % z_decoded)
+        res_json.append(json.loads(z_decoded[len("data: "):]))
+    if len(res_json) == 0:
+        raise RuntimeError("[ERROR] DOC2X reply contained no events, body: %s" % res.text)
+
+    uuid = res_json[0]['uuid']
+    to = "latex" # latex, md, docx
+    url = "https://api.doc2x.noedgeai.com/api/export"+"?request_id="+uuid+"&to="+to
+    res = requests.get(url, headers=auth_header)
+    if res.status_code != 200:
+        raise RuntimeError("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))
+    # One timestamp for both paths keeps the zip and its extraction folder paired.
+    latex_unzip_path = os.path.join(latex_dir, gen_time_str())
+    latex_zip_path = latex_unzip_path + '.zip'
+    with open(latex_zip_path, "wb") as f: f.write(res.content)
+    with zipfile.ZipFile(latex_zip_path, 'r') as zip_ref:
+        zip_ref.extractall(latex_unzip_path)
+    return latex_unzip_path
+
+
+

 def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request):

-    def refresh_key(doc2x_api_key):
-        import requests, json
-        url = "https://api.doc2x.noedgeai.com/api/token/refresh"
-        res = requests.post(
-            url,
-            headers={"Authorization": "Bearer " + doc2x_api_key}
-        )
-        res_json = []
-        if res.status_code == 200:
-            decoded = res.content.decode("utf-8")
-            res_json = json.loads(decoded)
-            doc2x_api_key = res_json['data']['token']
-        else:
-            raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
-        return doc2x_api_key

    def pdf2markdown(filepath):
        import requests, json, os