diff --git a/crazy_functional.py b/crazy_functional.py index 80c1850d..ad7de48f 100644 --- a/crazy_functional.py +++ b/crazy_functional.py @@ -41,6 +41,7 @@ def get_crazy_functions(): from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF from crazy_functions.Latex_Function import PDF翻译中文并重新编译PDF from crazy_functions.Latex_Function_Wrap import Arxiv_Localize + from crazy_functions.Latex_Function_Wrap import PDF_Localize function_plugins = { @@ -333,7 +334,9 @@ def get_crazy_functions(): r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: " r'If the term "agent" is used in this section, it should be translated to "智能体". ', "Info": "PDF翻译中文,并重新编译PDF | 输入参数为路径", - "Function": HotReload(PDF翻译中文并重新编译PDF) + "Function": None, + "Class": PDF_Localize + } } diff --git a/crazy_functions/Latex_Function.py b/crazy_functions/Latex_Function.py index 88e742e3..0cdc4ed7 100644 --- a/crazy_functions/Latex_Function.py +++ b/crazy_functions/Latex_Function.py @@ -158,65 +158,72 @@ def arxiv_download(chatbot, history, txt, allow_cache=True): return extract_dst, arxiv_id -def pdf2tex_project(pdf_file_path): - # Mathpix API credentials - app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY') - headers = {"app_id": app_id, "app_key": app_key} +def pdf2tex_project(pdf_file_path, plugin_kwargs): + if plugin_kwargs["method"] == "MATHPIX": + # Mathpix API credentials + app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY') + headers = {"app_id": app_id, "app_key": app_key} - # Step 1: Send PDF file for processing - options = { - "conversion_formats": {"tex.zip": True}, - "math_inline_delimiters": ["$", "$"], - "rm_spaces": True - } + # Step 1: Send PDF file for processing + options = { + "conversion_formats": {"tex.zip": True}, + "math_inline_delimiters": ["$", "$"], + "rm_spaces": True + } - response = requests.post(url="https://api.mathpix.com/v3/pdf", - headers=headers, - data={"options_json": json.dumps(options)}, - files={"file": open(pdf_file_path, "rb")}) + response = requests.post(url="https://api.mathpix.com/v3/pdf", + headers=headers, + data={"options_json": json.dumps(options)}, + files={"file": open(pdf_file_path, "rb")}) - if response.ok: - pdf_id = response.json()["pdf_id"] - print(f"PDF processing initiated. PDF ID: {pdf_id}") + if response.ok: + pdf_id = response.json()["pdf_id"] + print(f"PDF processing initiated. PDF ID: {pdf_id}") - # Step 2: Check processing status - while True: - conversion_response = requests.get(f"https://api.mathpix.com/v3/pdf/{pdf_id}", headers=headers) - conversion_data = conversion_response.json() + # Step 2: Check processing status + while True: + conversion_response = requests.get(f"https://api.mathpix.com/v3/pdf/{pdf_id}", headers=headers) + conversion_data = conversion_response.json() - if conversion_data["status"] == "completed": - print("PDF processing completed.") - break - elif conversion_data["status"] == "error": - print("Error occurred during processing.") - else: - print(f"Processing status: {conversion_data['status']}") - time.sleep(5) # wait for a few seconds before checking again + if conversion_data["status"] == "completed": + print("PDF processing completed.") + break + elif conversion_data["status"] == "error": + print("Error occurred during processing.") + else: + print(f"Processing status: {conversion_data['status']}") + time.sleep(5) # wait for a few seconds before checking again - # Step 3: Save results to local files - output_dir = os.path.join(os.path.dirname(pdf_file_path), 'mathpix_output') - if not os.path.exists(output_dir): - os.makedirs(output_dir) + # Step 3: Save results to local files + output_dir = os.path.join(os.path.dirname(pdf_file_path), 'mathpix_output') + if not os.path.exists(output_dir): + os.makedirs(output_dir) - url = f"https://api.mathpix.com/v3/pdf/{pdf_id}.tex" - response = requests.get(url, headers=headers) - file_name_wo_dot = '_'.join(os.path.basename(pdf_file_path).split('.')[:-1]) - output_name = f"{file_name_wo_dot}.tex.zip" - output_path = os.path.join(output_dir, output_name) - with open(output_path, "wb") as output_file: - output_file.write(response.content) - print(f"tex.zip file saved at: {output_path}") + url = f"https://api.mathpix.com/v3/pdf/{pdf_id}.tex" + response = requests.get(url, headers=headers) + file_name_wo_dot = '_'.join(os.path.basename(pdf_file_path).split('.')[:-1]) + output_name = f"{file_name_wo_dot}.tex.zip" + output_path = os.path.join(output_dir, output_name) + with open(output_path, "wb") as output_file: + output_file.write(response.content) + print(f"tex.zip file saved at: {output_path}") - import zipfile - unzip_dir = os.path.join(output_dir, file_name_wo_dot) - with zipfile.ZipFile(output_path, 'r') as zip_ref: - zip_ref.extractall(unzip_dir) + import zipfile + unzip_dir = os.path.join(output_dir, file_name_wo_dot) + with zipfile.ZipFile(output_path, 'r') as zip_ref: + zip_ref.extractall(unzip_dir) + return unzip_dir + + else: + print(f"Error sending PDF for processing. Status code: {response.status_code}") + return None + else: + from crazy_functions.pdf_fns.parse_pdf_via_doc2x import 解析PDF_DOC2X_转Latex + unzip_dir = 解析PDF_DOC2X_转Latex(pdf_file_path) return unzip_dir - else: - print(f"Error sending PDF for processing. Status code: {response.status_code}") - return None + # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= 插件主程序1 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= @@ -437,11 +444,20 @@ def PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, h report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"不支持同时处理多个pdf文件: {txt}") yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 return - app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY') - if len(app_id) == 0 or len(app_key) == 0: - report_exception(chatbot, history, a="缺失 MATHPIX_APPID 和 MATHPIX_APPKEY。", b=f"请配置 MATHPIX_APPID 和 MATHPIX_APPKEY") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return + + if plugin_kwargs.get("method", "") == 'MATHPIX': + app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY') + if len(app_id) == 0 or len(app_key) == 0: + report_exception(chatbot, history, a="缺失 MATHPIX_APPID 和 MATHPIX_APPKEY。", b=f"请配置 MATHPIX_APPID 和 MATHPIX_APPKEY") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + if plugin_kwargs.get("method", "") == 'DOC2X': + app_id, app_key = "", "" + DOC2X_API_KEY = get_conf('DOC2X_API_KEY') + if len(DOC2X_API_KEY) == 0: + report_exception(chatbot, history, a="缺失 DOC2X_API_KEY。", b=f"请配置 DOC2X_API_KEY") + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return hash_tag = map_file_to_sha256(file_manifest[0]) @@ -486,7 +502,7 @@ def PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, h # <-------------- convert pdf into tex -------------> chatbot.append([f"解析项目: {txt}", "正在将PDF转换为tex项目,请耐心等待..."]) yield from update_ui(chatbot=chatbot, history=history) - project_folder = pdf2tex_project(file_manifest[0]) + project_folder = pdf2tex_project(file_manifest[0], plugin_kwargs) if project_folder is None: report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"PDF转换为tex项目失败") yield from update_ui(chatbot=chatbot, history=history) diff --git a/crazy_functions/Latex_Function_Wrap.py b/crazy_functions/Latex_Function_Wrap.py index 69391b5e..0860984c 100644 --- a/crazy_functions/Latex_Function_Wrap.py +++ b/crazy_functions/Latex_Function_Wrap.py @@ -1,5 +1,5 @@ -from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF +from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF, PDF翻译中文并重新编译PDF from crazy_functions.plugin_template.plugin_class_template import GptAcademicPluginTemplate, ArgProperty @@ -36,4 +36,38 @@ class Arxiv_Localize(GptAcademicPluginTemplate): advanced_arg = plugin_kwargs["advanced_arg"] if allow_cache == "从头执行": plugin_kwargs["advanced_arg"] = "--no-cache " + plugin_kwargs["advanced_arg"] - yield from Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request) \ No newline at end of file + yield from Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request) + + + +class PDF_Localize(GptAcademicPluginTemplate): + def __init__(self): + """ + 请注意`execute`会执行在不同的线程中,因此您在定义和使用类变量时,应当慎之又慎! + """ + pass + + def define_arg_selection_menu(self): + """ + 定义插件的二级选项菜单 + """ + gui_definition = { + "main_input": + ArgProperty(title="PDF文件路径", description="未指定路径,请上传文件后,再点击该插件", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步 + "advanced_arg": + ArgProperty(title="额外的翻译提示词", + description=r"如果有必要, 请在此处给出自定义翻译命令, 解决部分词汇翻译不准确的问题。 " + r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: " + r'If the term "agent" is used in this section, it should be translated to "智能体". ', + default_value="", type="string").model_dump_json(), # 高级参数输入区,自动同步 + "method": + ArgProperty(title="采用哪种方法执行转换", options=["MATHPIX", "DOC2X"], default_value="DOC2X", description="无", type="dropdown").model_dump_json(), + + } + return gui_definition + + def execute(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): + """ + 执行插件 + """ + yield from PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request) \ No newline at end of file diff --git a/crazy_functions/PDF_Translate_Wrap.py b/crazy_functions/PDF_Translate_Wrap.py index 898f0695..669051bb 100644 --- a/crazy_functions/PDF_Translate_Wrap.py +++ b/crazy_functions/PDF_Translate_Wrap.py @@ -15,7 +15,7 @@ class PDF_Tran(GptAcademicPluginTemplate): """ gui_definition = { "main_input": - ArgProperty(title="PDF文件路径", description="请上传文件后,再点击该插件", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步 + ArgProperty(title="PDF文件路径", description="未指定路径,请上传文件后,再点击该插件", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步 "additional_prompt": ArgProperty(title="额外提示词", description="例如:对专有名词、翻译语气等方面的要求", default_value="", type="string").model_dump_json(), # 高级参数输入区,自动同步 "pdf_parse_method": diff --git a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py index 180f0c56..f67e79fe 100644 --- a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py +++ b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py @@ -1,4 +1,4 @@ -from toolbox import get_log_folder, gen_time_str +from toolbox import get_log_folder, gen_time_str, get_conf from toolbox import update_ui, promote_file_to_downloadzone from toolbox import promote_file_to_downloadzone, extract_archive from toolbox import generate_file_link, zip_folder @@ -6,24 +6,75 @@ from crazy_functions.crazy_utils import get_files_from_everything from shared_utils.colorful import * import os +def refresh_key(doc2x_api_key): + import requests, json + url = "https://api.doc2x.noedgeai.com/api/token/refresh" + res = requests.post( + url, + headers={"Authorization": "Bearer " + doc2x_api_key} + ) + res_json = [] + if res.status_code == 200: + decoded = res.content.decode("utf-8") + res_json = json.loads(decoded) + doc2x_api_key = res_json['data']['token'] + else: + raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) + return doc2x_api_key + +def 解析PDF_DOC2X_转Latex(pdf_file_path): + import requests, json, os + DOC2X_API_KEY = get_conf('DOC2X_API_KEY') + latex_dir = get_log_folder(plugin_name="pdf_ocr_latex") + doc2x_api_key = DOC2X_API_KEY + if doc2x_api_key.startswith('sk-'): + url = "https://api.doc2x.noedgeai.com/api/v1/pdf" + else: + doc2x_api_key = refresh_key(doc2x_api_key) + url = "https://api.doc2x.noedgeai.com/api/platform/pdf" + + res = requests.post( + url, + files={"file": open(pdf_file_path, "rb")}, + data={"ocr": "1"}, + headers={"Authorization": "Bearer " + doc2x_api_key} + ) + res_json = [] + if res.status_code == 200: + decoded = res.content.decode("utf-8") + for z_decoded in decoded.split('\n'): + if len(z_decoded) == 0: continue + assert z_decoded.startswith("data: ") + z_decoded = z_decoded[len("data: "):] + decoded_json = json.loads(z_decoded) + res_json.append(decoded_json) + else: + raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) + + uuid = res_json[0]['uuid'] + to = "latex" # latex, md, docx + url = "https://api.doc2x.noedgeai.com/api/export"+"?request_id="+uuid+"&to="+to + + res = requests.get(url, headers={"Authorization": "Bearer " + doc2x_api_key}) + latex_zip_path = os.path.join(latex_dir, gen_time_str() + '.zip') + latex_unzip_path = os.path.join(latex_dir, gen_time_str()) + if res.status_code == 200: + with open(latex_zip_path, "wb") as f: f.write(res.content) + else: + raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) + + import zipfile + with zipfile.ZipFile(latex_zip_path, 'r') as zip_ref: + zip_ref.extractall(latex_unzip_path) + + + return latex_unzip_path + + + def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request): - def refresh_key(doc2x_api_key): - import requests, json - url = "https://api.doc2x.noedgeai.com/api/token/refresh" - res = requests.post( - url, - headers={"Authorization": "Bearer " + doc2x_api_key} - ) - res_json = [] - if res.status_code == 200: - decoded = res.content.decode("utf-8") - res_json = json.loads(decoded) - doc2x_api_key = res_json['data']['token'] - else: - raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) - return doc2x_api_key def pdf2markdown(filepath): import requests, json, os