From a9c86a7fb8853df13335cac337681f4e10ceddb3 Mon Sep 17 00:00:00 2001 From: binary-husky Date: Fri, 18 Oct 2024 14:16:24 +0000 Subject: [PATCH 1/3] pre --- crazy_functions/Latex_Function.py | 42 ++++++++++++++++++++-- crazy_functions/latex_fns/latex_actions.py | 3 ++ tests/test_latex_auto_correct.py | 3 +- 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/crazy_functions/Latex_Function.py b/crazy_functions/Latex_Function.py index 53bbdd21..236460eb 100644 --- a/crazy_functions/Latex_Function.py +++ b/crazy_functions/Latex_Function.py @@ -364,6 +364,24 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, chatbot=chatbot, history=history) return + # allow_cloud_io = True + # arxiv_id = "2203.01927" + # if allow_cloud_io and arxiv_id: + # # 如果用户允许,我们将arxiv论文PDF上传到云端 + # for file_path in chatbot._cookies.get("files_to_promote", []): + # if file_path.endswith('comparison.pdf'): + # def compute_hash(file_path): + # return map_file_to_sha256(file_path) + # with open(file_path, 'rb') as f: + # import requests + # url = 'https://cloud-2.agent-matrix.com/upload' + # files = {'file': (file_path, f, 'application/octet-stream')} + # data = { + # 'arxiv_id': arxiv_id, + # 'file_hash': compute_hash(file_path), + # } + # resp = requests.get(url=url, files=files, data=data, timeout=10) + if txt.endswith('.pdf'): report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"发现已经存在翻译好的PDF文档") yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 @@ -406,14 +424,34 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, # <-------------- zip PDF -------------> zip_res = zip_result(project_folder) if success: + allow_cloud_io = True + arxiv_id = "2203.01927" + if allow_cloud_io and arxiv_id: + # 如果用户允许,我们将arxiv论文PDF上传到云端 + for file_path in chatbot._cookies.get("files_to_promote", []): + if file_path.endswith('translate_zh.pdf') or file_path.endswith('comparison.pdf'): + def compute_hash(file_path): + return map_file_to_sha256(file_path) + with open(file_path, 'rb') as f: + import requests + url = 'https://cloud-2.agent-matrix.com/upload' + files = {'file': (file_path, f, 'application/octet-stream')} + data = { + 'arxiv_id': arxiv_id, + 'file_hash': compute_hash(file_path), + } + resp = requests.post(url=url, files=files, data=data, timeout=10) + + chatbot.append((f"成功啦", '请查收结果(压缩包)...')) - yield from update_ui(chatbot=chatbot, history=history); + yield from update_ui(chatbot=chatbot, history=history) time.sleep(1) # 刷新界面 promote_file_to_downloadzone(file=zip_res, chatbot=chatbot) + else: chatbot.append((f"失败了", '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 您可以到Github Issue区, 用该压缩包进行反馈。如系统是Linux,请检查系统字体(见Github wiki) ...')) - yield from update_ui(chatbot=chatbot, history=history); + yield from update_ui(chatbot=chatbot, history=history) time.sleep(1) # 刷新界面 promote_file_to_downloadzone(file=zip_res, chatbot=chatbot) diff --git a/crazy_functions/latex_fns/latex_actions.py b/crazy_functions/latex_fns/latex_actions.py index 4293f0d0..67c9b8c6 100644 --- a/crazy_functions/latex_fns/latex_actions.py +++ b/crazy_functions/latex_fns/latex_actions.py @@ -423,6 +423,9 @@ def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_f except Exception as e: logger.error(e) pass + + + return True # 成功啦 else: if n_fix>=max_try: break diff --git a/tests/test_latex_auto_correct.py b/tests/test_latex_auto_correct.py index c51e7414..93c8f707 100644 --- a/tests/test_latex_auto_correct.py +++ b/tests/test_latex_auto_correct.py @@ -19,4 +19,5 @@ if __name__ == "__main__": plugin_test = importlib.import_module('test_utils').plugin_test - plugin_test(plugin='crazy_functions.Latex_Function->Latex翻译中文并重新编译PDF', main_input="2203.01927") + # plugin_test(plugin='crazy_functions.Latex_Function->Latex翻译中文并重新编译PDF', main_input="2203.01927") + plugin_test(plugin='crazy_functions.Latex_Function->Latex翻译中文并重新编译PDF', main_input="gpt_log/arxiv_cache/2203.01927/workfolder") From 50a1ea83ef406051975e347c084df35bb548c487 Mon Sep 17 00:00:00 2001 From: binary-husky Date: Fri, 18 Oct 2024 18:05:50 +0000 Subject: [PATCH 2/3] control whether to allow sharing translation results with GPTAC academic cloud. --- crazy_functions/Latex_Function.py | 67 +++++++++------------- crazy_functions/Latex_Function_Wrap.py | 8 +++ crazy_functions/latex_fns/latex_actions.py | 65 ++++++++++++++++++++- tests/test_latex_auto_correct.py | 5 +- 4 files changed, 104 insertions(+), 41 deletions(-) diff --git a/crazy_functions/Latex_Function.py b/crazy_functions/Latex_Function.py index 236460eb..51b03283 100644 --- a/crazy_functions/Latex_Function.py +++ b/crazy_functions/Latex_Function.py @@ -3,7 +3,7 @@ from toolbox import CatchException, report_exception, update_ui_lastest_msg, zip from functools import partial from loguru import logger -import glob, os, requests, time, json, tarfile +import glob, os, requests, time, json, tarfile, threading pj = os.path.join ARXIV_CACHE_DIR = get_conf("ARXIV_CACHE_DIR") @@ -338,11 +338,17 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, # <-------------- more requirements -------------> if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") more_req = plugin_kwargs.get("advanced_arg", "") - no_cache = more_req.startswith("--no-cache") - if no_cache: more_req.lstrip("--no-cache") + + no_cache = ("--no-cache" in more_req) + if no_cache: more_req = more_req.replace("--no-cache", "").strip() + + allow_gptac_cloud_io = ("--allow-cloudio" in more_req) # 从云端下载翻译结果,以及上传翻译结果到云端 + if allow_gptac_cloud_io: more_req = more_req.replace("--allow-cloudio", "").strip() + allow_cache = not no_cache _switch_prompt_ = partial(switch_prompt, more_requirement=more_req) + # <-------------- check deps -------------> try: import glob, os, time, subprocess @@ -364,29 +370,25 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, chatbot=chatbot, history=history) return - # allow_cloud_io = True - # arxiv_id = "2203.01927" - # if allow_cloud_io and arxiv_id: - # # 如果用户允许,我们将arxiv论文PDF上传到云端 - # for file_path in chatbot._cookies.get("files_to_promote", []): - # if file_path.endswith('comparison.pdf'): - # def compute_hash(file_path): - # return map_file_to_sha256(file_path) - # with open(file_path, 'rb') as f: - # import requests - # url = 'https://cloud-2.agent-matrix.com/upload' - # files = {'file': (file_path, f, 'application/octet-stream')} - # data = { - # 'arxiv_id': arxiv_id, - # 'file_hash': compute_hash(file_path), - # } - # resp = requests.get(url=url, files=files, data=data, timeout=10) - if txt.endswith('.pdf'): report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"发现已经存在翻译好的PDF文档") yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 return + # ################################################################# + if allow_gptac_cloud_io and arxiv_id: + # 访问 GPTAC学术云,查询云端是否存在该论文的翻译版本 + from crazy_functions.latex_fns.latex_actions import check_gptac_cloud + success, downloaded = check_gptac_cloud(arxiv_id, chatbot) + if success: + chatbot.append([ + f"检测到GPTAC云端存在翻译版本, 如果不满意翻译结果, 请禁用云端分享, 然后重新执行。", + None + ]) + yield from update_ui(chatbot=chatbot, history=history) + return + ################################################################# + if os.path.exists(txt): project_folder = txt else: @@ -424,24 +426,11 @@ def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, # <-------------- zip PDF -------------> zip_res = zip_result(project_folder) if success: - allow_cloud_io = True - arxiv_id = "2203.01927" - if allow_cloud_io and arxiv_id: - # 如果用户允许,我们将arxiv论文PDF上传到云端 - for file_path in chatbot._cookies.get("files_to_promote", []): - if file_path.endswith('translate_zh.pdf') or file_path.endswith('comparison.pdf'): - def compute_hash(file_path): - return map_file_to_sha256(file_path) - with open(file_path, 'rb') as f: - import requests - url = 'https://cloud-2.agent-matrix.com/upload' - files = {'file': (file_path, f, 'application/octet-stream')} - data = { - 'arxiv_id': arxiv_id, - 'file_hash': compute_hash(file_path), - } - resp = requests.post(url=url, files=files, data=data, timeout=10) - + if allow_gptac_cloud_io and arxiv_id: + # 如果用户允许,我们将翻译好的arxiv论文PDF上传到GPTAC学术云 + from crazy_functions.latex_fns.latex_actions import upload_to_gptac_cloud_if_user_allow + threading.Thread(target=upload_to_gptac_cloud_if_user_allow, + args=(chatbot, arxiv_id), daemon=True).start() chatbot.append((f"成功啦", '请查收结果(压缩包)...')) yield from update_ui(chatbot=chatbot, history=history) diff --git a/crazy_functions/Latex_Function_Wrap.py b/crazy_functions/Latex_Function_Wrap.py index 5d7b1f31..e591e380 100644 --- a/crazy_functions/Latex_Function_Wrap.py +++ b/crazy_functions/Latex_Function_Wrap.py @@ -30,6 +30,9 @@ class Arxiv_Localize(GptAcademicPluginTemplate): default_value="", type="string").model_dump_json(), # 高级参数输入区,自动同步 "allow_cache": ArgProperty(title="是否允许从缓存中调取结果", options=["允许缓存", "从头执行"], default_value="允许缓存", description="无", type="dropdown").model_dump_json(), + "allow_cloudio": + ArgProperty(title="是否允许向GPTAC学术云共享翻译结果", options=["允许", "禁止"], default_value="禁止", description="人人为我,我为人人", type="dropdown").model_dump_json(), + } return gui_definition @@ -38,9 +41,14 @@ class Arxiv_Localize(GptAcademicPluginTemplate): 执行插件 """ allow_cache = plugin_kwargs["allow_cache"] + allow_cloudio = plugin_kwargs["allow_cloudio"] advanced_arg = plugin_kwargs["advanced_arg"] if allow_cache == "从头执行": plugin_kwargs["advanced_arg"] = "--no-cache " + plugin_kwargs["advanced_arg"] + + # 从云端下载翻译结果,以及上传翻译结果到云端;人人为我,我为人人。 + if allow_cloudio == "允许": plugin_kwargs["advanced_arg"] = "--allow-cloudio " + plugin_kwargs["advanced_arg"] + yield from Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request) diff --git a/crazy_functions/latex_fns/latex_actions.py b/crazy_functions/latex_fns/latex_actions.py index 67c9b8c6..cfa0f155 100644 --- a/crazy_functions/latex_fns/latex_actions.py +++ b/crazy_functions/latex_fns/latex_actions.py @@ -3,7 +3,7 @@ import re import shutil import numpy as np from loguru import logger -from toolbox import update_ui, update_ui_lastest_msg, get_log_folder +from toolbox import update_ui, update_ui_lastest_msg, get_log_folder, gen_time_str from toolbox import get_conf, promote_file_to_downloadzone from crazy_functions.latex_fns.latex_toolbox import PRESERVE, TRANSFORM from crazy_functions.latex_fns.latex_toolbox import set_forbidden_text, set_forbidden_text_begin_end, set_forbidden_text_careful_brace @@ -471,3 +471,66 @@ def write_html(sp_file_contents, sp_file_result, chatbot, project_folder): except: from toolbox import trimmed_format_exc logger.error('writing html result failed:', trimmed_format_exc()) + + +def upload_to_gptac_cloud_if_user_allow(chatbot, arxiv_id): + try: + # 如果用户允许,我们将arxiv论文PDF上传到GPTAC学术云 + from toolbox import map_file_to_sha256 + # 检查是否顺利,如果没有生成预期的文件,则跳过 + is_result_good = False + for file_path in chatbot._cookies.get("files_to_promote", []): + if file_path.endswith('translate_zh.pdf'): + is_result_good = True + if not is_result_good: + return + # 上传文件 + for file_path in chatbot._cookies.get("files_to_promote", []): + align_name = None + # normalized name + for name in ['translate_zh.pdf', 'comparison.pdf']: + if file_path.endswith(name): align_name = name + # if match any align name + if align_name: + logger.info(f'Uploading to GPTAC cloud as the user has set `allow_cloud_io`: {file_path}') + with open(file_path, 'rb') as f: + import requests + url = 'https://cloud-2.agent-matrix.com/upload' + files = {'file': (align_name, f, 'application/octet-stream')} + data = { + 'arxiv_id': arxiv_id, + 'file_hash': map_file_to_sha256(file_path), + } + resp = requests.post(url=url, files=files, data=data, timeout=30) + logger.info(f'Uploading terminate ({resp.status_code})`: {file_path}') + except: + # 如果上传失败,不会中断程序,因为这是次要功能 + pass + +def check_gptac_cloud(arxiv_id, chatbot): + import requests + success = False + downloaded = [] + try: + for pdf_target in ['translate_zh.pdf', 'comparison.pdf']: + url = 'https://cloud-2.agent-matrix.com/paper_exist' + data = { + 'arxiv_id': arxiv_id, + 'name': pdf_target, + } + resp = requests.post(url=url, data=data) + cache_hit_result = resp.text.strip('"') + if cache_hit_result.startswith("http"): + url = cache_hit_result + logger.info(f'Downloading from GPTAC cloud: {url}') + resp = requests.get(url=url, timeout=30) + target = os.path.join(get_log_folder(plugin_name='gptac_cloud'), gen_time_str(), pdf_target) + os.makedirs(os.path.dirname(target), exist_ok=True) + with open(target, 'wb') as f: + f.write(resp.content) + new_path = promote_file_to_downloadzone(target, chatbot=chatbot) + success = True + downloaded.append(new_path) + except: + pass + return success, downloaded diff --git a/tests/test_latex_auto_correct.py b/tests/test_latex_auto_correct.py index 93c8f707..ea421370 100644 --- a/tests/test_latex_auto_correct.py +++ b/tests/test_latex_auto_correct.py @@ -20,4 +20,7 @@ if __name__ == "__main__": # plugin_test(plugin='crazy_functions.Latex_Function->Latex翻译中文并重新编译PDF', main_input="2203.01927") - plugin_test(plugin='crazy_functions.Latex_Function->Latex翻译中文并重新编译PDF', main_input="gpt_log/arxiv_cache/2203.01927/workfolder") + # plugin_test(plugin='crazy_functions.Latex_Function->Latex翻译中文并重新编译PDF', main_input="gpt_log/arxiv_cache/2203.01927/workfolder") + # plugin_test(plugin='crazy_functions.Latex_Function->Latex翻译中文并重新编译PDF', main_input="2410.05779") + plugin_test(plugin='crazy_functions.Latex_Function->Latex翻译中文并重新编译PDF', main_input="gpt_log/default_user/workfolder") + From 42d10a9481d14e2603b16351261d5c07250ac548 Mon Sep 17 00:00:00 2001 From: binary-husky Date: Mon, 21 Oct 2024 14:05:05 +0000 Subject: [PATCH 3/3] update doc2x functions --- .../pdf_fns/parse_pdf_via_doc2x.py | 191 +++++++++++------- tests/test_doc2x.py | 7 + 2 files changed, 121 insertions(+), 77 deletions(-) create mode 100644 tests/test_doc2x.py diff --git a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py index d64aa91c..97c62fbf 100644 --- a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py +++ b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py @@ -4,7 +4,9 @@ from toolbox import promote_file_to_downloadzone, extract_archive from toolbox import generate_file_link, zip_folder from crazy_functions.crazy_utils import get_files_from_everything from shared_utils.colorful import * +from loguru import logger import os +import time def refresh_key(doc2x_api_key): import requests, json @@ -22,105 +24,140 @@ def refresh_key(doc2x_api_key): raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) return doc2x_api_key + + def 解析PDF_DOC2X_转Latex(pdf_file_path): + zip_file_path, unzipped_folder = 解析PDF_DOC2X(pdf_file_path, format='tex') + return unzipped_folder + + +def 解析PDF_DOC2X(pdf_file_path, format='tex'): + """ + format: 'tex', 'md', 'docx' + """ import requests, json, os DOC2X_API_KEY = get_conf('DOC2X_API_KEY') latex_dir = get_log_folder(plugin_name="pdf_ocr_latex") + markdown_dir = get_log_folder(plugin_name="pdf_ocr") doc2x_api_key = DOC2X_API_KEY - if doc2x_api_key.startswith('sk-'): - url = "https://api.doc2x.noedgeai.com/api/v1/pdf" - else: - doc2x_api_key = refresh_key(doc2x_api_key) - url = "https://api.doc2x.noedgeai.com/api/platform/pdf" + + # < ------ 第1步:上传 ------ > + logger.info("Doc2x 第1步:上传") + with open(pdf_file_path, 'rb') as file: + res = requests.post( + "https://v2.doc2x.noedgeai.com/api/v2/parse/pdf", + headers={"Authorization": "Bearer " + doc2x_api_key}, + data=file + ) + # res_json = [] + if res.status_code == 200: + res_json = res.json() + else: + raise RuntimeError(f"Doc2x return an error: {res.json()}") + uuid = res_json['data']['uid'] + + # < ------ 第2步:轮询等待 ------ > + logger.info("Doc2x 第2步:轮询等待") + params = {'uid': uuid} + while True: + res = requests.get( + 'https://v2.doc2x.noedgeai.com/api/v2/parse/status', + headers={"Authorization": "Bearer " + doc2x_api_key}, + params=params + ) + res_json = res.json() + if res_json['data']['status'] == "success": + break + elif res_json['data']['status'] == "processing": + time.sleep(3) + logger.info(f"Doc2x is processing at {res_json['data']['progress']}%") + elif res_json['data']['status'] == "failed": + raise RuntimeError(f"Doc2x return an error: {res_json}") + + + # < ------ 第3步:提交转化 ------ > + logger.info("Doc2x 第3步:提交转化") + data = { + "uid": uuid, + "to": format, + "formula_mode": "dollar", + "filename": "output" + } res = requests.post( - url, - files={"file": open(pdf_file_path, "rb")}, - data={"ocr": "1"}, - headers={"Authorization": "Bearer " + doc2x_api_key} + 'https://v2.doc2x.noedgeai.com/api/v2/convert/parse', + headers={"Authorization": "Bearer " + doc2x_api_key}, + json=data ) - res_json = [] if res.status_code == 200: - decoded = res.content.decode("utf-8") - for z_decoded in decoded.split('\n'): - if len(z_decoded) == 0: continue - assert z_decoded.startswith("data: ") - z_decoded = z_decoded[len("data: "):] - decoded_json = json.loads(z_decoded) - res_json.append(decoded_json) + res_json = res.json() else: - raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) + raise RuntimeError(f"Doc2x return an error: {res.json()}") - uuid = res_json[0]['uuid'] - to = "latex" # latex, md, docx - url = "https://api.doc2x.noedgeai.com/api/export"+"?request_id="+uuid+"&to="+to - res = requests.get(url, headers={"Authorization": "Bearer " + doc2x_api_key}) - latex_zip_path = os.path.join(latex_dir, gen_time_str() + '.zip') - latex_unzip_path = os.path.join(latex_dir, gen_time_str()) - if res.status_code == 200: - with open(latex_zip_path, "wb") as f: f.write(res.content) - else: - raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) + # < ------ 第4步:等待结果 ------ > + logger.info("Doc2x 第4步:等待结果") + params = {'uid': uuid} + while True: + res = requests.get( + 'https://v2.doc2x.noedgeai.com/api/v2/convert/parse/result', + headers={"Authorization": "Bearer " + doc2x_api_key}, + params=params + ) + res_json = res.json() + if res_json['data']['status'] == "success": + break + elif res_json['data']['status'] == "processing": + time.sleep(3) + logger.info(f"Doc2x still processing") + elif res_json['data']['status'] == "failed": + raise RuntimeError(f"Doc2x return an error: {res_json}") + + # < ------ 第5步:最后的处理 ------ > + logger.info("Doc2x 第5步:最后的处理") + + if format=='tex': + target_path = latex_dir + if format=='md': + target_path = markdown_dir + os.makedirs(target_path, exist_ok=True) + + max_attempt = 3 + # < ------ 下载 ------ > + for attempt in range(max_attempt): + try: + result_url = res_json['data']['url'] + res = requests.get(result_url) + zip_path = os.path.join(target_path, gen_time_str() + '.zip') + unzip_path = os.path.join(target_path, gen_time_str()) + if res.status_code == 200: + with open(zip_path, "wb") as f: f.write(res.content) + else: + raise RuntimeError(f"Doc2x return an error: {res.json()}") + except Exception as e: + if attempt < max_attempt - 1: + logger.error(f"Failed to download latex file, retrying... {e}") + time.sleep(3) + continue + else: + raise e + + # < ------ 解压 ------ > import zipfile - with zipfile.ZipFile(latex_zip_path, 'r') as zip_ref: - zip_ref.extractall(latex_unzip_path) - - - return latex_unzip_path - - + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(unzip_path) + return zip_path, unzip_path def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request): - def pdf2markdown(filepath): - import requests, json, os - markdown_dir = get_log_folder(plugin_name="pdf_ocr") - doc2x_api_key = DOC2X_API_KEY - if doc2x_api_key.startswith('sk-'): - url = "https://api.doc2x.noedgeai.com/api/v1/pdf" - else: - doc2x_api_key = refresh_key(doc2x_api_key) - url = "https://api.doc2x.noedgeai.com/api/platform/pdf" - - chatbot.append((None, "加载PDF文件,发送至DOC2X解析...")) + chatbot.append((None, f"Doc2x 解析中")) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - res = requests.post( - url, - files={"file": open(filepath, "rb")}, - data={"ocr": "1"}, - headers={"Authorization": "Bearer " + doc2x_api_key} - ) - res_json = [] - if res.status_code == 200: - decoded = res.content.decode("utf-8") - for z_decoded in decoded.split('\n'): - if len(z_decoded) == 0: continue - assert z_decoded.startswith("data: ") - z_decoded = z_decoded[len("data: "):] - decoded_json = json.loads(z_decoded) - res_json.append(decoded_json) - if 'limit exceeded' in decoded_json.get('status', ''): - raise RuntimeError("Doc2x API 页数受限,请联系 Doc2x 方面,并更换新的 API 秘钥。") - else: - raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) - uuid = res_json[0]['uuid'] - to = "md" # latex, md, docx - url = "https://api.doc2x.noedgeai.com/api/export"+"?request_id="+uuid+"&to="+to + md_zip_path, unzipped_folder = 解析PDF_DOC2X(filepath, format='md') - chatbot.append((None, f"读取解析: {url} ...")) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - res = requests.get(url, headers={"Authorization": "Bearer " + doc2x_api_key}) - md_zip_path = os.path.join(markdown_dir, gen_time_str() + '.zip') - if res.status_code == 200: - with open(md_zip_path, "wb") as f: f.write(res.content) - else: - raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) promote_file_to_downloadzone(md_zip_path, chatbot=chatbot) chatbot.append((None, f"完成解析 {md_zip_path} ...")) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 diff --git a/tests/test_doc2x.py b/tests/test_doc2x.py new file mode 100644 index 00000000..9d02c4b7 --- /dev/null +++ b/tests/test_doc2x.py @@ -0,0 +1,7 @@ +import init_test + +from crazy_functions.pdf_fns.parse_pdf_via_doc2x import 解析PDF_DOC2X_转Latex + +# 解析PDF_DOC2X_转Latex("gpt_log/arxiv_cache_old/2410.10819/workfolder/merge.pdf") +# 解析PDF_DOC2X_转Latex("gpt_log/arxiv_cache_ooo/2410.07095/workfolder/merge.pdf") +解析PDF_DOC2X_转Latex("2410.11190v2.pdf")