doc2x latex convertion

这个提交包含在:
binary-husky
2024-05-21 12:24:50 +00:00
父节点 f41419e767
当前提交 1339aa0e1a
共有 5 个文件被更改,包括 178 次插入74 次删除

查看文件

@@ -41,6 +41,7 @@ def get_crazy_functions():
from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF
from crazy_functions.Latex_Function import PDF翻译中文并重新编译PDF from crazy_functions.Latex_Function import PDF翻译中文并重新编译PDF
from crazy_functions.Latex_Function_Wrap import Arxiv_Localize from crazy_functions.Latex_Function_Wrap import Arxiv_Localize
from crazy_functions.Latex_Function_Wrap import PDF_Localize
function_plugins = { function_plugins = {
@@ -333,7 +334,9 @@ def get_crazy_functions():
r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: " r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: "
r'If the term "agent" is used in this section, it should be translated to "智能体". ', r'If the term "agent" is used in this section, it should be translated to "智能体". ',
"Info": "PDF翻译中文,并重新编译PDF | 输入参数为路径", "Info": "PDF翻译中文,并重新编译PDF | 输入参数为路径",
"Function": HotReload(PDF翻译中文并重新编译PDF) "Function": None,
"Class": PDF_Localize
} }
} }

查看文件

@@ -158,65 +158,72 @@ def arxiv_download(chatbot, history, txt, allow_cache=True):
return extract_dst, arxiv_id return extract_dst, arxiv_id
def pdf2tex_project(pdf_file_path): def pdf2tex_project(pdf_file_path, plugin_kwargs):
# Mathpix API credentials if plugin_kwargs["method"] == "MATHPIX":
app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY') # Mathpix API credentials
headers = {"app_id": app_id, "app_key": app_key} app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY')
headers = {"app_id": app_id, "app_key": app_key}
# Step 1: Send PDF file for processing # Step 1: Send PDF file for processing
options = { options = {
"conversion_formats": {"tex.zip": True}, "conversion_formats": {"tex.zip": True},
"math_inline_delimiters": ["$", "$"], "math_inline_delimiters": ["$", "$"],
"rm_spaces": True "rm_spaces": True
} }
response = requests.post(url="https://api.mathpix.com/v3/pdf", response = requests.post(url="https://api.mathpix.com/v3/pdf",
headers=headers, headers=headers,
data={"options_json": json.dumps(options)}, data={"options_json": json.dumps(options)},
files={"file": open(pdf_file_path, "rb")}) files={"file": open(pdf_file_path, "rb")})
if response.ok: if response.ok:
pdf_id = response.json()["pdf_id"] pdf_id = response.json()["pdf_id"]
print(f"PDF processing initiated. PDF ID: {pdf_id}") print(f"PDF processing initiated. PDF ID: {pdf_id}")
# Step 2: Check processing status # Step 2: Check processing status
while True: while True:
conversion_response = requests.get(f"https://api.mathpix.com/v3/pdf/{pdf_id}", headers=headers) conversion_response = requests.get(f"https://api.mathpix.com/v3/pdf/{pdf_id}", headers=headers)
conversion_data = conversion_response.json() conversion_data = conversion_response.json()
if conversion_data["status"] == "completed": if conversion_data["status"] == "completed":
print("PDF processing completed.") print("PDF processing completed.")
break break
elif conversion_data["status"] == "error": elif conversion_data["status"] == "error":
print("Error occurred during processing.") print("Error occurred during processing.")
else: else:
print(f"Processing status: {conversion_data['status']}") print(f"Processing status: {conversion_data['status']}")
time.sleep(5) # wait for a few seconds before checking again time.sleep(5) # wait for a few seconds before checking again
# Step 3: Save results to local files # Step 3: Save results to local files
output_dir = os.path.join(os.path.dirname(pdf_file_path), 'mathpix_output') output_dir = os.path.join(os.path.dirname(pdf_file_path), 'mathpix_output')
if not os.path.exists(output_dir): if not os.path.exists(output_dir):
os.makedirs(output_dir) os.makedirs(output_dir)
url = f"https://api.mathpix.com/v3/pdf/{pdf_id}.tex" url = f"https://api.mathpix.com/v3/pdf/{pdf_id}.tex"
response = requests.get(url, headers=headers) response = requests.get(url, headers=headers)
file_name_wo_dot = '_'.join(os.path.basename(pdf_file_path).split('.')[:-1]) file_name_wo_dot = '_'.join(os.path.basename(pdf_file_path).split('.')[:-1])
output_name = f"{file_name_wo_dot}.tex.zip" output_name = f"{file_name_wo_dot}.tex.zip"
output_path = os.path.join(output_dir, output_name) output_path = os.path.join(output_dir, output_name)
with open(output_path, "wb") as output_file: with open(output_path, "wb") as output_file:
output_file.write(response.content) output_file.write(response.content)
print(f"tex.zip file saved at: {output_path}") print(f"tex.zip file saved at: {output_path}")
import zipfile import zipfile
unzip_dir = os.path.join(output_dir, file_name_wo_dot) unzip_dir = os.path.join(output_dir, file_name_wo_dot)
with zipfile.ZipFile(output_path, 'r') as zip_ref: with zipfile.ZipFile(output_path, 'r') as zip_ref:
zip_ref.extractall(unzip_dir) zip_ref.extractall(unzip_dir)
return unzip_dir
else:
print(f"Error sending PDF for processing. Status code: {response.status_code}")
return None
else:
from crazy_functions.pdf_fns.parse_pdf_via_doc2x import 解析PDF_DOC2X_转Latex
unzip_dir = 解析PDF_DOC2X_转Latex(pdf_file_path)
return unzip_dir return unzip_dir
else:
print(f"Error sending PDF for processing. Status code: {response.status_code}")
return None
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= 插件主程序1 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= 插件主程序1 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
@@ -437,11 +444,20 @@ def PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, h
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"不支持同时处理多个pdf文件: {txt}") report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"不支持同时处理多个pdf文件: {txt}")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return return
app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY')
if len(app_id) == 0 or len(app_key) == 0: if plugin_kwargs.get("method", "") == 'MATHPIX':
report_exception(chatbot, history, a="缺失 MATHPIX_APPID 和 MATHPIX_APPKEY。", b=f"请配置 MATHPIX_APPID 和 MATHPIX_APPKEY") app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY')
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 if len(app_id) == 0 or len(app_key) == 0:
return report_exception(chatbot, history, a="缺失 MATHPIX_APPID 和 MATHPIX_APPKEY。", b=f"请配置 MATHPIX_APPID 和 MATHPIX_APPKEY")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
if plugin_kwargs.get("method", "") == 'DOC2X':
app_id, app_key = "", ""
DOC2X_API_KEY = get_conf('DOC2X_API_KEY')
if len(DOC2X_API_KEY) == 0:
report_exception(chatbot, history, a="缺失 DOC2X_API_KEY。", b=f"请配置 DOC2X_API_KEY")
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return
hash_tag = map_file_to_sha256(file_manifest[0]) hash_tag = map_file_to_sha256(file_manifest[0])
@@ -486,7 +502,7 @@ def PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, h
# <-------------- convert pdf into tex -------------> # <-------------- convert pdf into tex ------------->
chatbot.append([f"解析项目: {txt}", "正在将PDF转换为tex项目,请耐心等待..."]) chatbot.append([f"解析项目: {txt}", "正在将PDF转换为tex项目,请耐心等待..."])
yield from update_ui(chatbot=chatbot, history=history) yield from update_ui(chatbot=chatbot, history=history)
project_folder = pdf2tex_project(file_manifest[0]) project_folder = pdf2tex_project(file_manifest[0], plugin_kwargs)
if project_folder is None: if project_folder is None:
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"PDF转换为tex项目失败") report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"PDF转换为tex项目失败")
yield from update_ui(chatbot=chatbot, history=history) yield from update_ui(chatbot=chatbot, history=history)

查看文件

@@ -1,5 +1,5 @@
from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF, PDF翻译中文并重新编译PDF
from crazy_functions.plugin_template.plugin_class_template import GptAcademicPluginTemplate, ArgProperty from crazy_functions.plugin_template.plugin_class_template import GptAcademicPluginTemplate, ArgProperty
@@ -37,3 +37,37 @@ class Arxiv_Localize(GptAcademicPluginTemplate):
if allow_cache == "从头执行": plugin_kwargs["advanced_arg"] = "--no-cache " + plugin_kwargs["advanced_arg"] if allow_cache == "从头执行": plugin_kwargs["advanced_arg"] = "--no-cache " + plugin_kwargs["advanced_arg"]
yield from Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request) yield from Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
class PDF_Localize(GptAcademicPluginTemplate):
def __init__(self):
"""
请注意`execute`会执行在不同的线程中,因此您在定义和使用类变量时,应当慎之又慎!
"""
pass
def define_arg_selection_menu(self):
"""
定义插件的二级选项菜单
"""
gui_definition = {
"main_input":
ArgProperty(title="PDF文件路径", description="未指定路径,请上传文件后,再点击该插件", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步
"advanced_arg":
ArgProperty(title="额外的翻译提示词",
description=r"如果有必要, 请在此处给出自定义翻译命令, 解决部分词汇翻译不准确的问题。 "
r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: "
r'If the term "agent" is used in this section, it should be translated to "智能体". ',
default_value="", type="string").model_dump_json(), # 高级参数输入区,自动同步
"method":
ArgProperty(title="采用哪种方法执行转换", options=["MATHPIX", "DOC2X"], default_value="DOC2X", description="", type="dropdown").model_dump_json(),
}
return gui_definition
def execute(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
"""
执行插件
"""
yield from PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)

查看文件

@@ -15,7 +15,7 @@ class PDF_Tran(GptAcademicPluginTemplate):
""" """
gui_definition = { gui_definition = {
"main_input": "main_input":
ArgProperty(title="PDF文件路径", description="请上传文件后,再点击该插件", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步 ArgProperty(title="PDF文件路径", description="未指定路径,请上传文件后,再点击该插件", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步
"additional_prompt": "additional_prompt":
ArgProperty(title="额外提示词", description="例如:对专有名词、翻译语气等方面的要求", default_value="", type="string").model_dump_json(), # 高级参数输入区,自动同步 ArgProperty(title="额外提示词", description="例如:对专有名词、翻译语气等方面的要求", default_value="", type="string").model_dump_json(), # 高级参数输入区,自动同步
"pdf_parse_method": "pdf_parse_method":

查看文件

@@ -1,4 +1,4 @@
from toolbox import get_log_folder, gen_time_str from toolbox import get_log_folder, gen_time_str, get_conf
from toolbox import update_ui, promote_file_to_downloadzone from toolbox import update_ui, promote_file_to_downloadzone
from toolbox import promote_file_to_downloadzone, extract_archive from toolbox import promote_file_to_downloadzone, extract_archive
from toolbox import generate_file_link, zip_folder from toolbox import generate_file_link, zip_folder
@@ -6,24 +6,75 @@ from crazy_functions.crazy_utils import get_files_from_everything
from shared_utils.colorful import * from shared_utils.colorful import *
import os import os
def refresh_key(doc2x_api_key):
import requests, json
url = "https://api.doc2x.noedgeai.com/api/token/refresh"
res = requests.post(
url,
headers={"Authorization": "Bearer " + doc2x_api_key}
)
res_json = []
if res.status_code == 200:
decoded = res.content.decode("utf-8")
res_json = json.loads(decoded)
doc2x_api_key = res_json['data']['token']
else:
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
return doc2x_api_key
def 解析PDF_DOC2X_转Latex(pdf_file_path):
import requests, json, os
DOC2X_API_KEY = get_conf('DOC2X_API_KEY')
latex_dir = get_log_folder(plugin_name="pdf_ocr_latex")
doc2x_api_key = DOC2X_API_KEY
if doc2x_api_key.startswith('sk-'):
url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
else:
doc2x_api_key = refresh_key(doc2x_api_key)
url = "https://api.doc2x.noedgeai.com/api/platform/pdf"
res = requests.post(
url,
files={"file": open(pdf_file_path, "rb")},
data={"ocr": "1"},
headers={"Authorization": "Bearer " + doc2x_api_key}
)
res_json = []
if res.status_code == 200:
decoded = res.content.decode("utf-8")
for z_decoded in decoded.split('\n'):
if len(z_decoded) == 0: continue
assert z_decoded.startswith("data: ")
z_decoded = z_decoded[len("data: "):]
decoded_json = json.loads(z_decoded)
res_json.append(decoded_json)
else:
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
uuid = res_json[0]['uuid']
to = "latex" # latex, md, docx
url = "https://api.doc2x.noedgeai.com/api/export"+"?request_id="+uuid+"&to="+to
res = requests.get(url, headers={"Authorization": "Bearer " + doc2x_api_key})
latex_zip_path = os.path.join(latex_dir, gen_time_str() + '.zip')
latex_unzip_path = os.path.join(latex_dir, gen_time_str())
if res.status_code == 200:
with open(latex_zip_path, "wb") as f: f.write(res.content)
else:
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
import zipfile
with zipfile.ZipFile(latex_zip_path, 'r') as zip_ref:
zip_ref.extractall(latex_unzip_path)
return latex_unzip_path
def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request): def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request):
def refresh_key(doc2x_api_key):
import requests, json
url = "https://api.doc2x.noedgeai.com/api/token/refresh"
res = requests.post(
url,
headers={"Authorization": "Bearer " + doc2x_api_key}
)
res_json = []
if res.status_code == 200:
decoded = res.content.decode("utf-8")
res_json = json.loads(decoded)
doc2x_api_key = res_json['data']['token']
else:
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
return doc2x_api_key
def pdf2markdown(filepath): def pdf2markdown(filepath):
import requests, json, os import requests, json, os