镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-07 15:06:48 +00:00
doc2x latex conversion
这个提交包含在:
@@ -41,6 +41,7 @@ def get_crazy_functions():
|
|||||||
from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF
|
from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF
|
||||||
from crazy_functions.Latex_Function import PDF翻译中文并重新编译PDF
|
from crazy_functions.Latex_Function import PDF翻译中文并重新编译PDF
|
||||||
from crazy_functions.Latex_Function_Wrap import Arxiv_Localize
|
from crazy_functions.Latex_Function_Wrap import Arxiv_Localize
|
||||||
|
from crazy_functions.Latex_Function_Wrap import PDF_Localize
|
||||||
|
|
||||||
|
|
||||||
function_plugins = {
|
function_plugins = {
|
||||||
@@ -333,7 +334,9 @@ def get_crazy_functions():
|
|||||||
r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: "
|
r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: "
|
||||||
r'If the term "agent" is used in this section, it should be translated to "智能体". ',
|
r'If the term "agent" is used in this section, it should be translated to "智能体". ',
|
||||||
"Info": "PDF翻译中文,并重新编译PDF | 输入参数为路径",
|
"Info": "PDF翻译中文,并重新编译PDF | 输入参数为路径",
|
||||||
"Function": HotReload(PDF翻译中文并重新编译PDF)
|
"Function": None,
|
||||||
|
"Class": PDF_Localize
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -158,65 +158,72 @@ def arxiv_download(chatbot, history, txt, allow_cache=True):
|
|||||||
return extract_dst, arxiv_id
|
return extract_dst, arxiv_id
|
||||||
|
|
||||||
|
|
||||||
def pdf2tex_project(pdf_file_path, plugin_kwargs):
    """Convert a PDF file into an unpacked LaTeX project directory.

    Args:
        pdf_file_path: Path of the PDF file to convert.
        plugin_kwargs: Plugin options. ``plugin_kwargs["method"] == "MATHPIX"``
            selects the Mathpix API; any other value (or a missing key) hands
            the file to the Doc2X converter.

    Returns:
        The directory containing the extracted LaTeX project, or ``None`` when
        the Mathpix upload, conversion, or download fails.
    """
    # A missing "method" is treated like any non-MATHPIX value instead of
    # raising KeyError, so callers that never set the option still work.
    if plugin_kwargs.get("method") != "MATHPIX":
        from crazy_functions.pdf_fns.parse_pdf_via_doc2x import 解析PDF_DOC2X_转Latex
        return 解析PDF_DOC2X_转Latex(pdf_file_path)

    import zipfile

    # Mathpix API credentials
    app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY')
    headers = {"app_id": app_id, "app_key": app_key}

    # Step 1: Send PDF file for processing
    options = {
        "conversion_formats": {"tex.zip": True},
        "math_inline_delimiters": ["$", "$"],
        "rm_spaces": True
    }

    # The context manager closes the PDF handle once the upload has finished.
    with open(pdf_file_path, "rb") as pdf_file:
        response = requests.post(url="https://api.mathpix.com/v3/pdf",
                                 headers=headers,
                                 data={"options_json": json.dumps(options)},
                                 files={"file": pdf_file})

    if not response.ok:
        print(f"Error sending PDF for processing. Status code: {response.status_code}")
        return None

    pdf_id = response.json()["pdf_id"]
    print(f"PDF processing initiated. PDF ID: {pdf_id}")

    # Step 2: Check processing status
    while True:
        conversion_response = requests.get(f"https://api.mathpix.com/v3/pdf/{pdf_id}", headers=headers)
        conversion_data = conversion_response.json()
        status = conversion_data["status"]

        if status == "completed":
            print("PDF processing completed.")
            break
        if status == "error":
            # Stop polling: without this exit the loop would never terminate
            # once the service reports an error.
            print("Error occurred during processing.")
            return None
        print(f"Processing status: {status}")
        time.sleep(5)  # wait for a few seconds before checking again

    # Step 3: Save results to local files
    output_dir = os.path.join(os.path.dirname(pdf_file_path), 'mathpix_output')
    os.makedirs(output_dir, exist_ok=True)

    url = f"https://api.mathpix.com/v3/pdf/{pdf_id}.tex"
    response = requests.get(url, headers=headers)
    if not response.ok:
        # Do not write an error body to disk and then try to unzip it.
        print(f"Error downloading tex.zip. Status code: {response.status_code}")
        return None

    # File name without its last extension; remaining dots become underscores.
    file_name_wo_dot = '_'.join(os.path.basename(pdf_file_path).split('.')[:-1])
    output_name = f"{file_name_wo_dot}.tex.zip"
    output_path = os.path.join(output_dir, output_name)
    with open(output_path, "wb") as output_file:
        output_file.write(response.content)
    print(f"tex.zip file saved at: {output_path}")

    unzip_dir = os.path.join(output_dir, file_name_wo_dot)
    with zipfile.ZipFile(output_path, 'r') as zip_ref:
        zip_ref.extractall(unzip_dir)

    return unzip_dir
|
|
||||||
|
|
||||||
|
|
||||||
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= 插件主程序1 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= 插件主程序1 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||||
@@ -437,11 +444,20 @@ def PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, h
|
|||||||
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"不支持同时处理多个pdf文件: {txt}")
|
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"不支持同时处理多个pdf文件: {txt}")
|
||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
return
|
return
|
||||||
app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY')
|
|
||||||
if len(app_id) == 0 or len(app_key) == 0:
|
if plugin_kwargs.get("method", "") == 'MATHPIX':
|
||||||
report_exception(chatbot, history, a="缺失 MATHPIX_APPID 和 MATHPIX_APPKEY。", b=f"请配置 MATHPIX_APPID 和 MATHPIX_APPKEY")
|
app_id, app_key = get_conf('MATHPIX_APPID', 'MATHPIX_APPKEY')
|
||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
if len(app_id) == 0 or len(app_key) == 0:
|
||||||
return
|
report_exception(chatbot, history, a="缺失 MATHPIX_APPID 和 MATHPIX_APPKEY。", b=f"请配置 MATHPIX_APPID 和 MATHPIX_APPKEY")
|
||||||
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
|
return
|
||||||
|
if plugin_kwargs.get("method", "") == 'DOC2X':
|
||||||
|
app_id, app_key = "", ""
|
||||||
|
DOC2X_API_KEY = get_conf('DOC2X_API_KEY')
|
||||||
|
if len(DOC2X_API_KEY) == 0:
|
||||||
|
report_exception(chatbot, history, a="缺失 DOC2X_API_KEY。", b=f"请配置 DOC2X_API_KEY")
|
||||||
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
|
return
|
||||||
|
|
||||||
hash_tag = map_file_to_sha256(file_manifest[0])
|
hash_tag = map_file_to_sha256(file_manifest[0])
|
||||||
|
|
||||||
@@ -486,7 +502,7 @@ def PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, h
|
|||||||
# <-------------- convert pdf into tex ------------->
|
# <-------------- convert pdf into tex ------------->
|
||||||
chatbot.append([f"解析项目: {txt}", "正在将PDF转换为tex项目,请耐心等待..."])
|
chatbot.append([f"解析项目: {txt}", "正在将PDF转换为tex项目,请耐心等待..."])
|
||||||
yield from update_ui(chatbot=chatbot, history=history)
|
yield from update_ui(chatbot=chatbot, history=history)
|
||||||
project_folder = pdf2tex_project(file_manifest[0])
|
project_folder = pdf2tex_project(file_manifest[0], plugin_kwargs)
|
||||||
if project_folder is None:
|
if project_folder is None:
|
||||||
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"PDF转换为tex项目失败")
|
report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"PDF转换为tex项目失败")
|
||||||
yield from update_ui(chatbot=chatbot, history=history)
|
yield from update_ui(chatbot=chatbot, history=history)
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
|
|
||||||
from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF
|
from crazy_functions.Latex_Function import Latex翻译中文并重新编译PDF, PDF翻译中文并重新编译PDF
|
||||||
from crazy_functions.plugin_template.plugin_class_template import GptAcademicPluginTemplate, ArgProperty
|
from crazy_functions.plugin_template.plugin_class_template import GptAcademicPluginTemplate, ArgProperty
|
||||||
|
|
||||||
|
|
||||||
@@ -37,3 +37,37 @@ class Arxiv_Localize(GptAcademicPluginTemplate):
|
|||||||
|
|
||||||
if allow_cache == "从头执行": plugin_kwargs["advanced_arg"] = "--no-cache " + plugin_kwargs["advanced_arg"]
|
if allow_cache == "从头执行": plugin_kwargs["advanced_arg"] = "--no-cache " + plugin_kwargs["advanced_arg"]
|
||||||
yield from Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
|
yield from Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class PDF_Localize(GptAcademicPluginTemplate):
    """Plugin class that forwards to PDF翻译中文并重新编译PDF and defines its option menu."""

    def __init__(self):
        """
        Note: `execute` runs in a different thread, so be very careful when
        defining and using class variables.
        """

    def define_arg_selection_menu(self):
        """
        Define the plugin's secondary option menu.
        """
        # Main input, synced automatically from the input box.
        main_input = ArgProperty(
            title="PDF文件路径",
            description="未指定路径,请上传文件后,再点击该插件",
            default_value="",
            type="string",
        ).model_dump_json()

        # Advanced-argument input area, synced automatically.
        advanced_arg = ArgProperty(
            title="额外的翻译提示词",
            description=r"如果有必要, 请在此处给出自定义翻译命令, 解决部分词汇翻译不准确的问题。 "
                        r"例如当单词'agent'翻译不准确时, 请尝试把以下指令复制到高级参数区: "
                        r'If the term "agent" is used in this section, it should be translated to "智能体". ',
            default_value="",
            type="string",
        ).model_dump_json()

        # Dropdown choosing the PDF-to-LaTeX conversion backend.
        method = ArgProperty(
            title="采用哪种方法执行转换",
            options=["MATHPIX", "DOC2X"],
            default_value="DOC2X",
            description="无",
            type="dropdown",
        ).model_dump_json()

        return {
            "main_input": main_input,
            "advanced_arg": advanced_arg,
            "method": method,
        }

    # NOTE(review): declared without `self` — presumably invoked through the
    # class rather than an instance; confirm against the plugin dispatcher.
    def execute(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
        """
        Run the plugin.
        """
        yield from PDF翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
|
||||||
@@ -15,7 +15,7 @@ class PDF_Tran(GptAcademicPluginTemplate):
|
|||||||
"""
|
"""
|
||||||
gui_definition = {
|
gui_definition = {
|
||||||
"main_input":
|
"main_input":
|
||||||
ArgProperty(title="PDF文件路径", description="请上传文件后,再点击该插件", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步
|
ArgProperty(title="PDF文件路径", description="未指定路径,请上传文件后,再点击该插件", default_value="", type="string").model_dump_json(), # 主输入,自动从输入框同步
|
||||||
"additional_prompt":
|
"additional_prompt":
|
||||||
ArgProperty(title="额外提示词", description="例如:对专有名词、翻译语气等方面的要求", default_value="", type="string").model_dump_json(), # 高级参数输入区,自动同步
|
ArgProperty(title="额外提示词", description="例如:对专有名词、翻译语气等方面的要求", default_value="", type="string").model_dump_json(), # 高级参数输入区,自动同步
|
||||||
"pdf_parse_method":
|
"pdf_parse_method":
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from toolbox import get_log_folder, gen_time_str
|
from toolbox import get_log_folder, gen_time_str, get_conf
|
||||||
from toolbox import update_ui, promote_file_to_downloadzone
|
from toolbox import update_ui, promote_file_to_downloadzone
|
||||||
from toolbox import promote_file_to_downloadzone, extract_archive
|
from toolbox import promote_file_to_downloadzone, extract_archive
|
||||||
from toolbox import generate_file_link, zip_folder
|
from toolbox import generate_file_link, zip_folder
|
||||||
@@ -6,24 +6,75 @@ from crazy_functions.crazy_utils import get_files_from_everything
|
|||||||
from shared_utils.colorful import *
|
from shared_utils.colorful import *
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
def refresh_key(doc2x_api_key):
    """Exchange a Doc2X API key for a token via the token-refresh endpoint.

    Args:
        doc2x_api_key: Key sent as the Bearer credential of the refresh request.

    Returns:
        The value stored at ``data.token`` in the JSON body of a 200 response.

    Raises:
        RuntimeError: If the endpoint answers with any status other than 200.
    """
    import requests, json

    refresh_url = "https://api.doc2x.noedgeai.com/api/token/refresh"
    reply = requests.post(
        refresh_url,
        headers={"Authorization": "Bearer " + doc2x_api_key},
    )

    # Guard clause: report the status and body of any non-200 reply.
    if reply.status_code != 200:
        raise RuntimeError("[ERROR] status code: %d, body: %s" % (reply.status_code, reply.text))

    payload = json.loads(reply.content.decode("utf-8"))
    return payload['data']['token']
|
||||||
|
|
||||||
|
def 解析PDF_DOC2X_转Latex(pdf_file_path):
    """Convert a PDF into a LaTeX project through the Doc2X web API.

    The PDF is uploaded with OCR enabled, the resulting job is exported as a
    LaTeX zip archive, and the archive is extracted inside the
    ``pdf_ocr_latex`` log folder.

    Args:
        pdf_file_path: Path of the PDF file to upload.

    Returns:
        Path of the directory the LaTeX archive was extracted into.

    Raises:
        RuntimeError: If the upload or export request does not return status
            200, or if the upload response is not a stream of ``data: ``
            lines carrying at least one record.
    """
    import requests, json, os
    import zipfile

    DOC2X_API_KEY = get_conf('DOC2X_API_KEY')
    latex_dir = get_log_folder(plugin_name="pdf_ocr_latex")

    # Keys starting with 'sk-' are sent as-is to the v1 endpoint; any other
    # key is first exchanged through refresh_key and sent to the platform one.
    doc2x_api_key = DOC2X_API_KEY
    if doc2x_api_key.startswith('sk-'):
        url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
    else:
        doc2x_api_key = refresh_key(doc2x_api_key)
        url = "https://api.doc2x.noedgeai.com/api/platform/pdf"
    auth_headers = {"Authorization": "Bearer " + doc2x_api_key}

    # The context manager closes the PDF handle once the upload has finished.
    with open(pdf_file_path, "rb") as pdf_file:
        res = requests.post(
            url,
            files={"file": pdf_file},
            data={"ocr": "1"},
            headers=auth_headers
        )
    if res.status_code != 200:
        raise RuntimeError("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))

    # The body is read as newline-separated "data: <json>" records.
    res_json = []
    for z_decoded in res.content.decode("utf-8").split('\n'):
        if len(z_decoded) == 0: continue
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not z_decoded.startswith("data: "):
            raise RuntimeError("[ERROR] unexpected line in Doc2X response: %s" % z_decoded)
        res_json.append(json.loads(z_decoded[len("data: "):]))
    if not res_json:
        raise RuntimeError("[ERROR] empty response from Doc2X, body: %s" % res.text)

    request_id = res_json[0]['uuid']
    to = "latex" # latex, md, docx
    url = "https://api.doc2x.noedgeai.com/api/export" + "?request_id=" + request_id + "&to=" + to

    res = requests.get(url, headers=auth_headers)
    if res.status_code != 200:
        raise RuntimeError("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))

    # Take the timestamp once so the archive and its extraction directory
    # always share the same base name.
    time_tag = gen_time_str()
    latex_zip_path = os.path.join(latex_dir, time_tag + '.zip')
    latex_unzip_path = os.path.join(latex_dir, time_tag)
    with open(latex_zip_path, "wb") as f:
        f.write(res.content)

    with zipfile.ZipFile(latex_zip_path, 'r') as zip_ref:
        zip_ref.extractall(latex_unzip_path)

    return latex_unzip_path
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request):
|
def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request):
|
||||||
|
|
||||||
def refresh_key(doc2x_api_key):
|
|
||||||
import requests, json
|
|
||||||
url = "https://api.doc2x.noedgeai.com/api/token/refresh"
|
|
||||||
res = requests.post(
|
|
||||||
url,
|
|
||||||
headers={"Authorization": "Bearer " + doc2x_api_key}
|
|
||||||
)
|
|
||||||
res_json = []
|
|
||||||
if res.status_code == 200:
|
|
||||||
decoded = res.content.decode("utf-8")
|
|
||||||
res_json = json.loads(decoded)
|
|
||||||
doc2x_api_key = res_json['data']['token']
|
|
||||||
else:
|
|
||||||
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
|
|
||||||
return doc2x_api_key
|
|
||||||
|
|
||||||
def pdf2markdown(filepath):
|
def pdf2markdown(filepath):
|
||||||
import requests, json, os
|
import requests, json, os
|
||||||
|
|||||||
在新工单中引用
屏蔽一个用户