镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-06 14:36:48 +00:00
update doc2x functions
这个提交包含在:
@@ -4,7 +4,9 @@ from toolbox import promote_file_to_downloadzone, extract_archive
|
|||||||
from toolbox import generate_file_link, zip_folder
|
from toolbox import generate_file_link, zip_folder
|
||||||
from crazy_functions.crazy_utils import get_files_from_everything
|
from crazy_functions.crazy_utils import get_files_from_everything
|
||||||
from shared_utils.colorful import *
|
from shared_utils.colorful import *
|
||||||
|
from loguru import logger
|
||||||
import os
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
def refresh_key(doc2x_api_key):
|
def refresh_key(doc2x_api_key):
|
||||||
import requests, json
|
import requests, json
|
||||||
@@ -22,105 +24,140 @@ def refresh_key(doc2x_api_key):
|
|||||||
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
|
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
|
||||||
return doc2x_api_key
|
return doc2x_api_key
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def 解析PDF_DOC2X_转Latex(pdf_file_path):
|
def 解析PDF_DOC2X_转Latex(pdf_file_path):
|
||||||
|
zip_file_path, unzipped_folder = 解析PDF_DOC2X(pdf_file_path, format='tex')
|
||||||
|
return unzipped_folder
|
||||||
|
|
||||||
|
|
||||||
|
def 解析PDF_DOC2X(pdf_file_path, format='tex'):
|
||||||
|
"""
|
||||||
|
format: 'tex', 'md', 'docx'
|
||||||
|
"""
|
||||||
import requests, json, os
|
import requests, json, os
|
||||||
DOC2X_API_KEY = get_conf('DOC2X_API_KEY')
|
DOC2X_API_KEY = get_conf('DOC2X_API_KEY')
|
||||||
latex_dir = get_log_folder(plugin_name="pdf_ocr_latex")
|
latex_dir = get_log_folder(plugin_name="pdf_ocr_latex")
|
||||||
|
markdown_dir = get_log_folder(plugin_name="pdf_ocr")
|
||||||
doc2x_api_key = DOC2X_API_KEY
|
doc2x_api_key = DOC2X_API_KEY
|
||||||
if doc2x_api_key.startswith('sk-'):
|
|
||||||
url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
|
|
||||||
else:
|
|
||||||
doc2x_api_key = refresh_key(doc2x_api_key)
|
|
||||||
url = "https://api.doc2x.noedgeai.com/api/platform/pdf"
|
|
||||||
|
|
||||||
|
|
||||||
|
# < ------ 第1步:上传 ------ >
|
||||||
|
logger.info("Doc2x 第1步:上传")
|
||||||
|
with open(pdf_file_path, 'rb') as file:
|
||||||
res = requests.post(
|
res = requests.post(
|
||||||
url,
|
"https://v2.doc2x.noedgeai.com/api/v2/parse/pdf",
|
||||||
files={"file": open(pdf_file_path, "rb")},
|
headers={"Authorization": "Bearer " + doc2x_api_key},
|
||||||
data={"ocr": "1"},
|
data=file
|
||||||
headers={"Authorization": "Bearer " + doc2x_api_key}
|
|
||||||
)
|
)
|
||||||
res_json = []
|
# res_json = []
|
||||||
if res.status_code == 200:
|
if res.status_code == 200:
|
||||||
decoded = res.content.decode("utf-8")
|
res_json = res.json()
|
||||||
for z_decoded in decoded.split('\n'):
|
|
||||||
if len(z_decoded) == 0: continue
|
|
||||||
assert z_decoded.startswith("data: ")
|
|
||||||
z_decoded = z_decoded[len("data: "):]
|
|
||||||
decoded_json = json.loads(z_decoded)
|
|
||||||
res_json.append(decoded_json)
|
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
|
raise RuntimeError(f"Doc2x return an error: {res.json()}")
|
||||||
|
uuid = res_json['data']['uid']
|
||||||
|
|
||||||
uuid = res_json[0]['uuid']
|
# < ------ 第2步:轮询等待 ------ >
|
||||||
to = "latex" # latex, md, docx
|
logger.info("Doc2x 第2步:轮询等待")
|
||||||
url = "https://api.doc2x.noedgeai.com/api/export"+"?request_id="+uuid+"&to="+to
|
params = {'uid': uuid}
|
||||||
|
while True:
|
||||||
|
res = requests.get(
|
||||||
|
'https://v2.doc2x.noedgeai.com/api/v2/parse/status',
|
||||||
|
headers={"Authorization": "Bearer " + doc2x_api_key},
|
||||||
|
params=params
|
||||||
|
)
|
||||||
|
res_json = res.json()
|
||||||
|
if res_json['data']['status'] == "success":
|
||||||
|
break
|
||||||
|
elif res_json['data']['status'] == "processing":
|
||||||
|
time.sleep(3)
|
||||||
|
logger.info(f"Doc2x is processing at {res_json['data']['progress']}%")
|
||||||
|
elif res_json['data']['status'] == "failed":
|
||||||
|
raise RuntimeError(f"Doc2x return an error: {res_json}")
|
||||||
|
|
||||||
res = requests.get(url, headers={"Authorization": "Bearer " + doc2x_api_key})
|
|
||||||
latex_zip_path = os.path.join(latex_dir, gen_time_str() + '.zip')
|
# < ------ 第3步:提交转化 ------ >
|
||||||
latex_unzip_path = os.path.join(latex_dir, gen_time_str())
|
logger.info("Doc2x 第3步:提交转化")
|
||||||
|
data = {
|
||||||
|
"uid": uuid,
|
||||||
|
"to": format,
|
||||||
|
"formula_mode": "dollar",
|
||||||
|
"filename": "output"
|
||||||
|
}
|
||||||
|
res = requests.post(
|
||||||
|
'https://v2.doc2x.noedgeai.com/api/v2/convert/parse',
|
||||||
|
headers={"Authorization": "Bearer " + doc2x_api_key},
|
||||||
|
json=data
|
||||||
|
)
|
||||||
if res.status_code == 200:
|
if res.status_code == 200:
|
||||||
with open(latex_zip_path, "wb") as f: f.write(res.content)
|
res_json = res.json()
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
|
raise RuntimeError(f"Doc2x return an error: {res.json()}")
|
||||||
|
|
||||||
|
|
||||||
|
# < ------ 第4步:等待结果 ------ >
|
||||||
|
logger.info("Doc2x 第4步:等待结果")
|
||||||
|
params = {'uid': uuid}
|
||||||
|
while True:
|
||||||
|
res = requests.get(
|
||||||
|
'https://v2.doc2x.noedgeai.com/api/v2/convert/parse/result',
|
||||||
|
headers={"Authorization": "Bearer " + doc2x_api_key},
|
||||||
|
params=params
|
||||||
|
)
|
||||||
|
res_json = res.json()
|
||||||
|
if res_json['data']['status'] == "success":
|
||||||
|
break
|
||||||
|
elif res_json['data']['status'] == "processing":
|
||||||
|
time.sleep(3)
|
||||||
|
logger.info(f"Doc2x still processing")
|
||||||
|
elif res_json['data']['status'] == "failed":
|
||||||
|
raise RuntimeError(f"Doc2x return an error: {res_json}")
|
||||||
|
|
||||||
|
|
||||||
|
# < ------ 第5步:最后的处理 ------ >
|
||||||
|
logger.info("Doc2x 第5步:最后的处理")
|
||||||
|
|
||||||
|
if format=='tex':
|
||||||
|
target_path = latex_dir
|
||||||
|
if format=='md':
|
||||||
|
target_path = markdown_dir
|
||||||
|
os.makedirs(target_path, exist_ok=True)
|
||||||
|
|
||||||
|
max_attempt = 3
|
||||||
|
# < ------ 下载 ------ >
|
||||||
|
for attempt in range(max_attempt):
|
||||||
|
try:
|
||||||
|
result_url = res_json['data']['url']
|
||||||
|
res = requests.get(result_url)
|
||||||
|
zip_path = os.path.join(target_path, gen_time_str() + '.zip')
|
||||||
|
unzip_path = os.path.join(target_path, gen_time_str())
|
||||||
|
if res.status_code == 200:
|
||||||
|
with open(zip_path, "wb") as f: f.write(res.content)
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"Doc2x return an error: {res.json()}")
|
||||||
|
except Exception as e:
|
||||||
|
if attempt < max_attempt - 1:
|
||||||
|
logger.error(f"Failed to download latex file, retrying... {e}")
|
||||||
|
time.sleep(3)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
raise e
|
||||||
|
|
||||||
|
# < ------ 解压 ------ >
|
||||||
import zipfile
|
import zipfile
|
||||||
with zipfile.ZipFile(latex_zip_path, 'r') as zip_ref:
|
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
||||||
zip_ref.extractall(latex_unzip_path)
|
zip_ref.extractall(unzip_path)
|
||||||
|
return zip_path, unzip_path
|
||||||
|
|
||||||
return latex_unzip_path
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request):
|
def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request):
|
||||||
|
|
||||||
|
|
||||||
def pdf2markdown(filepath):
|
def pdf2markdown(filepath):
|
||||||
import requests, json, os
|
chatbot.append((None, f"Doc2x 解析中"))
|
||||||
markdown_dir = get_log_folder(plugin_name="pdf_ocr")
|
|
||||||
doc2x_api_key = DOC2X_API_KEY
|
|
||||||
if doc2x_api_key.startswith('sk-'):
|
|
||||||
url = "https://api.doc2x.noedgeai.com/api/v1/pdf"
|
|
||||||
else:
|
|
||||||
doc2x_api_key = refresh_key(doc2x_api_key)
|
|
||||||
url = "https://api.doc2x.noedgeai.com/api/platform/pdf"
|
|
||||||
|
|
||||||
chatbot.append((None, "加载PDF文件,发送至DOC2X解析..."))
|
|
||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
|
|
||||||
res = requests.post(
|
md_zip_path, unzipped_folder = 解析PDF_DOC2X(filepath, format='md')
|
||||||
url,
|
|
||||||
files={"file": open(filepath, "rb")},
|
|
||||||
data={"ocr": "1"},
|
|
||||||
headers={"Authorization": "Bearer " + doc2x_api_key}
|
|
||||||
)
|
|
||||||
res_json = []
|
|
||||||
if res.status_code == 200:
|
|
||||||
decoded = res.content.decode("utf-8")
|
|
||||||
for z_decoded in decoded.split('\n'):
|
|
||||||
if len(z_decoded) == 0: continue
|
|
||||||
assert z_decoded.startswith("data: ")
|
|
||||||
z_decoded = z_decoded[len("data: "):]
|
|
||||||
decoded_json = json.loads(z_decoded)
|
|
||||||
res_json.append(decoded_json)
|
|
||||||
if 'limit exceeded' in decoded_json.get('status', ''):
|
|
||||||
raise RuntimeError("Doc2x API 页数受限,请联系 Doc2x 方面,并更换新的 API 秘钥。")
|
|
||||||
else:
|
|
||||||
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
|
|
||||||
uuid = res_json[0]['uuid']
|
|
||||||
to = "md" # latex, md, docx
|
|
||||||
url = "https://api.doc2x.noedgeai.com/api/export"+"?request_id="+uuid+"&to="+to
|
|
||||||
|
|
||||||
chatbot.append((None, f"读取解析: {url} ..."))
|
|
||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
|
||||||
|
|
||||||
res = requests.get(url, headers={"Authorization": "Bearer " + doc2x_api_key})
|
|
||||||
md_zip_path = os.path.join(markdown_dir, gen_time_str() + '.zip')
|
|
||||||
if res.status_code == 200:
|
|
||||||
with open(md_zip_path, "wb") as f: f.write(res.content)
|
|
||||||
else:
|
|
||||||
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
|
|
||||||
promote_file_to_downloadzone(md_zip_path, chatbot=chatbot)
|
promote_file_to_downloadzone(md_zip_path, chatbot=chatbot)
|
||||||
chatbot.append((None, f"完成解析 {md_zip_path} ..."))
|
chatbot.append((None, f"完成解析 {md_zip_path} ..."))
|
||||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||||
|
|||||||
7
tests/test_doc2x.py
普通文件
7
tests/test_doc2x.py
普通文件
@@ -0,0 +1,7 @@
|
|||||||
|
import init_test
|
||||||
|
|
||||||
|
from crazy_functions.pdf_fns.parse_pdf_via_doc2x import 解析PDF_DOC2X_转Latex
|
||||||
|
|
||||||
|
# 解析PDF_DOC2X_转Latex("gpt_log/arxiv_cache_old/2410.10819/workfolder/merge.pdf")
|
||||||
|
# 解析PDF_DOC2X_转Latex("gpt_log/arxiv_cache_ooo/2410.07095/workfolder/merge.pdf")
|
||||||
|
解析PDF_DOC2X_转Latex("2410.11190v2.pdf")
|
||||||
在新工单中引用
屏蔽一个用户