diff --git a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py index bae4d951..d64aa91c 100644 --- a/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py +++ b/crazy_functions/pdf_fns/parse_pdf_via_doc2x.py @@ -104,6 +104,8 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha z_decoded = z_decoded[len("data: "):] decoded_json = json.loads(z_decoded) res_json.append(decoded_json) + if 'limit exceeded' in decoded_json.get('status', ''): + raise RuntimeError("Doc2x API 页数受限,请联系 Doc2x 方面,并更换新的 API 秘钥。") else: raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text))) uuid = res_json[0]['uuid'] @@ -159,8 +161,8 @@ def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, cha file_name = '在线预览翻译(原文)' + gen_time_str() + '.html' preview_fp = os.path.join(ex_folder, file_name) from shared_utils.advanced_markdown_format import markdown_convertion_for_file - # with open(generated_fp, "r", encoding="utf-8") as f: - # md = f.read() + with open(generated_fp, "r", encoding="utf-8") as f: + md = f.read() # # Markdown中使用不标准的表格,需要在表格前加上一个emoji,以便公式渲染 # md = re.sub(r'^
| ') > 0 or txt[:(index+1)].find(' | ') > 0: + print('公式之中出现了异常换行 unexpect new line in equation') + single_stack_height = 0 + txt_result += ' $' + continue + + if double_stack_height > 0: + if txt[:(index+1)].find('\n\n') > 0: + print('公式之中出现了异常换行 unexpect new line in equation') + double_stack_height = 0 + txt_result += '$$' + continue + is_double = (txt[index+1] == '$') if is_double: if single_stack_height != 0: @@ -239,11 +255,15 @@ def fix_dollar_sticking_bug(txt): txt = txt[(index+2):] else: if double_stack_height != 0: + print(txt[:(index)]) print('Fatal') if single_stack_height == 0: single_stack_height = 1 else: single_stack_height = 0 + print(txt[:(index)]) + import time + # time.sleep(0.1) txt_result += txt[:(index+1)] txt = txt[(index+1):] break @@ -274,9 +294,10 @@ def markdown_convertion_for_file(txt): find_equation_pattern = r'' pattern = "|".join([pattern for pattern, property in mathpatterns.items() if not property["allow_multi_lines"]]) pattern = re.compile(pattern, flags=re.ASCII) - convert_stage_2 = pattern.sub(repl_fn, convert_stage_1) + convert_stage_3 = pattern.sub(repl_fn, convert_stage_2) - convert_stage_4 = markdown_bug_hunt(convert_stage_2) + convert_stage_4 = markdown_bug_hunt(convert_stage_3) # 2. convert to rendered equation convert_stage_5, n = re.subn(