论文翻译只输出中文

这个提交包含在:
kaixindelele
2023-09-16 22:09:44 +08:00
父节点 760ff1840c
当前提交 471a369bb8
共有 2 个文件被更改,包括 143 次插入和 33 次删除

查看文件

@@ -47,7 +47,7 @@ def markdown_to_dict(article_content):
@CatchException
def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port, only_chinese=True):
disable_auto_promotion(chatbot)
# 基本信息:功能、贡献者
@@ -84,12 +84,12 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
return
# 开始正式执行任务
yield from 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
yield from 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, only_chinese)
def 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
def 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, only_chinese=True):
import copy
import tiktoken
TOKEN_LIMIT_PER_FRAGMENT = 1280
@@ -100,7 +100,7 @@ def 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwa
nougat_handle = nougat_interface()
for index, fp in enumerate(file_manifest):
chatbot.append(["当前进度:", f"正在解析论文,请稍候。第一次运行时,需要花费较长时间下载NOUGAT参数"]); yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
fpp = yield from nougat_handle.NOUGAT_parse_pdf(fp, chatbot, history)
fpp = nougat_handle.NOUGAT_parse_pdf(fp)
with open(fpp, 'r', encoding='utf8') as f:
article_content = f.readlines()
@@ -170,19 +170,52 @@ def 解析PDF_基于NOUGAT(file_manifest, project_folder, llm_kwargs, plugin_kwa
sys_prompt_array=[
"请你作为一个学术翻译,负责把学术论文准确翻译成中文。注意文章中的每一句话都要翻译。" for _ in inputs_array],
)
res_path = write_history_to_file(meta + ["# Meta Translation" , paper_meta_info] + gpt_response_collection, file_basename=None, file_fullname=None)
promote_file_to_downloadzone(res_path, rename_file=os.path.basename(fp)+'.md', chatbot=chatbot)
generated_conclusion_files.append(res_path)
if only_chinese:
# 直接提取出翻译的内容,然后保存下去:
chinese_list = []
# 记录当前的大章节标题:
last_section_name = ""
for index, value in enumerate(gpt_response_collection):
# 先挑选偶数序列号:
if index % 2 != 0:
# 先提取当前英文标题:
cur_section_name = gpt_response_collection[index-1].split('\n')[0].split(" Part")[0]
# 如果index是1的话,则直接使用first section name
if cur_section_name != last_section_name:
cur_value = cur_section_name + '\n'
last_section_name = copy.deepcopy(cur_section_name)
else:
cur_value = ""
# 再判断翻译是否错误,如果错误,则直接贴原来的英文:
if "The OpenAI account associated" in value:
cur_value += gpt_response_collection[index-1]
else:
# 再做一个小修改重新修改当前part的标题,默认用英文的
cur_value += value
chinese_list.append(cur_value)
res_path = write_history_to_file(meta + ["# Meta Translation" , paper_meta_info] + chinese_list, file_basename=None, file_fullname=None)
promote_file_to_downloadzone(res_path, rename_file=os.path.basename(fp)+'.md', chatbot=chatbot)
generated_conclusion_files.append(res_path)
else:
res_path = write_history_to_file(meta + ["# Meta Translation" , paper_meta_info] + gpt_response_collection, file_basename=None, file_fullname=None)
promote_file_to_downloadzone(res_path, rename_file=os.path.basename(fp)+'.md', chatbot=chatbot)
generated_conclusion_files.append(res_path)
ch = construct_html()
orig = ""
trans = ""
gpt_response_collection_html = copy.deepcopy(gpt_response_collection)
# 叠加HTML文件
for i,k in enumerate(gpt_response_collection_html):
if i%2==0:
gpt_response_collection_html[i] = inputs_show_user_array[i//2]
else:
gpt_response_collection_html[i] = gpt_response_collection_html[i]
# 先提取当前英文标题:
cur_section_name = gpt_response_collection[i-1].split('\n')[0].split(" Part")[0]
cur_value = cur_section_name + "\n" + gpt_response_collection_html[i]
gpt_response_collection_html[i] = cur_value
final = ["", "", "一、论文概况", "", "Abstract", paper_meta_info, "二、论文翻译", ""]
final.extend(gpt_response_collection_html)