version 3.6

2025-12-07 23:16:48 +00:00 · 2023-11-20 01:17:59 +08:00
--- a/crazy_functions/latex_fns/latex_actions.py
+++ b/crazy_functions/latex_fns/latex_actions.py
@@ -1,9 +1,10 @@
 from toolbox import update_ui, update_ui_lastest_msg, get_log_folder
-from toolbox import zip_folder, objdump, objload, promote_file_to_downloadzone
+from toolbox import get_conf, objdump, objload, promote_file_to_downloadzone
 from .latex_toolbox import PRESERVE, TRANSFORM
 from .latex_toolbox import set_forbidden_text, set_forbidden_text_begin_end, set_forbidden_text_careful_brace
 from .latex_toolbox import reverse_forbidden_text_careful_brace, reverse_forbidden_text, convert_to_linklist, post_process
 from .latex_toolbox import fix_content, find_main_tex_file, merge_tex_files, compile_latex_with_timeout
+from .latex_toolbox import find_title_and_abs

 import os, shutil
 import re
@@ -90,7 +91,18 @@ class LatexPaperSplit():
            "项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。"
        # 请您不要删除或修改这行警告，除非您是论文的原作者（如果您是论文原作者，欢迎加REAME中的QQ联系开发者）
        self.msg_declare = "为了防止大语言模型的意外谬误产生扩散影响，禁止移除或修改此警告。}}\\\\" 
+        self.title = "unknown"
+        self.abstract = "unknown"

+    def read_title_and_abstract(self, txt):
+        try:
+            title, abstract = find_title_and_abs(txt)
+            if title is not None: 
+                self.title = title.replace('\n', ' ').replace('\\\\', ' ').replace('  ', '').replace('  ', '')
+            if abstract is not None: 
+                self.abstract = abstract.replace('\n', ' ').replace('\\\\', ' ').replace('  ', '').replace('  ', '')
+        except:
+            pass

    def merge_result(self, arr, mode, msg, buggy_lines=[], buggy_line_surgery_n_lines=10):
        """
@@ -165,7 +177,7 @@ class LatexPaperFileGroup():
        self.sp_file_tag = []

        # count_token
-        from request_llm.bridge_all import model_info
+        from request_llms.bridge_all import model_info
        enc = model_info["gpt-3.5-turbo"]['tokenizer']
        def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
        self.get_token_num = get_token_num
@@ -234,8 +246,8 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin
    chatbot.append((f"Latex文件融合完成", f'[Local Message] 正在精细切分latex文件，这需要一段时间计算，文档越长耗时越长，请耐心等待。'))
    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
    lps = LatexPaperSplit()
+    lps.read_title_and_abstract(merged_content)
    res = lps.split(merged_content, project_folder, opts) # 消耗时间的函数
-
    #  <-------- 拆分过长的latex片段 ----------> 
    pfg = LatexPaperFileGroup()
    for index, r in enumerate(res):
@@ -256,12 +268,19 @@ def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin

    else:
        #  <-------- gpt 多线程请求 ----------> 
+        history_array = [[""] for _ in range(n_split)]
+        # LATEX_EXPERIMENTAL, = get_conf('LATEX_EXPERIMENTAL')
+        # if LATEX_EXPERIMENTAL:
+        #     paper_meta = f"The paper you processing is `{lps.title}`, a part of the abstraction is `{lps.abstract}`"
+        #     paper_meta_max_len = 888
+        #     history_array = [[ paper_meta[:paper_meta_max_len] + '...',  "Understand, what should I do?"] for _ in range(n_split)]
+
        gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
            inputs_array=inputs_array,
            inputs_show_user_array=inputs_show_user_array,
            llm_kwargs=llm_kwargs,
            chatbot=chatbot,
-            history_array=[[""] for _ in range(n_split)],
+            history_array=history_array,
            sys_prompt_array=sys_prompt_array,
            # max_workers=5,  # 并行任务数量限制, 最多同时执行5个, 其他的排队等待
            scroller_max_len = 40
@@ -423,7 +442,7 @@ def write_html(sp_file_contents, sp_file_result, chatbot, project_folder):
    # write html
    try:
        import shutil
-        from ..crazy_utils import construct_html
+        from crazy_functions.pdf_fns.report_gen_html import construct_html
        from toolbox import gen_time_str
        ch = construct_html() 
        orig = ""
--- a/crazy_functions/latex_fns/latex_toolbox.py
+++ b/crazy_functions/latex_fns/latex_toolbox.py
@@ -308,13 +308,51 @@ def merge_tex_files_(project_foler, main_file, mode):
        fp = os.path.join(project_foler, f)
        fp_ = find_tex_file_ignore_case(fp)
        if fp_:
-            with open(fp_, 'r', encoding='utf-8', errors='replace') as fx: c = fx.read()
+            try:
+                with open(fp_, 'r', encoding='utf-8', errors='replace') as fx: c = fx.read()
+            except:
+                c = f"\n\nWarning from GPT-Academic: LaTex source file is missing!\n\n"
        else:
            raise RuntimeError(f'找不到{fp}，Tex源文件缺失！')
        c = merge_tex_files_(project_foler, c, mode)
        main_file = main_file[:s.span()[0]] + c + main_file[s.span()[1]:]
    return main_file

+
+def find_title_and_abs(main_file):
+
+    def extract_abstract_1(text):
+        pattern = r"\\abstract\{(.*?)\}"
+        match = re.search(pattern, text, re.DOTALL)
+        if match:
+            return match.group(1)
+        else:
+            return None
+
+    def extract_abstract_2(text):
+        pattern = r"\\begin\{abstract\}(.*?)\\end\{abstract\}"
+        match = re.search(pattern, text, re.DOTALL)
+        if match:
+            return match.group(1)
+        else:
+            return None
+
+    def extract_title(string):
+        pattern = r"\\title\{(.*?)\}"
+        match = re.search(pattern, string, re.DOTALL)
+
+        if match:
+            return match.group(1)
+        else:
+            return None
+
+    abstract = extract_abstract_1(main_file)
+    if abstract is None:
+        abstract = extract_abstract_2(main_file)
+    title = extract_title(main_file)
+    return title, abstract
+
+
 def merge_tex_files(project_foler, main_file, mode):
    """
    Merge Tex project recrusively
@@ -342,10 +380,41 @@ def merge_tex_files(project_foler, main_file, mode):
        pattern_opt2 = re.compile(r"\\abstract\{(.*?)\}", flags=re.DOTALL)
        match_opt1 = pattern_opt1.search(main_file)
        match_opt2 = pattern_opt2.search(main_file)
+        if (match_opt1 is None) and (match_opt2 is None):
+            # "Cannot find paper abstract section!"
+            main_file = insert_abstract(main_file)
+        match_opt1 = pattern_opt1.search(main_file)
+        match_opt2 = pattern_opt2.search(main_file)
        assert (match_opt1 is not None) or (match_opt2 is not None), "Cannot find paper abstract section!"
    return main_file


+insert_missing_abs_str = r"""
+\begin{abstract}
+The GPT-Academic program cannot find abstract section in this paper.
+\end{abstract}
+"""
+
+def insert_abstract(tex_content):
+    if "\\maketitle" in tex_content:
+        # find the position of "\maketitle"
+        find_index = tex_content.index("\\maketitle")
+        # find the nearest ending line
+        end_line_index = tex_content.find("\n", find_index)
+        # insert "abs_str" on the next line
+        modified_tex = tex_content[:end_line_index+1] + '\n\n' + insert_missing_abs_str + '\n\n' + tex_content[end_line_index+1:]
+        return modified_tex
+    elif r"\begin{document}" in tex_content:
+        # find the position of "\maketitle"
+        find_index = tex_content.index(r"\begin{document}")
+        # find the nearest ending line
+        end_line_index = tex_content.find("\n", find_index)
+        # insert "abs_str" on the next line
+        modified_tex = tex_content[:end_line_index+1] + '\n\n' + insert_missing_abs_str + '\n\n' + tex_content[end_line_index+1:]
+        return modified_tex
+    else:
+        return tex_content
+
 """
 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
 Post process