follow master

2025-12-08 23:46:48 +00:00 · 2023-09-15 17:25:21 +08:00
--- a/crazy_functions/crazy_utils.py
+++ b/crazy_functions/crazy_utils.py
@@ -1,5 +1,7 @@
-from toolbox import update_ui, get_conf, trimmed_format_exc
+from toolbox import update_ui, get_conf, trimmed_format_exc, get_log_folder
 import threading
+import os
+import logging

 def input_clipping(inputs, history, max_token_limit):
    import numpy as np
@@ -469,14 +471,16 @@ def read_and_clean_pdf_text(fp):
                    '- ', '') for t in text_areas['blocks'] if 'lines' in t]
                
        ############################## <第 2 步，获取正文主字体> ##################################
-        fsize_statiscs = {}
-        for span in meta_span:
-            if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
-            fsize_statiscs[span[1]] += span[2]
-        main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
-        if REMOVE_FOOT_NOTE:
-            give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
-
+        try:
+            fsize_statiscs = {}
+            for span in meta_span:
+                if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
+                fsize_statiscs[span[1]] += span[2]
+            main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
+            if REMOVE_FOOT_NOTE:
+                give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
+        except:
+            raise RuntimeError(f'抱歉, 我们暂时无法解析此PDF文档: {fp}。')
        ############################## <第 3 步，切分和重新整合> ##################################
        mega_sec = []
        sec = []
@@ -703,49 +707,96 @@ class knowledge_archive_interface():
        )
        self.threadLock.release()
        return resp, prompt
+    
+@Singleton
+class nougat_interface():
+    def __init__(self):
+        self.threadLock = threading.Lock()

-def try_install_deps(deps):
+    def nougat_with_timeout(self, command, cwd, timeout=3600):
+        import subprocess
+        logging.info(f'正在执行命令 {command}')
+        process = subprocess.Popen(command, shell=True, cwd=cwd)
+        try:
+            stdout, stderr = process.communicate(timeout=timeout)
+        except subprocess.TimeoutExpired:
+            process.kill()
+            stdout, stderr = process.communicate()
+            print("Process timed out!")
+            return False
+        return True
+
+
+    def NOUGAT_parse_pdf(self, fp, chatbot, history):
+        from toolbox import update_ui_lastest_msg
+
+        yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度：正在排队, 等待线程锁...", 
+                                         chatbot=chatbot, history=history, delay=0)
+        self.threadLock.acquire()
+        import glob, threading, os
+        from toolbox import get_log_folder, gen_time_str
+        dst = os.path.join(get_log_folder(plugin_name='nougat'), gen_time_str())
+        os.makedirs(dst)
+
+        yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度：正在加载NOUGAT... （提示：首次运行需要花费较长时间下载NOUGAT参数）", 
+                                         chatbot=chatbot, history=history, delay=0)
+        self.nougat_with_timeout(f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}"', os.getcwd(), timeout=3600)
+        res = glob.glob(os.path.join(dst,'*.mmd'))
+        if len(res) == 0:
+            self.threadLock.release()
+            raise RuntimeError("Nougat解析论文失败。")
+        self.threadLock.release()
+        return res[0]
+
+
+
+
+def try_install_deps(deps, reload_m=[]):
+    import subprocess, sys, importlib
    for dep in deps:
-        import subprocess, sys
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--user', dep])
+    import site
+    importlib.reload(site)
+    for m in reload_m:
+        importlib.reload(__import__(m))


-class construct_html():
-    def __init__(self) -> None:
-        self.css = """
+HTML_CSS = """
 .row {
  display: flex;
  flex-wrap: wrap;
 }
-
 .column {
  flex: 1;
  padding: 10px;
 }
-
 .table-header {
  font-weight: bold;
  border-bottom: 1px solid black;
 }
-
 .table-row {
  border-bottom: 1px solid lightgray;
 }
-
 .table-cell {
  padding: 5px;
 }
-        """
-        self.html_string = f'<!DOCTYPE html><head><meta charset="utf-8"><title>翻译结果</title><style>{self.css}</style></head>'
+"""

-
-    def add_row(self, a, b):
-        tmp = """
+TABLE_CSS = """
 <div class="row table-row">
    <div class="column table-cell">REPLACE_A</div>
    <div class="column table-cell">REPLACE_B</div>
 </div>
-        """
+"""
+
+class construct_html():
+    def __init__(self) -> None:
+        self.css = HTML_CSS
+        self.html_string = f'<!DOCTYPE html><head><meta charset="utf-8"><title>翻译结果</title><style>{self.css}</style></head>'
+
+
+    def add_row(self, a, b):
+        tmp = TABLE_CSS
        from toolbox import markdown_convertion
        tmp = tmp.replace('REPLACE_A', markdown_convertion(a))
        tmp = tmp.replace('REPLACE_B', markdown_convertion(b))
@@ -753,6 +804,6 @@ class construct_html():


    def save_file(self, file_name):
-        with open(f'./gpt_log/{file_name}', 'w', encoding='utf8') as f:
+        with open(os.path.join(get_log_folder(), file_name), 'w', encoding='utf8') as f:
            f.write(self.html_string.encode('utf-8', 'ignore').decode())
-
+        return os.path.join(get_log_folder(), file_name)