改善chatpdf的功能

2025-12-06 22:46:48 +00:00 · 2023-04-13 11:08:53 +08:00
--- a/crazy_functions/crazy_utils.py
+++ b/crazy_functions/crazy_utils.py
@@ -360,3 +360,171 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
            # 这个中文的句号是故意的，作为一个标识而存在
            res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False)
            return [r.replace('。\n', '.') for r in res]
+
+
+
+def read_and_clean_pdf_text(fp):
+    """
+    Read a PDF with PyMuPDF (`fitz`) and rebuild its text as cleaned sections.
+
+    Heuristics: every line is tagged with its dominant font size; lines at or
+    below 95% of the body font size are dropped (footnote removal); a font
+    change to a size above the body font opens a new section prefixed "# ".
+
+    Args:
+        fp: Path of the PDF file to read.
+
+    Returns:
+        ``(meta_txt, page_one_meta)``. ``meta_txt`` is the cleaned text as one
+        string ("" when the document has no text spans). ``page_one_meta`` is
+        the list of text-block strings of the first page ([] without pages).
+
+    Cleaning applied to the rebuilt sections: sections under 100 characters
+    become a newline, a section starting with a lowercase word is merged into
+    the previous one, repeated newlines are collapsed and finally doubled.
+    """
+    import fitz
+    import re
+    from colorful import print亮黄, print亮绿
+    fc = 0  # meta_line field: text of the line
+    fs = 1  # meta_line field: dominant font size of the line
+    fb = 2  # meta_line field: bounding box of the line
+    REMOVE_FOOT_NOTE = True  # drop lines set in a font smaller than the body font
+    REMOVE_FOOT_FFSIZE_PERCENT = 0.95  # "smaller" means <= 95% of the body size
+
+    def primary_ffsize(l):
+        """Return the font size that covers the most characters of line `l`."""
+        fsize_statiscs = {}
+        for wtf in l['spans']:
+            fsize_statiscs[wtf['size']] = fsize_statiscs.get(wtf['size'], 0) + len(wtf['text'])
+        return max(fsize_statiscs, key=fsize_statiscs.get)
+
+    def ffsize_same(a, b):
+        """Return True when two font sizes differ by less than 2%."""
+        # `a == b` first: two zero sizes would otherwise divide by zero.
+        return a == b or abs((a - b) / max(a, b)) < 0.02
+
+    def block_text(t):
+        """Join the lines of text block `t` with spaces and drop every '- '."""
+        lines = ["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]
+        return " ".join(lines).replace('- ', '')
+
+    page_one_meta = []  # stays empty when the document has no pages
+    with fitz.open(fp) as doc:
+        meta_line = []  # one [text, font size, bbox, raw line dict] per line
+        meta_span = []  # one [text, font size, text length] per span
+        for index, page in enumerate(doc):
+            text_areas = page.get_text("dict")  # text layout of the page
+            for t in text_areas['blocks']:
+                if 'lines' not in t:
+                    continue  # block carries no text lines
+                for l in t['lines']:
+                    if not l['spans']:
+                        continue  # no span means no font size to pick from
+                    txt_line = "".join([wtf['text'] for wtf in l['spans']])
+                    meta_line.append([txt_line, primary_ffsize(l), l['bbox'], l])
+                    for wtf in l['spans']:
+                        meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])])
+            if index == 0:
+                page_one_meta = [block_text(t) for t in text_areas['blocks'] if 'lines' in t]
+
+        if not meta_span:
+            # Without any span there is no body font to detect and nothing to clean.
+            return "", page_one_meta
+
+        # Body font size: the size that covers the most characters overall.
+        fsize_statiscs = {}
+        for span in meta_span:
+            fsize_statiscs[span[1]] = fsize_statiscs.get(span[1], 0) + span[2]
+        main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
+        if REMOVE_FOOT_NOTE:
+            give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
+
+        # Split the lines into sections and stitch paragraphs back together.
+        mega_sec = []
+        sec = []
+        for index, line in enumerate(meta_line):
+            if index == 0:
+                sec.append(line[fc])
+                continue
+            if REMOVE_FOOT_NOTE:
+                if line[fs] <= give_up_fize_threshold:
+                    continue
+            prev = meta_line[index-1]
+            if ffsize_same(line[fs], prev[fs]):
+                # Paragraph end heuristic: the line ends with '.' and is clearly
+                # narrower (< 70%) than the line before it.
+                if line[fc].endswith('.') and \
+                    (prev[fc] != 'NEW_BLOCK') and \
+                    (line[fb][2] - line[fb][0]) < (prev[fb][2] - prev[fb][0]) * 0.7:
+                    sec[-1] += line[fc]
+                    sec[-1] += "\n\n"
+                else:
+                    sec[-1] += " "
+                    sec[-1] += line[fc]
+            elif (index+1 < len(meta_line)) and line[fs] > main_fsize:
+                # Larger than the body font and not the last line: start a new section.
+                mega_sec.append(sec)
+                sec = ["# " + line[fc]]
+            elif prev[fs] > line[fs]:
+                # Font size dropped from the previous line: try to mark a section.
+                sec.append("\n" + line[fc])
+            else:
+                sec.append(line[fc])
+        mega_sec.append(sec)
+
+        finals = [" ".join(ms).replace('- ', ' ') for ms in mega_sec]
+        meta_txt = finals
+
+        def 把字符太少的块清除为回车(meta_txt):
+            """Replace every block shorter than 100 characters with a newline."""
+            for index, block_txt in enumerate(meta_txt):
+                if len(block_txt) < 100:
+                    meta_txt[index] = '\n'
+            return meta_txt
+        meta_txt = 把字符太少的块清除为回车(meta_txt)
+
+        def 清理多余的空行(meta_txt):
+            """Collapse consecutive newline-only blocks into one."""
+            for index in reversed(range(1, len(meta_txt))):
+                if meta_txt[index] == '\n' and meta_txt[index-1] == '\n':
+                    meta_txt.pop(index)
+            return meta_txt
+        meta_txt = 清理多余的空行(meta_txt)
+
+        def 合并小写开头的段落块(meta_txt):
+            """Merge each block starting with a lowercase word into its predecessor."""
+            def starts_with_lowercase_word(s):
+                return re.match(r"^[a-z]+", s) is not None
+            for _ in range(100):  # consecutive lowercase blocks merge one per pass
+                merged = False
+                # Start at 1: block 0 has no predecessor, and index-1 == -1 would
+                # wrap around and merge it into the LAST block.
+                for index in range(1, len(meta_txt)):
+                    if starts_with_lowercase_word(meta_txt[index]):
+                        if meta_txt[index-1] != '\n':
+                            meta_txt[index-1] += ' '
+                        else:
+                            meta_txt[index-1] = ''
+                        meta_txt[index-1] += meta_txt[index]
+                        meta_txt[index] = '\n'
+                        merged = True
+                if not merged:
+                    break
+            return meta_txt
+        meta_txt = 合并小写开头的段落块(meta_txt)
+        meta_txt = 清理多余的空行(meta_txt)
+
+        meta_txt = '\n'.join(meta_txt)
+        # Collapse repeated newlines
+        for _ in range(5):
+            meta_txt = meta_txt.replace('\n\n', '\n')
+
+        # Newline -> double newline
+        meta_txt = meta_txt.replace('\n', '\n\n')
+
+        for f in finals:
+            print亮黄(f)
+            print亮绿('***************************')
+
+    return meta_txt, page_one_meta