改善chatpdf的功能

这个提交包含在:
qingxu fu
2023-04-13 11:08:53 +08:00
父节点 613be5509b
当前提交 8ac9b454e3
共有 5 个文件被更改,包括 221 次插入301 次删除

查看文件

@@ -360,3 +360,171 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
# 这个中文的句号是故意的,作为一个标识而存在
res = cut(txt.replace('.', '\n'), must_break_at_empty_line=False)
return [r.replace('\n', '.') for r in res]
def read_and_clean_pdf_text(fp):
"""
这个函数用于分割pdf,用了很多trick,逻辑较乱,效果奇好
**输入参数说明**
- `fp`需要读取和清理文本的pdf文件路径
**输出参数说明**
- `meta_txt`:清理后的文本内容字符串
- `page_one_meta`:第一页清理后的文本内容列表
**函数功能**
读取pdf文件并清理其中的文本内容,清理规则包括
- 提取所有块元的文本信息,并合并为一个字符串
- 去除短块字符数小于100并替换为回车符
- 清理多余的空行
- 合并小写字母开头的段落块并替换为空格
- 清除重复的换行
- 将每个换行符替换为两个换行符,使每个段落之间有两个换行符分隔
"""
import fitz, copy
import re
import numpy as np
from colorful import print亮黄, print亮绿
fc = 0
fs = 1
fb = 2
REMOVE_FOOT_NOTE = True
REMOVE_FOOT_FFSIZE_PERCENT = 0.95
def primary_ffsize(l):
fsize_statiscs = {}
for wtf in l['spans']:
if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0
fsize_statiscs[wtf['size']] += len(wtf['text'])
return max(fsize_statiscs, key=fsize_statiscs.get)
def ffsize_same(a,b):
return abs((a-b)/max(a,b)) < 0.02
# file_content = ""
with fitz.open(fp) as doc:
meta_txt = []
meta_font = []
meta_line = []
meta_span = []
for index, page in enumerate(doc):
# file_content += page.get_text()
text_areas = page.get_text("dict") # 获取页面上的文本信息
for t in text_areas['blocks']:
if 'lines' in t:
pf = 998
for l in t['lines']:
txt_line = "".join([wtf['text'] for wtf in l['spans']])
pf = primary_ffsize(l)
meta_line.append([txt_line, pf, l['bbox'], l])
for wtf in l['spans']: # for l in t['lines']:
meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])])
# meta_line.append(["NEW_BLOCK", pf])
# 块元提取 for each word segment with in line for each line cross-line words for each block
meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
'- ', '') for t in text_areas['blocks'] if 'lines' in t])
meta_font.extend([np.mean([np.mean([wtf['size'] for wtf in l['spans']])
for l in t['lines']]) for t in text_areas['blocks'] if 'lines' in t])
if index == 0:
page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
'- ', '') for t in text_areas['blocks'] if 'lines' in t]
# 获取正文主字体
fsize_statiscs = {}
for span in meta_span:
if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
fsize_statiscs[span[1]] += span[2]
main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
if REMOVE_FOOT_NOTE:
give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
# 切分和重新整合
mega_sec = []
sec = []
for index, line in enumerate(meta_line):
if index == 0:
sec.append(line[fc])
continue
if REMOVE_FOOT_NOTE:
if meta_line[index][fs] <= give_up_fize_threshold:
continue
if ffsize_same(meta_line[index][fs], meta_line[index-1][fs]):
# 尝试识别段落
if meta_line[index][fc].endswith('.') and\
(meta_line[index-1][fc] != 'NEW_BLOCK') and \
(meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index-1][fb][2] - meta_line[index-1][fb][0]) * 0.7:
sec[-1] += line[fc]
sec[-1] += "\n\n"
else:
sec[-1] += " "
sec[-1] += line[fc]
else:
if (index+1 < len(meta_line)) and \
meta_line[index][fs] > main_fsize:
# 单行 + 字体大
mega_sec.append(copy.deepcopy(sec))
sec = []
sec.append("# " + line[fc])
else:
# 尝试识别section
if meta_line[index-1][fs] > meta_line[index][fs]:
sec.append("\n" + line[fc])
else:
sec.append(line[fc])
mega_sec.append(copy.deepcopy(sec))
finals = []
for ms in mega_sec:
final = " ".join(ms)
final = final.replace('- ', ' ')
finals.append(final)
meta_txt = finals
def 把字符太少的块清除为回车(meta_txt):
for index, block_txt in enumerate(meta_txt):
if len(block_txt) < 100:
meta_txt[index] = '\n'
return meta_txt
meta_txt = 把字符太少的块清除为回车(meta_txt)
def 清理多余的空行(meta_txt):
for index in reversed(range(1, len(meta_txt))):
if meta_txt[index] == '\n' and meta_txt[index-1] == '\n':
meta_txt.pop(index)
return meta_txt
meta_txt = 清理多余的空行(meta_txt)
def 合并小写开头的段落块(meta_txt):
def starts_with_lowercase_word(s):
pattern = r"^[a-z]+"
match = re.match(pattern, s)
if match:
return True
else:
return False
for _ in range(100):
for index, block_txt in enumerate(meta_txt):
if starts_with_lowercase_word(block_txt):
if meta_txt[index-1] != '\n':
meta_txt[index-1] += ' '
else:
meta_txt[index-1] = ''
meta_txt[index-1] += meta_txt[index]
meta_txt[index] = '\n'
return meta_txt
meta_txt = 合并小写开头的段落块(meta_txt)
meta_txt = 清理多余的空行(meta_txt)
meta_txt = '\n'.join(meta_txt)
# 清除重复的换行
for _ in range(5):
meta_txt = meta_txt.replace('\n\n', '\n')
# 换行 -> 双换行
meta_txt = meta_txt.replace('\n', '\n\n')
for f in finals:
print亮黄(f)
print亮绿('***************************')
return meta_txt, page_one_meta