feature(read pdf paper then write summary):

add a func called readPdf in toolbox, which can read pdf paper to str. then use bs4.BeautifulSoup to clean content.
这个提交包含在:
欧玮杰
2023-03-31 00:54:01 +08:00
父节点 2d5d719696
当前提交 5521f0e41c
共有 4 个文件被更改,包括 80 次插入11 次删除

查看文件

@@ -1,6 +1,14 @@
import markdown, mdtex2html, threading, importlib, traceback
from show_math import convert as convert_math
from functools import wraps
import pdfminer
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
def predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[], sys_prompt=''):
"""
@@ -235,4 +243,52 @@ def clear_line_break(txt):
txt = txt.replace('\n', ' ')
txt = txt.replace(' ', ' ')
txt = txt.replace(' ', ' ')
return txt
return txt
def readPdf(pdfPath):
"""
读取pdf文件,返回文本内容
"""
fp = open(pdfPath, 'rb')
# Create a PDF parser object associated with the file object
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
# Password for initialization as 2nd parameter
document = PDFDocument(parser)
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
raise PDFTextExtractionNotAllowed
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Create a PDF device object.
# device = PDFDevice(rsrcmgr)
# BEGIN LAYOUT ANALYSIS.
# Set parameters for analysis.
laparams = LAParams(
char_margin=10.0,
line_margin=0.2,
boxes_flow=0.2,
all_texts=False,
)
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# loop over all pages in the document
outTextList = []
for page in PDFPage.create_pages(document):
# read the page into a layout object
interpreter.process_page(page)
layout = device.get_result()
for obj in layout._objs:
if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
# print(obj.get_text())
outTextList.append(obj.get_text())
return outTextList