镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-07 15:06:48 +00:00
feature(read pdf paper then write summary):
Add a function called readPdf to toolbox, which reads a PDF paper into text; bs4.BeautifulSoup is then used to clean the content.
这个提交包含在:
58
toolbox.py
58
toolbox.py
@@ -1,6 +1,14 @@
|
||||
import markdown, mdtex2html, threading, importlib, traceback
|
||||
from show_math import convert as convert_math
|
||||
from functools import wraps
|
||||
import pdfminer
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.pdfdevice import PDFDevice
|
||||
from pdfminer.layout import LAParams
|
||||
from pdfminer.converter import PDFPageAggregator
|
||||
|
||||
def predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[], sys_prompt=''):
|
||||
"""
|
||||
@@ -235,4 +243,52 @@ def clear_line_break(txt):
|
||||
txt = txt.replace('\n', ' ')
|
||||
txt = txt.replace(' ', ' ')
|
||||
txt = txt.replace(' ', ' ')
|
||||
return txt
|
||||
return txt
|
||||
|
||||
def readPdf(pdfPath):
    """
    Read a PDF file and return its text content.

    Args:
        pdfPath: Path of the PDF file to read.

    Returns:
        A list of strings, one entry per horizontal text box, in the order
        the boxes are encountered while walking the pages.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    # Parameters for layout analysis (how characters are grouped into boxes).
    laparams = LAParams(
        char_margin=10.0,
        line_margin=0.2,
        boxes_flow=0.2,
        all_texts=False,
    )

    outTextList = []

    # The parser is bound to this file handle, so the whole extraction runs
    # inside the `with` block; the handle is closed on success and on error.
    with open(pdfPath, 'rb') as fp:
        parser = PDFParser(fp)
        # PDFDocument stores the document structure.
        document = PDFDocument(parser)

        # Abort if the document does not allow text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager holds resources shared between pages.
        rsrcmgr = PDFResourceManager()
        # The aggregator device collects each processed page into a layout.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Iterate the layout container itself rather than its private
            # `_objs` list; only horizontal text boxes contribute text.
            for obj in layout:
                if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                    outTextList.append(obj.get_text())

    return outTextList
|
||||
在新工单中引用
屏蔽一个用户