pdfminer整合到一个文件中

这个提交包含在:
Your Name
2023-03-31 21:03:12 +08:00
父节点 201f53c0f0
当前提交 da55ae68f6
共有 6 个文件被更改,包括 171 次插入82 次删除

查看文件

@@ -1,14 +1,6 @@
import markdown, mdtex2html, threading, importlib, traceback
from show_math import convert as convert_math
from functools import wraps
import pdfminer
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
def predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[], sys_prompt=''):
"""
@@ -243,52 +235,4 @@ def clear_line_break(txt):
txt = txt.replace('\n', ' ')
txt = txt.replace(' ', ' ')
txt = txt.replace(' ', ' ')
return txt
def readPdf(pdfPath):
"""
读取pdf文件,返回文本内容
"""
fp = open(pdfPath, 'rb')
# Create a PDF parser object associated with the file object
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
# Password for initialization as 2nd parameter
document = PDFDocument(parser)
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
raise PDFTextExtractionNotAllowed
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Create a PDF device object.
# device = PDFDevice(rsrcmgr)
# BEGIN LAYOUT ANALYSIS.
# Set parameters for analysis.
laparams = LAParams(
char_margin=10.0,
line_margin=0.2,
boxes_flow=0.2,
all_texts=False,
)
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# loop over all pages in the document
outTextList = []
for page in PDFPage.create_pages(document):
# read the page into a layout object
interpreter.process_page(page)
layout = device.get_result()
for obj in layout._objs:
if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
# print(obj.get_text())
outTextList.append(obj.get_text())
return outTextList
return txt