first_version

这个提交包含在:
lbykkkk
2024-09-20 00:06:59 +08:00
父节点 597c320808
当前提交 df717f8bba
共有 3 个文件被更改,包括 207 次插入63 次删除

查看文件

@@ -0,0 +1,47 @@
import os
import markdown
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
from docx import Document as DocxDocument
def extract_text_from_pdf(file_path):
reader = PdfReader(file_path)
text = ""
for page in reader.pages:
text += page.extract_text() + "\n"
return text
def extract_text_from_docx(file_path):
doc = DocxDocument(file_path)
return "\n".join([para.text for para in doc.paragraphs])
def extract_text_from_txt(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
return f.read()
def extract_text_from_md(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
md_content = f.read()
return markdown.markdown(md_content)
def extract_text_from_html(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser')
return soup.get_text()
def extract_text(file_path):
_, ext = os.path.splitext(file_path.lower())
if ext == '.pdf':
return extract_text_from_pdf(file_path)
elif ext in ['.docx', '.doc']:
return extract_text_from_docx(file_path)
elif ext == '.txt':
return extract_text_from_txt(file_path)
elif ext == '.md':
return extract_text_from_md(file_path)
elif ext == '.html':
return extract_text_from_html(file_path)
else:
raise ValueError(f"Unsupported file extension: {ext}")