文件
gpt_academic/crazy_functions/rag_fns/rag_file_support.py
2024-09-20 00:06:59 +08:00

47 行
1.4 KiB
Python

import os
import markdown
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
from docx import Document as DocxDocument
def extract_text_from_pdf(file_path):
reader = PdfReader(file_path)
text = ""
for page in reader.pages:
text += page.extract_text() + "\n"
return text
def extract_text_from_docx(file_path):
doc = DocxDocument(file_path)
return "\n".join([para.text for para in doc.paragraphs])
def extract_text_from_txt(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
return f.read()
def extract_text_from_md(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
md_content = f.read()
return markdown.markdown(md_content)
def extract_text_from_html(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser')
return soup.get_text()
def extract_text(file_path):
_, ext = os.path.splitext(file_path.lower())
if ext == '.pdf':
return extract_text_from_pdf(file_path)
elif ext in ['.docx', '.doc']:
return extract_text_from_docx(file_path)
elif ext == '.txt':
return extract_text_from_txt(file_path)
elif ext == '.md':
return extract_text_from_md(file_path)
elif ext == '.html':
return extract_text_from_html(file_path)
else:
raise ValueError(f"Unsupported file extension: {ext}")