gpt_academic/crazy_functions/rag_fns/rag_file_support.py

import os

import markdown
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
from docx import Document as DocxDocument


def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text

def extract_text_from_docx(file_path):
    doc = DocxDocument(file_path)
    return "\n".join([para.text for para in doc.paragraphs])

def extract_text_from_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

def extract_text_from_md(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        md_content = f.read()
    return markdown.markdown(md_content)

def extract_text_from_html(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')
    return soup.get_text()

def extract_text(file_path):
    _, ext = os.path.splitext(file_path.lower())
    if ext == '.pdf':
        return extract_text_from_pdf(file_path)
    elif ext in ['.docx', '.doc']:
        return extract_text_from_docx(file_path)
    elif ext == '.txt':
        return extract_text_from_txt(file_path)
    elif ext == '.md':
        return extract_text_from_md(file_path)
    elif ext == '.html':
        return extract_text_from_html(file_path)
    else:
        raise ValueError(f"Unsupported file extension: {ext}")