first_version

2025-12-06 06:26:47 +00:00 · 2024-09-20 00:06:59 +08:00
--- a/crazy_functions/rag_fns/rag_file_support.py
+++ b/crazy_functions/rag_fns/rag_file_support.py
@@ -0,0 +1,47 @@
+import os
+
+import markdown
+from PyPDF2 import PdfReader
+from bs4 import BeautifulSoup
+from docx import Document as DocxDocument
+
+
+def extract_text_from_pdf(file_path):
+    reader = PdfReader(file_path)
+    text = ""
+    for page in reader.pages:
+        text += page.extract_text() + "\n"
+    return text
+
+def extract_text_from_docx(file_path):
+    doc = DocxDocument(file_path)
+    return "\n".join([para.text for para in doc.paragraphs])
+
+def extract_text_from_txt(file_path):
+    with open(file_path, 'r', encoding='utf-8') as f:
+        return f.read()
+
+def extract_text_from_md(file_path):
+    with open(file_path, 'r', encoding='utf-8') as f:
+        md_content = f.read()
+    return markdown.markdown(md_content)
+
+def extract_text_from_html(file_path):
+    with open(file_path, 'r', encoding='utf-8') as f:
+        soup = BeautifulSoup(f, 'html.parser')
+    return soup.get_text()
+
+def extract_text(file_path):
+    _, ext = os.path.splitext(file_path.lower())
+    if ext == '.pdf':
+        return extract_text_from_pdf(file_path)
+    elif ext in ['.docx', '.doc']:
+        return extract_text_from_docx(file_path)
+    elif ext == '.txt':
+        return extract_text_from_txt(file_path)
+    elif ext == '.md':
+        return extract_text_from_md(file_path)
+    elif ext == '.html':
+        return extract_text_from_html(file_path)
+    else:
+        raise ValueError(f"Unsupported file extension: {ext}")