diff --git a/crazy_functions/Rag_Interface.py b/crazy_functions/Rag_Interface.py index d1cd67b4..d4312f6a 100644 --- a/crazy_functions/Rag_Interface.py +++ b/crazy_functions/Rag_Interface.py @@ -5,7 +5,7 @@ from llama_index.core import Document from shared_utils.fastapi_server import validate_path_safety from toolbox import report_exception -from crazy_functions.rag_fns.rag_file_support import extract_text +from crazy_functions.rag_fns.rag_file_support import extract_text, supports_format from toolbox import CatchException, update_ui, get_conf, get_log_folder, update_ui_lastest_msg from crazy_functions.crazy_utils import input_clipping from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive @@ -48,7 +48,7 @@ def handle_document_upload(files: List[str], llm_kwargs, plugin_kwargs, chatbot, text = extract_text(file_path) if text is None: chatbot.append( - [f"上传文件: {os.path.basename(file_path)}", "文件解析失败,无法提取文本内容,请更换文件。"]) + [f"上传文件: {os.path.basename(file_path)}", f"文件解析失败,无法提取文本内容,请更换文件。失败原因可能为:1.文档格式过于复杂;2. 不支持的文件格式,支持的文件格式后缀有:" + ", ".join(supports_format)]) else: chatbot.append( [f"上传文件: {os.path.basename(file_path)}", f"上传文件前50个字符为:{text[:50]}。"]) diff --git a/crazy_functions/rag_fns/rag_file_support.py b/crazy_functions/rag_fns/rag_file_support.py index 50a07615..98ba3bee 100644 --- a/crazy_functions/rag_fns/rag_file_support.py +++ b/crazy_functions/rag_fns/rag_file_support.py @@ -1,22 +1,16 @@ import os from llama_index.core import SimpleDirectoryReader -# 保留你原有的自定义解析函数 -from PyPDF2 import PdfReader +supports_format = ['.csv', '.docx', '.epub', '.ipynb', '.mbox', '.md', '.pdf', '.txt', '.ppt', + '.pptm', '.pptx'] -def extract_text_from_pdf(file_path): - reader = PdfReader(file_path) - text = "" - for page in reader.pages: - text += page.extract_text() + "\n" - return text # 修改后的 extract_text 函数,结合 SimpleDirectoryReader 和自定义解析逻辑 def extract_text(file_path): _, ext = os.path.splitext(file_path.lower()) # 使用 SimpleDirectoryReader 处理它支持的文件格式 - if ext in ['.txt', '.md', '.pdf', '.docx', '.html']: + if ext in supports_format: try: reader = SimpleDirectoryReader(input_files=[file_path]) documents = reader.load_data() @@ -25,10 +19,4 @@ def extract_text(file_path): except Exception as e: pass - # 如果 SimpleDirectoryReader 失败,或文件格式不支持,使用自定义解析逻辑 - if ext == '.pdf': - try: - return extract_text_from_pdf(file_path) - except Exception as e: - pass return None