镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-06 06:26:47 +00:00
* stage academic conversation * stage document conversation * fix buggy gradio version * file dynamic load * merge more academic plugins * accelerate nltk * feat: 为predict函数添加文件和URL读取功能 - 添加URL检测和网页内容提取功能,支持自动提取网页文本 - 添加文件路径识别和文件内容读取功能,支持private_upload路径格式 - 集成WebTextExtractor处理网页内容提取 - 集成TextContentLoader处理本地文件读取 - 支持文件路径与问题组合的智能处理 * back * block unstable --------- Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
49 行
1.9 KiB
Python
49 行
1.9 KiB
Python
import subprocess
|
|
import os
|
|
|
|
# File extensions (lowercase, with the leading dot) that extract_text accepts
# and hands to SimpleDirectoryReader; every other extension is rejected.
supports_format = [
    '.csv',
    '.docx',
    '.epub',
    '.ipynb',
    '.mbox',
    '.md',
    '.pdf',
    '.txt',
    '.ppt',
    '.pptm',
    '.pptx',
    '.bat',
]
|
|
|
|
def convert_to_markdown(file_path: str) -> str:
    """
    Convert a supported document to Markdown using the ``markitdown`` CLI.

    The Markdown output is written next to the input file, using the same
    base name with a ``.md`` extension. Conversion is best-effort: any
    failure is printed and the original path is returned so the caller can
    keep working with the unconverted file.

    Args:
        file_path: Path of the input file.

    Returns:
        str: Path of the generated Markdown file, or the original
        ``file_path`` if the extension is not convertible or the
        conversion fails.
    """
    # Extensions are compared against os.path.splitext output, so every entry
    # needs its leading dot (a bare 'pdf' would never match).
    convertible = ('.docx', '.doc', '.pptx', '.ppt', '.pptm', '.xls', '.xlsx', '.csv', '.pdf')

    _, ext = os.path.splitext(file_path.lower())
    if ext not in convertible:
        return file_path

    # Output path keeps the original (non-lowercased) base name.
    md_path = os.path.splitext(file_path)[0] + '.md'
    try:
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through verbatim and cannot inject
        # commands.
        result = subprocess.run(
            ["markitdown", file_path],
            check=True,
            stdout=subprocess.PIPE,
        )
        # Write only after a successful run so a failed conversion does not
        # leave an empty .md file behind.
        with open(md_path, 'wb') as md_file:
            md_file.write(result.stdout)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f"{ext}转Markdown失败: {str(e)},将继续处理原文件")
        return file_path

    print(f"已将{ext}文件转换为Markdown: {md_path}")
    return md_path
|
|
|
|
# Revised extract_text: supported formats are read through SimpleDirectoryReader.
def extract_text(file_path):
    """
    Extract the text content of a file with llama_index's SimpleDirectoryReader.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text of every loaded document joined with newlines; the literal
        string '格式不支持' when the file extension is not listed in
        ``supports_format``; ``None`` when the reader raises while loading.
    """
    _, ext = os.path.splitext(file_path.lower())
    if ext not in supports_format:
        return '格式不支持'

    # Imported only after the extension check so unsupported files are
    # rejected without loading llama_index.
    from llama_index.core import SimpleDirectoryReader

    try:
        reader = SimpleDirectoryReader(input_files=[file_path])
        print(f"Extracting text from {file_path} using SimpleDirectoryReader")
        documents = reader.load_data()
        print(f"Complete: Extracting text from {file_path} using SimpleDirectoryReader")
        return '\n'.join(doc.text for doc in documents)
    except Exception as e:
        # Best-effort: report the failure instead of swallowing it silently,
        # and keep returning None as the failure signal.
        print(f"Failed to extract text from {file_path}: {e}")
        return None
|