镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-06 06:26:47 +00:00
Master 4.0 (#2210)
* stage academic conversation
* stage document conversation
* fix buggy gradio version
* file dynamic load
* merge more academic plugins
* accelerate nltk
* feat: 为predict函数添加文件和URL读取功能
  - 添加URL检测和网页内容提取功能,支持自动提取网页文本
  - 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
  - 集成WebTextExtractor处理网页内容提取
  - 集成TextContentLoader处理本地文件读取
  - 支持文件路径与问题组合的智能处理
* back
* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
这个提交包含在:
@@ -1,22 +1,48 @@
|
||||
import subprocess
|
||||
import os
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
|
||||
# File extensions (lower-case, with the leading dot) that extract_text hands to
# SimpleDirectoryReader. Kept as a list so existing callers that index or
# concatenate it keep working.
supports_format = [
    '.csv', '.docx', '.epub', '.ipynb', '.mbox', '.md', '.pdf', '.txt',
    '.ppt', '.pptm', '.pptx', '.bat',
]
|
||||
|
||||
def convert_to_markdown(file_path: str) -> str:
    """Convert a supported document to Markdown using the ``markitdown`` CLI.

    The Markdown output is written next to the input file, using the same
    base name with a ``.md`` extension.

    Args:
        file_path: Path of the input file.

    Returns:
        Path of the generated Markdown file, or ``file_path`` unchanged when
        the extension is not convertible or the conversion fails.
    """
    _, ext = os.path.splitext(file_path.lower())

    # os.path.splitext keeps the leading dot, so every entry must carry one
    # (a bare 'pdf' would never match and PDFs would silently be skipped).
    convertible = ('.docx', '.doc', '.pptx', '.ppt', '.pptm', '.xls', '.xlsx', '.csv', '.pdf')
    if ext not in convertible:
        return file_path

    # Output path: same location and base name, '.md' extension.
    md_path = os.path.splitext(file_path)[0] + '.md'
    try:
        # Pass an argument list (no shell) so paths containing spaces or shell
        # metacharacters are handled safely. stderr is left attached to the
        # parent process so markitdown diagnostics remain visible.
        result = subprocess.run(["markitdown", file_path], check=True, stdout=subprocess.PIPE)
        # Write only after a successful run, so a failed conversion never
        # leaves an empty or truncated .md file behind.
        with open(md_path, 'wb') as md_file:
            md_file.write(result.stdout)
    except (OSError, subprocess.SubprocessError) as e:
        # Best effort: fall back to the original file if markitdown is not
        # installed, exits non-zero, or the output cannot be written.
        print(f"{ext}转Markdown失败: {str(e)},将继续处理原文件")
        return file_path

    print(f"已将{ext}文件转换为Markdown: {md_path}")
    return md_path
|
||||
|
||||
# Text extraction built on llama_index's SimpleDirectoryReader, limited to the
# extensions listed in supports_format.
def extract_text(file_path):
    """Extract the plain text of a file via ``SimpleDirectoryReader``.

    Args:
        file_path: Path of the file to read. The extension is matched
            case-insensitively against ``supports_format``.

    Returns:
        The text of every loaded document joined with newlines (an empty
        string when the reader yields no documents), ``None`` when reading
        fails, or the literal string '格式不支持' when the extension is not in
        ``supports_format``.
    """
    _, ext = os.path.splitext(file_path.lower())

    if ext not in supports_format:
        return '格式不支持'

    # Imported lazily, and only once the format is known to be supported, so
    # callers passing unsupported files do not need llama_index loaded.
    from llama_index.core import SimpleDirectoryReader

    try:
        reader = SimpleDirectoryReader(input_files=[file_path])
        print(f"Extracting text from {file_path} using SimpleDirectoryReader")
        documents = reader.load_data()
        print(f"Complete: Extracting text from {file_path} using SimpleDirectoryReader")
        # A single file can load as several documents; join all of them
        # instead of returning only the first one.
        return '\n'.join(doc.text for doc in documents)
    except Exception as e:
        # Best-effort extraction: report the failure instead of swallowing it
        # silently, then signal it to the caller with None.
        print(f"Failed to extract text from {file_path}: {e}")
        return None
|
||||
|
||||
在新工单中引用
屏蔽一个用户