文件
gpt_academic/shared_utils/doc_loader_dynamic.py
binary-husky 8042750d41 Master 4.0 (#2210)
* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
2025-08-23 15:59:22 +08:00

75 行
2.9 KiB
Python

import re
import os
def start_with_url(inputs: str) -> bool:
    """Check whether the input is a single, valid http:// or https:// URL.

    Returns True only when, after stripping surrounding punctuation and
    whitespace, the input consists of exactly one whitespace-delimited token
    that parses into a URL with both a scheme and a network location.
    """
    # Fast rejection: no URL scheme appears anywhere in the input.
    if not ("http://" in inputs or "https://" in inputs):
        return False
    try:
        # BUGFIX: the original did `text = text.strip(...)`, referencing the
        # undefined name `text`; the resulting NameError was swallowed by a
        # bare `except`, so the function always returned False. Strip the
        # actual argument instead.
        text = inputs.strip(',.!?,。!? \t\n\r')
        # The input must be a single token — a URL mixed into a sentence
        # does not count as "starts with a URL".
        if len(text.split()) != 1:
            return False
        from urllib.parse import urlparse
        result = urlparse(text)
        # A usable URL needs both a scheme and a host (netloc).
        return all([result.scheme, result.netloc])
    except ValueError:
        # urlparse raises ValueError on malformed input (e.g. bad ports).
        return False
def load_web_content(inputs: str, chatbot_with_cookie, history: list):
    """Extract the text of the web page referenced by *inputs* and push it
    into the chat UI and conversation history.

    Generator: yields UI refreshes via `update_ui`. On failure the transient
    "working" message is removed (if still present) and an error entry is
    appended instead.
    """
    from crazy_functions.doc_fns.read_fns.web_reader import WebTextExtractor, WebExtractorConfig
    from toolbox import update_ui

    page_extractor = WebTextExtractor(WebExtractorConfig())
    waiting_msg = "正在提取网页内容,请稍作等待..."
    try:
        # Show a transient "working" notice while the page is fetched.
        chatbot_with_cookie.append([None, waiting_msg])
        yield from update_ui(chatbot=chatbot_with_cookie, history=history)

        page_text = page_extractor.extract_text(inputs)

        # Swap the transient notice for the extracted content.
        chatbot_with_cookie.pop()
        rendered = f"网页{inputs}的文本内容如下:" + page_text
        chatbot_with_cookie.append([None, rendered])
        history.extend([rendered])
        yield from update_ui(chatbot=chatbot_with_cookie, history=history)
    except Exception as e:
        # Remove the transient notice only if it is still the last entry.
        if len(chatbot_with_cookie) > 0 and chatbot_with_cookie[-1][-1] == waiting_msg:
            chatbot_with_cookie.pop()
        chatbot_with_cookie.append([inputs, f"网页内容提取失败: {str(e)}"])
        yield from update_ui(chatbot=chatbot_with_cookie, history=history)
def extract_file_path(text):
    """Locate a private_upload path with a timestamp suffix inside *text*.

    Returns the matched path string when it both matches the expected
    pattern AND exists on the local filesystem; otherwise returns None.
    """
    # Expected shape: private_upload/<anything>/YYYY-MM-DD-HH-MM-SS
    timestamp_match = re.search(
        r'(private_upload/[^\s]+?/\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2})',
        text,
    )
    if timestamp_match is None:
        return None
    candidate = timestamp_match.group(1)
    # Only report paths that actually exist on disk.
    return candidate if os.path.exists(candidate) else None
def contain_uploaded_files(inputs: str) -> bool:
    """Return True when *inputs* embeds an existing private_upload file path."""
    # extract_file_path yields either a non-empty path string or None,
    # so a None check is equivalent to the original truthiness test.
    return extract_file_path(inputs) is not None
def load_uploaded_files(inputs, method, llm_kwargs, plugin_kwargs, chatbot_with_cookie, history, system_prompt, stream, additional_fn):
    """Load the uploaded file referenced inside *inputs* into the chat UI,
    then return the user's question reframed against that file content.

    Generator: yields UI updates produced by TextContentLoader.execute; its
    StopIteration value is the reformulated question string.
    """
    from crazy_functions.doc_fns.text_content_loader import TextContentLoader

    # Render the file content into the chat via the project loader.
    file_path = extract_file_path(inputs)
    content_loader = TextContentLoader(chatbot_with_cookie, history)
    yield from content_loader.execute(file_path)

    # Strip the path out of the prompt to recover the user's actual question.
    remaining_question = inputs.replace(file_path, '').strip()
    if remaining_question:
        return f"基于上述文件内容,{remaining_question}"
    return f"请简单分析上述文件内容"