文件
gpt_academic/shared_utils/doc_loader_dynamic.py
binary-husky 8042750d41 Master 4.0 (#2210)
* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
2025-08-23 15:59:22 +08:00

75 行
2.9 KiB
Python

import re
import os
def start_with_url(inputs: str) -> bool:
    """Check whether the input is a single, valid http:// or https:// URL.

    Returns True only when, after stripping surrounding punctuation and
    whitespace, the input consists of exactly one whitespace-delimited token
    that parses into a URL with both a scheme and a network location.
    """
    # Fast rejection: no URL scheme appears anywhere in the input.
    if not ("http://" in inputs or "https://" in inputs):
        return False
    try:
        # BUGFIX: the original did `text = text.strip(...)`, referencing the
        # undefined name `text`; the resulting NameError was swallowed by a
        # bare `except`, so the function always returned False. Strip the
        # actual argument instead.
        text = inputs.strip(',.!?,。!? \t\n\r')
        # The input must be a single token — a URL mixed into a sentence
        # does not count as "starts with a URL".
        if len(text.split()) != 1:
            return False
        from urllib.parse import urlparse
        result = urlparse(text)
        # A usable URL needs both a scheme and a host (netloc).
        return all([result.scheme, result.netloc])
    except ValueError:
        # urlparse raises ValueError on malformed input (e.g. bad ports).
        return False
def load_web_content(inputs: str, chatbot_with_cookie, history: list):
    """Extract the text of the web page referenced by *inputs* and push it
    into the chat UI and conversation history.

    Generator: yields UI refreshes via `update_ui`. On failure the transient
    "working" message is removed (if still present) and an error entry is
    appended instead.
    """
    from crazy_functions.doc_fns.read_fns.web_reader import WebTextExtractor, WebExtractorConfig
    from toolbox import update_ui

    page_extractor = WebTextExtractor(WebExtractorConfig())
    waiting_msg = "正在提取网页内容,请稍作等待..."
    try:
        # Show a transient "working" notice while the page is fetched.
        chatbot_with_cookie.append([None, waiting_msg])
        yield from update_ui(chatbot=chatbot_with_cookie, history=history)

        page_text = page_extractor.extract_text(inputs)

        # Swap the transient notice for the extracted content.
        chatbot_with_cookie.pop()
        rendered = f"网页{inputs}的文本内容如下:" + page_text
        chatbot_with_cookie.append([None, rendered])
        history.extend([rendered])
        yield from update_ui(chatbot=chatbot_with_cookie, history=history)
    except Exception as e:
        # Remove the transient notice only if it is still the last entry.
        if len(chatbot_with_cookie) > 0 and chatbot_with_cookie[-1][-1] == waiting_msg:
            chatbot_with_cookie.pop()
        chatbot_with_cookie.append([inputs, f"网页内容提取失败: {str(e)}"])
        yield from update_ui(chatbot=chatbot_with_cookie, history=history)
def extract_file_path(text):
    """Locate a private_upload path with a timestamp suffix inside *text*.

    Returns the matched path string when it both matches the expected
    pattern AND exists on the local filesystem; otherwise returns None.
    """
    # Expected shape: private_upload/<anything>/YYYY-MM-DD-HH-MM-SS
    timestamp_match = re.search(
        r'(private_upload/[^\s]+?/\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2})',
        text,
    )
    if timestamp_match is None:
        return None
    candidate = timestamp_match.group(1)
    # Only report paths that actually exist on disk.
    return candidate if os.path.exists(candidate) else None
def contain_uploaded_files(inputs: str) -> bool:
    """Return True when *inputs* embeds an existing private_upload file path."""
    # extract_file_path yields either a non-empty path string or None,
    # so a None check is equivalent to the original truthiness test.
    return extract_file_path(inputs) is not None
def load_uploaded_files(inputs, method, llm_kwargs, plugin_kwargs, chatbot_with_cookie, history, system_prompt, stream, additional_fn):
    """Load the uploaded file referenced inside *inputs* into the chat UI,
    then return the user's question reframed against that file content.

    Generator: yields UI updates produced by TextContentLoader.execute; its
    StopIteration value is the reformulated question string.
    """
    from crazy_functions.doc_fns.text_content_loader import TextContentLoader

    # Render the file content into the chat via the project loader.
    file_path = extract_file_path(inputs)
    content_loader = TextContentLoader(chatbot_with_cookie, history)
    yield from content_loader.execute(file_path)

    # Strip the path out of the prompt to recover the user's actual question.
    remaining_question = inputs.replace(file_path, '').strip()
    if remaining_question:
        return f"基于上述文件内容,{remaining_question}"
    return f"请简单分析上述文件内容"