gpt_academic/crazy_functions/批量文件询问.py

import os
import threading
import time
from dataclasses import dataclass
from typing import List, Tuple, Dict, Generator

from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
from crazy_functions.rag_fns.rag_file_support import extract_text
from request_llms.bridge_all import model_info
from toolbox import update_ui, CatchException, report_exception


@dataclass
class FileFragment:
    """文件片段数据类，用于组织处理单元"""
    file_path: str
    content: str
    rel_path: str
    fragment_index: int
    total_fragments: int


class BatchDocumentSummarizer:
    """优化的文档总结器 - 批处理版本"""

    def __init__(self, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List, history: List, system_prompt: str):
        """初始化总结器"""
        self.llm_kwargs = llm_kwargs
        self.plugin_kwargs = plugin_kwargs
        self.chatbot = chatbot
        self.history = history
        self.system_prompt = system_prompt
        self.failed_files = []
        self.file_summaries_map = {}

    def _get_token_limit(self) -> int:
        """获取模型token限制"""
        max_token = model_info[self.llm_kwargs['llm_model']]['max_token']
        return max_token * 3 // 4

    def _create_batch_inputs(self, fragments: List[FileFragment]) -> Tuple[List, List, List]:
        """创建批处理输入"""
        inputs_array = []
        inputs_show_user_array = []
        history_array = []

        for frag in fragments:
            if self.plugin_kwargs.get("advanced_arg"):
                i_say = (f'请按照用户要求对文件内容进行处理，文件名为{os.path.basename(frag.file_path)}，'
                         f'用户要求为：{self.plugin_kwargs["advanced_arg"]}：'
                         f'文件内容是 ```{frag.content}```')
                i_say_show_user = (f'正在处理 {frag.rel_path} (片段 {frag.fragment_index + 1}/{frag.total_fragments})')
            else:
                i_say = (f'请对下面的内容用中文做总结，不超过500字，文件名是{os.path.basename(frag.file_path)}，'
                         f'内容是 ```{frag.content}```')
                i_say_show_user = f'正在处理 {frag.rel_path} (片段 {frag.fragment_index + 1}/{frag.total_fragments})'

            inputs_array.append(i_say)
            inputs_show_user_array.append(i_say_show_user)
            history_array.append([])

        return inputs_array, inputs_show_user_array, history_array

    def _process_single_file_with_timeout(self, file_info: Tuple[str, str], mutable_status: List) -> List[FileFragment]:
        """包装了超时控制的文件处理函数"""

        def timeout_handler():
            thread = threading.current_thread()
            if hasattr(thread, '_timeout_occurred'):
                thread._timeout_occurred = True

        # 设置超时标记
        thread = threading.current_thread()
        thread._timeout_occurred = False

        # 设置超时定时器
        timer = threading.Timer(self.watch_dog_patience, timeout_handler)
        timer.start()

        try:
            fp, project_folder = file_info
            fragments = []

            # 定期检查是否超时
            def check_timeout():
                if hasattr(thread, '_timeout_occurred') and thread._timeout_occurred:
                    raise TimeoutError("处理超时")

            # 更新状态
            mutable_status[0] = "检查文件大小"
            mutable_status[1] = time.time()
            check_timeout()

            # 文件大小检查
            if os.path.getsize(fp) > self.max_file_size:
                self.failed_files.append((fp, f"文件过大：超过{self.max_file_size / 1024 / 1024}MB"))
                mutable_status[2] = "文件过大"
                return fragments

            check_timeout()

            # 更新状态
            mutable_status[0] = "提取文件内容"
            mutable_status[1] = time.time()

            # 提取内容
            content = extract_text(fp)
            if content is None:
                self.failed_files.append((fp, "文件解析失败：不支持的格式或文件损坏"))
                mutable_status[2] = "格式不支持"
                return fragments
            elif not content.strip():
                self.failed_files.append((fp, "文件内容为空"))
                mutable_status[2] = "内容为空"
                return fragments

            check_timeout()

            # 更新状态
            mutable_status[0] = "分割文本"
            mutable_status[1] = time.time()

            # 分割文本
            try:
                paper_fragments = breakdown_text_to_satisfy_token_limit(
                    txt=content,
                    limit=self._get_token_limit(),
                    llm_model=self.llm_kwargs['llm_model']
                )
            except Exception as e:
                self.failed_files.append((fp, f"文本分割失败：{str(e)}"))
                mutable_status[2] = "分割失败"
                return fragments

            check_timeout()

            # 处理片段
            rel_path = os.path.relpath(fp, project_folder)
            for i, frag in enumerate(paper_fragments):
                if frag.strip():
                    fragments.append(FileFragment(
                        file_path=fp,
                        content=frag,
                        rel_path=rel_path,
                        fragment_index=i,
                        total_fragments=len(paper_fragments)
                    ))

            mutable_status[2] = "处理完成"
            return fragments

        except TimeoutError as e:
            self.failed_files.append((fp, "处理超时"))
            mutable_status[2] = "处理超时"
            return []
        except Exception as e:
            self.failed_files.append((fp, f"处理失败：{str(e)}"))
            mutable_status[2] = "处理异常"
            return []
        finally:
            timer.cancel()

    def prepare_fragments(self, project_folder: str, file_paths: List[str]) -> Generator:
        import concurrent.futures


        from concurrent.futures import ThreadPoolExecutor
        from typing import Generator, List
        """并行准备所有文件的处理片段"""
        all_fragments = []
        total_files = len(file_paths)

        # 配置参数
        self.refresh_interval = 0.2  # UI刷新间隔
        self.watch_dog_patience = 5  # 看门狗超时时间
        self.max_file_size = 10 * 1024 * 1024  # 10MB限制
        self.max_workers = min(32, len(file_paths))  # 最多32个线程

        # 创建有超时控制的线程池
        executor = ThreadPoolExecutor(max_workers=self.max_workers)

        # 用于跨线程状态传递的可变列表 - 增加文件名信息
        mutable_status_array = [["等待中", time.time(), "pending", file_path] for file_path in file_paths]

        # 创建文件处理任务
        file_infos = [(fp, project_folder) for fp in file_paths]

        # 提交所有任务，使用带超时控制的处理函数
        futures = [
            executor.submit(
                self._process_single_file_with_timeout,
                file_info,
                mutable_status_array[i]
            ) for i, file_info in enumerate(file_infos)
        ]

        # 更新UI的计数器
        cnt = 0

        try:
            # 监控任务执行
            while True:
                time.sleep(self.refresh_interval)
                cnt += 1

                # 检查任务完成状态
                worker_done = [f.done() for f in futures]

                # 更新状态显示
                status_str = ""
                for i, (status, timestamp, desc, file_path) in enumerate(mutable_status_array):
                    # 获取文件名（去掉路径）
                    file_name = os.path.basename(file_path)
                    if worker_done[i]:
                        status_str += f"文件 {file_name}: {desc}\n"
                    else:
                        status_str += f"文件 {file_name}: {status} {desc}\n"

                # 更新UI
                self.chatbot[-1] = [
                    "处理进度",
                    f"正在处理文件...\n\n{status_str}" + "." * (cnt % 10 + 1)
                ]
                yield from update_ui(chatbot=self.chatbot, history=self.history)

                # 检查是否所有任务完成
                if all(worker_done):
                    break

        finally:
            # 确保线程池正确关闭
            executor.shutdown(wait=False)

        # 收集结果
        processed_files = 0
        for future in futures:
            try:
                fragments = future.result(timeout=0.1)  # 给予一个短暂的超时时间来获取结果
                all_fragments.extend(fragments)
                processed_files += 1
            except concurrent.futures.TimeoutError:
                # 处理获取结果超时
                file_index = futures.index(future)
                self.failed_files.append((file_paths[file_index], "结果获取超时"))
                continue
            except Exception as e:
                # 处理其他异常
                file_index = futures.index(future)
                self.failed_files.append((file_paths[file_index], f"未知错误：{str(e)}"))
                continue

        # 最终进度更新
        self.chatbot.append([
            "文件处理完成",
            f"成功处理 {len(all_fragments)} 个片段，失败 {len(self.failed_files)} 个文件"
        ])
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        return all_fragments

    def _process_fragments_batch(self, fragments: List[FileFragment]) -> Generator:
        """批量处理文件片段"""
        from collections import defaultdict
        batch_size = 64  # 每批处理的片段数
        max_retries = 3  # 最大重试次数
        retry_delay = 5  # 重试延迟（秒）
        results = defaultdict(list)

        # 按批次处理
        for i in range(0, len(fragments), batch_size):
            batch = fragments[i:i + batch_size]

            inputs_array, inputs_show_user_array, history_array = self._create_batch_inputs(batch)
            sys_prompt_array = ["请总结以下内容："] * len(batch)

            # 添加重试机制
            for retry in range(max_retries):
                try:
                    response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
                        inputs_array=inputs_array,
                        inputs_show_user_array=inputs_show_user_array,
                        llm_kwargs=self.llm_kwargs,
                        chatbot=self.chatbot,
                        history_array=history_array,
                        sys_prompt_array=sys_prompt_array,
                    )

                    # 处理响应
                    for j, frag in enumerate(batch):
                        summary = response_collection[j * 2 + 1]
                        if summary and summary.strip():
                            results[frag.rel_path].append({
                                'index': frag.fragment_index,
                                'summary': summary,
                                'total': frag.total_fragments
                            })
                    break  # 成功处理，跳出重试循环

                except Exception as e:
                    if retry == max_retries - 1:  # 最后一次重试失败
                        for frag in batch:
                            self.failed_files.append((frag.file_path, f"处理失败：{str(e)}"))
                    else:
                        yield from update_ui(self.chatbot.append([f"批次处理失败，{retry_delay}秒后重试...", str(e)]))
                        time.sleep(retry_delay)

        return results

    def _generate_final_summary_request(self) -> Tuple[List, List, List]:
        """准备最终总结请求"""
        if not self.file_summaries_map:
            return (["无可用的文件总结"], ["生成最终总结"], [[]])

        summaries = list(self.file_summaries_map.values())
        if all(not summary for summary in summaries):
            return (["所有文件处理均失败"], ["生成最终总结"], [[]])

        if self.plugin_kwargs.get("advanced_arg"):
            i_say = "根据以上所有文件的处理结果，按要求进行综合处理：" + self.plugin_kwargs['advanced_arg']
        else:
            i_say = "请根据以上所有文件的处理结果，生成最终的总结，不超过1000字。"

        return ([i_say], [i_say], [summaries])

    def process_files(self, project_folder: str, file_paths: List[str]) -> Generator:
        """处理所有文件"""
        total_files = len(file_paths)
        self.chatbot.append([f"开始处理", f"总计 {total_files} 个文件"])
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        # 1. 准备所有文件片段
        # 在 process_files 函数中：
        fragments = yield from self.prepare_fragments(project_folder, file_paths)
        if not fragments:
            self.chatbot.append(["处理失败", "没有可处理的文件内容"])
            return "没有可处理的文件内容"

        # 2. 批量处理所有文件片段
        self.chatbot.append([f"文件分析", f"共计 {len(fragments)} 个处理单元"])
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        try:
            file_summaries = yield from self._process_fragments_batch(fragments)
        except Exception as e:
            self.chatbot.append(["处理错误", f"批处理过程失败：{str(e)}"])
            return "处理过程发生错误"

        # 3. 为每个文件生成整体总结
        self.chatbot.append(["生成总结", "正在汇总文件内容..."])
        yield from update_ui(chatbot=self.chatbot, history=self.history)

        # 处理每个文件的总结
        for rel_path, summaries in file_summaries.items():
            if len(summaries) > 1:  # 多片段文件需要生成整体总结
                sorted_summaries = sorted(summaries, key=lambda x: x['index'])
                if self.plugin_kwargs.get("advanced_arg"):

                    i_say = f'请按照用户要求对文件内容进行处理，用户要求为：{self.plugin_kwargs["advanced_arg"]}：'
                else:
                    i_say = f"请总结文件 {os.path.basename(rel_path)} 的主要内容，不超过500字。"

                try:
                    summary_texts = [s['summary'] for s in sorted_summaries]
                    response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
                        inputs_array=[i_say],
                        inputs_show_user_array=[f"生成 {rel_path} 的处理结果"],
                        llm_kwargs=self.llm_kwargs,
                        chatbot=self.chatbot,
                        history_array=[summary_texts],
                        sys_prompt_array=["你是一个优秀的助手，"],
                    )
                    self.file_summaries_map[rel_path] = response_collection[1]
                except Exception as e:
                    self.chatbot.append(["警告", f"文件 {rel_path} 总结生成失败：{str(e)}"])
                    self.file_summaries_map[rel_path] = "总结生成失败"
            else:  # 单片段文件直接使用其唯一的总结
                self.file_summaries_map[rel_path] = summaries[0]['summary']

        # 4. 生成最终总结
        if total_files ==1:
            return "文件数为1，此时不调用总结模块"
        else:
            try:
                # 收集所有文件的总结用于生成最终总结
                file_summaries_for_final = []
                for rel_path, summary in self.file_summaries_map.items():
                    file_summaries_for_final.append(f"文件 {rel_path} 的总结：\n{summary}")

                if self.plugin_kwargs.get("advanced_arg"):
                    final_summary_prompt = ("根据以下所有文件的总结内容，按要求进行综合处理：" +
                                            self.plugin_kwargs['advanced_arg'])
                else:
                    final_summary_prompt = "请根据以下所有文件的总结内容，生成最终的总结报告。"

                response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
                    inputs_array=[final_summary_prompt],
                    inputs_show_user_array=["生成最终总结报告"],
                    llm_kwargs=self.llm_kwargs,
                    chatbot=self.chatbot,
                    history_array=[file_summaries_for_final],
                    sys_prompt_array=["总结所有文件内容。"],
                    max_workers=1
                )

                return response_collection[1] if len(response_collection) > 1 else "生成总结失败"
            except Exception as e:
                self.chatbot.append(["错误", f"最终总结生成失败：{str(e)}"])
                return "生成总结失败"

    def save_results(self, final_summary: str):
        """保存结果到文件"""
        from toolbox import promote_file_to_downloadzone, write_history_to_file
        from crazy_functions.doc_fns.batch_file_query_doc import MarkdownFormatter, HtmlFormatter, WordFormatter
        import os
        timestamp = time.strftime("%Y%m%d_%H%M%S")

        # 创建各种格式化器
        md_formatter = MarkdownFormatter(final_summary, self.file_summaries_map, self.failed_files)
        html_formatter = HtmlFormatter(final_summary, self.file_summaries_map, self.failed_files)
        word_formatter = WordFormatter(final_summary, self.file_summaries_map, self.failed_files)

        result_files = []

        # 保存 Markdown
        md_content = md_formatter.create_document()
        result_file_md = write_history_to_file(
            history=[md_content],  # 直接传入内容列表
            file_basename=f"文档总结_{timestamp}.md"
        )
        result_files.append(result_file_md)

        # 保存 HTML
        html_content = html_formatter.create_document()
        result_file_html = write_history_to_file(
            history=[html_content],
            file_basename=f"文档总结_{timestamp}.html"
        )
        result_files.append(result_file_html)

        # 保存 Word
        doc = word_formatter.create_document()
        # 由于 Word 文档需要用 doc.save()，我们使用与 md 文件相同的目录
        result_file_docx = os.path.join(
            os.path.dirname(result_file_md),
            f"文档总结_{timestamp}.docx"
        )
        doc.save(result_file_docx)
        result_files.append(result_file_docx)

        # 添加到下载区
        for file in result_files:
            promote_file_to_downloadzone(file, chatbot=self.chatbot)

        self.chatbot.append(["处理完成", f"结果已保存至: {', '.join(result_files)}"])
@CatchException
def 批量文件询问(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List,
                 history: List, system_prompt: str, user_request: str):
    """主函数 - 优化版本"""
    # 初始化
    import glob
    import re
    from crazy_functions.rag_fns.rag_file_support import supports_format
    from toolbox import report_exception

    summarizer = BatchDocumentSummarizer(llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
    chatbot.append(["函数插件功能", f"作者：lbykkkk，批量总结文件。支持格式: {', '.join(supports_format)}等其他文本格式文件，如果长时间卡在文件处理过程，请查看处理进度，然后删除所有处于“pending”状态的文件，然后重新上传处理。"])
    yield from update_ui(chatbot=chatbot, history=history)

    # 验证输入路径
    if not os.path.exists(txt):
        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到项目或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)
        return

    # 获取文件列表
    project_folder = txt
    extract_folder = next((d for d in glob.glob(f'{project_folder}/*')
                           if os.path.isdir(d) and d.endswith('.extract')), project_folder)

    exclude_patterns = r'/[^/]+\.(zip|rar|7z|tar|gz)$'
    file_manifest = [f for f in glob.glob(f'{extract_folder}/**', recursive=True)
                     if os.path.isfile(f) and not re.search(exclude_patterns, f)]

    if not file_manifest:
        report_exception(chatbot, history, a=f"解析项目: {txt}", b="未找到支持的文件类型")
        yield from update_ui(chatbot=chatbot, history=history)
        return

    # 处理所有文件并生成总结
    final_summary = yield from summarizer.process_files(project_folder, file_manifest)
    yield from update_ui(chatbot=chatbot, history=history)

    # 保存结果
    summarizer.save_results(final_summary)
    yield from update_ui(chatbot=chatbot, history=history)