From 99453402778f7c83b4b36d08f1dd3aa57af0022f Mon Sep 17 00:00:00 2001 From: binary-husky Date: Mon, 14 Jul 2025 02:19:02 +0800 Subject: [PATCH] merge more academic plugins --- crazy_functional.py | 38 + crazy_functions/Document_Optimize.py | 673 ++++++++++++++ crazy_functions/Paper_Reading.py | 360 ++++++++ crazy_functions/paper_fns/__init__.py | 0 .../auto_git/handlers/base_handler.py | 386 ++++++++ .../auto_git/handlers/code_handler.py | 156 ++++ .../auto_git/handlers/repo_handler.py | 192 ++++ .../auto_git/handlers/topic_handler.py | 217 +++++ .../auto_git/handlers/user_handler.py | 164 ++++ .../paper_fns/auto_git/query_analyzer.py | 356 +++++++ .../auto_git/sources/github_source.py | 701 ++++++++++++++ .../paper_fns/document_structure_extractor.py | 593 ++++++++++++ .../paper_fns/file2file_doc/__init__.py | 4 + .../paper_fns/file2file_doc/html_doc.py | 300 ++++++ .../paper_fns/file2file_doc/markdown_doc.py | 40 + .../paper_fns/file2file_doc/txt_doc.py | 69 ++ .../paper_fns/file2file_doc/word2pdf.py | 125 +++ .../paper_fns/file2file_doc/word_doc.py | 236 +++++ crazy_functions/paper_fns/github_search.py | 278 ++++++ .../paper_fns/journal_paper_recom.py | 635 +++++++++++++ crazy_functions/paper_fns/paper_download.py | 295 ++++++ crazy_functions/paper_fns/reduce_aigc.py | 867 ++++++++++++++++++ .../paper_fns/wiki/wikipedia_api.py | 387 ++++++++ crazy_functions/rag_fns/rag_file_support.py | 25 + 24 files changed, 7097 insertions(+) create mode 100644 crazy_functions/Document_Optimize.py create mode 100644 crazy_functions/Paper_Reading.py create mode 100644 crazy_functions/paper_fns/__init__.py create mode 100644 crazy_functions/paper_fns/auto_git/handlers/base_handler.py create mode 100644 crazy_functions/paper_fns/auto_git/handlers/code_handler.py create mode 100644 crazy_functions/paper_fns/auto_git/handlers/repo_handler.py create mode 100644 crazy_functions/paper_fns/auto_git/handlers/topic_handler.py create mode 100644 crazy_functions/paper_fns/auto_git/handlers/user_handler.py create mode 100644 crazy_functions/paper_fns/auto_git/query_analyzer.py create mode 100644 crazy_functions/paper_fns/auto_git/sources/github_source.py create mode 100644 crazy_functions/paper_fns/document_structure_extractor.py create mode 100644 crazy_functions/paper_fns/file2file_doc/__init__.py create mode 100644 crazy_functions/paper_fns/file2file_doc/html_doc.py create mode 100644 crazy_functions/paper_fns/file2file_doc/markdown_doc.py create mode 100644 crazy_functions/paper_fns/file2file_doc/txt_doc.py create mode 100644 crazy_functions/paper_fns/file2file_doc/word2pdf.py create mode 100644 crazy_functions/paper_fns/file2file_doc/word_doc.py create mode 100644 crazy_functions/paper_fns/github_search.py create mode 100644 crazy_functions/paper_fns/journal_paper_recom.py create mode 100644 crazy_functions/paper_fns/paper_download.py create mode 100644 crazy_functions/paper_fns/reduce_aigc.py create mode 100644 crazy_functions/paper_fns/wiki/wikipedia_api.py diff --git a/crazy_functional.py b/crazy_functional.py index 9ef7ecdb..e9470e1e 100644 --- a/crazy_functional.py +++ b/crazy_functional.py @@ -706,6 +706,44 @@ def get_crazy_functions(): logger.error(trimmed_format_exc()) logger.error("Load function plugin failed") + try: + from crazy_functions.Document_Optimize import 自定义智能文档处理 + function_plugins.update( + { + "一键处理文档(支持自定义全文润色、降重等)": { + "Group": "学术", + "Color": "stop", + "AsButton": False, + "AdvancedArgs": True, + "ArgsReminder": "请输入处理指令和要求(可以详细描述),如:请帮我润色文本,要求幽默点。默认调用润色指令。", + "Info": 
"保留文档结构,智能处理文档内容 | 输入参数为文件路径", + "Function": HotReload(自定义智能文档处理) + }, + } + ) + except: + logger.error(trimmed_format_exc()) + logger.error("Load function plugin failed") + + + + try: + from crazy_functions.Paper_Reading import 快速论文解读 + function_plugins.update( + { + "速读论文": { + "Group": "学术", + "Color": "stop", + "AsButton": False, + "Info": "上传一篇论文进行快速分析和解读 | 输入参数为论文路径或DOI/arXiv ID", + "Function": HotReload(快速论文解读), + }, + } + ) + except: + logger.error(trimmed_format_exc()) + logger.error("Load function plugin failed") + # try: # from crazy_functions.高级功能函数模板 import 测试图表渲染 diff --git a/crazy_functions/Document_Optimize.py b/crazy_functions/Document_Optimize.py new file mode 100644 index 00000000..c7728afe --- /dev/null +++ b/crazy_functions/Document_Optimize.py @@ -0,0 +1,673 @@ +import os +import time +import glob +import re +import threading +from typing import Dict, List, Generator, Tuple +from dataclasses import dataclass + +from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency +from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit +from crazy_functions.rag_fns.rag_file_support import extract_text, supports_format, convert_to_markdown +from request_llms.bridge_all import model_info +from toolbox import update_ui, CatchException, report_exception, promote_file_to_downloadzone, write_history_to_file +from shared_utils.fastapi_server import validate_path_safety + +# 新增:导入结构化论文提取器 +from crazy_functions.doc_fns.read_fns.unstructured_all.paper_structure_extractor import PaperStructureExtractor, ExtractorConfig, StructuredPaper + +# 导入格式化器 +from crazy_functions.paper_fns.file2file_doc import ( + TxtFormatter, + MarkdownFormatter, + HtmlFormatter, + WordFormatter +) + +@dataclass +class TextFragment: + """文本片段数据类,用于组织处理单元""" + content: str + fragment_index: int + total_fragments: int + + +class DocumentProcessor: + """文档处理器 - 处理单个文档并输出结果""" + + def __init__(self, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List, history: List, system_prompt: str): + """初始化处理器""" + self.llm_kwargs = llm_kwargs + self.plugin_kwargs = plugin_kwargs + self.chatbot = chatbot + self.history = history + self.system_prompt = system_prompt + self.processed_results = [] + self.failed_fragments = [] + # 新增:初始化论文结构提取器 + self.paper_extractor = PaperStructureExtractor() + + def _get_token_limit(self) -> int: + """获取模型token限制,返回更小的值以确保更细粒度的分割""" + max_token = model_info[self.llm_kwargs['llm_model']]['max_token'] + # 降低token限制,使每个片段更小 + return max_token // 4 # 从3/4降低到1/4 + + def _create_batch_inputs(self, fragments: List[TextFragment]) -> Tuple[List, List, List]: + """创建批处理输入""" + inputs_array = [] + inputs_show_user_array = [] + history_array = [] + + user_instruction = self.plugin_kwargs.get("advanced_arg", "请润色以下学术文本,提高其语言表达的准确性、专业性和流畅度,保持学术风格,确保逻辑连贯,但不改变原文的科学内容和核心观点") + + for frag in fragments: + i_say = (f'请按照以下要求处理文本内容:{user_instruction}\n\n' + f'请将对文本的处理结果放在标签之间。\n\n' + f'文本内容:\n```\n{frag.content}\n```') + + i_say_show_user = f'正在处理文本片段 {frag.fragment_index + 1}/{frag.total_fragments}' + + inputs_array.append(i_say) + inputs_show_user_array.append(i_say_show_user) + history_array.append([]) + + return inputs_array, inputs_show_user_array, history_array + + def _extract_decision(self, text: str) -> str: + """从LLM响应中提取标签内的内容""" + import re + pattern = r'(.*?)' + matches = re.findall(pattern, text, re.DOTALL) + + if matches: + return matches[0].strip() + else: + # 如果没有找到标签,返回原始文本 + return text.strip() + + def 
process_file(self, file_path: str) -> Generator: + """处理单个文件""" + self.chatbot.append(["开始处理文件", f"文件路径: {file_path}"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + + try: + # 首先尝试转换为Markdown + from crazy_functions.rag_fns.rag_file_support import convert_to_markdown + file_path = convert_to_markdown(file_path) + + # 1. 检查文件是否为支持的论文格式 + is_paper_format = any(file_path.lower().endswith(ext) for ext in self.paper_extractor.SUPPORTED_EXTENSIONS) + + if is_paper_format: + # 使用结构化提取器处理论文 + return (yield from self._process_structured_paper(file_path)) + else: + # 使用原有方式处理普通文档 + return (yield from self._process_regular_file(file_path)) + + except Exception as e: + self.chatbot.append(["处理错误", f"文件处理失败: {str(e)}"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return None + + def _process_structured_paper(self, file_path: str) -> Generator: + """处理结构化论文文件""" + # 1. 提取论文结构 + self.chatbot[-1] = ["正在分析论文结构", f"文件路径: {file_path}"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + try: + paper = self.paper_extractor.extract_paper_structure(file_path) + + if not paper or not paper.sections: + self.chatbot.append(["无法提取论文结构", "将使用全文内容进行处理"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 使用全文内容进行段落切分 + if paper and paper.full_text: + # 使用增强的分割函数进行更细致的分割 + fragments = self._breakdown_section_content(paper.full_text) + + # 创建文本片段对象 + text_fragments = [] + for i, frag in enumerate(fragments): + if frag.strip(): + text_fragments.append(TextFragment( + content=frag, + fragment_index=i, + total_fragments=len(fragments) + )) + + # 批量处理片段 + if text_fragments: + self.chatbot[-1] = ["开始处理文本", f"共 {len(text_fragments)} 个片段"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 一次性准备所有输入 + inputs_array, inputs_show_user_array, history_array = self._create_batch_inputs(text_fragments) + + # 使用系统提示 + instruction = self.plugin_kwargs.get("advanced_arg", "请润色以下学术文本,提高其语言表达的准确性、专业性和流畅度,保持学术风格,确保逻辑连贯,但不改变原文的科学内容和核心观点") + sys_prompt_array = [f"你是一个专业的学术文献编辑助手。请按照用户的要求:'{instruction}'处理文本。保持学术风格,增强表达的准确性和专业性。"] * len(text_fragments) + + # 调用LLM一次性处理所有片段 + response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=inputs_array, + inputs_show_user_array=inputs_show_user_array, + llm_kwargs=self.llm_kwargs, + chatbot=self.chatbot, + history_array=history_array, + sys_prompt_array=sys_prompt_array, + ) + + # 处理响应 + for j, frag in enumerate(text_fragments): + try: + llm_response = response_collection[j * 2 + 1] + processed_text = self._extract_decision(llm_response) + + if processed_text and processed_text.strip(): + self.processed_results.append({ + 'index': frag.fragment_index, + 'content': processed_text + }) + else: + self.failed_fragments.append(frag) + self.processed_results.append({ + 'index': frag.fragment_index, + 'content': frag.content + }) + except Exception as e: + self.failed_fragments.append(frag) + self.processed_results.append({ + 'index': frag.fragment_index, + 'content': frag.content + }) + + # 按原始顺序合并结果 + self.processed_results.sort(key=lambda x: x['index']) + final_content = "\n".join([item['content'] for item in self.processed_results]) + + # 更新UI + success_count = len(text_fragments) - len(self.failed_fragments) + self.chatbot[-1] = ["处理完成", f"成功处理 {success_count}/{len(text_fragments)} 个片段"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + return final_content + else: + self.chatbot.append(["处理失败", 
"未能提取到有效的文本内容"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return None + else: + self.chatbot.append(["处理失败", "未能提取到论文内容"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return None + + # 2. 准备处理章节内容(不处理标题) + self.chatbot[-1] = ["已提取论文结构", f"共 {len(paper.sections)} 个主要章节"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 3. 收集所有需要处理的章节内容并分割为合适大小 + sections_to_process = [] + section_map = {} # 用于映射处理前后的内容 + + def collect_section_contents(sections, parent_path=""): + """递归收集章节内容,跳过参考文献部分""" + for i, section in enumerate(sections): + current_path = f"{parent_path}/{i}" if parent_path else f"{i}" + + # 检查是否为参考文献部分,如果是则跳过 + if section.section_type == 'references' or section.title.lower() in ['references', '参考文献', 'bibliography', '文献']: + continue # 跳过参考文献部分 + + # 只处理内容非空的章节 + if section.content and section.content.strip(): + # 使用增强的分割函数进行更细致的分割 + fragments = self._breakdown_section_content(section.content) + + for fragment_idx, fragment_content in enumerate(fragments): + if fragment_content.strip(): + fragment_index = len(sections_to_process) + sections_to_process.append(TextFragment( + content=fragment_content, + fragment_index=fragment_index, + total_fragments=0 # 临时值,稍后更新 + )) + + # 保存映射关系,用于稍后更新章节内容 + # 为每个片段存储原始章节和片段索引信息 + section_map[fragment_index] = (current_path, section, fragment_idx, len(fragments)) + + # 递归处理子章节 + if section.subsections: + collect_section_contents(section.subsections, current_path) + + # 收集所有章节内容 + collect_section_contents(paper.sections) + + # 更新总片段数 + total_fragments = len(sections_to_process) + for frag in sections_to_process: + frag.total_fragments = total_fragments + + # 4. 如果没有内容需要处理,直接返回 + if not sections_to_process: + self.chatbot.append(["处理完成", "未找到需要处理的内容"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return None + + # 5. 
批量处理章节内容 + self.chatbot[-1] = ["开始处理论文内容", f"共 {len(sections_to_process)} 个内容片段"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 一次性准备所有输入 + inputs_array, inputs_show_user_array, history_array = self._create_batch_inputs(sections_to_process) + + # 使用系统提示 + instruction = self.plugin_kwargs.get("advanced_arg", "请润色以下学术文本,提高其语言表达的准确性、专业性和流畅度,保持学术风格,确保逻辑连贯,但不改变原文的科学内容和核心观点") + sys_prompt_array = [f"你是一个专业的学术文献编辑助手。请按照用户的要求:'{instruction}'处理文本。保持学术风格,增强表达的准确性和专业性。"] * len(sections_to_process) + + # 调用LLM一次性处理所有片段 + response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=inputs_array, + inputs_show_user_array=inputs_show_user_array, + llm_kwargs=self.llm_kwargs, + chatbot=self.chatbot, + history_array=history_array, + sys_prompt_array=sys_prompt_array, + ) + + # 处理响应,重组章节内容 + section_contents = {} # 用于重组各章节的处理后内容 + + for j, frag in enumerate(sections_to_process): + try: + llm_response = response_collection[j * 2 + 1] + processed_text = self._extract_decision(llm_response) + + if processed_text and processed_text.strip(): + # 保存处理结果 + self.processed_results.append({ + 'index': frag.fragment_index, + 'content': processed_text + }) + + # 存储处理后的文本片段,用于后续重组 + fragment_index = frag.fragment_index + if fragment_index in section_map: + path, section, fragment_idx, total_fragments = section_map[fragment_index] + + # 初始化此章节的内容容器(如果尚未创建) + if path not in section_contents: + section_contents[path] = [""] * total_fragments + + # 将处理后的片段放入正确位置 + section_contents[path][fragment_idx] = processed_text + else: + self.failed_fragments.append(frag) + except Exception as e: + self.failed_fragments.append(frag) + + # 重组每个章节的内容 + for path, fragments in section_contents.items(): + section = None + for idx in section_map: + if section_map[idx][0] == path: + section = section_map[idx][1] + break + + if section: + # 合并该章节的所有处理后片段 + section.content = "\n".join(fragments) + + # 6. 更新UI + success_count = total_fragments - len(self.failed_fragments) + self.chatbot[-1] = ["处理完成", f"成功处理 {success_count}/{total_fragments} 个内容片段"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 收集参考文献部分(不进行处理) + references_sections = [] + def collect_references(sections, parent_path=""): + """递归收集参考文献部分""" + for i, section in enumerate(sections): + current_path = f"{parent_path}/{i}" if parent_path else f"{i}" + + # 检查是否为参考文献部分 + if section.section_type == 'references' or section.title.lower() in ['references', '参考文献', 'bibliography', '文献']: + references_sections.append((current_path, section)) + + # 递归检查子章节 + if section.subsections: + collect_references(section.subsections, current_path) + + # 收集参考文献 + collect_references(paper.sections) + + # 7. 将处理后的结构化论文转换为Markdown + markdown_content = self.paper_extractor.generate_markdown(paper) + + # 8. 
返回处理后的内容 + self.chatbot[-1] = ["处理完成", f"成功处理 {success_count}/{total_fragments} 个内容片段,参考文献部分未处理"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + return markdown_content + + except Exception as e: + self.chatbot.append(["结构化处理失败", f"错误: {str(e)},将尝试作为普通文件处理"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return (yield from self._process_regular_file(file_path)) + + def _process_regular_file(self, file_path: str) -> Generator: + """使用原有方式处理普通文件""" + # 原有的文件处理逻辑 + self.chatbot[-1] = ["正在读取文件", f"文件路径: {file_path}"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + content = extract_text(file_path) + if not content or not content.strip(): + self.chatbot.append(["处理失败", "文件内容为空或无法提取内容"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return None + + # 2. 分割文本 + self.chatbot[-1] = ["正在分析文件", "将文件内容分割为适当大小的片段"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 使用增强的分割函数 + fragments = self._breakdown_section_content(content) + + # 3. 创建文本片段对象 + text_fragments = [] + for i, frag in enumerate(fragments): + if frag.strip(): + text_fragments.append(TextFragment( + content=frag, + fragment_index=i, + total_fragments=len(fragments) + )) + + # 4. 处理所有片段 + self.chatbot[-1] = ["开始处理文本", f"共 {len(text_fragments)} 个片段"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 批量处理片段 + batch_size = 8 # 每批处理的片段数 + for i in range(0, len(text_fragments), batch_size): + batch = text_fragments[i:i + batch_size] + + inputs_array, inputs_show_user_array, history_array = self._create_batch_inputs(batch) + + # 使用系统提示 + instruction = self.plugin_kwargs.get("advanced_arg", "请润色以下文本") + sys_prompt_array = [f"你是一个专业的文本处理助手。请按照用户的要求:'{instruction}'处理文本。"] * len(batch) + + # 调用LLM处理 + response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=inputs_array, + inputs_show_user_array=inputs_show_user_array, + llm_kwargs=self.llm_kwargs, + chatbot=self.chatbot, + history_array=history_array, + sys_prompt_array=sys_prompt_array, + ) + + # 处理响应 + for j, frag in enumerate(batch): + try: + llm_response = response_collection[j * 2 + 1] + processed_text = self._extract_decision(llm_response) + + if processed_text and processed_text.strip(): + self.processed_results.append({ + 'index': frag.fragment_index, + 'content': processed_text + }) + else: + self.failed_fragments.append(frag) + self.processed_results.append({ + 'index': frag.fragment_index, + 'content': frag.content # 如果处理失败,使用原始内容 + }) + except Exception as e: + self.failed_fragments.append(frag) + self.processed_results.append({ + 'index': frag.fragment_index, + 'content': frag.content # 如果处理失败,使用原始内容 + }) + + # 5. 按原始顺序合并结果 + self.processed_results.sort(key=lambda x: x['index']) + final_content = "\n".join([item['content'] for item in self.processed_results]) + + # 6. 
更新UI + success_count = len(text_fragments) - len(self.failed_fragments) + self.chatbot[-1] = ["处理完成", f"成功处理 {success_count}/{len(text_fragments)} 个片段"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + return final_content + + def save_results(self, content: str, original_file_path: str) -> List[str]: + """保存处理结果为多种格式""" + if not content: + return [] + + timestamp = time.strftime("%Y%m%d_%H%M%S") + original_filename = os.path.basename(original_file_path) + filename_without_ext = os.path.splitext(original_filename)[0] + base_filename = f"{filename_without_ext}_processed_{timestamp}" + + result_files = [] + + # 获取用户指定的处理类型 + processing_type = self.plugin_kwargs.get("advanced_arg", "文本处理") + + # 1. 保存为TXT + try: + txt_formatter = TxtFormatter() + txt_content = txt_formatter.create_document(content) + txt_file = write_history_to_file( + history=[txt_content], + file_basename=f"{base_filename}.txt" + ) + result_files.append(txt_file) + except Exception as e: + self.chatbot.append(["警告", f"TXT格式保存失败: {str(e)}"]) + + # 2. 保存为Markdown + try: + md_formatter = MarkdownFormatter() + md_content = md_formatter.create_document(content, processing_type) + md_file = write_history_to_file( + history=[md_content], + file_basename=f"{base_filename}.md" + ) + result_files.append(md_file) + except Exception as e: + self.chatbot.append(["警告", f"Markdown格式保存失败: {str(e)}"]) + + # 3. 保存为HTML + try: + html_formatter = HtmlFormatter(processing_type=processing_type) + html_content = html_formatter.create_document(content) + html_file = write_history_to_file( + history=[html_content], + file_basename=f"{base_filename}.html" + ) + result_files.append(html_file) + except Exception as e: + self.chatbot.append(["警告", f"HTML格式保存失败: {str(e)}"]) + + # 4. 保存为Word + try: + word_formatter = WordFormatter() + doc = word_formatter.create_document(content, processing_type) + + # 获取保存路径 + from toolbox import get_log_folder + word_path = os.path.join(get_log_folder(), f"{base_filename}.docx") + doc.save(word_path) + + # 5. 保存为PDF(通过Word转换) + try: + from crazy_functions.paper_fns.file2file_doc.word2pdf import WordToPdfConverter + pdf_path = WordToPdfConverter.convert_to_pdf(word_path) + result_files.append(pdf_path) + except Exception as e: + self.chatbot.append(["警告", f"PDF格式保存失败: {str(e)}"]) + + except Exception as e: + self.chatbot.append(["警告", f"Word格式保存失败: {str(e)}"]) + + # 添加到下载区 + for file in result_files: + promote_file_to_downloadzone(file, chatbot=self.chatbot) + + return result_files + + def _breakdown_section_content(self, content: str) -> List[str]: + """对文本内容进行分割与合并 + + 主要按段落进行组织,只合并较小的段落以减少片段数量 + 保留原始段落结构,不对长段落进行强制分割 + 针对中英文设置不同的阈值,因为字符密度不同 + """ + # 先按段落分割文本 + paragraphs = content.split('\n\n') + + # 检测语言类型 + chinese_char_count = sum(1 for char in content if '\u4e00' <= char <= '\u9fff') + is_chinese_text = chinese_char_count / max(1, len(content)) > 0.3 + + # 根据语言类型设置不同的阈值(只用于合并小段落) + if is_chinese_text: + # 中文文本:一个汉字就是一个字符,信息密度高 + min_chunk_size = 300 # 段落合并的最小阈值 + target_size = 800 # 理想的段落大小 + else: + # 英文文本:一个单词由多个字符组成,信息密度低 + min_chunk_size = 600 # 段落合并的最小阈值 + target_size = 1600 # 理想的段落大小 + + # 1. 
只合并小段落,不对长段落进行分割 + result_fragments = [] + current_chunk = [] + current_length = 0 + + for para in paragraphs: + # 如果段落太小且不会超过目标大小,则合并 + if len(para) < min_chunk_size and current_length + len(para) <= target_size: + current_chunk.append(para) + current_length += len(para) + # 否则,创建新段落 + else: + # 如果当前块非空且与当前段落无关,先保存它 + if current_chunk and current_length > 0: + result_fragments.append('\n\n'.join(current_chunk)) + + # 当前段落作为新块 + current_chunk = [para] + current_length = len(para) + + # 如果当前块大小已接近目标大小,保存并开始新块 + if current_length >= target_size: + result_fragments.append('\n\n'.join(current_chunk)) + current_chunk = [] + current_length = 0 + + # 保存最后一个块 + if current_chunk: + result_fragments.append('\n\n'.join(current_chunk)) + + # 2. 处理可能过大的片段(确保不超过token限制) + final_fragments = [] + max_token = self._get_token_limit() + + for fragment in result_fragments: + # 检查fragment是否可能超出token限制 + # 根据语言类型调整token估算 + if is_chinese_text: + estimated_tokens = len(fragment) / 1.5 # 中文每个token约1-2个字符 + else: + estimated_tokens = len(fragment) / 4 # 英文每个token约4个字符 + + if estimated_tokens > max_token: + # 即使可能超出限制,也尽量保持段落的完整性 + # 使用breakdown_text但设置更大的限制来减少分割 + larger_limit = max_token * 0.95 # 使用95%的限制 + sub_fragments = breakdown_text_to_satisfy_token_limit( + txt=fragment, + limit=larger_limit, + llm_model=self.llm_kwargs['llm_model'] + ) + final_fragments.extend(sub_fragments) + else: + final_fragments.append(fragment) + + return final_fragments + + +@CatchException +def 自定义智能文档处理(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List, + history: List, system_prompt: str, user_request: str): + """主函数 - 文件到文件处理""" + # 初始化 + processor = DocumentProcessor(llm_kwargs, plugin_kwargs, chatbot, history, system_prompt) + chatbot.append(["函数插件功能", "文件内容处理:将文档内容按照指定要求处理后输出为多种格式"]) + yield from update_ui(chatbot=chatbot, history=history) + + # 验证输入路径 + if not os.path.exists(txt): + report_exception(chatbot, history, a=f"解析路径: {txt}", b=f"找不到路径或无权访问: {txt}") + yield from update_ui(chatbot=chatbot, history=history) + return + + # 验证路径安全性 + user_name = chatbot.get_user() + validate_path_safety(txt, user_name) + + # 获取文件列表 + if os.path.isfile(txt): + # 单个文件处理 + file_paths = [txt] + else: + # 目录处理 - 类似批量文件询问插件 + project_folder = txt + extract_folder = next((d for d in glob.glob(f'{project_folder}/*') + if os.path.isdir(d) and d.endswith('.extract')), project_folder) + + # 排除压缩文件 + exclude_patterns = r'/[^/]+\.(zip|rar|7z|tar|gz)$' + file_paths = [f for f in glob.glob(f'{extract_folder}/**', recursive=True) + if os.path.isfile(f) and not re.search(exclude_patterns, f)] + + # 过滤支持的文件格式 + file_paths = [f for f in file_paths if any(f.lower().endswith(ext) for ext in + list(processor.paper_extractor.SUPPORTED_EXTENSIONS) + ['.json', '.csv', '.xlsx', '.xls'])] + + if not file_paths: + report_exception(chatbot, history, a=f"解析路径: {txt}", b="未找到支持的文件类型") + yield from update_ui(chatbot=chatbot, history=history) + return + + # 处理文件 + if len(file_paths) > 1: + chatbot.append(["发现多个文件", f"共找到 {len(file_paths)} 个文件,将处理第一个文件"]) + yield from update_ui(chatbot=chatbot, history=history) + + # 只处理第一个文件 + file_to_process = file_paths[0] + processed_content = yield from processor.process_file(file_to_process) + + if processed_content: + # 保存结果 + result_files = processor.save_results(processed_content, file_to_process) + + if result_files: + chatbot.append(["处理完成", f"已生成 {len(result_files)} 个结果文件"]) + else: + chatbot.append(["处理完成", "但未能保存任何结果文件"]) + else: + chatbot.append(["处理失败", "未能生成有效的处理结果"]) + + yield from 
update_ui(chatbot=chatbot, history=history) diff --git a/crazy_functions/Paper_Reading.py b/crazy_functions/Paper_Reading.py new file mode 100644 index 00000000..93c6dd1b --- /dev/null +++ b/crazy_functions/Paper_Reading.py @@ -0,0 +1,360 @@ +import os +import time +import glob +from pathlib import Path +from datetime import datetime +from dataclasses import dataclass +from typing import Dict, List, Generator, Tuple +from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive +from toolbox import update_ui, promote_file_to_downloadzone, write_history_to_file, CatchException, report_exception +from shared_utils.fastapi_server import validate_path_safety +from crazy_functions.paper_fns.paper_download import extract_paper_id, extract_paper_ids, get_arxiv_paper, format_arxiv_id + + + +@dataclass +class PaperQuestion: + """论文分析问题类""" + id: str # 问题ID + question: str # 问题内容 + importance: int # 重要性 (1-5,5最高) + description: str # 问题描述 + + +class PaperAnalyzer: + """论文快速分析器""" + + def __init__(self, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List, history: List, system_prompt: str): + """初始化分析器""" + self.llm_kwargs = llm_kwargs + self.plugin_kwargs = plugin_kwargs + self.chatbot = chatbot + self.history = history + self.system_prompt = system_prompt + self.paper_content = "" + self.results = {} + + # 定义论文分析问题库(已合并为4个核心问题) + self.questions = [ + PaperQuestion( + id="research_and_methods", + question="这篇论文的主要研究问题、目标和方法是什么?请分析:1)论文的核心研究问题和研究动机;2)论文提出的关键方法、模型或理论框架;3)这些方法如何解决研究问题。", + importance=5, + description="研究问题与方法" + ), + PaperQuestion( + id="findings_and_innovation", + question="论文的主要发现、结论及创新点是什么?请分析:1)论文的核心结果与主要发现;2)作者得出的关键结论;3)研究的创新点与对领域的贡献;4)与已有工作的区别。", + importance=4, + description="研究发现与创新" + ), + PaperQuestion( + id="methodology_and_data", + question="论文使用了什么研究方法和数据?请详细分析:1)研究设计与实验设置;2)数据收集方法与数据集特点;3)分析技术与评估方法;4)方法学上的合理性。", + importance=3, + description="研究方法与数据" + ), + PaperQuestion( + id="limitations_and_impact", + question="论文的局限性、未来方向及潜在影响是什么?请分析:1)研究的不足与限制因素;2)作者提出的未来研究方向;3)该研究对学术界和行业可能产生的影响;4)研究结果的适用范围与推广价值。", + importance=2, + description="局限性与影响" + ), + ] + + # 按重要性排序 + self.questions.sort(key=lambda q: q.importance, reverse=True) + + def _load_paper(self, paper_path: str) -> Generator: + from crazy_functions.doc_fns.text_content_loader import TextContentLoader + """加载论文内容""" + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 使用TextContentLoader读取文件 + loader = TextContentLoader(self.chatbot, self.history) + + yield from loader.execute_single_file(paper_path) + + # 获取加载的内容 + if len(self.history) >= 2 and self.history[-2]: + self.paper_content = self.history[-2] + yield from update_ui(chatbot=self.chatbot, history=self.history) + return True + else: + self.chatbot.append(["错误", "无法读取论文内容,请检查文件是否有效"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return False + + def _analyze_question(self, question: PaperQuestion) -> Generator: + """分析单个问题 - 直接显示问题和答案""" + try: + # 创建分析提示 + prompt = f"请基于以下论文内容回答问题:\n\n{self.paper_content}\n\n问题:{question.question}" + + # 使用单线程版本的请求函数 + response = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs=prompt, + inputs_show_user=question.question, # 显示问题本身 + llm_kwargs=self.llm_kwargs, + chatbot=self.chatbot, + history=[], # 空历史,确保每个问题独立分析 + sys_prompt="你是一个专业的科研论文分析助手,需要仔细阅读论文内容并回答问题。请保持客观、准确,并基于论文内容提供深入分析。" + ) + + if response: + self.results[question.id] = response + return True + return False + + except Exception as e: + self.chatbot.append(["错误", f"分析问题时出错: 
{str(e)}"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return False + + def _generate_summary(self) -> Generator: + """生成最终总结报告""" + self.chatbot.append(["生成报告", "正在整合分析结果,生成最终报告..."]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + + summary_prompt = "请基于以下对论文的各个方面的分析,生成一份全面的论文解读报告。报告应该简明扼要地呈现论文的关键内容,并保持逻辑连贯性。" + + for q in self.questions: + if q.id in self.results: + summary_prompt += f"\n\n关于{q.description}的分析:\n{self.results[q.id]}" + + try: + # 使用单线程版本的请求函数,可以在前端实时显示生成结果 + response = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs=summary_prompt, + inputs_show_user="生成论文解读报告", + llm_kwargs=self.llm_kwargs, + chatbot=self.chatbot, + history=[], + sys_prompt="你是一个科研论文解读专家,请将多个方面的分析整合为一份完整、连贯、有条理的报告。报告应当重点突出,层次分明,并且保持学术性和客观性。" + ) + + if response: + return response + return "报告生成失败" + + except Exception as e: + self.chatbot.append(["错误", f"生成报告时出错: {str(e)}"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return "报告生成失败: " + str(e) + + def save_report(self, report: str) -> Generator: + """保存分析报告""" + timestamp = time.strftime("%Y%m%d_%H%M%S") + + # 保存为Markdown文件 + try: + md_content = f"# 论文快速解读报告\n\n{report}" + for q in self.questions: + if q.id in self.results: + md_content += f"\n\n## {q.description}\n\n{self.results[q.id]}" + + result_file = write_history_to_file( + history=[md_content], + file_basename=f"论文解读_{timestamp}.md" + ) + + if result_file and os.path.exists(result_file): + promote_file_to_downloadzone(result_file, chatbot=self.chatbot) + self.chatbot.append(["保存成功", f"解读报告已保存至: {os.path.basename(result_file)}"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + else: + self.chatbot.append(["警告", "保存报告成功但找不到文件"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + except Exception as e: + self.chatbot.append(["警告", f"保存报告失败: {str(e)}"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + + def analyze_paper(self, paper_path: str) -> Generator: + """分析论文主流程""" + # 加载论文 + success = yield from self._load_paper(paper_path) + if not success: + return + + # 分析关键问题 - 直接询问每个问题,不显示进度信息 + for question in self.questions: + yield from self._analyze_question(question) + + # 生成总结报告 + final_report = yield from self._generate_summary() + + # 显示最终报告 + # self.chatbot.append(["论文解读报告", final_report]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 保存报告 + yield from self.save_report(final_report) + + +def _find_paper_file(path: str) -> str: + """查找路径中的论文文件(简化版)""" + if os.path.isfile(path): + return path + + # 支持的文件扩展名(按优先级排序) + extensions = ["pdf", "docx", "doc", "txt", "md", "tex"] + + # 简单地遍历目录 + if os.path.isdir(path): + try: + for ext in extensions: + # 手动检查每个可能的文件,而不使用glob + potential_file = os.path.join(path, f"paper.{ext}") + if os.path.exists(potential_file) and os.path.isfile(potential_file): + return potential_file + + # 如果没找到特定命名的文件,检查目录中的所有文件 + for file in os.listdir(path): + file_path = os.path.join(path, file) + if os.path.isfile(file_path): + file_ext = file.split('.')[-1].lower() if '.' 
in file else "" + if file_ext in extensions: + return file_path + except Exception: + pass # 忽略任何错误 + + return None + + +def download_paper_by_id(paper_info, chatbot, history) -> str: + """下载论文并返回保存路径 + + Args: + paper_info: 元组,包含论文ID类型(arxiv或doi)和ID值 + chatbot: 聊天机器人对象 + history: 历史记录 + + Returns: + str: 下载的论文路径或None + """ + from crazy_functions.review_fns.data_sources.scihub_source import SciHub + id_type, paper_id = paper_info + + # 创建保存目录 - 使用时间戳创建唯一文件夹 + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + user_name = chatbot.get_user() if hasattr(chatbot, 'get_user') else "default" + from toolbox import get_log_folder, get_user + base_save_dir = get_log_folder(get_user(chatbot), plugin_name='paper_download') + save_dir = os.path.join(base_save_dir, f"papers_{timestamp}") + if not os.path.exists(save_dir): + os.makedirs(save_dir) + save_path = Path(save_dir) + + chatbot.append([f"下载论文", f"正在下载{'arXiv' if id_type == 'arxiv' else 'DOI'} {paper_id} 的论文..."]) + update_ui(chatbot=chatbot, history=history) + + pdf_path = None + + try: + if id_type == 'arxiv': + # 使用改进的arxiv查询方法 + formatted_id = format_arxiv_id(paper_id) + paper_result = get_arxiv_paper(formatted_id) + + if not paper_result: + chatbot.append([f"下载失败", f"未找到arXiv论文: {paper_id}"]) + update_ui(chatbot=chatbot, history=history) + return None + + # 下载PDF + filename = f"arxiv_{paper_id.replace('/', '_')}.pdf" + pdf_path = str(save_path / filename) + paper_result.download_pdf(filename=pdf_path) + + else: # doi + # 下载DOI + sci_hub = SciHub( + doi=paper_id, + path=save_path + ) + pdf_path = sci_hub.fetch() + + # 检查下载结果 + if pdf_path and os.path.exists(pdf_path): + promote_file_to_downloadzone(pdf_path, chatbot=chatbot) + chatbot.append([f"下载成功", f"已成功下载论文: {os.path.basename(pdf_path)}"]) + update_ui(chatbot=chatbot, history=history) + return pdf_path + else: + chatbot.append([f"下载失败", f"论文下载失败: {paper_id}"]) + update_ui(chatbot=chatbot, history=history) + return None + + except Exception as e: + chatbot.append([f"下载错误", f"下载论文时出错: {str(e)}"]) + update_ui(chatbot=chatbot, history=history) + return None + + +@CatchException +def 快速论文解读(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List, + history: List, system_prompt: str, user_request: str): + """主函数 - 论文快速解读""" + # 初始化分析器 + chatbot.append(["函数插件功能及使用方式", "论文快速解读:通过分析论文的关键要素,帮助您迅速理解论文内容,适用于各学科领域的科研论文。
2、点击插件开始分析"]) + yield from update_ui(chatbot=chatbot, history=history) + + paper_file = None + + # 检查输入是否为论文ID(arxiv或DOI) + paper_info = extract_paper_id(txt) + + if paper_info: + # 如果是论文ID,下载论文 + chatbot.append(["检测到论文ID", f"检测到{'arXiv' if paper_info[0] == 'arxiv' else 'DOI'} ID: {paper_info[1]},准备下载论文..."]) + yield from update_ui(chatbot=chatbot, history=history) + + # 下载论文 - 完全重新实现 + paper_file = download_paper_by_id(paper_info, chatbot, history) + + if not paper_file: + report_exception(chatbot, history, a=f"下载论文失败", b=f"无法下载{'arXiv' if paper_info[0] == 'arxiv' else 'DOI'}论文: {paper_info[1]}") + yield from update_ui(chatbot=chatbot, history=history) + return + else: + # 检查输入路径 + if not os.path.exists(txt): + report_exception(chatbot, history, a=f"解析论文: {txt}", b=f"找不到文件或无权访问: {txt}") + yield from update_ui(chatbot=chatbot, history=history) + return + + # 验证路径安全性 + user_name = chatbot.get_user() + validate_path_safety(txt, user_name) + + # 查找论文文件 + paper_file = _find_paper_file(txt) + + if not paper_file: + report_exception(chatbot, history, a=f"解析论文", b=f"在路径 {txt} 中未找到支持的论文文件") + yield from update_ui(chatbot=chatbot, history=history) + return + + yield from update_ui(chatbot=chatbot, history=history) + + # 增加调试信息,检查paper_file的类型和值 + chatbot.append(["文件类型检查", f"paper_file类型: {type(paper_file)}, 值: {paper_file}"]) + yield from update_ui(chatbot=chatbot, history=history) + chatbot.pop() # 移除调试信息 + + # 确保paper_file是字符串 + if paper_file is not None and not isinstance(paper_file, str): + # 尝试转换为字符串 + try: + paper_file = str(paper_file) + except: + report_exception(chatbot, history, a=f"类型错误", b=f"论文路径不是有效的字符串: {type(paper_file)}") + yield from update_ui(chatbot=chatbot, history=history) + return + + # 分析论文 + chatbot.append(["开始分析", f"正在分析论文: {os.path.basename(paper_file)}"]) + yield from update_ui(chatbot=chatbot, history=history) + + analyzer = PaperAnalyzer(llm_kwargs, plugin_kwargs, chatbot, history, system_prompt) + yield from analyzer.analyze_paper(paper_file) \ No newline at end of file diff --git a/crazy_functions/paper_fns/__init__.py b/crazy_functions/paper_fns/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/crazy_functions/paper_fns/auto_git/handlers/base_handler.py b/crazy_functions/paper_fns/auto_git/handlers/base_handler.py new file mode 100644 index 00000000..9852a910 --- /dev/null +++ b/crazy_functions/paper_fns/auto_git/handlers/base_handler.py @@ -0,0 +1,386 @@ +from abc import ABC, abstractmethod +from typing import List, Dict, Any +from ..query_analyzer import SearchCriteria +from ..sources.github_source import GitHubSource +import asyncio +import re +from datetime import datetime + +class BaseHandler(ABC): + """处理器基类""" + + def __init__(self, github: GitHubSource, llm_kwargs: Dict = None): + self.github = github + self.llm_kwargs = llm_kwargs or {} + self.ranked_repos = [] # 存储排序后的仓库列表 + + def _get_search_params(self, plugin_kwargs: Dict) -> Dict: + """获取搜索参数""" + return { + 'max_repos': plugin_kwargs.get('max_repos', 150), # 最大仓库数量,从30改为150 + 'max_details': plugin_kwargs.get('max_details', 80), # 最多展示详情的仓库数量,新增参数 + 'search_multiplier': plugin_kwargs.get('search_multiplier', 3), # 检索倍数 + 'min_stars': plugin_kwargs.get('min_stars', 0), # 最少星标数 + } + + @abstractmethod + async def handle( + self, + criteria: SearchCriteria, + chatbot: List[List[str]], + history: List[List[str]], + system_prompt: str, + llm_kwargs: Dict[str, Any], + plugin_kwargs: Dict[str, Any], + ) -> str: + """处理查询""" + pass + + async def _search_repositories(self, query: str, 
language: str = None, min_stars: int = 0, + sort: str = "stars", per_page: int = 30) -> List[Dict]: + """搜索仓库""" + try: + # 构建查询字符串 + if min_stars > 0 and "stars:>" not in query: + query += f" stars:>{min_stars}" + + if language and "language:" not in query: + query += f" language:{language}" + + # 执行搜索 + result = await self.github.search_repositories( + query=query, + sort=sort, + per_page=per_page + ) + + if result and "items" in result: + return result["items"] + return [] + except Exception as e: + print(f"仓库搜索出错: {str(e)}") + return [] + + async def _search_bilingual_repositories(self, english_query: str, chinese_query: str, language: str = None, min_stars: int = 0, + sort: str = "stars", per_page: int = 30) -> List[Dict]: + """同时搜索中英文仓库并合并结果""" + try: + # 搜索英文仓库 + english_results = await self._search_repositories( + query=english_query, + language=language, + min_stars=min_stars, + sort=sort, + per_page=per_page + ) + + # 搜索中文仓库 + chinese_results = await self._search_repositories( + query=chinese_query, + language=language, + min_stars=min_stars, + sort=sort, + per_page=per_page + ) + + # 合并结果,去除重复项 + merged_results = [] + seen_repos = set() + + # 优先添加英文结果 + for repo in english_results: + repo_id = repo.get('id') + if repo_id and repo_id not in seen_repos: + seen_repos.add(repo_id) + merged_results.append(repo) + + # 添加中文结果(排除重复) + for repo in chinese_results: + repo_id = repo.get('id') + if repo_id and repo_id not in seen_repos: + seen_repos.add(repo_id) + merged_results.append(repo) + + # 按星标数重新排序 + merged_results.sort(key=lambda x: x.get('stargazers_count', 0), reverse=True) + + return merged_results[:per_page] # 返回合并后的前per_page个结果 + except Exception as e: + print(f"双语仓库搜索出错: {str(e)}") + return [] + + async def _search_code(self, query: str, language: str = None, per_page: int = 30) -> List[Dict]: + """搜索代码""" + try: + # 构建查询字符串 + if language and "language:" not in query: + query += f" language:{language}" + + # 执行搜索 + result = await self.github.search_code( + query=query, + per_page=per_page + ) + + if result and "items" in result: + return result["items"] + return [] + except Exception as e: + print(f"代码搜索出错: {str(e)}") + return [] + + async def _search_bilingual_code(self, english_query: str, chinese_query: str, language: str = None, per_page: int = 30) -> List[Dict]: + """同时搜索中英文代码并合并结果""" + try: + # 搜索英文代码 + english_results = await self._search_code( + query=english_query, + language=language, + per_page=per_page + ) + + # 搜索中文代码 + chinese_results = await self._search_code( + query=chinese_query, + language=language, + per_page=per_page + ) + + # 合并结果,去除重复项 + merged_results = [] + seen_files = set() + + # 优先添加英文结果 + for item in english_results: + # 使用文件URL作为唯一标识 + file_url = item.get('html_url', '') + if file_url and file_url not in seen_files: + seen_files.add(file_url) + merged_results.append(item) + + # 添加中文结果(排除重复) + for item in chinese_results: + file_url = item.get('html_url', '') + if file_url and file_url not in seen_files: + seen_files.add(file_url) + merged_results.append(item) + + # 对结果进行排序,优先显示匹配度高的结果 + # 由于无法直接获取匹配度,这里使用仓库的星标数作为替代指标 + merged_results.sort(key=lambda x: x.get('repository', {}).get('stargazers_count', 0), reverse=True) + + return merged_results[:per_page] # 返回合并后的前per_page个结果 + except Exception as e: + print(f"双语代码搜索出错: {str(e)}") + return [] + + async def _search_users(self, query: str, per_page: int = 30) -> List[Dict]: + """搜索用户""" + try: + result = await self.github.search_users( + query=query, + per_page=per_page + ) + + if result and "items" 
in result: + return result["items"] + return [] + except Exception as e: + print(f"用户搜索出错: {str(e)}") + return [] + + async def _search_bilingual_users(self, english_query: str, chinese_query: str, per_page: int = 30) -> List[Dict]: + """同时搜索中英文用户并合并结果""" + try: + # 搜索英文用户 + english_results = await self._search_users( + query=english_query, + per_page=per_page + ) + + # 搜索中文用户 + chinese_results = await self._search_users( + query=chinese_query, + per_page=per_page + ) + + # 合并结果,去除重复项 + merged_results = [] + seen_users = set() + + # 优先添加英文结果 + for user in english_results: + user_id = user.get('id') + if user_id and user_id not in seen_users: + seen_users.add(user_id) + merged_results.append(user) + + # 添加中文结果(排除重复) + for user in chinese_results: + user_id = user.get('id') + if user_id and user_id not in seen_users: + seen_users.add(user_id) + merged_results.append(user) + + # 按关注者数量进行排序 + merged_results.sort(key=lambda x: x.get('followers', 0), reverse=True) + + return merged_results[:per_page] # 返回合并后的前per_page个结果 + except Exception as e: + print(f"双语用户搜索出错: {str(e)}") + return [] + + async def _search_topics(self, query: str, per_page: int = 30) -> List[Dict]: + """搜索主题""" + try: + result = await self.github.search_topics( + query=query, + per_page=per_page + ) + + if result and "items" in result: + return result["items"] + return [] + except Exception as e: + print(f"主题搜索出错: {str(e)}") + return [] + + async def _search_bilingual_topics(self, english_query: str, chinese_query: str, per_page: int = 30) -> List[Dict]: + """同时搜索中英文主题并合并结果""" + try: + # 搜索英文主题 + english_results = await self._search_topics( + query=english_query, + per_page=per_page + ) + + # 搜索中文主题 + chinese_results = await self._search_topics( + query=chinese_query, + per_page=per_page + ) + + # 合并结果,去除重复项 + merged_results = [] + seen_topics = set() + + # 优先添加英文结果 + for topic in english_results: + topic_name = topic.get('name') + if topic_name and topic_name not in seen_topics: + seen_topics.add(topic_name) + merged_results.append(topic) + + # 添加中文结果(排除重复) + for topic in chinese_results: + topic_name = topic.get('name') + if topic_name and topic_name not in seen_topics: + seen_topics.add(topic_name) + merged_results.append(topic) + + # 可以按流行度进行排序(如果有) + if merged_results and 'featured' in merged_results[0]: + merged_results.sort(key=lambda x: x.get('featured', False), reverse=True) + + return merged_results[:per_page] # 返回合并后的前per_page个结果 + except Exception as e: + print(f"双语主题搜索出错: {str(e)}") + return [] + + async def _get_repo_details(self, repos: List[Dict]) -> List[Dict]: + """获取仓库详细信息""" + enhanced_repos = [] + + for repo in repos: + try: + # 获取README信息 + owner = repo.get('owner', {}).get('login') if repo.get('owner') is not None else None + repo_name = repo.get('name') + + if owner and repo_name: + readme = await self.github.get_repo_readme(owner, repo_name) + if readme and "decoded_content" in readme: + # 提取README的前1000个字符作为摘要 + repo['readme_excerpt'] = readme["decoded_content"][:1000] + "..." 
+ + # 获取语言使用情况 + languages = await self.github.get_repository_languages(owner, repo_name) + if languages: + repo['languages_detail'] = languages + + # 获取最新发布版本 + releases = await self.github.get_repo_releases(owner, repo_name, per_page=1) + if releases and len(releases) > 0: + repo['latest_release'] = releases[0] + + # 获取主题标签 + topics = await self.github.get_repo_topics(owner, repo_name) + if topics and "names" in topics: + repo['topics'] = topics["names"] + + enhanced_repos.append(repo) + except Exception as e: + print(f"获取仓库 {repo.get('full_name')} 详情时出错: {str(e)}") + enhanced_repos.append(repo) # 添加原始仓库信息 + + return enhanced_repos + + def _format_repos(self, repos: List[Dict]) -> str: + """格式化仓库列表""" + formatted = [] + + for i, repo in enumerate(repos, 1): + # 构建仓库URL + repo_url = repo.get('html_url', '') + + # 构建完整的引用 + reference = ( + f"{i}. **{repo.get('full_name', '')}**\n" + f" - 描述: {repo.get('description', 'N/A')}\n" + f" - 语言: {repo.get('language', 'N/A')}\n" + f" - 星标: {repo.get('stargazers_count', 0)}\n" + f" - Fork数: {repo.get('forks_count', 0)}\n" + f" - 更新时间: {repo.get('updated_at', 'N/A')[:10]}\n" + f" - 创建时间: {repo.get('created_at', 'N/A')[:10]}\n" + f" - URL: {repo_url}\n" + ) + + # 添加主题标签(如果有) + if repo.get('topics'): + topics_str = ", ".join(repo.get('topics')) + reference += f" - 主题标签: {topics_str}\n" + + # 添加最新发布版本(如果有) + if repo.get('latest_release'): + release = repo.get('latest_release') + reference += f" - 最新版本: {release.get('tag_name', 'N/A')} ({release.get('published_at', 'N/A')[:10]})\n" + + # 添加README摘要(如果有) + if repo.get('readme_excerpt'): + # 截断README,只取前300个字符 + readme_short = repo.get('readme_excerpt')[:300].replace('\n', ' ') + reference += f" - README摘要: {readme_short}...\n" + + formatted.append(reference) + + return "\n".join(formatted) + + def _generate_apology_prompt(self, criteria: SearchCriteria) -> str: + """生成道歉提示""" + return f"""很抱歉,我们未能找到与"{criteria.main_topic}"相关的GitHub项目。 + +可能的原因: +1. 搜索词过于具体或冷门 +2. 星标数要求过高 +3. 编程语言限制过于严格 + +建议解决方案: + 1. 尝试使用更通用的关键词 + 2. 降低最低星标数要求 + 3. 
移除或更改编程语言限制 +请根据以上建议调整后重试。""" + + def _get_current_time(self) -> str: + """获取当前时间信息""" + now = datetime.now() + return now.strftime("%Y年%m月%d日") \ No newline at end of file diff --git a/crazy_functions/paper_fns/auto_git/handlers/code_handler.py b/crazy_functions/paper_fns/auto_git/handlers/code_handler.py new file mode 100644 index 00000000..3f672a22 --- /dev/null +++ b/crazy_functions/paper_fns/auto_git/handlers/code_handler.py @@ -0,0 +1,156 @@ +from typing import List, Dict, Any +from .base_handler import BaseHandler +from ..query_analyzer import SearchCriteria +import asyncio + +class CodeSearchHandler(BaseHandler): + """代码搜索处理器""" + + def __init__(self, github, llm_kwargs=None): + super().__init__(github, llm_kwargs) + + async def handle( + self, + criteria: SearchCriteria, + chatbot: List[List[str]], + history: List[List[str]], + system_prompt: str, + llm_kwargs: Dict[str, Any], + plugin_kwargs: Dict[str, Any], + ) -> str: + """处理代码搜索请求,返回最终的prompt""" + + search_params = self._get_search_params(plugin_kwargs) + + # 搜索代码 + code_results = await self._search_bilingual_code( + english_query=criteria.github_params["query"], + chinese_query=criteria.github_params["chinese_query"], + language=criteria.language, + per_page=search_params['max_repos'] + ) + + if not code_results: + return self._generate_apology_prompt(criteria) + + # 获取代码文件内容 + enhanced_code_results = await self._get_code_details(code_results[:search_params['max_details']]) + self.ranked_repos = [item["repository"] for item in enhanced_code_results if "repository" in item] + + if not enhanced_code_results: + return self._generate_apology_prompt(criteria) + + # 构建最终的prompt + current_time = self._get_current_time() + final_prompt = f"""当前时间: {current_time} + +基于用户对{criteria.main_topic}的查询,我找到了以下代码示例。 + +代码搜索结果: +{self._format_code_results(enhanced_code_results)} + +请提供: + +1. 对于搜索的"{criteria.main_topic}"主题的综合解释: + - 概念和原理介绍 + - 常见实现方法和技术 + - 最佳实践和注意事项 + +2. 对每个代码示例: + - 解释代码的主要功能和实现方式 + - 分析代码质量、可读性和效率 + - 指出代码中的亮点和潜在改进空间 + - 说明代码的适用场景 + +3. 代码实现比较: + - 不同实现方法的优缺点 + - 性能和可维护性分析 + - 适用不同场景的实现建议 + +4. 
学习建议:
   - 理解和使用这些代码需要的背景知识
   - 如何扩展或改进所展示的代码
   - 进一步学习相关技术的资源

重要提示:
- 深入解释代码的核心逻辑和实现思路
- 提供专业、技术性的分析
- 优先关注代码的实现质量和技术价值
- 当代码实现有问题时,指出并提供改进建议
- 对于复杂代码,分解解释其组成部分
- 根据用户查询的具体问题提供针对性答案
- 所有链接请使用 <a href="链接地址" target="_blank">链接文本</a> 格式,确保链接在新窗口打开

使用markdown格式提供清晰的分节回复。
"""

        return final_prompt

    async def _get_code_details(self, code_results: List[Dict]) -> List[Dict]:
        """获取代码详情"""
        enhanced_results = []

        for item in code_results:
            try:
                repo = item.get('repository', {})
                file_path = item.get('path', '')
                repo_name = repo.get('full_name', '')

                if repo_name and file_path:
                    owner, repo_name = repo_name.split('/')

                    # 获取文件内容
                    file_content = await self.github.get_file_content(owner, repo_name, file_path)
                    if file_content and "decoded_content" in file_content:
                        item['code_content'] = file_content["decoded_content"]

                    # 获取仓库基本信息
                    repo_details = await self.github.get_repo(owner, repo_name)
                    if repo_details:
                        item['repository'] = repo_details

                enhanced_results.append(item)
            except Exception as e:
                print(f"获取代码详情时出错: {str(e)}")
                enhanced_results.append(item)  # 添加原始信息

        return enhanced_results

    def _format_code_results(self, code_results: List[Dict]) -> str:
        """格式化代码搜索结果"""
        formatted = []

        for i, item in enumerate(code_results, 1):
            # 构建仓库信息
            repo = item.get('repository', {})
            repo_name = repo.get('full_name', 'N/A')
            repo_url = repo.get('html_url', '')
            stars = repo.get('stargazers_count', 0)
            language = repo.get('language', 'N/A')

            # 构建文件信息
            file_path = item.get('path', 'N/A')
            file_url = item.get('html_url', '')

            # 构建代码内容
            code_content = item.get('code_content', '')
            if code_content:
                # 只显示前30行代码
                code_lines = code_content.split("\n")
                if len(code_lines) > 30:
                    displayed_code = "\n".join(code_lines[:30]) + "\n... (代码太长已截断) ..."
                else:
                    displayed_code = code_content
            else:
                displayed_code = "(代码内容获取失败)"

            reference = (
                f"### {i}. 
{file_path} (在 {repo_name} 中)\n\n" + f"- **仓库**: {repo_name} (⭐ {stars}, 语言: {language})\n" + f"- **文件路径**: {file_path}\n\n" + f"```{language.lower()}\n{displayed_code}\n```\n\n" + ) + + formatted.append(reference) + + return "\n".join(formatted) \ No newline at end of file diff --git a/crazy_functions/paper_fns/auto_git/handlers/repo_handler.py b/crazy_functions/paper_fns/auto_git/handlers/repo_handler.py new file mode 100644 index 00000000..2038c2eb --- /dev/null +++ b/crazy_functions/paper_fns/auto_git/handlers/repo_handler.py @@ -0,0 +1,192 @@ +from typing import List, Dict, Any +from .base_handler import BaseHandler +from ..query_analyzer import SearchCriteria +import asyncio + +class RepositoryHandler(BaseHandler): + """仓库搜索处理器""" + + def __init__(self, github, llm_kwargs=None): + super().__init__(github, llm_kwargs) + + async def handle( + self, + criteria: SearchCriteria, + chatbot: List[List[str]], + history: List[List[str]], + system_prompt: str, + llm_kwargs: Dict[str, Any], + plugin_kwargs: Dict[str, Any], + ) -> str: + """处理仓库搜索请求,返回最终的prompt""" + + search_params = self._get_search_params(plugin_kwargs) + + # 如果是特定仓库查询 + if criteria.repo_id: + try: + owner, repo = criteria.repo_id.split('/') + repo_details = await self.github.get_repo(owner, repo) + if repo_details: + # 获取推荐的相似仓库 + similar_repos = await self.github.get_repo_recommendations(criteria.repo_id, limit=5) + + # 添加详细信息 + all_repos = [repo_details] + similar_repos + enhanced_repos = await self._get_repo_details(all_repos) + + self.ranked_repos = enhanced_repos + + # 构建最终的prompt + current_time = self._get_current_time() + final_prompt = self._build_repo_detail_prompt(enhanced_repos[0], enhanced_repos[1:], current_time) + return final_prompt + else: + return self._generate_apology_prompt(criteria) + except Exception as e: + print(f"处理特定仓库时出错: {str(e)}") + return self._generate_apology_prompt(criteria) + + # 一般仓库搜索 + repos = await self._search_bilingual_repositories( + english_query=criteria.github_params["query"], + chinese_query=criteria.github_params["chinese_query"], + language=criteria.language, + min_stars=criteria.min_stars, + per_page=search_params['max_repos'] + ) + + if not repos: + return self._generate_apology_prompt(criteria) + + # 获取仓库详情 + enhanced_repos = await self._get_repo_details(repos[:search_params['max_details']]) # 使用max_details参数 + self.ranked_repos = enhanced_repos + + if not enhanced_repos: + return self._generate_apology_prompt(criteria) + + # 构建最终的prompt + current_time = self._get_current_time() + final_prompt = f"""当前时间: {current_time} + +基于用户对{criteria.main_topic}的兴趣,以下是相关的GitHub仓库。 + +可供推荐的GitHub仓库: +{self._format_repos(enhanced_repos)} + +请提供: +1. 按功能、用途或成熟度对仓库进行分组 + +2. 对每个仓库: + - 简要描述其主要功能和用途 + - 分析其技术特点和优势 + - 说明其适用场景和使用难度 + - 指出其与同类产品相比的独特优势 + - 解释其星标数量和活跃度代表的意义 + +3. 使用建议: + - 新手最适合入门的仓库 + - 生产环境中最稳定可靠的选择 + - 最新技术栈或创新方案的代表 + - 学习特定技术的最佳资源 + +4. 
相关资源:
   - 学习这些项目需要的前置知识
   - 项目间的关联和技术栈兼容性
   - 可能的使用组合方案

重要提示:
- 重点解释为什么每个仓库值得关注
- 突出项目间的关联性和差异性
- 考虑用户不同水平的需求(初学者vs专业人士)
- 在介绍项目时,请使用 <a href="仓库地址" target="_blank">仓库名</a> 格式,确保链接在新窗口打开
- 根据仓库的活跃度、更新频率、维护状态提供使用建议
- 仅基于提供的信息,不要做无根据的猜测
- 在信息缺失或不明确时,坦诚说明

使用markdown格式提供清晰的分节回复。
"""

        return final_prompt

    def _build_repo_detail_prompt(self, main_repo: Dict, similar_repos: List[Dict], current_time: str) -> str:
        """构建仓库详情prompt"""

        # 提取README摘要
        readme_content = "未提供"
        if main_repo.get('readme_excerpt'):
            readme_content = main_repo.get('readme_excerpt')

        # 构建语言分布
        languages = main_repo.get('languages_detail', {})
        lang_distribution = []
        if languages:
            total = sum(languages.values())
            for lang, bytes_val in languages.items():
                percentage = (bytes_val / total) * 100
                lang_distribution.append(f"{lang}: {percentage:.1f}%")

        lang_str = "未知"
        if lang_distribution:
            lang_str = ", ".join(lang_distribution)

        # 构建最终prompt
        prompt = f"""当前时间: {current_time}

## 主要仓库信息

### {main_repo.get('full_name')}

- **描述**: {main_repo.get('description', '未提供')}
- **星标数**: {main_repo.get('stargazers_count', 0)}
- **Fork数**: {main_repo.get('forks_count', 0)}
- **Watch数**: {main_repo.get('watchers_count', 0)}
- **Issues数**: {main_repo.get('open_issues_count', 0)}
- **语言分布**: {lang_str}
- **许可证**: {main_repo.get('license', {}).get('name', '未指定') if main_repo.get('license') is not None else '未指定'}
- **创建时间**: {main_repo.get('created_at', '')[:10]}
- **最近更新**: {main_repo.get('updated_at', '')[:10]}
- **主题标签**: {', '.join(main_repo.get('topics', ['无']))}
- **GitHub链接**: <a href="{main_repo.get('html_url', '')}" target="_blank">链接</a>

### README摘要:
{readme_content}

## 类似仓库:
{self._format_repos(similar_repos)}

请提供以下内容:

1. **项目概述**
   - 详细解释{main_repo.get('name', '')}项目的主要功能和用途
   - 分析其技术特点、架构和实现原理
   - 讨论其在所属领域的地位和影响力
   - 评估项目成熟度和稳定性

2. **优势与特点**
   - 与同类项目相比的独特优势
   - 显著的技术创新或设计模式
   - 值得学习或借鉴的代码实践

3. **使用场景**
   - 最适合的应用场景
   - 潜在的使用限制和注意事项
   - 入门门槛和学习曲线评估
   - 产品级应用的可行性分析

4. **资源与生态**
   - 相关学习资源推荐
   - 配套工具和库的建议
   - 社区支持和活跃度评估

5. 
**类似项目对比**
   - 与列出的类似项目的详细对比
   - 不同场景下的最佳选择建议
   - 潜在的互补使用方案

提示:所有链接请使用 <a href="链接地址" target="_blank">链接文本</a> 格式,确保链接在新窗口打开。

请以专业、客观的技术分析角度回答,使用markdown格式提供结构化信息。
"""
        return prompt
\ No newline at end of file
diff --git a/crazy_functions/paper_fns/auto_git/handlers/topic_handler.py b/crazy_functions/paper_fns/auto_git/handlers/topic_handler.py
new file mode 100644
index 00000000..6d6b4637
--- /dev/null
+++ b/crazy_functions/paper_fns/auto_git/handlers/topic_handler.py
@@ -0,0 +1,217 @@
from typing import List, Dict, Any
from .base_handler import BaseHandler
from ..query_analyzer import SearchCriteria
import asyncio

class TopicHandler(BaseHandler):
    """主题搜索处理器"""

    def __init__(self, github, llm_kwargs=None):
        super().__init__(github, llm_kwargs)

    async def handle(
        self,
        criteria: SearchCriteria,
        chatbot: List[List[str]],
        history: List[List[str]],
        system_prompt: str,
        llm_kwargs: Dict[str, Any],
        plugin_kwargs: Dict[str, Any],
    ) -> str:
        """处理主题搜索请求,返回最终的prompt"""

        search_params = self._get_search_params(plugin_kwargs)

        # 搜索主题
        topics = await self._search_bilingual_topics(
            english_query=criteria.github_params["query"],
            chinese_query=criteria.github_params["chinese_query"],
            per_page=search_params['max_repos']
        )

        if not topics:
            # 尝试用主题搜索仓库
            search_query = criteria.github_params["query"]
            chinese_search_query = criteria.github_params["chinese_query"]
            if "topic:" not in search_query:
                search_query += " topic:" + criteria.main_topic.replace(" ", "-")
            if "topic:" not in chinese_search_query:
                chinese_search_query += " topic:" + criteria.main_topic.replace(" ", "-")

            repos = await self._search_bilingual_repositories(
                english_query=search_query,
                chinese_query=chinese_search_query,
                language=criteria.language,
                min_stars=criteria.min_stars,
                per_page=search_params['max_repos']
            )

            if not repos:
                return self._generate_apology_prompt(criteria)

            # 获取仓库详情
            enhanced_repos = await self._get_repo_details(repos[:10])
            self.ranked_repos = enhanced_repos

            if not enhanced_repos:
                return self._generate_apology_prompt(criteria)

            # 构建基于主题的仓库列表prompt
            current_time = self._get_current_time()
            final_prompt = f"""当前时间: {current_time}

基于用户对主题"{criteria.main_topic}"的查询,我找到了以下相关GitHub仓库。

主题相关仓库:
{self._format_repos(enhanced_repos)}

请提供:

1. 主题综述:
   - "{criteria.main_topic}"主题的概述和重要性
   - 该主题在技术领域中的应用和发展趋势
   - 主题相关的主要技术栈和知识体系

2. 仓库分析:
   - 按功能、技术栈或应用场景对仓库进行分类
   - 每个仓库在该主题领域的定位和贡献
   - 不同仓库间的技术路线对比

3. 学习路径建议:
   - 初学者入门该主题的推荐仓库和学习顺序
   - 进阶学习的关键仓库和技术要点
   - 实际应用中的最佳实践选择

4. 
技术生态分析: + - 该主题下的主流工具和库 + - 社区活跃度和维护状况 + - 与其他相关技术的集成方案 + +重要提示: +- 主题"{criteria.main_topic}"是用户查询的核心,请围绕此主题展开分析 +- 注重仓库质量评估和使用建议 +- 提供基于事实的客观技术分析 +- 在介绍仓库时使用链接文本格式,确保链接在新窗口打开 +- 考虑不同技术水平用户的需求 + +使用markdown格式提供清晰的分节回复。 +""" + return final_prompt + + # 如果找到了主题,则获取主题下的热门仓库 + topic_repos = [] + for topic in topics[:5]: # 增加到5个主题 + topic_name = topic.get('name', '') + if topic_name: + # 搜索该主题下的仓库 + repos = await self._search_repositories( + query=f"topic:{topic_name}", + language=criteria.language, + min_stars=criteria.min_stars, + per_page=20 # 每个主题最多20个仓库 + ) + + if repos: + for repo in repos: + repo['topic_source'] = topic_name + topic_repos.append(repo) + + if not topic_repos: + return self._generate_apology_prompt(criteria) + + # 获取前N个仓库的详情 + enhanced_repos = await self._get_repo_details(topic_repos[:search_params['max_details']]) + self.ranked_repos = enhanced_repos + + if not enhanced_repos: + return self._generate_apology_prompt(criteria) + + # 构建最终的prompt + current_time = self._get_current_time() + final_prompt = f"""当前时间: {current_time} + +基于用户对"{criteria.main_topic}"主题的查询,我找到了以下相关GitHub主题和仓库。 + +主题相关仓库: +{self._format_topic_repos(enhanced_repos)} + +请提供: + +1. 主题概述: + - 对"{criteria.main_topic}"相关主题的介绍和技术背景 + - 这些主题在软件开发中的重要性和应用范围 + - 主题间的关联性和技术演进路径 + +2. 精选仓库分析: + - 每个主题下最具代表性的仓库详解 + - 仓库的技术亮点和创新点 + - 使用场景和技术成熟度评估 + +3. 技术趋势分析: + - 基于主题和仓库活跃度的技术发展趋势 + - 新兴解决方案和传统方案的对比 + - 未来可能的技术方向预测 + +4. 实践建议: + - 不同应用场景下的最佳仓库选择 + - 学习路径和资源推荐 + - 实际项目中的应用策略 + +重要提示: +- 将分析重点放在主题的技术内涵和价值上 +- 突出主题间的关联性和技术演进脉络 +- 提供基于数据(星标数、更新频率等)的客观分析 +- 考虑不同技术背景用户的需求 +- 所有链接请使用链接文本格式,确保链接在新窗口打开 + +使用markdown格式提供清晰的分节回复。 +""" + + return final_prompt + + def _format_topic_repos(self, repos: List[Dict]) -> str: + """按主题格式化仓库列表""" + # 按主题分组 + topics_dict = {} + for repo in repos: + topic = repo.get('topic_source', '其他') + if topic not in topics_dict: + topics_dict[topic] = [] + topics_dict[topic].append(repo) + + # 格式化输出 + formatted = [] + for topic, topic_repos in topics_dict.items(): + formatted.append(f"## 主题: {topic}\n") + + for i, repo in enumerate(topic_repos, 1): + # 构建仓库URL + repo_url = repo.get('html_url', '') + + # 构建引用 + reference = ( + f"{i}. 
**{repo.get('full_name', '')}**\n" + f" - 描述: {repo.get('description', 'N/A')}\n" + f" - 语言: {repo.get('language', 'N/A')}\n" + f" - 星标: {repo.get('stargazers_count', 0)}\n" + f" - Fork数: {repo.get('forks_count', 0)}\n" + f" - 更新时间: {repo.get('updated_at', 'N/A')[:10]}\n" + f" - URL: {repo_url}\n" + ) + + # 添加主题标签(如果有) + if repo.get('topics'): + topics_str = ", ".join(repo.get('topics')) + reference += f" - 主题标签: {topics_str}\n" + + # 添加README摘要(如果有) + if repo.get('readme_excerpt'): + # 截断README,只取前200个字符 + readme_short = repo.get('readme_excerpt')[:200].replace('\n', ' ') + reference += f" - README摘要: {readme_short}...\n" + + formatted.append(reference) + + formatted.append("\n") # 主题之间添加空行 + + return "\n".join(formatted) \ No newline at end of file diff --git a/crazy_functions/paper_fns/auto_git/handlers/user_handler.py b/crazy_functions/paper_fns/auto_git/handlers/user_handler.py new file mode 100644 index 00000000..923d0e90 --- /dev/null +++ b/crazy_functions/paper_fns/auto_git/handlers/user_handler.py @@ -0,0 +1,164 @@ +from typing import List, Dict, Any +from .base_handler import BaseHandler +from ..query_analyzer import SearchCriteria +import asyncio + +class UserSearchHandler(BaseHandler): + """用户搜索处理器""" + + def __init__(self, github, llm_kwargs=None): + super().__init__(github, llm_kwargs) + + async def handle( + self, + criteria: SearchCriteria, + chatbot: List[List[str]], + history: List[List[str]], + system_prompt: str, + llm_kwargs: Dict[str, Any], + plugin_kwargs: Dict[str, Any], + ) -> str: + """处理用户搜索请求,返回最终的prompt""" + + search_params = self._get_search_params(plugin_kwargs) + + # 搜索用户 + users = await self._search_bilingual_users( + english_query=criteria.github_params["query"], + chinese_query=criteria.github_params["chinese_query"], + per_page=search_params['max_repos'] + ) + + if not users: + return self._generate_apology_prompt(criteria) + + # 获取用户详情和仓库 + enhanced_users = await self._get_user_details(users[:search_params['max_details']]) + self.ranked_repos = [] # 添加用户top仓库进行展示 + + for user in enhanced_users: + if user.get('top_repos'): + self.ranked_repos.extend(user.get('top_repos')) + + if not enhanced_users: + return self._generate_apology_prompt(criteria) + + # 构建最终的prompt + current_time = self._get_current_time() + final_prompt = f"""当前时间: {current_time} + +基于用户对{criteria.main_topic}的查询,我找到了以下GitHub用户。 + +GitHub用户搜索结果: +{self._format_users(enhanced_users)} + +请提供: + +1. 用户综合分析: + - 各开发者的专业领域和技术专长 + - 他们在GitHub开源社区的影响力 + - 技术实力和项目质量评估 + +2. 对每位开发者: + - 其主要贡献领域和技术栈 + - 代表性项目及其价值 + - 编程风格和技术特点 + - 在相关领域的影响力 + +3. 项目推荐: + - 针对用户查询的最有价值项目 + - 值得学习和借鉴的代码实践 + - 不同用户项目的相互补充关系 + +4. 
如何学习和使用: + - 如何从这些开发者项目中学习 + - 最适合入门学习的项目 + - 进阶学习的路径建议 + +重要提示: +- 关注开发者的技术专长和核心贡献 +- 分析其开源项目的技术价值 +- 根据用户的原始查询提供相关建议 +- 避免过度赞美或主观评价 +- 基于事实数据(项目数、星标数等)进行客观分析 +- 所有链接请使用链接文本格式,确保链接在新窗口打开 + +使用markdown格式提供清晰的分节回复。 +""" + + return final_prompt + + async def _get_user_details(self, users: List[Dict]) -> List[Dict]: + """获取用户详情和仓库""" + enhanced_users = [] + + for user in users: + try: + username = user.get('login') + + if username: + # 获取用户详情 + user_details = await self.github.get_user(username) + if user_details: + user.update(user_details) + + # 获取用户仓库 + repos = await self.github.get_user_repos( + username, + sort="stars", + per_page=10 # 增加到10个仓库 + ) + if repos: + user['top_repos'] = repos + + enhanced_users.append(user) + except Exception as e: + print(f"获取用户 {user.get('login')} 详情时出错: {str(e)}") + enhanced_users.append(user) # 添加原始信息 + + return enhanced_users + + def _format_users(self, users: List[Dict]) -> str: + """格式化用户列表""" + formatted = [] + + for i, user in enumerate(users, 1): + # 构建用户信息 + username = user.get('login', 'N/A') + name = user.get('name', username) + profile_url = user.get('html_url', '') + bio = user.get('bio', '无简介') + followers = user.get('followers', 0) + public_repos = user.get('public_repos', 0) + company = user.get('company', '未指定') + location = user.get('location', '未指定') + blog = user.get('blog', '') + + user_info = ( + f"### {i}. {name} (@{username})\n\n" + f"- **简介**: {bio}\n" + f"- **关注者**: {followers} | **公开仓库**: {public_repos}\n" + f"- **公司**: {company} | **地点**: {location}\n" + f"- **个人网站**: {blog}\n" + f"- **GitHub**: {username}\n\n" + ) + + # 添加用户的热门仓库 + top_repos = user.get('top_repos', []) + if top_repos: + user_info += "**热门仓库**:\n\n" + for repo in top_repos: + repo_name = repo.get('name', '') + repo_url = repo.get('html_url', '') + repo_desc = repo.get('description', '无描述') + repo_stars = repo.get('stargazers_count', 0) + repo_language = repo.get('language', '未指定') + + user_info += ( + f"- {repo_name} - ⭐ {repo_stars}, {repo_language}\n" + f" {repo_desc}\n\n" + ) + + formatted.append(user_info) + + return "\n".join(formatted) \ No newline at end of file diff --git a/crazy_functions/paper_fns/auto_git/query_analyzer.py b/crazy_functions/paper_fns/auto_git/query_analyzer.py new file mode 100644 index 00000000..605de715 --- /dev/null +++ b/crazy_functions/paper_fns/auto_git/query_analyzer.py @@ -0,0 +1,356 @@ +from typing import Dict, List +from dataclasses import dataclass +import re + +@dataclass +class SearchCriteria: + """搜索条件""" + query_type: str # 查询类型: repo/code/user/topic + main_topic: str # 主题 + sub_topics: List[str] # 子主题列表 + language: str # 编程语言 + min_stars: int # 最少星标数 + github_params: Dict # GitHub搜索参数 + original_query: str = "" # 原始查询字符串 + repo_id: str = "" # 特定仓库ID或名称 + +class QueryAnalyzer: + """查询分析器""" + + # 响应索引常量 + BASIC_QUERY_INDEX = 0 + GITHUB_QUERY_INDEX = 1 + + def __init__(self): + self.valid_types = { + "repo": ["repository", "project", "library", "framework", "tool"], + "code": ["code", "snippet", "implementation", "function", "class", "algorithm"], + "user": ["user", "developer", "organization", "contributor", "maintainer"], + "topic": ["topic", "category", "tag", "field", "area", "domain"] + } + + def analyze_query(self, query: str, chatbot: List, llm_kwargs: Dict): + """分析查询意图""" + from crazy_functions.crazy_utils import \ + request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency as request_gpt + + # 1. 基本查询分析 + type_prompt = f"""请分析这个与GitHub相关的查询,并严格按照以下XML格式回答: + +查询: {query} + +说明: +1. 
你的回答必须使用下面显示的XML标签,不要有任何标签外的文本
+2. 从以下选项中选择查询类型: repo/code/user/topic
+   - repo: 用于查找仓库、项目、框架或库
+   - code: 用于查找代码片段、函数实现或算法
+   - user: 用于查找用户、开发者或组织
+   - topic: 用于查找主题、类别或领域相关项目
+3. 识别主题和子主题
+4. 识别首选编程语言(如果有)
+5. 确定最低星标数(如果适用)
+
+必需格式:
+<query_type>此处回答</query_type>
+<main_topic>此处回答</main_topic>
+<sub_topics>子主题1, 子主题2, ...</sub_topics>
+<language>此处回答</language>
+<min_stars>此处回答</min_stars>
+
+示例回答:
+
+1. 仓库查询:
+查询: "查找有至少1000颗星的Python web框架"
+<query_type>repo</query_type>
+<main_topic>web框架</main_topic>
+<sub_topics>后端开发, HTTP服务器, ORM</sub_topics>
+<language>Python</language>
+<min_stars>1000</min_stars>
+
+2. 代码查询:
+查询: "如何用JavaScript实现防抖函数"
+<query_type>code</query_type>
+<main_topic>防抖函数</main_topic>
+<sub_topics>事件处理, 性能优化, 函数节流</sub_topics>
+<language>JavaScript</language>
+<min_stars>0</min_stars>"""
+
+        # 2. 生成英文搜索条件
+        github_prompt = f"""Optimize the following GitHub search query:
+
+Query: {query}
+
+Task: Convert the natural language query into an optimized GitHub search query.
+Please use English, regardless of the language of the input query.
+
+Available search fields and filters:
+1. Basic fields:
+   - in:name - Search in repository names
+   - in:description - Search in repository descriptions
+   - in:readme - Search in README files
+   - in:topic - Search in topics
+   - language:X - Filter by programming language
+   - user:X - Repositories from a specific user
+   - org:X - Repositories from a specific organization
+
+2. Code search fields:
+   - extension:X - Filter by file extension
+   - path:X - Filter by path
+   - filename:X - Filter by filename
+
+3. Metric filters:
+   - stars:>X - Has more than X stars
+   - forks:>X - Has more than X forks
+   - size:>X - Size greater than X KB
+   - created:>YYYY-MM-DD - Created after a specific date
+   - pushed:>YYYY-MM-DD - Updated after a specific date
+
+4. Other filters:
+   - is:public/private - Public or private repositories
+   - archived:true/false - Archived or not archived
+   - license:X - Specific license
+   - topic:X - Contains specific topic tag
+
+Examples:
+
+1. Query: "Find Python machine learning libraries with at least 1000 stars"
+<query>machine learning in:description language:python stars:>1000</query>
+
+2. Query: "Recently updated React UI component libraries"
+<query>UI components library in:readme in:description language:javascript topic:react pushed:>2023-01-01</query>
+
+3. Query: "Open source projects developed by Facebook"
+<query>org:facebook is:public</query>
+
+4. Query: "Depth-first search implementation in JavaScript"
+<query>depth first search in:file language:javascript</query>
+
+Please analyze the query and answer using only the <query> XML tag:
+<query>Provide the optimized GitHub search query, using appropriate fields and operators</query>"""
+
+        # 3. 生成中文搜索条件
+        chinese_github_prompt = f"""优化以下GitHub搜索查询:
+
+查询: {query}
+
+任务: 将自然语言查询转换为优化的GitHub搜索查询语句。
+为了搜索中文内容,请提取原始查询的关键词并使用中文形式,同时保留GitHub特定的搜索语法为英文。
+
+可用的搜索字段和过滤器:
+1. 基本字段:
+   - in:name - 在仓库名称中搜索
+   - in:description - 在仓库描述中搜索
+   - in:readme - 在README文件中搜索
+   - in:topic - 在主题中搜索
+   - language:X - 按编程语言筛选
+   - user:X - 特定用户的仓库
+   - org:X - 特定组织的仓库
+
+2. 代码搜索字段:
+   - extension:X - 按文件扩展名筛选
+   - path:X - 按路径筛选
+   - filename:X - 按文件名筛选
+
+3. 指标过滤器:
+   - stars:>X - 有超过X颗星
+   - forks:>X - 有超过X个分支
+   - size:>X - 大小超过X KB
+   - created:>YYYY-MM-DD - 在特定日期后创建
+   - pushed:>YYYY-MM-DD - 在特定日期后更新
+
+4. 其他过滤器:
+   - is:public/private - 公开或私有仓库
+   - archived:true/false - 已归档或未归档
+   - license:X - 特定许可证
+   - topic:X - 含特定主题标签
+
+示例:
+
+1. 查询: "找有关机器学习的Python库,至少1000颗星"
+<query>机器学习 in:description language:python stars:>1000</query>
+
+2. 查询: "最近更新的React UI组件库"
+<query>UI 组件库 in:readme in:description language:javascript topic:react pushed:>2023-01-01</query>
+
+3. 查询: "微信小程序开发框架"
+<query>微信小程序 开发框架 in:name in:description in:readme</query>
+
+请分析查询并仅使用 <query> 标签回答:
+<query>提供优化的GitHub搜索查询,使用适当的字段和运算符,保留中文关键词</query>"""
+
+        try:
+            # 构建提示数组
+            prompts = [
+                type_prompt,
+                github_prompt,
+                chinese_github_prompt,
+            ]
+
+            show_messages = [
+                "分析查询类型...",
+                "优化英文GitHub搜索参数...",
+                "优化中文GitHub搜索参数...",
+            ]
+
+            sys_prompts = [
+                "你是一个精通GitHub生态系统的专家,擅长分析与GitHub相关的查询。",
+                "You are a GitHub search expert, specialized in converting natural language queries into optimized GitHub search queries in English.",
+                "你是一个GitHub搜索专家,擅长处理查询并保留中文关键词进行搜索。",
+            ]
+
+            # 使用同步方式调用LLM
+            responses = yield from request_gpt(
+                inputs_array=prompts,
+                inputs_show_user_array=show_messages,
+                llm_kwargs=llm_kwargs,
+                chatbot=chatbot,
+                history_array=[[] for _ in prompts],
+                sys_prompt_array=sys_prompts,
+                max_workers=3
+            )
+
+            # 从收集的响应中提取我们需要的内容
+            extracted_responses = []
+            for i in range(len(prompts)):
+                if (i * 2 + 1) < len(responses):
+                    response = responses[i * 2 + 1]
+                    if response is None:
+                        raise Exception(f"Response {i} is None")
+                    if not isinstance(response, str):
+                        try:
+                            response = str(response)
+                        except Exception:
+                            raise Exception(f"Cannot convert response {i} to string")
+                    extracted_responses.append(response)
+                else:
+                    raise Exception(f"未收到第 {i + 1} 个响应")
+
+            # 解析基本信息
+            query_type = self._extract_tag(extracted_responses[self.BASIC_QUERY_INDEX], "query_type")
+            if not query_type:
+                print(
+                    f"Debug - Failed to extract query_type. Response was: {extracted_responses[self.BASIC_QUERY_INDEX]}")
+                raise Exception("无法提取query_type标签内容")
+            query_type = query_type.lower()
+
+            main_topic = self._extract_tag(extracted_responses[self.BASIC_QUERY_INDEX], "main_topic")
+            if not main_topic:
+                print("Debug - Failed to extract main_topic. Using query as fallback.")
+                main_topic = query
+
+            query_type = self._normalize_query_type(query_type, query)
+
+            # 提取子主题
+            sub_topics = []
+            sub_topics_text = self._extract_tag(extracted_responses[self.BASIC_QUERY_INDEX], "sub_topics")
+            if sub_topics_text:
+                sub_topics = [topic.strip() for topic in sub_topics_text.split(",")]
+
+            # 提取语言
+            language = self._extract_tag(extracted_responses[self.BASIC_QUERY_INDEX], "language")
+
+            # 提取最低星标数
+            min_stars = 0
+            min_stars_text = self._extract_tag(extracted_responses[self.BASIC_QUERY_INDEX], "min_stars")
+            if min_stars_text and min_stars_text.isdigit():
+                min_stars = int(min_stars_text)
+
+            # 解析GitHub搜索参数 - 英文
+            english_github_query = self._extract_tag(extracted_responses[self.GITHUB_QUERY_INDEX], "query")
+
+            # 解析GitHub搜索参数 - 中文
+            chinese_github_query = self._extract_tag(extracted_responses[2], "query")
+
+            # 构建GitHub参数
+            github_params = {
+                "query": english_github_query,
+                "chinese_query": chinese_github_query,
+                "sort": "stars",  # 默认按星标排序
+                "order": "desc",  # 默认降序
+                "per_page": 30,  # 默认每页30条
+                "page": 1  # 默认第1页
+            }
+
+            # 检查是否为特定仓库查询
+            repo_id = ""
+            if "repo:" in english_github_query or "repository:" in english_github_query:
+                repo_match = re.search(r'(repo|repository):([a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+)', english_github_query)
+                if repo_match:
+                    repo_id = repo_match.group(2)
+
+            print("Debug - 提取的信息:")
+            print(f"查询类型: {query_type}")
+            print(f"主题: {main_topic}")
+            print(f"子主题: {sub_topics}")
+            print(f"语言: {language}")
+            print(f"最低星标数: {min_stars}")
+            print(f"英文GitHub参数: {english_github_query}")
+            print(f"中文GitHub参数: {chinese_github_query}")
+            print(f"特定仓库: {repo_id}")
+
+            # 更新返回的 SearchCriteria,包含中英文查询
+            return SearchCriteria(
+                query_type=query_type,
+                main_topic=main_topic,
+                sub_topics=sub_topics,
+                language=language,
+                min_stars=min_stars,
+                github_params=github_params,
+                original_query=query,
+                repo_id=repo_id
+            )
+
+        except Exception as e:
+            raise Exception(f"分析查询失败: {str(e)}")
+
+    def _normalize_query_type(self, query_type: str, query: str) -> str:
+        """规范化查询类型"""
+        if query_type in ["repo", "code", "user", "topic"]:
+            return query_type
+
+        query_lower = query.lower()
+        for type_name, keywords in self.valid_types.items():
+            for keyword in keywords:
+                if keyword in query_lower:
+                    return type_name
+
+        query_type_lower = query_type.lower()
+        for type_name, keywords in self.valid_types.items():
+            for keyword in keywords:
+                if keyword in query_type_lower:
+                    return type_name
+
+        return "repo"  # 默认返回repo类型
+
+    def _extract_tag(self, text: str, tag: str) -> str:
+        """提取标记内容"""
+        if not text:
+            return ""
+
+        # 标准XML格式(处理多行和特殊字符)
+        pattern = f"<{tag}>(.*?)</{tag}>"
+        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
+        if match:
+            content = match.group(1).strip()
+            if content:
+                return content
+
+        # 备用模式
+        patterns = [
+            rf"<{tag}>\s*([\s\S]*?)\s*</{tag}>",  # 标准XML格式
+            rf"<{tag}>([\s\S]*?)(?:</{tag}>|$)",  # 未闭合的标签
+            rf"\[{tag}\]([\s\S]*?)\[/{tag}\]",  # 方括号格式
+            rf"{tag}:\s*(.*?)(?=\n\w|$)",  # 冒号格式
+            rf"<{tag}>\s*(.*?)(?=<|$)"  # 部分闭合
+        ]
+
+        # 尝试所有模式
+        for pattern in patterns:
+            match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
+            if match:
+                content = match.group(1).strip()
+                if content:  # 确保提取的内容不为空
+                    return content
+
+        # 如果所有模式都失败,返回空字符串
+        return ""
\ No newline at end of file
diff --git a/crazy_functions/paper_fns/auto_git/sources/github_source.py b/crazy_functions/paper_fns/auto_git/sources/github_source.py
new file mode 100644
index 00000000..28cd80a6
--- /dev/null
+++ b/crazy_functions/paper_fns/auto_git/sources/github_source.py
@@ -0,0 +1,701 @@
+import aiohttp
+import asyncio
+import base64
+import json
+import random
+from datetime import datetime
+from typing import List, Dict, Optional, Union, Any
+
+class GitHubSource:
+    """GitHub API实现"""
+
+    # 默认API密钥列表 - 可以放置多个GitHub令牌
+    API_KEYS = [
+        "github_pat_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
+        "github_pat_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
+        # "your_github_token_1",
+        # "your_github_token_2",
+        # "your_github_token_3"
+    ]
+
+    def __init__(self, api_key: Optional[Union[str, List[str]]] = None):
+        """初始化GitHub API客户端
+
+        Args:
+            api_key: GitHub个人访问令牌或令牌列表
+        """
+        if api_key is None:
+            self.api_keys = self.API_KEYS
+        elif isinstance(api_key, str):
+            self.api_keys = [api_key]
+        else:
+            self.api_keys = api_key
+
+        self._initialize()
+
+    def _initialize(self) -> None:
+        """初始化客户端,设置默认参数"""
+        self.base_url = "https://api.github.com"
+        self.headers = {
+            "Accept": "application/vnd.github+json",
+            "X-GitHub-Api-Version": "2022-11-28",
+            "User-Agent": "GitHub-API-Python-Client"
+        }
+
+        # 如果有可用的API密钥,随机选择一个
+        if self.api_keys:
+            selected_key = random.choice(self.api_keys)
+            self.headers["Authorization"] = f"Bearer {selected_key}"
+            print("已随机选择API密钥进行认证")
+        else:
+            print("警告: 未提供API密钥,将受到GitHub API请求限制")
+
+    async def _request(self, method: str, endpoint: str, params: Dict = None, data: Dict = None) -> Any:
+        """发送API请求
+
+        Args:
+            method: HTTP方法 (GET, POST, PUT, DELETE等)
+            endpoint: API端点
+            params: URL参数
+            data: 请求体数据
+
+        Returns:
+            解析后的响应JSON
+        """
+        async with aiohttp.ClientSession(headers=self.headers) as session:
+            url = f"{self.base_url}{endpoint}"
+
+            # 为调试目的打印请求信息
+            print(f"请求: {method} {url}")
+            if params:
+                print(f"参数: {params}")
+
+            # 发送请求
+            request_kwargs = {}
+            if params:
+                request_kwargs["params"] = params
+            if data:
+                request_kwargs["json"] = data
+
+            async with 
session.request(method, url, **request_kwargs) as response: + response_text = await response.text() + + # 检查HTTP状态码 + if response.status >= 400: + print(f"API请求失败: HTTP {response.status}") + print(f"响应内容: {response_text}") + return None + + # 解析JSON响应 + try: + return json.loads(response_text) + except json.JSONDecodeError: + print(f"JSON解析错误: {response_text}") + return None + + # ===== 用户相关方法 ===== + + async def get_user(self, username: Optional[str] = None) -> Dict: + """获取用户信息 + + Args: + username: 指定用户名,不指定则获取当前授权用户 + + Returns: + 用户信息字典 + """ + endpoint = "/user" if username is None else f"/users/{username}" + return await self._request("GET", endpoint) + + async def get_user_repos(self, username: Optional[str] = None, sort: str = "updated", + direction: str = "desc", per_page: int = 30, page: int = 1) -> List[Dict]: + """获取用户的仓库列表 + + Args: + username: 指定用户名,不指定则获取当前授权用户 + sort: 排序方式 (created, updated, pushed, full_name) + direction: 排序方向 (asc, desc) + per_page: 每页结果数量 + page: 页码 + + Returns: + 仓库列表 + """ + endpoint = "/user/repos" if username is None else f"/users/{username}/repos" + params = { + "sort": sort, + "direction": direction, + "per_page": per_page, + "page": page + } + return await self._request("GET", endpoint, params=params) + + async def get_user_starred(self, username: Optional[str] = None, + per_page: int = 30, page: int = 1) -> List[Dict]: + """获取用户星标的仓库 + + Args: + username: 指定用户名,不指定则获取当前授权用户 + per_page: 每页结果数量 + page: 页码 + + Returns: + 星标仓库列表 + """ + endpoint = "/user/starred" if username is None else f"/users/{username}/starred" + params = { + "per_page": per_page, + "page": page + } + return await self._request("GET", endpoint, params=params) + + # ===== 仓库相关方法 ===== + + async def get_repo(self, owner: str, repo: str) -> Dict: + """获取仓库信息 + + Args: + owner: 仓库所有者 + repo: 仓库名 + + Returns: + 仓库信息 + """ + endpoint = f"/repos/{owner}/{repo}" + return await self._request("GET", endpoint) + + async def get_repo_branches(self, owner: str, repo: str, per_page: int = 30, page: int = 1) -> List[Dict]: + """获取仓库的分支列表 + + Args: + owner: 仓库所有者 + repo: 仓库名 + per_page: 每页结果数量 + page: 页码 + + Returns: + 分支列表 + """ + endpoint = f"/repos/{owner}/{repo}/branches" + params = { + "per_page": per_page, + "page": page + } + return await self._request("GET", endpoint, params=params) + + async def get_repo_commits(self, owner: str, repo: str, sha: Optional[str] = None, + path: Optional[str] = None, per_page: int = 30, page: int = 1) -> List[Dict]: + """获取仓库的提交历史 + + Args: + owner: 仓库所有者 + repo: 仓库名 + sha: 特定提交SHA或分支名 + path: 文件路径筛选 + per_page: 每页结果数量 + page: 页码 + + Returns: + 提交列表 + """ + endpoint = f"/repos/{owner}/{repo}/commits" + params = { + "per_page": per_page, + "page": page + } + if sha: + params["sha"] = sha + if path: + params["path"] = path + + return await self._request("GET", endpoint, params=params) + + async def get_commit_details(self, owner: str, repo: str, commit_sha: str) -> Dict: + """获取特定提交的详情 + + Args: + owner: 仓库所有者 + repo: 仓库名 + commit_sha: 提交SHA + + Returns: + 提交详情 + """ + endpoint = f"/repos/{owner}/{repo}/commits/{commit_sha}" + return await self._request("GET", endpoint) + + # ===== 内容相关方法 ===== + + async def get_file_content(self, owner: str, repo: str, path: str, ref: Optional[str] = None) -> Dict: + """获取文件内容 + + Args: + owner: 仓库所有者 + repo: 仓库名 + path: 文件路径 + ref: 分支名、标签名或提交SHA + + Returns: + 文件内容信息 + """ + endpoint = f"/repos/{owner}/{repo}/contents/{path}" + params = {} + if ref: + params["ref"] = ref + + response = await self._request("GET", endpoint, 
params=params) + if response and isinstance(response, dict) and "content" in response: + try: + # 解码Base64编码的文件内容 + content = base64.b64decode(response["content"].encode()).decode() + response["decoded_content"] = content + except Exception as e: + print(f"解码文件内容时出错: {str(e)}") + + return response + + async def get_directory_content(self, owner: str, repo: str, path: str, ref: Optional[str] = None) -> List[Dict]: + """获取目录内容 + + Args: + owner: 仓库所有者 + repo: 仓库名 + path: 目录路径 + ref: 分支名、标签名或提交SHA + + Returns: + 目录内容列表 + """ + # 注意:此方法与get_file_content使用相同的端点,但对于目录会返回列表 + endpoint = f"/repos/{owner}/{repo}/contents/{path}" + params = {} + if ref: + params["ref"] = ref + + return await self._request("GET", endpoint, params=params) + + # ===== Issues相关方法 ===== + + async def get_issues(self, owner: str, repo: str, state: str = "open", + sort: str = "created", direction: str = "desc", + per_page: int = 30, page: int = 1) -> List[Dict]: + """获取仓库的Issues列表 + + Args: + owner: 仓库所有者 + repo: 仓库名 + state: Issue状态 (open, closed, all) + sort: 排序方式 (created, updated, comments) + direction: 排序方向 (asc, desc) + per_page: 每页结果数量 + page: 页码 + + Returns: + Issues列表 + """ + endpoint = f"/repos/{owner}/{repo}/issues" + params = { + "state": state, + "sort": sort, + "direction": direction, + "per_page": per_page, + "page": page + } + return await self._request("GET", endpoint, params=params) + + async def get_issue(self, owner: str, repo: str, issue_number: int) -> Dict: + """获取特定Issue的详情 + + Args: + owner: 仓库所有者 + repo: 仓库名 + issue_number: Issue编号 + + Returns: + Issue详情 + """ + endpoint = f"/repos/{owner}/{repo}/issues/{issue_number}" + return await self._request("GET", endpoint) + + async def get_issue_comments(self, owner: str, repo: str, issue_number: int) -> List[Dict]: + """获取Issue的评论 + + Args: + owner: 仓库所有者 + repo: 仓库名 + issue_number: Issue编号 + + Returns: + 评论列表 + """ + endpoint = f"/repos/{owner}/{repo}/issues/{issue_number}/comments" + return await self._request("GET", endpoint) + + # ===== Pull Requests相关方法 ===== + + async def get_pull_requests(self, owner: str, repo: str, state: str = "open", + sort: str = "created", direction: str = "desc", + per_page: int = 30, page: int = 1) -> List[Dict]: + """获取仓库的Pull Request列表 + + Args: + owner: 仓库所有者 + repo: 仓库名 + state: PR状态 (open, closed, all) + sort: 排序方式 (created, updated, popularity, long-running) + direction: 排序方向 (asc, desc) + per_page: 每页结果数量 + page: 页码 + + Returns: + Pull Request列表 + """ + endpoint = f"/repos/{owner}/{repo}/pulls" + params = { + "state": state, + "sort": sort, + "direction": direction, + "per_page": per_page, + "page": page + } + return await self._request("GET", endpoint, params=params) + + async def get_pull_request(self, owner: str, repo: str, pr_number: int) -> Dict: + """获取特定Pull Request的详情 + + Args: + owner: 仓库所有者 + repo: 仓库名 + pr_number: Pull Request编号 + + Returns: + Pull Request详情 + """ + endpoint = f"/repos/{owner}/{repo}/pulls/{pr_number}" + return await self._request("GET", endpoint) + + async def get_pull_request_files(self, owner: str, repo: str, pr_number: int) -> List[Dict]: + """获取Pull Request中修改的文件 + + Args: + owner: 仓库所有者 + repo: 仓库名 + pr_number: Pull Request编号 + + Returns: + 修改文件列表 + """ + endpoint = f"/repos/{owner}/{repo}/pulls/{pr_number}/files" + return await self._request("GET", endpoint) + + # ===== 搜索相关方法 ===== + + async def search_repositories(self, query: str, sort: str = "stars", + order: str = "desc", per_page: int = 30, page: int = 1) -> Dict: + """搜索仓库 + + Args: + query: 搜索关键词 + sort: 排序方式 (stars, forks, 
updated) + order: 排序顺序 (asc, desc) + per_page: 每页结果数量 + page: 页码 + + Returns: + 搜索结果 + """ + endpoint = "/search/repositories" + params = { + "q": query, + "sort": sort, + "order": order, + "per_page": per_page, + "page": page + } + return await self._request("GET", endpoint, params=params) + + async def search_code(self, query: str, sort: str = "indexed", + order: str = "desc", per_page: int = 30, page: int = 1) -> Dict: + """搜索代码 + + Args: + query: 搜索关键词 + sort: 排序方式 (indexed) + order: 排序顺序 (asc, desc) + per_page: 每页结果数量 + page: 页码 + + Returns: + 搜索结果 + """ + endpoint = "/search/code" + params = { + "q": query, + "sort": sort, + "order": order, + "per_page": per_page, + "page": page + } + return await self._request("GET", endpoint, params=params) + + async def search_issues(self, query: str, sort: str = "created", + order: str = "desc", per_page: int = 30, page: int = 1) -> Dict: + """搜索Issues和Pull Requests + + Args: + query: 搜索关键词 + sort: 排序方式 (created, updated, comments) + order: 排序顺序 (asc, desc) + per_page: 每页结果数量 + page: 页码 + + Returns: + 搜索结果 + """ + endpoint = "/search/issues" + params = { + "q": query, + "sort": sort, + "order": order, + "per_page": per_page, + "page": page + } + return await self._request("GET", endpoint, params=params) + + async def search_users(self, query: str, sort: str = "followers", + order: str = "desc", per_page: int = 30, page: int = 1) -> Dict: + """搜索用户 + + Args: + query: 搜索关键词 + sort: 排序方式 (followers, repositories, joined) + order: 排序顺序 (asc, desc) + per_page: 每页结果数量 + page: 页码 + + Returns: + 搜索结果 + """ + endpoint = "/search/users" + params = { + "q": query, + "sort": sort, + "order": order, + "per_page": per_page, + "page": page + } + return await self._request("GET", endpoint, params=params) + + # ===== 组织相关方法 ===== + + async def get_organization(self, org: str) -> Dict: + """获取组织信息 + + Args: + org: 组织名称 + + Returns: + 组织信息 + """ + endpoint = f"/orgs/{org}" + return await self._request("GET", endpoint) + + async def get_organization_repos(self, org: str, type: str = "all", + sort: str = "created", direction: str = "desc", + per_page: int = 30, page: int = 1) -> List[Dict]: + """获取组织的仓库列表 + + Args: + org: 组织名称 + type: 仓库类型 (all, public, private, forks, sources, member, internal) + sort: 排序方式 (created, updated, pushed, full_name) + direction: 排序方向 (asc, desc) + per_page: 每页结果数量 + page: 页码 + + Returns: + 仓库列表 + """ + endpoint = f"/orgs/{org}/repos" + params = { + "type": type, + "sort": sort, + "direction": direction, + "per_page": per_page, + "page": page + } + return await self._request("GET", endpoint, params=params) + + async def get_organization_members(self, org: str, per_page: int = 30, page: int = 1) -> List[Dict]: + """获取组织成员列表 + + Args: + org: 组织名称 + per_page: 每页结果数量 + page: 页码 + + Returns: + 成员列表 + """ + endpoint = f"/orgs/{org}/members" + params = { + "per_page": per_page, + "page": page + } + return await self._request("GET", endpoint, params=params) + + # ===== 更复杂的操作 ===== + + async def get_repository_languages(self, owner: str, repo: str) -> Dict: + """获取仓库使用的编程语言及其比例 + + Args: + owner: 仓库所有者 + repo: 仓库名 + + Returns: + 语言使用情况 + """ + endpoint = f"/repos/{owner}/{repo}/languages" + return await self._request("GET", endpoint) + + async def get_repository_stats_contributors(self, owner: str, repo: str) -> List[Dict]: + """获取仓库的贡献者统计 + + Args: + owner: 仓库所有者 + repo: 仓库名 + + Returns: + 贡献者统计信息 + """ + endpoint = f"/repos/{owner}/{repo}/stats/contributors" + return await self._request("GET", endpoint) + + async def 
get_repository_stats_commit_activity(self, owner: str, repo: str) -> List[Dict]: + """获取仓库的提交活动 + + Args: + owner: 仓库所有者 + repo: 仓库名 + + Returns: + 提交活动统计 + """ + endpoint = f"/repos/{owner}/{repo}/stats/commit_activity" + return await self._request("GET", endpoint) + +async def example_usage(): + """GitHubSource使用示例""" + # 创建客户端实例(可选传入API令牌) + # github = GitHubSource(api_key="your_github_token") + github = GitHubSource() + + try: + # 示例1:搜索热门Python仓库 + print("\n=== 示例1:搜索热门Python仓库 ===") + repos = await github.search_repositories( + query="language:python stars:>1000", + sort="stars", + order="desc", + per_page=5 + ) + + if repos and "items" in repos: + for i, repo in enumerate(repos["items"], 1): + print(f"\n--- 仓库 {i} ---") + print(f"名称: {repo['full_name']}") + print(f"描述: {repo['description']}") + print(f"星标数: {repo['stargazers_count']}") + print(f"Fork数: {repo['forks_count']}") + print(f"最近更新: {repo['updated_at']}") + print(f"URL: {repo['html_url']}") + + # 示例2:获取特定仓库的详情 + print("\n=== 示例2:获取特定仓库的详情 ===") + repo_details = await github.get_repo("microsoft", "vscode") + if repo_details: + print(f"名称: {repo_details['full_name']}") + print(f"描述: {repo_details['description']}") + print(f"星标数: {repo_details['stargazers_count']}") + print(f"Fork数: {repo_details['forks_count']}") + print(f"默认分支: {repo_details['default_branch']}") + print(f"开源许可: {repo_details.get('license', {}).get('name', '无')}") + print(f"语言: {repo_details['language']}") + print(f"Open Issues数: {repo_details['open_issues_count']}") + + # 示例3:获取仓库的提交历史 + print("\n=== 示例3:获取仓库的最近提交 ===") + commits = await github.get_repo_commits("tensorflow", "tensorflow", per_page=5) + if commits: + for i, commit in enumerate(commits, 1): + print(f"\n--- 提交 {i} ---") + print(f"SHA: {commit['sha'][:7]}") + print(f"作者: {commit['commit']['author']['name']}") + print(f"日期: {commit['commit']['author']['date']}") + print(f"消息: {commit['commit']['message'].splitlines()[0]}") + + # 示例4:搜索代码 + print("\n=== 示例4:搜索代码 ===") + code_results = await github.search_code( + query="filename:README.md language:markdown pytorch in:file", + per_page=3 + ) + if code_results and "items" in code_results: + print(f"共找到: {code_results['total_count']} 个结果") + for i, item in enumerate(code_results["items"], 1): + print(f"\n--- 代码 {i} ---") + print(f"仓库: {item['repository']['full_name']}") + print(f"文件: {item['path']}") + print(f"URL: {item['html_url']}") + + # 示例5:获取文件内容 + print("\n=== 示例5:获取文件内容 ===") + file_content = await github.get_file_content("python", "cpython", "README.rst") + if file_content and "decoded_content" in file_content: + content = file_content["decoded_content"] + print(f"文件名: {file_content['name']}") + print(f"大小: {file_content['size']} 字节") + print(f"内容预览: {content[:200]}...") + + # 示例6:获取仓库使用的编程语言 + print("\n=== 示例6:获取仓库使用的编程语言 ===") + languages = await github.get_repository_languages("facebook", "react") + if languages: + print(f"React仓库使用的编程语言:") + for lang, bytes_of_code in languages.items(): + print(f"- {lang}: {bytes_of_code} 字节") + + # 示例7:获取组织信息 + print("\n=== 示例7:获取组织信息 ===") + org_info = await github.get_organization("google") + if org_info: + print(f"名称: {org_info['name']}") + print(f"描述: {org_info.get('description', '无')}") + print(f"位置: {org_info.get('location', '未指定')}") + print(f"公共仓库数: {org_info['public_repos']}") + print(f"成员数: {org_info.get('public_members', 0)}") + print(f"URL: {org_info['html_url']}") + + # 示例8:获取用户信息 + print("\n=== 示例8:获取用户信息 ===") + user_info = await github.get_user("torvalds") + if user_info: + print(f"名称: 
{user_info['name']}") + print(f"公司: {user_info.get('company', '无')}") + print(f"博客: {user_info.get('blog', '无')}") + print(f"位置: {user_info.get('location', '未指定')}") + print(f"公共仓库数: {user_info['public_repos']}") + print(f"关注者数: {user_info['followers']}") + print(f"URL: {user_info['html_url']}") + + except Exception as e: + print(f"发生错误: {str(e)}") + import traceback + print(traceback.format_exc()) + +if __name__ == "__main__": + import asyncio + + # 运行示例 + asyncio.run(example_usage()) \ No newline at end of file diff --git a/crazy_functions/paper_fns/document_structure_extractor.py b/crazy_functions/paper_fns/document_structure_extractor.py new file mode 100644 index 00000000..18334106 --- /dev/null +++ b/crazy_functions/paper_fns/document_structure_extractor.py @@ -0,0 +1,593 @@ +from typing import List, Dict, Optional, Tuple, Union, Any +from dataclasses import dataclass, field +import os +import re +import logging + +from crazy_functions.doc_fns.read_fns.unstructured_all.paper_structure_extractor import ( + PaperStructureExtractor, PaperSection, StructuredPaper +) +from unstructured.partition.auto import partition +from unstructured.documents.elements import ( + Text, Title, NarrativeText, ListItem, Table, + Footer, Header, PageBreak, Image, Address +) + +@dataclass +class DocumentSection: + """通用文档章节数据类""" + title: str # 章节标题,如果没有标题则为空字符串 + content: str # 章节内容 + level: int = 0 # 标题级别,0为主标题,1为一级标题,以此类推 + section_type: str = "content" # 章节类型 + is_heading_only: bool = False # 是否仅包含标题 + subsections: List['DocumentSection'] = field(default_factory=list) # 子章节列表 + + +@dataclass +class StructuredDocument: + """结构化文档数据类""" + title: str = "" # 文档标题 + metadata: Dict[str, Any] = field(default_factory=dict) # 元数据 + sections: List[DocumentSection] = field(default_factory=list) # 章节列表 + full_text: str = "" # 完整文本 + is_paper: bool = False # 是否为学术论文 + + +class GenericDocumentStructureExtractor: + """通用文档结构提取器 + + 可以从各种文档格式中提取结构信息,包括标题和内容。 + 支持论文、报告、文章和一般文本文档。 + """ + + # 支持的文件扩展名 + SUPPORTED_EXTENSIONS = [ + '.pdf', '.docx', '.doc', '.pptx', '.ppt', + '.txt', '.md', '.html', '.htm', '.xml', + '.rtf', '.odt', '.epub', '.msg', '.eml' + ] + + # 常见的标题前缀模式 + HEADING_PATTERNS = [ + # 数字标题 (1., 1.1., etc.) + r'^\s*(\d+\.)+\s+', + # 中文数字标题 (一、, 二、, etc.) + r'^\s*[一二三四五六七八九十]+[、::]\s+', + # 带括号的数字标题 ((1), (2), etc.) + r'^\s*\(\s*\d+\s*\)\s+', + # 特定标记的标题 (Chapter 1, Section 1, etc.) + r'^\s*(chapter|section|part|附录|章|节)\s+\d+[\.::]\s+', + ] + + # 常见的文档分段标记词 + SECTION_MARKERS = { + 'introduction': ['简介', '导言', '引言', 'introduction', '概述', 'overview'], + 'background': ['背景', '现状', 'background', '理论基础', '相关工作'], + 'main_content': ['主要内容', '正文', 'main content', '分析', '讨论'], + 'conclusion': ['结论', '总结', 'conclusion', '结语', '小结', 'summary'], + 'reference': ['参考', '参考文献', 'references', '文献', 'bibliography'], + 'appendix': ['附录', 'appendix', '补充资料', 'supplementary'] + } + + def __init__(self): + """初始化提取器""" + self.paper_extractor = PaperStructureExtractor() # 论文专用提取器 + self._setup_logging() + + def _setup_logging(self): + """配置日志""" + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + self.logger = logging.getLogger(__name__) + + def extract_document_structure(self, file_path: str, strategy: str = "fast") -> StructuredDocument: + """提取文档结构 + + Args: + file_path: 文件路径 + strategy: 提取策略 ("fast" 或 "accurate") + + Returns: + StructuredDocument: 结构化文档对象 + """ + try: + self.logger.info(f"正在处理文档结构: {file_path}") + + # 1. 
首先尝试使用论文提取器 + try: + paper_result = self.paper_extractor.extract_paper_structure(file_path) + if paper_result and len(paper_result.sections) > 2: # 如果成功识别为论文结构 + self.logger.info(f"成功识别为学术论文: {file_path}") + # 将论文结构转换为通用文档结构 + return self._convert_paper_to_document(paper_result) + except Exception as e: + self.logger.debug(f"论文结构提取失败,将尝试通用提取: {str(e)}") + + # 2. 使用通用方法提取文档结构 + elements = partition( + str(file_path), + strategy=strategy, + include_metadata=True, + nlp=False + ) + + # 3. 使用通用提取器处理 + doc = self._extract_generic_structure(elements) + return doc + + except Exception as e: + self.logger.error(f"文档结构提取失败: {str(e)}") + # 返回一个空的结构化文档 + return StructuredDocument( + title="未能提取文档标题", + sections=[DocumentSection( + title="", + content="", + level=0, + section_type="content" + )] + ) + + def _convert_paper_to_document(self, paper: StructuredPaper) -> StructuredDocument: + """将论文结构转换为通用文档结构 + + Args: + paper: 结构化论文对象 + + Returns: + StructuredDocument: 转换后的通用文档结构 + """ + doc = StructuredDocument( + title=paper.metadata.title, + is_paper=True, + full_text=paper.full_text + ) + + # 转换元数据 + doc.metadata = { + 'title': paper.metadata.title, + 'authors': paper.metadata.authors, + 'keywords': paper.keywords, + 'abstract': paper.metadata.abstract if hasattr(paper.metadata, 'abstract') else "", + 'is_paper': True + } + + # 转换章节结构 + doc.sections = self._convert_paper_sections(paper.sections) + + return doc + + def _convert_paper_sections(self, paper_sections: List[PaperSection], level: int = 0) -> List[DocumentSection]: + """递归转换论文章节为通用文档章节 + + Args: + paper_sections: 论文章节列表 + level: 当前章节级别 + + Returns: + List[DocumentSection]: 通用文档章节列表 + """ + doc_sections = [] + + for section in paper_sections: + doc_section = DocumentSection( + title=section.title, + content=section.content, + level=section.level, + section_type=section.section_type, + is_heading_only=False if section.content else True + ) + + # 递归处理子章节 + if section.subsections: + doc_section.subsections = self._convert_paper_sections( + section.subsections, level + 1 + ) + + doc_sections.append(doc_section) + + return doc_sections + + def _extract_generic_structure(self, elements) -> StructuredDocument: + """从元素列表中提取通用文档结构 + + Args: + elements: 文档元素列表 + + Returns: + StructuredDocument: 结构化文档对象 + """ + # 创建结构化文档对象 + doc = StructuredDocument(full_text="") + + # 1. 提取文档标题 + title_candidates = [] + for i, element in enumerate(elements[:5]): # 只检查前5个元素 + if isinstance(element, Title): + title_text = str(element).strip() + title_candidates.append((i, title_text)) + + if title_candidates: + # 使用第一个标题作为文档标题 + doc.title = title_candidates[0][1] + + # 2. 
识别所有标题元素和内容 + title_elements = [] + + # 2.1 首先识别所有标题 + for i, element in enumerate(elements): + is_heading = False + title_text = "" + level = 0 + + # 检查元素类型 + if isinstance(element, Title): + is_heading = True + title_text = str(element).strip() + + # 进一步检查是否为真正的标题 + if self._is_likely_heading(title_text, element, i, elements): + level = self._estimate_heading_level(title_text, element) + else: + is_heading = False + + # 也检查格式像标题的普通文本 + elif isinstance(element, (Text, NarrativeText)) and i > 0: + text = str(element).strip() + # 检查是否匹配标题模式 + if any(re.match(pattern, text) for pattern in self.HEADING_PATTERNS): + # 检查长度和后续内容以确认是否为标题 + if len(text) < 100 and self._has_sufficient_following_content(i, elements): + is_heading = True + title_text = text + level = self._estimate_heading_level(title_text, element) + + if is_heading: + section_type = self._identify_section_type(title_text) + title_elements.append((i, title_text, level, section_type)) + + # 2.2 为每个标题提取内容 + sections = [] + + for i, (index, title_text, level, section_type) in enumerate(title_elements): + # 确定内容范围 + content_start = index + 1 + content_end = elements[-1] # 默认到文档结束 + + # 如果有下一个标题,内容到下一个标题开始 + if i < len(title_elements) - 1: + content_end = title_elements[i+1][0] + else: + content_end = len(elements) + + # 提取内容 + content = self._extract_content_between(elements, content_start, content_end) + + # 创建章节 + section = DocumentSection( + title=title_text, + content=content, + level=level, + section_type=section_type, + is_heading_only=False if content.strip() else True + ) + + sections.append(section) + + # 3. 如果没有识别到任何章节,创建一个默认章节 + if not sections: + all_content = self._extract_content_between(elements, 0, len(elements)) + + # 尝试从内容中提取标题 + first_line = all_content.split('\n')[0] if all_content else "" + if first_line and len(first_line) < 100: + doc.title = first_line + all_content = '\n'.join(all_content.split('\n')[1:]) + + default_section = DocumentSection( + title="", + content=all_content, + level=0, + section_type="content" + ) + sections.append(default_section) + + # 4. 构建层次结构 + doc.sections = self._build_section_hierarchy(sections) + + # 5. 提取完整文本 + doc.full_text = "\n\n".join([str(element) for element in elements if isinstance(element, (Text, NarrativeText, Title, ListItem))]) + + return doc + + def _build_section_hierarchy(self, sections: List[DocumentSection]) -> List[DocumentSection]: + """构建章节层次结构 + + Args: + sections: 章节列表 + + Returns: + List[DocumentSection]: 具有层次结构的章节列表 + """ + if not sections: + return [] + + # 按层级排序 + top_level_sections = [] + current_parents = {0: None} # 每个层级的当前父节点 + + for section in sections: + # 找到当前节点的父节点 + parent_level = None + for level in sorted([k for k in current_parents.keys() if k < section.level], reverse=True): + parent_level = level + break + + if parent_level is None: + # 顶级章节 + top_level_sections.append(section) + else: + # 子章节 + parent = current_parents[parent_level] + if parent: + parent.subsections.append(section) + else: + top_level_sections.append(section) + + # 更新当前层级的父节点 + current_parents[section.level] = section + + # 清除所有更深层级的父节点缓存 + deeper_levels = [k for k in current_parents.keys() if k > section.level] + for level in deeper_levels: + current_parents.pop(level, None) + + return top_level_sections + + def _is_likely_heading(self, text: str, element, index: int, elements) -> bool: + """判断文本是否可能是标题 + + Args: + text: 文本内容 + element: 元素对象 + index: 元素索引 + elements: 所有元素列表 + + Returns: + bool: 是否可能是标题 + """ + # 1. 
检查文本长度 - 标题通常不会太长 + if len(text) > 150: # 标题通常不超过150个字符 + return False + + # 2. 检查是否匹配标题的数字编号模式 + if any(re.match(pattern, text) for pattern in self.HEADING_PATTERNS): + return True + + # 3. 检查是否包含常见章节标记词 + lower_text = text.lower() + for markers in self.SECTION_MARKERS.values(): + if any(marker.lower() in lower_text for marker in markers): + return True + + # 4. 检查后续内容数量 - 标题后通常有足够多的内容 + if not self._has_sufficient_following_content(index, elements, min_chars=100): + # 但如果文本很短且以特定格式开头,仍可能是标题 + if len(text) < 50 and (text.endswith(':') or text.endswith(':')): + return True + return False + + # 5. 检查格式特征 + # 标题通常是元素的开头,不在段落中间 + if len(text.split('\n')) > 1: + # 多行文本不太可能是标题 + return False + + # 如果有元数据,检查字体特征(字体大小等) + if hasattr(element, 'metadata') and element.metadata: + try: + font_size = getattr(element.metadata, 'font_size', None) + is_bold = getattr(element.metadata, 'is_bold', False) + + # 字体较大或加粗的文本更可能是标题 + if font_size and font_size > 12: + return True + if is_bold: + return True + except (AttributeError, TypeError): + pass + + # 默认返回True,因为元素已被识别为Title类型 + return True + + def _estimate_heading_level(self, text: str, element) -> int: + """估计标题的层级 + + Args: + text: 标题文本 + element: 元素对象 + + Returns: + int: 标题层级 (0为主标题,1为一级标题, 等等) + """ + # 1. 通过编号模式判断层级 + for pattern, level in [ + (r'^\s*\d+\.\s+', 1), # 1. 开头 (一级标题) + (r'^\s*\d+\.\d+\.\s+', 2), # 1.1. 开头 (二级标题) + (r'^\s*\d+\.\d+\.\d+\.\s+', 3), # 1.1.1. 开头 (三级标题) + (r'^\s*\d+\.\d+\.\d+\.\d+\.\s+', 4), # 1.1.1.1. 开头 (四级标题) + ]: + if re.match(pattern, text): + return level + + # 2. 检查是否是常见的主要章节标题 + lower_text = text.lower() + main_sections = [ + 'abstract', 'introduction', 'background', 'methodology', + 'results', 'discussion', 'conclusion', 'references' + ] + for section in main_sections: + if section in lower_text: + return 1 # 主要章节为一级标题 + + # 3. 根据文本特征判断 + if text.isupper(): # 全大写文本可能是章标题 + return 1 + + # 4. 
通过元数据判断层级 + if hasattr(element, 'metadata') and element.metadata: + try: + # 根据字体大小判断层级 + font_size = getattr(element.metadata, 'font_size', None) + if font_size is not None: + if font_size > 18: # 假设主标题字体最大 + return 0 + elif font_size > 16: + return 1 + elif font_size > 14: + return 2 + else: + return 3 + except (AttributeError, TypeError): + pass + + # 默认为二级标题 + return 2 + + def _identify_section_type(self, title_text: str) -> str: + """识别章节类型,包括参考文献部分""" + lower_text = title_text.lower() + + # 特别检查是否为参考文献部分 + references_patterns = [ + r'references', r'参考文献', r'bibliography', r'引用文献', + r'literature cited', r'^cited\s+literature', r'^文献$', r'^引用$' + ] + + for pattern in references_patterns: + if re.search(pattern, lower_text, re.IGNORECASE): + return "references" + + # 检查是否匹配其他常见章节类型 + for section_type, markers in self.SECTION_MARKERS.items(): + if any(marker.lower() in lower_text for marker in markers): + return section_type + + # 检查带编号的章节 + if re.match(r'^\d+\.', lower_text): + return "content" + + # 默认为内容章节 + return "content" + + def _has_sufficient_following_content(self, index: int, elements, min_chars: int = 150) -> bool: + """检查元素后是否有足够的内容 + + Args: + index: 当前元素索引 + elements: 所有元素列表 + min_chars: 最小字符数要求 + + Returns: + bool: 是否有足够的内容 + """ + total_chars = 0 + for i in range(index + 1, min(index + 5, len(elements))): + if isinstance(elements[i], Title): + # 如果紧接着是标题,就停止检查 + break + if isinstance(elements[i], (Text, NarrativeText, ListItem, Table)): + total_chars += len(str(elements[i])) + if total_chars >= min_chars: + return True + + return total_chars >= min_chars + + def _extract_content_between(self, elements, start_index: int, end_index: int) -> str: + """提取指定范围内的内容文本 + + Args: + elements: 元素列表 + start_index: 开始索引 + end_index: 结束索引 + + Returns: + str: 提取的内容文本 + """ + content_parts = [] + + for i in range(start_index, end_index): + if isinstance(elements[i], (Text, NarrativeText, ListItem, Table)): + content_parts.append(str(elements[i]).strip()) + + return "\n\n".join([part for part in content_parts if part]) + + def generate_markdown(self, doc: StructuredDocument) -> str: + """将结构化文档转换为Markdown格式 + + Args: + doc: 结构化文档对象 + + Returns: + str: Markdown格式文本 + """ + md_parts = [] + + # 添加标题 + if doc.title: + md_parts.append(f"# {doc.title}\n") + + # 添加元数据 + if doc.is_paper: + # 作者信息 + if 'authors' in doc.metadata and doc.metadata['authors']: + authors_str = ", ".join(doc.metadata['authors']) + md_parts.append(f"**作者:** {authors_str}\n") + + # 关键词 + if 'keywords' in doc.metadata and doc.metadata['keywords']: + keywords_str = ", ".join(doc.metadata['keywords']) + md_parts.append(f"**关键词:** {keywords_str}\n") + + # 摘要 + if 'abstract' in doc.metadata and doc.metadata['abstract']: + md_parts.append(f"## 摘要\n\n{doc.metadata['abstract']}\n") + + # 添加章节内容 + md_parts.append(self._format_sections_markdown(doc.sections)) + + return "\n".join(md_parts) + + def _format_sections_markdown(self, sections: List[DocumentSection], base_level: int = 0) -> str: + """递归格式化章节为Markdown + + Args: + sections: 章节列表 + base_level: 基础层级 + + Returns: + str: Markdown格式文本 + """ + md_parts = [] + + for section in sections: + # 计算标题级别 (确保不超过6级) + header_level = min(section.level + base_level + 1, 6) + + # 添加标题和内容 + if section.title: + md_parts.append(f"{'#' * header_level} {section.title}\n") + + if section.content: + md_parts.append(f"{section.content}\n") + + # 递归处理子章节 + if section.subsections: + md_parts.append(self._format_sections_markdown( + section.subsections, base_level + )) + + return "\n".join(md_parts) 
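End to end, the extractor first tries the paper-specific parser, falls back to generic heading heuristics, reassembles a section tree, and can render it back out as markdown. A usage sketch under the import path shown in this diff header (`report.docx` is a placeholder file, and the `unstructured` dependencies are assumed to be installed):

```python
from crazy_functions.paper_fns.document_structure_extractor import GenericDocumentStructureExtractor

extractor = GenericDocumentStructureExtractor()
doc = extractor.extract_document_structure("report.docx", strategy="fast")

print(doc.title or "(no title detected)")
for section in doc.sections:                 # top-level sections; children sit in .subsections
    indent = "  " * section.level
    print(f"{indent}{section.title or '(untitled)'} [{section.section_type}]")

markdown = extractor.generate_markdown(doc)  # heading depth is capped at h6 during rendering
```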
\ No newline at end of file diff --git a/crazy_functions/paper_fns/file2file_doc/__init__.py b/crazy_functions/paper_fns/file2file_doc/__init__.py new file mode 100644 index 00000000..7992185e --- /dev/null +++ b/crazy_functions/paper_fns/file2file_doc/__init__.py @@ -0,0 +1,4 @@ +from .txt_doc import TxtFormatter +from .markdown_doc import MarkdownFormatter +from .html_doc import HtmlFormatter +from .word_doc import WordFormatter \ No newline at end of file diff --git a/crazy_functions/paper_fns/file2file_doc/html_doc.py b/crazy_functions/paper_fns/file2file_doc/html_doc.py new file mode 100644 index 00000000..9ff14799 --- /dev/null +++ b/crazy_functions/paper_fns/file2file_doc/html_doc.py @@ -0,0 +1,300 @@ +class HtmlFormatter: + """HTML格式文档生成器 - 保留原始文档结构""" + + def __init__(self, processing_type="文本处理"): + self.processing_type = processing_type + self.css_styles = """ + :root { + --primary-color: #2563eb; + --primary-light: #eff6ff; + --secondary-color: #1e293b; + --background-color: #f8fafc; + --text-color: #334155; + --border-color: #e2e8f0; + --card-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1); + } + + body { + font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; + line-height: 1.8; + margin: 0; + padding: 2rem; + color: var(--text-color); + background-color: var(--background-color); + } + + .container { + max-width: 1200px; + margin: 0 auto; + background: white; + padding: 2rem; + border-radius: 16px; + box-shadow: var(--card-shadow); + } + ::selection { + background: var(--primary-light); + color: var(--primary-color); + } + @keyframes fadeIn { + from { opacity: 0; transform: translateY(20px); } + to { opacity: 1; transform: translateY(0); } + } + + .container { + animation: fadeIn 0.6s ease-out; + } + + .document-title { + color: var(--primary-color); + font-size: 2em; + text-align: center; + margin: 1rem 0 2rem; + padding-bottom: 1rem; + border-bottom: 2px solid var(--primary-color); + } + + .document-body { + display: flex; + flex-direction: column; + gap: 1.5rem; + margin: 2rem 0; + } + + .document-header { + display: flex; + flex-direction: column; + align-items: center; + margin-bottom: 2rem; + } + + .processing-type { + color: var(--secondary-color); + font-size: 1.2em; + margin: 0.5rem 0; + } + + .processing-date { + color: var(--text-color); + font-size: 0.9em; + opacity: 0.8; + } + + .document-content { + background: white; + padding: 1.5rem; + border-radius: 8px; + border-left: 4px solid var(--primary-color); + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); + } + + /* 保留文档结构的样式 */ + h1, h2, h3, h4, h5, h6 { + color: var(--secondary-color); + margin-top: 1.5em; + margin-bottom: 0.5em; + } + + h1 { font-size: 1.8em; } + h2 { font-size: 1.5em; } + h3 { font-size: 1.3em; } + h4 { font-size: 1.1em; } + + p { + margin: 0.8em 0; + } + + ul, ol { + margin: 1em 0; + padding-left: 2em; + } + + li { + margin: 0.5em 0; + } + + blockquote { + margin: 1em 0; + padding: 0.5em 1em; + border-left: 4px solid var(--primary-light); + background: rgba(0,0,0,0.02); + } + + code { + font-family: monospace; + background: rgba(0,0,0,0.05); + padding: 0.2em 0.4em; + border-radius: 3px; + } + + pre { + background: rgba(0,0,0,0.05); + padding: 1em; + border-radius: 5px; + overflow-x: auto; + } + + pre code { + background: transparent; + padding: 0; + } + + @media (prefers-color-scheme: dark) { + :root { + --background-color: #0f172a; + --text-color: #e2e8f0; + --border-color: #1e293b; + } + + .container, .document-content { + background: 
#1e293b;
+        }
+
+        blockquote {
+            background: rgba(255,255,255,0.05);
+        }
+
+        code, pre {
+            background: rgba(255,255,255,0.05);
+        }
+    }
+    """
+
+    def _escape_html(self, text):
+        """转义HTML特殊字符"""
+        import html
+        return html.escape(text)
+
+    def _markdown_to_html(self, text):
+        """将Markdown格式转换为HTML格式,保留文档结构"""
+        try:
+            import markdown
+            # 使用Python-Markdown库将markdown转换为HTML,启用更多扩展以支持嵌套列表
+            return markdown.markdown(text, extensions=['tables', 'fenced_code', 'codehilite', 'nl2br', 'sane_lists', 'smarty', 'extra'])
+        except ImportError:
+            # 如果没有markdown库,使用更复杂的替换来处理嵌套列表
+            import re
+
+            # 替换标题
+            text = re.sub(r'^# (.+)$', r'<h1>\1</h1>', text, flags=re.MULTILINE)
+            text = re.sub(r'^## (.+)$', r'<h2>\1</h2>', text, flags=re.MULTILINE)
+            text = re.sub(r'^### (.+)$', r'<h3>\1</h3>', text, flags=re.MULTILINE)
+
+            # 预处理列表 - 在列表项之间添加空行以正确分隔
+            # 处理编号列表
+            text = re.sub(r'(\n\d+\.\s.+)(\n\d+\.\s)', r'\1\n\2', text)
+            # 处理项目符号列表
+            text = re.sub(r'(\n•\s.+)(\n•\s)', r'\1\n\2', text)
+            text = re.sub(r'(\n\*\s.+)(\n\*\s)', r'\1\n\2', text)
+            text = re.sub(r'(\n-\s.+)(\n-\s)', r'\1\n\2', text)
+
+            # 处理嵌套列表 - 确保正确的缩进和结构
+            lines = text.split('\n')
+            in_list = False
+            list_type = None  # 'ol' 或 'ul'
+            list_html = []
+            normal_lines = []
+
+            i = 0
+            while i < len(lines):
+                line = lines[i]
+
+                # 匹配编号列表项
+                numbered_match = re.match(r'^(\d+)\.\s+(.+)$', line)
+                # 匹配项目符号列表项
+                bullet_match = re.match(r'^[•\*-]\s+(.+)$', line)
+
+                if numbered_match:
+                    if not in_list or list_type != 'ol':
+                        # 开始新的编号列表
+                        if in_list:
+                            # 关闭前一个列表
+                            list_html.append(f'</{list_type}>')
+                        list_html.append('<ol>')
+                        in_list = True
+                        list_type = 'ol'
+
+                    num, content = numbered_match.groups()
+                    list_html.append(f'<li>{content}</li>')
+
+                elif bullet_match:
+                    if not in_list or list_type != 'ul':
+                        # 开始新的项目符号列表
+                        if in_list:
+                            # 关闭前一个列表
+                            list_html.append(f'</{list_type}>')
+                        list_html.append('<ul>')
+                        in_list = True
+                        list_type = 'ul'
+
+                    content = bullet_match.group(1)
+                    list_html.append(f'<li>{content}</li>')
+
+                else:
+                    if in_list:
+                        # 结束当前列表
+                        list_html.append(f'</{list_type}>')
+                        in_list = False
+                        # 将完成的列表添加到正常行中
+                        normal_lines.append(''.join(list_html))
+                        list_html = []
+
+                    normal_lines.append(line)
+
+                i += 1
+
+            # 如果最后还在列表中,确保关闭列表
+            if in_list:
+                list_html.append(f'</{list_type}>')
+                normal_lines.append(''.join(list_html))
+
+            # 重建文本
+            text = '\n'.join(normal_lines)
+
+            # 替换段落,但避免处理已经是HTML标签的部分
+            paragraphs = text.split('\n\n')
+            for i, p in enumerate(paragraphs):
+                # 如果不是以HTML标签开始且不为空
+                if not (p.strip().startswith('<') and p.strip().endswith('>')) and p.strip() != '':
+                    paragraphs[i] = f'<p>{p}</p>'
+
+            return '\n'.join(paragraphs)
+
+    def create_document(self, content: str) -> str:
+        """生成完整的HTML文档,保留原始文档结构
+
+        Args:
+            content: 处理后的文档内容
+
+        Returns:
+            str: 完整的HTML文档字符串
+        """
+        from datetime import datetime
+
+        # 将markdown内容转换为HTML
+        html_content = self._markdown_to_html(content)
+
+        return f"""<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>文档处理结果</title>
+    <style>{self.css_styles}</style>
+</head>
+<body>
+    <div class="container">
+        <h1 class="document-title">文档处理结果</h1>
+
+        <div class="document-header">
+            <div class="processing-type">处理方式: {self._escape_html(self.processing_type)}</div>
+            <div class="processing-date">处理时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
+        </div>
+
+        <div class="document-content">
+            {html_content}
+        </div>
+    </div>
+</body>
+</html>"""
\ No newline at end of file
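Worth noting about `HtmlFormatter`: the `markdown` package is optional, and the regex fallback still emits real `<h1>`/`<ol>`/`<ul>` structure, so downstream consumers can rely on those tags either way. A quick smoke test of the class defined above (output file name is a placeholder):

```python
from crazy_functions.paper_fns.file2file_doc.html_doc import HtmlFormatter

formatter = HtmlFormatter(processing_type="润色")
page = formatter.create_document("# 标题\n\n1. 第一点\n2. 第二点\n\n正文段落。")

# Both the markdown package and the regex fallback should produce these tags.
assert "<ol>" in page and "<li>第一点</li>" in page

with open("result.html", "w", encoding="utf-8") as f:  # placeholder output path
    f.write(page)
```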
Clean up extra spaces but maintain indentation + markdown_text = re.sub(r' +', ' ', markdown_text) + + return markdown_text.strip() + + +class TxtFormatter: + """文本格式化器 - 保留原始文档结构""" + + def __init__(self): + self.content = [] + self._setup_document() + + def _setup_document(self): + """初始化文档标题""" + self.content.append("=" * 50) + self.content.append("处理后文档".center(48)) + self.content.append("=" * 50) + + def _format_header(self): + """创建文档头部信息""" + from datetime import datetime + date_str = datetime.now().strftime('%Y年%m月%d日') + return [ + date_str.center(48), + "\n" # 添加空行 + ] + + def create_document(self, content): + """生成保留原始结构的文档""" + # 添加头部信息 + self.content.extend(self._format_header()) + + # 处理内容,保留原始结构 + processed_content = convert_markdown_to_txt(content) + + # 添加处理后的内容 + self.content.append(processed_content) + + # 合并所有内容 + return "\n".join(self.content) diff --git a/crazy_functions/paper_fns/file2file_doc/word2pdf.py b/crazy_functions/paper_fns/file2file_doc/word2pdf.py new file mode 100644 index 00000000..7cca27f9 --- /dev/null +++ b/crazy_functions/paper_fns/file2file_doc/word2pdf.py @@ -0,0 +1,125 @@ +from docx2pdf import convert +import os +import platform +from typing import Union +from pathlib import Path +from datetime import datetime + +class WordToPdfConverter: + """Word文档转PDF转换器""" + + @staticmethod + def convert_to_pdf(word_path: Union[str, Path], pdf_path: Union[str, Path] = None) -> str: + """ + 将Word文档转换为PDF + + 参数: + word_path: Word文档的路径 + pdf_path: 可选,PDF文件的输出路径。如果未指定,将使用与Word文档相同的名称和位置 + + 返回: + 生成的PDF文件路径 + + 异常: + 如果转换失败,将抛出相应异常 + """ + try: + # 确保输入路径是Path对象 + word_path = Path(word_path) + + # 如果未指定pdf_path,则使用与word文档相同的名称 + if pdf_path is None: + pdf_path = word_path.with_suffix('.pdf') + else: + pdf_path = Path(pdf_path) + + # 检查操作系统 + if platform.system() == 'Linux': + # Linux系统需要安装libreoffice + if not os.system('which libreoffice') == 0: + raise RuntimeError("请先安装LibreOffice: sudo apt-get install libreoffice") + + # 使用libreoffice进行转换 + os.system(f'libreoffice --headless --convert-to pdf "{word_path}" --outdir "{pdf_path.parent}"') + + # 如果输出路径与默认生成的不同,则重命名 + default_pdf = word_path.with_suffix('.pdf') + if default_pdf != pdf_path: + os.rename(default_pdf, pdf_path) + else: + # Windows和MacOS使用docx2pdf + convert(word_path, pdf_path) + + return str(pdf_path) + + except Exception as e: + raise Exception(f"转换PDF失败: {str(e)}") + + @staticmethod + def batch_convert(word_dir: Union[str, Path], pdf_dir: Union[str, Path] = None) -> list: + """ + 批量转换目录下的所有Word文档 + + 参数: + word_dir: 包含Word文档的目录路径 + pdf_dir: 可选,PDF文件的输出目录。如果未指定,将使用与Word文档相同的目录 + + 返回: + 生成的PDF文件路径列表 + """ + word_dir = Path(word_dir) + if pdf_dir: + pdf_dir = Path(pdf_dir) + pdf_dir.mkdir(parents=True, exist_ok=True) + + converted_files = [] + + for word_file in word_dir.glob("*.docx"): + try: + if pdf_dir: + pdf_path = pdf_dir / word_file.with_suffix('.pdf').name + else: + pdf_path = word_file.with_suffix('.pdf') + + pdf_file = WordToPdfConverter.convert_to_pdf(word_file, pdf_path) + converted_files.append(pdf_file) + + except Exception as e: + print(f"转换 {word_file} 失败: {str(e)}") + + return converted_files + + @staticmethod + def convert_doc_to_pdf(doc, output_dir: Union[str, Path] = None) -> str: + """ + 将docx对象直接转换为PDF + + 参数: + doc: python-docx的Document对象 + output_dir: 可选,输出目录。如果未指定,将使用当前目录 + + 返回: + 生成的PDF文件路径 + """ + try: + # 设置临时文件路径和输出路径 + output_dir = Path(output_dir) if output_dir else Path.cwd() + output_dir.mkdir(parents=True, exist_ok=True) + + # 生成临时word文件 + temp_docx = output_dir / 
f"temp_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx" + doc.save(temp_docx) + + # 转换为PDF + pdf_path = temp_docx.with_suffix('.pdf') + WordToPdfConverter.convert_to_pdf(temp_docx, pdf_path) + + # 删除临时word文件 + temp_docx.unlink() + + return str(pdf_path) + + except Exception as e: + if temp_docx.exists(): + temp_docx.unlink() + raise Exception(f"转换PDF失败: {str(e)}") \ No newline at end of file diff --git a/crazy_functions/paper_fns/file2file_doc/word_doc.py b/crazy_functions/paper_fns/file2file_doc/word_doc.py new file mode 100644 index 00000000..e0f905dd --- /dev/null +++ b/crazy_functions/paper_fns/file2file_doc/word_doc.py @@ -0,0 +1,236 @@ +import re +from docx import Document +from docx.shared import Cm, Pt +from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING +from docx.enum.style import WD_STYLE_TYPE +from docx.oxml.ns import qn +from datetime import datetime + +def convert_markdown_to_word(markdown_text): + # 0. 首先标准化所有换行符为\n + markdown_text = markdown_text.replace('\r\n', '\n').replace('\r', '\n') + + # 1. 处理标题 - 支持更多级别的标题,使用更精确的正则 + # 保留标题标记,以便后续处理时还能识别出标题级别 + markdown_text = re.sub(r'^(#{1,6})\s+(.+?)(?:\s+#+)?$', r'\1 \2', markdown_text, flags=re.MULTILINE) + + # 2. 处理粗体、斜体和加粗斜体 + markdown_text = re.sub(r'\*\*\*(.+?)\*\*\*', r'\1', markdown_text) # 加粗斜体 + markdown_text = re.sub(r'\*\*(.+?)\*\*', r'\1', markdown_text) # 加粗 + markdown_text = re.sub(r'\*(.+?)\*', r'\1', markdown_text) # 斜体 + markdown_text = re.sub(r'_(.+?)_', r'\1', markdown_text) # 下划线斜体 + markdown_text = re.sub(r'__(.+?)__', r'\1', markdown_text) # 下划线加粗 + + # 3. 处理代码块 - 不移除,而是简化格式 + # 多行代码块 + markdown_text = re.sub(r'```(?:\w+)?\n([\s\S]*?)```', r'[代码块]\n\1[/代码块]', markdown_text) + # 单行代码 + markdown_text = re.sub(r'`([^`]+)`', r'[代码]\1[/代码]', markdown_text) + + # 4. 处理列表 - 保留列表结构 + # 匹配无序列表 + markdown_text = re.sub(r'^(\s*)[-*+]\s+(.+?)$', r'\1• \2', markdown_text, flags=re.MULTILINE) + + # 5. 处理Markdown链接 + markdown_text = re.sub(r'\[([^\]]+)\]\(([^)]+?)\s*(?:"[^"]*")?\)', r'\1 (\2)', markdown_text) + + # 6. 处理HTML链接 + markdown_text = re.sub(r'([^<]+)', r'\2 (\1)', markdown_text) + + # 7. 
处理图片 + markdown_text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'[图片:\1]', markdown_text) + + return markdown_text + + +class WordFormatter: + """文档Word格式化器 - 保留原始文档结构""" + + def __init__(self): + self.doc = Document() + self._setup_document() + self._create_styles() + + def _setup_document(self): + """设置文档基本格式,包括页面设置和页眉""" + sections = self.doc.sections + for section in sections: + # 设置页面大小为A4 + section.page_width = Cm(21) + section.page_height = Cm(29.7) + # 设置页边距 + section.top_margin = Cm(3.7) # 上边距37mm + section.bottom_margin = Cm(3.5) # 下边距35mm + section.left_margin = Cm(2.8) # 左边距28mm + section.right_margin = Cm(2.6) # 右边距26mm + # 设置页眉页脚距离 + section.header_distance = Cm(2.0) + section.footer_distance = Cm(2.0) + + # 添加页眉 + header = section.header + header_para = header.paragraphs[0] + header_para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT + header_run = header_para.add_run("文档处理结果") + header_run.font.name = '仿宋' + header_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋') + header_run.font.size = Pt(9) + + def _create_styles(self): + """创建文档样式""" + # 创建正文样式 + style = self.doc.styles.add_style('Normal_Custom', WD_STYLE_TYPE.PARAGRAPH) + style.font.name = '仿宋' + style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋') + style.font.size = Pt(12) # 调整为12磅 + style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE + style.paragraph_format.space_after = Pt(0) + + # 创建标题样式 + title_style = self.doc.styles.add_style('Title_Custom', WD_STYLE_TYPE.PARAGRAPH) + title_style.font.name = '黑体' + title_style._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体') + title_style.font.size = Pt(22) # 调整为22磅 + title_style.font.bold = True + title_style.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER + title_style.paragraph_format.space_before = Pt(0) + title_style.paragraph_format.space_after = Pt(24) + title_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE + + # 创建标题1样式 + h1_style = self.doc.styles.add_style('Heading1_Custom', WD_STYLE_TYPE.PARAGRAPH) + h1_style.font.name = '黑体' + h1_style._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体') + h1_style.font.size = Pt(18) + h1_style.font.bold = True + h1_style.paragraph_format.space_before = Pt(12) + h1_style.paragraph_format.space_after = Pt(6) + + # 创建标题2样式 + h2_style = self.doc.styles.add_style('Heading2_Custom', WD_STYLE_TYPE.PARAGRAPH) + h2_style.font.name = '黑体' + h2_style._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体') + h2_style.font.size = Pt(16) + h2_style.font.bold = True + h2_style.paragraph_format.space_before = Pt(10) + h2_style.paragraph_format.space_after = Pt(6) + + # 创建标题3样式 + h3_style = self.doc.styles.add_style('Heading3_Custom', WD_STYLE_TYPE.PARAGRAPH) + h3_style.font.name = '黑体' + h3_style._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体') + h3_style.font.size = Pt(14) + h3_style.font.bold = True + h3_style.paragraph_format.space_before = Pt(8) + h3_style.paragraph_format.space_after = Pt(4) + + # 创建代码块样式 + code_style = self.doc.styles.add_style('Code_Custom', WD_STYLE_TYPE.PARAGRAPH) + code_style.font.name = 'Courier New' + code_style.font.size = Pt(11) + code_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.SINGLE + code_style.paragraph_format.space_before = Pt(6) + code_style.paragraph_format.space_after = Pt(6) + code_style.paragraph_format.left_indent = Pt(36) + code_style.paragraph_format.right_indent = Pt(36) + + # 创建列表样式 + list_style = self.doc.styles.add_style('List_Custom', WD_STYLE_TYPE.PARAGRAPH) + list_style.font.name = '仿宋' + list_style._element.rPr.rFonts.set(qn('w:eastAsia'), 
'仿宋') + list_style.font.size = Pt(12) + list_style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE + list_style.paragraph_format.left_indent = Pt(21) + list_style.paragraph_format.first_line_indent = Pt(-21) + + def create_document(self, content: str, processing_type: str = "文本处理"): + """创建文档,保留原始结构""" + # 添加标题 + title_para = self.doc.add_paragraph(style='Title_Custom') + title_run = title_para.add_run('文档处理结果') + + # 添加处理类型 + processing_para = self.doc.add_paragraph() + processing_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER + processing_run = processing_para.add_run(f"处理方式: {processing_type}") + processing_run.font.name = '仿宋' + processing_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋') + processing_run.font.size = Pt(14) + + # 添加日期 + date_para = self.doc.add_paragraph() + date_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER + date_run = date_para.add_run(f"处理时间: {datetime.now().strftime('%Y年%m月%d日')}") + date_run.font.name = '仿宋' + date_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋') + date_run.font.size = Pt(14) + + self.doc.add_paragraph() # 添加空行 + + # 预处理内容,将Markdown格式转换为适合Word的格式 + processed_content = convert_markdown_to_word(content) + + # 按行处理文本,保留结构 + lines = processed_content.split('\n') + in_code_block = False + current_paragraph = None + + for line in lines: + # 检查是否为标题 + header_match = re.match(r'^(#{1,6})\s+(.+)$', line) + + if header_match: + # 根据#的数量确定标题级别 + level = len(header_match.group(1)) + title_text = header_match.group(2) + + if level == 1: + style = 'Heading1_Custom' + elif level == 2: + style = 'Heading2_Custom' + else: + style = 'Heading3_Custom' + + self.doc.add_paragraph(title_text, style=style) + current_paragraph = None + + # 检查代码块标记 + elif '[代码块]' in line: + in_code_block = True + current_paragraph = self.doc.add_paragraph(style='Code_Custom') + code_line = line.replace('[代码块]', '').strip() + if code_line: + current_paragraph.add_run(code_line) + + elif '[/代码块]' in line: + in_code_block = False + code_line = line.replace('[/代码块]', '').strip() + if code_line and current_paragraph: + current_paragraph.add_run(code_line) + current_paragraph = None + + # 检查列表项 + elif line.strip().startswith('•'): + p = self.doc.add_paragraph(style='List_Custom') + p.add_run(line.strip()) + current_paragraph = None + + # 处理普通文本行 + elif line.strip(): + if in_code_block: + if current_paragraph: + current_paragraph.add_run('\n' + line) + else: + current_paragraph = self.doc.add_paragraph(line, style='Code_Custom') + else: + if current_paragraph is None or not current_paragraph.text: + current_paragraph = self.doc.add_paragraph(line, style='Normal_Custom') + else: + current_paragraph.add_run('\n' + line) + + # 处理空行,创建新段落 + elif not in_code_block: + current_paragraph = None + + return self.doc + diff --git a/crazy_functions/paper_fns/github_search.py b/crazy_functions/paper_fns/github_search.py new file mode 100644 index 00000000..8bbd8b39 --- /dev/null +++ b/crazy_functions/paper_fns/github_search.py @@ -0,0 +1,278 @@ +from typing import List, Dict, Tuple +import asyncio +from dataclasses import dataclass +from toolbox import CatchException, update_ui, promote_file_to_downloadzone, get_log_folder, get_user +from toolbox import update_ui, CatchException, report_exception, write_history_to_file +from crazy_functions.paper_fns.auto_git.query_analyzer import QueryAnalyzer, SearchCriteria +from crazy_functions.paper_fns.auto_git.handlers.repo_handler import RepositoryHandler +from crazy_functions.paper_fns.auto_git.handlers.code_handler import 
CodeSearchHandler +from crazy_functions.paper_fns.auto_git.handlers.user_handler import UserSearchHandler +from crazy_functions.paper_fns.auto_git.handlers.topic_handler import TopicHandler +from crazy_functions.paper_fns.auto_git.sources.github_source import GitHubSource +from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive +import re +from datetime import datetime +import os +import json +from pathlib import Path +import time + +# 导入格式化器 +from crazy_functions.paper_fns.file2file_doc import ( + TxtFormatter, + MarkdownFormatter, + HtmlFormatter, + WordFormatter +) +from crazy_functions.paper_fns.file2file_doc.word2pdf import WordToPdfConverter + +@CatchException +def GitHub项目智能检索(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List, + history: List, system_prompt: str, user_request: str): + """GitHub项目智能检索主函数""" + + # 初始化GitHub API调用源 + github_source = GitHubSource(api_key=plugin_kwargs.get("github_api_key")) + + # 初始化处理器 + handlers = { + "repo": RepositoryHandler(github_source, llm_kwargs), + "code": CodeSearchHandler(github_source, llm_kwargs), + "user": UserSearchHandler(github_source, llm_kwargs), + "topic": TopicHandler(github_source, llm_kwargs), + } + + # 分析查询意图 + chatbot.append(["分析查询意图", "正在分析您的查询需求..."]) + yield from update_ui(chatbot=chatbot, history=history) + + query_analyzer = QueryAnalyzer() + search_criteria = yield from query_analyzer.analyze_query( + txt, chatbot, llm_kwargs + ) + + # 根据查询类型选择处理器 + handler = handlers.get(search_criteria.query_type) + if not handler: + handler = handlers["repo"] # 默认使用仓库处理器 + + # 处理查询 + chatbot.append(["开始搜索", f"使用{handler.__class__.__name__}处理您的请求,正在搜索GitHub..."]) + yield from update_ui(chatbot=chatbot, history=history) + + final_prompt = asyncio.run(handler.handle( + criteria=search_criteria, + chatbot=chatbot, + history=history, + system_prompt=system_prompt, + llm_kwargs=llm_kwargs, + plugin_kwargs=plugin_kwargs + )) + + if final_prompt: + # 检查是否是道歉提示 + if "很抱歉,我们未能找到" in final_prompt: + chatbot.append([txt, final_prompt]) + yield from update_ui(chatbot=chatbot, history=history) + return + + # 在 final_prompt 末尾添加用户原始查询要求 + final_prompt += f""" + +原始用户查询: "{txt}" + +重要提示: +- 你的回答必须直接满足用户的原始查询要求 +- 在遵循之前指南的同时,优先回答用户明确提出的问题 +- 确保回答格式和内容与用户期望一致 +- 对于GitHub仓库需要提供链接地址, 回复中请采用以下格式的HTML链接: + * 对于GitHub仓库: 仓库名 +- 不要生成参考列表,引用信息将另行处理 +""" + + # 使用最终的prompt生成回答 + response = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs=final_prompt, + inputs_show_user=txt, + llm_kwargs=llm_kwargs, + chatbot=chatbot, + history=[], + sys_prompt=f"你是一个熟悉GitHub生态系统的专业助手,能帮助用户找到合适的项目、代码和开发者。除非用户指定,否则请使用中文回复。" + ) + + # 1. 
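
For reference, a sketch of one entry in handler.ranked_repos as the formatting code below expects it; the field names follow the GitHub REST search API, and every value here is hypothetical:

sample_repo = {
    "name": "example-repo",
    "full_name": "octocat/example-repo",
    "html_url": "https://github.com/octocat/example-repo",
    "description": "An illustrative repository",
    "language": "Python",
    "stargazers_count": 1234,
    "forks_count": 56,
    "created_at": "2020-01-01T00:00:00Z",
    "updated_at": "2025-07-01T12:00:00Z",
    "owner": {"login": "octocat", "html_url": "https://github.com/octocat"},
    "license": {"name": "MIT License"},
    "topics": ["demo", "search"],
    "homepage": "",
}
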
获取项目列表 + repos_list = handler.ranked_repos # 直接使用原始仓库数据 + + # 在新的对话中添加格式化的仓库参考列表 + if repos_list: + references = "" + for idx, repo in enumerate(repos_list, 1): + # 构建仓库引用 + stars_str = f"⭐ {repo.get('stargazers_count', 'N/A')}" if repo.get('stargazers_count') else "" + forks_str = f"🍴 {repo.get('forks_count', 'N/A')}" if repo.get('forks_count') else "" + stats = f"{stars_str} {forks_str}".strip() + stats = f" ({stats})" if stats else "" + + language = f" [{repo.get('language', '')}]" if repo.get('language') else "" + + reference = f"[{idx}] **{repo.get('name', '')}**{language}{stats} \n" + reference += f"👤 {repo.get('owner', {}).get('login', 'N/A') if repo.get('owner') is not None else 'N/A'} | " + reference += f"📅 {repo.get('updated_at', 'N/A')[:10]} | " + reference += f"GitHub \n" + + if repo.get('description'): + reference += f"{repo.get('description')} \n" + reference += " \n" + + references += reference + + # 添加新的对话显示参考仓库 + chatbot.append(["推荐项目如下:", references]) + yield from update_ui(chatbot=chatbot, history=history) + + # 2. 保存结果到文件 + # 创建保存目录 + save_dir = get_log_folder(get_user(chatbot), plugin_name='github_search') + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + # 生成文件名 + def get_safe_filename(txt, max_length=10): + # 获取文本前max_length个字符作为文件名 + filename = txt[:max_length].strip() + # 移除不安全的文件名字符 + filename = re.sub(r'[\\/:*?"<>|]', '', filename) + # 如果文件名为空,使用时间戳 + if not filename: + filename = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + return filename + + base_filename = get_safe_filename(txt) + + # 准备保存的内容 - 优化文档结构 + md_content = f"# GitHub搜索结果: {txt}\n\n" + md_content += f"搜索时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n" + + # 添加模型回复 + md_content += "## 搜索分析与总结\n\n" + md_content += response + "\n\n" + + # 添加所有搜索到的仓库详细信息 + md_content += "## 推荐项目详情\n\n" + + if not repos_list: + md_content += "未找到匹配的项目\n\n" + else: + md_content += f"共找到 {len(repos_list)} 个相关项目\n\n" + + # 添加项目简表 + md_content += "### 项目一览表\n\n" + md_content += "| 序号 | 项目名称 | 作者 | 语言 | 星标数 | 更新时间 |\n" + md_content += "| ---- | -------- | ---- | ---- | ------ | -------- |\n" + + for idx, repo in enumerate(repos_list, 1): + md_content += f"| {idx} | [{repo.get('name', '')}]({repo.get('html_url', '')}) | {repo.get('owner', {}).get('login', 'N/A') if repo.get('owner') is not None else 'N/A'} | {repo.get('language', 'N/A')} | {repo.get('stargazers_count', 'N/A')} | {repo.get('updated_at', 'N/A')[:10]} |\n" + + md_content += "\n" + + # 添加详细项目信息 + md_content += "### 项目详细信息\n\n" + for idx, repo in enumerate(repos_list, 1): + md_content += f"#### {idx}. 
{repo.get('name', '')}\n\n" + md_content += f"- **仓库**: [{repo.get('full_name', '')}]({repo.get('html_url', '')})\n" + md_content += f"- **作者**: [{repo.get('owner', {}).get('login', '') if repo.get('owner') is not None else 'N/A'}]({repo.get('owner', {}).get('html_url', '') if repo.get('owner') is not None else '#'})\n" + md_content += f"- **描述**: {repo.get('description', 'N/A')}\n" + md_content += f"- **语言**: {repo.get('language', 'N/A')}\n" + md_content += f"- **星标**: {repo.get('stargazers_count', 'N/A')}\n" + md_content += f"- **Fork数**: {repo.get('forks_count', 'N/A')}\n" + md_content += f"- **最近更新**: {repo.get('updated_at', 'N/A')[:10]}\n" + md_content += f"- **创建时间**: {repo.get('created_at', 'N/A')[:10]}\n" + md_content += f"- **开源许可**: {repo.get('license', {}).get('name', 'N/A') if repo.get('license') is not None else 'N/A'}\n" + if repo.get('topics'): + md_content += f"- **主题标签**: {', '.join(repo.get('topics', []))}\n" + if repo.get('homepage'): + md_content += f"- **项目主页**: [{repo.get('homepage')}]({repo.get('homepage')})\n" + md_content += "\n" + + # 添加查询信息和元数据 + md_content += "## 查询元数据\n\n" + md_content += f"- **原始查询**: {txt}\n" + md_content += f"- **查询类型**: {search_criteria.query_type}\n" + md_content += f"- **关键词**: {', '.join(search_criteria.keywords) if hasattr(search_criteria, 'keywords') and search_criteria.keywords else 'N/A'}\n" + md_content += f"- **搜索日期**: {datetime.now().strftime('%Y-%m-%d')}\n\n" + + # 保存为多种格式 + saved_files = [] + failed_files = [] + + # 1. 保存为TXT + try: + txt_formatter = TxtFormatter() + txt_content = txt_formatter.create_document(md_content) + txt_file = os.path.join(save_dir, f"github_results_{base_filename}.txt") + with open(txt_file, 'w', encoding='utf-8') as f: + f.write(txt_content) + promote_file_to_downloadzone(txt_file, chatbot=chatbot) + saved_files.append("TXT") + except Exception as e: + failed_files.append(f"TXT (错误: {str(e)})") + + # 2. 保存为Markdown + try: + md_formatter = MarkdownFormatter() + formatted_md_content = md_formatter.create_document(md_content, "GitHub项目搜索") + md_file = os.path.join(save_dir, f"github_results_{base_filename}.md") + with open(md_file, 'w', encoding='utf-8') as f: + f.write(formatted_md_content) + promote_file_to_downloadzone(md_file, chatbot=chatbot) + saved_files.append("Markdown") + except Exception as e: + failed_files.append(f"Markdown (错误: {str(e)})") + + # 3. 保存为HTML + try: + html_formatter = HtmlFormatter(processing_type="GitHub项目搜索") + html_content = html_formatter.create_document(md_content) + html_file = os.path.join(save_dir, f"github_results_{base_filename}.html") + with open(html_file, 'w', encoding='utf-8') as f: + f.write(html_content) + promote_file_to_downloadzone(html_file, chatbot=chatbot) + saved_files.append("HTML") + except Exception as e: + failed_files.append(f"HTML (错误: {str(e)})") + + # 4. 保存为Word + word_file = None + try: + word_formatter = WordFormatter() + doc = word_formatter.create_document(md_content, "GitHub项目搜索") + word_file = os.path.join(save_dir, f"github_results_{base_filename}.docx") + doc.save(word_file) + promote_file_to_downloadzone(word_file, chatbot=chatbot) + saved_files.append("Word") + except Exception as e: + failed_files.append(f"Word (错误: {str(e)})") + word_file = None + + # 5. 
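
The four format branches above repeat one try/write/promote pattern; a possible refactoring sketch (illustration only: write_fn is a hypothetical callable, and promote_file_to_downloadzone is the helper already imported in this file):

def _save_one_format(write_fn, path, label, saved_files, failed_files, chatbot):
    """尝试写出一种格式;失败时记录错误而不中断其他格式。"""
    try:
        write_fn(path)
        promote_file_to_downloadzone(path, chatbot=chatbot)
        saved_files.append(label)
    except Exception as e:
        failed_files.append(f"{label} (错误: {str(e)})")
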
保存为PDF (仅当Word保存成功时) + if word_file and os.path.exists(word_file): + try: + pdf_file = WordToPdfConverter.convert_to_pdf(word_file) + promote_file_to_downloadzone(pdf_file, chatbot=chatbot) + saved_files.append("PDF") + except Exception as e: + failed_files.append(f"PDF (错误: {str(e)})") + + # 报告保存结果 + if saved_files: + success_message = f"成功保存以下格式: {', '.join(saved_files)}" + if failed_files: + failure_message = f"以下格式保存失败: {', '.join(failed_files)}" + chatbot.append(["部分格式保存成功", f"{success_message}。{failure_message}"]) + else: + chatbot.append(["所有格式保存成功", success_message]) + else: + chatbot.append(["保存失败", f"所有格式均保存失败: {', '.join(failed_files)}"]) + else: + report_exception(chatbot, history, a=f"处理失败", b=f"请尝试其他查询") + yield from update_ui(chatbot=chatbot, history=history) \ No newline at end of file diff --git a/crazy_functions/paper_fns/journal_paper_recom.py b/crazy_functions/paper_fns/journal_paper_recom.py new file mode 100644 index 00000000..09051c09 --- /dev/null +++ b/crazy_functions/paper_fns/journal_paper_recom.py @@ -0,0 +1,635 @@ +import os +import time +import glob +from typing import Dict, List, Generator, Tuple +from dataclasses import dataclass + +from crazy_functions.pdf_fns.text_content_loader import TextContentLoader +from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive +from toolbox import update_ui, promote_file_to_downloadzone, write_history_to_file, CatchException, report_exception +from shared_utils.fastapi_server import validate_path_safety +# 导入论文下载相关函数 +from crazy_functions.论文下载 import extract_paper_id, extract_paper_ids, get_arxiv_paper, format_arxiv_id, SciHub +from pathlib import Path +from datetime import datetime, timedelta +import calendar + + +@dataclass +class RecommendationQuestion: + """期刊会议推荐分析问题类""" + id: str # 问题ID + question: str # 问题内容 + importance: int # 重要性 (1-5,5最高) + description: str # 问题描述 + + +class JournalConferenceRecommender: + """论文期刊会议推荐器""" + + def __init__(self, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List, history: List, system_prompt: str): + """初始化推荐器""" + self.llm_kwargs = llm_kwargs + self.plugin_kwargs = plugin_kwargs + self.chatbot = chatbot + self.history = history + self.system_prompt = system_prompt + self.paper_content = "" + self.analysis_results = {} + + # 定义论文分析问题库(针对期刊会议推荐) + self.questions = [ + RecommendationQuestion( + id="research_field_and_topic", + question="请分析这篇论文的研究领域、主题和关键词。具体包括:1)论文属于哪个主要学科领域(如自然科学、工程技术、医学、社会科学、人文学科等);2)具体的研究子领域或方向;3)论文的核心主题和关键概念;4)重要的学术关键词和专业术语;5)研究的跨学科特征(如果有);6)研究的地域性特征(国际性研究还是特定地区研究)。", + importance=5, + description="研究领域与主题分析" + ), + RecommendationQuestion( + id="methodology_and_approach", + question="请分析论文的研究方法和技术路线。包括:1)采用的主要研究方法(定量研究、定性研究、理论分析、实验研究、田野调查、文献综述、案例研究等);2)使用的技术手段、工具或分析方法;3)研究设计的严谨性和创新性;4)数据收集和分析方法的适当性;5)研究方法在该学科中的先进性或传统性;6)方法学上的贡献或局限性。", + importance=4, + description="研究方法与技术路线" + ), + RecommendationQuestion( + id="novelty_and_contribution", + question="请评估论文的创新性和学术贡献。包括:1)研究的新颖性程度(理论创新、方法创新、应用创新等);2)对现有知识体系的贡献或突破;3)解决问题的重要性和学术价值;4)研究成果的理论意义和实践价值;5)在该学科领域的地位和影响潜力;6)与国际前沿研究的关系;7)对后续研究的启发意义。", + importance=4, + description="创新性与学术贡献" + ), + RecommendationQuestion( + id="target_audience_and_scope", + question="请分析论文的目标受众和应用范围。包括:1)主要面向的学术群体(研究者、从业者、政策制定者等);2)研究成果的潜在应用领域和受益群体;3)对学术界和实践界的价值;4)研究的国际化程度和跨文化适用性;5)是否适合国际期刊还是区域性期刊;6)语言发表偏好(英文、中文或其他语言);7)开放获取的必要性和可行性。", + importance=3, + description="目标受众与应用范围" + ), + ] + + # 按重要性排序 + self.questions.sort(key=lambda q: q.importance, reverse=True) + + def _load_paper(self, paper_path: str) -> 
Generator: + """加载论文内容""" + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 使用TextContentLoader读取文件 + loader = TextContentLoader(self.chatbot, self.history) + + yield from loader.execute_single_file(paper_path) + + # 获取加载的内容 + if len(self.history) >= 2 and self.history[-2]: + self.paper_content = self.history[-2] + yield from update_ui(chatbot=self.chatbot, history=self.history) + return True + else: + self.chatbot.append(["错误", "无法读取论文内容,请检查文件是否有效"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return False + + def _analyze_question(self, question: RecommendationQuestion) -> Generator: + """分析单个问题""" + try: + # 创建分析提示 + prompt = f"请基于以下论文内容回答问题:\n\n{self.paper_content}\n\n问题:{question.question}" + + # 使用单线程版本的请求函数 + response = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs=prompt, + inputs_show_user=question.question, # 显示问题本身 + llm_kwargs=self.llm_kwargs, + chatbot=self.chatbot, + history=[], # 空历史,确保每个问题独立分析 + sys_prompt="你是一个专业的学术期刊会议推荐专家,需要仔细分析论文内容并提供准确的分析。请保持客观、专业,并基于论文内容提供深入分析。" + ) + + if response: + self.analysis_results[question.id] = response + return True + return False + + except Exception as e: + self.chatbot.append(["错误", f"分析问题时出错: {str(e)}"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return False + + def _generate_journal_recommendations(self) -> Generator: + """生成期刊推荐""" + self.chatbot.append(["生成期刊推荐", "正在基于论文分析结果生成期刊推荐..."]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 构建期刊推荐提示 + journal_prompt = """请基于以下论文分析结果,为这篇论文推荐合适的学术期刊。 + +推荐要求: +1. 根据论文的创新性和工作质量,分别推荐不同级别的期刊: + - 顶级期刊(影响因子>8或该领域顶级期刊):2-3个 + - 高质量期刊(影响因子4-8或该领域知名期刊):3-4个 + - 中等期刊(影响因子1.5-4或该领域认可期刊):3-4个 + - 入门期刊(影响因子<1.5但声誉良好的期刊):2-3个 + +注意:不同学科的影响因子标准差异很大,请根据论文所属学科的实际情况调整标准。 +特别是医学领域,需要考虑: +- 临床医学期刊通常影响因子较高(顶级期刊IF>20,高质量期刊IF>10) +- 基础医学期刊影响因子相对较低但学术价值很高 +- 专科医学期刊在各自领域内具有权威性 +- 医学期刊的临床实用性和循证医学价值 + +2. 对每个期刊提供详细信息: + - 期刊全名和缩写 + - 最新影响因子(如果知道) + - 期刊级别分类(Q1/Q2/Q3/Q4或该学科的分类标准) + - 主要研究领域和范围 + - 与论文内容的匹配度评分(1-10分) + - 发表难度评估(容易/中等/困难/极难) + - 平均审稿周期 + - 开放获取政策 + - 期刊的学科分类(如SCI、SSCI、A&HCI等) + - 医学期刊特殊信息(如适用): + * PubMed收录情况 + * 是否为核心临床期刊 + * 专科领域权威性 + * 循证医学等级要求 + * 临床试验注册要求 + * 伦理委员会批准要求 + +3. 按推荐优先级排序,并说明推荐理由 +4. 
提供针对性的投稿建议,考虑该学科的特点 + +论文分析结果:""" + + for q in self.questions: + if q.id in self.analysis_results: + journal_prompt += f"\n\n{q.description}:\n{self.analysis_results[q.id]}" + + journal_prompt += "\n\n请提供详细的期刊推荐报告,重点关注期刊的层次性和适配性。请根据论文的具体学科领域,采用该领域通用的期刊评价标准和分类体系。" + + try: + response = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs=journal_prompt, + inputs_show_user="生成期刊推荐报告", + llm_kwargs=self.llm_kwargs, + chatbot=self.chatbot, + history=[], + sys_prompt="你是一个资深的跨学科学术期刊推荐专家,熟悉各个学科领域不同层次的期刊。请根据论文的具体学科和创新性,推荐从顶级到入门级的各层次期刊。不同学科有不同的期刊评价标准:理工科重视影响因子和SCI收录,社会科学重视SSCI和学科声誉,人文学科重视A&HCI和同行评议,医学领域重视PubMed收录、临床实用性、循证医学价值和伦理规范。请根据论文所属学科采用相应的评价标准。" + ) + + if response: + return response + return "期刊推荐生成失败" + + except Exception as e: + self.chatbot.append(["错误", f"生成期刊推荐时出错: {str(e)}"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return "期刊推荐生成失败: " + str(e) + + def _generate_conference_recommendations(self) -> Generator: + """生成会议推荐""" + self.chatbot.append(["生成会议推荐", "正在基于论文分析结果生成会议推荐..."]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 获取当前时间信息 + current_time = datetime.now() + current_date_str = current_time.strftime("%Y年%m月%d日") + current_year = current_time.year + current_month = current_time.month + + # 构建会议推荐提示 + conference_prompt = f"""请基于以下论文分析结果,为这篇论文推荐合适的学术会议。 + +**重要提示:当前时间是{current_date_str}({current_year}年{current_month}月),请基于这个时间点推断会议的举办时间和投稿截止时间。** + +推荐要求: +1. 根据论文的创新性和工作质量,分别推荐不同级别的会议: + - 顶级会议(该领域最权威的国际会议):2-3个 + - 高质量会议(该领域知名的国际或区域会议):3-4个 + - 中等会议(该领域认可的专业会议):3-4个 + - 专业会议(该领域细分方向的专门会议):2-3个 + +注意:不同学科的会议评价标准不同: +- 计算机科学:可参考CCF分类(A/B/C类) +- 工程学:可参考EI收录和影响力 +- 医学:可参考会议的临床影响和同行认可度 +- 社会科学:可参考会议的学术声誉和参与度 +- 人文学科:可参考会议的历史和学术传统 +- 自然科学:可参考会议的国际影响力和发表质量 + +特别是医学会议,需要考虑: +- 临床医学会议重视实用性和临床指导价值 +- 基础医学会议重视科学创新和机制研究 +- 专科医学会议在各自领域内具有权威性 +- 国际医学会议的CME学分认证情况 + +2. 对每个会议提供详细信息: + - 会议全名和缩写 + - 会议级别分类(根据该学科的评价标准) + - 主要研究领域和主题 + - 与论文内容的匹配度评分(1-10分) + - 录用难度评估(容易/中等/困难/极难) + - 会议举办周期(年会/双年会/不定期等) + - **基于当前时间{current_date_str},推断{current_year}年和{current_year+1}年的举办时间和地点**(请根据往年的举办时间规律进行推断) + - **基于推断的会议时间,估算论文提交截止时间**(通常在会议前3-6个月) + - 会议的国际化程度和影响范围 + - 医学会议特殊信息(如适用): + * 是否提供CME学分 + * 临床实践指导价值 + * 专科认证机构认可情况 + * 会议论文集的PubMed收录情况 + * 伦理和临床试验相关要求 + +3. 按推荐优先级排序,并说明推荐理由 +4. 
**基于当前时间{current_date_str},提供会议投稿的时间规划建议** + - 哪些会议可以赶上{current_year}年的投稿截止时间 + - 哪些会议需要准备{current_year+1}年的投稿 + - 具体的时间安排建议 + +论文分析结果:""" + + for q in self.questions: + if q.id in self.analysis_results: + conference_prompt += f"\n\n{q.description}:\n{self.analysis_results[q.id]}" + + conference_prompt += f"\n\n请提供详细的会议推荐报告,重点关注会议的层次性和时效性。请根据论文的具体学科领域,采用该领域通用的会议评价标准。\n\n**特别注意:请根据当前时间{current_date_str}和各会议的历史举办时间规律,准确推断{current_year}年和{current_year+1}年的会议时间安排,不要使用虚构的时间。**" + + try: + response = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs=conference_prompt, + inputs_show_user="生成会议推荐报告", + llm_kwargs=self.llm_kwargs, + chatbot=self.chatbot, + history=[], + sys_prompt="你是一个资深的跨学科学术会议推荐专家,熟悉各个学科领域不同层次的学术会议。请根据论文的具体学科和创新性,推荐从顶级到专业级的各层次会议。不同学科有不同的会议评价标准和文化:理工科重视技术创新和国际影响力,社会科学重视理论贡献和社会意义,人文学科重视学术深度和文化价值,医学领域重视临床实用性、CME学分认证、专科权威性和伦理规范。请根据论文所属学科采用相应的评价标准和推荐策略。" + ) + + if response: + return response + return "会议推荐生成失败" + + except Exception as e: + self.chatbot.append(["错误", f"生成会议推荐时出错: {str(e)}"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return "会议推荐生成失败: " + str(e) + + def _generate_priority_summary(self, journal_recommendations: str, conference_recommendations: str) -> Generator: + """生成优先级总结""" + self.chatbot.append(["生成优先级总结", "正在生成投稿优先级总结..."]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 获取当前时间信息 + current_time = datetime.now() + current_date_str = current_time.strftime("%Y年%m月%d日") + current_month = current_time.strftime("%Y年%m月") + + # 计算未来时间点 + def add_months(date, months): + """安全地添加月份""" + month = date.month - 1 + months + year = date.year + month // 12 + month = month % 12 + 1 + day = min(date.day, calendar.monthrange(year, month)[1]) + return date.replace(year=year, month=month, day=day) + + future_6_months = add_months(current_time, 6).strftime('%Y年%m月') + future_12_months = add_months(current_time, 12).strftime('%Y年%m月') + future_year = (current_time.year + 1) + + priority_prompt = f"""请基于以下期刊和会议推荐结果,生成一个综合的投稿优先级总结。 + +**重要提示:当前时间是{current_date_str}({current_month}),请基于这个时间点制定投稿计划。** + +期刊推荐结果: +{journal_recommendations} + +会议推荐结果: +{conference_recommendations} + +请提供: +1. 综合投稿策略建议(考虑该学科的发表文化和惯例) + - 期刊优先还是会议优先(不同学科有不同偏好) + - 国际期刊/会议 vs 国内期刊/会议的选择策略 + - 英文发表 vs 中文发表的考虑 + +2. 按时间线排列的投稿计划(**基于当前时间{current_date_str},考虑截止时间和审稿周期**) + - 短期目标({current_month}起3-6个月内,即到{future_6_months}) + - 中期目标(6-12个月内,即到{future_12_months}) + - 长期目标(1年以上,即{future_year}年以后) + +3. 风险分散策略 + - 同时投稿多个不同级别的目标 + - 考虑该学科的一稿多投政策 + - 备选方案和应急策略 + +4. 针对论文可能需要的改进建议 + - 根据目标期刊/会议的要求调整内容 + - 语言和格式的优化建议 + - 补充实验或分析的建议 + +5. 预期的发表时间线和成功概率评估(基于当前时间{current_date_str}) + +6. 
该学科特有的发表注意事项 + - 伦理审查要求(如医学、心理学等) + - 数据开放要求(如某些自然科学领域) + - 利益冲突声明(如医学、工程等) + - 医学领域特殊要求: + * 临床试验注册要求(ClinicalTrials.gov、中国临床试验注册中心等) + * 患者知情同意和隐私保护 + * 医学伦理委员会批准证明 + * CONSORT、STROBE、PRISMA等报告规范遵循 + * 药物/器械安全性数据要求 + * CME学分认证相关要求 + * 临床指南和循证医学等级要求 + - 其他学科特殊要求 + +请以表格形式总结前10个最推荐的投稿目标(期刊+会议),包括优先级排序、预期时间线和成功概率。 + +**注意:所有时间规划都应基于当前时间{current_date_str}进行计算,不要使用虚构的时间。**""" + + try: + response = yield from request_gpt_model_in_new_thread_with_ui_alive( + inputs=priority_prompt, + inputs_show_user="生成投稿优先级总结", + llm_kwargs=self.llm_kwargs, + chatbot=self.chatbot, + history=[], + sys_prompt="你是一个资深的跨学科学术发表策略专家,熟悉各个学科的发表文化、惯例和要求。请综合考虑不同学科的特点:理工科通常重视期刊发表和影响因子,社会科学平衡期刊和专著,人文学科重视同行评议和学术声誉,医学重视临床意义和伦理规范。请为作者制定最适合其学科背景的投稿策略和时间规划。" + ) + + if response: + return response + return "优先级总结生成失败" + + except Exception as e: + self.chatbot.append(["错误", f"生成优先级总结时出错: {str(e)}"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return "优先级总结生成失败: " + str(e) + + def save_recommendations(self, journal_recommendations: str, conference_recommendations: str, priority_summary: str) -> Generator: + """保存推荐报告""" + timestamp = time.strftime("%Y%m%d_%H%M%S") + + # 保存为Markdown文件 + try: + md_content = f"""# 论文期刊会议推荐报告 + +## 投稿优先级总结 + +{priority_summary} + +## 期刊推荐 + +{journal_recommendations} + +## 会议推荐 + +{conference_recommendations} + +--- + +# 详细分析结果 +""" + + # 添加详细分析结果 + for q in self.questions: + if q.id in self.analysis_results: + md_content += f"\n\n## {q.description}\n\n{self.analysis_results[q.id]}" + + result_file = write_history_to_file( + history=[md_content], + file_basename=f"期刊会议推荐_{timestamp}.md" + ) + + if result_file and os.path.exists(result_file): + promote_file_to_downloadzone(result_file, chatbot=self.chatbot) + self.chatbot.append(["保存成功", f"推荐报告已保存至: {os.path.basename(result_file)}"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + else: + self.chatbot.append(["警告", "保存报告成功但找不到文件"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + except Exception as e: + self.chatbot.append(["警告", f"保存报告失败: {str(e)}"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + + def recommend_venues(self, paper_path: str) -> Generator: + """推荐期刊会议主流程""" + # 加载论文 + success = yield from self._load_paper(paper_path) + if not success: + return + + # 分析关键问题 + for question in self.questions: + yield from self._analyze_question(question) + + # 分别生成期刊和会议推荐 + journal_recommendations = yield from self._generate_journal_recommendations() + conference_recommendations = yield from self._generate_conference_recommendations() + + # 生成优先级总结 + priority_summary = yield from self._generate_priority_summary(journal_recommendations, conference_recommendations) + + # 显示结果 + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 保存报告 + yield from self.save_recommendations(journal_recommendations, conference_recommendations, priority_summary) + + # 将完整的分析结果和推荐内容添加到历史记录中,方便用户继续提问 + self._add_to_history(journal_recommendations, conference_recommendations, priority_summary) + + def _add_to_history(self, journal_recommendations: str, conference_recommendations: str, priority_summary: str): + """将分析结果和推荐内容添加到历史记录中""" + try: + # 构建完整的内容摘要 + history_content = f"""# 论文期刊会议推荐分析完成 + +## 📊 投稿优先级总结 +{priority_summary} + +## 📚 期刊推荐 +{journal_recommendations} + +## 🏛️ 会议推荐 +{conference_recommendations} + +## 📋 详细分析结果 +""" + + # 添加详细分析结果 + for q in self.questions: + if q.id in self.analysis_results: + history_content += f"\n### 
{q.description}\n{self.analysis_results[q.id]}\n" + + history_content += "\n---\n💡 您现在可以基于以上分析结果继续提问,比如询问特定期刊的详细信息、投稿策略建议、或者对推荐结果的进一步解释。" + + # 添加到历史记录中 + self.history.append("论文期刊会议推荐分析") + self.history.append(history_content) + + self.chatbot.append(["✅ 分析完成", "所有分析结果和推荐内容已添加到对话历史中,您可以继续基于这些内容提问。"]) + + except Exception as e: + self.chatbot.append(["警告", f"添加到历史记录时出错: {str(e)},但推荐报告已正常生成"]) + # 即使添加历史失败,也不影响主要功能 + + +def _find_paper_file(path: str) -> str: + """查找路径中的论文文件(简化版)""" + if os.path.isfile(path): + return path + + # 支持的文件扩展名(按优先级排序) + extensions = ["pdf", "docx", "doc", "txt", "md", "tex"] + + # 简单地遍历目录 + if os.path.isdir(path): + try: + for ext in extensions: + # 手动检查每个可能的文件,而不使用glob + potential_file = os.path.join(path, f"paper.{ext}") + if os.path.exists(potential_file) and os.path.isfile(potential_file): + return potential_file + + # 如果没找到特定命名的文件,检查目录中的所有文件 + for file in os.listdir(path): + file_path = os.path.join(path, file) + if os.path.isfile(file_path): + file_ext = file.split('.')[-1].lower() if '.' in file else "" + if file_ext in extensions: + return file_path + except Exception: + pass # 忽略任何错误 + + return None + + +def download_paper_by_id(paper_info, chatbot, history) -> str: + """下载论文并返回保存路径 + + Args: + paper_info: 元组,包含论文ID类型(arxiv或doi)和ID值 + chatbot: 聊天机器人对象 + history: 历史记录 + + Returns: + str: 下载的论文路径或None + """ + id_type, paper_id = paper_info + + # 创建保存目录 - 使用时间戳创建唯一文件夹 + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + user_name = chatbot.get_user() if hasattr(chatbot, 'get_user') else "default" + from toolbox import get_log_folder, get_user + base_save_dir = get_log_folder(get_user(chatbot), plugin_name='paper_download') + save_dir = os.path.join(base_save_dir, f"papers_{timestamp}") + if not os.path.exists(save_dir): + os.makedirs(save_dir) + save_path = Path(save_dir) + + chatbot.append([f"下载论文", f"正在下载{'arXiv' if id_type == 'arxiv' else 'DOI'} {paper_id} 的论文..."]) + update_ui(chatbot=chatbot, history=history) + + pdf_path = None + + try: + if id_type == 'arxiv': + # 使用改进的arxiv查询方法 + formatted_id = format_arxiv_id(paper_id) + paper_result = get_arxiv_paper(formatted_id) + + if not paper_result: + chatbot.append([f"下载失败", f"未找到arXiv论文: {paper_id}"]) + update_ui(chatbot=chatbot, history=history) + return None + + # 下载PDF + filename = f"arxiv_{paper_id.replace('/', '_')}.pdf" + pdf_path = str(save_path / filename) + paper_result.download_pdf(filename=pdf_path) + + else: # doi + # 下载DOI + sci_hub = SciHub( + doi=paper_id, + path=save_path + ) + pdf_path = sci_hub.fetch() + + # 检查下载结果 + if pdf_path and os.path.exists(pdf_path): + promote_file_to_downloadzone(pdf_path, chatbot=chatbot) + chatbot.append([f"下载成功", f"已成功下载论文: {os.path.basename(pdf_path)}"]) + update_ui(chatbot=chatbot, history=history) + return pdf_path + else: + chatbot.append([f"下载失败", f"论文下载失败: {paper_id}"]) + update_ui(chatbot=chatbot, history=history) + return None + + except Exception as e: + chatbot.append([f"下载错误", f"下载论文时出错: {str(e)}"]) + update_ui(chatbot=chatbot, history=history) + return None + + +@CatchException +def 论文期刊会议推荐(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List, + history: List, system_prompt: str, user_request: str): + """主函数 - 论文期刊会议推荐""" + # 初始化推荐器 + chatbot.append(["函数插件功能及使用方式", "论文期刊会议推荐:基于论文内容分析,为您推荐合适的学术期刊和会议投稿目标。适用于各个学科专业(自然科学、工程技术、医学、社会科学、人文学科等),根据不同学科的评价标准和发表文化,提供分层次的期刊会议推荐、影响因子分析、发表难度评估、投稿策略建议等。

      📋 使用方式:
      1、直接上传PDF文件
      2、输入DOI号或arXiv ID
      3、点击插件开始分析"]) + yield from update_ui(chatbot=chatbot, history=history) + + paper_file = None + + # 检查输入是否为论文ID(arxiv或DOI) + paper_info = extract_paper_id(txt) + + if paper_info: + # 如果是论文ID,下载论文 + chatbot.append(["检测到论文ID", f"检测到{'arXiv' if paper_info[0] == 'arxiv' else 'DOI'} ID: {paper_info[1]},准备下载论文..."]) + yield from update_ui(chatbot=chatbot, history=history) + + # 下载论文 + paper_file = download_paper_by_id(paper_info, chatbot, history) + + if not paper_file: + report_exception(chatbot, history, a=f"下载论文失败", b=f"无法下载{'arXiv' if paper_info[0] == 'arxiv' else 'DOI'}论文: {paper_info[1]}") + yield from update_ui(chatbot=chatbot, history=history) + return + else: + # 检查输入路径 + if not os.path.exists(txt): + report_exception(chatbot, history, a=f"解析论文: {txt}", b=f"找不到文件或无权访问: {txt}") + yield from update_ui(chatbot=chatbot, history=history) + return + + # 验证路径安全性 + user_name = chatbot.get_user() + validate_path_safety(txt, user_name) + + # 查找论文文件 + paper_file = _find_paper_file(txt) + + if not paper_file: + report_exception(chatbot, history, a=f"解析论文", b=f"在路径 {txt} 中未找到支持的论文文件") + yield from update_ui(chatbot=chatbot, history=history) + return + + yield from update_ui(chatbot=chatbot, history=history) + + # 确保paper_file是字符串 + if paper_file is not None and not isinstance(paper_file, str): + # 尝试转换为字符串 + try: + paper_file = str(paper_file) + except: + report_exception(chatbot, history, a=f"类型错误", b=f"论文路径不是有效的字符串: {type(paper_file)}") + yield from update_ui(chatbot=chatbot, history=history) + return + + # 开始推荐 + chatbot.append(["开始分析", f"正在分析论文并生成期刊会议推荐: {os.path.basename(paper_file)}"]) + yield from update_ui(chatbot=chatbot, history=history) + + recommender = JournalConferenceRecommender(llm_kwargs, plugin_kwargs, chatbot, history, system_prompt) + yield from recommender.recommend_venues(paper_file) \ No newline at end of file diff --git a/crazy_functions/paper_fns/paper_download.py b/crazy_functions/paper_fns/paper_download.py new file mode 100644 index 00000000..72ce3b8f --- /dev/null +++ b/crazy_functions/paper_fns/paper_download.py @@ -0,0 +1,295 @@ +import re +import os +import zipfile +from toolbox import CatchException, update_ui, promote_file_to_downloadzone, get_log_folder, get_user + +from pathlib import Path +from datetime import datetime + +def extract_paper_id(txt): + """从输入文本中提取论文ID""" + # 尝试匹配DOI(将DOI匹配提前,因为其格式更加明确) + doi_patterns = [ + r'doi.org/([\w\./-]+)', # doi.org/10.1234/xxx + r'doi:\s*([\w\./-]+)', # doi: 10.1234/xxx + r'(10\.\d{4,}/[\w\.-]+)', # 直接输入DOI: 10.1234/xxx + ] + + for pattern in doi_patterns: + match = re.search(pattern, txt, re.IGNORECASE) + if match: + return ('doi', match.group(1)) + + # 尝试匹配arXiv ID + arxiv_patterns = [ + r'arxiv.org/abs/(\d+\.\d+)', # arxiv.org/abs/2103.14030 + r'arxiv.org/pdf/(\d+\.\d+)', # arxiv.org/pdf/2103.14030 + r'arxiv/(\d+\.\d+)', # arxiv/2103.14030 + r'^(\d{4}\.\d{4,5})$', # 直接输入ID: 2103.14030 + # 添加对早期arXiv ID的支持 + r'arxiv.org/abs/([\w-]+/\d{7})', # arxiv.org/abs/math/0211159 + r'arxiv.org/pdf/([\w-]+/\d{7})', # arxiv.org/pdf/hep-th/9901001 + r'^([\w-]+/\d{7})$', # 直接输入: math/0211159 + ] + + for pattern in arxiv_patterns: + match = re.search(pattern, txt, re.IGNORECASE) + if match: + paper_id = match.group(1) + # 如果是新格式(YYMM.NNNNN)或旧格式(category/NNNNNNN),都直接返回 + if re.match(r'^\d{4}\.\d{4,5}$', paper_id) or re.match(r'^[\w-]+/\d{7}$', paper_id): + return ('arxiv', paper_id) + + return None + +def extract_paper_ids(txt): + """从输入文本中提取多个论文ID""" + paper_ids = [] + + # 首先按换行符分割 + for line in txt.strip().split('\n'): + line = 
line.strip() + if not line: # 跳过空行 + continue + + # 对每一行再按空格分割 + for item in line.split(): + item = item.strip() + if not item: # 跳过空项 + continue + paper_info = extract_paper_id(item) + if paper_info: + paper_ids.append(paper_info) + + # 去除重复项,保持顺序 + unique_paper_ids = [] + seen = set() + for paper_info in paper_ids: + if paper_info not in seen: + seen.add(paper_info) + unique_paper_ids.append(paper_info) + + return unique_paper_ids + +def format_arxiv_id(paper_id): + """格式化arXiv ID,处理新旧两种格式""" + # 如果是旧格式 (e.g. astro-ph/0404140),需要去掉arxiv:前缀 + if '/' in paper_id: + return paper_id.replace('arxiv:', '') # 确保移除可能存在的arxiv:前缀 + return paper_id + +def get_arxiv_paper(paper_id): + """获取arXiv论文,处理新旧两种格式""" + import arxiv + + # 尝试不同的查询方式 + query_formats = [ + paper_id, # 原始ID + paper_id.replace('/', ''), # 移除斜杠 + f"id:{paper_id}", # 添加id:前缀 + ] + + for query in query_formats: + try: + # 使用Search查询 + search = arxiv.Search( + query=query, + max_results=1 + ) + result = next(arxiv.Client().results(search)) + if result: + return result + except: + continue + + try: + # 使用id_list查询 + search = arxiv.Search(id_list=[query]) + result = next(arxiv.Client().results(search)) + if result: + return result + except: + continue + + return None + +def create_zip_archive(files, save_path): + """将多个PDF文件打包成zip""" + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + zip_filename = f"papers_{timestamp}.zip" + zip_path = str(save_path / zip_filename) + + with zipfile.ZipFile(zip_path, 'w') as zipf: + for file in files: + if os.path.exists(file): + # 只添加文件名,不包含路径 + zipf.write(file, os.path.basename(file)) + + return zip_path + +@CatchException +def 论文下载(txt: str, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): + """ + txt: 用户输入,可以是DOI、arxiv ID或相关链接,支持多行输入进行批量下载 + """ + from crazy_functions.doc_fns.text_content_loader import TextContentLoader + from crazy_functions.review_fns.data_sources.arxiv_source import ArxivSource + from crazy_functions.review_fns.data_sources.scihub_source import SciHub + # 解析输入 + paper_infos = extract_paper_ids(txt) + if not paper_infos: + chatbot.append(["输入解析", "未能识别任何论文ID或DOI,请检查输入格式。支持以下格式:\n- arXiv ID (例如:2103.14030)\n- arXiv链接\n- DOI (例如:10.1234/xxx)\n- DOI链接\n\n多个论文ID请用换行分隔。"]) + yield from update_ui(chatbot=chatbot, history=history) + return + + # 创建保存目录 - 使用时间戳创建唯一文件夹 + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + base_save_dir = get_log_folder(get_user(chatbot), plugin_name='paper_download') + save_dir = os.path.join(base_save_dir, f"papers_{timestamp}") + if not os.path.exists(save_dir): + os.makedirs(save_dir) + save_path = Path(save_dir) + + # 记录下载结果 + success_count = 0 + failed_papers = [] + downloaded_files = [] # 记录成功下载的文件路径 + + chatbot.append([f"开始下载", f"支持多行输入下载多篇论文,共检测到 {len(paper_infos)} 篇论文,开始下载..."]) + yield from update_ui(chatbot=chatbot, history=history) + + for id_type, paper_id in paper_infos: + try: + if id_type == 'arxiv': + chatbot.append([f"正在下载", f"从arXiv下载论文 {paper_id}..."]) + yield from update_ui(chatbot=chatbot, history=history) + + # 使用改进的arxiv查询方法 + formatted_id = format_arxiv_id(paper_id) + paper_result = get_arxiv_paper(formatted_id) + + if not paper_result: + failed_papers.append((paper_id, "未找到论文")) + continue + + # 下载PDF + try: + filename = f"arxiv_{paper_id.replace('/', '_')}.pdf" + pdf_path = str(save_path / filename) + paper_result.download_pdf(filename=pdf_path) + if os.path.exists(pdf_path): + downloaded_files.append(pdf_path) + except Exception as e: + failed_papers.append((paper_id, f"PDF下载失败: {str(e)}")) + 
continue + + else: # doi + chatbot.append([f"正在下载", f"从Sci-Hub下载论文 {paper_id}..."]) + yield from update_ui(chatbot=chatbot, history=history) + + sci_hub = SciHub( + doi=paper_id, + path=save_path + ) + pdf_path = sci_hub.fetch() + if pdf_path and os.path.exists(pdf_path): + downloaded_files.append(pdf_path) + + # 检查下载结果 + if pdf_path and os.path.exists(pdf_path): + promote_file_to_downloadzone(pdf_path, chatbot=chatbot) + success_count += 1 + else: + failed_papers.append((paper_id, "下载失败")) + + except Exception as e: + failed_papers.append((paper_id, str(e))) + + yield from update_ui(chatbot=chatbot, history=history) + + # 创建ZIP压缩包 + if downloaded_files: + try: + zip_path = create_zip_archive(downloaded_files, Path(base_save_dir)) + promote_file_to_downloadzone(zip_path, chatbot=chatbot) + chatbot.append([ + f"创建压缩包", + f"已将所有下载的论文打包为: {os.path.basename(zip_path)}" + ]) + yield from update_ui(chatbot=chatbot, history=history) + except Exception as e: + chatbot.append([ + f"创建压缩包失败", + f"打包文件时出现错误: {str(e)}" + ]) + yield from update_ui(chatbot=chatbot, history=history) + + # 生成最终报告 + summary = f"下载完成!成功下载 {success_count} 篇论文。\n" + if failed_papers: + summary += "\n以下论文下载失败:\n" + for paper_id, reason in failed_papers: + summary += f"- {paper_id}: {reason}\n" + + if downloaded_files: + summary += f"\n所有论文已存放在文件夹 '{save_dir}' 中,并打包到压缩文件中。您可以在下载区找到单个PDF文件和压缩包。" + + chatbot.append([ + f"下载完成", + summary + ]) + yield from update_ui(chatbot=chatbot, history=history) + + # 如果下载成功且用户想要直接阅读内容 + if downloaded_files: + chatbot.append([ + "提示", + "正在读取论文内容进行分析,请稍候..." + ]) + yield from update_ui(chatbot=chatbot, history=history) + + # 使用TextContentLoader加载整个文件夹的PDF文件内容 + loader = TextContentLoader(chatbot, history) + + # 删除提示信息 + chatbot.pop() + + # 加载PDF内容 - 传入文件夹路径而不是单个文件路径 + yield from loader.execute(save_dir) + + # 添加提示信息 + chatbot.append([ + "提示", + "论文内容已加载完毕,您可以直接向AI提问有关该论文的问题。" + ]) + yield from update_ui(chatbot=chatbot, history=history) + +if __name__ == "__main__": + # 测试代码 + import asyncio + async def test(): + # 测试批量输入 + batch_inputs = [ + # 换行分隔的测试 + """https://arxiv.org/abs/2103.14030 + math/0211159 + 10.1038/s41586-021-03819-2""", + + # 空格分隔的测试 + "https://arxiv.org/abs/2103.14030 math/0211159 10.1038/s41586-021-03819-2", + + # 混合分隔的测试 + """https://arxiv.org/abs/2103.14030 math/0211159 + 10.1038/s41586-021-03819-2 https://doi.org/10.1038/s41586-021-03819-2 + 2103.14030""", + ] + + for i, test_input in enumerate(batch_inputs, 1): + print(f"\n测试用例 {i}:") + print(f"输入: {test_input}") + results = extract_paper_ids(test_input) + print(f"解析结果:") + for result in results: + print(f" {result}") + + asyncio.run(test()) diff --git a/crazy_functions/paper_fns/reduce_aigc.py b/crazy_functions/paper_fns/reduce_aigc.py new file mode 100644 index 00000000..2f9585dd --- /dev/null +++ b/crazy_functions/paper_fns/reduce_aigc.py @@ -0,0 +1,867 @@ +import os +import time +import glob +import re +import threading +from typing import Dict, List, Generator, Tuple +from dataclasses import dataclass + +from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency +from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit +from crazy_functions.rag_fns.rag_file_support import extract_text, convert_to_markdown +from request_llms.bridge_all import model_info +from toolbox import update_ui, CatchException, report_exception, promote_file_to_downloadzone, write_history_to_file +from shared_utils.fastapi_server import validate_path_safety + 
+# 新增:导入结构化论文提取器 +from crazy_functions.doc_fns.read_fns.unstructured_all.paper_structure_extractor import PaperStructureExtractor, ExtractorConfig, StructuredPaper + +# 导入格式化器 +from crazy_functions.paper_fns.file2file_doc import ( + TxtFormatter, + MarkdownFormatter, + HtmlFormatter, + WordFormatter +) + +@dataclass +class TextFragment: + """文本片段数据类,用于组织处理单元""" + content: str + fragment_index: int + total_fragments: int + + +class DocumentProcessor: + """文档处理器 - 处理单个文档并输出结果""" + + def __init__(self, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List, history: List, system_prompt: str): + """初始化处理器""" + self.llm_kwargs = llm_kwargs.copy() # 创建原始llm_kwargs的副本 + # 固定使用deepseek-reasoner模型 + self.llm_kwargs['llm_model'] = "deepseek-reasoner" + self.plugin_kwargs = plugin_kwargs + self.chatbot = chatbot + self.history = history + self.system_prompt = system_prompt + self.processed_results = [] + self.failed_fragments = [] + # 新增:初始化论文结构提取器 + self.paper_extractor = PaperStructureExtractor() + # 修改:设置降重次数,默认为1次(原为3次) + self.reduction_times = int(self.plugin_kwargs.get("reduction_times", 1)) + + def _get_token_limit(self) -> int: + """获取模型token限制,返回更小的值以确保更细粒度的分割""" + max_token = model_info[self.llm_kwargs['llm_model']]['max_token'] + # 降低token限制,使每个片段更小 + return max_token // 4 # 从3/4降低到1/4 + + def _create_batch_inputs(self, fragments: List[TextFragment], current_iteration: int = 1) -> Tuple[List, List, List]: + """创建批处理输入""" + inputs_array = [] + inputs_show_user_array = [] + history_array = [] + + # 基础指令 - 增强单次调用的效果 + user_instruction = self.plugin_kwargs.get("advanced_arg", """你是一个人类科研工作者,请对以下学术文本进行深度改写,以彻底消除AI生成特征。具体要求如下: + +1. 保持学术写作的严谨性和专业性,但加入一些人类写作的不完美元素 +2. 维持原文的核心论述和逻辑框架,但打破过于完美的结构 +3. 彻底改变句式结构: + - 灵活运用主动句与被动句,偏好使用主动语态 + - 适当使用不同长度的句子,包括一些较长的复杂句 + - 引入变化多样的句式,打破规律性 + - 完全避免AI常用的模板化句式和套路表达 + - 增加一些学术写作中常见的转折和连接方式 +4. 全面改善用词: + - 使用更多学术语境下的专业词汇和同义词替换 + - 避免过于机械和规律性的连接词,使用更自然的过渡 + - 重构专业术语的表达方式,但保持准确性 + - 增加词汇多样性,减少重复用词 + - 偶尔使用一些不太常见但恰当的词汇 +5. 模拟真实学者的写作风格: + - 注重论证的严密性,但允许存在一些微小的不对称性 + - 保持表达的客观性,同时适度体现个人学术见解 + - 在适当位置表达观点时更加自信和坚定 + - 避免过于完美和机械均衡的论述结构 + - 允许段落长度有所变化,不要过于均匀 +6. 引入人类学者常见的写作特点: + - 段落之间的过渡更加自然流畅 + - 适当使用一些学术界常见的修辞手法,但不过度使用 + - 偶尔使用一些强调和限定性表达 + - 适当使用一些学术界认可的个人化表达 +7. 彻底消除AI痕迹: + - 避免过于规整和均衡的段落结构 + - 避免机械性的句式变化和词汇替换模式 + - 避免过于完美的逻辑推导,适当增加一些转折 + - 减少公式化的表达方式""") + + # 对于单次调用的场景,不需要迭代前缀,直接使用更强力的改写指令 + for frag in fragments: + # 在单次调用时使用更强力的指令 + if self.reduction_times == 1: + i_say = (f'请对以下学术文本进行彻底改写,完全消除AI特征,使其像真实人类学者撰写的内容。\n\n{user_instruction}\n\n' + f'请记住以下几点:\n' + f'1. 避免过于规整和均衡的结构\n' + f'2. 引入一些人类写作的微小不完美之处\n' + f'3. 使用多样化的句式和词汇\n' + f'4. 打破可能的AI规律性表达模式\n' + f'5. 
适当使用一些专业领域内的表达习惯\n\n' + f'请将对文本的处理结果放在标签之间。\n\n' + f'文本内容:\n```\n{frag.content}\n```') + else: + # 原有的迭代前缀逻辑 + iteration_prefix = "" + if current_iteration > 1: + iteration_prefix = f"这是第{current_iteration}次改写,请在保持学术性的基础上,采用更加人性化、不同的表达方式。" + if current_iteration == 2: + iteration_prefix += "在保持专业性的同时,进一步优化句式结构和用词,显著降低AI痕迹。" + elif current_iteration >= 3: + iteration_prefix += "请在确保不损失任何学术内容的前提下,彻底重构表达方式,并适当引入少量人类学者常用的表达技巧,避免过度使用比喻和类比。" + + i_say = (f'请按照以下要求处理文本内容:{iteration_prefix}{user_instruction}\n\n' + f'请将对文本的处理结果放在标签之间。\n\n' + f'文本内容:\n```\n{frag.content}\n```') + + i_say_show_user = f'正在处理文本片段 {frag.fragment_index + 1}/{frag.total_fragments}' + + inputs_array.append(i_say) + inputs_show_user_array.append(i_say_show_user) + history_array.append([]) + + return inputs_array, inputs_show_user_array, history_array + + def _extract_decision(self, text: str) -> str: + """从LLM响应中提取标签内的内容""" + import re + pattern = r'(.*?)' + matches = re.findall(pattern, text, re.DOTALL) + + if matches: + return matches[0].strip() + else: + # 如果没有找到标签,返回原始文本 + return text.strip() + + def process_file(self, file_path: str) -> Generator: + """处理单个文件""" + self.chatbot.append(["开始处理文件", f"文件路径: {file_path}"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + + try: + # 首先尝试转换为Markdown + file_path = convert_to_markdown(file_path) + + # 1. 检查文件是否为支持的论文格式 + is_paper_format = any(file_path.lower().endswith(ext) for ext in self.paper_extractor.SUPPORTED_EXTENSIONS) + + if is_paper_format: + # 使用结构化提取器处理论文 + return (yield from self._process_structured_paper(file_path)) + else: + # 使用原有方式处理普通文档 + return (yield from self._process_regular_file(file_path)) + + except Exception as e: + self.chatbot.append(["处理错误", f"文件处理失败: {str(e)}"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return None + + def _process_structured_paper(self, file_path: str) -> Generator: + """处理结构化论文文件""" + # 1. 
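
A standalone sketch of the tag-extraction logic used by _extract_decision above; the <decision> tag name is an assumption made for illustration:

import re

def extract_tagged(text, tag="decision"):
    # 提取 <tag>...</tag> 之间的内容;找不到标签时返回原文(与上文逻辑一致)
    matches = re.findall(rf'<{tag}>(.*?)</{tag}>', text, re.DOTALL)
    return matches[0].strip() if matches else text.strip()

print(extract_tagged("noise <decision> rewritten text </decision> noise"))
# rewritten text
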
提取论文结构 + self.chatbot[-1] = ["正在分析论文结构", f"文件路径: {file_path}"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + try: + paper = self.paper_extractor.extract_paper_structure(file_path) + + if not paper or not paper.sections: + self.chatbot.append(["无法提取论文结构", "将使用全文内容进行处理"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 使用全文内容进行段落切分 + if paper and paper.full_text: + # 使用增强的分割函数进行更细致的分割 + fragments = self._breakdown_section_content(paper.full_text) + + # 创建文本片段对象 + text_fragments = [] + for i, frag in enumerate(fragments): + if frag.strip(): + text_fragments.append(TextFragment( + content=frag, + fragment_index=i, + total_fragments=len(fragments) + )) + + # 多次降重处理 + if text_fragments: + current_fragments = text_fragments + + # 进行多轮降重处理 + for iteration in range(1, self.reduction_times + 1): + # 处理当前片段 + processed_content = yield from self._process_text_fragments(current_fragments, iteration) + + # 如果这是最后一次迭代,保存结果 + if iteration == self.reduction_times: + final_content = processed_content + break + + # 否则,准备下一轮迭代的片段 + # 从处理结果中提取处理后的内容 + next_fragments = [] + for idx, item in enumerate(self.processed_results): + next_fragments.append(TextFragment( + content=item['content'], + fragment_index=idx, + total_fragments=len(self.processed_results) + )) + + current_fragments = next_fragments + + # 更新UI显示最终结果 + self.chatbot[-1] = ["处理完成", f"共完成 {self.reduction_times} 轮降重"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + return final_content + else: + self.chatbot.append(["处理失败", "未能提取到有效的文本内容"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return None + else: + self.chatbot.append(["处理失败", "未能提取到论文内容"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return None + + # 2. 准备处理章节内容(不处理标题) + self.chatbot[-1] = ["已提取论文结构", f"共 {len(paper.sections)} 个主要章节"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 3. 收集所有需要处理的章节内容并分割为合适大小 + sections_to_process = [] + section_map = {} # 用于映射处理前后的内容 + + def collect_section_contents(sections, parent_path=""): + """递归收集章节内容,跳过参考文献部分""" + for i, section in enumerate(sections): + current_path = f"{parent_path}/{i}" if parent_path else f"{i}" + + # 检查是否为参考文献部分,如果是则跳过 + if section.section_type == 'references' or section.title.lower() in ['references', '参考文献', 'bibliography', '文献']: + continue # 跳过参考文献部分 + + # 只处理内容非空的章节 + if section.content and section.content.strip(): + # 使用增强的分割函数进行更细致的分割 + fragments = self._breakdown_section_content(section.content) + + for fragment_idx, fragment_content in enumerate(fragments): + if fragment_content.strip(): + fragment_index = len(sections_to_process) + sections_to_process.append(TextFragment( + content=fragment_content, + fragment_index=fragment_index, + total_fragments=0 # 临时值,稍后更新 + )) + + # 保存映射关系,用于稍后更新章节内容 + # 为每个片段存储原始章节和片段索引信息 + section_map[fragment_index] = (current_path, section, fragment_idx, len(fragments)) + + # 递归处理子章节 + if section.subsections: + collect_section_contents(section.subsections, current_path) + + # 收集所有章节内容 + collect_section_contents(paper.sections) + + # 更新总片段数 + total_fragments = len(sections_to_process) + for frag in sections_to_process: + frag.total_fragments = total_fragments + + # 4. 如果没有内容需要处理,直接返回 + if not sections_to_process: + self.chatbot.append(["处理完成", "未找到需要处理的内容"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return None + + # 5. 
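
A compact sketch of the fragment bookkeeping above: each fragment index maps to (path, section, position, total) so processed pieces can be reassembled in order, even when replies arrive out of order (toy data, illustration only):

section_map = {0: ("0", None, 0, 2), 1: ("0", None, 1, 2)}  # 片段索引 -> (路径, 章节, 位置, 总数)
section_contents = {}
for frag_index, processed in [(1, "second half"), (0, "first half")]:
    path, _section, pos, total = section_map[frag_index]
    section_contents.setdefault(path, [""] * total)[pos] = processed
assert "\n".join(section_contents["0"]) == "first half\nsecond half"
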
批量处理章节内容 + self.chatbot[-1] = ["开始处理论文内容", f"共 {len(sections_to_process)} 个内容片段"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 一次性准备所有输入 + inputs_array, inputs_show_user_array, history_array = self._create_batch_inputs(sections_to_process) + + # 使用系统提示 + instruction = self.plugin_kwargs.get("advanced_arg", """请对以下学术文本进行彻底改写,以显著降低AI生成特征。具体要求如下: + +1. 保持学术写作的严谨性和专业性 +2. 维持原文的核心论述和逻辑框架 +3. 优化句式结构: + - 灵活运用主动句与被动句 + - 适当拆分复杂句式,提高可读性 + - 注意句式的多样性,避免重复模式 + - 打破AI常用的句式模板 +4. 改善用词: + - 使用更多学术语境下的同义词替换 + - 避免过于机械和规律性的连接词 + - 适当调整专业术语的表达方式 + - 增加词汇多样性,减少重复用词 +5. 增强文本的学术特征: + - 注重论证的严密性 + - 保持表达的客观性 + - 适度体现作者的学术见解 + - 避免过于完美和均衡的论述结构 +6. 确保语言风格的一致性 +7. 减少AI生成文本常见的套路和模式""") + sys_prompt_array = [f"""作为一位专业的学术写作顾问,请按照以下要求改写文本: + +1. 严格保持学术写作规范 +2. 维持原文的核心论述和逻辑框架 +3. 通过优化句式结构和用词降低AI生成特征 +4. 确保语言风格的一致性和专业性 +5. 保持内容的客观性和准确性 +6. 避免AI常见的套路化表达和过于完美的结构"""] * len(sections_to_process) + + # 调用LLM一次性处理所有片段 + response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=inputs_array, + inputs_show_user_array=inputs_show_user_array, + llm_kwargs=self.llm_kwargs, + chatbot=self.chatbot, + history_array=history_array, + sys_prompt_array=sys_prompt_array, + ) + + # 处理响应,重组章节内容 + section_contents = {} # 用于重组各章节的处理后内容 + + for j, frag in enumerate(sections_to_process): + try: + llm_response = response_collection[j * 2 + 1] + processed_text = self._extract_decision(llm_response) + + if processed_text and processed_text.strip(): + # 保存处理结果 + self.processed_results.append({ + 'index': frag.fragment_index, + 'content': processed_text + }) + + # 存储处理后的文本片段,用于后续重组 + fragment_index = frag.fragment_index + if fragment_index in section_map: + path, section, fragment_idx, total_fragments = section_map[fragment_index] + + # 初始化此章节的内容容器(如果尚未创建) + if path not in section_contents: + section_contents[path] = [""] * total_fragments + + # 将处理后的片段放入正确位置 + section_contents[path][fragment_idx] = processed_text + else: + self.failed_fragments.append(frag) + except Exception as e: + self.failed_fragments.append(frag) + + # 重组每个章节的内容 + for path, fragments in section_contents.items(): + section = None + for idx in section_map: + if section_map[idx][0] == path: + section = section_map[idx][1] + break + + if section: + # 合并该章节的所有处理后片段 + section.content = "\n".join(fragments) + + # 6. 更新UI + success_count = total_fragments - len(self.failed_fragments) + self.chatbot[-1] = ["处理完成", f"成功处理 {success_count}/{total_fragments} 个内容片段"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 收集参考文献部分(不进行处理) + references_sections = [] + def collect_references(sections, parent_path=""): + """递归收集参考文献部分""" + for i, section in enumerate(sections): + current_path = f"{parent_path}/{i}" if parent_path else f"{i}" + + # 检查是否为参考文献部分 + if section.section_type == 'references' or section.title.lower() in ['references', '参考文献', 'bibliography', '文献']: + references_sections.append((current_path, section)) + + # 递归检查子章节 + if section.subsections: + collect_references(section.subsections, current_path) + + # 收集参考文献 + collect_references(paper.sections) + + # 7. 将处理后的结构化论文转换为Markdown + markdown_content = self.paper_extractor.generate_markdown(paper) + + # 8. 
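# The indexing response_collection[j * 2 + 1] above assumes the multi-threaded
# helper returns a flat list that interleaves each shown input with its LLM
# reply: [input_0, reply_0, input_1, reply_1, ...]. A mock sketch of unpacking
# that layout (no real LLM call):
mock_collection = ["输入片段 1", "回复 1", "输入片段 2", "回复 2"]
replies = [mock_collection[j * 2 + 1] for j in range(len(mock_collection) // 2)]
assert replies == ["回复 1", "回复 2"]
+        # 8.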
返回处理后的内容 + self.chatbot[-1] = ["处理完成", f"成功处理 {success_count}/{total_fragments} 个内容片段,参考文献部分未处理"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + return markdown_content + + except Exception as e: + self.chatbot.append(["结构化处理失败", f"错误: {str(e)},将尝试作为普通文件处理"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return (yield from self._process_regular_file(file_path)) + + def _process_regular_file(self, file_path: str) -> Generator: + """使用原有方式处理普通文件""" + # 原有的文件处理逻辑 + self.chatbot[-1] = ["正在读取文件", f"文件路径: {file_path}"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + content = extract_text(file_path) + if not content or not content.strip(): + self.chatbot.append(["处理失败", "文件内容为空或无法提取内容"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return None + + # 2. 分割文本 + self.chatbot[-1] = ["正在分析文件", "将文件内容分割为适当大小的片段"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 使用增强的分割函数 + fragments = self._breakdown_section_content(content) + + # 3. 创建文本片段对象 + text_fragments = [] + for i, frag in enumerate(fragments): + if frag.strip(): + text_fragments.append(TextFragment( + content=frag, + fragment_index=i, + total_fragments=len(fragments) + )) + + # 4. 多轮降重处理 + if not text_fragments: + self.chatbot.append(["处理失败", "未能提取到有效的文本内容"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + return None + + # 批处理大小 + batch_size = 8 # 每批处理的片段数 + + # 第一次迭代 + current_batches = [] + for i in range(0, len(text_fragments), batch_size): + current_batches.append(text_fragments[i:i + batch_size]) + + all_processed_fragments = [] + + # 进行多轮降重处理 + for iteration in range(1, self.reduction_times + 1): + self.chatbot[-1] = ["开始处理文本", f"第 {iteration}/{self.reduction_times} 次降重"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + next_batches = [] + all_processed_fragments = [] + + # 分批处理当前迭代的片段 + for batch in current_batches: + # 处理当前批次 + _ = yield from self._process_text_fragments(batch, iteration) + + # 收集处理结果 + processed_batch = [] + for item in self.processed_results: + processed_batch.append(TextFragment( + content=item['content'], + fragment_index=len(all_processed_fragments) + len(processed_batch), + total_fragments=0 # 临时值,稍后更新 + )) + + all_processed_fragments.extend(processed_batch) + + # 如果不是最后一轮迭代,准备下一批次 + if iteration < self.reduction_times: + for i in range(0, len(processed_batch), batch_size): + next_batches.append(processed_batch[i:i + batch_size]) + + # 更新总片段数 + for frag in all_processed_fragments: + frag.total_fragments = len(all_processed_fragments) + + # 为下一轮迭代准备批次 + current_batches = next_batches + + # 合并最终结果 + final_content = "\n\n".join([frag.content for frag in all_processed_fragments]) + + # 5. 
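# The batching above is plain list slicing; with batch_size = 8, twenty
# fragments split into groups of 8, 8 and 4 (a quick self-check):
fragments = list(range(20))
batch_size = 8
batches = [fragments[i:i + batch_size] for i in range(0, len(fragments), batch_size)]
assert [len(b) for b in batches] == [8, 8, 4]
+        # 5.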
更新UI显示最终结果 + self.chatbot[-1] = ["处理完成", f"共完成 {self.reduction_times} 轮降重"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + return final_content + + def save_results(self, content: str, original_file_path: str) -> List[str]: + """保存处理结果为TXT格式""" + if not content: + return [] + + timestamp = time.strftime("%Y%m%d_%H%M%S") + original_filename = os.path.basename(original_file_path) + filename_without_ext = os.path.splitext(original_filename)[0] + base_filename = f"{filename_without_ext}_processed_{timestamp}" + + result_files = [] + + # 只保存为TXT + try: + txt_formatter = TxtFormatter() + txt_content = txt_formatter.create_document(content) + txt_file = write_history_to_file( + history=[txt_content], + file_basename=f"{base_filename}.txt" + ) + result_files.append(txt_file) + except Exception as e: + self.chatbot.append(["警告", f"TXT格式保存失败: {str(e)}"]) + + # 添加到下载区 + for file in result_files: + promote_file_to_downloadzone(file, chatbot=self.chatbot) + + return result_files + + def _breakdown_section_content(self, content: str) -> List[str]: + """对文本内容进行分割与合并 + + 主要按段落进行组织,只合并较小的段落以减少片段数量 + 保留原始段落结构,不对长段落进行强制分割 + 针对中英文设置不同的阈值,因为字符密度不同 + """ + # 先按段落分割文本 + paragraphs = content.split('\n\n') + + # 检测语言类型 + chinese_char_count = sum(1 for char in content if '\u4e00' <= char <= '\u9fff') + is_chinese_text = chinese_char_count / max(1, len(content)) > 0.3 + + # 根据语言类型设置不同的阈值(只用于合并小段落) + if is_chinese_text: + # 中文文本:一个汉字就是一个字符,信息密度高 + min_chunk_size = 300 # 段落合并的最小阈值 + target_size = 800 # 理想的段落大小 + else: + # 英文文本:一个单词由多个字符组成,信息密度低 + min_chunk_size = 600 # 段落合并的最小阈值 + target_size = 1600 # 理想的段落大小 + + # 1. 只合并小段落,不对长段落进行分割 + result_fragments = [] + current_chunk = [] + current_length = 0 + + for para in paragraphs: + # 如果段落太小且不会超过目标大小,则合并 + if len(para) < min_chunk_size and current_length + len(para) <= target_size: + current_chunk.append(para) + current_length += len(para) + # 否则,创建新段落 + else: + # 如果当前块非空且与当前段落无关,先保存它 + if current_chunk and current_length > 0: + result_fragments.append('\n\n'.join(current_chunk)) + + # 当前段落作为新块 + current_chunk = [para] + current_length = len(para) + + # 如果当前块大小已接近目标大小,保存并开始新块 + if current_length >= target_size: + result_fragments.append('\n\n'.join(current_chunk)) + current_chunk = [] + current_length = 0 + + # 保存最后一个块 + if current_chunk: + result_fragments.append('\n\n'.join(current_chunk)) + + # 2. 
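# A standalone re-implementation of the merge policy above, for illustration,
# using the Chinese-text thresholds (merge paragraphs shorter than 300 chars
# until a chunk approaches 800 chars); the thresholds mirror the code, the
# helper name is ours:
def merge_small_paragraphs(paragraphs, min_chunk_size=300, target_size=800):
    chunks, cur, cur_len = [], [], 0
    for p in paragraphs:
        if len(p) < min_chunk_size and cur_len + len(p) <= target_size:
            cur.append(p)
            cur_len += len(p)
        else:
            if cur:
                chunks.append('\n\n'.join(cur))
            cur, cur_len = [p], len(p)
        if cur_len >= target_size:
            chunks.append('\n\n'.join(cur))
            cur, cur_len = [], 0
    if cur:
        chunks.append('\n\n'.join(cur))
    return chunks

# 六个各 100 字的小段落会被合并为一个块:
assert len(merge_small_paragraphs(["短段" * 50] * 6)) == 1
+        # 2.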
处理可能过大的片段(确保不超过token限制) + final_fragments = [] + max_token = self._get_token_limit() + + for fragment in result_fragments: + # 检查fragment是否可能超出token限制 + # 根据语言类型调整token估算 + if is_chinese_text: + estimated_tokens = len(fragment) / 1.5 # 中文每个token约1-2个字符 + else: + estimated_tokens = len(fragment) / 4 # 英文每个token约4个字符 + + if estimated_tokens > max_token: + # 即使可能超出限制,也尽量保持段落的完整性 + # 使用breakdown_text但设置更大的限制来减少分割 + larger_limit = max_token * 0.95 # 使用95%的限制 + sub_fragments = breakdown_text_to_satisfy_token_limit( + txt=fragment, + limit=larger_limit, + llm_model=self.llm_kwargs['llm_model'] + ) + final_fragments.extend(sub_fragments) + else: + final_fragments.append(fragment) + + return final_fragments + + def _process_text_fragments(self, text_fragments: List[TextFragment], current_iteration: int = 1) -> str: + """处理文本片段,支持多次降重 + + Args: + text_fragments: 要处理的文本片段列表 + current_iteration: 当前是第几次降重迭代 + + Returns: + 处理后的文本内容 + """ + self.chatbot[-1] = ["开始处理文本", f"第 {current_iteration}/{self.reduction_times} 次降重,共 {len(text_fragments)} 个片段"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 重置处理结果,为当前迭代做准备 + self.processed_results = [] + self.failed_fragments = [] + + # 一次性准备所有输入 + inputs_array, inputs_show_user_array, history_array = self._create_batch_inputs(text_fragments, current_iteration) + + # 对于单次调用的特殊处理 - 使用更强力的系统提示词 + if self.reduction_times == 1: + sys_prompt = f"""你是一个伪装成人类科研学者的AI,能够将各种AI生成内容改写成人类学者常用的表达方式,让人类学者无法分辨。 + +你的任务是对学术文本进行深度改写,以彻底消除AI特征,使其具有真实人类学者的写作特点: + +1. 打破AI常见的规整句式结构和词汇选择模式 +2. 引入学术写作中的自然变化和不规则性 +3. 使用多样化的句式和段落结构,避免明显的平衡性和规律性 +4. 加入人类写作中常见的微小不完美之处,如段落长度变化、论述强调点不均等 +5. 在维持核心论点的同时,完全重构表达方式 +6. 适当使用学科领域内专家常用的表达习惯和术语 +7. 确保文本读起来自然流畅,避免机械感 +8. 打破AI生成内容常见的逻辑结构和表达套路 +9. 根据文本内容,在保持专业性的基础上增加写作的特色和个性 + +对于每个文本片段,尽可能彻底地重构,使其无法被识别为AI生成。""" + + sys_prompt_array = [sys_prompt] * len(text_fragments) + + else: + # 原有的多次迭代处理逻辑 + # 根据迭代次数调整系统提示词强度 + base_instruction = self.plugin_kwargs.get("advanced_arg", """你是一个人类科研工作者,请对以下学术文本进行彻底改写,以显著降低AI生成特征。具体要求如下: + +1. 保持学术写作的严谨性和专业性 +2. 维持原文的核心论述和逻辑框架 +3. 优化句式结构: + - 灵活运用主动句与被动句 + - 适当拆分复杂句式,提高可读性 + - 注意句式的多样性,避免重复模式 + - 打破AI常用的句式模板 +4. 改善用词: + - 使用更多学术语境下的同义词替换 + - 避免过于机械和规律性的连接词 + - 适当调整专业术语的表达方式 + - 增加词汇多样性,减少重复用词 +5. 增强文本的学术特征: + - 注重论证的严密性 + - 保持表达的客观性 + - 适度体现作者的学术见解 + - 避免过于完美和均衡的论述结构 +6. 确保语言风格的一致性 +7. 减少AI生成文本常见的套路和模式""") + + # 根据迭代次数增强强度 + if current_iteration == 1: + # 第一次迭代使用基础强度 + instruction = base_instruction + sys_prompt = f"""你是一个伪装成人类科研学者的AI,能够将各种AI生成内容改写成人类学者常用的表达方式,让人类学者无法分辨。请进行第{current_iteration}轮深度改写: + +1. 严格保持学术写作规范 +2. 维持原文的核心论述和逻辑框架 +3. 通过优化句式结构和用词降低AI生成特征 +4. 确保语言风格的一致性和专业性 +5. 保持内容的客观性和准确性 +6. 避免AI常见的套路化表达和过于完美的结构""" + elif current_iteration == 2: + # 第二次迭代使用中等强度 + instruction = base_instruction + """这是第二次改写,请进一步优化: + +1. 更深入地调整句式结构,增加表达多样性 +2. 在保持准确性的前提下,替换更多专业术语的表达方式 +3. 优化段落结构,增强逻辑连贯性 +4. 适当调整学术论证的表达方式 +5. 打破AI生成内容的常见模式,如过于均衡的结构和规律性的表达方式""" + sys_prompt = f"""你是一个伪装成人类科研学者的AI,能够将各种AI生成内容改写成人类学者常用的表达方式,让人类学者无法分辨。请进行第二轮改写: + +1. 在保持严谨性的基础上,优化表达方式 +2. 改善文本结构,提升学术论述的清晰度 +3. 增强专业术语使用的灵活性 +4. 确保论证过程的完整性和逻辑性 +5. 进一步减少AI生成内容的特征和模式 +6. 使文本表现出更自然、更人性化的写作风格""" + else: + # 第三次或更多次迭代使用高强度 + instruction = base_instruction + f"""这是第{current_iteration}次改写,需要在保持学术性的前提下彻底重构文本: + +1. 全面重构句式,采用不同的表达策略 +2. 系统性替换学术用语,保持专业准确性 +3. 重组段落结构,优化论证层次 +4. 深化学术观点的表达方式 +5. 增强文本的逻辑严密性 +6. 提升论述的专业深度 +7. 确保不遗漏任何学术观点和论证要素 +8. 适当使用学术表达技巧(如精准举例、恰当转折等),但不过分依赖比喻和类比 +9. 彻底消除AI生成内容的特征,使文本具有真实学者的写作风格""" + sys_prompt = f"""你是一个伪装成人类科研学者的AI,能够将各种AI生成内容改写成人类学者常用的表达方式,让人类学者无法分辨。请进行第{current_iteration}轮深度改写: + +1. 
在保持专业水准的前提下,彻底重构表达方式,引入长难句 +2. 确保学术论证的严密性和完整性 +3. 优化专业术语的运用 +4. 提升文本的学术价值 +5. 保持论述的逻辑性和连贯性 +6. 适当使用少量学术表达技巧,提高文本说服力,但避免过度使用比喻和类比 +7. 消除所有明显的AI生成痕迹,使文本更接近真实学者的写作风格""" + + sys_prompt_array = [sys_prompt] * len(text_fragments) + + # 调用LLM一次性处理所有片段 + response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=inputs_array, + inputs_show_user_array=inputs_show_user_array, + llm_kwargs=self.llm_kwargs, + chatbot=self.chatbot, + history_array=history_array, + sys_prompt_array=sys_prompt_array, + ) + + # 处理响应 + for j, frag in enumerate(text_fragments): + try: + llm_response = response_collection[j * 2 + 1] + processed_text = self._extract_decision(llm_response) + + if processed_text and processed_text.strip(): + self.processed_results.append({ + 'index': frag.fragment_index, + 'content': processed_text + }) + else: + self.failed_fragments.append(frag) + self.processed_results.append({ + 'index': frag.fragment_index, + 'content': frag.content + }) + except Exception as e: + self.failed_fragments.append(frag) + self.processed_results.append({ + 'index': frag.fragment_index, + 'content': frag.content + }) + + # 按原始顺序合并结果 + self.processed_results.sort(key=lambda x: x['index']) + final_content = "\n".join([item['content'] for item in self.processed_results]) + + # 更新UI + success_count = len(text_fragments) - len(self.failed_fragments) + self.chatbot[-1] = ["当前阶段处理完成", f"第 {current_iteration}/{self.reduction_times} 次降重,成功处理 {success_count}/{len(text_fragments)} 个片段"] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + return final_content + + +@CatchException +def 学术降重(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List, + history: List, system_prompt: str, user_request: str): + """主函数 - 文件到文件处理""" + # 初始化 + # 从高级参数中提取降重次数 + if "advanced_arg" in plugin_kwargs and plugin_kwargs["advanced_arg"]: + # 检查是否包含降重次数的设置 + match = re.search(r'reduction_times\s*=\s*(\d+)', plugin_kwargs["advanced_arg"]) + if match: + reduction_times = int(match.group(1)) + # 替换掉高级参数中的reduction_times设置,但保留其他内容 + plugin_kwargs["advanced_arg"] = re.sub(r'reduction_times\s*=\s*\d+', '', plugin_kwargs["advanced_arg"]).strip() + # 添加到plugin_kwargs中作为单独的参数 + plugin_kwargs["reduction_times"] = reduction_times + + processor = DocumentProcessor(llm_kwargs, plugin_kwargs, chatbot, history, system_prompt) + chatbot.append(["函数插件功能", f"文件内容处理:将文档内容进行{processor.reduction_times}次降重处理"]) + + # 更新用户提示,提供关于降重策略的详细说明 + if processor.reduction_times == 1: + chatbot.append(["降重策略", "将使用单次深度降重,这种方式能更有效地降低AI特征,减少查重率。我们采用特殊优化的提示词,通过一次性强力改写来实现降重效果。"]) + elif processor.reduction_times > 1: + chatbot.append(["降重策略", f"将进行{processor.reduction_times}轮迭代降重,每轮降重都会基于上一轮的结果,并逐渐增加降重强度。请注意,多轮迭代可能会引入新的AI特征,单次强力降重通常效果更好。"]) + + yield from update_ui(chatbot=chatbot, history=history) + + # 验证输入路径 + if not os.path.exists(txt): + report_exception(chatbot, history, a=f"解析路径: {txt}", b=f"找不到路径或无权访问: {txt}") + yield from update_ui(chatbot=chatbot, history=history) + return + + # 验证路径安全性 + user_name = chatbot.get_user() + validate_path_safety(txt, user_name) + + # 获取文件列表 + if os.path.isfile(txt): + # 单个文件处理 + file_paths = [txt] + else: + # 目录处理 - 类似批量文件询问插件 + project_folder = txt + extract_folder = next((d for d in glob.glob(f'{project_folder}/*') + if os.path.isdir(d) and d.endswith('.extract')), project_folder) + + # 排除压缩文件 + exclude_patterns = r'/[^/]+\.(zip|rar|7z|tar|gz)$' + file_paths = [f for f in glob.glob(f'{extract_folder}/**', recursive=True) + if os.path.isfile(f) and not 
re.search(exclude_patterns, f)] + + # 过滤支持的文件格式 + file_paths = [f for f in file_paths if any(f.lower().endswith(ext) for ext in + list(processor.paper_extractor.SUPPORTED_EXTENSIONS) + ['.json', '.csv', '.xlsx', '.xls'])] + + if not file_paths: + report_exception(chatbot, history, a=f"解析路径: {txt}", b="未找到支持的文件类型") + yield from update_ui(chatbot=chatbot, history=history) + return + + # 处理文件 + if len(file_paths) > 1: + chatbot.append(["发现多个文件", f"共找到 {len(file_paths)} 个文件,将处理第一个文件"]) + yield from update_ui(chatbot=chatbot, history=history) + + # 只处理第一个文件 + file_to_process = file_paths[0] + processed_content = yield from processor.process_file(file_to_process) + + if processed_content: + # 保存结果 + result_files = processor.save_results(processed_content, file_to_process) + + if result_files: + chatbot.append(["处理完成", f"已生成 {len(result_files)} 个结果文件"]) + else: + chatbot.append(["处理完成", "但未能保存任何结果文件"]) + else: + chatbot.append(["处理失败", "未能生成有效的处理结果"]) + + yield from update_ui(chatbot=chatbot, history=history) diff --git a/crazy_functions/paper_fns/wiki/wikipedia_api.py b/crazy_functions/paper_fns/wiki/wikipedia_api.py new file mode 100644 index 00000000..22c4ca6d --- /dev/null +++ b/crazy_functions/paper_fns/wiki/wikipedia_api.py @@ -0,0 +1,387 @@ +import aiohttp +import asyncio +from typing import List, Dict, Optional +import re +import random +import time + +class WikipediaAPI: + """维基百科API调用实现""" + + def __init__(self, language: str = "zh", user_agent: str = None, + max_concurrent: int = 5, request_delay: float = 0.5): + """ + 初始化维基百科API客户端 + + Args: + language: 语言代码 (zh: 中文, en: 英文, ja: 日文等) + user_agent: 用户代理信息,如果为None将使用默认值 + max_concurrent: 最大并发请求数 + request_delay: 请求间隔时间(秒) + """ + self.language = language + self.base_url = f"https://{language}.wikipedia.org/w/api.php" + self.user_agent = user_agent or "WikipediaAPIClient/1.0 (chatscholar@163.com)" + self.headers = { + "User-Agent": self.user_agent, + "Accept": "application/json" + } + # 添加并发控制 + self.semaphore = asyncio.Semaphore(max_concurrent) + self.request_delay = request_delay + self.last_request_time = 0 + + async def _make_request(self, url, params=None): + """ + 发起API请求,包含并发控制和请求延迟 + + Args: + url: 请求URL + params: 请求参数 + + Returns: + API响应数据 + """ + # 使用信号量控制并发 + async with self.semaphore: + # 添加请求间隔 + current_time = time.time() + time_since_last_request = current_time - self.last_request_time + if time_since_last_request < self.request_delay: + await asyncio.sleep(self.request_delay - time_since_last_request) + + # 设置随机延迟,避免规律性请求 + jitter = random.uniform(0, 0.2) + await asyncio.sleep(jitter) + + # 记录本次请求时间 + self.last_request_time = time.time() + + # 发起请求 + try: + async with aiohttp.ClientSession(headers=self.headers) as session: + async with session.get(url, params=params) as response: + if response.status == 429: # Too Many Requests + retry_after = int(response.headers.get('Retry-After', 5)) + print(f"达到请求限制,等待 {retry_after} 秒后重试...") + await asyncio.sleep(retry_after) + return await self._make_request(url, params) + + if response.status != 200: + print(f"API请求失败: HTTP {response.status}") + print(f"响应内容: {await response.text()}") + return None + + return await response.json() + except aiohttp.ClientError as e: + print(f"请求错误: {str(e)}") + return None + + async def search(self, query: str, limit: int = 10, namespace: int = 0) -> List[Dict]: + """ + 搜索维基百科文章 + + Args: + query: 搜索关键词 + limit: 返回结果数量 + namespace: 命名空间 (0表示文章, 14表示分类等) + + Returns: + 搜索结果列表 + """ + params = { + "action": "query", + "list": "search", + "srsearch": 
query, + "format": "json", + "srlimit": limit, + "srnamespace": namespace, + "srprop": "snippet|titlesnippet|sectiontitle|categorysnippet|size|wordcount|timestamp|redirecttitle" + } + + data = await self._make_request(self.base_url, params) + if not data: + return [] + + search_results = data.get("query", {}).get("search", []) + return search_results + + async def get_page_content(self, title: str, section: Optional[int] = None) -> Dict: + """ + 获取维基百科页面内容 + + Args: + title: 页面标题 + section: 特定章节编号(可选) + + Returns: + 页面内容字典 + """ + async with aiohttp.ClientSession(headers=self.headers) as session: + params = { + "action": "parse", + "page": title, + "format": "json", + "prop": "text|langlinks|categories|links|templates|images|externallinks|sections|revid|displaytitle|iwlinks|properties" + } + + # 如果指定了章节,只获取该章节内容 + if section is not None: + params["section"] = section + + async with session.get(self.base_url, params=params) as response: + if response.status != 200: + print(f"API请求失败: HTTP {response.status}") + return {} + + data = await response.json() + if "error" in data: + print(f"API错误: {data['error'].get('info', '未知错误')}") + return {} + + return data.get("parse", {}) + + async def get_summary(self, title: str, sentences: int = 3) -> str: + """ + 获取页面摘要 + + Args: + title: 页面标题 + sentences: 返回的句子数量 + + Returns: + 页面摘要文本 + """ + async with aiohttp.ClientSession(headers=self.headers) as session: + params = { + "action": "query", + "prop": "extracts", + "exintro": "1", + "exsentences": sentences, + "explaintext": "1", + "titles": title, + "format": "json" + } + + async with session.get(self.base_url, params=params) as response: + if response.status != 200: + print(f"API请求失败: HTTP {response.status}") + return "" + + data = await response.json() + pages = data.get("query", {}).get("pages", {}) + # 获取第一个页面ID的内容 + for page_id in pages: + return pages[page_id].get("extract", "") + return "" + + async def get_random_articles(self, count: int = 1, namespace: int = 0) -> List[Dict]: + """ + 获取随机文章 + + Args: + count: 需要的随机文章数量 + namespace: 命名空间 + + Returns: + 随机文章列表 + """ + async with aiohttp.ClientSession(headers=self.headers) as session: + params = { + "action": "query", + "list": "random", + "rnlimit": count, + "rnnamespace": namespace, + "format": "json" + } + + async with session.get(self.base_url, params=params) as response: + if response.status != 200: + print(f"API请求失败: HTTP {response.status}") + return [] + + data = await response.json() + return data.get("query", {}).get("random", []) + + async def login(self, username: str, password: str) -> bool: + """ + 使用维基百科账户登录 + + Args: + username: 维基百科用户名 + password: 维基百科密码 + + Returns: + 登录是否成功 + """ + async with aiohttp.ClientSession(headers=self.headers) as session: + # 获取登录令牌 + params = { + "action": "query", + "meta": "tokens", + "type": "login", + "format": "json" + } + + async with session.get(self.base_url, params=params) as response: + if response.status != 200: + print(f"获取登录令牌失败: HTTP {response.status}") + return False + + data = await response.json() + login_token = data.get("query", {}).get("tokens", {}).get("logintoken") + + if not login_token: + print("获取登录令牌失败") + return False + + # 使用令牌登录 + login_params = { + "action": "login", + "lgname": username, + "lgpassword": password, + "lgtoken": login_token, + "format": "json" + } + + async with session.post(self.base_url, data=login_params) as login_response: + login_data = await login_response.json() + + if login_data.get("login", {}).get("result") == "Success": + print(f"登录成功: 
{username}") + return True + else: + print(f"登录失败: {login_data.get('login', {}).get('reason', '未知原因')}") + return False + + async def setup_oauth(self, consumer_token: str, consumer_secret: str, + access_token: str = None, access_secret: str = None) -> bool: + """ + 设置OAuth认证 + + Args: + consumer_token: 消费者令牌 + consumer_secret: 消费者密钥 + access_token: 访问令牌(可选) + access_secret: 访问密钥(可选) + + Returns: + 设置是否成功 + """ + try: + # 需要安装 mwoauth 库: pip install mwoauth + import mwoauth + import requests_oauthlib + + # 设置OAuth + self.consumer_token = consumer_token + self.consumer_secret = consumer_secret + + if access_token and access_secret: + # 如果已有访问令牌 + self.auth = requests_oauthlib.OAuth1( + consumer_token, + consumer_secret, + access_token, + access_secret + ) + print("OAuth设置成功") + return True + else: + # 需要获取访问令牌(这通常需要用户在网页上授权) + print("请在开发环境中完成以下OAuth授权流程:") + + # 创建消费者 + consumer = mwoauth.Consumer( + consumer_token, consumer_secret + ) + + # 初始化握手 + redirect, request_token = mwoauth.initiate( + f"https://{self.language}.wikipedia.org/w/index.php", + consumer + ) + + print(f"请访问此URL授权应用: {redirect}") + # 这里通常会提示用户访问URL并输入授权码 + # 实际应用中需要实现适当的授权流程 + return False + except ImportError: + print("请安装 mwoauth 库: pip install mwoauth") + return False + except Exception as e: + print(f"设置OAuth时发生错误: {str(e)}") + return False + +async def example_usage(): + """演示WikipediaAPI的使用方法""" + # 创建默认中文维基百科API客户端 + wiki_zh = WikipediaAPI(language="zh") + + try: + # 示例1: 基本搜索 + print("\n=== 示例1: 搜索维基百科 ===") + results = await wiki_zh.search("人工智能", limit=3) + + for i, result in enumerate(results, 1): + print(f"\n--- 结果 {i} ---") + print(f"标题: {result.get('title')}") + snippet = result.get('snippet', '') + # 清理HTML标签 + snippet = re.sub(r'<.*?>', '', snippet) + print(f"摘要: {snippet}") + print(f"字数: {result.get('wordcount')}") + print(f"大小: {result.get('size')} 字节") + + # 示例2: 获取页面摘要 + print("\n=== 示例2: 获取页面摘要 ===") + summary = await wiki_zh.get_summary("深度学习", sentences=2) + print(f"深度学习摘要: {summary}") + + # 示例3: 获取页面内容 + print("\n=== 示例3: 获取页面内容 ===") + content = await wiki_zh.get_page_content("机器学习") + if content and "text" in content: + text = content["text"].get("*", "") + # 移除HTML标签以便控制台显示 + clean_text = re.sub(r'<.*?>', '', text) + print(f"机器学习页面内容片段: {clean_text[:200]}...") + + # 显示页面包含的分类数量 + categories = content.get("categories", []) + print(f"分类数量: {len(categories)}") + + # 显示页面包含的链接数量 + links = content.get("links", []) + print(f"链接数量: {len(links)}") + + # 示例4: 获取特定章节内容 + print("\n=== 示例4: 获取特定章节内容 ===") + # 获取引言部分(通常是0号章节) + intro_content = await wiki_zh.get_page_content("人工智能", section=0) + if intro_content and "text" in intro_content: + intro_text = intro_content["text"].get("*", "") + clean_intro = re.sub(r'<.*?>', '', intro_text) + print(f"人工智能引言内容片段: {clean_intro[:200]}...") + + # 示例5: 获取随机文章 + print("\n=== 示例5: 获取随机文章 ===") + random_articles = await wiki_zh.get_random_articles(count=2) + print("随机文章:") + for i, article in enumerate(random_articles, 1): + print(f"{i}. 
{article.get('title')}") + + # 显示随机文章的简短摘要 + article_summary = await wiki_zh.get_summary(article.get('title'), sentences=1) + print(f" 摘要: {article_summary[:100]}...") + + except Exception as e: + print(f"发生错误: {str(e)}") + import traceback + print(traceback.format_exc()) + +if __name__ == "__main__": + import asyncio + + # 运行示例 + asyncio.run(example_usage()) \ No newline at end of file diff --git a/crazy_functions/rag_fns/rag_file_support.py b/crazy_functions/rag_fns/rag_file_support.py index 3967ef37..e448f09c 100644 --- a/crazy_functions/rag_fns/rag_file_support.py +++ b/crazy_functions/rag_fns/rag_file_support.py @@ -1,7 +1,32 @@ +import subprocess import os supports_format = ['.csv', '.docx', '.epub', '.ipynb', '.mbox', '.md', '.pdf', '.txt', '.ppt', '.pptm', '.pptx', '.bat'] +def convert_to_markdown(file_path: str) -> str: + """ + 将支持的文件格式转换为Markdown格式 + Args: + file_path: 输入文件路径 + Returns: + str: 转换后的Markdown文件路径,如果转换失败则返回原始文件路径 + """ + _, ext = os.path.splitext(file_path.lower()) + + if ext in ['.docx', '.doc', '.pptx', '.ppt', '.pptm', '.xls', '.xlsx', '.csv', 'pdf']: + try: + # 创建输出Markdown文件路径 + md_path = os.path.splitext(file_path)[0] + '.md' + # 使用markitdown工具将文件转换为Markdown + command = f"markitdown {file_path} > {md_path}" + subprocess.run(command, shell=True, check=True) + print(f"已将{ext}文件转换为Markdown: {md_path}") + return md_path + except Exception as e: + print(f"{ext}转Markdown失败: {str(e)},将继续处理原文件") + return file_path + + return file_path # 修改后的 extract_text 函数,结合 SimpleDirectoryReader 和自定义解析逻辑 def extract_text(file_path):