from typing import Dict, List from dataclasses import dataclass import re @dataclass class SearchCriteria: """搜索条件""" query_type: str # 查询类型: repo/code/user/topic main_topic: str # 主题 sub_topics: List[str] # 子主题列表 language: str # 编程语言 min_stars: int # 最少星标数 github_params: Dict # GitHub搜索参数 original_query: str = "" # 原始查询字符串 repo_id: str = "" # 特定仓库ID或名称 class QueryAnalyzer: """查询分析器""" # 响应索引常量 BASIC_QUERY_INDEX = 0 GITHUB_QUERY_INDEX = 1 def __init__(self): self.valid_types = { "repo": ["repository", "project", "library", "framework", "tool"], "code": ["code", "snippet", "implementation", "function", "class", "algorithm"], "user": ["user", "developer", "organization", "contributor", "maintainer"], "topic": ["topic", "category", "tag", "field", "area", "domain"] } def analyze_query(self, query: str, chatbot: List, llm_kwargs: Dict): """分析查询意图""" from crazy_functions.crazy_utils import \ request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency as request_gpt # 1. 基本查询分析 type_prompt = f"""请分析这个与GitHub相关的查询,并严格按照以下XML格式回答: 查询: {query} 说明: 1. 你的回答必须使用下面显示的XML标签,不要有任何标签外的文本 2. 从以下选项中选择查询类型: repo/code/user/topic - repo: 用于查找仓库、项目、框架或库 - code: 用于查找代码片段、函数实现或算法 - user: 用于查找用户、开发者或组织 - topic: 用于查找主题、类别或领域相关项目 3. 识别主题和子主题 4. 识别首选编程语言(如果有) 5. 确定最低星标数(如果适用) 必需格式: 此处回答 此处回答 子主题1, 子主题2, ... 此处回答 此处回答 示例回答: 1. 仓库查询: 查询: "查找有至少1000颗星的Python web框架" repo web框架 后端开发, HTTP服务器, ORM Python 1000 2. 代码查询: 查询: "如何用JavaScript实现防抖函数" code 防抖函数 事件处理, 性能优化, 函数节流 JavaScript 0""" # 2. 生成英文搜索条件 github_prompt = f"""Optimize the following GitHub search query: Query: {query} Task: Convert the natural language query into an optimized GitHub search query. Please use English, regardless of the language of the input query. Available search fields and filters: 1. Basic fields: - in:name - Search in repository names - in:description - Search in repository descriptions - in:readme - Search in README files - in:topic - Search in topics - language:X - Filter by programming language - user:X - Repositories from a specific user - org:X - Repositories from a specific organization 2. Code search fields: - extension:X - Filter by file extension - path:X - Filter by path - filename:X - Filter by filename 3. Metric filters: - stars:>X - Has more than X stars - forks:>X - Has more than X forks - size:>X - Size greater than X KB - created:>YYYY-MM-DD - Created after a specific date - pushed:>YYYY-MM-DD - Updated after a specific date 4. Other filters: - is:public/private - Public or private repositories - archived:true/false - Archived or not archived - license:X - Specific license - topic:X - Contains specific topic tag Examples: 1. Query: "Find Python machine learning libraries with at least 1000 stars" machine learning in:description language:python stars:>1000 2. Query: "Recently updated React UI component libraries" UI components library in:readme in:description language:javascript topic:react pushed:>2023-01-01 3. Query: "Open source projects developed by Facebook" org:facebook is:public 4. Query: "Depth-first search implementation in JavaScript" depth first search in:file language:javascript Please analyze the query and answer using only the XML tag: Provide the optimized GitHub search query, using appropriate fields and operators""" # 3. 生成中文搜索条件 chinese_github_prompt = f"""优化以下GitHub搜索查询: 查询: {query} 任务: 将自然语言查询转换为优化的GitHub搜索查询语句。 为了搜索中文内容,请提取原始查询的关键词并使用中文形式,同时保留GitHub特定的搜索语法为英文。 可用的搜索字段和过滤器: 1. 基本字段: - in:name - 在仓库名称中搜索 - in:description - 在仓库描述中搜索 - in:readme - 在README文件中搜索 - in:topic - 在主题中搜索 - language:X - 按编程语言筛选 - user:X - 特定用户的仓库 - org:X - 特定组织的仓库 2. 代码搜索字段: - extension:X - 按文件扩展名筛选 - path:X - 按路径筛选 - filename:X - 按文件名筛选 3. 指标过滤器: - stars:>X - 有超过X颗星 - forks:>X - 有超过X个分支 - size:>X - 大小超过X KB - created:>YYYY-MM-DD - 在特定日期后创建 - pushed:>YYYY-MM-DD - 在特定日期后更新 4. 其他过滤器: - is:public/private - 公开或私有仓库 - archived:true/false - 已归档或未归档 - license:X - 特定许可证 - topic:X - 含特定主题标签 示例: 1. 查询: "找有关机器学习的Python库,至少1000颗星" 机器学习 in:description language:python stars:>1000 2. 查询: "最近更新的React UI组件库" UI 组件库 in:readme in:description language:javascript topic:react pushed:>2023-01-01 3. 查询: "微信小程序开发框架" 微信小程序 开发框架 in:name in:description in:readme 请分析查询并仅使用XML标签回答: 提供优化的GitHub搜索查询,使用适当的字段和运算符,保留中文关键词""" try: # 构建提示数组 prompts = [ type_prompt, github_prompt, chinese_github_prompt, ] show_messages = [ "分析查询类型...", "优化英文GitHub搜索参数...", "优化中文GitHub搜索参数...", ] sys_prompts = [ "你是一个精通GitHub生态系统的专家,擅长分析与GitHub相关的查询。", "You are a GitHub search expert, specialized in converting natural language queries into optimized GitHub search queries in English.", "你是一个GitHub搜索专家,擅长处理查询并保留中文关键词进行搜索。", ] # 使用同步方式调用LLM responses = yield from request_gpt( inputs_array=prompts, inputs_show_user_array=show_messages, llm_kwargs=llm_kwargs, chatbot=chatbot, history_array=[[] for _ in prompts], sys_prompt_array=sys_prompts, max_workers=3 ) # 从收集的响应中提取我们需要的内容 extracted_responses = [] for i in range(len(prompts)): if (i * 2 + 1) < len(responses): response = responses[i * 2 + 1] if response is None: raise Exception(f"Response {i} is None") if not isinstance(response, str): try: response = str(response) except: raise Exception(f"Cannot convert response {i} to string") extracted_responses.append(response) else: raise Exception(f"未收到第 {i + 1} 个响应") # 解析基本信息 query_type = self._extract_tag(extracted_responses[self.BASIC_QUERY_INDEX], "query_type") if not query_type: print( f"Debug - Failed to extract query_type. Response was: {extracted_responses[self.BASIC_QUERY_INDEX]}") raise Exception("无法提取query_type标签内容") query_type = query_type.lower() main_topic = self._extract_tag(extracted_responses[self.BASIC_QUERY_INDEX], "main_topic") if not main_topic: print(f"Debug - Failed to extract main_topic. Using query as fallback.") main_topic = query query_type = self._normalize_query_type(query_type, query) # 提取子主题 sub_topics = [] sub_topics_text = self._extract_tag(extracted_responses[self.BASIC_QUERY_INDEX], "sub_topics") if sub_topics_text: sub_topics = [topic.strip() for topic in sub_topics_text.split(",")] # 提取语言 language = self._extract_tag(extracted_responses[self.BASIC_QUERY_INDEX], "language") # 提取最低星标数 min_stars = 0 min_stars_text = self._extract_tag(extracted_responses[self.BASIC_QUERY_INDEX], "min_stars") if min_stars_text and min_stars_text.isdigit(): min_stars = int(min_stars_text) # 解析GitHub搜索参数 - 英文 english_github_query = self._extract_tag(extracted_responses[self.GITHUB_QUERY_INDEX], "query") # 解析GitHub搜索参数 - 中文 chinese_github_query = self._extract_tag(extracted_responses[2], "query") # 构建GitHub参数 github_params = { "query": english_github_query, "chinese_query": chinese_github_query, "sort": "stars", # 默认按星标排序 "order": "desc", # 默认降序 "per_page": 30, # 默认每页30条 "page": 1 # 默认第1页 } # 检查是否为特定仓库查询 repo_id = "" if "repo:" in english_github_query or "repository:" in english_github_query: repo_match = re.search(r'(repo|repository):([a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+)', english_github_query) if repo_match: repo_id = repo_match.group(2) print(f"Debug - 提取的信息:") print(f"查询类型: {query_type}") print(f"主题: {main_topic}") print(f"子主题: {sub_topics}") print(f"语言: {language}") print(f"最低星标数: {min_stars}") print(f"英文GitHub参数: {english_github_query}") print(f"中文GitHub参数: {chinese_github_query}") print(f"特定仓库: {repo_id}") # 更新返回的 SearchCriteria,包含中英文查询 return SearchCriteria( query_type=query_type, main_topic=main_topic, sub_topics=sub_topics, language=language, min_stars=min_stars, github_params=github_params, original_query=query, repo_id=repo_id ) except Exception as e: raise Exception(f"分析查询失败: {str(e)}") def _normalize_query_type(self, query_type: str, query: str) -> str: """规范化查询类型""" if query_type in ["repo", "code", "user", "topic"]: return query_type query_lower = query.lower() for type_name, keywords in self.valid_types.items(): for keyword in keywords: if keyword in query_lower: return type_name query_type_lower = query_type.lower() for type_name, keywords in self.valid_types.items(): for keyword in keywords: if keyword in query_type_lower: return type_name return "repo" # 默认返回repo类型 def _extract_tag(self, text: str, tag: str) -> str: """提取标记内容""" if not text: return "" # 标准XML格式(处理多行和特殊字符) pattern = f"<{tag}>(.*?)" match = re.search(pattern, text, re.DOTALL | re.IGNORECASE) if match: content = match.group(1).strip() if content: return content # 备用模式 patterns = [ rf"<{tag}>\s*([\s\S]*?)\s*", # 标准XML格式 rf"<{tag}>([\s\S]*?)(?:|$)", # 未闭合的标签 rf"[{tag}]([\s\S]*?)[/{tag}]", # 方括号格式 rf"{tag}:\s*(.*?)(?=\n\w|$)", # 冒号格式 rf"<{tag}>\s*(.*?)(?=<|$)" # 部分闭合 ] # 尝试所有模式 for pattern in patterns: match = re.search(pattern, text, re.IGNORECASE | re.DOTALL) if match: content = match.group(1).strip() if content: # 确保提取的内容不为空 return content # 如果所有模式都失败,返回空字符串 return ""