Master 4.0 (#2210)

* stage academic conversation * stage document conversation * fix buggy gradio version * file dynamic load * merge more academic plugins * accelerate nltk * feat: 为predict函数添加文件和URL读取功能 - 添加URL检测和网页内容提取功能，支持自动提取网页文本 - 添加文件路径识别和文件内容读取功能，支持private_upload路径格式 - 集成WebTextExtractor处理网页内容提取 - 集成TextContentLoader处理本地文件读取 - 支持文件路径与问题组合的智能处理 * back * block unstable --------- Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
2025-12-06 06:26:47 +00:00 · 2025-08-23 15:59:22 +08:00
--- a/crazy_functions/review_fns/handlers/recommend_handler.py
+++ b/crazy_functions/review_fns/handlers/recommend_handler.py
@@ -0,0 +1,185 @@
+from typing import List, Dict, Any
+from .base_handler import BaseHandler
+from textwrap import dedent
+from crazy_functions.review_fns.query_analyzer import SearchCriteria
+from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency as request_gpt
+
+class 论文推荐功能(BaseHandler):
+    """Paper recommendation handler."""
+
+    def __init__(self, arxiv, semantic, llm_kwargs=None):
+        super().__init__(arxiv, semantic, llm_kwargs)
+
+    async def handle(
+        self,
+        criteria: SearchCriteria,
+        chatbot: List[List[str]],
+        history: List[List[str]],
+        system_prompt: str,
+        llm_kwargs: Dict[str, Any],
+        plugin_kwargs: Dict[str, Any],
+    ) -> str:
+        """Handle a paper recommendation request and return the final prompt.
+
+        Returns the apology prompt when searching or ranking yields no papers.
+        """
+        search_params = self._get_search_params(plugin_kwargs)
+
+        # 1. Search for the seed papers first.
+        seed_papers = await self._search_seed_papers(criteria, search_params)
+        if not seed_papers:
+            return self._generate_apology_prompt(criteria)
+
+        # Re-rank the candidates (the original note describes this as BGE re-ranking).
+        self.ranked_papers = self.paper_ranker.rank_papers(
+            query=criteria.original_query,
+            papers=seed_papers,
+            search_criteria=criteria
+        )
+        if not self.ranked_papers:
+            return self._generate_apology_prompt(criteria)
+
+        # Dedent the template before filling it in: text on the opening-quote
+        # line or an interpolated multi-line paper list leaves no common margin,
+        # which turns dedent() into a no-op and keeps the source indentation.
+        prompt_template = dedent("""\
+            Current time: {current_time}
+
+            Based on the user's interest in {main_topic}, here are relevant papers.
+
+            Available papers for recommendation:
+            {papers}
+
+            Please provide:
+            1. Group papers by sub-topics or themes if applicable
+
+            2. For each paper:
+            - Publication time and venue (when available)
+            - Journal metrics (when available):
+                * Impact Factor (IF)
+                * JCR Quartile
+                * Chinese Academy of Sciences (CAS) Division
+            - The key contributions and main findings
+            - Why it's relevant to the user's interests
+            - How it relates to other recommended papers
+            - The paper's citation count and citation impact
+            - The paper's download link
+
+            3. A suggested reading order based on:
+            - Journal impact and quality metrics
+            - Chronological development of ideas
+            - Paper relationships and dependencies
+            - Difficulty level
+            - Impact and significance
+
+            4. Future Directions
+            - Emerging venues and research streams
+            - Novel methodological approaches
+            - Cross-disciplinary opportunities
+            - Research gaps by publication type
+
+
+            IMPORTANT:
+            - Focus on explaining why each paper is valuable
+            - Highlight connections between papers
+            - Consider both citation counts AND journal metrics when discussing impact
+            - When available, use IF, JCR quartile, and CAS division to assess paper quality
+            - Mention publication timing when discussing paper relationships
+            - When referring to papers, use HTML links in this format:
+            * For DOIs: <a href='https://doi.org/DOI_HERE' target='_blank'>DOI: DOI_HERE</a>
+            * For titles: <a href='PAPER_URL' target='_blank'>PAPER_TITLE</a>
+            - Present papers in a way that shows the evolution of ideas over time
+            - Base recommendations ONLY on the explicitly provided paper information
+            - Do not make ANY assumptions about papers beyond the given data
+            - When information is missing or unclear, acknowledge the limitation
+            - Never speculate about:
+            * Paper quality or rigor not evidenced in the data
+            * Research impact beyond citation counts and journal metrics
+            * Implementation details not mentioned
+            * Author expertise or background
+            * Future research directions not stated
+            - For each recommendation, cite only verifiable information
+            - Clearly distinguish between facts and potential implications
+
+            Format your response in markdown with clear sections.
+            Language requirement:
+            - If the query explicitly specifies a language, use that language
+            - Otherwise, match the language of the original user query
+            """
+        )
+        return prompt_template.format(
+            current_time=self._get_current_time(),
+            main_topic=criteria.main_topic,
+            papers=self._format_papers(self.ranked_papers),
+        )
+
+    async def _search_seed_papers(self, criteria: SearchCriteria, search_params: Dict) -> List:
+        """Search all sources for seed papers; return [] on failure or no hits."""
+        try:
+            # Delegates to _search_all_sources (replaces the earlier parallel search).
+            all_papers = await self._search_all_sources(criteria, search_params)
+        except Exception as e:
+            # Best-effort: report the failure and fall back to an empty result.
+            print(f"搜索种子论文时出错: {str(e)}")
+            return []
+        return all_papers or []
+
+    async def _get_recommendations(self, seed_papers: List, multiplier: int = 1) -> List:
+        """Expand the seed papers with similar papers and de-duplicate the result.
+
+        Every seed paper is kept, but only the first five are used for the
+        similarity lookups; each lookup is limited to ``3 * multiplier`` papers.
+        Duplicates are dropped by DOI, or by equality for papers without a DOI.
+        """
+        arxiv_doi_prefix = "10.48550/arXiv."
+        base_limit = 3 * multiplier
+
+        # The seed papers themselves are part of the recommendations.
+        recommendations = list(seed_papers)
+
+        # Only the first five papers act as seeds for the similarity lookups.
+        for paper in seed_papers[:5]:
+            try:
+                if paper.doi and paper.doi.startswith(arxiv_doi_prefix):
+                    # arXiv paper. Strip the DOI prefix instead of splitting on
+                    # "." because IDs such as "2301.12345" contain a dot themselves.
+                    arxiv_id = paper.doi[len(arxiv_doi_prefix):]
+                    paper_details = await self.arxiv.get_paper_details(arxiv_id)
+                    if paper_details and getattr(paper_details, 'venue', None):
+                        category = paper_details.venue.split(":")[-1]
+                        similar_papers = await self.arxiv.search_by_category(
+                            category,
+                            limit=base_limit,
+                            sort_by='relevance'
+                        )
+                        recommendations.extend(similar_papers or [])
+                elif paper.doi:
+                    # Any other DOI: ask Semantic Scholar for recommendations.
+                    similar_papers = await self.semantic.get_recommended_papers(
+                        paper.doi,
+                        limit=base_limit
+                    )
+                    recommendations.extend(similar_papers or [])
+                elif paper.title:
+                    # No DOI: fall back to a title-based search.
+                    similar_papers = await self.semantic.search(
+                        query=paper.title,
+                        limit=base_limit
+                    )
+                    recommendations.extend(similar_papers or [])
+            except Exception as e:
+                # Best-effort per seed: report the failure and move on.
+                print(f"获取论文 '{paper.title}' 的推荐时发生错误: {str(e)}")
+
+        # De-duplicate while preserving the original order.
+        seen_dois = set()
+        unique_recommendations = []
+        for paper in recommendations:
+            if paper.doi:
+                if paper.doi not in seen_dois:
+                    seen_dois.add(paper.doi)
+                    unique_recommendations.append(paper)
+            elif paper not in unique_recommendations:
+                unique_recommendations.append(paper)
+
+        return unique_recommendations