镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-06 06:26:47 +00:00
Master 4.0 (#2210)
* stage academic conversation * stage document conversation * fix buggy gradio version * file dynamic load * merge more academic plugins * accelerate nltk * feat: 为predict函数添加文件和URL读取功能 - 添加URL检测和网页内容提取功能,支持自动提取网页文本 - 添加文件路径识别和文件内容读取功能,支持private_upload路径格式 - 集成WebTextExtractor处理网页内容提取 - 集成TextContentLoader处理本地文件读取 - 支持文件路径与问题组合的智能处理 * back * block unstable --------- Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
这个提交包含在:
@@ -0,0 +1,185 @@
|
||||
from typing import List, Dict, Any
|
||||
from .base_handler import BaseHandler
|
||||
from textwrap import dedent
|
||||
from crazy_functions.review_fns.query_analyzer import SearchCriteria
|
||||
from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency as request_gpt
|
||||
|
||||
class 论文推荐功能(BaseHandler):
    """Paper-recommendation handler.

    Searches for seed papers matching the user's criteria, ranks them with the
    ranker exposed as ``self.paper_ranker`` and builds the final LLM prompt
    asking for a structured recommendation list.

    NOTE(review): ``_get_search_params``, ``_search_all_sources``,
    ``_generate_apology_prompt``, ``_get_current_time``, ``_format_papers`` and
    ``paper_ranker`` are not defined in this class; they are presumably
    provided by ``BaseHandler`` -- confirm against the base class.
    """

    # DOI prefix that arXiv assigns to its own papers; the arXiv identifier is
    # everything that follows it (e.g. ``10.48550/arXiv.2301.12345``).
    _ARXIV_DOI_PREFIX = "10.48550/arXiv."
    # Only the first few seed papers are expanded into recommendations.
    _MAX_SEED_PAPERS = 5
    # Number of similar papers requested per seed, before ``multiplier``.
    _RECOMMENDATIONS_PER_SEED = 3

    # The static template is dedented once, *before* any value is substituted,
    # so multi-line substituted text (the formatted paper list) cannot disturb
    # the margin detection of ``dedent``.
    _RECOMMEND_PROMPT_TEMPLATE = dedent("""\
        Current time: {current_time}

        Based on the user's interest in {main_topic}, here are relevant papers.

        Available papers for recommendation:
        {papers}

        Please provide:
        1. Group papers by sub-topics or themes if applicable

        2. For each paper:
        - Publication time and venue (when available)
        - Journal metrics (when available):
        * Impact Factor (IF)
        * JCR Quartile
        * Chinese Academy of Sciences (CAS) Division
        - The key contributions and main findings
        - Why it's relevant to the user's interests
        - How it relates to other recommended papers
        - The paper's citation count and citation impact
        - The paper's download link

        3. A suggested reading order based on:
        - Journal impact and quality metrics
        - Chronological development of ideas
        - Paper relationships and dependencies
        - Difficulty level
        - Impact and significance

        4. Future Directions
        - Emerging venues and research streams
        - Novel methodological approaches
        - Cross-disciplinary opportunities
        - Research gaps by publication type


        IMPORTANT:
        - Focus on explaining why each paper is valuable
        - Highlight connections between papers
        - Consider both citation counts AND journal metrics when discussing impact
        - When available, use IF, JCR quartile, and CAS division to assess paper quality
        - Mention publication timing when discussing paper relationships
        - When referring to papers, use HTML links in this format:
        * For DOIs: <a href='https://doi.org/DOI_HERE' target='_blank'>DOI: DOI_HERE</a>
        * For titles: <a href='PAPER_URL' target='_blank'>PAPER_TITLE</a>
        - Present papers in a way that shows the evolution of ideas over time
        - Base recommendations ONLY on the explicitly provided paper information
        - Do not make ANY assumptions about papers beyond the given data
        - When information is missing or unclear, acknowledge the limitation
        - Never speculate about:
        * Paper quality or rigor not evidenced in the data
        * Research impact beyond citation counts and journal metrics
        * Implementation details not mentioned
        * Author expertise or background
        * Future research directions not stated
        - For each recommendation, cite only verifiable information
        - Clearly distinguish between facts and potential implications

        Format your response in markdown with clear sections.
        Language requirement:
        - If the query explicitly specifies a language, use that language
        - Otherwise, match the language of the original user query
        """)

    def __init__(self, arxiv, semantic, llm_kwargs=None):
        """Forward the data-source clients and LLM settings to the base handler."""
        super().__init__(arxiv, semantic, llm_kwargs)

    async def handle(
        self,
        criteria: SearchCriteria,
        chatbot: List[List[str]],
        history: List[List[str]],
        system_prompt: str,
        llm_kwargs: Dict[str, Any],
        plugin_kwargs: Dict[str, Any],
    ) -> str:
        """Handle a paper-recommendation request and return the final prompt.

        Args:
            criteria: Parsed search criteria; ``original_query`` and
                ``main_topic`` are read here.
            chatbot: Unused by this handler; kept for the common signature.
            history: Unused by this handler; kept for the common signature.
            system_prompt: Unused by this handler; kept for the common signature.
            llm_kwargs: Unused by this handler; kept for the common signature.
            plugin_kwargs: Passed to ``_get_search_params`` to derive the
                search parameters.

        Returns:
            The recommendation prompt, or the apology prompt when the search
            or the ranking yields no papers.

        Side effects:
            Stores the ranked papers on ``self.ranked_papers``.
        """
        search_params = self._get_search_params(plugin_kwargs)

        # 1. Search for the seed papers.
        seed_papers = await self._search_seed_papers(criteria, search_params)
        if not seed_papers:
            return self._generate_apology_prompt(criteria)

        # 2. Re-rank the candidates against the original query (the original
        #    comment describes this step as BGE re-ranking).
        self.ranked_papers = self.paper_ranker.rank_papers(
            query=criteria.original_query,
            papers=seed_papers,
            search_criteria=criteria,
        )
        if not self.ranked_papers:
            return self._generate_apology_prompt(criteria)

        # 3. Build the final prompt.
        return self._RECOMMEND_PROMPT_TEMPLATE.format(
            current_time=self._get_current_time(),
            main_topic=criteria.main_topic,
            papers=self._format_papers(self.ranked_papers),
        )

    async def _search_seed_papers(self, criteria: SearchCriteria, search_params: Dict) -> List:
        """Search every source for seed papers.

        Best-effort: any exception raised by the search is printed and an
        empty list is returned, so callers only have to test for emptiness.
        """
        try:
            # Delegates to _search_all_sources (which replaced an earlier
            # hand-written parallel search, per the original comment).
            found = await self._search_all_sources(criteria, search_params)
        except Exception as e:
            print(f"搜索种子论文时出错: {e}")
            return []
        return found or []

    async def _get_recommendations(self, seed_papers: List, multiplier: int = 1) -> List:
        """Expand seed papers into a de-duplicated recommendation list.

        The seeds themselves come first in the result; similar papers are then
        fetched for at most ``_MAX_SEED_PAPERS`` seeds. A failure while
        expanding one seed is printed and does not affect the other seeds.

        Args:
            seed_papers: Papers to start from; each is expected to expose
                ``doi`` and ``title`` attributes.
            multiplier: Scales the number of similar papers requested per seed.

        Returns:
            Seeds plus similar papers, de-duplicated (see ``_deduplicate``).
        """
        per_seed_limit = self._RECOMMENDATIONS_PER_SEED * multiplier

        # The seed papers are part of the recommendation list themselves.
        recommendations = list(seed_papers)

        for paper in seed_papers[: self._MAX_SEED_PAPERS]:
            try:
                recommendations.extend(
                    await self._find_similar_papers(paper, per_seed_limit)
                )
            except Exception as e:
                # Deliberately best-effort: one failing lookup must not abort
                # the whole recommendation run.
                print(f"获取论文 '{paper.title}' 的推荐时发生错误: {e}")

        return self._deduplicate(recommendations)

    async def _find_similar_papers(self, paper, limit: int) -> List:
        """Return up to ``limit`` papers similar to ``paper`` (possibly empty).

        Lookup strategy by available metadata:
        * arXiv DOI  -> other papers of the same arXiv category,
        * other DOI  -> Semantic Scholar recommendations,
        * title only -> Semantic Scholar title search.
        """
        doi = paper.doi
        if doi and doi.startswith(self._ARXIV_DOI_PREFIX):
            # Strip the prefix instead of splitting on ".": new-style arXiv
            # ids ("2301.12345") contain a dot themselves, so taking the last
            # dot-separated piece would truncate them to "12345".
            arxiv_id = doi[len(self._ARXIV_DOI_PREFIX):]
            details = await self.arxiv.get_paper_details(arxiv_id)
            # ``venue`` may be missing or None; both mean "no category known".
            venue = getattr(details, "venue", None) if details else None
            if not venue:
                return []
            # presumably venue looks like "arXiv:<category>" -- TODO confirm
            category = venue.split(":")[-1]
            found = await self.arxiv.search_by_category(
                category,
                limit=limit,
                sort_by='relevance',
            )
        elif doi:
            found = await self.semantic.get_recommended_papers(doi, limit=limit)
        elif paper.title:
            # No DOI at all: fall back to a related search by title.
            found = await self.semantic.search(query=paper.title, limit=limit)
        else:
            return []

        # A source may return None/empty on failure; normalise to a list.
        return list(found) if found else []

    @staticmethod
    def _deduplicate(papers: List) -> List:
        """Drop duplicates while preserving order.

        Papers with a DOI are de-duplicated by DOI; papers without one are
        compared by equality against what has already been kept.
        """
        seen_dois = set()
        unique = []
        for paper in papers:
            if paper.doi:
                if paper.doi not in seen_dois:
                    seen_dois.add(paper.doi)
                    unique.append(paper)
            elif paper not in unique:
                unique.append(paper)
        return unique
|
||||
在新工单中引用
屏蔽一个用户