Master 4.0 (#2210)

* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
这个提交包含在:
binary-husky
2025-08-23 15:59:22 +08:00
提交者 GitHub
父节点 65a4cf59c2
当前提交 8042750d41
共有 79 个文件被更改,包括 20850 次插入和 57 次删除

查看文件

@@ -0,0 +1,449 @@
from typing import List, Optional, Dict, Union
from datetime import datetime
import aiohttp
import asyncio
from crazy_functions.review_fns.data_sources.base_source import DataSource, PaperMetadata
import json
from tqdm import tqdm
import random
class ElsevierSource(DataSource):
    """Data source backed by the Elsevier (Scopus) REST API.

    All network access goes through ``_make_request``; every public method
    reports failures by printing a message and returning an empty result
    (``[]`` or ``None``) instead of raising.
    """

    # Pool of API keys. One is chosen at random when the caller supplies none,
    # and the remaining ones are used as fallbacks after an HTTP 401.
    API_KEYS = [
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    ]

    def __init__(self, api_key: Optional[str] = None):
        """Create the data source.

        Args:
            api_key: Elsevier API key. When omitted (or empty), a key is
                picked at random from ``API_KEYS``.
        """
        self.api_key = api_key or random.choice(self.API_KEYS)
        self._initialize()

    def _initialize(self) -> None:
        """Set the base URL and the default request headers."""
        self.base_url = "https://api.elsevier.com/content"
        self.headers = {
            "X-ELS-APIKey": self.api_key,
            "Accept": "application/json",
            "Content-Type": "application/json",
            # Institution token, if one is available (sent empty otherwise).
            "X-ELS-Insttoken": "",
        }

    @staticmethod
    def _mask_key(key: str) -> str:
        """Return a log-safe rendering of *key* that never reveals it fully."""
        if not key:
            return "<empty>"
        if len(key) <= 8:
            return "*" * len(key)
        return f"{key[:4]}...{key[-4:]}"

    @staticmethod
    def _extract_year(cover_date) -> Optional[int]:
        """Return the leading four-digit year of *cover_date*, or ``None``.

        ``None`` is returned when the value is missing, is not sliceable, or
        does not start with an integer.
        """
        try:
            return int(cover_date[:4])
        except (TypeError, ValueError):
            return None

    async def _make_request(self, url: str, params: Optional[Dict] = None) -> Optional[Dict]:
        """Send a GET request and return the decoded JSON body.

        On HTTP 401 the request is retried with another key from
        ``API_KEYS``. Each key is tried at most once per call, so the retry
        loop always terminates; the key that was last tried stays active on
        the instance.

        Args:
            url: Request URL.
            params: Query parameters.

        Returns:
            The parsed JSON response, or ``None`` on any failure.
        """
        tried_keys = set()
        try:
            while True:
                tried_keys.add(self.api_key)
                # The headers (and therefore the key) are bound when the
                # session is created, so every attempt gets a fresh session.
                async with aiohttp.ClientSession(headers=self.headers) as session:
                    async with session.get(url, params=params) as response:
                        status = response.status
                        if status == 200:
                            return await response.json()
                        error_text = await response.text()

                print(f"请求失败: {status}")
                print(f"错误详情: {error_text}")
                if status != 401:
                    return None

                # Keys are masked so that credentials never end up in logs.
                print(f"使用的API密钥: {self._mask_key(self.api_key)}")
                untried_keys = [key for key in self.API_KEYS if key not in tried_keys]
                if not untried_keys:
                    print("所有API密钥均已尝试,放弃重试")
                    return None

                new_key = random.choice(untried_keys)
                print(f"尝试切换到新的API密钥: {self._mask_key(new_key)}")
                self.api_key = new_key
                self.headers["X-ELS-APIKey"] = new_key
        except Exception as e:
            # Boundary handler: callers rely on None instead of an exception.
            print(f"请求发生错误: {str(e)}")
            return None

    def _parse_entries(self, entries) -> List[PaperMetadata]:
        """Parse search entries, dropping those that could not be parsed."""
        parsed = (self._parse_entry(entry) for entry in entries or [])
        return [paper for paper in parsed if paper is not None]

    async def search(
        self,
        query: str,
        limit: int = 100,
        sort_by: str = "relevance",
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search Scopus for papers and attach an abstract to each hit.

        Args:
            query: Scopus query string.
            limit: Maximum number of results; capped at 100.
            sort_by: ``"date"`` sorts by cover date (newest first),
                ``"cited"`` by citation count (highest first). Any other
                value sends no ``sort`` parameter.
            start_year: When truthy, restricts results with a
                ``"<start_year>-present"`` date filter.

        Returns:
            The parsed papers, or ``[]`` on failure.
        """
        try:
            params = {
                "query": query,
                "count": min(limit, 100),
                "view": "STANDARD",
                # dc:description is left out: not available in the STANDARD view.
                "field": "dc:title,dc:creator,prism:doi,prism:coverDate,citedby-count,prism:publicationName"
            }
            if start_year:
                params["date"] = f"{start_year}-present"
            if sort_by == "date":
                params["sort"] = "-coverDate"
            elif sort_by == "cited":
                params["sort"] = "-citedby-count"

            response = await self._make_request(
                f"{self.base_url}/search/scopus",
                params=params
            )
            if not response or "search-results" not in response:
                return []

            papers = self._parse_entries(response["search-results"].get("entry", []))

            # The search response carries no abstract, so fetch one per DOI.
            for paper in papers:
                if paper.doi:
                    paper.abstract = await self.fetch_abstract(paper.doi) or ""
            return papers
        except Exception as e:
            print(f"搜索论文时发生错误: {str(e)}")
            return []

    def _parse_entry(self, entry: Dict) -> Optional[PaperMetadata]:
        """Convert one Scopus search entry into a ``PaperMetadata``.

        Returns:
            The parsed paper, or ``None`` when the entry is an error
            placeholder or cannot be converted.
        """
        try:
            # Scopus reports an empty result set as a single placeholder
            # entry carrying an "error" field; it is not a paper.
            if "error" in entry:
                return None

            # The search response exposes only the first author.
            creator = entry.get("dc:creator")
            authors = [creator] if creator else []

            venue_info = {
                'source_id': entry.get("source-id"),
                'issn': entry.get("prism:issn")
            }
            return PaperMetadata(
                title=entry.get("dc:title", ""),
                authors=authors,
                abstract=entry.get("dc:description", ""),
                year=self._extract_year(entry.get("prism:coverDate")),
                doi=entry.get("prism:doi"),
                url=entry.get("prism:url"),
                # "or 0" keeps the entry when the count is present but null.
                citations=int(entry.get("citedby-count") or 0),
                venue=entry.get("prism:publicationName"),
                institutions=[],
                venue_type="",
                venue_name=entry.get("prism:publicationName"),
                venue_info=venue_info
            )
        except Exception as e:
            print(f"解析条目时发生错误: {str(e)}")
            return None

    async def get_citations(self, doi: str, limit: int = 100) -> List[PaperMetadata]:
        """Return papers that cite the paper identified by *doi*.

        Args:
            doi: DOI of the cited paper.
            limit: Maximum number of results; capped at 100.

        Returns:
            The citing papers (unparseable entries are dropped), or ``[]``
            on failure.
        """
        try:
            params = {
                "query": f"REF({doi})",
                "count": min(limit, 100),
                "view": "STANDARD"
            }
            response = await self._make_request(
                f"{self.base_url}/search/scopus",
                params=params
            )
            if not response or "search-results" not in response:
                return []
            return self._parse_entries(response["search-results"].get("entry", []))
        except Exception as e:
            print(f"获取引用文献时发生错误: {str(e)}")
            return []

    async def get_references(self, doi: str) -> List[PaperMetadata]:
        """Return the papers referenced by the paper identified by *doi*.

        Returns:
            The parsed references, or ``[]`` on failure.
        """
        try:
            response = await self._make_request(
                f"{self.base_url}/abstract/doi/{doi}/references",
                params={"view": "STANDARD"}
            )
            if not response or "references" not in response:
                return []
            references = response["references"].get("reference", []) or []
            parsed = (self._parse_reference(ref) for ref in references)
            return [paper for paper in parsed if paper is not None]
        except Exception as e:
            print(f"获取参考文献时发生错误: {str(e)}")
            return []

    def _parse_reference(self, ref: Dict) -> Optional[PaperMetadata]:
        """Convert one reference record into a ``PaperMetadata``.

        Returns:
            The parsed reference, or ``None`` when it cannot be converted.
        """
        try:
            authors = []
            if "author-list" in ref:
                raw_authors = (ref["author-list"] or {}).get("author") or []
                # A single author arrives as a bare dict instead of a list.
                if isinstance(raw_authors, dict):
                    raw_authors = [raw_authors]
                authors = [
                    f"{author.get('ce:given-name', '')} {author.get('ce:surname', '')}".strip()
                    for author in raw_authors
                ]

            return PaperMetadata(
                title=ref.get("ce:title", ""),
                authors=authors,
                abstract="",  # reference records usually carry no abstract
                year=self._extract_year(ref.get("prism:coverDate")),
                doi=ref.get("prism:doi"),
                url=None,
                citations=None,
                venue=ref.get("prism:publicationName"),
                institutions=[],
                venue_type="unknown",
                venue_name=ref.get("prism:publicationName"),
                venue_info={}
            )
        except Exception as e:
            print(f"解析参考文献时发生错误: {str(e)}")
            return None

    async def search_by_author(
        self,
        author: str,
        limit: int = 100,
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search papers by author name (``AUTHOR-NAME(...)`` query)."""
        query = f"AUTHOR-NAME({author})"
        return await self.search(query, limit=limit, start_year=start_year)

    async def search_by_affiliation(
        self,
        affiliation: str,
        limit: int = 100,
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search papers by affiliation ID (``AF-ID(...)`` query)."""
        query = f"AF-ID({affiliation})"
        return await self.search(query, limit=limit, start_year=start_year)

    async def search_by_venue(
        self,
        venue: str,
        limit: int = 100,
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search papers by journal/conference title (``SRCTITLE(...)`` query)."""
        query = f"SRCTITLE({venue})"
        return await self.search(query, limit=limit, start_year=start_year)

    async def test_api_access(self):
        """Probe the search and abstract endpoints and print the outcome."""
        print(f"\n测试API密钥: {self._mask_key(self.api_key)}")

        # Probe 1: basic search.
        basic_params = {
            "query": "test",
            "count": 1,
            "view": "STANDARD"
        }
        print("\n1. 测试基础搜索...")
        response = await self._make_request(
            f"{self.base_url}/search/scopus",
            params=basic_params
        )
        if response:
            print("基础搜索成功")
            # Fall back to one empty entry so that an empty or missing
            # "entry" list cannot raise IndexError.
            entries = response.get("search-results", {}).get("entry") or [{}]
            print("可用字段:", list(entries[0].keys()))

        # Probe 2: single-article retrieval with a sample DOI.
        print("\n2. 测试文章详情访问...")
        test_doi = "10.1016/j.artint.2021.103535"
        response = await self._make_request(
            f"{self.base_url}/abstract/doi/{test_doi}",
            params={"view": "STANDARD"}
        )
        if response:
            print("文章详情访问成功")
        else:
            print("文章详情访问失败")

    async def get_paper_details(self, paper_id: str) -> Optional[PaperMetadata]:
        """Return detailed information for a paper.

        Not supported with the current API permissions.

        Args:
            paper_id: Paper identifier (unused).

        Returns:
            Always ``None``.
        """
        return None

    async def fetch_abstract(self, doi: str) -> Optional[str]:
        """Fetch a paper's abstract through the Scopus Abstract API.

        Args:
            doi: DOI of the paper.

        Returns:
            The abstract text (``""`` when the record has none), or ``None``
            when the request fails or the response has an unexpected shape.
        """
        try:
            # The Abstract API is used here, not the Search API, with the
            # FULL view.
            response = await self._make_request(
                f"{self.base_url}/abstract/doi/{doi}",
                params={
                    "view": "FULL"
                }
            )
            if response and "abstracts-retrieval-response" in response:
                coredata = response["abstracts-retrieval-response"].get("coredata", {})
                return coredata.get("dc:description", "")
            return None
        except Exception as e:
            print(f"获取摘要时发生错误: {str(e)}")
            return None
def _print_paper(index, paper, label="论文", show_url=False,
                 show_citations=True, show_venue_info=False):
    """Print a numbered summary of one paper for the demo below.

    Args:
        index: 1-based position shown in the header line.
        paper: Object exposing ``title``, ``authors``, ``year``, ``doi``,
            ``venue`` and, depending on the flags, ``url``, ``citations``
            and ``venue_info``.
        label: Heading text placed before the index.
        show_url: Also print the paper URL.
        show_citations: Also print the citation count.
        show_venue_info: Also print the non-empty ``venue_info`` items.
    """
    print(f"\n--- {label} {index} ---")
    print(f"标题: {paper.title}")
    print(f"作者: {', '.join(paper.authors)}")
    print(f"发表年份: {paper.year}")
    print(f"DOI: {paper.doi}")
    if show_url:
        print(f"URL: {paper.url}")
    if show_citations:
        print(f"引用次数: {paper.citations}")
    print(f"期刊/会议: {paper.venue}")
    if show_venue_info:
        print("期刊信息:")
        for key, value in paper.venue_info.items():
            if value:  # only print non-empty values
                print(f" - {key}: {value}")


async def example_usage():
    """Demonstrate ElsevierSource against the live API.

    Runs an access probe followed by seven examples (search, citations,
    references, author search, affiliation search, abstract retrieval, and
    abstracts inside search results). Any exception is printed together
    with its traceback instead of being propagated.
    """
    elsevier = ElsevierSource()
    try:
        # Check API access first.
        print("\n=== 测试API访问权限 ===")
        await elsevier.test_api_access()

        # Example 1: basic search.
        print("\n=== 示例1搜索机器学习相关论文 ===")
        papers = await elsevier.search("machine learning", limit=3)
        for i, paper in enumerate(papers, 1):
            _print_paper(i, paper, show_url=True, show_venue_info=True)

        # Example 2: papers citing the first hit.
        if papers and papers[0].doi:
            print("\n=== 示例2获取引用该论文的文献 ===")
            citations = await elsevier.get_citations(papers[0].doi, limit=3)
            # Unparseable entries may come back as None; skip them so one
            # bad record does not abort the remaining examples.
            citations = [paper for paper in citations if paper is not None]
            for i, paper in enumerate(citations, 1):
                _print_paper(i, paper, label="引用论文")

        # Example 3: references of the first hit.
        if papers and papers[0].doi:
            print("\n=== 示例3获取论文的参考文献 ===")
            references = await elsevier.get_references(papers[0].doi)
            for i, paper in enumerate(references[:3], 1):
                _print_paper(i, paper, label="参考文献", show_citations=False)

        # Example 4: search by author.
        print("\n=== 示例4按作者搜索 ===")
        author_papers = await elsevier.search_by_author("Hinton G", limit=3)
        for i, paper in enumerate(author_papers, 1):
            _print_paper(i, paper)

        # Example 5: search by affiliation (the ID below is MIT's).
        print("\n=== 示例5按机构搜索 ===")
        affiliation_papers = await elsevier.search_by_affiliation("60027950", limit=3)
        for i, paper in enumerate(affiliation_papers, 1):
            _print_paper(i, paper)

        # Example 6: fetch a single abstract.
        print("\n=== 示例6获取论文摘要 ===")
        test_doi = "10.1016/j.artint.2021.103535"
        abstract = await elsevier.fetch_abstract(test_doi)
        if abstract:
            print(f"摘要: {abstract[:200]}...")  # first 200 characters only
        else:
            print("无法获取摘要")

        # Example 7: abstracts attached to search results.
        print("\n=== 示例7搜索结果中的摘要 ===")
        papers = await elsevier.search("machine learning", limit=1)
        for paper in papers:
            print(f"标题: {paper.title}")
            print(f"摘要: {paper.abstract[:200]}..." if paper.abstract else "摘要: 无")
    except Exception as e:
        print(f"发生错误: {str(e)}")
        import traceback
        print(traceback.format_exc())
if __name__ == "__main__":
    # Script entry point: run the demo coroutine on a fresh event loop.
    demo_coroutine = example_usage()
    asyncio.run(demo_coroutine)