镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-05 22:16:49 +00:00
* stage academic conversation * stage document conversation * fix buggy gradio version * file dynamic load * merge more academic plugins * accelerate nltk * feat: 为predict函数添加文件和URL读取功能 - 添加URL检测和网页内容提取功能,支持自动提取网页文本 - 添加文件路径识别和文件内容读取功能,支持private_upload路径格式 - 集成WebTextExtractor处理网页内容提取 - 集成TextContentLoader处理本地文件读取 - 支持文件路径与问题组合的智能处理 * back * block unstable --------- Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
449 行
16 KiB
Python
449 行
16 KiB
Python
from typing import List, Optional, Dict, Union
|
||
from datetime import datetime
|
||
import aiohttp
|
||
import asyncio
|
||
from crazy_functions.review_fns.data_sources.base_source import DataSource, PaperMetadata
|
||
import json
|
||
from tqdm import tqdm
|
||
import random
|
||
|
||
class ElsevierSource(DataSource):
    """Elsevier (Scopus) API data source.

    Queries the Scopus search endpoint and the abstract endpoint below
    ``https://api.elsevier.com/content`` and converts the JSON payloads into
    ``PaperMetadata`` objects.
    """

    # Pool of API keys. One is picked at random when the caller supplies none,
    # and the remaining ones serve as fallbacks after an HTTP 401.
    API_KEYS = [
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    ]

    def __init__(self, api_key: Optional[str] = None):
        """Create the data source.

        Args:
            api_key: Elsevier API key. When omitted (or empty) a key is chosen
                at random from ``API_KEYS``.
        """
        self.api_key = api_key or random.choice(self.API_KEYS)
        self._initialize()

    def _initialize(self) -> None:
        """Set the base URL and the default request headers."""
        self.base_url = "https://api.elsevier.com/content"
        self.headers = {
            "X-ELS-APIKey": self.api_key,
            "Accept": "application/json",
            "Content-Type": "application/json",
            # Additional headers that may be required.
            "X-ELS-Insttoken": "",  # institutional token, if one is available
        }

    @staticmethod
    def _mask_key(key: Optional[str]) -> str:
        """Return a log-safe form of *key* that reveals only its last four characters."""
        if not key:
            return "<none>"
        if len(key) <= 4:
            return "*" * len(key)
        return "*" * (len(key) - 4) + key[-4:]

    async def _make_request(self, url: str, params: Optional[Dict] = None) -> Optional[Dict]:
        """Send a GET request and return the decoded JSON body.

        On HTTP 401 the request is retried with a not-yet-tried key from
        ``API_KEYS`` and the instance switches to that key. Every key is
        attempted at most once per call, so the retry loop always terminates.

        Args:
            url: Request URL.
            params: Query-string parameters.

        Returns:
            The parsed JSON response on HTTP 200; ``None`` on any other status,
            when no untried key is left, or when the request raises.
        """
        attempted = set()
        try:
            while True:
                attempted.add(self.api_key)
                async with aiohttp.ClientSession(headers=self.headers) as session:
                    async with session.get(url, params=params) as response:
                        if response.status == 200:
                            return await response.json()
                        status = response.status
                        error_text = await response.text()

                # The session is closed here, before any retry is started.
                print(f"请求失败: {status}")
                print(f"错误详情: {error_text}")
                if status != 401:
                    return None

                # Keys are masked so that credentials never end up in logs.
                print(f"使用的API密钥: {self._mask_key(self.api_key)}")
                fallbacks = [key for key in self.API_KEYS if key not in attempted]
                if not fallbacks:
                    print("没有其他可用的API密钥,停止重试")
                    return None

                new_key = random.choice(fallbacks)
                print(f"尝试切换到新的API密钥: {self._mask_key(new_key)}")
                self.api_key = new_key
                self.headers["X-ELS-APIKey"] = new_key
        except Exception as e:
            # Best-effort boundary: callers treat None as "no data".
            print(f"请求发生错误: {str(e)}")
            return None

    async def search(
        self,
        query: str,
        limit: int = 100,
        sort_by: str = "relevance",
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search Scopus and return the matching papers.

        After parsing the hits, one abstract request is issued per paper that
        has a DOI (sequentially) to fill in ``abstract``.

        Args:
            query: Scopus query string.
            limit: Maximum number of results; values above 100 are capped at 100.
            sort_by: ``"date"`` or ``"cited"`` select a descending sort; any
                other value sends no ``sort`` parameter.
            start_year: When truthy, restricts results via the ``date`` parameter.

        Returns:
            The parsed papers, or an empty list on failure.
        """
        try:
            params = {
                "query": query,
                "count": min(limit, 100),
                "view": "STANDARD",
                # dc:description is omitted because it is not available in the STANDARD view.
                "field": "dc:title,dc:creator,prism:doi,prism:coverDate,citedby-count,prism:publicationName"
            }

            # Year filter.
            # NOTE(review): confirm that the API accepts "present" as the upper bound.
            if start_year:
                params["date"] = f"{start_year}-present"

            # Sort order.
            if sort_by == "date":
                params["sort"] = "-coverDate"
            elif sort_by == "cited":
                params["sort"] = "-citedby-count"

            response = await self._make_request(
                f"{self.base_url}/search/scopus",
                params=params
            )

            if not response or "search-results" not in response:
                return []

            entries = response["search-results"].get("entry", [])
            papers = [paper for paper in (self._parse_entry(entry) for entry in entries) if paper is not None]

            # Try to fetch an abstract for every paper that has a DOI.
            for paper in papers:
                if paper.doi:
                    paper.abstract = await self.fetch_abstract(paper.doi) or ""

            return papers

        except Exception as e:
            print(f"搜索论文时发生错误: {str(e)}")
            return []

    def _parse_entry(self, entry: Dict) -> Optional[PaperMetadata]:
        """Convert one Scopus search entry into ``PaperMetadata``.

        Args:
            entry: One element of ``search-results.entry``.

        Returns:
            The parsed paper, or ``None`` when the entry is an error placeholder
            or cannot be parsed.
        """
        try:
            # NOTE(review): Scopus is understood to report an empty result set as
            # a single placeholder entry carrying an "error" field; such entries
            # hold no bibliographic data and are skipped - confirm against the API docs.
            if "error" in entry:
                return None

            # Only the first author (dc:creator) is requested/available here.
            creator = entry.get("dc:creator")
            authors = [creator] if creator else []

            # Publication year is the first four characters of the cover date.
            year = None
            if "prism:coverDate" in entry:
                try:
                    year = int(entry["prism:coverDate"][:4])
                except (TypeError, ValueError):
                    year = None

            # A missing or malformed citation count should not discard the entry.
            try:
                citations = int(entry.get("citedby-count") or 0)
            except (TypeError, ValueError):
                citations = 0

            # Simplified venue information.
            venue_info = {
                'source_id': entry.get("source-id"),
                'issn': entry.get("prism:issn")
            }

            return PaperMetadata(
                title=entry.get("dc:title", ""),
                authors=authors,
                abstract=entry.get("dc:description", ""),  # abstract from the response, if present
                year=year,
                doi=entry.get("prism:doi"),
                url=entry.get("prism:url"),
                citations=citations,
                venue=entry.get("prism:publicationName"),
                institutions=[],  # institution data is not collected
                venue_type="",
                venue_name=entry.get("prism:publicationName"),
                venue_info=venue_info
            )

        except Exception as e:
            print(f"解析条目时发生错误: {str(e)}")
            return None

    async def get_citations(self, doi: str, limit: int = 100) -> List[PaperMetadata]:
        """Return papers that cite the paper identified by *doi*.

        Args:
            doi: DOI of the cited paper; used in a ``REF(...)`` query.
            limit: Maximum number of results; values above 100 are capped at 100.

        Returns:
            The parsed citing papers (unparseable entries are dropped), or an
            empty list on failure.
        """
        try:
            params = {
                "query": f"REF({doi})",
                "count": min(limit, 100),
                "view": "STANDARD"
            }

            response = await self._make_request(
                f"{self.base_url}/search/scopus",
                params=params
            )

            if not response or "search-results" not in response:
                return []

            entries = response["search-results"].get("entry", [])
            # _parse_entry returns None for entries it cannot use; filter them
            # out so the result matches the declared return type, as search() does.
            return [paper for paper in (self._parse_entry(entry) for entry in entries) if paper is not None]

        except Exception as e:
            print(f"获取引用文献时发生错误: {str(e)}")
            return []

    async def get_references(self, doi: str) -> List[PaperMetadata]:
        """Return the papers referenced by the paper identified by *doi*.

        Args:
            doi: DOI of the citing paper.

        Returns:
            The parsed references, or an empty list on failure.
        """
        try:
            response = await self._make_request(
                f"{self.base_url}/abstract/doi/{doi}/references",
                params={"view": "STANDARD"}
            )

            # NOTE(review): this expects a top-level "references" object in the
            # payload; confirm against the Abstract Retrieval API response format.
            if not response or "references" not in response:
                return []

            references = response["references"].get("reference", [])
            papers = [paper for paper in (self._parse_reference(ref) for ref in references) if paper is not None]
            return papers

        except Exception as e:
            print(f"获取参考文献时发生错误: {str(e)}")
            return []

    def _parse_reference(self, ref: Dict) -> Optional[PaperMetadata]:
        """Convert one reference record into ``PaperMetadata``.

        Args:
            ref: One element of ``references.reference``.

        Returns:
            The parsed reference, or ``None`` when it cannot be parsed.
        """
        try:
            authors = []
            if "author-list" in ref:
                author_list = ref["author-list"].get("author", [])
                # A single author may arrive as a bare object instead of a list.
                if not isinstance(author_list, list):
                    author_list = [author_list]
                full_names = (
                    f"{author.get('ce:given-name', '')} {author.get('ce:surname', '')}".strip()
                    for author in author_list
                )
                # Drop names that are empty once a missing part is stripped.
                authors = [name for name in full_names if name]

            year = None
            if "prism:coverDate" in ref:
                try:
                    year = int(ref["prism:coverDate"][:4])
                except (TypeError, ValueError):
                    year = None

            return PaperMetadata(
                title=ref.get("ce:title", ""),
                authors=authors,
                abstract="",  # reference records usually carry no abstract
                year=year,
                doi=ref.get("prism:doi"),
                url=None,
                citations=None,
                venue=ref.get("prism:publicationName"),
                institutions=[],
                venue_type="unknown",
                venue_name=ref.get("prism:publicationName"),
                venue_info={}
            )

        except Exception as e:
            print(f"解析参考文献时发生错误: {str(e)}")
            return None

    async def search_by_author(
        self,
        author: str,
        limit: int = 100,
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search papers by author name using an ``AUTHOR-NAME(...)`` query."""
        query = f"AUTHOR-NAME({author})"
        return await self.search(query, limit=limit, start_year=start_year)

    async def search_by_affiliation(
        self,
        affiliation: str,
        limit: int = 100,
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search papers by affiliation using an ``AF-ID(...)`` query."""
        query = f"AF-ID({affiliation})"
        return await self.search(query, limit=limit, start_year=start_year)

    async def search_by_venue(
        self,
        venue: str,
        limit: int = 100,
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search papers by journal/conference using a ``SRCTITLE(...)`` query."""
        query = f"SRCTITLE({venue})"
        return await self.search(query, limit=limit, start_year=start_year)

    async def test_api_access(self):
        """Probe the search and abstract endpoints and print the outcome."""
        # The key is masked so that credentials never end up in logs.
        print(f"\n测试API密钥: {self._mask_key(self.api_key)}")

        # Test 1: basic search.
        basic_params = {
            "query": "test",
            "count": 1,
            "view": "STANDARD"
        }
        print("\n1. 测试基础搜索...")
        response = await self._make_request(
            f"{self.base_url}/search/scopus",
            params=basic_params
        )
        if response:
            print("基础搜索成功")
            # Fall back to one empty record so an empty/missing entry list
            # cannot raise IndexError.
            entries = response.get("search-results", {}).get("entry") or [{}]
            print("可用字段:", list(entries[0].keys()))

        # Test 2: single-article access.
        print("\n2. 测试文章详情访问...")
        test_doi = "10.1016/j.artint.2021.103535"  # sample DOI
        response = await self._make_request(
            f"{self.base_url}/abstract/doi/{test_doi}",
            params={"view": "STANDARD"}  # STANDARD view
        )
        if response:
            print("文章详情访问成功")
        else:
            print("文章详情访问失败")

    async def get_paper_details(self, paper_id: str) -> Optional[PaperMetadata]:
        """Return detailed information about a paper.

        Not implemented: the original author notes that the current API
        permissions do not support fetching details, so this always returns
        ``None``.

        Args:
            paper_id: Paper identifier (unused).

        Returns:
            Always ``None``.
        """
        return None

    async def fetch_abstract(self, doi: str) -> Optional[str]:
        """Fetch a paper's abstract from the abstract endpoint (``FULL`` view).

        Args:
            doi: DOI of the paper.

        Returns:
            The ``dc:description`` text from ``coredata`` (empty string when the
            field is absent), or ``None`` when the request fails or the payload
            lacks ``abstracts-retrieval-response``.
        """
        try:
            # The abstract endpoint is used here rather than the search endpoint.
            response = await self._make_request(
                f"{self.base_url}/abstract/doi/{doi}",
                params={
                    "view": "FULL"
                }
            )

            if response and "abstracts-retrieval-response" in response:
                coredata = response["abstracts-retrieval-response"].get("coredata", {})
                return coredata.get("dc:description", "")

            return None

        except Exception as e:
            print(f"获取摘要时发生错误: {str(e)}")
            return None
|
||
|
||
async def example_usage():
    """Run a demo of ElsevierSource, printing every result to stdout.

    Covers the API access probe, keyword search, citations, references,
    author and affiliation searches, and abstract retrieval. Any exception is
    reported together with its traceback instead of propagating.
    """

    def show(heading, item, *, with_url=False, with_citations=True, with_venue_info=False):
        """Print one paper; the flags select the optional lines."""
        print(heading)
        print(f"标题: {item.title}")
        print(f"作者: {', '.join(item.authors)}")
        print(f"发表年份: {item.year}")
        print(f"DOI: {item.doi}")
        if with_url:
            print(f"URL: {item.url}")
        if with_citations:
            print(f"引用次数: {item.citations}")
        print(f"期刊/会议: {item.venue}")
        if with_venue_info:
            print("期刊信息:")
            for field_name, field_value in item.venue_info.items():
                if not field_value:
                    continue  # only non-empty values are printed
                print(f"  - {field_name}: {field_value}")

    elsevier = ElsevierSource()

    try:
        # Check API access first.
        print("\n=== 测试API访问权限 ===")
        await elsevier.test_api_access()

        # Example 1: basic search.
        print("\n=== 示例1:搜索机器学习相关论文 ===")
        papers = await elsevier.search("machine learning", limit=3)
        for idx, found in enumerate(papers, 1):
            show(f"\n--- 论文 {idx} ---", found, with_url=True, with_venue_info=True)

        # Examples 2 and 3 both need the DOI of the first hit.
        first_doi = papers[0].doi if papers else None

        # Example 2: papers citing the first hit.
        if first_doi:
            print("\n=== 示例2:获取引用该论文的文献 ===")
            citing = await elsevier.get_citations(first_doi, limit=3)
            for idx, found in enumerate(citing, 1):
                show(f"\n--- 引用论文 {idx} ---", found)

        # Example 3: references of the first hit.
        if first_doi:
            print("\n=== 示例3:获取论文的参考文献 ===")
            cited = await elsevier.get_references(first_doi)
            for idx, found in enumerate(cited[:3], 1):
                show(f"\n--- 参考文献 {idx} ---", found, with_citations=False)

        # Example 4: search by author.
        print("\n=== 示例4:按作者搜索 ===")
        by_author = await elsevier.search_by_author("Hinton G", limit=3)
        for idx, found in enumerate(by_author, 1):
            show(f"\n--- 论文 {idx} ---", found)

        # Example 5: search by affiliation.
        print("\n=== 示例5:按机构搜索 ===")
        by_affiliation = await elsevier.search_by_affiliation("60027950", limit=3)  # MIT affiliation ID
        for idx, found in enumerate(by_affiliation, 1):
            show(f"\n--- 论文 {idx} ---", found)

        # Example 6: fetch a single abstract.
        print("\n=== 示例6:获取论文摘要 ===")
        sample_doi = "10.1016/j.artint.2021.103535"
        summary = await elsevier.fetch_abstract(sample_doi)
        if not summary:
            print("无法获取摘要")
        else:
            print(f"摘要: {summary[:200]}...")  # first 200 characters only

        # Example 7: abstracts attached to search results.
        print("\n=== 示例7:搜索结果中的摘要 ===")
        papers = await elsevier.search("machine learning", limit=1)
        for found in papers:
            print(f"标题: {found.title}")
            if found.abstract:
                print(f"摘要: {found.abstract[:200]}...")
            else:
                print("摘要: 无")

    except Exception as e:
        print(f"发生错误: {str(e)}")
        import traceback
        print(traceback.format_exc())
|
||
|
||
# Script entry point: run the demo coroutine when executed directly.
if __name__ == "__main__":
    asyncio.run(example_usage())