镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-07 06:56:48 +00:00
Master 4.0 (#2210)
* stage academic conversation * stage document conversation * fix buggy gradio version * file dynamic load * merge more academic plugins * accelerate nltk * feat: 为predict函数添加文件和URL读取功能 - 添加URL检测和网页内容提取功能,支持自动提取网页文本 - 添加文件路径识别和文件内容读取功能,支持private_upload路径格式 - 集成WebTextExtractor处理网页内容提取 - 集成TextContentLoader处理本地文件读取 - 支持文件路径与问题组合的智能处理 * back * block unstable --------- Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
这个提交包含在:
@@ -0,0 +1,400 @@
|
||||
from typing import List, Optional, Dict, Union
|
||||
from datetime import datetime
|
||||
import aiohttp
|
||||
import random
|
||||
from .base_source import DataSource, PaperMetadata
|
||||
from tqdm import tqdm
|
||||
|
||||
class ScopusSource(DataSource):
|
||||
"""Scopus API实现"""
|
||||
|
||||
# 定义API密钥列表
|
||||
API_KEYS = [
|
||||
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
||||
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
|
||||
]
|
||||
|
||||
def __init__(self, api_key: str = None):
|
||||
"""初始化
|
||||
|
||||
Args:
|
||||
api_key: Scopus API密钥,如果不提供则从预定义列表中随机选择
|
||||
"""
|
||||
self.api_key = api_key or random.choice(self.API_KEYS)
|
||||
self._initialize()
|
||||
|
||||
def _initialize(self) -> None:
|
||||
"""初始化基础URL和请求头"""
|
||||
self.base_url = "https://api.elsevier.com/content"
|
||||
self.headers = {
|
||||
"X-ELS-APIKey": self.api_key,
|
||||
"Accept": "application/json"
|
||||
}
|
||||
|
||||
async def _make_request(self, url: str, params: Dict = None) -> Optional[Dict]:
|
||||
"""发送HTTP请求
|
||||
|
||||
Args:
|
||||
url: 请求URL
|
||||
params: 查询参数
|
||||
|
||||
Returns:
|
||||
响应JSON数据
|
||||
"""
|
||||
try:
|
||||
async with aiohttp.ClientSession(headers=self.headers) as session:
|
||||
async with session.get(url, params=params) as response:
|
||||
if response.status == 200:
|
||||
return await response.json()
|
||||
else:
|
||||
print(f"请求失败: {response.status}")
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f"请求发生错误: {str(e)}")
|
||||
return None
|
||||
|
||||
def _parse_paper_data(self, data: Dict) -> PaperMetadata:
|
||||
"""解析Scopus API返回的数据
|
||||
|
||||
Args:
|
||||
data: Scopus API返回的论文数据
|
||||
|
||||
Returns:
|
||||
解析后的论文元数据
|
||||
"""
|
||||
try:
|
||||
# 提取基本信息
|
||||
title = data.get("dc:title", "")
|
||||
|
||||
# 提取作者信息
|
||||
authors = []
|
||||
if "author" in data:
|
||||
if isinstance(data["author"], list):
|
||||
for author in data["author"]:
|
||||
if "given-name" in author and "surname" in author:
|
||||
authors.append(f"{author['given-name']} {author['surname']}")
|
||||
elif "indexed-name" in author:
|
||||
authors.append(author["indexed-name"])
|
||||
elif isinstance(data["author"], dict):
|
||||
if "given-name" in data["author"] and "surname" in data["author"]:
|
||||
authors.append(f"{data['author']['given-name']} {data['author']['surname']}")
|
||||
elif "indexed-name" in data["author"]:
|
||||
authors.append(data["author"]["indexed-name"])
|
||||
|
||||
# 提取摘要
|
||||
abstract = data.get("dc:description", "")
|
||||
|
||||
# 提取年份
|
||||
year = None
|
||||
if "prism:coverDate" in data:
|
||||
try:
|
||||
year = int(data["prism:coverDate"][:4])
|
||||
except:
|
||||
pass
|
||||
|
||||
# 提取DOI
|
||||
doi = data.get("prism:doi")
|
||||
|
||||
# 提取引用次数
|
||||
citations = data.get("citedby-count")
|
||||
if citations:
|
||||
try:
|
||||
citations = int(citations)
|
||||
except:
|
||||
citations = None
|
||||
|
||||
# 提取期刊信息
|
||||
venue = data.get("prism:publicationName")
|
||||
|
||||
# 提取机构信息
|
||||
institutions = []
|
||||
if "affiliation" in data:
|
||||
if isinstance(data["affiliation"], list):
|
||||
for aff in data["affiliation"]:
|
||||
if "affilname" in aff:
|
||||
institutions.append(aff["affilname"])
|
||||
elif isinstance(data["affiliation"], dict):
|
||||
if "affilname" in data["affiliation"]:
|
||||
institutions.append(data["affiliation"]["affilname"])
|
||||
|
||||
# 构建venue信息
|
||||
venue_info = {
|
||||
"issn": data.get("prism:issn"),
|
||||
"eissn": data.get("prism:eIssn"),
|
||||
"volume": data.get("prism:volume"),
|
||||
"issue": data.get("prism:issueIdentifier"),
|
||||
"page_range": data.get("prism:pageRange"),
|
||||
"article_number": data.get("article-number"),
|
||||
"publication_date": data.get("prism:coverDate")
|
||||
}
|
||||
|
||||
return PaperMetadata(
|
||||
title=title,
|
||||
authors=authors,
|
||||
abstract=abstract,
|
||||
year=year,
|
||||
doi=doi,
|
||||
url=data.get("link", [{}])[0].get("@href"),
|
||||
citations=citations,
|
||||
venue=venue,
|
||||
institutions=institutions,
|
||||
venue_type="journal",
|
||||
venue_name=venue,
|
||||
venue_info=venue_info
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
print(f"解析论文数据时发生错误: {str(e)}")
|
||||
return None
|
||||
|
||||
async def search(
|
||||
self,
|
||||
query: str,
|
||||
limit: int = 100,
|
||||
sort_by: str = None,
|
||||
start_year: int = None
|
||||
) -> List[PaperMetadata]:
|
||||
"""搜索论文
|
||||
|
||||
Args:
|
||||
query: 搜索关键词
|
||||
limit: 返回结果数量限制
|
||||
sort_by: 排序方式 ('relevance', 'date', 'citations')
|
||||
start_year: 起始年份
|
||||
|
||||
Returns:
|
||||
论文列表
|
||||
"""
|
||||
try:
|
||||
# 构建查询参数
|
||||
params = {
|
||||
"query": query,
|
||||
"count": min(limit, 100), # Scopus API单次请求限制
|
||||
"start": 0
|
||||
}
|
||||
|
||||
# 添加年份过滤
|
||||
if start_year:
|
||||
params["date"] = f"{start_year}-present"
|
||||
|
||||
# 添加排序
|
||||
if sort_by:
|
||||
sort_map = {
|
||||
"relevance": "-score",
|
||||
"date": "-coverDate",
|
||||
"citations": "-citedby-count"
|
||||
}
|
||||
if sort_by in sort_map:
|
||||
params["sort"] = sort_map[sort_by]
|
||||
|
||||
# 发送请求
|
||||
url = f"{self.base_url}/search/scopus"
|
||||
response = await self._make_request(url, params)
|
||||
|
||||
if not response or "search-results" not in response:
|
||||
return []
|
||||
|
||||
# 解析结果
|
||||
results = response["search-results"].get("entry", [])
|
||||
papers = []
|
||||
|
||||
for result in results:
|
||||
paper = self._parse_paper_data(result)
|
||||
if paper:
|
||||
papers.append(paper)
|
||||
|
||||
return papers
|
||||
|
||||
except Exception as e:
|
||||
print(f"搜索论文时发生错误: {str(e)}")
|
||||
return []
|
||||
|
||||
async def get_paper_details(self, paper_id: str) -> Optional[PaperMetadata]:
|
||||
"""获取论文详情
|
||||
|
||||
Args:
|
||||
paper_id: Scopus ID或DOI
|
||||
|
||||
Returns:
|
||||
论文详情
|
||||
"""
|
||||
try:
|
||||
# 判断是否为DOI
|
||||
if "/" in paper_id:
|
||||
url = f"{self.base_url}/article/doi/{paper_id}"
|
||||
else:
|
||||
url = f"{self.base_url}/abstract/scopus_id/{paper_id}"
|
||||
|
||||
response = await self._make_request(url)
|
||||
|
||||
if not response or "abstracts-retrieval-response" not in response:
|
||||
return None
|
||||
|
||||
data = response["abstracts-retrieval-response"]
|
||||
return self._parse_paper_data(data)
|
||||
|
||||
except Exception as e:
|
||||
print(f"获取论文详情时发生错误: {str(e)}")
|
||||
return None
|
||||
|
||||
async def get_citations(self, paper_id: str) -> List[PaperMetadata]:
|
||||
"""获取引用该论文的文献
|
||||
|
||||
Args:
|
||||
paper_id: Scopus ID
|
||||
|
||||
Returns:
|
||||
引用论文列表
|
||||
"""
|
||||
try:
|
||||
url = f"{self.base_url}/abstract/citations/{paper_id}"
|
||||
response = await self._make_request(url)
|
||||
|
||||
if not response or "citing-papers" not in response:
|
||||
return []
|
||||
|
||||
results = response["citing-papers"].get("papers", [])
|
||||
papers = []
|
||||
|
||||
for result in results:
|
||||
paper = self._parse_paper_data(result)
|
||||
if paper:
|
||||
papers.append(paper)
|
||||
|
||||
return papers
|
||||
|
||||
except Exception as e:
|
||||
print(f"获取引用信息时发生错误: {str(e)}")
|
||||
return []
|
||||
|
||||
async def get_references(self, paper_id: str) -> List[PaperMetadata]:
|
||||
"""获取该论文引用的文献
|
||||
|
||||
Args:
|
||||
paper_id: Scopus ID
|
||||
|
||||
Returns:
|
||||
参考文献列表
|
||||
"""
|
||||
try:
|
||||
url = f"{self.base_url}/abstract/references/{paper_id}"
|
||||
response = await self._make_request(url)
|
||||
|
||||
if not response or "references" not in response:
|
||||
return []
|
||||
|
||||
results = response["references"].get("reference", [])
|
||||
papers = []
|
||||
|
||||
for result in results:
|
||||
paper = self._parse_paper_data(result)
|
||||
if paper:
|
||||
papers.append(paper)
|
||||
|
||||
return papers
|
||||
|
||||
except Exception as e:
|
||||
print(f"获取参考文献时发生错误: {str(e)}")
|
||||
return []
|
||||
|
||||
async def search_by_author(
|
||||
self,
|
||||
author: str,
|
||||
limit: int = 100,
|
||||
start_year: int = None
|
||||
) -> List[PaperMetadata]:
|
||||
"""按作者搜索论文"""
|
||||
query = f"AUTHOR-NAME({author})"
|
||||
if start_year:
|
||||
query += f" AND PUBYEAR > {start_year}"
|
||||
return await self.search(query, limit=limit)
|
||||
|
||||
async def search_by_journal(
|
||||
self,
|
||||
journal: str,
|
||||
limit: int = 100,
|
||||
start_year: int = None
|
||||
) -> List[PaperMetadata]:
|
||||
"""按期刊搜索论文"""
|
||||
query = f"SRCTITLE({journal})"
|
||||
if start_year:
|
||||
query += f" AND PUBYEAR > {start_year}"
|
||||
return await self.search(query, limit=limit)
|
||||
|
||||
async def get_latest_papers(
|
||||
self,
|
||||
days: int = 7,
|
||||
limit: int = 100
|
||||
) -> List[PaperMetadata]:
|
||||
"""获取最新论文"""
|
||||
query = f"LOAD-DATE > NOW() - {days}d"
|
||||
return await self.search(query, limit=limit, sort_by="date")
|
||||
|
||||
async def example_usage():
|
||||
"""ScopusSource使用示例"""
|
||||
scopus = ScopusSource()
|
||||
|
||||
try:
|
||||
# 示例1:基本搜索
|
||||
print("\n=== 示例1:搜索机器学习相关论文 ===")
|
||||
papers = await scopus.search("machine learning", limit=3)
|
||||
print(f"\n找到 {len(papers)} 篇相关论文:")
|
||||
for i, paper in enumerate(papers, 1):
|
||||
print(f"\n论文 {i}:")
|
||||
print(f"标题: {paper.title}")
|
||||
print(f"作者: {', '.join(paper.authors)}")
|
||||
print(f"发表年份: {paper.year}")
|
||||
print(f"发表期刊: {paper.venue}")
|
||||
print(f"引用次数: {paper.citations}")
|
||||
print(f"DOI: {paper.doi}")
|
||||
if paper.abstract:
|
||||
print(f"摘要:\n{paper.abstract}")
|
||||
print("-" * 80)
|
||||
|
||||
# 示例2:按作者搜索
|
||||
print("\n=== 示例2:搜索特定作者的论文 ===")
|
||||
author_papers = await scopus.search_by_author("Hinton G.", limit=3)
|
||||
print(f"\n找到 {len(author_papers)} 篇 Hinton 的论文:")
|
||||
for i, paper in enumerate(author_papers, 1):
|
||||
print(f"\n论文 {i}:")
|
||||
print(f"标题: {paper.title}")
|
||||
print(f"作者: {', '.join(paper.authors)}")
|
||||
print(f"发表年份: {paper.year}")
|
||||
print(f"发表期刊: {paper.venue}")
|
||||
print(f"引用次数: {paper.citations}")
|
||||
print(f"DOI: {paper.doi}")
|
||||
if paper.abstract:
|
||||
print(f"摘要:\n{paper.abstract}")
|
||||
print("-" * 80)
|
||||
|
||||
# 示例3:根据关键词搜索相关论文
|
||||
print("\n=== 示例3:搜索人工智能相关论文 ===")
|
||||
keywords = "artificial intelligence AND deep learning"
|
||||
papers = await scopus.search(
|
||||
query=keywords,
|
||||
limit=5,
|
||||
sort_by="citations", # 按引用次数排序
|
||||
start_year=2020 # 只搜索2020年之后的论文
|
||||
)
|
||||
|
||||
print(f"\n找到 {len(papers)} 篇相关论文:")
|
||||
for i, paper in enumerate(papers, 1):
|
||||
print(f"\n论文 {i}:")
|
||||
print(f"标题: {paper.title}")
|
||||
print(f"作者: {', '.join(paper.authors)}")
|
||||
print(f"发表年份: {paper.year}")
|
||||
print(f"发表期刊: {paper.venue}")
|
||||
print(f"引用次数: {paper.citations}")
|
||||
print(f"DOI: {paper.doi}")
|
||||
if paper.abstract:
|
||||
print(f"摘要:\n{paper.abstract}")
|
||||
print("-" * 80)
|
||||
|
||||
except Exception as e:
|
||||
print(f"发生错误: {str(e)}")
|
||||
import traceback
|
||||
print(traceback.format_exc())
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
asyncio.run(example_usage())
|
||||
在新工单中引用
屏蔽一个用户