镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-05 22:16:49 +00:00
* stage academic conversation * stage document conversation * fix buggy gradio version * file dynamic load * merge more academic plugins * accelerate nltk * feat: 为predict函数添加文件和URL读取功能 - 添加URL检测和网页内容提取功能,支持自动提取网页文本 - 添加文件路径识别和文件内容读取功能,支持private_upload路径格式 - 集成WebTextExtractor处理网页内容提取 - 集成TextContentLoader处理本地文件读取 - 支持文件路径与问题组合的智能处理 * back * block unstable --------- Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
279 行
9.4 KiB
Python
import asyncio
import json
import random
import urllib.parse
from datetime import datetime
from typing import Dict, List, Optional, Union

import aiohttp
from tqdm import tqdm

from crazy_functions.review_fns.data_sources.base_source import DataSource, PaperMetadata
|
||
|
||
class AdsabsSource(DataSource):
    """ADS (Astrophysics Data System) API data source.

    Thin async client for the ADS search API
    (https://api.adsabs.harvard.edu/v1) that maps returned documents onto
    ``PaperMetadata`` records.
    """

    # API key pool: one key is picked at random per instance so the per-key
    # rate limit is spread across keys.  The values below are redacted
    # placeholders — pass a real key via ``api_key`` in production.
    API_KEYS = [
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
    ]

    # Field list requested from the search endpoint; shared by every query
    # (previously duplicated as a literal in six different methods).
    _FIELDS = ("title,author,abstract,year,doi,bibcode,citation_count,"
               "pub,aff,volume,issue,pubdate")

    def __init__(self, api_key: str = None):
        """Initialize the source.

        Args:
            api_key: ADS API key; if omitted, one is chosen at random
                from ``API_KEYS``.
        """
        self.api_key = api_key or random.choice(self.API_KEYS)
        self._initialize()

    def _initialize(self) -> None:
        """Set up the base URL and default request headers."""
        self.base_url = "https://api.adsabs.harvard.edu/v1"
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

    async def _make_request(self, url: str, method: str = "GET", data: dict = None) -> Optional[dict]:
        """Send an HTTP request and return the decoded JSON body.

        Args:
            url: Full request URL (query string already appended for GET).
            method: "GET" or "POST"; any other value yields ``None``.
            data: JSON payload for POST requests.

        Returns:
            Parsed JSON response, or ``None`` on non-200 status,
            unsupported method, or any network/decoding error.
        """
        try:
            async with aiohttp.ClientSession(headers=self.headers) as session:
                if method == "GET":
                    async with session.get(url) as response:
                        if response.status == 200:
                            return await response.json()
                elif method == "POST":
                    async with session.post(url, json=data) as response:
                        if response.status == 200:
                            return await response.json()
                return None
        except Exception as e:
            print(f"请求发生错误: {str(e)}")
            return None

    def _parse_paper(self, doc: dict) -> Optional[PaperMetadata]:
        """Convert one ADS document dict into a ``PaperMetadata``.

        Args:
            doc: Raw document as returned in ``response.docs``.

        Returns:
            Parsed record, or ``None`` when construction fails.
            (Return annotation fixed: the except branch returns ``None``.)
        """
        try:
            return PaperMetadata(
                # ADS returns title/doi as single-element lists.
                title=doc.get('title', [''])[0] if doc.get('title') else '',
                authors=doc.get('author', []),
                abstract=doc.get('abstract', ''),
                year=doc.get('year'),
                doi=doc.get('doi', [''])[0] if doc.get('doi') else None,
                url=f"https://ui.adsabs.harvard.edu/abs/{doc.get('bibcode')}/abstract" if doc.get('bibcode') else None,
                citations=doc.get('citation_count'),
                venue=doc.get('pub', ''),
                institutions=doc.get('aff', []),
                venue_type="journal",
                venue_name=doc.get('pub', ''),
                venue_info={
                    'volume': doc.get('volume'),
                    'issue': doc.get('issue'),
                    'pub_date': doc.get('pubdate', ''),
                },
                source='adsabs',
            )
        except Exception as e:
            print(f"解析文章时发生错误: {str(e)}")
            return None

    def _parse_docs(self, response: Optional[dict]) -> List[PaperMetadata]:
        """Parse every doc in a search response, skipping unparseable ones.

        Shared by ``search`` / ``get_related_papers`` / ``get_citations`` /
        ``get_references``, which previously repeated this loop inline.
        """
        if not response or 'response' not in response:
            return []
        papers = []
        for doc in response['response']['docs']:
            paper = self._parse_paper(doc)
            if paper:
                papers.append(paper)
        return papers

    async def search(
        self,
        query: str,
        limit: int = 100,
        sort_by: str = "relevance",
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search for papers.

        Args:
            query: Search keywords (ADS query syntax allowed).
            limit: Maximum number of results.
            sort_by: One of 'relevance', 'date', 'citations'.
            start_year: If given, restrict results to this year onward.

        Returns:
            List of parsed papers; empty list on any failure.
        """
        try:
            # ADS year-range syntax: "year:YYYY-" means YYYY to present.
            if start_year:
                query = f"{query} year:{start_year}-"

            sort_mapping = {
                'relevance': 'score desc',
                'date': 'date desc',
                'citations': 'citation_count desc',
            }
            sort = sort_mapping.get(sort_by, 'score desc')

            search_url = f"{self.base_url}/search/query"
            params = {
                "q": query,
                "rows": limit,
                "sort": sort,
                "fl": self._FIELDS,
            }
            response = await self._make_request(f"{search_url}?{self._build_query_string(params)}")
            return self._parse_docs(response)

        except Exception as e:
            print(f"搜索论文时发生错误: {str(e)}")
            return []

    def _build_query_string(self, params: dict) -> str:
        """Build a percent-encoded query string from *params*.

        Fixed: the previous raw ``"&".join(f"{k}={v}")`` left spaces,
        quotes and '&' unescaped, producing invalid URLs for queries
        such as ``author:"Smith, J"``.
        """
        return urllib.parse.urlencode(params)

    async def get_paper_details(self, bibcode: str) -> Optional[PaperMetadata]:
        """Fetch details for the single paper identified by *bibcode*."""
        search_url = f"{self.base_url}/search/query"
        params = {
            "q": f"identifier:{bibcode}",
            "fl": self._FIELDS,
        }
        response = await self._make_request(f"{search_url}?{self._build_query_string(params)}")
        if response and 'response' in response and response['response']['docs']:
            return self._parse_paper(response['response']['docs'][0])
        return None

    async def get_related_papers(self, bibcode: str, limit: int = 100) -> List[PaperMetadata]:
        """Fetch papers that cite or are cited by *bibcode* (up to *limit*)."""
        url = f"{self.base_url}/search/query"
        params = {
            "q": f"citations(identifier:{bibcode}) OR references(identifier:{bibcode})",
            "rows": limit,
            "fl": self._FIELDS,
        }
        response = await self._make_request(f"{url}?{self._build_query_string(params)}")
        return self._parse_docs(response)

    async def search_by_author(
        self,
        author: str,
        limit: int = 100,
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search papers by author name."""
        query = f"author:\"{author}\""
        return await self.search(query, limit=limit, start_year=start_year)

    async def search_by_journal(
        self,
        journal: str,
        limit: int = 100,
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search papers by journal name."""
        query = f"pub:\"{journal}\""
        return await self.search(query, limit=limit, start_year=start_year)

    async def get_latest_papers(
        self,
        days: int = 7,
        limit: int = 100
    ) -> List[PaperMetadata]:
        """Fetch papers entered into ADS within the last *days* days."""
        query = f"entdate:[NOW-{days}DAYS TO NOW]"
        return await self.search(query, limit=limit, sort_by="date")

    async def get_citations(self, bibcode: str) -> List[PaperMetadata]:
        """Fetch papers that cite *bibcode*."""
        url = f"{self.base_url}/search/query"
        params = {
            "q": f"citations(identifier:{bibcode})",
            "fl": self._FIELDS,
        }
        response = await self._make_request(f"{url}?{self._build_query_string(params)}")
        return self._parse_docs(response)

    async def get_references(self, bibcode: str) -> List[PaperMetadata]:
        """Fetch papers referenced by *bibcode*."""
        url = f"{self.base_url}/search/query"
        params = {
            "q": f"references(identifier:{bibcode})",
            "fl": self._FIELDS,
        }
        response = await self._make_request(f"{url}?{self._build_query_string(params)}")
        return self._parse_docs(response)
|
||
|
||
async def example_usage():
    """Demo of AdsabsSource: run a basic search and print the results."""
    ads = AdsabsSource()

    try:
        # Example 1: plain keyword search (limited to three hits).
        print("\n=== 示例1:搜索黑洞相关论文 ===")
        found = await ads.search("black hole", limit=3)
        for i, paper in enumerate(found, 1):
            print(f"\n--- 论文 {i} ---")
            print(f"标题: {paper.title}")
            print(f"作者: {', '.join(paper.authors)}")
            print(f"发表年份: {paper.year}")
            print(f"DOI: {paper.doi}")

        # Further examples (author / journal / citation queries) omitted.

    except Exception as e:
        print(f"发生错误: {str(e)}")
|
||
|
||
# Run the demo directly:
#   python -m crazy_functions.review_fns.data_sources.adsabs_source
if __name__ == "__main__":
    asyncio.run(example_usage())