镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-05 22:16:49 +00:00
* stage academic conversation * stage document conversation * fix buggy gradio version * file dynamic load * merge more academic plugins * accelerate nltk * feat: 为predict函数添加文件和URL读取功能 - 添加URL检测和网页内容提取功能,支持自动提取网页文本 - 添加文件路径识别和文件内容读取功能,支持private_upload路径格式 - 集成WebTextExtractor处理网页内容提取 - 集成TextContentLoader处理本地文件读取 - 支持文件路径与问题组合的智能处理 * back * block unstable --------- Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
459 行
15 KiB
Python
459 行
15 KiB
Python
from typing import List, Optional, Dict, Union
|
||
from datetime import datetime
|
||
import aiohttp
|
||
import asyncio
|
||
from crazy_functions.review_fns.data_sources.base_source import DataSource, PaperMetadata
|
||
import xml.etree.ElementTree as ET
|
||
from urllib.parse import quote
|
||
import json
|
||
from tqdm import tqdm
|
||
import random
|
||
|
||
class PubMedSource(DataSource):
    """PubMed data source backed by the NCBI E-utilities REST API.

    Uses the ``esearch``/``efetch``/``elink`` endpoints to search for
    papers, fetch their details, and discover related or referenced
    articles. PubMed itself exposes no citation counts, so
    ``get_citations`` always returns an empty list.
    """

    # Pool of NCBI API keys; one is chosen at random per instance so
    # rate limits are spread across keys. (Values are redacted here.)
    API_KEYS = [
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    ]

    def __init__(self, api_key: str = None):
        """Initialize the source.

        Args:
            api_key: PubMed API key. If not provided, one is picked at
                random from the predefined ``API_KEYS`` list.
        """
        self.api_key = api_key or random.choice(self.API_KEYS)
        self._initialize()

    def _initialize(self) -> None:
        """Set up the E-utilities base URL and default request headers."""
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
        self.headers = {
            "User-Agent": "Mozilla/5.0 PubMedDataSource/1.0",
            "Accept": "application/json"
        }

    async def _make_request(self, url: str) -> Optional[str]:
        """Issue an HTTP GET request and return the response body.

        Args:
            url: Fully built request URL.

        Returns:
            The response text on HTTP 200, otherwise ``None``.
        """
        try:
            async with aiohttp.ClientSession(headers=self.headers) as session:
                async with session.get(url) as response:
                    if response.status == 200:
                        return await response.text()
                    print(f"请求失败: {response.status}")
                    return None
        except Exception as e:
            # Best-effort: network failures are reported, not raised.
            print(f"请求发生错误: {str(e)}")
            return None

    async def search(
        self,
        query: str,
        limit: int = 100,
        sort_by: str = "relevance",
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search PubMed for papers matching *query*.

        Args:
            query: Search term (PubMed query syntax).
            limit: Maximum number of results to return.
            sort_by: ``'relevance'`` (default) or ``'date'``.
                ``'citations'`` is accepted for interface compatibility
                but behaves like ``'relevance'`` because PubMed does
                not support a citation sort.
            start_year: If given, restrict to papers published in
                *start_year* or later.

        Returns:
            List of parsed papers; empty on error or no matches.
        """
        try:
            # Restrict by publication-date range: "<year>:3000[dp]".
            if start_year:
                query = f"{query} AND {start_year}:3000[dp]"

            # Step 1: esearch to collect matching PMIDs.
            search_url = (
                f"{self.base_url}/esearch.fcgi?"
                f"db=pubmed&term={quote(query)}&retmax={limit}"
                f"&usehistory=y&api_key={self.api_key}"
            )
            if sort_by == "date":
                search_url += "&sort=date"

            response = await self._make_request(search_url)
            if not response:
                return []

            root = ET.fromstring(response)
            id_list = root.findall(".//Id")
            pmids = [id_elem.text for id_elem in id_list]
            if not pmids:
                return []

            # Step 2: efetch details in batches so each URL stays short.
            papers = []
            batch_size = 50
            for i in range(0, len(pmids), batch_size):
                batch = pmids[i:i + batch_size]
                batch_papers = await self._fetch_papers_batch(batch)
                papers.extend(batch_papers)
            return papers

        except Exception as e:
            print(f"搜索论文时发生错误: {str(e)}")
            return []

    async def _fetch_papers_batch(self, pmids: List[str]) -> List[PaperMetadata]:
        """Fetch full details for a batch of PMIDs via efetch.

        Args:
            pmids: PubMed IDs to fetch.

        Returns:
            Successfully parsed papers. Articles that fail to parse are
            skipped (they are never returned as ``None`` entries).
        """
        try:
            fetch_url = (
                f"{self.base_url}/efetch.fcgi?"
                f"db=pubmed&id={','.join(pmids)}"
                f"&retmode=xml&api_key={self.api_key}"
            )

            response = await self._make_request(fetch_url)
            if not response:
                return []

            root = ET.fromstring(response)
            articles = root.findall(".//PubmedArticle")

            # BUGFIX: _parse_article returns None on failure; filter
            # those out so callers never receive None entries (e.g.
            # get_paper_details would otherwise return None.title crashes
            # downstream).
            parsed = (self._parse_article(article) for article in articles)
            return [paper for paper in parsed if paper is not None]

        except Exception as e:
            print(f"获取论文批次时发生错误: {str(e)}")
            return []

    def _parse_article(self, article: ET.Element) -> Optional[PaperMetadata]:
        """Parse one ``<PubmedArticle>`` XML element into metadata.

        Args:
            article: The XML element to parse.

        Returns:
            The parsed paper, or ``None`` if parsing failed.
        """
        try:
            # Basic identifiers.
            pmid = article.find(".//PMID").text
            article_meta = article.find(".//Article")

            # Title.
            title = article_meta.find(".//ArticleTitle")
            title = title.text if title is not None else ""

            # Authors: "Fore Last" when both parts exist, else last name only.
            authors = []
            author_list = article_meta.findall(".//Author")
            for author in author_list:
                last_name = author.find("LastName")
                fore_name = author.find("ForeName")
                if last_name is not None and fore_name is not None:
                    authors.append(f"{fore_name.text} {last_name.text}")
                elif last_name is not None:
                    authors.append(last_name.text)

            # Abstract. NOTE(review): only the first AbstractText section
            # is kept; structured abstracts have several — confirm intended.
            abstract = article_meta.find(".//Abstract/AbstractText")
            abstract = abstract.text if abstract is not None else ""

            # Publication year.
            pub_date = article_meta.find(".//PubDate/Year")
            year = int(pub_date.text) if pub_date is not None else None

            # DOI.
            doi = article.find(".//ELocationID[@EIdType='doi']")
            doi = doi.text if doi is not None else None

            # Journal (venue) information.
            journal = article_meta.find(".//Journal")
            if journal is not None:
                journal_title = journal.find(".//Title")
                venue = journal_title.text if journal_title is not None else None
                venue_info = {
                    'issn': journal.findtext(".//ISSN"),
                    'volume': journal.findtext(".//Volume"),
                    'issue': journal.findtext(".//Issue"),
                    # Prefer MedlineDate; fall back to "Year-Month".
                    'pub_date': journal.findtext(".//PubDate/MedlineDate") or
                                f"{journal.findtext('.//PubDate/Year', '')}-{journal.findtext('.//PubDate/Month', '')}"
                }
            else:
                venue = None
                venue_info = {}

            # Author affiliations.
            institutions = []
            affiliations = article_meta.findall(".//Affiliation")
            for affiliation in affiliations:
                if affiliation is not None and affiliation.text:
                    institutions.append(affiliation.text)

            return PaperMetadata(
                title=title,
                authors=authors,
                abstract=abstract,
                year=year,
                doi=doi,
                url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" if pmid else None,
                citations=None,  # PubMed does not expose citation counts
                venue=venue,
                institutions=institutions,
                venue_type="journal",
                venue_name=venue,
                venue_info=venue_info,
                source='pubmed'  # provenance tag
            )

        except Exception as e:
            print(f"解析文章时发生错误: {str(e)}")
            return None

    async def get_paper_details(self, pmid: str) -> Optional[PaperMetadata]:
        """Fetch details for a single PMID, or ``None`` if unavailable."""
        papers = await self._fetch_papers_batch([pmid])
        return papers[0] if papers else None

    async def get_related_papers(self, pmid: str, limit: int = 100) -> List[PaperMetadata]:
        """Fetch papers PubMed considers related (elink 'neighbor' command).

        Args:
            pmid: PubMed ID.
            limit: Maximum number of related papers.

        Returns:
            Related papers; empty on error or if none found.
        """
        try:
            link_url = (
                f"{self.base_url}/elink.fcgi?"
                f"db=pubmed&id={pmid}&cmd=neighbor&api_key={self.api_key}"
            )

            response = await self._make_request(link_url)
            if not response:
                return []

            root = ET.fromstring(response)
            related_ids = root.findall(".//Link/Id")
            pmids = [id_elem.text for id_elem in related_ids][:limit]
            if not pmids:
                return []

            return await self._fetch_papers_batch(pmids)

        except Exception as e:
            print(f"获取相关论文时发生错误: {str(e)}")
            return []

    async def search_by_author(
        self,
        author: str,
        limit: int = 100,
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search by author name using PubMed's [Author] field tag."""
        query = f"{author}[Author]"
        if start_year:
            query += f" AND {start_year}:3000[dp]"
        return await self.search(query, limit=limit)

    async def search_by_journal(
        self,
        journal: str,
        limit: int = 100,
        start_year: int = None
    ) -> List[PaperMetadata]:
        """Search by journal name using PubMed's [Journal] field tag."""
        query = f"{journal}[Journal]"
        if start_year:
            query += f" AND {start_year}:3000[dp]"
        return await self.search(query, limit=limit)

    async def get_latest_papers(
        self,
        days: int = 7,
        limit: int = 100
    ) -> List[PaperMetadata]:
        """Fetch papers from the last *days* days, newest first.

        Args:
            days: Look-back window in days.
            limit: Maximum number of results.

        Returns:
            Most recent papers matching the date filter.
        """
        # NOTE(review): relies on PubMed's relative-date query syntax
        # 'last N days[dp]' — confirm it matches the API's expectations.
        query = f"last {days} days[dp]"
        return await self.search(query, limit=limit, sort_by="date")

    async def get_citations(self, paper_id: str) -> List[PaperMetadata]:
        """Return papers citing *paper_id*.

        PubMed's API exposes no citation data, so this always returns
        an empty list. Integrating another source (e.g. CrossRef) would
        be required to populate it.

        Args:
            paper_id: PubMed ID.

        Returns:
            Always an empty list.
        """
        return []

    async def get_references(self, paper_id: str) -> List[PaperMetadata]:
        """Fetch the papers referenced by *paper_id*.

        Uses elink with linkname ``pubmed_pubmed_refs`` to read the
        article's reference list.

        Args:
            paper_id: PubMed ID.

        Returns:
            Referenced papers; empty on error or if none found.
        """
        try:
            refs_url = (
                f"{self.base_url}/elink.fcgi?"
                f"dbfrom=pubmed&db=pubmed&id={paper_id}"
                f"&cmd=neighbor_history&linkname=pubmed_pubmed_refs"
                f"&api_key={self.api_key}"
            )

            response = await self._make_request(refs_url)
            if not response:
                return []

            root = ET.fromstring(response)
            ref_ids = root.findall(".//Link/Id")
            pmids = [id_elem.text for id_elem in ref_ids]
            if not pmids:
                return []

            return await self._fetch_papers_batch(pmids)

        except Exception as e:
            print(f"获取参考文献时发生错误: {str(e)}")
            return []
||
|
||
async def example_usage():
    """Usage examples for PubMedSource (requires network access)."""
    pubmed = PubMedSource()

    try:
        # Example 1: basic keyword search.
        print("\n=== 示例1:搜索COVID-19相关论文 ===")
        papers = await pubmed.search("COVID-19", limit=3)
        for i, paper in enumerate(papers, 1):
            print(f"\n--- 论文 {i} ---")
            print(f"标题: {paper.title}")
            print(f"作者: {', '.join(paper.authors)}")
            print(f"发表年份: {paper.year}")
            print(f"DOI: {paper.doi}")
            if paper.abstract:
                print(f"摘要: {paper.abstract[:200]}...")

        # Example 2: fetch full details of the first result.
        if papers:
            print("\n=== 示例2:获取论文详情 ===")
            # The PMID is the second-to-last path segment of the paper URL.
            paper_id = papers[0].url.split("/")[-2]
            paper = await pubmed.get_paper_details(paper_id)
            if paper:
                print(f"标题: {paper.title}")
                print(f"期刊: {paper.venue}")
                print(f"机构: {', '.join(paper.institutions)}")

        # Example 3: related papers for the same PMID.
        if papers:
            print("\n=== 示例3:获取相关论文 ===")
            related = await pubmed.get_related_papers(paper_id, limit=3)
            for i, paper in enumerate(related, 1):
                print(f"\n--- 相关论文 {i} ---")
                print(f"标题: {paper.title}")
                print(f"作者: {', '.join(paper.authors)}")

        # Example 4: search by author.
        print("\n=== 示例4:按作者搜索 ===")
        author_papers = await pubmed.search_by_author("Fauci AS", limit=3)
        for i, paper in enumerate(author_papers, 1):
            print(f"\n--- 论文 {i} ---")
            print(f"标题: {paper.title}")
            print(f"发表年份: {paper.year}")

        # Example 5: search by journal.
        print("\n=== 示例5:按期刊搜索 ===")
        journal_papers = await pubmed.search_by_journal("Nature", limit=3)
        for i, paper in enumerate(journal_papers, 1):
            print(f"\n--- 论文 {i} ---")
            print(f"标题: {paper.title}")
            print(f"发表年份: {paper.year}")

        # Example 6: most recent papers.
        print("\n=== 示例6:获取最新论文 ===")
        latest = await pubmed.get_latest_papers(days=7, limit=3)
        for i, paper in enumerate(latest, 1):
            print(f"\n--- 最新论文 {i} ---")
            print(f"标题: {paper.title}")
            print(f"发表日期: {paper.venue_info.get('pub_date')}")

        # Example 7: references of the first search result.
        if papers:
            print("\n=== 示例7:获取论文的参考文献 ===")
            paper_id = papers[0].url.split("/")[-2]
            references = await pubmed.get_references(paper_id)
            for i, paper in enumerate(references[:3], 1):
                print(f"\n--- 参考文献 {i} ---")
                print(f"标题: {paper.title}")
                print(f"作者: {', '.join(paper.authors)}")
                print(f"发表年份: {paper.year}")

        # Example 8: citations (always empty — PubMed provides no citation data).
        if papers:
            print("\n=== 示例8:获取论文的引用信息 ===")
            paper_id = papers[0].url.split("/")[-2]
            citations = await pubmed.get_citations(paper_id)
            print(f"引用数据:{len(citations)} (PubMed API不提供引用信息)")

    except Exception as e:
        # Catch-all so the demo prints a traceback instead of crashing.
        print(f"发生错误: {str(e)}")
        import traceback
        print(traceback.format_exc())
|
||
|
||
if __name__ == "__main__":
    # Run the async demo when executed as a script.
    asyncio.run(example_usage())