# gpt_academic/crazy_functions/review_fns/data_sources/scihub_source.py

from pathlib import Path
import requests
from bs4 import BeautifulSoup
import time
from loguru import logger
import PyPDF2
import io


class SciHub:
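    """Download paper PDFs from Sci-Hub by DOI.

    Probes a list of public mirrors for availability, optionally routes
    traffic through a local SOCKS5 proxy, and validates every downloaded
    file with PyPDF2 before saving it.
    """
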
    # Updated mirror list containing more candidate mirrors
    MIRRORS = [
        'https://sci-hub.se/',
        'https://sci-hub.st/',
        'https://sci-hub.ru/',
        'https://sci-hub.wf/',
        'https://sci-hub.ee/',
        'https://sci-hub.ren/',
        'https://sci-hub.tf/',
        'https://sci-hub.si/',
        'https://sci-hub.do/',
        'https://sci-hub.hkvisa.net/',
        'https://sci-hub.mksa.top/',
        'https://sci-hub.shop/',
        'https://sci-hub.yncjkj.com/',
        'https://sci-hub.41610.org/',
        'https://sci-hub.automic.us/',
        'https://sci-hub.et-fine.com/',
        'https://sci-hub.pooh.mu/',
        'https://sci-hub.bban.top/',
        'https://sci-hub.usualwant.com/',
        'https://sci-hub.unblockit.kim/'
    ]
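    # NOTE: mirror availability changes frequently; _send_request() probes
    # this list at runtime and keeps only the mirrors that actually respond.
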
    def __init__(self, doi: str, path: Path, url=None, timeout=60, use_proxy=True):
        self.timeout = timeout
        self.path = path
        self.doi = str(doi)
        self.use_proxy = use_proxy
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }
        self.payload = {
            'sci-hub-plugin-check': '',
            'request': self.doi
        }
        self.url = url if url else self.MIRRORS[0]
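        # NOTE (assumption): the socks5h:// scheme needs the optional PySocks
        # dependency (pip install "requests[socks]"); localhost:10880 is
        # assumed to be a locally running SOCKS5 proxy.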
        self.proxies = {
            "http": "socks5h://localhost:10880",
            "https": "socks5h://localhost:10880",
        } if use_proxy else None

    def _test_proxy_connection(self):
        """Test whether the proxy connection is usable."""
        if not self.use_proxy:
            return True
        try:
            # Probe the proxy with a lightweight request
            test_response = requests.get(
                'https://httpbin.org/ip',
                proxies=self.proxies,
                timeout=10
            )
            if test_response.status_code == 200:
                logger.info("Proxy connection test succeeded")
                return True
        except Exception as e:
            logger.warning(f"Proxy connection test failed: {str(e)}")
            return False
        return False

    def _check_pdf_validity(self, content):
        """Check whether the downloaded content is a valid PDF."""
        try:
            # Use PyPDF2 to verify that the PDF can be opened and read
            pdf = PyPDF2.PdfReader(io.BytesIO(content))
            if len(pdf.pages) > 0:
                return True
            return False
        except Exception as e:
            logger.error(f"Invalid PDF file: {str(e)}")
            return False

    def _send_request(self):
        """Send the request to a Sci-Hub mirror site."""
        # Test the proxy connection first
        if self.use_proxy and not self._test_proxy_connection():
            logger.warning("Proxy unavailable, switching to direct connection")
            self.use_proxy = False
            self.proxies = None
        last_exception = None
        working_mirrors = []
        # First probe which mirrors are currently reachable
        logger.info("Testing mirror site availability...")
        for mirror in self.MIRRORS:
            try:
                test_response = requests.get(
                    mirror,
                    headers=self.headers,
                    proxies=self.proxies,
                    timeout=10
                )
                if test_response.status_code == 200:
                    working_mirrors.append(mirror)
                    logger.info(f"Mirror {mirror} is available")
                    if len(working_mirrors) >= 5:  # five working mirrors are enough
                        break
            except Exception as e:
                logger.debug(f"Mirror {mirror} unavailable: {str(e)}")
                continue
        if not working_mirrors:
            raise Exception("No available mirror site found")
        logger.info(f"Found {len(working_mirrors)} available mirrors, starting download attempts...")
        # Download through the available mirrors
        for mirror in working_mirrors:
            try:
                res = requests.post(
                    mirror,
                    headers=self.headers,
                    data=self.payload,
                    proxies=self.proxies,
                    timeout=self.timeout
                )
                if res.ok:
                    logger.info(f"Using mirror site: {mirror}")
                    self.url = mirror  # remember the mirror that worked
                    time.sleep(1)  # short pause, kept low for efficiency
                    return res
            except Exception as e:
                logger.error(f"Mirror {mirror} failed: {str(e)}")
                last_exception = e
                continue
        if last_exception:
            raise last_exception
        raise Exception("All available mirror sites failed to complete the download")

    def _extract_url(self, response):
        """Extract the PDF download link from the response page."""
        soup = BeautifulSoup(response.content, 'html.parser')
        try:
            # Try several selectors to locate the PDF link
            pdf_element = soup.find(id='pdf')
            if pdf_element:
                content_url = pdf_element.get('src')
            else:
                # Fall back to the first iframe
                pdf_element = soup.find('iframe')
                if pdf_element:
                    content_url = pdf_element.get('src')
                else:
                    # Look for direct links to a .pdf file
                    pdf_links = soup.find_all('a', href=lambda x: x and '.pdf' in x)
                    if pdf_links:
                        content_url = pdf_links[0].get('href')
                    else:
                        raise AttributeError("No PDF link found")
            if not content_url:
                raise AttributeError("No PDF link found")
            content_url = content_url.replace('#navpanes=0&view=FitH', '')
            if not content_url.endswith('.pdf') and 'pdf' not in content_url.lower():
                raise AttributeError("The link found is not a PDF file")
        except AttributeError:
            logger.error(f"Paper {self.doi} not found")
            return None
        # Normalize protocol-relative, absolute, and relative URLs
        current_mirror = self.url.rstrip('/')
        if content_url.startswith('//'):
            return 'https:' + content_url
        elif content_url.startswith('http'):
            return content_url
        elif content_url.startswith('/'):
            return current_mirror + content_url
        else:
            return current_mirror + '/' + content_url

    def _download_pdf(self, pdf_url):
        """Download the PDF file and verify its integrity."""
        try:
            # Try several download strategies
            download_methods = [
                # Method 1: plain GET
                lambda: requests.get(pdf_url, proxies=self.proxies, timeout=self.timeout),
                # Method 2: add a Referer header
                lambda: requests.get(pdf_url, proxies=self.proxies, timeout=self.timeout,
                                     headers={**self.headers, 'Referer': self.url}),
                # Method 3: use the original domain as the Referer
                lambda: requests.get(pdf_url, proxies=self.proxies, timeout=self.timeout,
                                     headers={**self.headers, 'Referer': pdf_url.split('/downloads')[0] if '/downloads' in pdf_url else self.url})
            ]
            for i, download_method in enumerate(download_methods):
                try:
                    logger.info(f"Trying download method {i+1}/3...")
                    response = download_method()
                    if response.status_code == 200:
                        content = response.content
                        if len(content) > 1000 and self._check_pdf_validity(content):  # make sure the file is not suspiciously small
                            logger.info(f"PDF downloaded successfully, size: {len(content)} bytes")
                            return content
                        else:
                            logger.warning("The downloaded file may not be a valid PDF")
                    elif response.status_code == 403:
                        logger.warning("Access denied (403 Forbidden), trying another method")
                        continue
                    else:
                        logger.warning(f"Download failed, status code: {response.status_code}")
                        continue
                except Exception as e:
                    logger.warning(f"Download method {i+1} failed: {str(e)}")
                    continue
            # If every method failed, try rebuilding the URL on alternative mirrors
            try:
                logger.info("Trying alternative mirrors for the download...")
                # Extract the key part of the original URL
                if '/downloads/' in pdf_url:
                    file_part = pdf_url.split('/downloads/')[-1]
                    alternative_mirrors = [
                        f"https://sci-hub.se/downloads/{file_part}",
                        f"https://sci-hub.st/downloads/{file_part}",
                        f"https://sci-hub.ru/downloads/{file_part}",
                        f"https://sci-hub.wf/downloads/{file_part}",
                        f"https://sci-hub.ee/downloads/{file_part}",
                        f"https://sci-hub.ren/downloads/{file_part}",
                        f"https://sci-hub.tf/downloads/{file_part}"
                    ]
                    for alt_url in alternative_mirrors:
                        try:
                            response = requests.get(
                                alt_url,
                                proxies=self.proxies,
                                timeout=self.timeout,
                                headers={**self.headers, 'Referer': alt_url.split('/downloads')[0]}
                            )
                            if response.status_code == 200:
                                content = response.content
                                if len(content) > 1000 and self._check_pdf_validity(content):
                                    logger.info(f"Downloaded successfully via alternative mirror: {alt_url}")
                                    return content
                        except Exception as e:
                            logger.debug(f"Alternative mirror {alt_url} failed: {str(e)}")
                            continue
            except Exception as e:
                logger.error(f"All download methods failed: {str(e)}")
            return None
        except Exception as e:
            logger.error(f"Failed to download the PDF file: {str(e)}")
            return None

    def fetch(self):
        """Fetch the paper PDF, with retry and validation."""
        for attempt in range(3):  # up to 3 attempts
            try:
                logger.info(f"Attempt {attempt + 1} to download paper: {self.doi}")
                # Obtain the PDF download link
                response = self._send_request()
                pdf_url = self._extract_url(response)
                if pdf_url is None:
                    logger.warning(f"Attempt {attempt + 1}: no PDF download link found")
                    continue
                logger.info(f"Found PDF download link: {pdf_url}")
                # Download and validate the PDF
                pdf_content = self._download_pdf(pdf_url)
                if pdf_content is None:
                    logger.warning(f"Attempt {attempt + 1}: PDF download failed")
                    continue
                # Save the PDF file
                pdf_name = f"{self.doi.replace('/', '_').replace(':', '_')}.pdf"
                pdf_path = self.path.joinpath(pdf_name)
                pdf_path.write_bytes(pdf_content)
                logger.info(f"Paper downloaded: {pdf_name}, size: {len(pdf_content)} bytes")
                return str(pdf_path)
            except Exception as e:
                logger.error(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < 2:  # not the last attempt
                    wait_time = (attempt + 1) * 3  # back off a little longer each time
                    logger.info(f"Waiting {wait_time} seconds before retrying...")
                    time.sleep(wait_time)
                    continue
        raise Exception(f"Failed to download paper {self.doi}; all retries exhausted")


# Usage example
if __name__ == '__main__':
    # Create a directory for the downloaded PDFs
    save_path = Path('./downloaded_papers')
    save_path.mkdir(exist_ok=True)

    # Sample DOI
    sample_doi = '10.3897/rio.7.e67379'
    try:
        # Initialize the SciHub downloader, trying proxy mode first
        logger.info("Trying proxy mode...")
        downloader = SciHub(doi=sample_doi, path=save_path, use_proxy=True)
        # Start the download
        result = downloader.fetch()
        print(f"Paper saved to: {result}")
    except Exception as e:
        print(f"Proxy mode failed: {str(e)}")
        try:
            # If proxy mode failed, fall back to a direct connection
            logger.info("Trying direct connection mode...")
            downloader = SciHub(doi=sample_doi, path=save_path, use_proxy=False)
            result = downloader.fetch()
            print(f"Paper saved to: {result}")
        except Exception as e2:
            print(f"Direct connection mode also failed: {str(e2)}")
            print("Check your network connection or try another DOI")