Master 4.0 (#2210)

* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: add file and URL reading support to the predict function
- Add URL detection and web page content extraction, with automatic extraction of page text
- Add file path recognition and file content reading, supporting the private_upload path format
- Integrate WebTextExtractor for web page content extraction
- Integrate TextContentLoader for local file reading
- Support combined handling of a file path plus a question (sketched below)
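
A minimal, self-contained sketch of the routing idea described above (this is not the commit's actual predict() change; WebTextExtractor and TextContentLoader are stubbed with plain requests/BeautifulSoup and pathlib stand-ins, and the private_upload prefix check is an assumption based on the path format named above):

import re
from pathlib import Path

import requests
from bs4 import BeautifulSoup

URL_RE = re.compile(r'https?://\S+')

def extract_web_text(url: str) -> str:
    """Stand-in for WebTextExtractor: fetch a page and keep its visible text."""
    html = requests.get(url, timeout=30).text
    return BeautifulSoup(html, 'html.parser').get_text(' ', strip=True)

def load_file_text(path: str) -> str:
    """Stand-in for TextContentLoader: read a local file as text."""
    return Path(path).read_text(encoding='utf-8', errors='ignore')

def preprocess_prompt(txt: str) -> str:
    """Expand a URL or a private_upload file path found in the user prompt."""
    m = URL_RE.search(txt)
    if m:
        return f"{txt}\n\n[web content]\n{extract_web_text(m.group(0))}"
    for token in txt.split():
        if token.startswith('private_upload/') and Path(token).is_file():
            return f"{txt}\n\n[file content]\n{load_file_text(token)}"
    return txt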

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
This commit is contained in:
binary-husky
2025-08-23 15:59:22 +08:00
Committed by GitHub
Parent 65a4cf59c2
Commit 8042750d41
79 files changed, 20850 insertions(+), 57 deletions(-)

@@ -0,0 +1,326 @@
from pathlib import Path
import requests
from bs4 import BeautifulSoup
import time
from loguru import logger
import PyPDF2
import io


class SciHub:
    # Updated mirror list with more mirrors that are known to work
    MIRRORS = [
        'https://sci-hub.se/',
        'https://sci-hub.st/',
        'https://sci-hub.ru/',
        'https://sci-hub.wf/',
        'https://sci-hub.ee/',
        'https://sci-hub.ren/',
        'https://sci-hub.tf/',
        'https://sci-hub.si/',
        'https://sci-hub.do/',
        'https://sci-hub.hkvisa.net/',
        'https://sci-hub.mksa.top/',
        'https://sci-hub.shop/',
        'https://sci-hub.yncjkj.com/',
        'https://sci-hub.41610.org/',
        'https://sci-hub.automic.us/',
        'https://sci-hub.et-fine.com/',
        'https://sci-hub.pooh.mu/',
        'https://sci-hub.bban.top/',
        'https://sci-hub.usualwant.com/',
        'https://sci-hub.unblockit.kim/'
    ]

    def __init__(self, doi: str, path: Path, url=None, timeout=60, use_proxy=True):
        self.timeout = timeout
        self.path = path
        self.doi = str(doi)
        self.use_proxy = use_proxy
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }
        self.payload = {
            'sci-hub-plugin-check': '',
            'request': self.doi
        }
        self.url = url if url else self.MIRRORS[0]
        self.proxies = {
            "http": "socks5h://localhost:10880",
            "https": "socks5h://localhost:10880",
        } if use_proxy else None

    def _test_proxy_connection(self):
        """Check whether the proxy connection is usable."""
        if not self.use_proxy:
            return True
        try:
            # Probe the proxy with a simple request
            test_response = requests.get(
                'https://httpbin.org/ip',
                proxies=self.proxies,
                timeout=10
            )
            if test_response.status_code == 200:
                logger.info("Proxy connection test succeeded")
                return True
        except Exception as e:
            logger.warning(f"Proxy connection test failed: {str(e)}")
            return False
        return False

    def _check_pdf_validity(self, content):
        """Check whether the PDF content is valid."""
        try:
            # Use PyPDF2 to verify the PDF can be opened and read
            pdf = PyPDF2.PdfReader(io.BytesIO(content))
            if len(pdf.pages) > 0:
                return True
            return False
        except Exception as e:
            logger.error(f"Invalid PDF file: {str(e)}")
            return False

    def _send_request(self):
        """Send the request to a Sci-Hub mirror."""
        # Test the proxy connection first
        if self.use_proxy and not self._test_proxy_connection():
            logger.warning("Proxy unavailable, falling back to a direct connection")
            self.use_proxy = False
            self.proxies = None
        last_exception = None
        working_mirrors = []
        # Probe which mirrors are reachable
        logger.info("Testing mirror availability...")
        for mirror in self.MIRRORS:
            try:
                test_response = requests.get(
                    mirror,
                    headers=self.headers,
                    proxies=self.proxies,
                    timeout=10
                )
                if test_response.status_code == 200:
                    working_mirrors.append(mirror)
                    logger.info(f"Mirror {mirror} is reachable")
                    if len(working_mirrors) >= 5:  # five working mirrors are enough
                        break
            except Exception as e:
                logger.debug(f"Mirror {mirror} is unreachable: {str(e)}")
                continue
        if not working_mirrors:
            raise Exception("No reachable mirror was found")
        logger.info(f"Found {len(working_mirrors)} reachable mirrors, starting download attempts...")
        # Try to download via the reachable mirrors
        for mirror in working_mirrors:
            try:
                res = requests.post(
                    mirror,
                    headers=self.headers,
                    data=self.payload,
                    proxies=self.proxies,
                    timeout=self.timeout
                )
                if res.ok:
                    logger.info(f"Successfully using mirror: {mirror}")
                    self.url = mirror  # remember the mirror that worked
                    time.sleep(1)  # short pause, kept low for efficiency
                    return res
            except Exception as e:
                logger.error(f"Mirror {mirror} failed: {str(e)}")
                last_exception = e
                continue
        if last_exception:
            raise last_exception
        raise Exception("None of the reachable mirrors could complete the download")

    def _extract_url(self, response):
        """Extract the PDF download link from the response."""
        soup = BeautifulSoup(response.content, 'html.parser')
        try:
            # Try several ways of locating the PDF link
            pdf_element = soup.find(id='pdf')
            if pdf_element:
                content_url = pdf_element.get('src')
            else:
                # Try other plausible selectors
                pdf_element = soup.find('iframe')
                if pdf_element:
                    content_url = pdf_element.get('src')
                else:
                    # Look for direct PDF links
                    pdf_links = soup.find_all('a', href=lambda x: x and '.pdf' in x)
                    if pdf_links:
                        content_url = pdf_links[0].get('href')
                    else:
                        raise AttributeError("No PDF link found")
            if not content_url:
                raise AttributeError("No PDF link found")
            # Strip the PDF-viewer fragment from the URL
            content_url = content_url.replace('#navpanes=0&view=FitH', '')
            if not content_url.endswith('.pdf') and 'pdf' not in content_url.lower():
                raise AttributeError("The extracted link is not a PDF file")
        except AttributeError:
            logger.error(f"Paper {self.doi} not found")
            return None
        current_mirror = self.url.rstrip('/')
        if content_url.startswith('//'):
            # Protocol-relative URL
            return 'https:' + content_url
        elif content_url.startswith('http'):
            return content_url
        elif content_url.startswith('/'):
            return current_mirror + content_url
        else:
            return current_mirror + '/' + content_url

    def _download_pdf(self, pdf_url):
        """Download the PDF file and verify its integrity."""
        try:
            # Try several download strategies
            download_methods = [
                # Method 1: plain download
                lambda: requests.get(pdf_url, proxies=self.proxies, timeout=self.timeout),
                # Method 2: add a Referer header
                lambda: requests.get(pdf_url, proxies=self.proxies, timeout=self.timeout,
                                     headers={**self.headers, 'Referer': self.url}),
                # Method 3: use the original domain as the Referer
                lambda: requests.get(pdf_url, proxies=self.proxies, timeout=self.timeout,
                                     headers={**self.headers, 'Referer': pdf_url.split('/downloads')[0] if '/downloads' in pdf_url else self.url})
            ]
            for i, download_method in enumerate(download_methods):
                try:
                    logger.info(f"Trying download method {i+1}/3...")
                    response = download_method()
                    if response.status_code == 200:
                        content = response.content
                        if len(content) > 1000 and self._check_pdf_validity(content):  # make sure the file is not suspiciously small
                            logger.info(f"PDF downloaded successfully, size: {len(content)} bytes")
                            return content
                        else:
                            logger.warning("The downloaded file may not be a valid PDF")
                    elif response.status_code == 403:
                        logger.warning("Access denied (403 Forbidden), trying another download method")
                        continue
                    else:
                        logger.warning(f"Download failed, status code: {response.status_code}")
                        continue
                except Exception as e:
                    logger.warning(f"Download method {i+1} failed: {str(e)}")
                    continue
            # If every method failed, try constructing alternative URLs
            try:
                logger.info("Trying alternative mirrors for the download...")
                # Extract the key part from the original URL
                if '/downloads/' in pdf_url:
                    file_part = pdf_url.split('/downloads/')[-1]
                    alternative_mirrors = [
                        f"https://sci-hub.se/downloads/{file_part}",
                        f"https://sci-hub.st/downloads/{file_part}",
                        f"https://sci-hub.ru/downloads/{file_part}",
                        f"https://sci-hub.wf/downloads/{file_part}",
                        f"https://sci-hub.ee/downloads/{file_part}",
                        f"https://sci-hub.ren/downloads/{file_part}",
                        f"https://sci-hub.tf/downloads/{file_part}"
                    ]
                    for alt_url in alternative_mirrors:
                        try:
                            response = requests.get(
                                alt_url,
                                proxies=self.proxies,
                                timeout=self.timeout,
                                headers={**self.headers, 'Referer': alt_url.split('/downloads')[0]}
                            )
                            if response.status_code == 200:
                                content = response.content
                                if len(content) > 1000 and self._check_pdf_validity(content):
                                    logger.info(f"Downloaded via alternative mirror: {alt_url}")
                                    return content
                        except Exception as e:
                            logger.debug(f"Alternative mirror {alt_url} failed: {str(e)}")
                            continue
            except Exception as e:
                logger.error(f"All download methods failed: {str(e)}")
            return None
        except Exception as e:
            logger.error(f"Failed to download the PDF file: {str(e)}")
            return None

    def fetch(self):
        """Fetch the paper PDF, with retry and validation."""
        for attempt in range(3):  # retry up to 3 times
            try:
                logger.info(f"Attempt {attempt + 1} to download paper: {self.doi}")
                # Get the PDF download link
                response = self._send_request()
                pdf_url = self._extract_url(response)
                if pdf_url is None:
                    logger.warning(f"Attempt {attempt + 1}: no PDF download link found")
                    continue
                logger.info(f"Found PDF download link: {pdf_url}")
                # Download and validate the PDF
                pdf_content = self._download_pdf(pdf_url)
                if pdf_content is None:
                    logger.warning(f"Attempt {attempt + 1}: PDF download failed")
                    continue
                # Save the PDF file
                pdf_name = f"{self.doi.replace('/', '_').replace(':', '_')}.pdf"
                pdf_path = self.path.joinpath(pdf_name)
                pdf_path.write_bytes(pdf_content)
                logger.info(f"Downloaded paper: {pdf_name}, size: {len(pdf_content)} bytes")
                return str(pdf_path)
            except Exception as e:
                logger.error(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < 2:  # not the last attempt
                    wait_time = (attempt + 1) * 3  # back off progressively
                    logger.info(f"Waiting {wait_time} seconds before retrying...")
                    time.sleep(wait_time)
                    continue
        raise Exception(f"Failed to download paper {self.doi}; all retries were exhausted")


# Usage Example
if __name__ == '__main__':
    # Create a directory for the downloaded PDFs
    save_path = Path('./downloaded_papers')
    save_path.mkdir(exist_ok=True)
    # Example DOI
    sample_doi = '10.3897/rio.7.e67379'
    try:
        # Initialize the SciHub downloader, trying the proxy first
        logger.info("Trying proxy mode...")
        downloader = SciHub(doi=sample_doi, path=save_path, use_proxy=True)
        # Start the download
        result = downloader.fetch()
        print(f"Paper saved to: {result}")
    except Exception as e:
        print(f"Proxy mode failed: {str(e)}")
        try:
            # If proxy mode fails, fall back to a direct connection
            logger.info("Trying direct connection mode...")
            downloader = SciHub(doi=sample_doi, path=save_path, use_proxy=False)
            result = downloader.fetch()
            print(f"Paper saved to: {result}")
        except Exception as e2:
            print(f"Direct connection mode also failed: {str(e2)}")
            print("Check your network connection or try another DOI")