Master 4.0 (#2210)

* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: add file and URL reading support to the predict function
- Add URL detection and web page content extraction, with automatic extraction of page text
- Add file path recognition and file content reading, supporting the private_upload path format
- Integrate WebTextExtractor for web page content extraction
- Integrate TextContentLoader for local file reading
- Support combined handling of a file path plus a question (sketched below)
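
A minimal, self-contained sketch of the routing idea described above (this is not the commit's actual predict() change; WebTextExtractor and TextContentLoader are stubbed with plain requests/BeautifulSoup and pathlib stand-ins, and the private_upload prefix check is an assumption based on the path format named above):

import re
from pathlib import Path

import requests
from bs4 import BeautifulSoup

URL_RE = re.compile(r'https?://\S+')

def extract_web_text(url: str) -> str:
    """Stand-in for WebTextExtractor: fetch a page and keep its visible text."""
    html = requests.get(url, timeout=30).text
    return BeautifulSoup(html, 'html.parser').get_text(' ', strip=True)

def load_file_text(path: str) -> str:
    """Stand-in for TextContentLoader: read a local file as text."""
    return Path(path).read_text(encoding='utf-8', errors='ignore')

def preprocess_prompt(txt: str) -> str:
    """Expand a URL or a private_upload file path found in the user prompt."""
    m = URL_RE.search(txt)
    if m:
        return f"{txt}\n\n[web content]\n{extract_web_text(m.group(0))}"
    for token in txt.split():
        if token.startswith('private_upload/') and Path(token).is_file():
            return f"{txt}\n\n[file content]\n{load_file_text(token)}"
    return txt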

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
This commit is contained in:
binary-husky
2025-08-23 15:59:22 +08:00
Committed by GitHub
Parent 65a4cf59c2
Commit 8042750d41
79 files changed, 20850 insertions(+), 57 deletions(-)

@@ -0,0 +1,326 @@
from pathlib import Path
import requests
from bs4 import BeautifulSoup
import time
from loguru import logger
import PyPDF2
import io


class SciHub:
    # Updated mirror list with more mirrors that are known to work
    MIRRORS = [
        'https://sci-hub.se/',
        'https://sci-hub.st/',
        'https://sci-hub.ru/',
        'https://sci-hub.wf/',
        'https://sci-hub.ee/',
        'https://sci-hub.ren/',
        'https://sci-hub.tf/',
        'https://sci-hub.si/',
        'https://sci-hub.do/',
        'https://sci-hub.hkvisa.net/',
        'https://sci-hub.mksa.top/',
        'https://sci-hub.shop/',
        'https://sci-hub.yncjkj.com/',
        'https://sci-hub.41610.org/',
        'https://sci-hub.automic.us/',
        'https://sci-hub.et-fine.com/',
        'https://sci-hub.pooh.mu/',
        'https://sci-hub.bban.top/',
        'https://sci-hub.usualwant.com/',
        'https://sci-hub.unblockit.kim/'
    ]

    def __init__(self, doi: str, path: Path, url=None, timeout=60, use_proxy=True):
        self.timeout = timeout
        self.path = path
        self.doi = str(doi)
        self.use_proxy = use_proxy
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }
        self.payload = {
            'sci-hub-plugin-check': '',
            'request': self.doi
        }
        self.url = url if url else self.MIRRORS[0]
        self.proxies = {
            "http": "socks5h://localhost:10880",
            "https": "socks5h://localhost:10880",
        } if use_proxy else None

    def _test_proxy_connection(self):
        """Check whether the proxy connection is usable."""
        if not self.use_proxy:
            return True
        try:
            # Probe the proxy with a simple request
            test_response = requests.get(
                'https://httpbin.org/ip',
                proxies=self.proxies,
                timeout=10
            )
            if test_response.status_code == 200:
                logger.info("Proxy connection test succeeded")
                return True
        except Exception as e:
            logger.warning(f"Proxy connection test failed: {str(e)}")
            return False
        return False

    def _check_pdf_validity(self, content):
        """Check whether the PDF content is valid."""
        try:
            # Use PyPDF2 to verify the PDF can be opened and read
            pdf = PyPDF2.PdfReader(io.BytesIO(content))
            if len(pdf.pages) > 0:
                return True
            return False
        except Exception as e:
            logger.error(f"Invalid PDF file: {str(e)}")
            return False

    def _send_request(self):
        """Send the request to a Sci-Hub mirror."""
        # Test the proxy connection first
        if self.use_proxy and not self._test_proxy_connection():
            logger.warning("Proxy unavailable, falling back to a direct connection")
            self.use_proxy = False
            self.proxies = None
        last_exception = None
        working_mirrors = []
        # Probe which mirrors are reachable
        logger.info("Testing mirror availability...")
        for mirror in self.MIRRORS:
            try:
                test_response = requests.get(
                    mirror,
                    headers=self.headers,
                    proxies=self.proxies,
                    timeout=10
                )
                if test_response.status_code == 200:
                    working_mirrors.append(mirror)
                    logger.info(f"Mirror {mirror} is reachable")
                    if len(working_mirrors) >= 5:  # five working mirrors are enough
                        break
            except Exception as e:
                logger.debug(f"Mirror {mirror} is unreachable: {str(e)}")
                continue
        if not working_mirrors:
            raise Exception("No reachable mirror was found")
        logger.info(f"Found {len(working_mirrors)} reachable mirrors, starting download attempts...")
        # Try to download via the reachable mirrors
        for mirror in working_mirrors:
            try:
                res = requests.post(
                    mirror,
                    headers=self.headers,
                    data=self.payload,
                    proxies=self.proxies,
                    timeout=self.timeout
                )
                if res.ok:
                    logger.info(f"Successfully using mirror: {mirror}")
                    self.url = mirror  # remember the mirror that worked
                    time.sleep(1)  # short pause, kept low for efficiency
                    return res
            except Exception as e:
                logger.error(f"Mirror {mirror} failed: {str(e)}")
                last_exception = e
                continue
        if last_exception:
            raise last_exception
        raise Exception("None of the reachable mirrors could complete the download")

    def _extract_url(self, response):
        """Extract the PDF download link from the response."""
        soup = BeautifulSoup(response.content, 'html.parser')
        try:
            # Try several ways of locating the PDF link
            pdf_element = soup.find(id='pdf')
            if pdf_element:
                content_url = pdf_element.get('src')
            else:
                # Try other plausible selectors
                pdf_element = soup.find('iframe')
                if pdf_element:
                    content_url = pdf_element.get('src')
                else:
                    # Look for direct PDF links
                    pdf_links = soup.find_all('a', href=lambda x: x and '.pdf' in x)
                    if pdf_links:
                        content_url = pdf_links[0].get('href')
                    else:
                        raise AttributeError("No PDF link found")
            if not content_url:
                raise AttributeError("No PDF link found")
            # Strip the PDF-viewer fragment from the URL
            content_url = content_url.replace('#navpanes=0&view=FitH', '')
            if not content_url.endswith('.pdf') and 'pdf' not in content_url.lower():
                raise AttributeError("The extracted link is not a PDF file")
        except AttributeError:
            logger.error(f"Paper {self.doi} not found")
            return None
        current_mirror = self.url.rstrip('/')
        if content_url.startswith('//'):
            # Protocol-relative URL
            return 'https:' + content_url
        elif content_url.startswith('http'):
            return content_url
        elif content_url.startswith('/'):
            return current_mirror + content_url
        else:
            return current_mirror + '/' + content_url

    def _download_pdf(self, pdf_url):
        """Download the PDF file and verify its integrity."""
        try:
            # Try several download strategies
            download_methods = [
                # Method 1: plain download
                lambda: requests.get(pdf_url, proxies=self.proxies, timeout=self.timeout),
                # Method 2: add a Referer header
                lambda: requests.get(pdf_url, proxies=self.proxies, timeout=self.timeout,
                                     headers={**self.headers, 'Referer': self.url}),
                # Method 3: use the original domain as the Referer
                lambda: requests.get(pdf_url, proxies=self.proxies, timeout=self.timeout,
                                     headers={**self.headers, 'Referer': pdf_url.split('/downloads')[0] if '/downloads' in pdf_url else self.url})
            ]
            for i, download_method in enumerate(download_methods):
                try:
                    logger.info(f"Trying download method {i+1}/3...")
                    response = download_method()
                    if response.status_code == 200:
                        content = response.content
                        if len(content) > 1000 and self._check_pdf_validity(content):  # make sure the file is not suspiciously small
                            logger.info(f"PDF downloaded successfully, size: {len(content)} bytes")
                            return content
                        else:
                            logger.warning("The downloaded file may not be a valid PDF")
                    elif response.status_code == 403:
                        logger.warning("Access denied (403 Forbidden), trying another download method")
                        continue
                    else:
                        logger.warning(f"Download failed, status code: {response.status_code}")
                        continue
                except Exception as e:
                    logger.warning(f"Download method {i+1} failed: {str(e)}")
                    continue
            # If every method failed, try constructing alternative URLs
            try:
                logger.info("Trying alternative mirrors for the download...")
                # Extract the key part from the original URL
                if '/downloads/' in pdf_url:
                    file_part = pdf_url.split('/downloads/')[-1]
                    alternative_mirrors = [
                        f"https://sci-hub.se/downloads/{file_part}",
                        f"https://sci-hub.st/downloads/{file_part}",
                        f"https://sci-hub.ru/downloads/{file_part}",
                        f"https://sci-hub.wf/downloads/{file_part}",
                        f"https://sci-hub.ee/downloads/{file_part}",
                        f"https://sci-hub.ren/downloads/{file_part}",
                        f"https://sci-hub.tf/downloads/{file_part}"
                    ]
                    for alt_url in alternative_mirrors:
                        try:
                            response = requests.get(
                                alt_url,
                                proxies=self.proxies,
                                timeout=self.timeout,
                                headers={**self.headers, 'Referer': alt_url.split('/downloads')[0]}
                            )
                            if response.status_code == 200:
                                content = response.content
                                if len(content) > 1000 and self._check_pdf_validity(content):
                                    logger.info(f"Downloaded via alternative mirror: {alt_url}")
                                    return content
                        except Exception as e:
                            logger.debug(f"Alternative mirror {alt_url} failed: {str(e)}")
                            continue
            except Exception as e:
                logger.error(f"All download methods failed: {str(e)}")
            return None
        except Exception as e:
            logger.error(f"Failed to download the PDF file: {str(e)}")
            return None

    def fetch(self):
        """Fetch the paper PDF, with retry and validation."""
        for attempt in range(3):  # retry up to 3 times
            try:
                logger.info(f"Attempt {attempt + 1} to download paper: {self.doi}")
                # Get the PDF download link
                response = self._send_request()
                pdf_url = self._extract_url(response)
                if pdf_url is None:
                    logger.warning(f"Attempt {attempt + 1}: no PDF download link found")
                    continue
                logger.info(f"Found PDF download link: {pdf_url}")
                # Download and validate the PDF
                pdf_content = self._download_pdf(pdf_url)
                if pdf_content is None:
                    logger.warning(f"Attempt {attempt + 1}: PDF download failed")
                    continue
                # Save the PDF file
                pdf_name = f"{self.doi.replace('/', '_').replace(':', '_')}.pdf"
                pdf_path = self.path.joinpath(pdf_name)
                pdf_path.write_bytes(pdf_content)
                logger.info(f"Downloaded paper: {pdf_name}, size: {len(pdf_content)} bytes")
                return str(pdf_path)
            except Exception as e:
                logger.error(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < 2:  # not the last attempt
                    wait_time = (attempt + 1) * 3  # back off progressively
                    logger.info(f"Waiting {wait_time} seconds before retrying...")
                    time.sleep(wait_time)
                    continue
        raise Exception(f"Failed to download paper {self.doi}; all retries were exhausted")


# Usage Example
if __name__ == '__main__':
    # Create a directory for the downloaded PDFs
    save_path = Path('./downloaded_papers')
    save_path.mkdir(exist_ok=True)
    # Example DOI
    sample_doi = '10.3897/rio.7.e67379'
    try:
        # Initialize the SciHub downloader, trying the proxy first
        logger.info("Trying proxy mode...")
        downloader = SciHub(doi=sample_doi, path=save_path, use_proxy=True)
        # Start the download
        result = downloader.fetch()
        print(f"Paper saved to: {result}")
    except Exception as e:
        print(f"Proxy mode failed: {str(e)}")
        try:
            # If proxy mode fails, fall back to a direct connection
            logger.info("Trying direct connection mode...")
            downloader = SciHub(doi=sample_doi, path=save_path, use_proxy=False)
            result = downloader.fetch()
            print(f"Paper saved to: {result}")
        except Exception as e2:
            print(f"Direct connection mode also failed: {str(e2)}")
            print("Check your network connection or try another DOI")