Master 4.0 (#2210)

* stage academic conversation * stage document conversation * fix buggy gradio version * file dynamic load * merge more academic plugins * accelerate nltk * feat: 为predict函数添加文件和URL读取功能 - 添加URL检测和网页内容提取功能，支持自动提取网页文本 - 添加文件路径识别和文件内容读取功能，支持private_upload路径格式 - 集成WebTextExtractor处理网页内容提取 - 集成TextContentLoader处理本地文件读取 - 支持文件路径与问题组合的智能处理 * back * block unstable --------- Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
2025-12-06 06:26:47 +00:00 · 2025-08-23 15:59:22 +08:00
--- a/crazy_functions/review_fns/data_sources/unpaywall_source.py
+++ b/crazy_functions/review_fns/data_sources/unpaywall_source.py
@@ -0,0 +1,76 @@
+import aiohttp
+from typing import List, Dict, Optional
+from datetime import datetime
+from .base_source import DataSource, PaperMetadata
+
+class UnpaywallSource(DataSource):
+    """Data source backed by the Unpaywall REST API (v2).
+
+    Unpaywall identifies callers by email address rather than by a secret
+    key, so the value held in ``self.api_key`` is sent as the ``email``
+    query parameter.
+    """
+
+    def _initialize(self) -> None:
+        """Set the API base URL and the email sent with every request."""
+        self.base_url = "https://api.unpaywall.org/v2"
+        self.email = self.api_key  # Unpaywall uses an email address as its API key
+
+    async def search(self, query: str, limit: int = 100) -> List[PaperMetadata]:
+        """Search Unpaywall and return the matching papers.
+
+        Args:
+            query: Free-text search string.
+            limit: Maximum number of papers to return.
+
+        Returns:
+            One ``PaperMetadata`` per usable search hit; an empty list when
+            the payload carries no ``results``.
+        """
+        async with aiohttp.ClientSession() as session:
+            async with session.get(
+                f"{self.base_url}/search",
+                params={
+                    "query": query,
+                    "email": self.email,
+                    "limit": limit,
+                },
+            ) as response:
+                data = await response.json()
+
+        # An error payload may omit "results" or set it to null.
+        results = (data.get("results") if isinstance(data, dict) else None) or []
+        # Each hit is a plain dict wrapping the paper record under the
+        # "response" key, so it is read by key rather than by attribute.
+        records = [
+            hit["response"]
+            for hit in results
+            if isinstance(hit, dict) and isinstance(hit.get("response"), dict)
+        ]
+        # Enforce the limit locally in case the server ignores the parameter.
+        return [self._parse_response(record) for record in records[:max(limit, 0)]]
+
+    def _parse_response(self, data: Dict) -> PaperMetadata:
+        """Convert one Unpaywall paper record into ``PaperMetadata``.
+
+        Null or missing author and affiliation lists are treated as empty.
+        """
+        authors = [a for a in (data.get("z_authors") or []) if isinstance(a, dict)]
+        return PaperMetadata(
+            title=data.get("title") or "",
+            authors=[
+                f"{author.get('given') or ''} {author.get('family') or ''}".strip()
+                for author in authors
+            ],
+            institutions=[
+                aff.get("name", "")
+                for author in authors
+                for aff in (author.get("affiliation") or [])
+            ],
+            abstract="",  # Unpaywall does not provide abstracts
+            year=data.get("year"),
+            doi=data.get("doi"),
+            url=data.get("doi_url"),
+            citations=None,  # Unpaywall does not provide citation counts
+            venue=data.get("journal_name"),
+        )