Master 4.0 (#2210)

* stage academic conversation * stage document conversation * fix buggy gradio version * file dynamic load * merge more academic plugins * accelerate nltk * feat: 为predict函数添加文件和URL读取功能 - 添加URL检测和网页内容提取功能，支持自动提取网页文本 - 添加文件路径识别和文件内容读取功能，支持private_upload路径格式 - 集成WebTextExtractor处理网页内容提取 - 集成TextContentLoader处理本地文件读取 - 支持文件路径与问题组合的智能处理 * back * block unstable --------- Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
2025-12-07 06:56:48 +00:00 · 2025-08-23 15:59:22 +08:00
--- a/crazy_functions/review_fns/data_sources/scopus_source.py
+++ b/crazy_functions/review_fns/data_sources/scopus_source.py
@@ -0,0 +1,400 @@
+from typing import List, Optional, Dict, Union
+from datetime import datetime
+import aiohttp
+import random
+from .base_source import DataSource, PaperMetadata
+from tqdm import tqdm
+
+class ScopusSource(DataSource):
+    """Scopus API实现"""
+
+    # 定义API密钥列表
+    API_KEYS = [
+        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
+        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+    ]
+
+    def __init__(self, api_key: str = None):
+        """初始化
+
+        Args:
+            api_key: Scopus API密钥，如果不提供则从预定义列表中随机选择
+        """
+        self.api_key = api_key or random.choice(self.API_KEYS)
+        self._initialize()
+
+    def _initialize(self) -> None:
+        """初始化基础URL和请求头"""
+        self.base_url = "https://api.elsevier.com/content"
+        self.headers = {
+            "X-ELS-APIKey": self.api_key,
+            "Accept": "application/json"
+        }
+
+    async def _make_request(self, url: str, params: Dict = None) -> Optional[Dict]:
+        """发送HTTP请求
+
+        Args:
+            url: 请求URL
+            params: 查询参数
+
+        Returns:
+            响应JSON数据
+        """
+        try:
+            async with aiohttp.ClientSession(headers=self.headers) as session:
+                async with session.get(url, params=params) as response:
+                    if response.status == 200:
+                        return await response.json()
+                    else:
+                        print(f"请求失败: {response.status}")
+                        return None
+        except Exception as e:
+            print(f"请求发生错误: {str(e)}")
+            return None
+
+    def _parse_paper_data(self, data: Dict) -> PaperMetadata:
+        """解析Scopus API返回的数据
+
+        Args:
+            data: Scopus API返回的论文数据
+
+        Returns:
+            解析后的论文元数据
+        """
+        try:
+            # 提取基本信息
+            title = data.get("dc:title", "")
+
+            # 提取作者信息
+            authors = []
+            if "author" in data:
+                if isinstance(data["author"], list):
+                    for author in data["author"]:
+                        if "given-name" in author and "surname" in author:
+                            authors.append(f"{author['given-name']} {author['surname']}")
+                        elif "indexed-name" in author:
+                            authors.append(author["indexed-name"])
+                elif isinstance(data["author"], dict):
+                    if "given-name" in data["author"] and "surname" in data["author"]:
+                        authors.append(f"{data['author']['given-name']} {data['author']['surname']}")
+                    elif "indexed-name" in data["author"]:
+                        authors.append(data["author"]["indexed-name"])
+
+            # 提取摘要
+            abstract = data.get("dc:description", "")
+
+            # 提取年份
+            year = None
+            if "prism:coverDate" in data:
+                try:
+                    year = int(data["prism:coverDate"][:4])
+                except:
+                    pass
+
+            # 提取DOI
+            doi = data.get("prism:doi")
+
+            # 提取引用次数
+            citations = data.get("citedby-count")
+            if citations:
+                try:
+                    citations = int(citations)
+                except:
+                    citations = None
+
+            # 提取期刊信息
+            venue = data.get("prism:publicationName")
+
+            # 提取机构信息
+            institutions = []
+            if "affiliation" in data:
+                if isinstance(data["affiliation"], list):
+                    for aff in data["affiliation"]:
+                        if "affilname" in aff:
+                            institutions.append(aff["affilname"])
+                elif isinstance(data["affiliation"], dict):
+                    if "affilname" in data["affiliation"]:
+                        institutions.append(data["affiliation"]["affilname"])
+
+            # 构建venue信息
+            venue_info = {
+                "issn": data.get("prism:issn"),
+                "eissn": data.get("prism:eIssn"),
+                "volume": data.get("prism:volume"),
+                "issue": data.get("prism:issueIdentifier"),
+                "page_range": data.get("prism:pageRange"),
+                "article_number": data.get("article-number"),
+                "publication_date": data.get("prism:coverDate")
+            }
+
+            return PaperMetadata(
+                title=title,
+                authors=authors,
+                abstract=abstract,
+                year=year,
+                doi=doi,
+                url=data.get("link", [{}])[0].get("@href"),
+                citations=citations,
+                venue=venue,
+                institutions=institutions,
+                venue_type="journal",
+                venue_name=venue,
+                venue_info=venue_info
+            )
+
+        except Exception as e:
+            print(f"解析论文数据时发生错误: {str(e)}")
+            return None
+
+    async def search(
+        self,
+        query: str,
+        limit: int = 100,
+        sort_by: str = None,
+        start_year: int = None
+    ) -> List[PaperMetadata]:
+        """搜索论文
+
+        Args:
+            query: 搜索关键词
+            limit: 返回结果数量限制
+            sort_by: 排序方式 ('relevance', 'date', 'citations')
+            start_year: 起始年份
+
+        Returns:
+            论文列表
+        """
+        try:
+            # 构建查询参数
+            params = {
+                "query": query,
+                "count": min(limit, 100),  # Scopus API单次请求限制
+                "start": 0
+            }
+
+            # 添加年份过滤
+            if start_year:
+                params["date"] = f"{start_year}-present"
+
+            # 添加排序
+            if sort_by:
+                sort_map = {
+                    "relevance": "-score",
+                    "date": "-coverDate",
+                    "citations": "-citedby-count"
+                }
+                if sort_by in sort_map:
+                    params["sort"] = sort_map[sort_by]
+
+            # 发送请求
+            url = f"{self.base_url}/search/scopus"
+            response = await self._make_request(url, params)
+
+            if not response or "search-results" not in response:
+                return []
+
+            # 解析结果
+            results = response["search-results"].get("entry", [])
+            papers = []
+
+            for result in results:
+                paper = self._parse_paper_data(result)
+                if paper:
+                    papers.append(paper)
+
+            return papers
+
+        except Exception as e:
+            print(f"搜索论文时发生错误: {str(e)}")
+            return []
+
+    async def get_paper_details(self, paper_id: str) -> Optional[PaperMetadata]:
+        """获取论文详情
+
+        Args:
+            paper_id: Scopus ID或DOI
+
+        Returns:
+            论文详情
+        """
+        try:
+            # 判断是否为DOI
+            if "/" in paper_id:
+                url = f"{self.base_url}/article/doi/{paper_id}"
+            else:
+                url = f"{self.base_url}/abstract/scopus_id/{paper_id}"
+
+            response = await self._make_request(url)
+
+            if not response or "abstracts-retrieval-response" not in response:
+                return None
+
+            data = response["abstracts-retrieval-response"]
+            return self._parse_paper_data(data)
+
+        except Exception as e:
+            print(f"获取论文详情时发生错误: {str(e)}")
+            return None
+
+    async def get_citations(self, paper_id: str) -> List[PaperMetadata]:
+        """获取引用该论文的文献
+
+        Args:
+            paper_id: Scopus ID
+
+        Returns:
+            引用论文列表
+        """
+        try:
+            url = f"{self.base_url}/abstract/citations/{paper_id}"
+            response = await self._make_request(url)
+
+            if not response or "citing-papers" not in response:
+                return []
+
+            results = response["citing-papers"].get("papers", [])
+            papers = []
+
+            for result in results:
+                paper = self._parse_paper_data(result)
+                if paper:
+                    papers.append(paper)
+
+            return papers
+
+        except Exception as e:
+            print(f"获取引用信息时发生错误: {str(e)}")
+            return []
+
+    async def get_references(self, paper_id: str) -> List[PaperMetadata]:
+        """获取该论文引用的文献
+
+        Args:
+            paper_id: Scopus ID
+
+        Returns:
+            参考文献列表
+        """
+        try:
+            url = f"{self.base_url}/abstract/references/{paper_id}"
+            response = await self._make_request(url)
+
+            if not response or "references" not in response:
+                return []
+
+            results = response["references"].get("reference", [])
+            papers = []
+
+            for result in results:
+                paper = self._parse_paper_data(result)
+                if paper:
+                    papers.append(paper)
+
+            return papers
+
+        except Exception as e:
+            print(f"获取参考文献时发生错误: {str(e)}")
+            return []
+
+    async def search_by_author(
+        self,
+        author: str,
+        limit: int = 100,
+        start_year: int = None
+    ) -> List[PaperMetadata]:
+        """按作者搜索论文"""
+        query = f"AUTHOR-NAME({author})"
+        if start_year:
+            query += f" AND PUBYEAR > {start_year}"
+        return await self.search(query, limit=limit)
+
+    async def search_by_journal(
+        self,
+        journal: str,
+        limit: int = 100,
+        start_year: int = None
+    ) -> List[PaperMetadata]:
+        """按期刊搜索论文"""
+        query = f"SRCTITLE({journal})"
+        if start_year:
+            query += f" AND PUBYEAR > {start_year}"
+        return await self.search(query, limit=limit)
+
+    async def get_latest_papers(
+        self,
+        days: int = 7,
+        limit: int = 100
+    ) -> List[PaperMetadata]:
+        """获取最新论文"""
+        query = f"LOAD-DATE > NOW() - {days}d"
+        return await self.search(query, limit=limit, sort_by="date")
+
+async def example_usage():
+    """ScopusSource使用示例"""
+    scopus = ScopusSource()
+
+    try:
+        # 示例1：基本搜索
+        print("\n=== 示例1：搜索机器学习相关论文 ===")
+        papers = await scopus.search("machine learning", limit=3)
+        print(f"\n找到 {len(papers)} 篇相关论文:")
+        for i, paper in enumerate(papers, 1):
+            print(f"\n论文 {i}:")
+            print(f"标题: {paper.title}")
+            print(f"作者: {', '.join(paper.authors)}")
+            print(f"发表年份: {paper.year}")
+            print(f"发表期刊: {paper.venue}")
+            print(f"引用次数: {paper.citations}")
+            print(f"DOI: {paper.doi}")
+            if paper.abstract:
+                print(f"摘要:\n{paper.abstract}")
+            print("-" * 80)
+
+        # 示例2：按作者搜索
+        print("\n=== 示例2：搜索特定作者的论文 ===")
+        author_papers = await scopus.search_by_author("Hinton G.", limit=3)
+        print(f"\n找到 {len(author_papers)} 篇 Hinton 的论文:")
+        for i, paper in enumerate(author_papers, 1):
+            print(f"\n论文 {i}:")
+            print(f"标题: {paper.title}")
+            print(f"作者: {', '.join(paper.authors)}")
+            print(f"发表年份: {paper.year}")
+            print(f"发表期刊: {paper.venue}")
+            print(f"引用次数: {paper.citations}")
+            print(f"DOI: {paper.doi}")
+            if paper.abstract:
+                print(f"摘要:\n{paper.abstract}")
+            print("-" * 80)
+
+        # 示例3：根据关键词搜索相关论文
+        print("\n=== 示例3：搜索人工智能相关论文 ===")
+        keywords = "artificial intelligence AND deep learning"
+        papers = await scopus.search(
+            query=keywords,
+            limit=5,
+            sort_by="citations",  # 按引用次数排序
+            start_year=2020  # 只搜索2020年之后的论文
+        )
+
+        print(f"\n找到 {len(papers)} 篇相关论文:")
+        for i, paper in enumerate(papers, 1):
+            print(f"\n论文 {i}:")
+            print(f"标题: {paper.title}")
+            print(f"作者: {', '.join(paper.authors)}")
+            print(f"发表年份: {paper.year}")
+            print(f"发表期刊: {paper.venue}")
+            print(f"引用次数: {paper.citations}")
+            print(f"DOI: {paper.doi}")
+            if paper.abstract:
+                print(f"摘要:\n{paper.abstract}")
+            print("-" * 80)
+
+    except Exception as e:
+        print(f"发生错误: {str(e)}")
+        import traceback
+        print(traceback.format_exc())
+
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(example_usage())