Master 4.0 (#2210)

* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: add file and URL reading support to the predict function
- Add URL detection and web-page content extraction, with automatic extraction of page text
- Add file-path recognition and file content reading, supporting the private_upload path format
- Integrate WebTextExtractor for web-page content extraction
- Integrate TextContentLoader for local file reading
- Support smart handling of a file path combined with a question (see the sketch after this list)
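
A minimal sketch of what this dispatch could look like inside predict, assuming hypothetical extract_text / load helpers; WebTextExtractor and TextContentLoader below are illustrative stand-ins, not the project's actual API:

import os
import re
import urllib.request

URL_PATTERN = re.compile(r"https?://\S+")

class WebTextExtractor:
    # Stand-in for the project's WebTextExtractor; only sketches the idea.
    def extract_text(self, url: str) -> str:
        with urllib.request.urlopen(url, timeout=10) as resp:
            return resp.read().decode("utf-8", errors="ignore")

class TextContentLoader:
    # Stand-in for the project's TextContentLoader; only sketches the idea.
    def load(self, path: str) -> str:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()

def enrich_user_input(txt: str) -> str:
    """Illustrative only: expand URLs and private_upload file paths found in the user input."""
    match = URL_PATTERN.search(txt)
    if match:
        # URL detected: pull in the page text and append it to the question
        page_text = WebTextExtractor().extract_text(match.group(0))
        return f"{txt}\n\n[Extracted web page content]\n{page_text}"
    candidate = txt.strip()
    if "private_upload" in candidate and os.path.exists(candidate):
        # Uploaded file path detected: read the file and combine it with the question
        file_text = TextContentLoader().load(candidate)
        return f"Please answer based on the following file:\n{file_text}"
    return txt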

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
Author: binary-husky
Date: 2025-08-23 15:59:22 +08:00
Committed by: GitHub
Parent: 65a4cf59c2
Commit: 8042750d41
79 files changed, 20850 insertions, 57 deletions

@@ -0,0 +1,387 @@
import aiohttp
import asyncio
from typing import List, Dict, Optional
import re
import random
import time


class WikipediaAPI:
    """Wikipedia API client implementation."""

    def __init__(self, language: str = "zh", user_agent: str = None,
                 max_concurrent: int = 5, request_delay: float = 0.5):
        """
        Initialize the Wikipedia API client.

        Args:
            language: Language code (zh: Chinese, en: English, ja: Japanese, etc.)
            user_agent: User-Agent string; a default is used when None
            max_concurrent: Maximum number of concurrent requests
            request_delay: Delay between requests (seconds)
        """
        self.language = language
        self.base_url = f"https://{language}.wikipedia.org/w/api.php"
        self.user_agent = user_agent or "WikipediaAPIClient/1.0 (chatscholar@163.com)"
        self.headers = {
            "User-Agent": self.user_agent,
            "Accept": "application/json"
        }
        # Concurrency control
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.request_delay = request_delay
        self.last_request_time = 0
    async def _make_request(self, url, params=None):
        """
        Issue an API request with concurrency control and request throttling.

        Args:
            url: Request URL
            params: Request parameters

        Returns:
            Parsed JSON response, or None on failure
        """
        # Limit concurrency with the semaphore
        async with self.semaphore:
            # Enforce a minimum interval between requests
            current_time = time.time()
            time_since_last_request = current_time - self.last_request_time
            if time_since_last_request < self.request_delay:
                await asyncio.sleep(self.request_delay - time_since_last_request)
            # Add random jitter to avoid a regular request pattern
            jitter = random.uniform(0, 0.2)
            await asyncio.sleep(jitter)
            # Record the time of this request
            self.last_request_time = time.time()
            # Issue the request
            try:
                async with aiohttp.ClientSession(headers=self.headers) as session:
                    async with session.get(url, params=params) as response:
                        if response.status == 429:  # Too Many Requests
                            retry_after = int(response.headers.get('Retry-After', 5))
                            print(f"Rate limit reached, retrying in {retry_after} seconds...")
                            await asyncio.sleep(retry_after)
                            return await self._make_request(url, params)
                        if response.status != 200:
                            print(f"API request failed: HTTP {response.status}")
                            print(f"Response body: {await response.text()}")
                            return None
                        return await response.json()
            except aiohttp.ClientError as e:
                print(f"Request error: {str(e)}")
                return None
    async def search(self, query: str, limit: int = 10, namespace: int = 0) -> List[Dict]:
        """
        Search Wikipedia articles.

        Args:
            query: Search keywords
            limit: Number of results to return
            namespace: Namespace (0 for articles, 14 for categories, etc.)

        Returns:
            List of search results
        """
        params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            "format": "json",
            "srlimit": limit,
            "srnamespace": namespace,
            "srprop": "snippet|titlesnippet|sectiontitle|categorysnippet|size|wordcount|timestamp|redirecttitle"
        }
        data = await self._make_request(self.base_url, params)
        if not data:
            return []
        search_results = data.get("query", {}).get("search", [])
        return search_results
    async def get_page_content(self, title: str, section: Optional[int] = None) -> Dict:
        """
        Fetch the content of a Wikipedia page.

        Args:
            title: Page title
            section: Specific section number (optional)

        Returns:
            Page content dictionary
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            params = {
                "action": "parse",
                "page": title,
                "format": "json",
                "prop": "text|langlinks|categories|links|templates|images|externallinks|sections|revid|displaytitle|iwlinks|properties"
            }
            # When a section is specified, fetch only that section
            if section is not None:
                params["section"] = section
            async with session.get(self.base_url, params=params) as response:
                if response.status != 200:
                    print(f"API request failed: HTTP {response.status}")
                    return {}
                data = await response.json()
                if "error" in data:
                    print(f"API error: {data['error'].get('info', 'unknown error')}")
                    return {}
                return data.get("parse", {})
    async def get_summary(self, title: str, sentences: int = 3) -> str:
        """
        Fetch a page summary.

        Args:
            title: Page title
            sentences: Number of sentences to return

        Returns:
            Page summary text
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            params = {
                "action": "query",
                "prop": "extracts",
                "exintro": "1",
                "exsentences": sentences,
                "explaintext": "1",
                "titles": title,
                "format": "json"
            }
            async with session.get(self.base_url, params=params) as response:
                if response.status != 200:
                    print(f"API request failed: HTTP {response.status}")
                    return ""
                data = await response.json()
                pages = data.get("query", {}).get("pages", {})
                # Return the extract of the first page ID
                for page_id in pages:
                    return pages[page_id].get("extract", "")
                return ""
    async def get_random_articles(self, count: int = 1, namespace: int = 0) -> List[Dict]:
        """
        Fetch random articles.

        Args:
            count: Number of random articles to fetch
            namespace: Namespace

        Returns:
            List of random articles
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            params = {
                "action": "query",
                "list": "random",
                "rnlimit": count,
                "rnnamespace": namespace,
                "format": "json"
            }
            async with session.get(self.base_url, params=params) as response:
                if response.status != 200:
                    print(f"API request failed: HTTP {response.status}")
                    return []
                data = await response.json()
                return data.get("query", {}).get("random", [])
    async def login(self, username: str, password: str) -> bool:
        """
        Log in with a Wikipedia account.

        Args:
            username: Wikipedia username
            password: Wikipedia password

        Returns:
            Whether the login succeeded
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            # Fetch a login token
            params = {
                "action": "query",
                "meta": "tokens",
                "type": "login",
                "format": "json"
            }
            async with session.get(self.base_url, params=params) as response:
                if response.status != 200:
                    print(f"Failed to fetch login token: HTTP {response.status}")
                    return False
                data = await response.json()
                login_token = data.get("query", {}).get("tokens", {}).get("logintoken")
                if not login_token:
                    print("Failed to fetch login token")
                    return False
            # Log in with the token
            login_params = {
                "action": "login",
                "lgname": username,
                "lgpassword": password,
                "lgtoken": login_token,
                "format": "json"
            }
            async with session.post(self.base_url, data=login_params) as login_response:
                login_data = await login_response.json()
                if login_data.get("login", {}).get("result") == "Success":
                    print(f"Logged in successfully: {username}")
                    return True
                else:
                    print(f"Login failed: {login_data.get('login', {}).get('reason', 'unknown reason')}")
                    return False
    async def setup_oauth(self, consumer_token: str, consumer_secret: str,
                          access_token: str = None, access_secret: str = None) -> bool:
        """
        Configure OAuth authentication.

        Args:
            consumer_token: Consumer token
            consumer_secret: Consumer secret
            access_token: Access token (optional)
            access_secret: Access secret (optional)

        Returns:
            Whether the setup succeeded
        """
        try:
            # Requires the mwoauth library: pip install mwoauth
            import mwoauth
            import requests_oauthlib
            # Configure OAuth
            self.consumer_token = consumer_token
            self.consumer_secret = consumer_secret
            if access_token and access_secret:
                # An access token is already available
                self.auth = requests_oauthlib.OAuth1(
                    consumer_token,
                    consumer_secret,
                    access_token,
                    access_secret
                )
                print("OAuth configured successfully")
                return True
            else:
                # An access token must still be obtained (this normally requires the user to authorize in a browser)
                print("Please complete the following OAuth authorization flow in a development environment:")
                # Create the consumer
                consumer = mwoauth.Consumer(
                    consumer_token, consumer_secret
                )
                # Initiate the handshake
                redirect, request_token = mwoauth.initiate(
                    f"https://{self.language}.wikipedia.org/w/index.php",
                    consumer
                )
                print(f"Visit this URL to authorize the application: {redirect}")
                # Normally the user is prompted to visit the URL and enter an authorization code;
                # a real application needs to implement the full authorization flow.
                return False
        except ImportError:
            print("Please install the mwoauth library: pip install mwoauth")
            return False
        except Exception as e:
            print(f"Error while configuring OAuth: {str(e)}")
            return False

async def example_usage():
    """Demonstrate how to use WikipediaAPI."""
    # Create a client for the Chinese Wikipedia (the default)
    wiki_zh = WikipediaAPI(language="zh")
    try:
        # Example 1: basic search
        print("\n=== Example 1: search Wikipedia ===")
        results = await wiki_zh.search("人工智能", limit=3)
        for i, result in enumerate(results, 1):
            print(f"\n--- Result {i} ---")
            print(f"Title: {result.get('title')}")
            snippet = result.get('snippet', '')
            # Strip HTML tags
            snippet = re.sub(r'<.*?>', '', snippet)
            print(f"Snippet: {snippet}")
            print(f"Word count: {result.get('wordcount')}")
            print(f"Size: {result.get('size')} bytes")

        # Example 2: fetch a page summary
        print("\n=== Example 2: fetch a page summary ===")
        summary = await wiki_zh.get_summary("深度学习", sentences=2)
        print(f"Summary of 深度学习: {summary}")

        # Example 3: fetch page content
        print("\n=== Example 3: fetch page content ===")
        content = await wiki_zh.get_page_content("机器学习")
        if content and "text" in content:
            text = content["text"].get("*", "")
            # Strip HTML tags for console display
            clean_text = re.sub(r'<.*?>', '', text)
            print(f"Excerpt of the 机器学习 page: {clean_text[:200]}...")
            # Number of categories on the page
            categories = content.get("categories", [])
            print(f"Number of categories: {len(categories)}")
            # Number of links on the page
            links = content.get("links", [])
            print(f"Number of links: {len(links)}")

        # Example 4: fetch a specific section
        print("\n=== Example 4: fetch a specific section ===")
        # Fetch the lead section (usually section 0)
        intro_content = await wiki_zh.get_page_content("人工智能", section=0)
        if intro_content and "text" in intro_content:
            intro_text = intro_content["text"].get("*", "")
            clean_intro = re.sub(r'<.*?>', '', intro_text)
            print(f"Lead section excerpt of 人工智能: {clean_intro[:200]}...")

        # Example 5: fetch random articles
        print("\n=== Example 5: fetch random articles ===")
        random_articles = await wiki_zh.get_random_articles(count=2)
        print("Random articles:")
        for i, article in enumerate(random_articles, 1):
            print(f"{i}. {article.get('title')}")
            # Print a short summary of each random article
            article_summary = await wiki_zh.get_summary(article.get('title'), sentences=1)
            print(f"   Summary: {article_summary[:100]}...")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        import traceback
        print(traceback.format_exc())

if __name__ == "__main__":
    # Run the example
    asyncio.run(example_usage())