Master 4.0 (#2210)

* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: add file and URL reading support to the predict function
- Add URL detection and web-page content extraction, with automatic extraction of page text
- Add file-path recognition and file content reading, supporting the private_upload path format
- Integrate WebTextExtractor for web-page content extraction
- Integrate TextContentLoader for local file reading
- Support smart handling of a file path combined with a question (see the sketch after this list)
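
A minimal sketch of what this dispatch could look like inside predict, assuming hypothetical extract_text / load helpers; WebTextExtractor and TextContentLoader below are illustrative stand-ins, not the project's actual API:

import os
import re
import urllib.request

URL_PATTERN = re.compile(r"https?://\S+")

class WebTextExtractor:
    # Stand-in for the project's WebTextExtractor; only sketches the idea.
    def extract_text(self, url: str) -> str:
        with urllib.request.urlopen(url, timeout=10) as resp:
            return resp.read().decode("utf-8", errors="ignore")

class TextContentLoader:
    # Stand-in for the project's TextContentLoader; only sketches the idea.
    def load(self, path: str) -> str:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()

def enrich_user_input(txt: str) -> str:
    """Illustrative only: expand URLs and private_upload file paths found in the user input."""
    match = URL_PATTERN.search(txt)
    if match:
        # URL detected: pull in the page text and append it to the question
        page_text = WebTextExtractor().extract_text(match.group(0))
        return f"{txt}\n\n[Extracted web page content]\n{page_text}"
    candidate = txt.strip()
    if "private_upload" in candidate and os.path.exists(candidate):
        # Uploaded file path detected: read the file and combine it with the question
        file_text = TextContentLoader().load(candidate)
        return f"Please answer based on the following file:\n{file_text}"
    return txt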

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
Author: binary-husky
Date: 2025-08-23 15:59:22 +08:00
Committed by: GitHub
Parent: 65a4cf59c2
Commit: 8042750d41
79 files changed, 20850 insertions, 57 deletions

@@ -0,0 +1,387 @@
import aiohttp
import asyncio
from typing import List, Dict, Optional
import re
import random
import time


class WikipediaAPI:
    """Wikipedia API client implementation."""

    def __init__(self, language: str = "zh", user_agent: str = None,
                 max_concurrent: int = 5, request_delay: float = 0.5):
        """
        Initialize the Wikipedia API client.

        Args:
            language: Language code (zh: Chinese, en: English, ja: Japanese, etc.)
            user_agent: User-Agent string; a default is used when None
            max_concurrent: Maximum number of concurrent requests
            request_delay: Delay between requests (seconds)
        """
        self.language = language
        self.base_url = f"https://{language}.wikipedia.org/w/api.php"
        self.user_agent = user_agent or "WikipediaAPIClient/1.0 (chatscholar@163.com)"
        self.headers = {
            "User-Agent": self.user_agent,
            "Accept": "application/json"
        }
        # Concurrency control
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.request_delay = request_delay
        self.last_request_time = 0
    async def _make_request(self, url, params=None):
        """
        Issue an API request with concurrency control and request throttling.

        Args:
            url: Request URL
            params: Request parameters

        Returns:
            Parsed JSON response, or None on failure
        """
        # Limit concurrency with the semaphore
        async with self.semaphore:
            # Enforce a minimum interval between requests
            current_time = time.time()
            time_since_last_request = current_time - self.last_request_time
            if time_since_last_request < self.request_delay:
                await asyncio.sleep(self.request_delay - time_since_last_request)
            # Add random jitter to avoid a regular request pattern
            jitter = random.uniform(0, 0.2)
            await asyncio.sleep(jitter)
            # Record the time of this request
            self.last_request_time = time.time()
            # Issue the request
            try:
                async with aiohttp.ClientSession(headers=self.headers) as session:
                    async with session.get(url, params=params) as response:
                        if response.status == 429:  # Too Many Requests
                            retry_after = int(response.headers.get('Retry-After', 5))
                            print(f"Rate limit reached, retrying in {retry_after} seconds...")
                            await asyncio.sleep(retry_after)
                            return await self._make_request(url, params)
                        if response.status != 200:
                            print(f"API request failed: HTTP {response.status}")
                            print(f"Response body: {await response.text()}")
                            return None
                        return await response.json()
            except aiohttp.ClientError as e:
                print(f"Request error: {str(e)}")
                return None
    async def search(self, query: str, limit: int = 10, namespace: int = 0) -> List[Dict]:
        """
        Search Wikipedia articles.

        Args:
            query: Search keywords
            limit: Number of results to return
            namespace: Namespace (0 for articles, 14 for categories, etc.)

        Returns:
            List of search results
        """
        params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            "format": "json",
            "srlimit": limit,
            "srnamespace": namespace,
            "srprop": "snippet|titlesnippet|sectiontitle|categorysnippet|size|wordcount|timestamp|redirecttitle"
        }
        data = await self._make_request(self.base_url, params)
        if not data:
            return []
        search_results = data.get("query", {}).get("search", [])
        return search_results
    async def get_page_content(self, title: str, section: Optional[int] = None) -> Dict:
        """
        Fetch the content of a Wikipedia page.

        Args:
            title: Page title
            section: Specific section number (optional)

        Returns:
            Page content dictionary
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            params = {
                "action": "parse",
                "page": title,
                "format": "json",
                "prop": "text|langlinks|categories|links|templates|images|externallinks|sections|revid|displaytitle|iwlinks|properties"
            }
            # When a section is specified, fetch only that section
            if section is not None:
                params["section"] = section
            async with session.get(self.base_url, params=params) as response:
                if response.status != 200:
                    print(f"API request failed: HTTP {response.status}")
                    return {}
                data = await response.json()
                if "error" in data:
                    print(f"API error: {data['error'].get('info', 'unknown error')}")
                    return {}
                return data.get("parse", {})
    async def get_summary(self, title: str, sentences: int = 3) -> str:
        """
        Fetch a page summary.

        Args:
            title: Page title
            sentences: Number of sentences to return

        Returns:
            Page summary text
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            params = {
                "action": "query",
                "prop": "extracts",
                "exintro": "1",
                "exsentences": sentences,
                "explaintext": "1",
                "titles": title,
                "format": "json"
            }
            async with session.get(self.base_url, params=params) as response:
                if response.status != 200:
                    print(f"API request failed: HTTP {response.status}")
                    return ""
                data = await response.json()
                pages = data.get("query", {}).get("pages", {})
                # Return the extract of the first page ID
                for page_id in pages:
                    return pages[page_id].get("extract", "")
                return ""
    async def get_random_articles(self, count: int = 1, namespace: int = 0) -> List[Dict]:
        """
        Fetch random articles.

        Args:
            count: Number of random articles to fetch
            namespace: Namespace

        Returns:
            List of random articles
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            params = {
                "action": "query",
                "list": "random",
                "rnlimit": count,
                "rnnamespace": namespace,
                "format": "json"
            }
            async with session.get(self.base_url, params=params) as response:
                if response.status != 200:
                    print(f"API request failed: HTTP {response.status}")
                    return []
                data = await response.json()
                return data.get("query", {}).get("random", [])
    async def login(self, username: str, password: str) -> bool:
        """
        Log in with a Wikipedia account.

        Args:
            username: Wikipedia username
            password: Wikipedia password

        Returns:
            Whether the login succeeded
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            # Fetch a login token
            params = {
                "action": "query",
                "meta": "tokens",
                "type": "login",
                "format": "json"
            }
            async with session.get(self.base_url, params=params) as response:
                if response.status != 200:
                    print(f"Failed to fetch login token: HTTP {response.status}")
                    return False
                data = await response.json()
                login_token = data.get("query", {}).get("tokens", {}).get("logintoken")
                if not login_token:
                    print("Failed to fetch login token")
                    return False
            # Log in with the token
            login_params = {
                "action": "login",
                "lgname": username,
                "lgpassword": password,
                "lgtoken": login_token,
                "format": "json"
            }
            async with session.post(self.base_url, data=login_params) as login_response:
                login_data = await login_response.json()
                if login_data.get("login", {}).get("result") == "Success":
                    print(f"Logged in successfully: {username}")
                    return True
                else:
                    print(f"Login failed: {login_data.get('login', {}).get('reason', 'unknown reason')}")
                    return False
    async def setup_oauth(self, consumer_token: str, consumer_secret: str,
                          access_token: str = None, access_secret: str = None) -> bool:
        """
        Configure OAuth authentication.

        Args:
            consumer_token: Consumer token
            consumer_secret: Consumer secret
            access_token: Access token (optional)
            access_secret: Access secret (optional)

        Returns:
            Whether the setup succeeded
        """
        try:
            # Requires the mwoauth library: pip install mwoauth
            import mwoauth
            import requests_oauthlib
            # Configure OAuth
            self.consumer_token = consumer_token
            self.consumer_secret = consumer_secret
            if access_token and access_secret:
                # An access token is already available
                self.auth = requests_oauthlib.OAuth1(
                    consumer_token,
                    consumer_secret,
                    access_token,
                    access_secret
                )
                print("OAuth configured successfully")
                return True
            else:
                # An access token must still be obtained (this normally requires the user to authorize in a browser)
                print("Please complete the following OAuth authorization flow in a development environment:")
                # Create the consumer
                consumer = mwoauth.Consumer(
                    consumer_token, consumer_secret
                )
                # Initiate the handshake
                redirect, request_token = mwoauth.initiate(
                    f"https://{self.language}.wikipedia.org/w/index.php",
                    consumer
                )
                print(f"Visit this URL to authorize the application: {redirect}")
                # Normally the user is prompted to visit the URL and enter an authorization code;
                # a real application needs to implement the full authorization flow.
                return False
        except ImportError:
            print("Please install the mwoauth library: pip install mwoauth")
            return False
        except Exception as e:
            print(f"Error while configuring OAuth: {str(e)}")
            return False

async def example_usage():
    """Demonstrate how to use WikipediaAPI."""
    # Create a client for the Chinese Wikipedia (the default)
    wiki_zh = WikipediaAPI(language="zh")
    try:
        # Example 1: basic search
        print("\n=== Example 1: search Wikipedia ===")
        results = await wiki_zh.search("人工智能", limit=3)
        for i, result in enumerate(results, 1):
            print(f"\n--- Result {i} ---")
            print(f"Title: {result.get('title')}")
            snippet = result.get('snippet', '')
            # Strip HTML tags
            snippet = re.sub(r'<.*?>', '', snippet)
            print(f"Snippet: {snippet}")
            print(f"Word count: {result.get('wordcount')}")
            print(f"Size: {result.get('size')} bytes")

        # Example 2: fetch a page summary
        print("\n=== Example 2: fetch a page summary ===")
        summary = await wiki_zh.get_summary("深度学习", sentences=2)
        print(f"Summary of 深度学习: {summary}")

        # Example 3: fetch page content
        print("\n=== Example 3: fetch page content ===")
        content = await wiki_zh.get_page_content("机器学习")
        if content and "text" in content:
            text = content["text"].get("*", "")
            # Strip HTML tags for console display
            clean_text = re.sub(r'<.*?>', '', text)
            print(f"Excerpt of the 机器学习 page: {clean_text[:200]}...")
            # Number of categories on the page
            categories = content.get("categories", [])
            print(f"Number of categories: {len(categories)}")
            # Number of links on the page
            links = content.get("links", [])
            print(f"Number of links: {len(links)}")

        # Example 4: fetch a specific section
        print("\n=== Example 4: fetch a specific section ===")
        # Fetch the lead section (usually section 0)
        intro_content = await wiki_zh.get_page_content("人工智能", section=0)
        if intro_content and "text" in intro_content:
            intro_text = intro_content["text"].get("*", "")
            clean_intro = re.sub(r'<.*?>', '', intro_text)
            print(f"Lead section excerpt of 人工智能: {clean_intro[:200]}...")

        # Example 5: fetch random articles
        print("\n=== Example 5: fetch random articles ===")
        random_articles = await wiki_zh.get_random_articles(count=2)
        print("Random articles:")
        for i, article in enumerate(random_articles, 1):
            print(f"{i}. {article.get('title')}")
            # Print a short summary of each random article
            article_summary = await wiki_zh.get_summary(article.get('title'), sentences=1)
            print(f"   Summary: {article_summary[:100]}...")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        import traceback
        print(traceback.format_exc())

if __name__ == "__main__":
    # Run the example
    asyncio.run(example_usage())