Master 4.0 (#2210)

* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
这个提交包含在:
binary-husky
2025-08-23 15:59:22 +08:00
提交者 GitHub
父节点 65a4cf59c2
当前提交 8042750d41
共有 79 个文件被更改,包括 20850 次插入和 57 次删除

查看文件

@@ -0,0 +1,46 @@
import aiohttp
from typing import List, Dict, Optional
from datetime import datetime
from .base_source import DataSource, PaperMetadata
class UnpaywallSource(DataSource):
    """Unpaywall API implementation of a paper data source."""

    def _initialize(self) -> None:
        """Set the API base URL and the email sent with every request.

        NOTE(review): ``self.api_key`` is expected to be provided by the
        base class before this hook runs -- it is not set in this class.
        """
        self.base_url = "https://api.unpaywall.org/v2"
        self.email = self.api_key  # Unpaywall uses an email address as its API key

    async def search(self, query: str, limit: int = 100) -> List[PaperMetadata]:
        """Query the ``/search`` endpoint and return the parsed results.

        Args:
            query: Search text forwarded as the ``query`` parameter.
            limit: Maximum number of papers to return. It is sent to the
                server and also enforced on the parsed list, in case the
                server returns more entries than requested.

        Returns:
            One ``PaperMetadata`` per result that carries a ``response``
            record; an empty list when the payload has no ``results``.
        """
        request_params = {
            "query": query,
            "email": self.email,
            "limit": limit,
        }
        async with aiohttp.ClientSession() as session:
            async with session.get(
                f"{self.base_url}/search", params=request_params
            ) as response:
                data = await response.json()

        papers = []
        for item in data.get("results") or []:
            # Each result is a plain dict decoded from JSON; the paper record
            # lives under its "response" key (attribute access would raise).
            record = item.get("response") if isinstance(item, dict) else None
            if record:
                papers.append(self._parse_response(record))

        if limit is not None and limit >= 0:
            papers = papers[:limit]
        return papers

    def _parse_response(self, data: Dict) -> PaperMetadata:
        """Convert one Unpaywall record dict into a ``PaperMetadata``.

        Missing or ``null`` author and affiliation lists are treated as
        empty, and author names are built only from the name parts that
        are actually present.
        """
        # ``or []`` also covers keys that exist with a JSON null value,
        # which ``dict.get(key, [])`` would pass through as None.
        author_entries = [
            entry for entry in (data.get("z_authors") or [])
            if isinstance(entry, dict)
        ]

        authors = []
        institutions = []
        for entry in author_entries:
            name_parts = (entry.get("given"), entry.get("family"))
            authors.append(" ".join(str(part) for part in name_parts if part))
            for aff in entry.get("affiliation") or []:
                if isinstance(aff, dict):
                    institutions.append(aff.get("name") or "")

        return PaperMetadata(
            title=data.get("title") or "",
            authors=authors,
            institutions=institutions,
            abstract="",  # Unpaywall does not provide abstracts
            year=data.get("year"),
            doi=data.get("doi"),
            url=data.get("doi_url"),
            citations=None,  # Unpaywall does not provide citation counts
            venue=data.get("journal_name"),
        )