Master 4.0 (#2210)

* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
这个提交包含在:
binary-husky
2025-08-23 15:59:22 +08:00
提交者 GitHub
父节点 65a4cf59c2
当前提交 8042750d41
共有 79 个文件被更改,包括 20850 次插入和 57 次删除

查看文件

@@ -0,0 +1,142 @@
import json
import os
from typing import Dict, Optional
class JournalMetrics:
    """Look up journal metrics by ISSN or by journal name.

    On construction the class reads ``cas_if.json`` from the directory that
    contains this module and builds in-memory lookup tables. Each table maps
    a key to a metrics dict of the form::

        {'if_factor': float | None, 'jcr_division': ..., 'cas_division': ...}

    where the three values come from the ``IF``, ``Q`` and ``B`` fields of a
    record in the JSON file.
    """

    # Leading / trailing words dropped when normalizing a journal name.
    # They are tried in this order, and every one that matches is removed.
    _NAME_PREFIXES = ('the ', 'proceedings of ', 'journal of ')
    _NAME_SUFFIXES = (' journal', ' proceedings', ' magazine', ' review', ' letters')

    def __init__(self):
        # Normalized journal name / abbreviation -> metrics.
        self.journal_data: Dict = {}
        # ISSN or eISSN (whitespace-stripped, upper-cased) -> metrics.
        self.issn_map: Dict = {}
        # Normalized journal name / abbreviation -> metrics; filled with the
        # same entries as ``journal_data``.
        self.name_map: Dict = {}
        self._load_journal_data()

    def _normalize_journal_name(self, name: str) -> str:
        """Return a canonical form of a journal name for dictionary lookup.

        The name is lower-cased, known prefixes and suffixes are removed,
        every character that is neither alphanumeric nor whitespace is
        dropped, and runs of whitespace are collapsed to a single space.

        Args:
            name: Raw journal name or abbreviation.

        Returns:
            The normalized name, or ``""`` when ``name`` is empty or None.
        """
        if not name:
            return ""
        name = name.lower()
        for prefix in self._NAME_PREFIXES:
            if name.startswith(prefix):
                name = name[len(prefix):]
        for suffix in self._NAME_SUFFIXES:
            if name.endswith(suffix):
                name = name[:-len(suffix)]
        # Keep letters, digits and whitespace only.
        name = ''.join(c for c in name if c.isalnum() or c.isspace())
        # Collapse repeated whitespace and trim both ends.
        return ' '.join(name.split())

    @staticmethod
    def _normalize_issn(issn):
        """Return ``issn`` stripped and upper-cased so 'x'/'X' check digits match.

        Non-string values are returned unchanged.
        """
        if isinstance(issn, str):
            return issn.strip().upper()
        return issn

    def _convert_if_value(self, if_str) -> Optional[float]:
        """Convert a raw impact-factor value to ``float``.

        Args:
            if_str: Value of the ``IF`` field. It may be a string such as
                ``"3.2"`` or ``"<0.1"``, or a JSON number.

        Returns:
            The numeric value; for ``"<0.1"``-style strings the bound itself
            (``0.1``). ``None`` when the value is missing or not parseable.
        """
        # bool is a subclass of int; it is never a meaningful impact factor.
        if isinstance(if_str, bool):
            return None
        # JSON numbers arrive as int/float and have no str methods.
        if isinstance(if_str, (int, float)):
            return float(if_str)
        if not isinstance(if_str, str):
            return None
        text = if_str.strip()
        try:
            if text.startswith('<'):
                return float(text.strip('<'))
            return float(text)
        except ValueError:
            return None

    def _index_journal(self, journal) -> None:
        """Add one JSON record to the name and ISSN lookup tables."""
        if not isinstance(journal, dict):
            # Skip a malformed record instead of aborting the whole load.
            return
        metrics = {
            'if_factor': self._convert_if_value(journal.get('IF')),
            'jcr_division': journal.get('Q'),
            'cas_division': journal.get('B')
        }
        # Full title first, then abbreviation; both share one metrics dict.
        for name_field in ('journal', 'jabb'):
            raw_name = journal.get(name_field)
            if not raw_name:
                continue
            key = self._normalize_journal_name(raw_name)
            if key:  # never index under the empty string
                self.journal_data[key] = metrics
                self.name_map[key] = metrics
        for issn_field in ('issn', 'eissn'):
            raw_issn = journal.get(issn_field)
            if raw_issn:
                self.issn_map[self._normalize_issn(raw_issn)] = metrics

    def _load_journal_data(self):
        """Load ``cas_if.json`` (next to this module) into the lookup tables.

        Loading is best-effort: on any error a message is printed and all
        three tables are reset to empty, so later lookups return ``{}``.
        """
        try:
            file_path = os.path.join(os.path.dirname(__file__), 'cas_if.json')
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            for journal in data:
                self._index_journal(journal)
        except Exception as e:
            print(f"加载期刊数据时出错: {str(e)}")
            self.journal_data = {}
            self.issn_map = {}
            self.name_map = {}

    def get_journal_metrics(self, venue_name: str, venue_info: dict) -> dict:
        """Return the metrics for a venue, or ``{}`` when nothing matches.

        Args:
            venue_name: Journal name; used only if the ISSN lookup finds
                nothing.
            venue_info: Optional venue details. If it contains an ``'issn'``
                key, the value may be a single ISSN or a list/tuple of ISSNs.

        Returns:
            The metrics dict held in the lookup table (the stored object,
            not a copy), or an empty dict. Any exception raised during the
            lookup is printed and also results in ``{}``.
        """
        try:
            metrics = {}
            # 1. ISSN match; the first ISSN with an entry wins.
            if venue_info and 'issn' in venue_info:
                issn_value = venue_info['issn']
                if isinstance(issn_value, (list, tuple)):
                    candidates = issn_value
                else:
                    candidates = [issn_value]
                for issn in candidates:
                    metrics = self.issn_map.get(self._normalize_issn(issn), {})
                    if metrics:
                        break
            # 2. Fall back to an exact match on the normalized name.
            #    Substring / fuzzy matching is not performed.
            if not metrics and venue_name:
                normalized_name = self._normalize_journal_name(venue_name)
                if normalized_name:
                    metrics = self.name_map.get(normalized_name, {})
            return metrics
        except Exception as e:
            print(f"获取期刊指标时出错: {str(e)}")
            return {}