文件
binary-husky 8042750d41 Master 4.0 (#2210)
* stage academic conversation

* stage document conversation

* fix buggy gradio version

* file dynamic load

* merge more academic plugins

* accelerate nltk

* feat: 为predict函数添加文件和URL读取功能
- 添加URL检测和网页内容提取功能,支持自动提取网页文本
- 添加文件路径识别和文件内容读取功能,支持private_upload路径格式
- 集成WebTextExtractor处理网页内容提取
- 集成TextContentLoader处理本地文件读取
- 支持文件路径与问题组合的智能处理

* back

* block unstable

---------

Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
2025-08-23 15:59:22 +08:00

109 行
4.4 KiB
Python

import re
import requests
from loguru import logger
from typing import List, Dict
from urllib3.util import Retry
from requests.adapters import HTTPAdapter
from textwrap import dedent
from request_llms.bridge_all import predict_no_ui_long_connection
class BGELLMRanker:
"""使用LLM进行论文相关性判断的类"""
def __init__(self, llm_kwargs):
self.llm_kwargs = llm_kwargs
def is_paper_relevant(self, query: str, paper_text: str) -> bool:
"""判断论文是否与查询相关"""
prompt = dedent(f"""
Evaluate if this academic paper contains information that directly addresses the user's query.
Query: {query}
Paper Content:
{paper_text}
Evaluation Criteria:
1. The paper must contain core information that directly answers the query
2. The paper's main research focus must be highly relevant to the query
3. Papers that only mention query-related content in abstract should be excluded
4. Papers with superficial or general discussions should be excluded
5. For queries about "recent" or "latest" advances, paper should be from last 3 years
Instructions:
- Carefully evaluate against ALL criteria above
- Return true ONLY if paper meets ALL criteria
- If any criteria is not met or unclear, return false
- Be strict but not overly restrictive
Output Rules:
- Must ONLY respond with <decision>true</decision> or <decision>false</decision>
- true = paper contains relevant information to answer the query
- false = paper does not contain sufficient relevant information
Do not include any explanation or additional text."""
)
response = predict_no_ui_long_connection(
inputs=prompt,
history=[],
llm_kwargs=self.llm_kwargs,
sys_prompt="You are an expert at determining paper relevance to queries. Respond only with <decision>true</decision> or <decision>false</decision>."
)
# 提取decision标签中的内容
match = re.search(r'<decision>(.*?)</decision>', response, re.IGNORECASE)
if match:
decision = match.group(1).lower()
return decision == "true"
else:
return False
def batch_check_relevance(self, query: str, paper_texts: List[str], show_progress: bool = True) -> List[bool]:
"""批量检查论文相关性
Args:
query: 用户查询
paper_texts: 论文文本列表
show_progress: 是否显示进度条
Returns:
List[bool]: 相关性判断结果列表
"""
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
results = [False] * len(paper_texts)
# 减少并发线程数以避免连接池耗尽
max_workers = min(20, len(paper_texts)) # 限制最大线程数
with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_idx = {
executor.submit(self.is_paper_relevant, query, text): i
for i, text in enumerate(paper_texts)
}
iterator = as_completed(future_to_idx)
if show_progress:
iterator = tqdm(iterator, total=len(paper_texts), desc="判断论文相关性")
for future in iterator:
idx = future_to_idx[future]
try:
results[idx] = future.result()
except Exception as e:
logger.exception(f"处理论文 {idx} 时出错: {str(e)}")
results[idx] = False
return results
def main():
# 测试代码
ranker = BGELLMRanker()
query = "Recent advances in transformer models"
paper_text = """
Title: Attention Is All You Need
Abstract: The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely...
"""
is_relevant = ranker.is_paper_relevant(query, paper_text)
print(f"Paper relevant: {is_relevant}")
if __name__ == "__main__":
main()