镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-06 14:36:48 +00:00
up
这个提交包含在:
@@ -39,7 +39,7 @@ AVAIL_LLM_MODELS = ["gpt-4-1106-preview", "gpt-4-turbo-preview", "gpt-4-vision-p
|
||||
"gemini-1.5-pro", "chatglm3"
|
||||
]
|
||||
|
||||
EMBEDDING_MODEL = "text-embedding-3-small"
|
||||
EMBEDDING_MODEL = "netease-youdao/bce-embedding-base_v1"
|
||||
|
||||
# --- --- --- ---
|
||||
# P.S. 其他可用的模型还包括
|
||||
|
||||
@@ -22,7 +22,7 @@ MAX_HISTORY_ROUND = 5 # 最大历史对话轮数
|
||||
MAX_CONTEXT_TOKEN_LIMIT = 4096 # 上下文最大token数
|
||||
REMEMBER_PREVIEW = 1000 # 记忆预览长度
|
||||
VECTOR_STORE_TYPE = "Simple" # 向量存储类型:Simple或Milvus
|
||||
MAX_CONCURRENT_PAPERS = 5 # 最大并行处理论文数
|
||||
MAX_CONCURRENT_PAPERS = 20 # 最大并行处理论文数
|
||||
MAX_WORKERS = 3 # 最大工作线程数
|
||||
|
||||
# 配置日志
|
||||
|
||||
@@ -269,7 +269,8 @@ if __name__ == "__main__":
|
||||
"test_cache/2411.03663/3_method.tex",
|
||||
"test_cache/2411.03663/4_experiment.tex",
|
||||
"test_cache/2411.03663/5_related_work.tex",
|
||||
"test_cache/2411.03663/6_conclu.tex"
|
||||
"test_cache/2411.03663/6_conclu.tex",
|
||||
"test_cache/2411.03663/reference.bib"
|
||||
]
|
||||
for file_path in file_path_list:
|
||||
tex_content = read_tex_file(file_path)
|
||||
|
||||
@@ -1,314 +0,0 @@
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Dict, Any, Tuple
|
||||
import re
|
||||
from loguru import logger
|
||||
import tiktoken
|
||||
from abc import ABC, abstractmethod
|
||||
import numpy as np
|
||||
|
||||
|
||||
@dataclass
|
||||
class ChunkingConfig:
|
||||
"""文档分块配置
|
||||
|
||||
Attributes:
|
||||
chunk_size: 每个块的目标大小(tokens)
|
||||
chunk_overlap: 相邻块之间的重叠大小(tokens)
|
||||
model_name: 使用的tokenizer模型名称
|
||||
min_chunk_chars: 最小块大小(字符)
|
||||
max_chunk_chars: 最大块大小(字符)
|
||||
"""
|
||||
chunk_size: int = 1000
|
||||
chunk_overlap: int = 200
|
||||
model_name: str = "gpt-3.5-turbo"
|
||||
min_chunk_chars: int = 100 # 最小块字符数
|
||||
max_chunk_chars: int = 2000 # 最大块字符数
|
||||
chunk_size_buffer: int = 50 # chunk大小的容差范围
|
||||
|
||||
def __post_init__(self):
|
||||
"""验证配置参数"""
|
||||
if self.chunk_overlap >= self.chunk_size:
|
||||
raise ValueError("chunk_overlap must be smaller than chunk_size")
|
||||
if self.min_chunk_chars >= self.max_chunk_chars:
|
||||
raise ValueError("min_chunk_chars must be smaller than max_chunk_chars")
|
||||
|
||||
|
||||
class TextChunker:
|
||||
"""基于tiktoken的文本分块器"""
|
||||
|
||||
def __init__(self, config: ChunkingConfig = None):
|
||||
"""初始化分块器
|
||||
|
||||
Args:
|
||||
config: 分块配置,如果不提供则使用默认配置
|
||||
"""
|
||||
self.config = config or ChunkingConfig()
|
||||
self._init_tokenizer()
|
||||
|
||||
def _init_tokenizer(self):
|
||||
"""初始化tokenizer"""
|
||||
try:
|
||||
self.tokenizer = tiktoken.encoding_for_model(self.config.model_name)
|
||||
logger.info(f"Using tokenizer for model: {self.config.model_name}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Fallback to cl100k_base tokenizer: {e}")
|
||||
self.tokenizer = tiktoken.get_encoding("cl100k_base")
|
||||
|
||||
def get_token_count(self, text: str) -> int:
|
||||
"""计算文本的token数量
|
||||
|
||||
Args:
|
||||
text: 输入文本
|
||||
|
||||
Returns:
|
||||
token数量
|
||||
"""
|
||||
if not text.strip():
|
||||
return 0
|
||||
return len(self.tokenizer.encode(text))
|
||||
|
||||
def split_text(self, text: str) -> List[str]:
|
||||
"""将文本分割为chunks
|
||||
|
||||
Args:
|
||||
text: 输入文本
|
||||
|
||||
Returns:
|
||||
文本块列表
|
||||
"""
|
||||
if not text.strip():
|
||||
return []
|
||||
|
||||
try:
|
||||
# 首先按段落分割
|
||||
paragraphs = self._split_into_paragraphs(text)
|
||||
|
||||
# 处理每个段落
|
||||
chunks = []
|
||||
current_chunk = []
|
||||
current_length = 0
|
||||
|
||||
for para in paragraphs:
|
||||
para = para.strip()
|
||||
if not para:
|
||||
continue
|
||||
|
||||
# 计算段落token数
|
||||
para_length = self.get_token_count(para)
|
||||
|
||||
# 如果段落太长,需要进一步分割
|
||||
if para_length > self.config.chunk_size:
|
||||
# 先处理当前累积的chunk
|
||||
if current_chunk:
|
||||
chunks.append(self._join_text_with_newlines(current_chunk))
|
||||
current_chunk = []
|
||||
current_length = 0
|
||||
# 分割长段落
|
||||
para_chunks = self._split_long_paragraph(para)
|
||||
chunks.extend(para_chunks)
|
||||
continue
|
||||
|
||||
# 检查是否需要开始新的chunk
|
||||
if (current_length + para_length > self.config.chunk_size and
|
||||
current_chunk):
|
||||
chunks.append(self._join_text_with_newlines(current_chunk))
|
||||
|
||||
# 保持重叠
|
||||
if len(current_chunk) > 1:
|
||||
# 保留最后一段作为重叠
|
||||
current_chunk = [current_chunk[-1]]
|
||||
current_length = self.get_token_count(current_chunk[-1])
|
||||
else:
|
||||
current_chunk = []
|
||||
current_length = 0
|
||||
|
||||
current_chunk.append(para)
|
||||
current_length += para_length
|
||||
|
||||
# 处理最后一个chunk
|
||||
if current_chunk:
|
||||
chunks.append(self._join_text_with_newlines(current_chunk))
|
||||
|
||||
# 后处理确保chunk大小合适
|
||||
return self._post_process_chunks(chunks)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in splitting text: {e}")
|
||||
# 如果发生错误,返回原文本作为单个chunk
|
||||
return [text] if text.strip() else []
|
||||
|
||||
def _split_into_paragraphs(self, text: str) -> List[str]:
|
||||
"""按段落分割文本
|
||||
|
||||
Args:
|
||||
text: 输入文本
|
||||
|
||||
Returns:
|
||||
段落列表
|
||||
"""
|
||||
# 处理多种换行符
|
||||
text = text.replace('\r\n', '\n').replace('\r', '\n')
|
||||
|
||||
# 分割成段落
|
||||
paragraphs = [p.strip() for p in text.split('\n')]
|
||||
|
||||
# 移除空段落
|
||||
return [p for p in paragraphs if p]
|
||||
|
||||
def _split_long_paragraph(self, paragraph: str) -> List[str]:
|
||||
"""分割过长的段落
|
||||
|
||||
Args:
|
||||
paragraph: 需要分割的段落
|
||||
|
||||
Returns:
|
||||
分割后的文本块列表
|
||||
"""
|
||||
# 首先尝试按句子分割
|
||||
sentences = self._split_into_sentences(paragraph)
|
||||
chunks = []
|
||||
current_chunk = []
|
||||
current_length = 0
|
||||
|
||||
for sentence in sentences:
|
||||
sentence_length = self.get_token_count(sentence)
|
||||
|
||||
# 如果单个句子超过chunk大小
|
||||
if sentence_length > self.config.chunk_size:
|
||||
# 处理当前累积的chunk
|
||||
if current_chunk:
|
||||
chunks.append(" ".join(current_chunk))
|
||||
current_chunk = []
|
||||
current_length = 0
|
||||
|
||||
# 对长句子进行硬分割
|
||||
chunks.extend(self._split_by_tokens(sentence))
|
||||
continue
|
||||
|
||||
# 检查是否需要开始新的chunk
|
||||
if current_length + sentence_length > self.config.chunk_size and current_chunk:
|
||||
chunks.append(" ".join(current_chunk))
|
||||
current_chunk = []
|
||||
current_length = 0
|
||||
|
||||
current_chunk.append(sentence)
|
||||
current_length += sentence_length
|
||||
|
||||
# 处理最后一个chunk
|
||||
if current_chunk:
|
||||
chunks.append(" ".join(current_chunk))
|
||||
|
||||
return chunks
|
||||
|
||||
def _split_into_sentences(self, text: str) -> List[str]:
|
||||
"""将文本分割为句子
|
||||
|
||||
Args:
|
||||
text: 输入文本
|
||||
|
||||
Returns:
|
||||
句子列表
|
||||
"""
|
||||
# 句子分隔符模式
|
||||
pattern = r'(?<=[。!?.!?])\s+'
|
||||
|
||||
# 分割文本
|
||||
sentences = re.split(pattern, text)
|
||||
|
||||
# 确保每个句子都以句号结尾
|
||||
sentences = [s + '。' if not s.strip().endswith(('。', '!', '?', '.', '!', '?'))
|
||||
else s for s in sentences]
|
||||
|
||||
return [s.strip() for s in sentences if s.strip()]
|
||||
|
||||
def _split_by_tokens(self, text: str) -> List[str]:
|
||||
"""按token数量硬分割文本
|
||||
|
||||
Args:
|
||||
text: 输入文本
|
||||
|
||||
Returns:
|
||||
分割后的文本块列表
|
||||
"""
|
||||
tokens = self.tokenizer.encode(text)
|
||||
chunks = []
|
||||
|
||||
for i in range(0, len(tokens), self.config.chunk_size):
|
||||
chunk_tokens = tokens[i:i + self.config.chunk_size]
|
||||
chunk_text = self.tokenizer.decode(chunk_tokens).strip()
|
||||
if chunk_text:
|
||||
chunks.append(chunk_text)
|
||||
|
||||
return chunks
|
||||
|
||||
def _join_text_with_newlines(self, text_list: List[str]) -> str:
|
||||
"""用换行符连接文本列表
|
||||
|
||||
Args:
|
||||
text_list: 文本列表
|
||||
|
||||
Returns:
|
||||
连接后的文本
|
||||
"""
|
||||
return '\n'.join(text_list)
|
||||
|
||||
def _post_process_chunks(self, chunks: List[str]) -> List[str]:
|
||||
"""对分割后的chunks进行后处理
|
||||
|
||||
Args:
|
||||
chunks: 初始chunks列表
|
||||
|
||||
Returns:
|
||||
处理后的chunks列表
|
||||
"""
|
||||
processed_chunks = []
|
||||
|
||||
for chunk in chunks:
|
||||
# 移除空白chunk
|
||||
if not chunk.strip():
|
||||
continue
|
||||
|
||||
# 检查chunk大小
|
||||
chunk_length = len(chunk)
|
||||
if chunk_length < self.config.min_chunk_chars:
|
||||
logger.debug(f"Chunk too small ({chunk_length} chars), skipping")
|
||||
continue
|
||||
|
||||
if chunk_length > self.config.max_chunk_chars:
|
||||
logger.debug(f"Chunk too large ({chunk_length} chars), splitting")
|
||||
sub_chunks = self._split_by_tokens(chunk)
|
||||
processed_chunks.extend(sub_chunks)
|
||||
else:
|
||||
processed_chunks.append(chunk)
|
||||
|
||||
return processed_chunks
|
||||
|
||||
|
||||
def chunk_document(
|
||||
text: str,
|
||||
chunk_size: int = 1000,
|
||||
chunk_overlap: int = 200,
|
||||
model_name: str = "gpt-3.5-turbo",
|
||||
**kwargs
|
||||
) -> List[str]:
|
||||
"""文档分块的便捷函数
|
||||
|
||||
Args:
|
||||
text: 输入文本
|
||||
chunk_size: 每个块的目标大小(tokens)
|
||||
chunk_overlap: 相邻块之间的重叠大小(tokens)
|
||||
model_name: 使用的tokenizer模型名称
|
||||
**kwargs: 其他分块配置参数
|
||||
|
||||
Returns:
|
||||
分块后的文本列表
|
||||
"""
|
||||
config = ChunkingConfig(
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
model_name=model_name,
|
||||
**{k: v for k, v in kwargs.items() if hasattr(ChunkingConfig, k)}
|
||||
)
|
||||
|
||||
chunker = TextChunker(config)
|
||||
return chunker.split_text(text)
|
||||
@@ -1,268 +0,0 @@
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
from collections import defaultdict
|
||||
import re
|
||||
from enum import Enum, auto
|
||||
|
||||
|
||||
class ExtractionPhase(Enum):
|
||||
"""Extraction process phases"""
|
||||
INITIAL = auto()
|
||||
GLEANING = auto()
|
||||
VERIFY = auto()
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExtractionState:
|
||||
"""State of the extraction process"""
|
||||
chunks: Dict[str, dict]
|
||||
current_chunk_index: int = 0
|
||||
phase: ExtractionPhase = ExtractionPhase.INITIAL
|
||||
gleaning_count: int = 0
|
||||
extracted_nodes: Dict[str, List[dict]] = field(default_factory=lambda: defaultdict(list))
|
||||
extracted_edges: Dict[Tuple[str, str], List[dict]] = field(default_factory=lambda: defaultdict(list))
|
||||
|
||||
|
||||
@dataclass
|
||||
class PromptInfo:
|
||||
"""Information about a prompt to be sent to LLM"""
|
||||
prompt: str
|
||||
prompt_type: str
|
||||
chunk_key: str
|
||||
history: List[Tuple[str, str]] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
|
||||
class EntityRelationExtractor:
|
||||
"""
|
||||
Extract entities and their relationships from text chunks.
|
||||
|
||||
This class implements a stateful extraction process that can:
|
||||
1. Extract entities and their relationships from text chunks
|
||||
2. Support multiple rounds of extraction (gleaning)
|
||||
3. External LLM calling for flexibility
|
||||
4. Maintain extraction history and state
|
||||
|
||||
Attributes:
|
||||
prompt_templates: PromptTemplates instance containing required prompts
|
||||
entity_extract_max_gleaning: Maximum number of gleaning iterations
|
||||
required_prompts: Set of required prompt templates
|
||||
"""
|
||||
prompt_templates: 'PromptTemplates'
|
||||
entity_extract_max_gleaning: int = 1
|
||||
required_prompts: Set[str] = field(default_factory=lambda: {
|
||||
'entity_extraction',
|
||||
'entiti_continue_extraction',
|
||||
'entiti_if_loop_extraction'
|
||||
})
|
||||
|
||||
def __post_init__(self):
|
||||
"""Validate prompt templates and initialize state"""
|
||||
self._validate_prompts()
|
||||
self._state: Optional[ExtractionState] = None
|
||||
|
||||
def _validate_prompts(self) -> None:
|
||||
"""Validate all required prompts exist in templates"""
|
||||
missing_prompts = self.required_prompts - set(dir(self.prompt_templates))
|
||||
if missing_prompts:
|
||||
raise ValueError(f"Missing required prompt templates: {missing_prompts}")
|
||||
|
||||
def initialize_extraction(self, chunks: Dict[str, dict]) -> List[PromptInfo]:
|
||||
"""
|
||||
Initialize new extraction process for given chunks.
|
||||
|
||||
Args:
|
||||
chunks: Dictionary of text chunks to process
|
||||
|
||||
Returns:
|
||||
List of prompts to be sent to LLM
|
||||
"""
|
||||
self._state = ExtractionState(chunks=chunks)
|
||||
return self._get_next_prompts()
|
||||
|
||||
def _get_next_prompts(self) -> List[PromptInfo]:
|
||||
"""Generate next batch of prompts based on current state"""
|
||||
if not self._state or self._state.current_chunk_index >= len(self._state.chunks):
|
||||
return []
|
||||
|
||||
chunk_items = list(self._state.chunks.items())
|
||||
chunk_key, chunk_data = chunk_items[self._state.current_chunk_index]
|
||||
|
||||
if self._state.phase == ExtractionPhase.INITIAL:
|
||||
return [self._create_initial_prompt(chunk_key, chunk_data)]
|
||||
elif self._state.phase == ExtractionPhase.GLEANING:
|
||||
return [self._create_gleaning_prompt(chunk_key)]
|
||||
elif self._state.phase == ExtractionPhase.VERIFY:
|
||||
return [self._create_verify_prompt(chunk_key)]
|
||||
return []
|
||||
|
||||
def _create_initial_prompt(self, chunk_key: str, chunk_data: dict) -> PromptInfo:
|
||||
"""Create initial extraction prompt for a chunk"""
|
||||
prompt = self.prompt_templates.format_entity_extraction(
|
||||
text=chunk_data['content'],
|
||||
entity_types=self.prompt_templates.default_entity_types
|
||||
)
|
||||
return PromptInfo(
|
||||
prompt=prompt,
|
||||
prompt_type='initial_extraction',
|
||||
chunk_key=chunk_key
|
||||
)
|
||||
|
||||
def _create_gleaning_prompt(self, chunk_key: str) -> PromptInfo:
|
||||
"""Create gleaning prompt for additional extraction"""
|
||||
return PromptInfo(
|
||||
prompt=self.prompt_templates.entiti_continue_extraction,
|
||||
prompt_type='continue_extraction',
|
||||
chunk_key=chunk_key
|
||||
)
|
||||
|
||||
def _create_verify_prompt(self, chunk_key: str) -> PromptInfo:
|
||||
"""Create verification prompt"""
|
||||
return PromptInfo(
|
||||
prompt=self.prompt_templates.entity_if_loop_extraction,
|
||||
prompt_type='verify_extraction',
|
||||
chunk_key=chunk_key
|
||||
)
|
||||
|
||||
def process_response(self, response: str, prompt_info: PromptInfo) -> List[PromptInfo]:
|
||||
"""
|
||||
Process LLM response and determine next steps.
|
||||
|
||||
Args:
|
||||
response: LLM response text
|
||||
prompt_info: Information about the prompt that generated this response
|
||||
|
||||
Returns:
|
||||
List of next prompts to be sent to LLM
|
||||
"""
|
||||
if not self._state:
|
||||
raise RuntimeError("Extraction not initialized")
|
||||
|
||||
if prompt_info.prompt_type == 'initial_extraction':
|
||||
return self._handle_initial_response(response, prompt_info)
|
||||
elif prompt_info.prompt_type == 'continue_extraction':
|
||||
return self._handle_gleaning_response(response, prompt_info)
|
||||
elif prompt_info.prompt_type == 'verify_extraction':
|
||||
return self._handle_verify_response(response, prompt_info)
|
||||
return []
|
||||
|
||||
def _handle_initial_response(self, response: str, prompt_info: PromptInfo) -> List[PromptInfo]:
|
||||
"""Handle response from initial extraction"""
|
||||
self._process_extraction_response(response, prompt_info.chunk_key)
|
||||
|
||||
if self.entity_extract_max_gleaning > 0:
|
||||
self._state.phase = ExtractionPhase.GLEANING
|
||||
return self._get_next_prompts()
|
||||
return self._move_to_next_chunk()
|
||||
|
||||
def _handle_gleaning_response(self, response: str, prompt_info: PromptInfo) -> List[PromptInfo]:
|
||||
"""Handle response from gleaning extraction"""
|
||||
self._process_extraction_response(response, prompt_info.chunk_key)
|
||||
self._state.gleaning_count += 1
|
||||
|
||||
if self._state.gleaning_count >= self.entity_extract_max_gleaning:
|
||||
return self._move_to_next_chunk()
|
||||
|
||||
self._state.phase = ExtractionPhase.VERIFY
|
||||
return self._get_next_prompts()
|
||||
|
||||
def _handle_verify_response(self, response: str, prompt_info: PromptInfo) -> List[PromptInfo]:
|
||||
"""Handle response from verification prompt"""
|
||||
if self._clean_str(response).lower() == 'yes':
|
||||
self._state.phase = ExtractionPhase.GLEANING
|
||||
return self._get_next_prompts()
|
||||
return self._move_to_next_chunk()
|
||||
|
||||
def _move_to_next_chunk(self) -> List[PromptInfo]:
|
||||
"""Move to next chunk and return appropriate prompts"""
|
||||
self._state.current_chunk_index += 1
|
||||
self._state.phase = ExtractionPhase.INITIAL
|
||||
self._state.gleaning_count = 0
|
||||
return self._get_next_prompts()
|
||||
|
||||
def _process_extraction_response(self, response: str, chunk_key: str) -> None:
|
||||
"""Process a single extraction response"""
|
||||
records = self._split_into_records(response)
|
||||
|
||||
for record in records:
|
||||
record_match = re.search(r'\((.*?)\)', record)
|
||||
if not record_match:
|
||||
continue
|
||||
|
||||
record_content = record_match.group(1)
|
||||
attributes = self._split_record_attributes(record_content)
|
||||
|
||||
if len(attributes) < 1:
|
||||
continue
|
||||
|
||||
if attributes[0] == '"entity"':
|
||||
entity_data = self._extract_entity(attributes, chunk_key)
|
||||
if entity_data:
|
||||
self._state.extracted_nodes[entity_data['entity_name']].append(entity_data)
|
||||
|
||||
elif attributes[0] == '"relationship"':
|
||||
relation_data = self._extract_relationship(attributes, chunk_key)
|
||||
if relation_data:
|
||||
key = (relation_data['src_id'], relation_data['tgt_id'])
|
||||
self._state.extracted_edges[key].append(relation_data)
|
||||
|
||||
def _split_into_records(self, text: str) -> List[str]:
|
||||
"""Split response text into individual records"""
|
||||
markers = [self.prompt_templates.record_delimiter, self.prompt_templates.completion_delimiter]
|
||||
results = re.split("|".join(re.escape(marker) for marker in markers), text)
|
||||
return [r.strip() for r in results if r.strip()]
|
||||
|
||||
|
||||
def _split_record_attributes(self, record: str) -> List[str]:
|
||||
"""Split record into attributes"""
|
||||
return [attr for attr in record.split(self.prompt_templates.tuple_delimiter) if attr.strip()]
|
||||
|
||||
def _extract_entity(self, attributes: List[str], chunk_key: str) -> Optional[dict]:
|
||||
"""Extract entity data from attributes"""
|
||||
if len(attributes) < 4:
|
||||
return None
|
||||
|
||||
entity_name = self._clean_str(attributes[1].upper())
|
||||
if not entity_name:
|
||||
return None
|
||||
|
||||
return {
|
||||
'entity_name': entity_name,
|
||||
'entity_type': self._clean_str(attributes[2].upper()),
|
||||
'description': self._clean_str(attributes[3]),
|
||||
'source_id': chunk_key
|
||||
}
|
||||
|
||||
def _extract_relationship(self, attributes: List[str], chunk_key: str) -> Optional[dict]:
|
||||
"""Extract relationship data from attributes"""
|
||||
if len(attributes) < 6:
|
||||
return None
|
||||
|
||||
return {
|
||||
'src_id': self._clean_str(attributes[1].upper()),
|
||||
'tgt_id': self._clean_str(attributes[2].upper()),
|
||||
'description': self._clean_str(attributes[3]),
|
||||
'keywords': self._clean_str(attributes[4]),
|
||||
'weight': float(attributes[5]) if self._is_float(attributes[5]) else 1.0,
|
||||
'source_id': chunk_key
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _clean_str(s: str) -> str:
|
||||
"""Clean a string by removing quotes and extra whitespace"""
|
||||
return s.strip().strip('"').strip("'")
|
||||
|
||||
@staticmethod
|
||||
def _is_float(s: str) -> bool:
|
||||
"""Check if string can be converted to float"""
|
||||
try:
|
||||
float(s.strip().strip('"').strip("'"))
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
def get_results(self) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]:
|
||||
"""Get the final extracted nodes and edges"""
|
||||
if not self._state:
|
||||
return defaultdict(list), defaultdict(list)
|
||||
return self._state.extracted_nodes, self._state.extracted_edges
|
||||
@@ -1,239 +0,0 @@
|
||||
GRAPH_FIELD_SEP = "<SEP>"
|
||||
|
||||
PROMPTS = {}
|
||||
|
||||
PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>"
|
||||
PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##"
|
||||
PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
|
||||
PROMPTS["process_tickers"] = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
|
||||
|
||||
PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event"]
|
||||
|
||||
PROMPTS["entity_extraction"] = """-Goal-
|
||||
Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
|
||||
|
||||
-Steps-
|
||||
1. Identify all entities. For each identified entity, extract the following information:
|
||||
- entity_name: Name of the entity, capitalized
|
||||
- entity_type: One of the following types: [{entity_types}]
|
||||
- entity_description: Comprehensive description of the entity's attributes and activities
|
||||
Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>
|
||||
|
||||
2. From the entities identified in core 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
|
||||
For each pair of related entities, extract the following information:
|
||||
- source_entity: name of the source entity, as identified in core 1
|
||||
- target_entity: name of the target entity, as identified in core 1
|
||||
- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
|
||||
- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
|
||||
- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
|
||||
Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_strength>)
|
||||
|
||||
3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
|
||||
Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)
|
||||
|
||||
4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
|
||||
|
||||
5. When finished, output {completion_delimiter}
|
||||
|
||||
######################
|
||||
-Examples-
|
||||
######################
|
||||
Example 1:
|
||||
|
||||
Entity_types: [person, technology, mission, organization, location]
|
||||
Text:
|
||||
while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.
|
||||
|
||||
Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.”
|
||||
|
||||
The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce.
|
||||
|
||||
It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. They had all been brought here by different paths
|
||||
################
|
||||
Output:
|
||||
("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter}
|
||||
("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter}
|
||||
("entity"{tuple_delimiter}"Jordan"{tuple_delimiter}"person"{tuple_delimiter}"Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."){record_delimiter}
|
||||
("entity"{tuple_delimiter}"Cruz"{tuple_delimiter}"person"{tuple_delimiter}"Cruz is associated with a vision of control and order, influencing the dynamics among other characters."){record_delimiter}
|
||||
("entity"{tuple_delimiter}"The Device"{tuple_delimiter}"technology"{tuple_delimiter}"The Device is central to the story, with potential game-changing implications, and is revered by Taylor."){record_delimiter}
|
||||
("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex is affected by Taylor's authoritarian certainty and observes changes in Taylor's attitude towards the device."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}7){record_delimiter}
|
||||
("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Jordan"{tuple_delimiter}"Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision."{tuple_delimiter}"shared goals, rebellion"{tuple_delimiter}6){record_delimiter}
|
||||
("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Jordan"{tuple_delimiter}"Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."{tuple_delimiter}"conflict resolution, mutual respect"{tuple_delimiter}8){record_delimiter}
|
||||
("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter}
|
||||
("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter}
|
||||
("content_keywords"{tuple_delimiter}"power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter}
|
||||
#############################
|
||||
Example 2:
|
||||
|
||||
Entity_types: [person, technology, mission, organization, location]
|
||||
Text:
|
||||
They were no longer mere operatives; they had become guardians of a threshold, keepers of a message from a realm beyond stars and stripes. This elevation in their mission could not be shackled by regulations and established protocols—it demanded a new perspective, a new resolve.
|
||||
|
||||
Tension threaded through the dialogue of beeps and static as communications with Washington buzzed in the background. The team stood, a portentous air enveloping them. It was clear that the decisions they made in the ensuing hours could redefine humanity's place in the cosmos or condemn them to ignorance and potential peril.
|
||||
|
||||
Their connection to the stars solidified, the group moved to address the crystallizing warning, shifting from passive recipients to active participants. Mercer's latter instincts gained precedence— the team's mandate had evolved, no longer solely to observe and report but to interact and prepare. A metamorphosis had begun, and Operation: Dulce hummed with the newfound frequency of their daring, a tone set not by the earthly
|
||||
#############
|
||||
Output:
|
||||
("entity"{tuple_delimiter}"Washington"{tuple_delimiter}"location"{tuple_delimiter}"Washington is a location where communications are being received, indicating its importance in the decision-making process."){record_delimiter}
|
||||
("entity"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"mission"{tuple_delimiter}"Operation: Dulce is described as a mission that has evolved to interact and prepare, indicating a significant shift in objectives and activities."){record_delimiter}
|
||||
("entity"{tuple_delimiter}"The team"{tuple_delimiter}"organization"{tuple_delimiter}"The team is portrayed as a group of individuals who have transitioned from passive observers to active participants in a mission, showing a dynamic change in their role."){record_delimiter}
|
||||
("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington, which influences their decision-making process."{tuple_delimiter}"decision-making, external influence"{tuple_delimiter}7){record_delimiter}
|
||||
("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"The team is directly involved in Operation: Dulce, executing its evolved objectives and activities."{tuple_delimiter}"mission evolution, active participation"{tuple_delimiter}9){completion_delimiter}
|
||||
("content_keywords"{tuple_delimiter}"mission evolution, decision-making, active participation, cosmic significance"){completion_delimiter}
|
||||
#############################
|
||||
Example 3:
|
||||
|
||||
Entity_types: [person, role, technology, organization, event, location, concept]
|
||||
Text:
|
||||
their voice slicing through the buzz of activity. "Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data.
|
||||
|
||||
"It's like it's learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers' a whole new meaning."
|
||||
|
||||
Alex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back."
|
||||
|
||||
Together, they stood on the edge of the unknown, forging humanity's response to a message from the heavens. The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history.
|
||||
|
||||
The encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation
|
||||
#############
|
||||
Output:
|
||||
("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"Sam Rivera is a member of a team working on communicating with an unknown intelligence, showing a mix of awe and anxiety."){record_delimiter}
|
||||
("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is the leader of a team attempting first contact with an unknown intelligence, acknowledging the significance of their task."){record_delimiter}
|
||||
("entity"{tuple_delimiter}"Control"{tuple_delimiter}"concept"{tuple_delimiter}"Control refers to the ability to manage or govern, which is challenged by an intelligence that writes its own rules."){record_delimiter}
|
||||
("entity"{tuple_delimiter}"Intelligence"{tuple_delimiter}"concept"{tuple_delimiter}"Intelligence here refers to an unknown entity capable of writing its own rules and learning to communicate."){record_delimiter}
|
||||
("entity"{tuple_delimiter}"First Contact"{tuple_delimiter}"event"{tuple_delimiter}"First Contact is the potential initial communication between humanity and an unknown intelligence."){record_delimiter}
|
||||
("entity"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"event"{tuple_delimiter}"Humanity's Response is the collective action taken by Alex's team in response to a message from an unknown intelligence."){record_delimiter}
|
||||
("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera is directly involved in the process of learning to communicate with the unknown intelligence."{tuple_delimiter}"communication, learning process"{tuple_delimiter}9){record_delimiter}
|
||||
("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"First Contact"{tuple_delimiter}"Alex leads the team that might be making the First Contact with the unknown intelligence."{tuple_delimiter}"leadership, exploration"{tuple_delimiter}10){record_delimiter}
|
||||
("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity's Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter}
|
||||
("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter}
|
||||
("content_keywords"{tuple_delimiter}"first contact, control, communication, cosmic significance"){completion_delimiter}
|
||||
#############################
|
||||
-Real Data-
|
||||
######################
|
||||
Entity_types: {entity_types}
|
||||
Text: {input_text}
|
||||
######################
|
||||
Output:
|
||||
"""
|
||||
|
||||
PROMPTS[
|
||||
"summarize_entity_descriptions"
|
||||
] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below.
|
||||
Given one or two entities, and a list of descriptions, all related to the same entity or group of entities.
|
||||
Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions.
|
||||
If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary.
|
||||
Make sure it is written in third person, and include the entity names so we the have full context.You are a helpful assistant responsible for generating a comprehensive summary of the data provided below.
|
||||
Given one or two entities, and a list of descriptions, all related to the same entity or group of entities.
|
||||
Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions.
|
||||
If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary.
|
||||
Make sure it is written in third person, and include the entity names so we the have full context.
|
||||
|
||||
#######
|
||||
-Data-
|
||||
Entities: {entity_name}
|
||||
Description List: {description_list}
|
||||
#######
|
||||
Output:
|
||||
"""
|
||||
|
||||
PROMPTS[
|
||||
"entiti_continue_extraction"
|
||||
] = """MANY entities were missed in the last extraction. Add them below using the same format:
|
||||
"""
|
||||
|
||||
PROMPTS[
|
||||
"entiti_if_loop_extraction"
|
||||
] = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added.
|
||||
"""
|
||||
|
||||
PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question."
|
||||
|
||||
PROMPTS["rag_response"] = """---Role---
|
||||
|
||||
You are a helpful assistant responding to questions about data in the tables provided.
|
||||
|
||||
|
||||
---Goal---
|
||||
|
||||
Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
|
||||
If you don't know the answer, just say so. Do not make anything up.
|
||||
Do not include information where the supporting evidence for it is not provided.
|
||||
|
||||
---Target response length and format---
|
||||
|
||||
{response_type}
|
||||
|
||||
---Data tables---
|
||||
|
||||
{context_data}
|
||||
|
||||
Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
|
||||
"""
|
||||
|
||||
PROMPTS["keywords_extraction"] = """---Role---
|
||||
|
||||
You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query.
|
||||
|
||||
---Goal---
|
||||
|
||||
Given the query, list both high-level and low-level keywords. High-level keywords focus on overarching concepts or themes, while low-level keywords focus on specific entities, details, or concrete terms.
|
||||
|
||||
---Instructions---
|
||||
|
||||
- Output the keywords in JSON format.
|
||||
- The JSON should have two keys:
|
||||
- "high_level_keywords" for overarching concepts or themes.
|
||||
- "low_level_keywords" for specific entities or details.
|
||||
|
||||
######################
|
||||
-Examples-
|
||||
######################
|
||||
Example 1:
|
||||
|
||||
Query: "How does international trade influence global economic stability?"
|
||||
################
|
||||
Output:
|
||||
{{
|
||||
"high_level_keywords": ["International trade", "Global economic stability", "Economic impact"],
|
||||
"low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"]
|
||||
}}
|
||||
#############################
|
||||
Example 2:
|
||||
|
||||
Query: "What are the environmental consequences of deforestation on biodiversity?"
|
||||
################
|
||||
Output:
|
||||
{{
|
||||
"high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"],
|
||||
"low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"]
|
||||
}}
|
||||
#############################
|
||||
Example 3:
|
||||
|
||||
Query: "What is the role of education in reducing poverty?"
|
||||
################
|
||||
Output:
|
||||
{{
|
||||
"high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"],
|
||||
"low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"]
|
||||
}}
|
||||
#############################
|
||||
-Real Data-
|
||||
######################
|
||||
Query: {query}
|
||||
######################
|
||||
Output:
|
||||
|
||||
"""
|
||||
|
||||
PROMPTS["naive_rag_response"] = """You're a helpful assistant
|
||||
Below are the knowledge you know:
|
||||
{content_data}
|
||||
---
|
||||
If you don't know the answer or if the provided knowledge do not contain sufficient information to provide an answer, just say so. Do not make anything up.
|
||||
Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
|
||||
If you don't know the answer, just say so. Do not make anything up.
|
||||
Do not include information where the supporting evidence for it is not provided.
|
||||
---Target response length and format---
|
||||
{response_type}
|
||||
"""
|
||||
@@ -1,125 +0,0 @@
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List
|
||||
import importlib.util
|
||||
import os
|
||||
|
||||
|
||||
@dataclass
|
||||
class PromptTemplates:
|
||||
"""Manage system prompt templates for RAG system"""
|
||||
|
||||
# Delimiters and separators
|
||||
field_separator: str = field(default="<SEP>")
|
||||
tuple_delimiter: str = field(default="<|>")
|
||||
record_delimiter: str = field(default="##")
|
||||
completion_delimiter: str = field(default="<|COMPLETE|>")
|
||||
|
||||
# Process tickers
|
||||
process_tickers: List[str] = field(
|
||||
default_factory=lambda: ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
|
||||
)
|
||||
|
||||
# Default entity types
|
||||
default_entity_types: List[str] = field(
|
||||
default_factory=lambda: ["organization", "person", "geo", "event"]
|
||||
)
|
||||
|
||||
# All prompt templates predefined with empty defaults
|
||||
entity_extraction: str = field(default="")
|
||||
summarize_entity_descriptions: str = field(default="")
|
||||
entiti_continue_extraction: str = field(default="")
|
||||
entiti_if_loop_extraction: str = field(default="")
|
||||
fail_response: str = field(default="")
|
||||
rag_response: str = field(default="")
|
||||
naive_rag_response: str = field(default="")
|
||||
keywords_extraction: str = field(default="")
|
||||
|
||||
def __post_init__(self):
|
||||
"""Load prompts from the prompt.py file after initialization"""
|
||||
self.load_prompts()
|
||||
|
||||
def load_prompts(self, prompts_path: str = None):
|
||||
"""Load prompts from the specified file path or from default location"""
|
||||
if prompts_path is None:
|
||||
# Default path relative to current directory
|
||||
prompts_path = os.path.join(os.path.dirname(__file__), "prompt.py")
|
||||
|
||||
try:
|
||||
# Load the module from file path
|
||||
spec = importlib.util.spec_from_file_location("prompt", prompts_path)
|
||||
if spec and spec.loader:
|
||||
prompt_module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(prompt_module)
|
||||
|
||||
# Get the PROMPTS dictionary
|
||||
prompts_dict = getattr(prompt_module, "PROMPTS", {})
|
||||
|
||||
# Load field separator from module level
|
||||
self.field_separator = getattr(prompt_module, "GRAPH_FIELD_SEP", self.field_separator)
|
||||
|
||||
# Load delimiters from PROMPTS dictionary
|
||||
self.tuple_delimiter = prompts_dict.get("DEFAULT_TUPLE_DELIMITER", self.tuple_delimiter)
|
||||
self.record_delimiter = prompts_dict.get("DEFAULT_RECORD_DELIMITER", self.record_delimiter)
|
||||
self.completion_delimiter = prompts_dict.get("DEFAULT_COMPLETION_DELIMITER", self.completion_delimiter)
|
||||
|
||||
# Load process tickers
|
||||
self.process_tickers = prompts_dict.get("process_tickers", self.process_tickers)
|
||||
|
||||
# Load entity types
|
||||
self.default_entity_types = prompts_dict.get("DEFAULT_ENTITY_TYPES", self.default_entity_types)
|
||||
|
||||
# Load all prompt templates
|
||||
for key, value in prompts_dict.items():
|
||||
# Convert the key to match class attribute names if needed
|
||||
attr_name = key.lower()
|
||||
if hasattr(self, attr_name):
|
||||
setattr(self, attr_name, value)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error loading prompts from {prompts_path}: {str(e)}")
|
||||
raise
|
||||
|
||||
def format_entity_extraction(self, text: str, entity_types: List[str] = None) -> str:
|
||||
"""Format entity extraction prompt"""
|
||||
if entity_types is None:
|
||||
entity_types = self.default_entity_types
|
||||
return self.entity_extraction.format(
|
||||
entity_types=", ".join(entity_types),
|
||||
input_text=text,
|
||||
tuple_delimiter=self.tuple_delimiter,
|
||||
record_delimiter=self.record_delimiter,
|
||||
completion_delimiter=self.completion_delimiter
|
||||
)
|
||||
|
||||
def format_summarize_entity(self, entity_name: str, description_list: List[str]) -> str:
|
||||
"""Format entity summarization prompt"""
|
||||
return self.summarize_entity_descriptions.format(
|
||||
entity_name=entity_name,
|
||||
description_list="\n".join(description_list)
|
||||
)
|
||||
|
||||
def format_keyword_extraction(self, query: str) -> str:
|
||||
"""Format keyword extraction prompt"""
|
||||
return self.keywords_extraction.format(query=query)
|
||||
|
||||
def format_rag_response(
|
||||
self,
|
||||
context_data: str,
|
||||
response_type: str = "detailed paragraph"
|
||||
) -> str:
|
||||
"""Format RAG response prompt"""
|
||||
return self.rag_response.format(
|
||||
context_data=context_data,
|
||||
response_type=response_type
|
||||
)
|
||||
|
||||
def format_naive_rag_response(
|
||||
self,
|
||||
content_data: str,
|
||||
response_type: str = "detailed paragraph"
|
||||
) -> str:
|
||||
"""Format naive RAG response prompt"""
|
||||
return self.naive_rag_response.format(
|
||||
content_data=content_data,
|
||||
response_type=response_type
|
||||
)
|
||||
@@ -1,369 +0,0 @@
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Optional, Set, Union, TypeVar, Generic, Tuple, Any
|
||||
import os
|
||||
import json
|
||||
import networkx as nx
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
import html
|
||||
from datetime import datetime
|
||||
|
||||
from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel
|
||||
from crazy_functions.rag_fns.llama_index_worker import LlamaIndexRagWorker
|
||||
|
||||
T = TypeVar('T')
|
||||
|
||||
|
||||
@dataclass
|
||||
class StorageBase:
|
||||
"""Base class for all storage implementations"""
|
||||
namespace: str
|
||||
working_dir: str
|
||||
|
||||
async def index_done_callback(self):
|
||||
"""Hook called after indexing operations"""
|
||||
pass
|
||||
|
||||
async def query_done_callback(self):
|
||||
"""Hook called after query operations"""
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class JsonKVStorage(StorageBase, Generic[T]):
|
||||
"""
|
||||
Key-Value storage using JSON files
|
||||
|
||||
Attributes:
|
||||
namespace (str): Storage namespace
|
||||
working_dir (str): Working directory for storage files
|
||||
_file_name (str): JSON file path
|
||||
_data (Dict[str, T]): In-memory storage
|
||||
"""
|
||||
|
||||
def __post_init__(self):
|
||||
"""Initialize storage file and load data"""
|
||||
self._file_name = os.path.join(self.working_dir, f"kv_store_{self.namespace}.json")
|
||||
self._data: Dict[str, T] = {}
|
||||
self.load()
|
||||
|
||||
def load(self):
|
||||
"""Load data from JSON file"""
|
||||
if os.path.exists(self._file_name):
|
||||
with open(self._file_name, 'r', encoding='utf-8') as f:
|
||||
self._data = json.load(f)
|
||||
logger.info(f"Loaded {len(self._data)} items from {self._file_name}")
|
||||
|
||||
async def save(self):
|
||||
"""Save data to JSON file"""
|
||||
os.makedirs(os.path.dirname(self._file_name), exist_ok=True)
|
||||
with open(self._file_name, 'w', encoding='utf-8') as f:
|
||||
json.dump(self._data, f, ensure_ascii=False, indent=2)
|
||||
|
||||
async def get_by_id(self, id: str) -> Optional[T]:
|
||||
"""Get item by ID"""
|
||||
return self._data.get(id)
|
||||
|
||||
async def get_by_ids(self, ids: List[str], fields: Optional[Set[str]] = None) -> List[Optional[T]]:
|
||||
"""Get multiple items by IDs with optional field filtering"""
|
||||
if fields is None:
|
||||
return [self._data.get(id) for id in ids]
|
||||
return [{k: v for k, v in self._data[id].items() if k in fields}
|
||||
if id in self._data else None
|
||||
for id in ids]
|
||||
|
||||
async def filter_keys(self, keys: List[str]) -> Set[str]:
|
||||
"""Return keys that don't exist in storage"""
|
||||
return set(k for k in keys if k not in self._data)
|
||||
|
||||
async def upsert(self, data: Dict[str, T]):
|
||||
"""Insert or update items"""
|
||||
self._data.update(data)
|
||||
await self.save()
|
||||
|
||||
async def drop(self):
|
||||
"""Clear all data"""
|
||||
self._data = {}
|
||||
if os.path.exists(self._file_name):
|
||||
os.remove(self._file_name)
|
||||
|
||||
async def all_keys(self) -> List[str]:
|
||||
"""Get all keys in storage"""
|
||||
return list(self._data.keys())
|
||||
|
||||
async def index_done_callback(self):
|
||||
"""Save after indexing"""
|
||||
await self.save()
|
||||
|
||||
|
||||
|
||||
@dataclass
|
||||
class VectorStorage(StorageBase):
|
||||
"""
|
||||
Vector storage using LlamaIndexRagWorker
|
||||
|
||||
Attributes:
|
||||
namespace (str): Storage namespace (e.g., 'entities', 'relationships', 'chunks')
|
||||
working_dir (str): Working directory for storage files
|
||||
llm_kwargs (dict): LLM configuration
|
||||
embedding_func (OpenAiEmbeddingModel): Embedding function
|
||||
meta_fields (Set[str]): Additional metadata fields to store
|
||||
"""
|
||||
llm_kwargs: dict
|
||||
embedding_func: OpenAiEmbeddingModel
|
||||
meta_fields: Set[str] = field(default_factory=set)
|
||||
|
||||
def __post_init__(self):
|
||||
"""Initialize LlamaIndex worker"""
|
||||
# 使用正确的文件命名格式
|
||||
self._vector_file = os.path.join(self.working_dir, f"vdb_{self.namespace}.json")
|
||||
|
||||
# 设置检查点目录
|
||||
checkpoint_dir = os.path.join(self.working_dir, f"vector_{self.namespace}_checkpoint")
|
||||
os.makedirs(checkpoint_dir, exist_ok=True)
|
||||
|
||||
# 初始化向量存储
|
||||
self.vector_store = LlamaIndexRagWorker(
|
||||
user_name=self.namespace,
|
||||
llm_kwargs=self.llm_kwargs,
|
||||
checkpoint_dir=checkpoint_dir,
|
||||
auto_load_checkpoint=True
|
||||
)
|
||||
logger.info(f"Initialized vector storage for {self.namespace}")
|
||||
|
||||
async def query(self, query: str, top_k: int = 5, metadata_filters: Optional[Dict[str, Any]] = None) -> List[dict]:
|
||||
"""
|
||||
Query vectors by similarity with optional metadata filtering
|
||||
|
||||
Args:
|
||||
query: Query text
|
||||
top_k: Maximum number of results to return
|
||||
metadata_filters: Optional metadata filters
|
||||
|
||||
Returns:
|
||||
List of similar documents with scores
|
||||
"""
|
||||
try:
|
||||
if metadata_filters:
|
||||
nodes = self.vector_store.retrieve_with_metadata_filter(query, metadata_filters, top_k)
|
||||
else:
|
||||
nodes = self.vector_store.retrieve_from_store_with_query(query)[:top_k]
|
||||
|
||||
results = []
|
||||
for node in nodes:
|
||||
result = {
|
||||
"id": node.node_id,
|
||||
"text": node.text,
|
||||
"score": node.score if hasattr(node, 'score') else 0.0,
|
||||
}
|
||||
# Add metadata fields if they exist and are in meta_fields
|
||||
if hasattr(node, 'metadata'):
|
||||
result.update({
|
||||
k: node.metadata[k]
|
||||
for k in self.meta_fields
|
||||
if k in node.metadata
|
||||
})
|
||||
results.append(result)
|
||||
|
||||
return results
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in vector query: {e}")
|
||||
raise
|
||||
|
||||
async def upsert(self, data: Dict[str, dict]):
|
||||
"""
|
||||
Insert or update vectors
|
||||
|
||||
Args:
|
||||
data: Dictionary of documents to insert/update with format:
|
||||
{id: {"content": text, "metadata": dict}}
|
||||
"""
|
||||
try:
|
||||
for doc_id, item in data.items():
|
||||
content = item["content"]
|
||||
# 提取元数据
|
||||
metadata = {
|
||||
k: item[k]
|
||||
for k in self.meta_fields
|
||||
if k in item
|
||||
}
|
||||
# 添加文档ID到元数据
|
||||
metadata["doc_id"] = doc_id
|
||||
|
||||
# 添加到向量存储
|
||||
self.vector_store.add_text_with_metadata(content, metadata)
|
||||
|
||||
# 导出向量数据到json文件
|
||||
self.vector_store.export_nodes(
|
||||
self._vector_file,
|
||||
format="json",
|
||||
include_embeddings=True
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in vector upsert: {e}")
|
||||
raise
|
||||
|
||||
async def save(self):
|
||||
"""Save vector store to checkpoint and export data"""
|
||||
try:
|
||||
# 保存检查点
|
||||
self.vector_store.save_to_checkpoint()
|
||||
|
||||
# 导出向量数据
|
||||
self.vector_store.export_nodes(
|
||||
self._vector_file,
|
||||
format="json",
|
||||
include_embeddings=True
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error saving vector storage: {e}")
|
||||
raise
|
||||
|
||||
async def index_done_callback(self):
|
||||
"""Save after indexing"""
|
||||
await self.save()
|
||||
|
||||
def get_statistics(self) -> Dict[str, Any]:
|
||||
"""Get vector store statistics"""
|
||||
return self.vector_store.get_statistics()
|
||||
|
||||
|
||||
@dataclass
|
||||
class NetworkStorage(StorageBase):
|
||||
"""
|
||||
Graph storage using NetworkX
|
||||
|
||||
Attributes:
|
||||
namespace (str): Storage namespace
|
||||
working_dir (str): Working directory for storage files
|
||||
"""
|
||||
|
||||
def __post_init__(self):
|
||||
"""Initialize graph and storage file"""
|
||||
self._file_name = os.path.join(self.working_dir, f"graph_{self.namespace}.graphml")
|
||||
self._graph = self._load_graph() or nx.Graph()
|
||||
logger.info(f"Initialized graph storage for {self.namespace}")
|
||||
|
||||
def _load_graph(self) -> Optional[nx.Graph]:
|
||||
"""Load graph from GraphML file"""
|
||||
if os.path.exists(self._file_name):
|
||||
try:
|
||||
graph = nx.read_graphml(self._file_name)
|
||||
logger.info(f"Loaded graph with {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges")
|
||||
return graph
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading graph from {self._file_name}: {e}")
|
||||
return None
|
||||
return None
|
||||
|
||||
async def save_graph(self):
|
||||
"""Save graph to GraphML file"""
|
||||
try:
|
||||
os.makedirs(os.path.dirname(self._file_name), exist_ok=True)
|
||||
logger.info(
|
||||
f"Saving graph with {self._graph.number_of_nodes()} nodes, {self._graph.number_of_edges()} edges")
|
||||
nx.write_graphml(self._graph, self._file_name)
|
||||
except Exception as e:
|
||||
logger.error(f"Error saving graph: {e}")
|
||||
raise
|
||||
|
||||
async def has_node(self, node_id: str) -> bool:
|
||||
"""Check if node exists"""
|
||||
return self._graph.has_node(node_id)
|
||||
|
||||
async def has_edge(self, source_id: str, target_id: str) -> bool:
|
||||
"""Check if edge exists"""
|
||||
return self._graph.has_edge(source_id, target_id)
|
||||
|
||||
async def get_node(self, node_id: str) -> Optional[dict]:
|
||||
"""Get node attributes"""
|
||||
if not self._graph.has_node(node_id):
|
||||
return None
|
||||
return dict(self._graph.nodes[node_id])
|
||||
|
||||
async def get_edge(self, source_id: str, target_id: str) -> Optional[dict]:
|
||||
"""Get edge attributes"""
|
||||
if not self._graph.has_edge(source_id, target_id):
|
||||
return None
|
||||
return dict(self._graph.edges[source_id, target_id])
|
||||
|
||||
async def node_degree(self, node_id: str) -> int:
|
||||
"""Get node degree"""
|
||||
return self._graph.degree(node_id)
|
||||
|
||||
async def edge_degree(self, source_id: str, target_id: str) -> int:
|
||||
"""Get sum of degrees of edge endpoints"""
|
||||
return self._graph.degree(source_id) + self._graph.degree(target_id)
|
||||
|
||||
async def get_node_edges(self, source_id: str) -> Optional[List[Tuple[str, str]]]:
|
||||
"""Get all edges connected to node"""
|
||||
if not self._graph.has_node(source_id):
|
||||
return None
|
||||
return list(self._graph.edges(source_id))
|
||||
|
||||
async def upsert_node(self, node_id: str, node_data: Dict[str, str]):
|
||||
"""Insert or update node"""
|
||||
cleaned_data = {k: html.escape(str(v).upper().strip()) for k, v in node_data.items()}
|
||||
self._graph.add_node(node_id, **cleaned_data)
|
||||
await self.save_graph()
|
||||
|
||||
async def upsert_edge(self, source_id: str, target_id: str, edge_data: Dict[str, str]):
|
||||
"""Insert or update edge"""
|
||||
cleaned_data = {k: html.escape(str(v).strip()) for k, v in edge_data.items()}
|
||||
self._graph.add_edge(source_id, target_id, **cleaned_data)
|
||||
await self.save_graph()
|
||||
|
||||
async def index_done_callback(self):
|
||||
"""Save after indexing"""
|
||||
await self.save_graph()
|
||||
|
||||
def get_largest_connected_component(self) -> nx.Graph:
|
||||
"""Get the largest connected component of the graph"""
|
||||
if not self._graph:
|
||||
return nx.Graph()
|
||||
|
||||
components = list(nx.connected_components(self._graph))
|
||||
if not components:
|
||||
return nx.Graph()
|
||||
|
||||
largest_component = max(components, key=len)
|
||||
return self._graph.subgraph(largest_component).copy()
|
||||
|
||||
async def embed_nodes(
|
||||
self,
|
||||
algorithm: str = "node2vec",
|
||||
dimensions: int = 128,
|
||||
walk_length: int = 30,
|
||||
num_walks: int = 200,
|
||||
workers: int = 4,
|
||||
window: int = 10,
|
||||
min_count: int = 1,
|
||||
**kwargs
|
||||
) -> Tuple[np.ndarray, List[str]]:
|
||||
"""Generate node embeddings using specified algorithm"""
|
||||
if algorithm == "node2vec":
|
||||
from node2vec import Node2Vec
|
||||
|
||||
# Create and train node2vec model
|
||||
n2v = Node2Vec(
|
||||
self._graph,
|
||||
dimensions=dimensions,
|
||||
walk_length=walk_length,
|
||||
num_walks=num_walks,
|
||||
workers=workers
|
||||
)
|
||||
|
||||
model = n2v.fit(
|
||||
window=window,
|
||||
min_count=min_count
|
||||
)
|
||||
|
||||
# Get embeddings for all nodes
|
||||
node_ids = list(self._graph.nodes())
|
||||
embeddings = np.array([model.wv[node] for node in node_ids])
|
||||
|
||||
return embeddings, node_ids
|
||||
|
||||
raise ValueError(f"Unsupported embedding algorithm: {algorithm}")
|
||||
@@ -1,394 +0,0 @@
|
||||
from typing import List, Dict, Optional, Tuple
import asyncio
import os
from datetime import datetime
from pprint import pprint
import json
from loguru import logger
import numpy as np

from core.prompt_templates import PromptTemplates
from core.extractor import EntityRelationExtractor, PromptInfo
from core.storage import JsonKVStorage, VectorStorage, NetworkStorage
from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel

from openai import OpenAI

client = OpenAI(api_key=os.getenv("API_KEY"), base_url=os.getenv("API_URL"))


class ExtractionExample:
    """Example class demonstrating comprehensive RAG system functionality"""

    def __init__(self):
        """Initialize RAG system components"""
        # Set up the working directory
        self.working_dir = f"crazy_functions/rag_fns/LightRAG/rag_cache_{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"
        os.makedirs(self.working_dir, exist_ok=True)
        logger.info(f"Working directory: {self.working_dir}")

        # Initialize the embedding model
        self.llm_kwargs = {
            'api_key': os.getenv("one_api_key"),
            'client_ip': '127.0.0.1',
            'embed_model': 'text-embedding-3-small',
            'llm_model': 'one-api-Qwen2.5-72B-Instruct',
            'max_length': 4096,
            'most_recent_uploaded': None,
            'temperature': 1,
            'top_p': 1
        }
        self.embedding_func = OpenAiEmbeddingModel(self.llm_kwargs)

        # Initialize prompt templates and the extractor
        self.prompt_templates = PromptTemplates()
        self.extractor = EntityRelationExtractor(
            prompt_templates=self.prompt_templates,
            required_prompts={'entity_extraction'},
            entity_extract_max_gleaning=1
        )

        # Initialize the storage system
        self._init_storage_system()

        # Conversation history
        self.conversation_history = {}

    def _init_storage_system(self):
        """Initialize storage components"""
        # KV storage - for raw documents and chunks
        self.text_chunks = JsonKVStorage[dict](
            namespace="text_chunks",
            working_dir=self.working_dir
        )

        self.full_docs = JsonKVStorage[dict](
            namespace="full_docs",
            working_dir=self.working_dir
        )

        # Vector storage - for vector representations of entities, relationships and text chunks
        self.entities_vdb = VectorStorage(
            namespace="entities",
            working_dir=self.working_dir,
            llm_kwargs=self.llm_kwargs,
            embedding_func=self.embedding_func,
            meta_fields={"entity_name", "entity_type"}
        )

        self.relationships_vdb = VectorStorage(
            namespace="relationships",
            working_dir=self.working_dir,
            llm_kwargs=self.llm_kwargs,
            embedding_func=self.embedding_func,
            meta_fields={"src_id", "tgt_id"}
        )

        self.chunks_vdb = VectorStorage(
            namespace="chunks",
            working_dir=self.working_dir,
            llm_kwargs=self.llm_kwargs,
            embedding_func=self.embedding_func
        )

        # Graph storage - for entity relations
        self.graph_store = NetworkStorage(
            namespace="chunk_entity_relation",
            working_dir=self.working_dir
        )

    async def simulate_llm_call(self, prompt: str, prompt_info: PromptInfo) -> str:
        """Simulate LLM call with conversation history"""
        # Get the conversation history for the current chunk
        chunk_history = self.conversation_history.get(prompt_info.chunk_key, [])

        messages = [
            {"role": "system",
             "content": "You are a helpful assistant specialized in entity and relationship extraction."}
        ]

        # Append the historical messages
        for msg in chunk_history:
            messages.append(msg)

        # Append the current prompt
        messages.append({"role": "user", "content": prompt})

        try:
            # Call the LLM
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=messages,
                stream=False
            )

            response_content = response.choices[0].message.content

            # Update the conversation history
            chunk_history.extend([
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": response_content}
            ])
            self.conversation_history[prompt_info.chunk_key] = chunk_history

            logger.info(f"\nProcessing chunk: {prompt_info.chunk_key}")
            logger.info(f"Phase: {prompt_info.prompt_type}")
            logger.info(f"Response: {response_content[:200]}...")

            return response_content

        except Exception as e:
            logger.error(f"Error in LLM call: {e}")
            raise

    async def process_document(self, content: str) -> Tuple[Dict, Dict]:
        """Process a single document through the RAG pipeline"""
        doc_id = f"doc_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        # Store the raw document
        await self.full_docs.upsert({
            doc_id: {"content": content}
        })

        # Split the document into chunks
        from core.chunking import chunk_document
        chunks = chunk_document(content)
        chunk_dict = {
            f"{doc_id}_chunk_{i}": {"content": chunk, "doc_id": doc_id}
            for i, chunk in enumerate(chunks)
        }

        # Store the chunks
        await self.text_chunks.upsert(chunk_dict)

        # Process the chunks and extract entities and relationships
        nodes, edges = await self.process_chunk_batch(chunk_dict)

        return nodes, edges

    async def process_chunk_batch(self, chunks: Dict[str, dict]):
        """Process text chunks and store results"""
        try:
            # Vector storage
            logger.info("Adding chunks to vector store...")
            await self.chunks_vdb.upsert(chunks)

            # Initialize the conversation history
            self.conversation_history = {chunk_key: [] for chunk_key in chunks.keys()}

            # Extract entities and relationships
            logger.info("Extracting entities and relationships...")
            prompts = self.extractor.initialize_extraction(chunks)

            while prompts:
                # Run the prompts
                responses = await asyncio.gather(
                    *[self.simulate_llm_call(p.prompt, p) for p in prompts]
                )

                # Process the responses
                next_prompts = []
                for response, prompt_info in zip(responses, prompts):
                    next_batch = self.extractor.process_response(response, prompt_info)
                    next_prompts.extend(next_batch)

                prompts = next_prompts

            # Collect the results
            nodes, edges = self.extractor.get_results()

            # Store entities in the vector database and the graph database
            for node_name, node_instances in nodes.items():
                for node in node_instances:
                    # Store in the vector database
                    await self.entities_vdb.upsert({
                        f"entity_{node_name}": {
                            "content": f"{node_name}: {node['description']}",
                            "entity_name": node_name,
                            "entity_type": node['entity_type']
                        }
                    })
                    # Store in the graph database
                    await self.graph_store.upsert_node(node_name, node)

            # Store relationships in the vector database and the graph database
            for (src, tgt), edge_instances in edges.items():
                for edge in edge_instances:
                    # Store in the vector database
                    await self.relationships_vdb.upsert({
                        f"rel_{src}_{tgt}": {
                            "content": f"{edge['description']} | {edge['keywords']}",
                            "src_id": src,
                            "tgt_id": tgt
                        }
                    })
                    # Store in the graph database
                    await self.graph_store.upsert_edge(src, tgt, edge)

            return nodes, edges

        except Exception as e:
            logger.error(f"Error in processing chunks: {e}")
            raise

    async def query_knowledge_base(self, query: str, top_k: int = 5):
        """Query the knowledge base using various methods"""
        try:
            # Vector similarity search - text chunks
            chunk_results = await self.chunks_vdb.query(query, top_k=top_k)

            # Vector similarity search - entities
            entity_results = await self.entities_vdb.query(query, top_k=top_k)

            # Fetch the related text chunks
            chunk_ids = [r["id"] for r in chunk_results]
            chunks = await self.text_chunks.get_by_ids(chunk_ids)

            # Fetch graph-structure information related to the matched entities
            relevant_edges = []
            for entity in entity_results:
                if "entity_name" in entity:
                    entity_name = entity["entity_name"]
                    if await self.graph_store.has_node(entity_name):
                        edges = await self.graph_store.get_node_edges(entity_name)
                        if edges:
                            edge_data = []
                            for edge in edges:
                                edge_info = await self.graph_store.get_edge(edge[0], edge[1])
                                if edge_info:
                                    edge_data.append({
                                        "source": edge[0],
                                        "target": edge[1],
                                        "data": edge_info
                                    })
                            relevant_edges.extend(edge_data)

            return {
                "chunks": chunks,
                "entities": entity_results,
                "relationships": relevant_edges
            }

        except Exception as e:
            logger.error(f"Error in querying knowledge base: {e}")
            raise

    def export_knowledge_base(self, export_dir: str):
        """Export the entire knowledge base"""
        os.makedirs(export_dir, exist_ok=True)

        try:
            # Collect storage statistics
            storage_stats = {
                "chunks": {
                    "total": len(self.text_chunks._data),
                    "vector_stats": self.chunks_vdb.get_statistics()
                },
                "entities": {
                    "vector_stats": self.entities_vdb.get_statistics()
                },
                "relationships": {
                    "vector_stats": self.relationships_vdb.get_statistics()
                },
                "graph": {
                    "total_nodes": len(list(self.graph_store._graph.nodes())),
                    "total_edges": len(list(self.graph_store._graph.edges())),
                    "node_degrees": dict(self.graph_store._graph.degree()),
                    "largest_component_size": len(self.graph_store.get_largest_connected_component())
                }
            }

            # Write the statistics
            with open(os.path.join(export_dir, "storage_stats.json"), "w") as f:
                json.dump(storage_stats, f, indent=2)

        except Exception as e:
            logger.error(f"Error in exporting knowledge base: {e}")
            raise

    def print_extraction_results(self, nodes: Dict[str, List[dict]], edges: Dict[tuple, List[dict]]):
        """Print extraction results and statistics"""
        print("\nExtracted Entities:")
        print("-" * 50)
        for entity_name, entity_instances in nodes.items():
            print(f"\nEntity: {entity_name}")
            for inst in entity_instances:
                pprint(inst, indent=2)

        print("\nExtracted Relationships:")
        print("-" * 50)
        for (src, tgt), rel_instances in edges.items():
            print(f"\nRelationship: {src} -> {tgt}")
            for inst in rel_instances:
                pprint(inst, indent=2)

        print("\nStorage Statistics:")
        print("-" * 50)
        print(f"Working Directory: {self.working_dir}")
        print(f"Number of Documents: {len(self.full_docs._data)}")
        print(f"Number of Chunks: {len(self.text_chunks._data)}")
        print(f"Conversation Turns: {sum(len(h) // 2 for h in self.conversation_history.values())}")

        # Print graph statistics
        print("\nGraph Statistics:")
        print("-" * 50)
        print(f"Total Nodes: {len(list(self.graph_store._graph.nodes()))}")
        print(f"Total Edges: {len(list(self.graph_store._graph.edges()))}")


async def main():
    """Run comprehensive RAG example"""
    # Test documents
    documents = {
        "tech_news": """
        Apple Inc. announced new iPhone models today in Cupertino.
        Tim Cook, the CEO, presented the keynote. The presentation highlighted
        the company's commitment to innovation and sustainability. The new iPhone
        features groundbreaking AI capabilities.
        """,
    }

    try:
        # Create the RAG system instance
        example = ExtractionExample()

        # Process the documents
        all_nodes = {}
        all_edges = {}

        for doc_name, content in documents.items():
            logger.info(f"\nProcessing document: {doc_name}")
            nodes, edges = await example.process_document(content)
            all_nodes.update(nodes)
            all_edges.update(edges)

        # Print the results
        example.print_extraction_results(all_nodes, all_edges)

        # Run a test query
        query = "What are the latest developments in AI?"
        logger.info(f"\nTesting query: {query}")
        results = await example.query_knowledge_base(query)

        print("\nQuery Results:")
        print("-" * 50)
        pprint(results)

        # Export the knowledge base
        export_dir = os.path.join(example.working_dir, "export")
        print("\nExporting knowledge base...")
        logger.info(f"\nExporting knowledge base to: {export_dir}")
        example.export_knowledge_base(export_dir)

    except Exception as e:
        logger.error(f"Error in main: {e}")
        raise


def run_example():
    """Run the example"""
    asyncio.run(main())


if __name__ == "__main__":
    run_example()
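The script above reads its credentials from environment variables: API_KEY and API_URL for the module-level OpenAI chat client, and one_api_key for OpenAiEmbeddingModel via llm_kwargs. A minimal launch sketch, assuming the file were saved under the hypothetical module name extraction_example; the credential values and the base URL are placeholders, not real configuration.

import os

# Placeholder credentials - substitute values that are valid for your endpoints.
# These must be set before the module is imported, because the OpenAI client
# is created at import time.
os.environ["API_KEY"] = "sk-..."
os.environ["API_URL"] = "https://your-openai-compatible-endpoint/v1"
os.environ["one_api_key"] = "sk-..."

from extraction_example import run_example  # hypothetical module name
run_example()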