from abc import ABC, abstractmethod from typing import Any, Dict, Optional, Type, TypeVar, Generic, Union from dataclasses import dataclass from enum import Enum, auto import logging from datetime import datetime from crazy_functions.rag_fns.arxiv_fns.section_fragment import SectionFragment # 设置日志 logger = logging.getLogger(__name__) # 自定义异常类定义 class FoldingError(Exception): """折叠相关的自定义异常基类""" pass class FormattingError(FoldingError): """格式化过程中的错误""" pass class MetadataError(FoldingError): """元数据相关的错误""" pass class ValidationError(FoldingError): """验证错误""" pass class FoldingStyle(Enum): """折叠样式枚举""" SIMPLE = auto() # 简单折叠 DETAILED = auto() # 详细折叠(带有额外信息) NESTED = auto() # 嵌套折叠 @dataclass class FoldingOptions: """折叠选项配置""" style: FoldingStyle = FoldingStyle.DETAILED code_language: Optional[str] = None # 代码块的语言 show_timestamp: bool = False # 是否显示时间戳 indent_level: int = 0 # 缩进级别 custom_css: Optional[str] = None # 自定义CSS类 T = TypeVar('T') # 用于泛型类型 class BaseMetadata(ABC): """元数据基类""" @abstractmethod def validate(self) -> bool: """验证元数据的有效性""" pass def _validate_non_empty_str(self, value: Optional[str]) -> bool: """验证字符串非空""" return bool(value and value.strip()) @dataclass class FileMetadata(BaseMetadata): """文件元数据""" rel_path: str size: float last_modified: Optional[datetime] = None mime_type: Optional[str] = None encoding: str = 'utf-8' def validate(self) -> bool: """验证文件元数据的有效性""" try: if not self._validate_non_empty_str(self.rel_path): return False if self.size < 0: return False return True except Exception as e: logger.error(f"File metadata validation error: {str(e)}") return False class ContentFormatter(ABC, Generic[T]): """内容格式化抽象基类 支持泛型类型参数,可以指定具体的元数据类型。 """ @abstractmethod def format(self, content: str, metadata: T, options: Optional[FoldingOptions] = None) -> str: """格式化内容 Args: content: 需要格式化的内容 metadata: 类型化的元数据 options: 折叠选项 Returns: str: 格式化后的内容 Raises: FormattingError: 格式化过程中的错误 """ pass def _create_summary(self, metadata: T) -> str: """创建折叠摘要,可被子类重写""" return str(metadata) def _format_content_block(self, content: str, options: Optional[FoldingOptions]) -> str: """格式化内容块,处理代码块等特殊格式""" if not options: return content if options.code_language: return f"```{options.code_language}\n{content}\n```" return content def _add_indent(self, text: str, level: int) -> str: """添加缩进""" if level <= 0: return text indent = " " * level return "\n".join(indent + line for line in text.splitlines()) class FileContentFormatter(ContentFormatter[FileMetadata]): """文件内容格式化器""" def format(self, content: str, metadata: FileMetadata, options: Optional[FoldingOptions] = None) -> str: """格式化文件内容""" if not metadata.validate(): raise MetadataError("Invalid file metadata") try: options = options or FoldingOptions() # 构建摘要信息 summary_parts = [ f"{metadata.rel_path} ({metadata.size:.2f}MB)", f"Type: {metadata.mime_type}" if metadata.mime_type else None, (f"Modified: {metadata.last_modified.strftime('%Y-%m-%d %H:%M:%S')}" if metadata.last_modified and options.show_timestamp else None) ] summary = " | ".join(filter(None, summary_parts)) # 构建HTML类 css_class = f' class="{options.custom_css}"' if options.custom_css else '' # 格式化内容 formatted_content = self._format_content_block(content, options) # 组装最终结果 result = ( f'{summary}\n\n' f'{formatted_content}\n\n' f'\n\n' ) return self._add_indent(result, options.indent_level) except Exception as e: logger.error(f"Error formatting file content: {str(e)}") raise FormattingError(f"Failed to format file content: {str(e)}") class ContentFoldingManager: """内容折叠管理器""" def __init__(self): """初始化折叠管理器""" self._formatters: Dict[str, ContentFormatter] = {} self._register_default_formatters() def _register_default_formatters(self) -> None: """注册默认的格式化器""" self.register_formatter('file', FileContentFormatter()) def register_formatter(self, name: str, formatter: ContentFormatter) -> None: """注册新的格式化器""" if not isinstance(formatter, ContentFormatter): raise TypeError("Formatter must implement ContentFormatter interface") self._formatters[name] = formatter def _guess_language(self, extension: str) -> Optional[str]: """根据文件扩展名猜测编程语言""" extension = extension.lower().lstrip('.') language_map = { 'py': 'python', 'js': 'javascript', 'java': 'java', 'cpp': 'cpp', 'cs': 'csharp', 'html': 'html', 'css': 'css', 'md': 'markdown', 'json': 'json', 'xml': 'xml', 'sql': 'sql', 'sh': 'bash', 'yaml': 'yaml', 'yml': 'yaml', 'txt': None # 纯文本不需要语言标识 } return language_map.get(extension) def format_content(self, content: str, formatter_type: str, metadata: Union[FileMetadata], options: Optional[FoldingOptions] = None) -> str: """格式化内容""" formatter = self._formatters.get(formatter_type) if not formatter: raise KeyError(f"No formatter registered for type: {formatter_type}") if not isinstance(metadata, FileMetadata): raise TypeError("Invalid metadata type") return formatter.format(content, metadata, options) @dataclass class PaperMetadata(BaseMetadata): """论文元数据""" title: str authors: str abstract: str catalogs: str arxiv_id: str = "" def validate(self) -> bool: """验证论文元数据的有效性""" try: if not self._validate_non_empty_str(self.title): return False if not self._validate_non_empty_str(self.authors): return False if not self._validate_non_empty_str(self.abstract): return False if not self._validate_non_empty_str(self.catalogs): return False return True except Exception as e: logger.error(f"Paper metadata validation error: {str(e)}") return False class PaperContentFormatter(ContentFormatter[PaperMetadata]): """论文内容格式化器""" def format(self, fragments: list[SectionFragment], metadata: PaperMetadata, options: Optional[FoldingOptions] = None) -> str: """格式化论文内容 Args: fragments: 论文片段列表 metadata: 论文元数据 options: 折叠选项 Returns: str: 格式化后的论文内容 """ if not metadata.validate(): raise MetadataError("Invalid paper metadata") try: options = options or FoldingOptions() # 1. 生成标题部分(不折叠) result = [f"# {metadata.title}\n"] # 2. 生成作者信息(折叠) result.append(self._create_folded_section( "Authors", metadata.authors, options )) # 3. 生成摘要(折叠) result.append(self._create_folded_section( "Abstract", metadata.abstract, options )) # 4. 生成目录树(折叠) result.append(self._create_folded_section( "Table of Contents", f"```\n{metadata.catalogs}\n```", options )) # 5. 按章节组织并生成内容 sections = self._organize_sections(fragments) for section, section_fragments in sections.items(): # 拼接该章节的所有内容 section_content = "\n\n".join( fragment.content for fragment in section_fragments ) result.append(self._create_folded_section( section, section_content, options )) # 6. 生成参考文献(折叠) # 收集所有非空的参考文献 all_refs = "\n".join(filter(None, (fragment.bibliography for fragment in fragments) )) if all_refs: result.append(self._create_folded_section( "Bibliography", f"```bibtex\n{all_refs}\n```", options )) return "\n\n".join(result) except Exception as e: logger.error(f"Error formatting paper content: {str(e)}") raise FormattingError(f"Failed to format paper content: {str(e)}") def _create_folded_section(self, title: str, content: str, options: FoldingOptions) -> str: """创建折叠区块 Args: title: 区块标题 content: 区块内容 options: 折叠选项 Returns: str: 格式化后的折叠区块 """ css_class = f' class="{options.custom_css}"' if options.custom_css else '' result = ( f'{title}\n\n' f'{content}\n\n' f'' ) return self._add_indent(result, options.indent_level) def _organize_sections(self, fragments: list[SectionFragment] ) -> Dict[str, list[SectionFragment]]: """将片段按章节分组 Args: fragments: 论文片段列表 Returns: Dict[str, list[SectionFragment]]: 按章节分组的片段字典 """ sections: Dict[str, list[SectionFragment]] = {} for fragment in fragments: section = fragment.current_section or "Uncategorized" if section not in sections: sections[section] = [] sections[section].append(fragment) return sections