diff --git a/crazy_functions/rag_fns/arxiv_fns/essay_structure.py b/crazy_functions/rag_fns/arxiv_fns/essay_structure.py
new file mode 100644
index 00000000..91ea0474
--- /dev/null
+++ b/crazy_functions/rag_fns/arxiv_fns/essay_structure.py
@@ -0,0 +1,356 @@
+"""
+LaTeX Document Parser
+
+This module parses LaTeX documents and extracts structured information from them:
+metadata (title, abstract), the section hierarchy, and per-section content. It is
+organised around small, single-purpose extractor classes.
+"""
+
+import re
+import logging
+from abc import ABC, abstractmethod
+from copy import deepcopy
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import List, Optional, Dict
+
+from crazy_functions.rag_fns.arxiv_fns.latex_cleaner import clean_latex_commands
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class SectionLevel(Enum):
+    CHAPTER = 0
+    SECTION = 1
+    SUBSECTION = 2
+    SUBSUBSECTION = 3
+    PARAGRAPH = 4
+    SUBPARAGRAPH = 5
+
+
+@dataclass
+class Section:
+    level: SectionLevel
+    title: str
+    content: str = ''
+    subsections: List['Section'] = field(default_factory=list)
+
+    def merge(self, other: 'Section') -> 'Section':
+        """Merge this section with another section."""
+        if self.title != other.title or self.level != other.level:
+            raise ValueError("Can only merge sections with same title and level")
+
+        merged = deepcopy(self)
+        merged.content = self._merge_content(self.content, other.content)
+
+        # Create subsections lookup for efficient merging
+        subsections_map = {s.title: s for s in merged.subsections}
+
+        for other_subsection in other.subsections:
+            if other_subsection.title in subsections_map:
+                # Merge existing subsection
+                idx = next(i for i, s in enumerate(merged.subsections)
+                           if s.title == other_subsection.title)
+                merged.subsections[idx] = merged.subsections[idx].merge(other_subsection)
+            else:
+                # Add new subsection
+                merged.subsections.append(deepcopy(other_subsection))
+
+        return merged
+
+    @staticmethod
+    def _merge_content(content1: str, content2: str) -> str:
+        """Merge content strings intelligently."""
+        if not content1:
+            return content2
+        if not content2:
+            return content1
+        # Combine non-empty contents with a separator
+        return f"{content1}\n\n{content2}"
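A minimal sketch of how `Section.merge` behaves (titles and contents below are made up for illustration): matching subsections are merged recursively, and contents are joined with a blank line.

```python
from crazy_functions.rag_fns.arxiv_fns.essay_structure import Section, SectionLevel

a = Section(SectionLevel.SECTION, "Background", "Part 1 of background...",
            subsections=[Section(SectionLevel.SUBSECTION, "Related Work", "Early surveys...")])
b = Section(SectionLevel.SECTION, "Background", "Part 2 of background...",
            subsections=[Section(SectionLevel.SUBSECTION, "Related Work", "Recent results...")])

merged = a.merge(b)
print(merged.content)                 # contents joined: "Part 1 of background..." + blank line + "Part 2 of background..."
print(len(merged.subsections))        # 1 -- the two "Related Work" subsections were folded together
print(merged.subsections[0].content)  # "Early surveys..." + blank line + "Recent results..."

# Mismatched titles or levels raise ValueError rather than merging silently.
```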
+
+
+@dataclass
+class DocumentStructure:
+    title: str = ''
+    abstract: str = ''
+    toc: List[Section] = field(default_factory=list)
+    metadata: Dict[str, str] = field(default_factory=dict)
+
+    def merge(self, other: 'DocumentStructure', strategy: str = 'smart') -> 'DocumentStructure':
+        """
+        Merge this document structure with another one.
+
+        Args:
+            other: Another DocumentStructure to merge with
+            strategy: Merge strategy - 'smart' (default) or 'append'
+                'smart'  - Intelligently merge sections with same titles
+                'append' - Simply append sections from other document
+        """
+        merged = deepcopy(self)
+
+        # Merge title if needed
+        if not merged.title and other.title:
+            merged.title = other.title
+
+        # Merge abstract
+        merged.abstract = self._merge_abstract(merged.abstract, other.abstract)
+
+        # Merge metadata
+        merged.metadata.update(other.metadata)
+
+        if strategy == 'append':
+            merged.toc.extend(deepcopy(other.toc))
+        else:  # smart merge
+            # Create sections lookup for efficient merging
+            sections_map = {s.title: s for s in merged.toc}
+
+            for other_section in other.toc:
+                if other_section.title in sections_map:
+                    # Merge existing section
+                    idx = next(i for i, s in enumerate(merged.toc)
+                               if s.title == other_section.title)
+                    merged.toc[idx] = merged.toc[idx].merge(other_section)
+                else:
+                    # Add new section
+                    merged.toc.append(deepcopy(other_section))
+
+        return merged
+
+    @staticmethod
+    def _merge_abstract(abstract1: str, abstract2: str) -> str:
+        """Merge abstracts intelligently."""
+        if not abstract1:
+            return abstract2
+        if not abstract2:
+            return abstract1
+        # Combine non-empty abstracts with a separator
+        return f"{abstract1}\n\n{abstract2}"
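To make the two merge strategies in the docstring above concrete, a small sketch (document contents are illustrative):

```python
from crazy_functions.rag_fns.arxiv_fns.essay_structure import DocumentStructure, Section, SectionLevel

doc_a = DocumentStructure(title="Research Paper",
                          toc=[Section(SectionLevel.SECTION, "Background", "Part 1...")])
doc_b = DocumentStructure(toc=[Section(SectionLevel.SECTION, "Background", "Part 2..."),
                               Section(SectionLevel.SECTION, "Methodology", "Methods...")])

smart = doc_a.merge(doc_b)                      # default strategy='smart'
print([s.title for s in smart.toc])             # ['Background', 'Methodology'] -- same-title sections folded together

appended = doc_a.merge(doc_b, strategy='append')
print([s.title for s in appended.toc])          # ['Background', 'Background', 'Methodology'] -- no de-duplication
```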
+
+
+class BaseExtractor(ABC):
+    """Base class for LaTeX content extractors."""
+
+    @abstractmethod
+    def extract(self, content: str) -> str:
+        """Extract specific content from LaTeX document."""
+        pass
+
+
+class TitleExtractor(BaseExtractor):
+    """Extracts title from LaTeX document."""
+
+    PATTERNS = [
+        r'\\title{(.+?)}',
+        r'\\title\[.*?\]{(.+?)}',
+        r'\\Title{(.+?)}',
+        r'\\TITLE{(.+?)}',
+        r'\\begin{document}\s*\\section[*]?{(.+?)}',
+        r'\\maketitle\s*\\section[*]?{(.+?)}',
+        r'\\chapter[*]?{(.+?)}'
+    ]
+
+    def extract(self, content: str) -> str:
+        """Extract title using defined patterns."""
+        for pattern in self.PATTERNS:
+            matches = list(re.finditer(pattern, content, re.IGNORECASE | re.DOTALL))
+            for match in matches:
+                title = match.group(1).strip()
+                if title:
+                    return clean_latex_commands(title)
+        return ''
+
+
+class AbstractExtractor(BaseExtractor):
+    """Extracts abstract from LaTeX document."""
+
+    PATTERNS = [
+        r'\\begin{abstract}(.*?)\\end{abstract}',
+        r'\\abstract{(.*?)}',
+        r'\\ABSTRACT{(.*?)}',
+        r'\\Abstract{(.*?)}',
+        r'\\begin{Abstract}(.*?)\\end{Abstract}',
+        r'\\section[*]?{(?:Abstract|ABSTRACT)}\s*(.*?)(?:\\section|\Z)',
+        r'\\chapter[*]?{(?:Abstract|ABSTRACT)}\s*(.*?)(?:\\chapter|\Z)'
+    ]
+
+    def extract(self, content: str) -> str:
+        """Extract abstract using defined patterns."""
+        for pattern in self.PATTERNS:
+            matches = list(re.finditer(pattern, content, re.IGNORECASE | re.DOTALL))
+            for match in matches:
+                abstract = match.group(1).strip()
+                if abstract:
+                    return clean_latex_commands(abstract)
+        return ''
+
+
+class SectionExtractor:
+    """Extracts document structure including sections and their content."""
+
+    def __init__(self):
+        self.section_pattern = self._compile_section_pattern()
+
+    def _compile_section_pattern(self) -> str:
+        """Create pattern for matching section commands."""
+        section_types = '|'.join(level.name.lower() for level in SectionLevel)
+        return fr'\\({section_types})\*?(?:\[.*?\])?\{{(.*?)\}}'
+
+    def extract(self, content: str) -> List[Section]:
+        """Extract sections and build document hierarchy."""
+        sections = []
+        section_stack = []
+        matches = list(re.finditer(self.section_pattern, content, re.IGNORECASE))
+
+        for i, match in enumerate(matches):
+            cmd_type = match.group(1).lower()
+            section_title = match.group(2)
+            level = SectionLevel[cmd_type.upper()]
+
+            # Use a dedicated variable so the full document text is not overwritten mid-loop
+            section_content = self._extract_section_content(content, match,
+                                                            matches[i + 1] if i < len(matches) - 1 else None)
+
+            new_section = Section(
+                level=level,
+                # clean_latex_commands rejects empty input, so guard blank titles and bodies
+                title=clean_latex_commands(section_title) if section_title else '',
+                content=clean_latex_commands(section_content) if section_content else ''
+            )
+
+            self._update_section_hierarchy(sections, section_stack, new_section)
+
+        return sections
+
+    def _extract_section_content(self, content: str, current_match: re.Match,
+                                 next_match: Optional[re.Match]) -> str:
+        """Extract content between current section and next section."""
+        start_pos = current_match.end()
+        end_pos = next_match.start() if next_match else len(content)
+        return content[start_pos:end_pos].strip()
+
+    def _update_section_hierarchy(self, sections: List[Section],
+                                  stack: List[Section], new_section: Section):
+        """Update section hierarchy based on section levels."""
+        while stack and stack[-1].level.value >= new_section.level.value:
+            stack.pop()
+
+        if stack:
+            stack[-1].subsections.append(new_section)
+        else:
+            sections.append(new_section)
+
+        stack.append(new_section)
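The stack-based `_update_section_hierarchy` is what turns the flat list of regex matches into a tree: a lower-level heading nests under the most recent higher-level one, and an equal or higher level pops back up. A small sketch of the resulting shape (the input snippet is illustrative):

```python
from crazy_functions.rag_fns.arxiv_fns.essay_structure import SectionExtractor

toc = SectionExtractor().extract(r"""
\section{Methods}
Overview of the approach...
\subsection{Data}
Corpus details...
\subsection{Model}
Architecture details...
\section{Results}
Main findings...
""")

print([s.title for s in toc])                 # ['Methods', 'Results']
print([s.title for s in toc[0].subsections])  # ['Data', 'Model'] -- nested under Methods
```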
+ """ + + # Parse both files + parser = EssayStructureParser() # Assuming LaTeXParser class from previous code + main_doc = parser.parse(main_tex) + background_doc = parser.parse(background_tex) + + # Merge documents using smart strategy + merged_doc = main_doc.merge(background_doc) + + # Example of how sections are merged: + print("Original Background section content:", + [s for s in main_doc.toc if s.title == "Background"][0].content) + print("\nMerged Background section content:", + [s for s in merged_doc.toc if s.title == "Background"][0].content) + print("\nMerged structure:") + pretty_print_structure(merged_doc) # Assuming pretty_print_structure from previous code + + # Example of appending sections + appended_doc = main_doc.merge(background_doc, strategy='append') + print("\nAppended structure (may have duplicate sections):") + pretty_print_structure(appended_doc) \ No newline at end of file diff --git a/crazy_functions/rag_fns/arxiv_fns/latex_cleaner.py b/crazy_functions/rag_fns/arxiv_fns/latex_cleaner.py new file mode 100644 index 00000000..6f2ea06c --- /dev/null +++ b/crazy_functions/rag_fns/arxiv_fns/latex_cleaner.py @@ -0,0 +1,220 @@ +from dataclasses import dataclass, field +from typing import Set, Dict, Pattern, Optional +import re +from enum import Enum +import logging +from functools import lru_cache + + +class EnvType(Enum): + """Environment classification types.""" + PRESERVE = "preserve" + REMOVE = "remove" + EXTRACT = "extract" + + +@dataclass +class LatexConfig: + """Configuration for LaTeX processing.""" + preserve_envs: Set[str] = field(default_factory=lambda: { + # Math environments + 'equation', 'equation*', 'align', 'align*', 'displaymath', + 'math', 'eqnarray', 'gather', 'gather*', 'multline', 'multline*', + # Tables and figures + 'table', 'table*', 'tabular', 'tabularx', 'array', 'matrix', + 'figure', 'figure*', 'subfigure', + # Algorithms and code + 'algorithm', 'algorithmic', 'lstlisting', + # Theorems and proofs + 'theorem', 'proof', 'definition', 'lemma', 'corollary', + 'proposition', 'example', 'remark' + }) + + preserve_commands: Set[str] = field(default_factory=lambda: { + # Citations and references + 'caption', 'label', 'ref', 'cite', 'citep', 'citet', 'eqref', + # Text formatting + 'emph', 'textbf', 'textit', 'underline', 'texttt', 'footnote', + 'section', 'subsection', 'subsubsection', 'paragraph', + # Math operators + 'frac', 'sum', 'int', 'prod', 'lim', 'sup', 'inf' + }) + + remove_commands: Set[str] = field(default_factory=lambda: { + # Document setup + 'documentclass', 'usepackage', 'input', 'include', 'includeonly', + 'bibliography', 'bibliographystyle', 'frontmatter', 'mainmatter', + # Layout and spacing + 'pagestyle', 'thispagestyle', 'vspace', 'hspace', 'vfill', 'hfill', + 'newpage', 'clearpage', 'pagebreak', 'linebreak', 'newline', + 'setlength', 'setcounter', 'addtocounter', 'renewcommand', + 'newcommand', 'makeatletter', 'makeatother', 'pagenumbering', + # Margins and columns + 'marginpar', 'marginparsep', 'columnsep', 'columnseprule', + 'twocolumn', 'onecolumn', 'minipage', 'parbox' + }) + + latex_chars: Dict[str, str] = field(default_factory=lambda: { + '~': ' ', '\\&': '&', '\\%': '%', '\\_': '_', '\\$': '$', + '\\#': '#', '\\{': '{', '\\}': '}', '``': '"', "''": '"', + '\\textbackslash': '\\', '\\ldots': '...', '\\dots': '...', + '\\textasciitilde': '~', '\\textasciicircum': '^', + '\\quad': ' ', '\\qquad': ' ', '\\,': '', '\\;': '', '\\:': '', + '\\!': '', '\\space': ' ', '\\noindent': '' + }) + + inline_math_delimiters: Set[str] = 
diff --git a/crazy_functions/rag_fns/arxiv_fns/latex_cleaner.py b/crazy_functions/rag_fns/arxiv_fns/latex_cleaner.py
new file mode 100644
index 00000000..6f2ea06c
--- /dev/null
+++ b/crazy_functions/rag_fns/arxiv_fns/latex_cleaner.py
@@ -0,0 +1,220 @@
+import re
+import logging
+from dataclasses import dataclass, field
+from enum import Enum
+from functools import lru_cache
+from typing import Set, Dict, Pattern, Optional
+
+
+class EnvType(Enum):
+    """Environment classification types."""
+    PRESERVE = "preserve"
+    REMOVE = "remove"
+    EXTRACT = "extract"
+
+
+@dataclass
+class LatexConfig:
+    """Configuration for LaTeX processing."""
+    preserve_envs: Set[str] = field(default_factory=lambda: {
+        # Math environments
+        'equation', 'equation*', 'align', 'align*', 'displaymath',
+        'math', 'eqnarray', 'gather', 'gather*', 'multline', 'multline*',
+        # Tables and figures
+        'table', 'table*', 'tabular', 'tabularx', 'array', 'matrix',
+        'figure', 'figure*', 'subfigure',
+        # Algorithms and code
+        'algorithm', 'algorithmic', 'lstlisting',
+        # Theorems and proofs
+        'theorem', 'proof', 'definition', 'lemma', 'corollary',
+        'proposition', 'example', 'remark'
+    })
+
+    preserve_commands: Set[str] = field(default_factory=lambda: {
+        # Citations and references
+        'caption', 'label', 'ref', 'cite', 'citep', 'citet', 'eqref',
+        # Text formatting
+        'emph', 'textbf', 'textit', 'underline', 'texttt', 'footnote',
+        'section', 'subsection', 'subsubsection', 'paragraph',
+        # Math operators
+        'frac', 'sum', 'int', 'prod', 'lim', 'sup', 'inf'
+    })
+
+    remove_commands: Set[str] = field(default_factory=lambda: {
+        # Document setup
+        'documentclass', 'usepackage', 'input', 'include', 'includeonly',
+        'bibliography', 'bibliographystyle', 'frontmatter', 'mainmatter',
+        # Layout and spacing
+        'pagestyle', 'thispagestyle', 'vspace', 'hspace', 'vfill', 'hfill',
+        'newpage', 'clearpage', 'pagebreak', 'linebreak', 'newline',
+        'setlength', 'setcounter', 'addtocounter', 'renewcommand',
+        'newcommand', 'makeatletter', 'makeatother', 'pagenumbering',
+        # Margins and columns
+        'marginpar', 'marginparsep', 'columnsep', 'columnseprule',
+        'twocolumn', 'onecolumn', 'minipage', 'parbox'
+    })
+
+    latex_chars: Dict[str, str] = field(default_factory=lambda: {
+        '~': ' ', '\\&': '&', '\\%': '%', '\\_': '_', '\\$': '$',
+        '\\#': '#', '\\{': '{', '\\}': '}', '``': '"', "''": '"',
+        '\\textbackslash': '\\', '\\ldots': '...', '\\dots': '...',
+        '\\textasciitilde': '~', '\\textasciicircum': '^',
+        '\\quad': ' ', '\\qquad': ' ', '\\,': '', '\\;': '', '\\:': '',
+        '\\!': '', '\\space': ' ', '\\noindent': ''
+    })
+
+    inline_math_delimiters: Set[str] = field(default_factory=lambda: {
+        '$', '\\(', '\\)', '\\[', '\\]'
+    })
+
+
+class LatexCleaner:
+    """Efficient and modular LaTeX text cleaner."""
+
+    def __init__(self, config: Optional[LatexConfig] = None):
+        self.config = config or LatexConfig()
+        self.logger = logging.getLogger(__name__)
+
+    @lru_cache(maxsize=128)
+    def _get_env_pattern(self, env_name: str) -> Pattern:
+        return re.compile(fr'\\begin{{{env_name}}}(.*?)\\end{{{env_name}}}', re.DOTALL)
+
+    def _get_env_type(self, env_name: str) -> EnvType:
+        """Determine environment processing type."""
+        if env_name.rstrip('*') in {name.rstrip('*') for name in self.config.preserve_envs}:
+            return EnvType.PRESERVE
+        elif env_name in {'verbatim', 'comment'}:
+            return EnvType.REMOVE
+        return EnvType.EXTRACT
+
+    def _process_environment(self, match: re.Match) -> str:
+        try:
+            env_name = match.group(1)
+            content = match.group(2)
+            env_type = self._get_env_type(env_name)
+
+            if env_type == EnvType.PRESERVE:
+                # Preserve math content without markers for inline math
+                if env_name in {'math', 'displaymath'}:
+                    return f" {content} "
+                return f" [BEGIN_{env_name}] {content} [END_{env_name}] "
+            elif env_type == EnvType.REMOVE:
+                return ' '
+            # Process nested environments recursively
+            return self._clean_nested_environments(content)
+        except Exception as e:
+            self.logger.error(f"Error processing environment {env_name}: {e}")
+            return content
+
+    def _clean_nested_environments(self, text: str) -> str:
+        """Process nested environments recursively."""
+        return re.sub(
+            r'\\begin{(\w+)}(.*?)\\end{\1}',
+            self._process_environment,
+            text,
+            flags=re.DOTALL
+        )
+
+    def _clean_commands(self, text: str) -> str:
+        """Clean LaTeX commands while preserving specified content."""
+        # Remove complete commands
+        for cmd in self.config.remove_commands:
+            text = re.sub(fr'\\{cmd}\*?(?:\[.*?\])?(?:{{.*?}})*', '', text)
+
+        # Process commands with content
+        def handle_command(match: re.Match) -> str:
+            cmd = match.group(1).rstrip('*')  # Handle starred versions
+            content = match.group(2)
+
+            # Keep math content intact
+            if cmd in {'[', ']', '(', ')', '$'} or cmd in self.config.inline_math_delimiters:
+                return content
+
+            return content if cmd in self.config.preserve_commands else ' '
+
+        # Handle commands with arguments
+        text = re.sub(r'\\(\w+)\*?(?:\[.*?\])?{(.*?)}', handle_command, text)
+
+        # Handle inline math
+        text = self._preserve_inline_math(text)
+
+        # Remove remaining standalone commands
+        return re.sub(r'\\[a-zA-Z]+\*?(?:\[\])?', '', text)
+
+    def _preserve_inline_math(self, text: str) -> str:
+        """Preserve inline math content."""
+        # Handle $...$ math
+        text = re.sub(r'\$(.+?)\$', r' \1 ', text)
+        # Handle \(...\) math
+        text = re.sub(r'\\[\(\[](.+?)\\[\)\]]', r' \1 ', text)
+        return text
+
+    def _normalize_text(self, text: str) -> str:
+        """Normalize special characters and whitespace."""
+        # Replace special characters
+        for char, replacement in self.config.latex_chars.items():
+            text = text.replace(char, replacement)
+
+        # Clean up whitespace
+        text = re.sub(r'\s+', ' ', text)
+        text = re.sub(r'\s*\[BEGIN_(\w+)\]\s*', r' [BEGIN_\1] ', text)
+        text = re.sub(r'\s*\[END_(\w+)\]\s*', r' [END_\1] ', text)
+
+        # Remove empty brackets and braces
+        text = re.sub(r'{\s*}|\[\s*\]|\(\s*\)', '', text)
+
+        return text.strip()
+
+    def clean_text(self, text: str) -> str:
+        """Clean LaTeX text while preserving meaningful content."""
+        if not text:
+            raise ValueError("Input text cannot be empty")
+
+        try:
+            # Remove comments not inside environments (ignore escaped \%)
+            text = re.sub(r'(?<!\\)%.*', '', text)
+            # Process environments, then commands, then normalize the result
+            text = self._clean_nested_environments(text)
+            text = self._clean_commands(text)
+            return self._normalize_text(text)
+        except Exception as e:
+            self.logger.error(f"Error cleaning LaTeX text: {e}")
+            return text
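How the cleaner classifies and marks up environments, shown against the default `LatexConfig` (peeking at the private `_get_env_type` helper purely for illustration):

```python
from crazy_functions.rag_fns.arxiv_fns.latex_cleaner import LatexCleaner

cleaner = LatexCleaner()                   # default LatexConfig
print(cleaner._get_env_type("equation*"))  # EnvType.PRESERVE -- starred forms match their base name
print(cleaner._get_env_type("comment"))    # EnvType.REMOVE   -- dropped entirely
print(cleaner._get_env_type("quote"))      # EnvType.EXTRACT  -- markup dropped, inner text kept

# Preserved non-math environments are wrapped in [BEGIN_<env>] ... [END_<env>] markers,
# so downstream consumers can still see where they started and ended.
print(cleaner.clean_text(r"See \begin{equation} E = mc^2 \end{equation} for details."))
# -> roughly: "See [BEGIN_equation] E = mc^2 [END_equation] for details."
```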
+
+
+def clean_latex_commands(text: str) -> str:
+    """Convenience function for quick text cleaning with a preset configuration."""
+    config = LatexConfig(
+        preserve_envs={'equation', 'theorem'},
+        preserve_commands={'textbf', 'emph', 'label'},
+        latex_chars={'~': ' ', '\\&': '&'}
+    )
+    return LatexCleaner(config).clean_text(text)
+
+
+# Example usage:
+if __name__ == "__main__":
+    # Basic usage with inline math
+    text = clean_latex_commands(r"""
+    \textbf{Important} result: $E=mc^2$ and
+    \begin{equation}
+    F = ma
+    \end{equation}
+    """)
+    print(text)
+
+    # Custom configuration
+    config = LatexConfig(
+        preserve_envs={'equation', 'theorem'},
+        preserve_commands={'textbf', 'emph'},
+        latex_chars={'~': ' ', '\\&': '&'}
+    )
+    cleaner = LatexCleaner(config)
+    text = cleaner.clean_text(r"\textbf{Custom} cleaning")
+    print(text)
\ No newline at end of file
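A hypothetical pytest-style smoke test for the convenience function (test name and assertions are illustrative, not part of this PR); it checks the three behaviours the module cares about most: markup removal, inline-math preservation, and comment stripping.

```python
from crazy_functions.rag_fns.arxiv_fns.latex_cleaner import clean_latex_commands


def test_clean_latex_commands_basics():
    cleaned = clean_latex_commands(r"\textbf{Key} idea: $a + b$ % draft note")

    assert "Key" in cleaned              # preserved command keeps its argument
    assert "a + b" in cleaned            # inline math survives without the $ delimiters
    assert "draft note" not in cleaned   # comments are stripped
    assert "\\textbf" not in cleaned     # the command markup itself is gone
```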