这个提交包含在:
lbykkkk
2024-12-01 17:35:57 +08:00
父节点 cf51d4b205
当前提交 b3aef6b393
共有 13 个文件被更改,包括 398 次插入和 234 次删除

查看文件

@@ -5,14 +5,15 @@ This module provides functionality for parsing and extracting structured informa
including metadata, document structure, and content. It uses modular design and clean architecture principles.
"""
import logging
import re
from abc import ABC, abstractmethod
import logging
from dataclasses import dataclass, field
from typing import List, Optional, Dict
from copy import deepcopy
from dataclasses import dataclass, field
from typing import List, Dict
from crazy_functions.rag_fns.arxiv_fns.latex_cleaner import clean_latex_commands
from crazy_functions.rag_fns.arxiv_fns.section_extractor import Section, SectionLevel, EnhancedSectionExtractor
from crazy_functions.rag_fns.arxiv_fns.section_extractor import Section, EnhancedSectionExtractor
# Configure logging
logging.basicConfig(level=logging.INFO)
@@ -28,6 +29,7 @@ def read_tex_file(file_path):
except UnicodeDecodeError:
continue
@dataclass
class DocumentStructure:
title: str = ''
@@ -68,7 +70,7 @@ class DocumentStructure:
if other_section.title in sections_map:
# Merge existing section
idx = next(i for i, s in enumerate(merged.toc)
if s.title == other_section.title)
if s.title == other_section.title)
merged.toc[idx] = merged.toc[idx].merge(other_section)
else:
# Add new section
@@ -149,6 +151,8 @@ class DocumentStructure:
result.extend(_format_section(section, 0) for section in self.toc)
return "".join(result)
class BaseExtractor(ABC):
"""Base class for LaTeX content extractors."""
@@ -157,6 +161,7 @@ class BaseExtractor(ABC):
"""Extract specific content from LaTeX document."""
pass
class TitleExtractor(BaseExtractor):
"""Extracts title from LaTeX document."""
@@ -180,6 +185,7 @@ class TitleExtractor(BaseExtractor):
return clean_latex_commands(title)
return ''
class AbstractExtractor(BaseExtractor):
"""Extracts abstract from LaTeX document."""
@@ -203,6 +209,7 @@ class AbstractExtractor(BaseExtractor):
return clean_latex_commands(abstract)
return ''
class EssayStructureParser:
"""Main class for parsing LaTeX documents."""
@@ -231,6 +238,7 @@ class EssayStructureParser:
content = re.sub(r'(?<!\\)%.*$', '', content, flags=re.MULTILINE)
return content
def pretty_print_structure(doc: DocumentStructure, max_content_length: int = 100):
"""Print document structure in a readable format."""
print(f"Title: {doc.title}\n")
@@ -250,10 +258,10 @@ def pretty_print_structure(doc: DocumentStructure, max_content_length: int = 100
for section in doc.toc:
print_section(section)
# Example usage:
if __name__ == "__main__":
# Test with a file
file_path = 'test_cache/2411.03663/neurips_2024.tex'
main_tex = read_tex_file(file_path)
@@ -278,5 +286,5 @@ if __name__ == "__main__":
additional_doc = parser.parse(tex_content)
main_doc = main_doc.merge(additional_doc)
tree= main_doc.generate_toc_tree()
pretty_print_structure(main_doc)
tree = main_doc.generate_toc_tree()
pretty_print_structure(main_doc)