镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-07 23:16:48 +00:00
up
这个提交包含在:
@@ -5,14 +5,15 @@ This module provides functionality for parsing and extracting structured informa
|
||||
including metadata, document structure, and content. It uses modular design and clean architecture principles.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Dict
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Dict
|
||||
|
||||
from crazy_functions.rag_fns.arxiv_fns.latex_cleaner import clean_latex_commands
|
||||
from crazy_functions.rag_fns.arxiv_fns.section_extractor import Section, SectionLevel, EnhancedSectionExtractor
|
||||
from crazy_functions.rag_fns.arxiv_fns.section_extractor import Section, EnhancedSectionExtractor
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
@@ -28,6 +29,7 @@ def read_tex_file(file_path):
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
|
||||
|
||||
@dataclass
|
||||
class DocumentStructure:
|
||||
title: str = ''
|
||||
@@ -68,7 +70,7 @@ class DocumentStructure:
|
||||
if other_section.title in sections_map:
|
||||
# Merge existing section
|
||||
idx = next(i for i, s in enumerate(merged.toc)
|
||||
if s.title == other_section.title)
|
||||
if s.title == other_section.title)
|
||||
merged.toc[idx] = merged.toc[idx].merge(other_section)
|
||||
else:
|
||||
# Add new section
|
||||
@@ -149,6 +151,8 @@ class DocumentStructure:
|
||||
result.extend(_format_section(section, 0) for section in self.toc)
|
||||
|
||||
return "".join(result)
|
||||
|
||||
|
||||
class BaseExtractor(ABC):
|
||||
"""Base class for LaTeX content extractors."""
|
||||
|
||||
@@ -157,6 +161,7 @@ class BaseExtractor(ABC):
|
||||
"""Extract specific content from LaTeX document."""
|
||||
pass
|
||||
|
||||
|
||||
class TitleExtractor(BaseExtractor):
|
||||
"""Extracts title from LaTeX document."""
|
||||
|
||||
@@ -180,6 +185,7 @@ class TitleExtractor(BaseExtractor):
|
||||
return clean_latex_commands(title)
|
||||
return ''
|
||||
|
||||
|
||||
class AbstractExtractor(BaseExtractor):
|
||||
"""Extracts abstract from LaTeX document."""
|
||||
|
||||
@@ -203,6 +209,7 @@ class AbstractExtractor(BaseExtractor):
|
||||
return clean_latex_commands(abstract)
|
||||
return ''
|
||||
|
||||
|
||||
class EssayStructureParser:
|
||||
"""Main class for parsing LaTeX documents."""
|
||||
|
||||
@@ -231,6 +238,7 @@ class EssayStructureParser:
|
||||
content = re.sub(r'(?<!\\)%.*$', '', content, flags=re.MULTILINE)
|
||||
return content
|
||||
|
||||
|
||||
def pretty_print_structure(doc: DocumentStructure, max_content_length: int = 100):
|
||||
"""Print document structure in a readable format."""
|
||||
print(f"Title: {doc.title}\n")
|
||||
@@ -250,10 +258,10 @@ def pretty_print_structure(doc: DocumentStructure, max_content_length: int = 100
|
||||
for section in doc.toc:
|
||||
print_section(section)
|
||||
|
||||
|
||||
# Example usage:
|
||||
if __name__ == "__main__":
|
||||
|
||||
|
||||
# Test with a file
|
||||
file_path = 'test_cache/2411.03663/neurips_2024.tex'
|
||||
main_tex = read_tex_file(file_path)
|
||||
@@ -278,5 +286,5 @@ if __name__ == "__main__":
|
||||
additional_doc = parser.parse(tex_content)
|
||||
main_doc = main_doc.merge(additional_doc)
|
||||
|
||||
tree= main_doc.generate_toc_tree()
|
||||
pretty_print_structure(main_doc)
|
||||
tree = main_doc.generate_toc_tree()
|
||||
pretty_print_structure(main_doc)
|
||||
|
||||
在新工单中引用
屏蔽一个用户