From 795a6a9333ae9d5d263c4510cfae5296f4a3f4da Mon Sep 17 00:00:00 2001 From: lbykkkk Date: Sun, 1 Dec 2024 23:26:02 +0800 Subject: [PATCH] add tex html formatter --- config.py | 2 +- crazy_functions/Arxiv_论文对话.py | 18 +- crazy_functions/doc_fns/tex_html_formatter.py | 354 ++++++++++++++++++ .../rag_fns/arxiv_fns/arxiv_splitter.py | 30 +- 4 files changed, 390 insertions(+), 14 deletions(-) create mode 100644 crazy_functions/doc_fns/tex_html_formatter.py diff --git a/config.py b/config.py index fb127eca..e71a59b1 100644 --- a/config.py +++ b/config.py @@ -39,7 +39,7 @@ AVAIL_LLM_MODELS = ["gpt-4-1106-preview", "gpt-4-turbo-preview", "gpt-4-vision-p "gemini-1.5-pro", "chatglm3" ] -EMBEDDING_MODEL = "netease-youdao/bce-embedding-base_v1" +EMBEDDING_MODEL = "text-embedding-3-small" # --- --- --- --- # P.S. 其他可用的模型还包括 diff --git a/crazy_functions/Arxiv_论文对话.py b/crazy_functions/Arxiv_论文对话.py index 7a32222b..d3e115c5 100644 --- a/crazy_functions/Arxiv_论文对话.py +++ b/crazy_functions/Arxiv_论文对话.py @@ -414,6 +414,7 @@ def Arxiv论文对话(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: web_port: Web端口 """ # 初始化时,提示用户需要 arxiv ID/URL + from toolbox import promote_file_to_downloadzone if len(history) == 0 and not txt.lower().strip().startswith(('https://arxiv.org', 'arxiv.org', '0', '1', '2')): chatbot.append((txt, "请先提供Arxiv论文链接或ID。")) yield from update_ui(chatbot=chatbot, history=history) @@ -421,14 +422,16 @@ def Arxiv论文对话(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: user_name = chatbot.get_user() arxiv_worker = ArxivRagWorker(user_name, llm_kwargs, arxiv_id=txt) + arxiv_id = arxiv_worker.arxiv_id # 处理新论文的情况 if txt.lower().strip().startswith(('https://arxiv.org', 'arxiv.org', '0', '1', '2')) and not arxiv_worker.loading: chatbot.append((txt, "正在处理论文,请稍等...")) yield from update_ui(chatbot=chatbot, history=history) - arxiv_id = arxiv_worker.arxiv_id - fragments, formatted_content, output_dir = process_arxiv_sync(arxiv_worker.arxiv_splitter, arxiv_worker.arxiv_id) - chatbot.append(["论文下载成功,接下来将编码论文,预计等待两分钟,请耐心等待,论文内容如下:", formatted_content]) + fragments, formatted_content, fragment_output_dir = process_arxiv_sync(arxiv_worker.arxiv_splitter, arxiv_id) + promote_file_to_downloadzone(fragment_output_dir, chatbot=chatbot) + chatbot.append(["论文下载成功,接下来将编码论文,预计等待两分钟,请耐心等待,等待过程中,可以查看论文:", formatted_content]) + yield from update_ui(chatbot=chatbot, history=history) try: # 创建新的事件循环 loop = asyncio.new_event_loop() @@ -471,8 +474,15 @@ def Arxiv论文对话(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: # 获取用户询问指令 user_query = plugin_kwargs.get("advanced_arg", "What is the main research question or problem addressed in this paper?") + if len(history)<2: + fragments, formatted_content, fragment_output_files = process_arxiv_sync(arxiv_worker.arxiv_splitter, arxiv_id) + for file in fragment_output_files: + promote_file_to_downloadzone(file, chatbot=chatbot) + chatbot.append(["论文的文字内容为:", formatted_content]) + chatbot.append(["处理完成", f"论文文字内容已保存至下载区"]) + yield from update_ui(chatbot=chatbot, history=history) if not user_query: - user_query = "What is the main research question or problem addressed in this paper about graph attention network?" + user_query = "What is the main research question or problem addressed in this paper?" # chatbot.append((txt, "请提供您的问题。")) # yield from update_ui(chatbot=chatbot, history=history) # return diff --git a/crazy_functions/doc_fns/tex_html_formatter.py b/crazy_functions/doc_fns/tex_html_formatter.py new file mode 100644 index 00000000..57958b60 --- /dev/null +++ b/crazy_functions/doc_fns/tex_html_formatter.py @@ -0,0 +1,354 @@ +from pathlib import Path +from typing import List, Dict +from dataclasses import dataclass +from datetime import datetime +import os +import re + + +@dataclass +class SectionFragment: + """Arxiv论文片段数据类""" + title: str + authors: str + abstract: str + catalogs: str + arxiv_id: str = "" + current_section: str = "Introduction" + content: str = '' + bibliography: str = '' + + +class PaperHtmlFormatter: + """HTML格式论文文档生成器""" + + def __init__(self, fragments: List[SectionFragment], output_dir: Path): + self.fragments = fragments + self.output_dir = output_dir + self.css_styles = """ + :root { + --primary-color: #1a73e8; + --secondary-color: #34495e; + --background-color: #f8f9fa; + --text-color: #2c3e50; + --border-color: #e0e0e0; + --code-bg-color: #f6f8fa; + } + + body { + font-family: "Source Serif Pro", "Times New Roman", serif; + line-height: 1.8; + max-width: 1000px; + margin: 0 auto; + padding: 2rem; + color: var(--text-color); + background-color: var(--background-color); + font-size: 16px; + } + + .container { + background: white; + padding: 2rem; + border-radius: 8px; + box-shadow: 0 2px 12px rgba(0,0,0,0.1); + } + + h1 { + color: var(--primary-color); + font-size: 2.2em; + text-align: center; + margin: 1.5rem 0; + padding-bottom: 1rem; + border-bottom: 3px solid var(--primary-color); + } + + h2 { + color: var(--secondary-color); + font-size: 1.8em; + margin-top: 2rem; + padding-left: 1rem; + border-left: 4px solid var(--primary-color); + } + + h3 { + color: var(--text-color); + font-size: 1.5em; + margin-top: 1.5rem; + border-bottom: 2px solid var(--border-color); + padding-bottom: 0.5rem; + } + + .authors { + text-align: center; + color: var(--secondary-color); + font-size: 1.1em; + margin: 1rem 0 2rem; + } + + .abstract-container { + background: var(--background-color); + padding: 1.5rem; + border-radius: 6px; + margin: 2rem 0; + } + + .abstract-title { + font-weight: bold; + color: var(--primary-color); + margin-bottom: 1rem; + } + + .abstract-content { + font-style: italic; + line-height: 1.7; + } + + .toc { + background: white; + padding: 1.5rem; + border-radius: 6px; + margin: 2rem 0; + box-shadow: 0 2px 8px rgba(0,0,0,0.05); + } + + .toc-title { + color: var(--primary-color); + font-size: 1.4em; + margin-bottom: 1rem; + } + + .section-content { + background: white; + padding: 1.5rem; + border-radius: 6px; + margin: 1.5rem 0; + box-shadow: 0 1px 3px rgba(0,0,0,0.05); + } + + .fragment { + margin: 2rem 0; + padding-left: 1rem; + border-left: 3px solid var(--border-color); + } + + .fragment:hover { + border-left-color: var(--primary-color); + } + + .bibliography { + background: var(--code-bg-color); + padding: 1rem; + border-radius: 4px; + font-family: "Source Code Pro", monospace; + font-size: 0.9em; + white-space: pre-wrap; + margin-top: 1rem; + } + + pre { + background: var(--code-bg-color); + padding: 1rem; + border-radius: 4px; + overflow-x: auto; + font-family: "Source Code Pro", monospace; + } + + .paper-info { + background: white; + padding: 2rem; + border-radius: 8px; + margin: 2rem 0; + box-shadow: 0 2px 8px rgba(0,0,0,0.1); + } + + .arxiv-id { + text-align: center; + color: #666; + font-size: 0.9em; + margin: 1rem 0; + } + + .section-title { + display: flex; + align-items: center; + gap: 0.5rem; + color: var(--secondary-color); + } + + .section-icon { + color: var(--primary-color); + } + + @media print { + body { + background: white; + } + .container { + box-shadow: none; + } + } + """ + + def _sanitize_html(self, text: str) -> str: + """清理HTML特殊字符""" + if not text: + return "" + + replacements = { + "&": "&", + "<": "<", + ">": ">", + '"': """, + "'": "'" + } + + for old, new in replacements.items(): + text = text.replace(old, new) + return text + + def _create_section_id(self, section: str) -> str: + """创建section的ID""" + section = section.strip() or "uncategorized" + # 移除特殊字符,转换为小写并用连字符替换空格 + section_id = re.sub(r'[^\w\s-]', '', section.lower()) + return section_id.replace(' ', '-') + + def format_paper_info(self) -> str: + """格式化论文基本信息""" + if not self.fragments: + return "" + + first_fragment = self.fragments[0] + paper_info = ['
'] + + # 添加标题 + if first_fragment.title: + paper_info.append(f'

{self._sanitize_html(first_fragment.title)}

') + + # 添加arXiv ID + if first_fragment.arxiv_id: + paper_info.append(f'
arXiv: {self._sanitize_html(first_fragment.arxiv_id)}
') + + # 添加作者 + if first_fragment.authors: + paper_info.append(f'
{self._sanitize_html(first_fragment.authors)}
') + + # 添加摘要 + if first_fragment.abstract: + paper_info.append('
') + paper_info.append('
Abstract
') + paper_info.append(f'
{self._sanitize_html(first_fragment.abstract)}
') + paper_info.append('
') + + # 添加目录结构 + if first_fragment.catalogs: + paper_info.append('

Document Structure

') + paper_info.append('
')
+            paper_info.append(self._sanitize_html(first_fragment.catalogs))
+            paper_info.append('
') + + paper_info.append('
') + return '\n'.join(paper_info) + + def format_table_of_contents(self, sections: Dict[str, List[SectionFragment]]) -> str: + """生成目录""" + toc = ['
'] + toc.append('
Table of Contents
') + toc.append('') + toc.append('
') + return '\n'.join(toc) + + def format_sections(self) -> str: + """格式化论文各部分内容""" + sections = {} + for fragment in self.fragments: + section = fragment.current_section or "Uncategorized" + if section not in sections: + sections[section] = [] + sections[section].append(fragment) + + formatted_html = ['
'] + formatted_html.append(self.format_table_of_contents(sections)) + + # 生成各部分内容 + for section, fragments in sections.items(): + section_id = self._create_section_id(section) + formatted_html.append(f'

') + formatted_html.append(f'') + formatted_html.append(f'§') + formatted_html.append(f'{self._sanitize_html(section)}') + formatted_html.append('') + formatted_html.append('

') + + formatted_html.append('
') + + for i, fragment in enumerate(fragments, 1): + formatted_html.append('
') + + # 添加内容 + if fragment.content: + formatted_html.append( + f'
{self._sanitize_html(fragment.content)}
' + ) + + # 添加参考文献 + if fragment.bibliography: + formatted_html.append('
') + formatted_html.append(f'{self._sanitize_html(fragment.bibliography)}') + formatted_html.append('
') + + formatted_html.append('
') + + formatted_html.append('
') + + formatted_html.append('
') + return '\n'.join(formatted_html) + + def save_html(self) -> Path: + """保存HTML文档""" + try: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"paper_content_{timestamp}.html" + file_path = self.output_dir / filename + + html_content = f""" + + + + + + {self._sanitize_html(self.fragments[0].title if self.fragments else 'Paper Content')} + + + +
+ {self.format_paper_info()} + {self.format_sections()} +
+ + + """ + + with open(file_path, "w", encoding="utf-8") as f: + f.write(html_content) + + print(f"HTML document saved to: {file_path}") + return file_path + + except Exception as e: + print(f"Error saving HTML document: {str(e)}") + raise + +# 使用示例: +# formatter = PaperHtmlFormatter(fragments, output_dir) +# output_path = formatter.save_html() \ No newline at end of file diff --git a/crazy_functions/rag_fns/arxiv_fns/arxiv_splitter.py b/crazy_functions/rag_fns/arxiv_fns/arxiv_splitter.py index 83efd966..808f0931 100644 --- a/crazy_functions/rag_fns/arxiv_fns/arxiv_splitter.py +++ b/crazy_functions/rag_fns/arxiv_fns/arxiv_splitter.py @@ -734,7 +734,7 @@ class ArxivSplitter: return content.strip() -def process_arxiv_sync(splitter: ArxivSplitter, arxiv_id: str) -> tuple[List[SectionFragment], str, Path]: +def process_arxiv_sync(splitter: ArxivSplitter, arxiv_id: str) -> tuple[List[SectionFragment], str, List[Path]]: """ 同步处理 ArXiv 文档并返回分割后的片段 @@ -746,19 +746,24 @@ def process_arxiv_sync(splitter: ArxivSplitter, arxiv_id: str) -> tuple[List[Sec list: 分割后的文档片段列表 """ try: + from crazy_functions.doc_fns.tex_html_formatter import PaperHtmlFormatter # 创建一个异步函数来执行异步操作 async def _process(): return await splitter.process(arxiv_id) # 使用 asyncio.run() 运行异步函数 + output_files=[] fragments = asyncio.run(_process()) - + file_save_path = splitter.root_dir / "arxiv_fragments" # 保存片段到文件 - output_dir = save_fragments_to_file( - fragments, - output_dir=splitter.root_dir / "arxiv_fragments" - ) - print(f"Output saved to: {output_dir}") + try: + md_output_dir = save_fragments_to_file( + fragments, + output_dir = file_save_path + ) + output_files.append(md_output_dir) + except: + pass # 创建论文格式化器 formatter = PaperContentFormatter() @@ -775,7 +780,14 @@ def process_arxiv_sync(splitter: ArxivSplitter, arxiv_id: str) -> tuple[List[Sec # 格式化内容 formatted_content = formatter.format(fragments, metadata) - return fragments, formatted_content, output_dir + + try: + html_formatter = PaperHtmlFormatter(fragments, file_save_path) + html_output_dir = html_formatter.save_html() + output_files.append(html_output_dir) + except: + pass + return fragments, formatted_content, output_files except Exception as e: print(f"✗ Processing failed for {arxiv_id}: {str(e)}") @@ -821,4 +833,4 @@ def test_arxiv_splitter(): if __name__ == "__main__": - asyncio.run(test_arxiv_splitter()) + test_arxiv_splitter()