From bfa72fb4cf74069de10b25c843d7692367ec6b0e Mon Sep 17 00:00:00 2001
From: lbykkkk
Date: Sat, 9 Nov 2024 14:59:47 +0800
Subject: [PATCH] up

---
 crazy_functions/Arxiv_论文对话.py             |  65 ++++
 .../rag_essay_fns/paper_processing.py         | 312 ++++++++++++++++++
 crazy_functions/rag_essay_fns/rag_handler.py  |  85 +++++
 instruction.txt                               | 192 +++++++++++
 4 files changed, 654 insertions(+)
 create mode 100644 crazy_functions/Arxiv_论文对话.py
 create mode 100644 crazy_functions/rag_essay_fns/paper_processing.py
 create mode 100644 crazy_functions/rag_essay_fns/rag_handler.py
 create mode 100644 instruction.txt

diff --git a/crazy_functions/Arxiv_论文对话.py b/crazy_functions/Arxiv_论文对话.py
new file mode 100644
index 00000000..be448319
--- /dev/null
+++ b/crazy_functions/Arxiv_论文对话.py
@@ -0,0 +1,65 @@
+from toolbox import CatchException, update_ui
+from crazy_functions.rag_essay_fns.paper_processing import ArxivPaperProcessor
+from crazy_functions.rag_essay_fns.rag_handler import RagHandler
+
+@CatchException
+def Rag论文对话(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+    """
+    txt: 用户输入,通常是arxiv论文链接或ID
+    功能: RAG论文总结和对话
+    """
+    # 初始化处理器
+    processor = ArxivPaperProcessor()
+    rag_handler = RagHandler()
+
+    # Step 1: 下载和提取论文
+    # download_and_extract 是生成器: 中途产出UI更新, 成功时最后产出 (project_folder, arxiv_id) 二元组
+    download_result = processor.download_and_extract(txt, chatbot, history)
+    project_folder, arxiv_id = None, None
+
+    for result in download_result:
+        if isinstance(result, tuple) and len(result) == 2:
+            project_folder, arxiv_id = result
+            break
+        yield result  # 转发内部产出的UI更新, 否则界面不会刷新
+
+    if not project_folder or not arxiv_id:
+        return
+
+    # Step 2: 合并TEX文件
+    # merge_tex_files 是生成器, 其 return 值(字典)需用 yield from 捕获
+    paper_content = yield from processor.merge_tex_files(project_folder, chatbot, history)
+    if not paper_content:
+        return
+
+    # Step 3: RAG处理
+    chatbot.append(["正在构建知识图谱...", "处理中..."])
+    yield from update_ui(chatbot=chatbot, history=history)
+
+    # 处理论文内容
+    rag_handler.process_paper_content(paper_content)
+
+    # 生成初始摘要
+    summary = rag_handler.query("请总结这篇论文的主要内容,包括研究目的、方法、结果和结论。")
+    chatbot.append(["论文摘要", summary])
+    yield from update_ui(chatbot=chatbot, history=history)
+
+    # 交互式问答
+    chatbot.append(["知识图谱构建完成", "您可以开始提问了。支持以下类型的问题:\n1. 论文的具体内容\n2. 研究方法的细节\n3. 实验结果分析\n4. 与其他工作的比较"])
+    yield from update_ui(chatbot=chatbot, history=history)
+
+    # 等待用户提问并回答
+    # 注意: wait_user_input 并非 gpt_academic 现有接口, 需由框架层另行提供, 此处仅为占位
+    while True:
+        question = yield from wait_user_input()
+        if not question:
+            break
+
+        # 根据问题类型选择不同的查询模式
+        if "比较" in question or "关系" in question:
+            mode = "global"  # 使用全局模式处理比较类问题
+        elif "具体" in question or "细节" in question:
+            mode = "local"  # 使用局部模式处理细节问题
+        else:
+            mode = "hybrid"  # 默认使用混合模式
+
+        response = rag_handler.query(question, mode=mode)
+        chatbot.append([question, response])
+        yield from update_ui(chatbot=chatbot, history=history)
\ No newline at end of file
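
说明: 上面的主循环依赖 wait_user_input, 它并不是 gpt_academic 已有的接口, 需要框架层另行提供。下面是一个纯属假设的占位草图, 仅用于说明它应当遵循的生成器协议(中途 yield 让出控制权, 最终通过 return 把用户输入交给 yield from 的调用方):

    import time

    def wait_user_input(poll_interval: float = 0.5, timeout: float = 300.0):
        """假设性占位: 轮询一个由前端写入的共享缓冲区, 拿到新提问后返回。"""
        user_input_buffer = {"text": None}  # 假设前端会把新提问写入这里
        waited = 0.0
        while waited < timeout:
            if user_input_buffer["text"]:
                return user_input_buffer["text"]
            yield None  # 让出控制权, 避免阻塞UI
            time.sleep(poll_interval)
            waited += poll_interval
        return None  # 超时视为用户结束提问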
diff --git a/crazy_functions/rag_essay_fns/paper_processing.py b/crazy_functions/rag_essay_fns/paper_processing.py
new file mode 100644
index 00000000..5e60dc67
--- /dev/null
+++ b/crazy_functions/rag_essay_fns/paper_processing.py
@@ -0,0 +1,312 @@
+from typing import Tuple, Optional, Generator, List
+from toolbox import update_ui, update_ui_lastest_msg, get_conf
+import os, tarfile, requests, re
+
+class ArxivPaperProcessor:
+    """Arxiv论文处理器类"""
+
+    def __init__(self):
+        self.supported_encodings = ['utf-8', 'latin1', 'gbk', 'gb2312', 'ascii']
+        self.arxiv_cache_dir = get_conf("ARXIV_CACHE_DIR")
+
+    def download_and_extract(self, txt: str, chatbot, history) -> Generator[Optional[Tuple[str, str]], None, None]:
+        """
+        Step 1: 下载和提取arxiv论文
+        生成器: 中途产出UI更新, 成功时最后产出 (project_folder, arxiv_id)
+        """
+        try:
+            if txt == "":
+                chatbot.append(("", "请输入arxiv论文链接或ID"))
+                yield from update_ui(chatbot=chatbot, history=history)
+                return
+
+            project_folder, arxiv_id = self.arxiv_download(txt, chatbot, history)
+            if project_folder is None or arxiv_id is None:
+                return
+
+            if not os.path.exists(project_folder):
+                chatbot.append((txt, f"找不到项目文件夹: {project_folder}"))
+                yield from update_ui(chatbot=chatbot, history=history)
+                return
+
+            # 期望的返回值
+            yield project_folder, arxiv_id
+
+        except Exception as e:
+            print(e)
+            yield from update_ui_lastest_msg(
+                "下载失败,请手动下载latex源码:请前往arxiv打开此论文下载页面,点击 Other Formats,然后 Download source。",
+                chatbot=chatbot, history=history)
+            return
+
+    def arxiv_download(self, txt: str, chatbot, history) -> Tuple[Optional[str], Optional[str]]:
+        """
+        下载arxiv论文并解压
+        返回: (project_folder, arxiv_id), 失败时返回 (None, None)
+        注意: 本函数不是生成器, UI刷新由上层插件函数负责
+        """
+        def is_float(s: str) -> bool:
+            try:
+                float(s)
+                return True
+            except ValueError:
+                return False
+
+        if txt.startswith('https://arxiv.org/pdf/'):
+            arxiv_id = txt.split('/')[-1].replace('.pdf', '')  # 2402.14207v2
+            txt = arxiv_id.split('v')[0]  # 2402.14207
+
+        if ('.' in txt) and ('/' not in txt) and is_float(txt):  # is arxiv ID, 如 2402.14207
+            txt = 'https://arxiv.org/abs/' + txt.strip()
+        if ('.' in txt) and ('/' not in txt) and is_float(txt[:10]):  # is arxiv ID, 带版本号, 如 2402.14207v1
+            txt = 'https://arxiv.org/abs/' + txt[:10]
+
+        if not txt.startswith('https://arxiv.org'):
+            chatbot.append((txt, "不是有效的arxiv链接或ID"))
+            return None, None
+
+        chatbot.append(["检测到arxiv文档链接", '尝试下载 ...'])
+
+        url_ = txt  # https://arxiv.org/abs/1707.06690
+
+        if not txt.startswith('https://arxiv.org/abs/'):
+            msg = f"解析arxiv网址失败, 期望格式例如: https://arxiv.org/abs/1707.06690。实际得到格式: {url_}。"
+            chatbot.append((txt, msg))
+            return None, None
+
+        arxiv_id = url_.split('/')[-1].split('v')[0]
+
+        project_folder = os.path.join(self.arxiv_cache_dir, arxiv_id)
+        dst = os.path.join(project_folder, f'{arxiv_id}.tar.gz')
+        os.makedirs(project_folder, exist_ok=True)  # 下载前确保目标目录存在
+
+        # TODO: 可在此处加入缓存判断(若 dst 已存在且配置允许缓存, 则跳过下载)
+        success = self.download_arxiv_paper(url_, dst, chatbot, history)
+        if not success:
+            raise tarfile.ReadError(f"论文下载失败 {arxiv_id}")
+
+        extract_dst = self.extract_tar_file(dst, project_folder, chatbot, history)
+        return extract_dst, arxiv_id
+
+    def download_arxiv_paper(self, url_: str, dst: str, chatbot, history) -> bool:
+        """下载arxiv论文源码包, 依次尝试 /src/ 与 /e-print/ 两个入口"""
+        try:
+            proxies = get_conf('proxies')
+            for url_tar in [url_.replace('/abs/', '/src/'), url_.replace('/abs/', '/e-print/')]:
+                r = requests.get(url_tar, proxies=proxies)
+                if r.status_code == 200:
+                    with open(dst, 'wb+') as f:
+                        f.write(r.content)
+                    return True
+            return False
+        except requests.RequestException as e:
+            chatbot.append((f"下载失败 {url_}", str(e)))
+            return False
+
+    def extract_tar_file(self, file_path: str, dest_dir: str, chatbot, history) -> str:
+        """解压arxiv论文源码包
+        注意: 此函数不能包含 yield, 否则会变成生成器, 调用处的解压动作永远不会执行
+        """
+        try:
+            with tarfile.open(file_path, 'r:gz') as tar:
+                tar.extractall(path=dest_dir)
+            return dest_dir
+        except tarfile.ReadError as e:
+            chatbot.append((f"解压失败 {file_path}", str(e)))
+            raise e
+
+    def find_main_tex_file(self, tex_files: list) -> str:
+        """查找主TEX文件: 优先取包含 \\documentclass 的文件, 否则回退为最大的文件"""
+        for tex_file in tex_files:
+            with open(tex_file, 'r', encoding='utf-8', errors='ignore') as f:
+                content = f.read()
+                if r'\documentclass' in content:
+                    return tex_file
+        return max(tex_files, key=lambda x: os.path.getsize(x))
+
+    def read_file_with_encoding(self, file_path: str) -> Optional[str]:
+        """使用多种编码尝试读取文件"""
+        for encoding in self.supported_encodings:
+            try:
+                with open(file_path, 'r', encoding=encoding) as f:
+                    return f.read()
+            except UnicodeDecodeError:
+                continue
+        return None
+
+    def process_tex_content(self, content: str, base_path: str, processed_files=None) -> str:
+        """处理TEX内容, 包括递归处理包含的文件"""
+        if processed_files is None:
+            processed_files = set()
+
+        include_patterns = [
+            r'\\input{([^}]+)}',
+            r'\\include{([^}]+)}',
+            r'\\subfile{([^}]+)}',
+            r'\\input\s+([^\s{]+)',
+        ]
+
+        for pattern in include_patterns:
+            matches = re.finditer(pattern, content)
+            for match in matches:
+                include_file = match.group(1)
+                if not include_file.endswith('.tex'):
+                    include_file += '.tex'
+
+                include_path = os.path.join(base_path, include_file)
+                include_path = os.path.normpath(include_path)
+
+                # 记录已处理文件, 防止循环包含导致无限递归
+                if include_path in processed_files:
+                    continue
+                processed_files.add(include_path)
+
+                if os.path.exists(include_path):
+                    included_content = self.read_file_with_encoding(include_path)
+                    if included_content:
+                        included_content = self.process_tex_content(
+                            included_content,
+                            os.path.dirname(include_path),
+                            processed_files
+                        )
+                        content = content.replace(match.group(0), included_content)
+
+        return content
+
+    def merge_tex_files(self, folder_path: str, chatbot, history):
+        """
+        Step 2: 合并TEX文件
+        生成器: 中途产出UI更新, 结束时 return 字典
+        {'title': 标题, 'abstract': 摘要, 'segments': 分段文本列表}, 失败时 return None;
+        调用方需用 yield from 获取该返回值
+        """
+        try:
+            tex_files = []
+            for root, _, files in os.walk(folder_path):
+                tex_files.extend([os.path.join(root, f) for f in files if f.endswith('.tex')])
+
+            if not tex_files:
+                chatbot.append(("", "未找到任何TEX文件"))
+                yield from update_ui(chatbot=chatbot, history=history)
+                return None
+
+            main_tex_file = self.find_main_tex_file(tex_files)
+            chatbot.append(("", f"找到主TEX文件:{os.path.basename(main_tex_file)}"))
+            yield from update_ui(chatbot=chatbot, history=history)
+
+            tex_content = self.read_file_with_encoding(main_tex_file)
+            if tex_content is None:
+                chatbot.append(("", "无法读取TEX文件,可能是编码问题"))
+                yield from update_ui(chatbot=chatbot, history=history)
+                return None
+
+            full_content = self.process_tex_content(
+                tex_content,
+                os.path.dirname(main_tex_file)
+            )
+
+            cleaned_content = self.clean_tex_content(full_content)
+
+            chatbot.append(("",
+                f"成功处理所有TEX文件:\n"
+                f"- 原始内容大小:{len(full_content)}字符\n"
+                f"- 清理后内容大小:{len(cleaned_content)}字符"
+            ))
+            yield from update_ui(chatbot=chatbot, history=history)
+
+            # 提取标题和摘要
+            title = ""
+            abstract = ""
+            if tex_content:
+                # 提取标题(简单正则, 不支持嵌套花括号)
+                title_match = re.search(r'\\title{([^}]*)}', tex_content)
+                if title_match:
+                    title = title_match.group(1)
+
+                # 提取摘要
+                abstract_match = re.search(r'\\begin{abstract}(.*?)\\end{abstract}',
+                                           tex_content, re.DOTALL)
+                if abstract_match:
+                    abstract = abstract_match.group(1)
+
+            # 按token限制分段(以空格分词数近似token数, 为粗略估计)
+            def split_by_token_limit(text: str, token_limit: int = 1024) -> List[str]:
+                segments = []
+                current_segment = []
+                current_tokens = 0
+
+                for line in text.split('\n'):
+                    line_tokens = len(line.split())
+                    if current_tokens + line_tokens > token_limit:
+                        segments.append('\n'.join(current_segment))
+                        current_segment = [line]
+                        current_tokens = line_tokens
+                    else:
+                        current_segment.append(line)
+                        current_tokens += line_tokens
+
+                if current_segment:
+                    segments.append('\n'.join(current_segment))
+
+                return segments
+
+            text_segments = split_by_token_limit(cleaned_content)
+
+            return {
+                'title': title,
+                'abstract': abstract,
+                'segments': text_segments
+            }
+
+        except Exception as e:
+            chatbot.append(("", f"处理TEX文件时发生错误:{str(e)}"))
+            yield from update_ui(chatbot=chatbot, history=history)
+            return None
+
+    @staticmethod
+    def clean_tex_content(content: str) -> str:
+        """清理TEX内容"""
+        content = re.sub(r'(?m)%.*$', '', content)        # 移除注释(注意: 也会误伤转义的\%, 对粗粒度RAG影响有限)
+        content = re.sub(r'\\cite{[^}]*}', '', content)   # 移除引用
+        content = re.sub(r'\\label{[^}]*}', '', content)  # 移除标签
+        content = re.sub(r'\s+', ' ', content)            # 规范化空白
+        return content.strip()
+
+if __name__ == "__main__":
+    # 测试 arxiv_download 函数
+    processor = ArxivPaperProcessor()
+    chatbot = []
+    history = []
+
+    # 测试不同格式的输入
+    test_inputs = [
+        "https://arxiv.org/abs/2402.14207",      # 标准格式
+        "https://arxiv.org/pdf/2402.14207.pdf",  # PDF链接格式
+        "2402.14207",                            # 纯ID格式
+        "2402.14207v1",                          # 带版本号的ID格式
+        "https://invalid.url",                   # 无效URL测试
+    ]
+
+    for input_url in test_inputs:
+        print(f"\n测试输入: {input_url}")
+        try:
+            project_folder, arxiv_id = processor.arxiv_download(input_url, chatbot, history)
+            if project_folder and arxiv_id:
+                print("下载成功:")
+                print(f"- 项目文件夹: {project_folder}")
+                print(f"- Arxiv ID: {arxiv_id}")
+                print(f"- 文件夹是否存在: {os.path.exists(project_folder)}")
+            else:
+                print("下载失败: 返回值为 None")
+        except Exception as e:
+            print(f"发生错误: {str(e)}")
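
说明: merge_tex_files 采用"生成器中途产出UI更新、结尾 return 结果"的模式, 调用方必须使用 yield from 才能既转发UI更新、又拿到 return 的字典。下面是一个最小消费示例草图(独立于插件框架, processor/chatbot/history 由调用方提供):

    def consume_merged_tex(processor, project_folder, chatbot, history):
        """演示如何从 merge_tex_files 生成器中取回 return 的字典。"""
        # yield from 会把内部的UI更新逐个转发出去, 并在生成器结束时
        # 把 return 值({'title', 'abstract', 'segments'})作为表达式结果返回
        paper_content = yield from processor.merge_tex_files(project_folder, chatbot, history)
        if paper_content:
            print("标题:", paper_content['title'])
            print("分段数:", len(paper_content['segments']))
        return paper_content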
diff --git a/crazy_functions/rag_essay_fns/rag_handler.py b/crazy_functions/rag_essay_fns/rag_handler.py
new file mode 100644
index 00000000..89631dae
--- /dev/null
+++ b/crazy_functions/rag_essay_fns/rag_handler.py
@@ -0,0 +1,85 @@
+from typing import Dict, List
+from lightrag import LightRAG, QueryParam
+from lightrag.utils import EmbeddingFunc
+import numpy as np
+import os
+from toolbox import get_conf
+import openai  # 注意: 以下使用 openai<1.0 的旧版异步接口
+
+class RagHandler:
+    def __init__(self):
+        # 初始化工作目录
+        self.working_dir = os.path.join(get_conf('ARXIV_CACHE_DIR'), 'rag_cache')
+        os.makedirs(self.working_dir, exist_ok=True)
+
+        # 初始化 LightRAG
+        self.rag = LightRAG(
+            working_dir=self.working_dir,
+            llm_model_func=self._llm_model_func,
+            embedding_func=EmbeddingFunc(
+                embedding_dim=1536,  # OpenAI text-embedding-ada-002 的维度
+                max_token_size=8192,
+                func=self._embedding_func,
+            ),
+        )
+
+    async def _llm_model_func(self, prompt: str, system_prompt: str = None,
+                              history_messages: List = None, **kwargs) -> str:
+        """LLM 模型函数 (openai<1.0 异步接口)"""
+        messages = []
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+        if history_messages:
+            messages.extend(history_messages)
+        messages.append({"role": "user", "content": prompt})
+
+        response = await openai.ChatCompletion.acreate(
+            model="gpt-3.5-turbo",
+            messages=messages,
+            temperature=kwargs.get("temperature", 0),
+            max_tokens=kwargs.get("max_tokens", 1000)
+        )
+        return response.choices[0].message.content
+
+    async def _embedding_func(self, texts: List[str]) -> np.ndarray:
+        """Embedding 函数 (openai<1.0 异步接口)"""
+        response = await openai.Embedding.acreate(
+            model="text-embedding-ada-002",
+            input=texts
+        )
+        embeddings = [item["embedding"] for item in response["data"]]
+        return np.array(embeddings)
+
+    def process_paper_content(self, paper_content: Dict) -> None:
+        """处理论文内容, 构建知识图谱"""
+        # 处理标题和摘要
+        content_list = []
+        if paper_content['title']:
+            content_list.append(f"Title: {paper_content['title']}")
+        if paper_content['abstract']:
+            content_list.append(f"Abstract: {paper_content['abstract']}")
+
+        # 添加分段内容
+        content_list.extend(paper_content['segments'])
+
+        # 插入到 RAG 系统
+        self.rag.insert(content_list)
+
+    def query(self, question: str, mode: str = "hybrid") -> str:
+        """查询论文内容
+        mode: 查询模式, 可选 naive/local/global/hybrid
+        """
+        try:
+            response = self.rag.query(
+                question,
+                param=QueryParam(
+                    mode=mode,
+                    top_k=5,  # 返回相关度最高的5个结果
+                    max_token_for_text_unit=2048,  # 每个文本单元的最大token数
+                    response_type="detailed"  # 返回详细回答
+                )
+            )
+            return response
+        except Exception as e:
+            return f"查询出错: {str(e)}"
\ No newline at end of file
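
说明: RagHandler 对插件层暴露同步的 insert/query 接口(异步的LLM与embedding调用由 LightRAG 内部调度)。下面是一个脱离插件框架的最小使用草图, 其中论文内容为演示用假数据, 并假设已配置 OPENAI_API_KEY 且安装了 openai<1.0:

    import os
    from crazy_functions.rag_essay_fns.rag_handler import RagHandler

    os.environ.setdefault("OPENAI_API_KEY", "sk-...")  # 示意, 请替换为真实密钥

    handler = RagHandler()

    # 模拟 merge_tex_files 返回的结构
    fake_paper = {
        "title": "A Toy Paper",
        "abstract": "We study a toy problem.",
        "segments": ["Section 1: method ...", "Section 2: experiments ..."],
    }
    handler.process_paper_content(fake_paper)

    # 不同查询模式对应不同的检索策略
    print(handler.query("论文的研究方法细节是什么?", mode="local"))
    print(handler.query("这项工作与相关工作有何关系?", mode="global"))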
diff --git a/instruction.txt b/instruction.txt
new file mode 100644
index 00000000..64e8221f
--- /dev/null
+++ b/instruction.txt
@@ -0,0 +1,192 @@
+
+1、GPT Academic 项目结构
+.
+├── Dockerfile
+├── LICENSE
+├── README.md
+├── check_proxy.py
+├── config.py
+├── config_private.py
+├── core_functional.py
+├── crazy_functional.py
+├── crazy_functions
+│ ├── Arxiv_论文对话.py
+│ ├── Conversation_To_File.py
+│ ├── Image_Generate.py
+│ ├── Image_Generate_Wrap.py
+│ ├── Internet_GPT.py
+│ ├── Internet_GPT_Wrap.py
+│ ├── Latex_Function.py
+│ ├── Latex_Function_Wrap.py
+│ ├── Latex全文润色.py
+│ ├── Latex全文翻译.py
+│ ├── Markdown_Translate.py
+│ ├── PDF_Translate.py
+│ ├── PDF_Translate_Wrap.py
+│ ├── Rag_Interface.py
+│ ├── Social_Helper.py
+│ ├── SourceCode_Analyse.py
+│ ├── SourceCode_Comment.py
+│ ├── SourceCode_Comment_Wrap.py
+│ ├── __init__.py
+│ ├── agent_fns
+│ │ ├── auto_agent.py
+│ │ ├── echo_agent.py
+│ │ ├── general.py
+│ │ ├── persistent.py
+│ │ ├── pipe.py
+│ │ ├── python_comment_agent.py
+│ │ ├── python_comment_compare.html
+│ │ └── watchdog.py
+│ ├── ast_fns
+│ │ └── comment_remove.py
+│ ├── chatglm微调工具.py
+│ ├── crazy_utils.py
+│ ├── diagram_fns
+│ │ └── file_tree.py
+│ ├── game_fns
+│ │ ├── game_ascii_art.py
+│ │ ├── game_interactive_story.py
+│ │ └── game_utils.py
+│ ├── gen_fns
+│ │ └── gen_fns_shared.py
+│ ├── ipc_fns
+│ │ └── mp.py
+│ ├── json_fns
+│ │ ├── pydantic_io.py
+│ │ └── select_tool.py
+│ ├── latex_fns
+│ │ ├── latex_actions.py
+│ │ ├── latex_pickle_io.py
+│ │ └── latex_toolbox.py
+│ ├── live_audio
+│ │ ├── aliyunASR.py
+│ │ └── audio_io.py
+│ ├── multi_stage
+│ │ └── multi_stage_utils.py
+│ ├── rag_essay_fns
+│ │ └── multi_stage_utils.py
+│ ├── pdf_fns
+│ │ ├── breakdown_txt.py
+│ │ ├── parse_pdf.py
+│ │ ├── parse_pdf_grobid.py
+│ │ ├── parse_pdf_legacy.py
+│ │ ├── parse_pdf_via_doc2x.py
+│ │ ├── parse_word.py
+│ │ ├── report_gen_html.py
+│ │ ├── report_template.html
+│ │ └── report_template_v2.html
+│ ├── plugin_template
+│ │ └── plugin_class_template.py
+│ ├── prompts
+│ │ └── internet.py
+│ ├── rag_fns
+│ │ ├── llama_index_worker.py
+│ │ ├── milvus_worker.py
+│ │ ├── rag_file_support.py
+│ │ └── vector_store_index.py
+│ ├── vector_fns
+│ │ ├── __init__.py
+│ │ ├── general_file_loader.py
+│ │ └── vector_database.py
+│ ├── vt_fns
+│ │ ├── vt_call_plugin.py
+│ │ ├── vt_modify_config.py
+│ │ └── vt_state.py
+│ ├── 下载arxiv论文翻译摘要.py
+│ ├── 互动小游戏.py
+│ ├── 交互功能函数模板.py
+│ ├── 函数动态生成.py
+│ ├── 命令行助手.py
+│ ├── 多智能体.py
+│ ├── 总结word文档.py
+│ ├── 总结音视频.py
+│ ├── 批量总结PDF文档.py
+│ ├── 批量总结PDF文档pdfminer.py
+│ ├── 批量翻译PDF文档_NOUGAT.py
+│ ├── 数学动画生成manim.py
+│ ├── 理解PDF文档内容.py
+│ ├── 生成函数注释.py
+│ ├── 生成多种Mermaid图表.py
+│ ├── 知识库问答.py
+│ ├── 联网的ChatGPT.py
+│ ├── 联网的ChatGPT_bing版.py
+│ ├── 虚空终端.py
+│ ├── 解析JupyterNotebook.py
+│ ├── 询问多个大语言模型.py
+│ ├── 语音助手.py
+│ ├── 读文章写摘要.py
+│ ├── 谷歌检索小助手.py
+│ ├── 辅助功能.py
+│ └── 高级功能函数模板.py
+├── docker-compose.yml
+├── instruction.txt
+├── main.py
+├── multi_language.py
+├── requirements.txt
+├── shared_utils
+│ ├── advanced_markdown_format.py
+│ ├── char_visual_effect.py
+│ ├── colorful.py
+│ ├── config_loader.py
+│ ├── connect_void_terminal.py
+│ ├── cookie_manager.py
+│ ├── fastapi_server.py
+│ ├── handle_upload.py
+│ ├── key_pattern_manager.py
+│ ├── logging.py
+│ ├── map_names.py
+│ └── text_mask.py
+├── toolbox.py
+└── version
+
+2、light_rag的实现方案路径为crazy_functions/rag_fns/LightRAG,主要功能实现文件为operate.py,rag使用到的其他文件为prompt.py、base.py、storage.py、utils.py,请参考该实现方案来实现插件功能。light_rag的使用案例可以参考crazy_functions/rag_fns/LightRAG/examples路径下的lightrag_hf_demo.py、lightrag_lmdeploy_demo.py:
+路径目录结构为
+
+├── README.md
+├── examples
+│   ├── batch_eval.py
+│   ├── generate_query.py
+│   ├── graph_visual_with_html.py
+│   ├── graph_visual_with_neo4j.py
+│   ├── lightrag_azure_openai_demo.py
+│   ├── lightrag_bedrock_demo.py
+│   ├── lightrag_hf_demo.py
+│   ├── lightrag_ollama_demo.py
+│   ├── lightrag_openai_compatible_demo.py
+│   ├── lightrag_openai_demo.py
+│   └── vram_management_demo.py
+├── lightrag
+│   ├── __init__.py
+│   ├── base.py
+│   ├── lightrag.py
+│   ├── llm.py
+│   ├── operate.py
+│   ├── prompt.py
+│   ├── storage.py
+│   └── utils.py
+├── reproduce
+│   ├── Step_0.py
+│   ├── Step_1.py
+│   ├── Step_1_openai_compatible.py
+│   ├── Step_2.py
+│   ├── Step_3.py
+│   └── Step_3_openai_compatible.py
+├── requirements.txt
+└── setup.py
+
+3、我需要开发一个rag插件,请帮我实现这个插件。插件的名称是rag论文总结,插件主入口在crazy_functions/Arxiv_论文对话.py中的Rag论文对话函数,插件的功能分为文件处理和RAG两个步骤。
+文件处理步骤的流程和要求按顺序如下,请参考gpt_academic已实现的功能,复用现有函数即可:
+  a、支持从 arXiv 下载论文源码、检查本地项目路径、扫描 .tex 文件,此步骤可参考crazy_functions/Latex_Function.py。
+  b、在项目中找到主要的 LaTeX 文件,将多个 TEX 文件合并成一个大的 TEX 文件,便于统一处理,此步骤可参考crazy_functions/Latex_Function.py。
+  c、将合并后的文档进行精细切分,包括读取标题和摘要,此步骤可参考crazy_functions/Latex_Function.py。
+  d、将文档按照 token 限制(1024)进行进一步分段,此步骤可参考crazy_functions/Latex_Function.py(分段思路另见下方草图)。
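+
+附: 针对上面d步骤的token分段, 下面给出一个独立的最小草图(以空格分词数近似token数, 与本补丁 paper_processing.py 中 split_by_token_limit 的思路一致; 真实token数应使用分词器计量, 此处仅为演示性近似):
+
+    from typing import List
+
+    def split_by_token_limit(text: str, token_limit: int = 1024) -> List[str]:
+        """按近似token上限切分文本: 逐行累计词数, 超限即另起一段。"""
+        segments, current, count = [], [], 0
+        for line in text.split('\n'):
+            line_tokens = len(line.split())  # 近似: 一个空格分隔的词 ≈ 一个token
+            if count + line_tokens > token_limit and current:
+                segments.append('\n'.join(current))
+                current, count = [line], line_tokens
+            else:
+                current.append(line)
+                count += line_tokens
+        if current:
+            segments.append('\n'.join(current))
+        return segments
+
+    # 用法: 2000行 × 每行3词 ≈ 6000 token, 限制1024时约切成6段
+    demo_text = "\n".join(["word word word"] * 2000)
+    print(len(split_by_token_limit(demo_text)))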
+
+4、对于RAG,我希望采用light_rag的方案。参考已有实现,其主要功能包括:
+  e、参考 `chunking_by_token_size` 对文本分块,并利用 `_handle_entity_relation_summary` 函数对d步骤生成的文本块进行实体或关系的摘要。
+  f、利用 `_handle_single_entity_extraction` 和 `_handle_single_relationship_extraction`:从记录中提取单个实体或关系信息。
+  g、`_merge_nodes_then_upsert` 和 `_merge_edges_then_upsert`:合并并插入节点或边。
+  h、`extract_entities`:处理多个文本块,提取实体和关系,并存储在知识图谱和向量数据库中。
+  i、`local_query`:根据查询提取关键词并生成响应。
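+
+附: 为直观展示e-i各步骤串联后的效果, 下面给出一个模仿examples/lightrag_openai_demo.py风格的最小使用草图(WORKING_DIR与示例文本均为演示假设, 需要配置OPENAI_API_KEY; insert内部会依次完成分块、实体/关系抽取与图谱入库, query的mode对应naive/local/global/hybrid四种检索策略):
+
+    import os
+    from lightrag import LightRAG, QueryParam
+    from lightrag.llm import gpt_4o_mini_complete  # examples中使用的默认LLM函数
+
+    WORKING_DIR = "./rag_cache_demo"  # 演示用目录
+    os.makedirs(WORKING_DIR, exist_ok=True)
+
+    rag = LightRAG(
+        working_dir=WORKING_DIR,
+        llm_model_func=gpt_4o_mini_complete,
+    )
+
+    # 对应e-h: insert内部完成chunking_by_token_size分块、extract_entities实体/关系抽取,
+    # 以及_merge_nodes_then_upsert/_merge_edges_then_upsert合并入库
+    rag.insert("这里放入d步骤切好的论文文本段, 演示时可用任意一段文字代替。")
+
+    # 对应i: 不同mode走不同的检索路径(local_query/global_query等)
+    for mode in ["naive", "local", "global", "hybrid"]:
+        print(mode, rag.query("这篇论文的核心方法是什么?", param=QueryParam(mode=mode)))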