diff --git a/.github/workflows/build-with-all-capacity-beta.yml b/.github/workflows/build-with-all-capacity-beta.yml deleted file mode 100644 index 5a2a1a54..00000000 --- a/.github/workflows/build-with-all-capacity-beta.yml +++ /dev/null @@ -1,44 +0,0 @@ -# https://docs.github.com/en/actions/publishing-packages/publishing-docker-images#publishing-images-to-github-packages -name: build-with-all-capacity-beta - -on: - push: - branches: - - 'master' - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }}_with_all_capacity_beta - -jobs: - build-and-push-image: - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Log in to the Container registry - uses: docker/login-action@v2 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@v4 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - - - name: Build and push Docker image - uses: docker/build-push-action@v4 - with: - context: . - push: true - file: docs/GithubAction+AllCapacityBeta - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/build-with-jittorllms.yml b/.github/workflows/build-with-jittorllms.yml deleted file mode 100644 index d56ef963..00000000 --- a/.github/workflows/build-with-jittorllms.yml +++ /dev/null @@ -1,44 +0,0 @@ -# https://docs.github.com/en/actions/publishing-packages/publishing-docker-images#publishing-images-to-github-packages -name: build-with-jittorllms - -on: - push: - branches: - - 'master' - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }}_jittorllms - -jobs: - build-and-push-image: - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Log in to the Container registry - uses: docker/login-action@v2 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@v4 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - - - name: Build and push Docker image - uses: docker/build-push-action@v4 - with: - context: . - push: true - file: docs/GithubAction+JittorLLMs - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} diff --git a/README.md b/README.md index 97da208f..2b8ffaa5 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ > [!IMPORTANT] -> 2024.6.1: 版本3.80加入插件二级菜单功能(详见wiki) +> 2024.10.10: 突发停电,紧急恢复了提供[whl包](https://drive.google.com/file/d/19U_hsLoMrjOlQSzYS3pzWX9fTzyusArP/view?usp=sharing)的文件服务器 +> 2024.10.8: 版本3.90加入对llama-index的初步支持,版本3.80加入插件二级菜单功能(详见wiki) > 2024.5.1: 加入Doc2x翻译PDF论文的功能,[查看详情](https://github.com/binary-husky/gpt_academic/wiki/Doc2x) > 2024.3.11: 全力支持Qwen、GLM、DeepseekCoder等中文大语言模型! SoVits语音克隆模块,[查看详情](https://www.bilibili.com/video/BV1Rp421S7tF/) > 2024.1.17: 安装依赖时,请选择`requirements.txt`中**指定的版本**。 安装命令:`pip install -r requirements.txt`。本项目完全开源免费,您可通过订阅[在线服务](https://github.com/binary-husky/gpt_academic/wiki/online)的方式鼓励本项目的发展。 diff --git a/crazy_functional.py b/crazy_functional.py index 1ddd8c20..de07c1bb 100644 --- a/crazy_functional.py +++ b/crazy_functional.py @@ -6,7 +6,6 @@ from loguru import logger def get_crazy_functions(): from crazy_functions.读文章写摘要 import 读文章写摘要 from crazy_functions.生成函数注释 import 批量生成函数注释 - from crazy_functions.Rag_Interface import Rag问答 from crazy_functions.SourceCode_Analyse import 解析项目本身 from crazy_functions.SourceCode_Analyse import 解析一个Python项目 from crazy_functions.SourceCode_Analyse import 解析一个Matlab项目 @@ -52,13 +51,6 @@ def get_crazy_functions(): from crazy_functions.SourceCode_Comment import 注释Python项目 function_plugins = { - "Rag智能召回": { - "Group": "对话", - "Color": "stop", - "AsButton": False, - "Info": "将问答数据记录到向量库中,作为长期参考。", - "Function": HotReload(Rag问答), - }, "虚空终端": { "Group": "对话|编程|学术|智能体", "Color": "stop", @@ -707,6 +699,31 @@ def get_crazy_functions(): logger.error(trimmed_format_exc()) logger.error("Load function plugin failed") + try: + from crazy_functions.Rag_Interface import Rag问答 + + function_plugins.update( + { + "Rag智能召回": { + "Group": "对话", + "Color": "stop", + "AsButton": False, + "Info": "将问答数据记录到向量库中,作为长期参考。", + "Function": HotReload(Rag问答), + }, + } + ) + except: + logger.error(trimmed_format_exc()) + logger.error("Load function plugin failed") + + + + + + + + # try: # from crazy_functions.高级功能函数模板 import 测试图表渲染 # function_plugins.update({ diff --git a/crazy_functions/Rag_Interface.py b/crazy_functions/Rag_Interface.py index d83d8ca5..1bc740ad 100644 --- a/crazy_functions/Rag_Interface.py +++ b/crazy_functions/Rag_Interface.py @@ -2,20 +2,7 @@ from toolbox import CatchException, update_ui, get_conf, get_log_folder, update_ from crazy_functions.crazy_utils import input_clipping from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive -VECTOR_STORE_TYPE = "Milvus" - -if VECTOR_STORE_TYPE == "Milvus": - try: - from crazy_functions.rag_fns.milvus_worker import MilvusRagWorker as LlamaIndexRagWorker - except: - VECTOR_STORE_TYPE = "Simple" - -if VECTOR_STORE_TYPE == "Simple": - from crazy_functions.rag_fns.llama_index_worker import LlamaIndexRagWorker - - RAG_WORKER_REGISTER = {} - MAX_HISTORY_ROUND = 5 MAX_CONTEXT_TOKEN_LIMIT = 4096 REMEMBER_PREVIEW = 1000 @@ -23,6 +10,16 @@ REMEMBER_PREVIEW = 1000 @CatchException def Rag问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): + # import vector store lib + VECTOR_STORE_TYPE = "Milvus" + if VECTOR_STORE_TYPE == "Milvus": + try: + from crazy_functions.rag_fns.milvus_worker import MilvusRagWorker as LlamaIndexRagWorker + except: + VECTOR_STORE_TYPE = "Simple" + if VECTOR_STORE_TYPE == "Simple": + from crazy_functions.rag_fns.llama_index_worker import LlamaIndexRagWorker + # 1. we retrieve rag worker from global context user_name = chatbot.get_user() checkpoint_dir = get_log_folder(user_name, plugin_name='experimental_rag') diff --git a/crazy_functions/latex_fns/latex_toolbox.py b/crazy_functions/latex_fns/latex_toolbox.py index 81e191ab..a49ffc4e 100644 --- a/crazy_functions/latex_fns/latex_toolbox.py +++ b/crazy_functions/latex_fns/latex_toolbox.py @@ -644,6 +644,191 @@ def run_in_subprocess(func): def _merge_pdfs(pdf1_path, pdf2_path, output_path): + try: + logger.info("Merging PDFs using _merge_pdfs_ng") + _merge_pdfs_ng(pdf1_path, pdf2_path, output_path) + except: + logger.info("Merging PDFs using _merge_pdfs_legacy") + _merge_pdfs_legacy(pdf1_path, pdf2_path, output_path) + + +def _merge_pdfs_ng(pdf1_path, pdf2_path, output_path): + import PyPDF2 # PyPDF2这个库有严重的内存泄露问题,把它放到子进程中运行,从而方便内存的释放 + from PyPDF2.generic import NameObject, TextStringObject, ArrayObject, FloatObject, NumberObject + + Percent = 1 + # raise RuntimeError('PyPDF2 has a serious memory leak problem, please use other tools to merge PDF files.') + # Open the first PDF file + with open(pdf1_path, "rb") as pdf1_file: + pdf1_reader = PyPDF2.PdfFileReader(pdf1_file) + # Open the second PDF file + with open(pdf2_path, "rb") as pdf2_file: + pdf2_reader = PyPDF2.PdfFileReader(pdf2_file) + # Create a new PDF file to store the merged pages + output_writer = PyPDF2.PdfFileWriter() + # Determine the number of pages in each PDF file + num_pages = max(pdf1_reader.numPages, pdf2_reader.numPages) + # Merge the pages from the two PDF files + for page_num in range(num_pages): + # Add the page from the first PDF file + if page_num < pdf1_reader.numPages: + page1 = pdf1_reader.getPage(page_num) + else: + page1 = PyPDF2.PageObject.createBlankPage(pdf1_reader) + # Add the page from the second PDF file + if page_num < pdf2_reader.numPages: + page2 = pdf2_reader.getPage(page_num) + else: + page2 = PyPDF2.PageObject.createBlankPage(pdf1_reader) + # Create a new empty page with double width + new_page = PyPDF2.PageObject.createBlankPage( + width=int( + int(page1.mediaBox.getWidth()) + + int(page2.mediaBox.getWidth()) * Percent + ), + height=max(page1.mediaBox.getHeight(), page2.mediaBox.getHeight()), + ) + new_page.mergeTranslatedPage(page1, 0, 0) + new_page.mergeTranslatedPage( + page2, + int( + int(page1.mediaBox.getWidth()) + - int(page2.mediaBox.getWidth()) * (1 - Percent) + ), + 0, + ) + if "/Annots" in page1: + page1_annot_id = [annot.idnum for annot in page1["/Annots"]] + else: + page1_annot_id = [] + + if "/Annots" in page2: + page2_annot_id = [annot.idnum for annot in page2["/Annots"]] + else: + page2_annot_id = [] + if "/Annots" in new_page: + annotations = new_page["/Annots"] + for i, annot in enumerate(annotations): + annot_obj = annot.get_object() + + # 检查注释类型是否是链接(/Link) + if annot_obj.get("/Subtype") == "/Link": + # 检查是否为内部链接跳转(/GoTo)或外部URI链接(/URI) + action = annot_obj.get("/A") + if action: + + if "/S" in action and action["/S"] == "/GoTo": + # 内部链接:跳转到文档中的某个页面 + dest = action.get("/D") # 目标页或目标位置 + if dest and annot.idnum in page2_annot_id: + # 获取原始文件中跳转信息,包括跳转页面 + destination = pdf2_reader.named_destinations[ + dest + ] + page_number = ( + pdf2_reader.get_destination_page_number( + destination + ) + ) + # 更新跳转信息,跳转到对应的页面和,指定坐标 (100, 150),缩放比例为 100% + # “/D”:[10,'/XYZ',100,100,0] + annot_obj["/A"].update( + { + NameObject("/D"): ArrayObject( + [ + NumberObject(page_number), + destination.dest_array[1], + FloatObject( + destination.dest_array[2] + + int( + page1.mediaBox.getWidth() + ) + ), + destination.dest_array[3], + destination.dest_array[4], + ] + ) # 确保键和值是 PdfObject + } + ) + rect = annot_obj.get("/Rect") + # 更新点击坐标 + rect = ArrayObject( + [ + FloatObject( + rect[0] + + int(page1.mediaBox.getWidth()) + ), + rect[1], + FloatObject( + rect[2] + + int(page1.mediaBox.getWidth()) + ), + rect[3], + ] + ) + annot_obj.update( + { + NameObject( + "/Rect" + ): rect # 确保键和值是 PdfObject + } + ) + if dest and annot.idnum in page1_annot_id: + # 获取原始文件中跳转信息,包括跳转页面 + destination = pdf1_reader.named_destinations[ + dest + ] + page_number = ( + pdf1_reader.get_destination_page_number( + destination + ) + ) + # 更新跳转信息,跳转到对应的页面和,指定坐标 (100, 150),缩放比例为 100% + # “/D”:[10,'/XYZ',100,100,0] + annot_obj["/A"].update( + { + NameObject("/D"): ArrayObject( + [ + NumberObject(page_number), + destination.dest_array[1], + FloatObject( + destination.dest_array[2] + ), + destination.dest_array[3], + destination.dest_array[4], + ] + ) # 确保键和值是 PdfObject + } + ) + rect = annot_obj.get("/Rect") + rect = ArrayObject( + [ + FloatObject(rect[0]), + rect[1], + FloatObject(rect[2]), + rect[3], + ] + ) + annot_obj.update( + { + NameObject( + "/Rect" + ): rect # 确保键和值是 PdfObject + } + ) + + elif "/S" in action and action["/S"] == "/URI": + # 外部链接:跳转到某个URI + uri = action.get("/URI") + + output_writer.addPage(new_page) + # Save the merged PDF file + with open(output_path, "wb") as output_file: + output_writer.write(output_file) + + + +def _merge_pdfs_legacy(pdf1_path, pdf2_path, output_path): import PyPDF2 # PyPDF2这个库有严重的内存泄露问题,把它放到子进程中运行,从而方便内存的释放 Percent = 0.95 diff --git a/docs/Dockerfile+JittorLLM b/docs/Dockerfile+JittorLLM deleted file mode 100644 index b10be807..00000000 --- a/docs/Dockerfile+JittorLLM +++ /dev/null @@ -1 +0,0 @@ -# 此Dockerfile不再维护,请前往docs/GithubAction+JittorLLMs diff --git a/docs/GithubAction+AllCapacityBeta b/docs/GithubAction+AllCapacityBeta deleted file mode 100644 index e942926c..00000000 --- a/docs/GithubAction+AllCapacityBeta +++ /dev/null @@ -1,57 +0,0 @@ -# docker build -t gpt-academic-all-capacity -f docs/GithubAction+AllCapacity --network=host --build-arg http_proxy=http://localhost:10881 --build-arg https_proxy=http://localhost:10881 . -# docker build -t gpt-academic-all-capacity -f docs/GithubAction+AllCapacityBeta --network=host . -# docker run -it --net=host gpt-academic-all-capacity bash - -# 从NVIDIA源,从而支持显卡(检查宿主的nvidia-smi中的cuda版本必须>=11.3) -FROM fuqingxu/11.3.1-runtime-ubuntu20.04-with-texlive:latest - -# edge-tts需要的依赖,某些pip包所需的依赖 -RUN apt update && apt install ffmpeg build-essential -y - -# use python3 as the system default python -WORKDIR /gpt -RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.8 - -# # 非必要步骤,更换pip源 (以下三行,可以删除) -# RUN echo '[global]' > /etc/pip.conf && \ -# echo 'index-url = https://mirrors.aliyun.com/pypi/simple/' >> /etc/pip.conf && \ -# echo 'trusted-host = mirrors.aliyun.com' >> /etc/pip.conf - -# 下载pytorch -RUN python3 -m pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu113 -# 准备pip依赖 -RUN python3 -m pip install openai numpy arxiv rich -RUN python3 -m pip install colorama Markdown pygments pymupdf -RUN python3 -m pip install python-docx moviepy pdfminer -RUN python3 -m pip install zh_langchain==0.2.1 pypinyin -RUN python3 -m pip install rarfile py7zr -RUN python3 -m pip install aliyun-python-sdk-core==2.13.3 pyOpenSSL webrtcvad scipy git+https://github.com/aliyun/alibabacloud-nls-python-sdk.git -# 下载分支 -WORKDIR /gpt -RUN git clone --depth=1 https://github.com/binary-husky/gpt_academic.git -WORKDIR /gpt/gpt_academic -RUN git clone --depth=1 https://github.com/OpenLMLab/MOSS.git request_llms/moss - -RUN python3 -m pip install -r requirements.txt -RUN python3 -m pip install -r request_llms/requirements_moss.txt -RUN python3 -m pip install -r request_llms/requirements_qwen.txt -RUN python3 -m pip install -r request_llms/requirements_chatglm.txt -RUN python3 -m pip install -r request_llms/requirements_newbing.txt -RUN python3 -m pip install nougat-ocr - - -# 预热Tiktoken模块 -RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()' - -# 安装知识库插件的额外依赖 -RUN apt-get update && apt-get install libgl1 -y -RUN pip3 install transformers protobuf langchain sentence-transformers faiss-cpu nltk beautifulsoup4 bitsandbytes tabulate icetk --upgrade -RUN pip3 install unstructured[all-docs] --upgrade -RUN python3 -c 'from check_proxy import warm_up_vectordb; warm_up_vectordb()' -RUN rm -rf /usr/local/lib/python3.8/dist-packages/tests - - -# COPY .cache /root/.cache -# COPY config_private.py config_private.py -# 启动 -CMD ["python3", "-u", "main.py"] diff --git a/requirements.txt b/requirements.txt index e916bc01..7a3d9f86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,14 +2,15 @@ https://public.agent-matrix.com/publish/gradio-3.32.10-py3-none-any.whl fastapi==0.110 gradio-client==0.8 pypdf2==2.12.1 +httpx<=0.25.2 zhipuai==2.0.1 tiktoken>=0.3.3 requests[socks] -pydantic==2.5.2 -llama-index~=0.10 +pydantic==2.9.2 protobuf==3.20 transformers>=4.27.1,<4.42 scipdf_parser>=0.52 +spacy==3.7.4 anthropic>=0.18.1 python-markdown-math pymdown-extensions @@ -32,3 +33,14 @@ loguru arxiv numpy rich + + +llama-index-core==0.10.68 +llama-index-legacy==0.9.48 +llama-index-readers-file==0.1.33 +llama-index-readers-llama-parse==0.1.6 +llama-index-embeddings-azure-openai==0.1.10 +llama-index-embeddings-openai==0.1.10 +llama-parse==0.4.9 +mdit-py-plugins>=0.3.3 +linkify-it-py==2.0.3 \ No newline at end of file diff --git a/tests/test_anim_gen.py b/tests/test_anim_gen.py new file mode 100644 index 00000000..0084f841 --- /dev/null +++ b/tests/test_anim_gen.py @@ -0,0 +1,12 @@ +""" +对项目中的各个插件进行测试。运行方法:直接运行 python tests/test_plugins.py +""" + +import init_test +import os, sys + + +if __name__ == "__main__": + from test_utils import plugin_test + + plugin_test(plugin='crazy_functions.数学动画生成manim->动画生成', main_input="A point moving along function culve y=sin(x), starting from x=0 and stop at x=4*\pi.") diff --git a/version b/version index d417ea2a..e0e936dd 100644 --- a/version +++ b/version @@ -1,5 +1,5 @@ { - "version": 3.83, + "version": 3.90, "show_feature": true, - "new_feature": "增加欢迎页面 <-> 优化图像生成插件 <-> 添加紫东太初大模型支持 <-> 保留主题选择 <-> 支持更复杂的插件框架 <-> 上传文件时显示进度条" + "new_feature": "增加RAG组件 <-> 升级多合一主提交键" }