# gpt_academic/crazy_functions/批量总结PDF文档_初步.py

import zipfile
import os
from toolbox import update_ui, promote_file_to_downloadzone, gen_time_str, get_log_folder
from toolbox import CatchException, report_exception
from toolbox import write_history_to_file, promote_file_to_downloadzone
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import read_and_clean_pdf_text
from .crazy_utils import input_clipping
# Short alias for os.path.join, used when building output paths in this module.
pj = os.path.join


def move_file_to_zip(file_path, zip_file):
    """Move a file from disk into an open zip archive.

    The file is stored at the archive root under its base name (directory
    components of ``file_path`` are dropped) and is then deleted from disk.

    Args:
        file_path: Path of the file to archive and remove.
        zip_file: An archive object opened for writing that exposes
            ``write(filename, arcname)``, e.g. ``zipfile.ZipFile``.
    """
    name_in_archive = os.path.basename(file_path)
    zip_file.write(file_path, name_in_archive)
    # Only reached when the write above succeeded, so a failed write never
    # deletes the source file.
    os.remove(file_path)


def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
    """Summarise every PDF in ``file_manifest`` fragment by fragment and publish the result.

    This is a generator: it re-yields everything produced by ``update_ui`` and
    by ``request_gpt_model_in_new_thread_with_ui_alive``, so callers must drive
    it with ``yield from``.

    For each PDF the text is split into token-limited fragments, each fragment
    is summarised by the model (the previous fragment's summary is passed as
    history), and the collected summaries are written to a Markdown file via
    ``write_history_to_file``. With exactly one input file the Markdown file is
    promoted to the download zone directly; otherwise every Markdown file is
    moved into ``result.zip`` inside the log folder and the archive is promoted.

    Args:
        file_manifest: Paths of the PDF files to process.
        project_folder: Unused here; kept for interface compatibility.
        llm_kwargs: Forwarded unchanged to the LLM request helper.
        plugin_kwargs: Unused here; kept for interface compatibility.
        chatbot: Chat log; appended to and forwarded to the UI helpers.
        history: Unused here; kept for interface compatibility.
        system_prompt: Unused here; kept for interface compatibility.
    """
    # Function-scope imports, executed once per call rather than once per PDF.
    from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
    from request_llms.bridge_all import model_info

    TOKEN_LIMIT_PER_FRAGMENT = 2500
    MAX_WORD_TOTAL = 4096 * 0.7

    # The tokenizer and the counting closure do not depend on the file being
    # processed, so build them once up front.
    enc = model_info["gpt-3.5-turbo"]['tokenizer']

    def get_token_num(txt):
        return len(enc.encode(txt, disallowed_special=()))

    zip_file_path = pj(get_log_folder(), 'result.zip')
    # NOTE: the archive is opened (and therefore created) even in the
    # single-file case, where it stays empty and is never promoted.
    with zipfile.ZipFile(zip_file_path, 'w') as zip_file:
        for file_name in file_manifest:
            print('begin analysis on:', file_name)

            ############################## <Step 0: split the PDF> ##################################
            # Split the PDF text into fragments (ideally whole sections such as
            # introduction / experiment, split further when necessary); each
            # fragment must stay below TOKEN_LIMIT_PER_FRAGMENT tokens.
            file_content, page_one = read_and_clean_pdf_text(file_name)
            # Round-trip through UTF-8 with errors ignored to drop characters
            # that cannot be encoded.
            file_content = file_content.encode('utf-8', 'ignore').decode()
            page_one = str(page_one).encode('utf-8', 'ignore').decode()

            paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
                txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
            page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
                txt=page_one, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT // 4)

            # Keep only what precedes the introduction heading (if any).
            # Guard against an empty fragment list so a PDF whose first page
            # yields no text does not raise IndexError.
            paper_meta = page_one_fragments[0] if page_one_fragments else ''
            for marker in ('introduction', 'Introduction', 'INTRODUCTION'):
                paper_meta = paper_meta.split(marker)[0]

            ############################## <Step 1: seed the results with the metadata> ##################################
            final_results = ["## metadata\n\n" + paper_meta + "\n\n## metadata"]

            ############################## <Step 2: walk through the paper, condensing each fragment> ##################################
            chatbot.append(['首先你在中文语境下通读整篇论文。', "[Local Message] 收到。"])  # user-facing notice
            yield from update_ui(chatbot=chatbot, history=[])  # refresh the UI

            iteration_results = []
            last_iteration_result = paper_meta  # the first "previous summary" is the metadata
            n_fragment = len(paper_fragments)
            if n_fragment >= 20: print('文章极长，不能达到预期效果')

            # Per-fragment character budget. Converted to int so the prompt
            # reads e.g. "143" instead of "143.0", and clamped to at least 1 so
            # very long papers never get a budget of 0. max(n_fragment, 1)
            # avoids dividing by zero when there are no fragments.
            word_budget = max(1, int(MAX_WORD_TOTAL // max(n_fragment, 1)))

            for i, fragment in enumerate(paper_fragments):
                # i_say is the real prompt sent to the model; i_say_show_user is
                # the shortened version shown in the chat window.
                i_say = f"Read this section, recapitulate the content of this section in Chinese with less than {word_budget} Chinese characters: {fragment}"
                i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {word_budget} Chinese characters: {fragment[:200]}"
                gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
                    i_say, i_say_show_user,
                    llm_kwargs, chatbot,
                    # Feed the previous fragment's summary back in as context.
                    history=["The main idea of the previous section is?", last_iteration_result],
                    sys_prompt="Extract the main idea of this section with Chinese."
                )
                iteration_results.append(gpt_say)
                last_iteration_result = gpt_say

            ############################## <Step 3: collect the summaries> ##################################
            final_results.extend(iteration_results)
            # Independent copy: the file gets the full, unclipped results.
            file_write_buffer = list(final_results)

            ############################## <Step 4: apply a token cap> ##################################
            _, final_results = input_clipping("", final_results, max_token_limit=3200)
            # Note: the UI history is replaced by the clipped results here.
            yield from update_ui(chatbot=chatbot, history=final_results)

            res = write_history_to_file(
                file_write_buffer,
                file_basename=os.path.splitext(os.path.basename(file_name))[0] + '.md',
                auto_caption=False
            )
            if len(file_manifest) == 1:
                # A single input needs no archive: publish the Markdown file itself.
                promote_file_to_downloadzone(res, chatbot=chatbot)
                return
            move_file_to_zip(res, zip_file)

    # Promote only after the ``with`` block has closed (finalised) the archive.
    promote_file_to_downloadzone(zip_file_path, chatbot=chatbot)


@CatchException
def 批量总结PDF文档_初步(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    """Plugin entry point: batch-summarise the PDF files found under ``txt``.

    This is a generator; it re-yields the UI updates produced by ``update_ui``
    and by ``解析PDF``.

    Steps: announce the plugin in the chat, verify that the ``fitz`` module
    can be imported, validate the input path, collect the PDF files, then
    delegate to ``解析PDF``. On any validation failure the problem is reported
    through ``report_exception`` and the generator returns early.

    Args:
        txt: Path entered by the user. Either a folder that is searched
            recursively for ``*.pdf`` files, or the path of a single PDF file.
        llm_kwargs: Forwarded to ``解析PDF``.
        plugin_kwargs: Forwarded to ``解析PDF``.
        chatbot: Chat log; appended to and forwarded to the UI helpers.
        history: Incoming conversation history; it is reset to an empty list
            before processing starts.
        system_prompt: Forwarded to ``解析PDF``.
        web_port: Unused here; part of the plugin signature.
    """
    import glob

    # Basic info: what the plugin does and who contributed it.
    chatbot.append([
        "函数插件功能？",
        "批量总结PDF文档。函数插件贡献者: ValeriaWong，Eralien，Joshua Reed"])
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI

    # Try to import the dependency; if it is unavailable, suggest how to install it.
    try:
        import fitz  # noqa: F401 -- imported only to check availability
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt and
        # SystemExit still propagate, while any import-time failure of the
        # package is still reported to the user.
        report_exception(chatbot, history,
            a = f"解析项目: {txt}",
            b = f"导入软件依赖失败。使用该模块需要额外依赖，安装方法```pip install --upgrade pymupdf```。")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # Clear the history so the model input does not overflow.
    history = []

    # Validate the input; bail out when the path does not exist.
    if not os.path.exists(txt):
        if txt == "": txt = '空空如也的输入栏'
        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return
    project_folder = txt

    # Build the list of files to process.
    if os.path.isfile(project_folder):
        # A path that points directly at one PDF is accepted as a one-item manifest.
        file_manifest = [project_folder] if project_folder.lower().endswith('.pdf') else []
    else:
        # Escape the folder name so glob metacharacters in it (e.g. "[2023]")
        # are matched literally; only the trailing pattern is a wildcard.
        pattern = os.path.join(glob.escape(project_folder), '**', '*.pdf')
        file_manifest = glob.glob(pattern, recursive=True)

    # Nothing to do when no PDF was found.
    if not file_manifest:
        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.pdf文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # Run the actual task.
    yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)