improve long text breakdown performance

这个提交包含在:
binary-husky
2023-12-19 19:30:44 +08:00
父节点 6e9936531d
当前提交 a0bfa7ba1c
共有 13 个文件被更改,包括 186 次插入和 143 次删除

查看文件

@@ -31,15 +31,11 @@ def 解析docx(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot
print(file_content)
# private_upload里面的文件名在解压zip后容易出现乱码rar和7z格式正常,故可以只分析文章内容,不输入文件名
from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
from request_llms.bridge_all import model_info
max_token = model_info[llm_kwargs['llm_model']]['max_token']
TOKEN_LIMIT_PER_FRAGMENT = max_token * 3 // 4
paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
txt=file_content,
get_token_fn=model_info[llm_kwargs['llm_model']]['token_cnt'],
limit=TOKEN_LIMIT_PER_FRAGMENT
)
paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
this_paper_history = []
for i, paper_frag in enumerate(paper_fragments):
i_say = f'请对下面的文章片段用中文做概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{paper_frag}```'