diff --git a/crazy_functions/game_fns/game_ascii_art.py b/crazy_functions/game_fns/game_ascii_art.py
new file mode 100644
index 00000000..e0b70087
--- /dev/null
+++ b/crazy_functions/game_fns/game_ascii_art.py
@@ -0,0 +1,42 @@
+from toolbox import CatchException, update_ui, update_ui_lastest_msg
+from crazy_functions.multi_stage.multi_stage_utils import GptAcademicGameBaseState
+from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
+from request_llms.bridge_all import predict_no_ui_long_connection
+from crazy_functions.game_fns.game_utils import get_code_block, is_same_thing
+import random
+
+
+class MiniGame_ASCII_Art(GptAcademicGameBaseState):
+    def step(self, prompt, chatbot, history):
+        if self.step_cnt == 0:
+            chatbot.append(["我画你猜(动物)", "请稍等..."])
+        else:
+            if prompt.strip() == 'exit':
+                self.delete_game = True
+                yield from update_ui_lastest_msg(lastmsg=f"谜底是{self.obj},游戏结束。", chatbot=chatbot, history=history, delay=0.)
+                return
+            chatbot.append([prompt, ""])
+        yield from update_ui(chatbot=chatbot, history=history)
+
+        if self.step_cnt == 0:
+            self.lock_plugin(chatbot)
+            self.cur_task = 'draw'
+
+        if self.cur_task == 'draw':
+            avail_obj = ["狗","猫","鸟","鱼","老鼠","蛇"]
+            self.obj = random.choice(avail_obj)
+            inputs = "I want to play a game called Guess the ASCII art. You can draw the ASCII art and I will try to guess it. " + \
+                f"This time you draw a {self.obj}. Note that you must not indicate what you have drawn in the text, and you should only produce the ASCII art wrapped by ```. "
+            raw_res = predict_no_ui_long_connection(inputs=inputs, llm_kwargs=self.llm_kwargs, history=[], sys_prompt="")
+            self.cur_task = 'identify user guess'
+            res = get_code_block(raw_res)
+            history += ['', f'the answer is {self.obj}', inputs, res]
+            yield from update_ui_lastest_msg(lastmsg=res, chatbot=chatbot, history=history, delay=0.)
+
+        elif self.cur_task == 'identify user guess':
+            if is_same_thing(self.obj, prompt, self.llm_kwargs):
+                self.delete_game = True
+                yield from update_ui_lastest_msg(lastmsg="你猜对了!", chatbot=chatbot, history=history, delay=0.)
+            else:
+                self.cur_task = 'identify user guess'
+                yield from update_ui_lastest_msg(lastmsg="猜错了,再试试,输入“exit”获取答案。", chatbot=chatbot, history=history, delay=0.)
\ No newline at end of file
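Note: the game above leans on two helpers added later in this patch (`crazy_functions/game_fns/game_utils.py`). A minimal sketch of how the `draw` stage consumes them — the sample reply string is illustrative only:

```python
from crazy_functions.game_fns.game_utils import get_code_block, is_same_thing

# The model is asked to wrap its drawing in triple backticks; get_code_block
# extracts exactly one such block (and raises RuntimeError otherwise).
raw_res = "Here you go:\n```\n /\\_/\\\n( o.o )\n```"
print(get_code_block(raw_res))

# Later, the user's guess is checked semantically and cross-language, e.g.
# is_same_thing("猫", "cat", llm_kwargs) -> True; this performs LLM calls,
# so llm_kwargs must hold a valid model configuration.
```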
diff --git a/crazy_functions/game_fns/game_interactive_story.py b/crazy_functions/game_fns/game_interactive_story.py
new file mode 100644
index 00000000..5c25f4a3
--- /dev/null
+++ b/crazy_functions/game_fns/game_interactive_story.py
@@ -0,0 +1,212 @@
+prompts_hs = """ 请以“{headstart}”为开头,编写一个小说的第一幕。
+
+- 尽量短,不要包含太多情节,因为你接下来将会与用户互动续写下面的情节,要留出足够的互动空间。
+- 出现人物时,给出人物的名字。
+- 积极地运用环境描写、人物描写等手法,让读者能够感受到你的故事世界。
+- 积极地运用修辞手法,比如比喻、拟人、排比、对偶、夸张等等。
+- 字数要求:第一幕的字数少于300字,且少于2个段落。
+"""
+
+prompts_interact = """ 小说的前文回顾:
+「
+{previously_on_story}
+」
+
+你是一个作家,根据以上的情节,给出4种不同的后续剧情发展方向,每个发展方向都简明扼要地用一句话说明。稍后,我将在这4个选择中,挑选一种剧情发展。
+
+输出格式例如:
+1. 后续剧情发展1
+2. 后续剧情发展2
+3. 后续剧情发展3
+4. 后续剧情发展4
+"""
+
+
+prompts_resume = """小说的前文回顾:
+「
+{previously_on_story}
+」
+
+你是一个作家,我们正在互相讨论,确定后续剧情的发展。
+在以下的剧情发展中,
+「
+{choice}
+」
+我认为更合理的是:{user_choice}。
+请在前文的基础上(不要重复前文),围绕我选定的剧情情节,编写小说的下一幕。
+
+- 禁止杜撰不符合我选择的剧情。
+- 尽量短,不要包含太多情节,因为你接下来将会与用户互动续写下面的情节,要留出足够的互动空间。
+- 不要重复前文。
+- 出现人物时,给出人物的名字。
+- 积极地运用环境描写、人物描写等手法,让读者能够感受到你的故事世界。
+- 积极地运用修辞手法,比如比喻、拟人、排比、对偶、夸张等等。
+- 小说的下一幕字数少于300字,且少于2个段落。
+"""
+
+
+prompts_terminate = """小说的前文回顾:
+「
+{previously_on_story}
+」
+
+你是一个作家,我们正在互相讨论,确定后续剧情的发展。
+现在,故事该结束了,我认为最合理的故事结局是:{user_choice}。
+
+请在前文的基础上(不要重复前文),编写小说的最后一幕。
+
+- 不要重复前文。
+- 出现人物时,给出人物的名字。
+- 积极地运用环境描写、人物描写等手法,让读者能够感受到你的故事世界。
+- 积极地运用修辞手法,比如比喻、拟人、排比、对偶、夸张等等。
+- 字数要求:最后一幕的字数少于1000字。
+"""
+
+
+from toolbox import CatchException, update_ui, update_ui_lastest_msg
+from crazy_functions.multi_stage.multi_stage_utils import GptAcademicGameBaseState
+from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
+from request_llms.bridge_all import predict_no_ui_long_connection
+from crazy_functions.game_fns.game_utils import get_code_block, is_same_thing
+import random
+
+
+class MiniGame_ResumeStory(GptAcademicGameBaseState):
+    story_headstart = [
+        '先行者知道,他现在是全宇宙中唯一的一个人了。',
+        '深夜,一个年轻人穿过天安门广场向纪念堂走去。在二十二世纪编年史中,计算机把他的代号定为M102。',
+        '他知道,这最后一课要提前讲了。又一阵剧痛从肝部袭来,几乎使他晕厥过去。',
+        '在距地球五万光年的远方,在银河系的中心,一场延续了两万年的星际战争已接近尾声。那里的太空中渐渐隐现出一个方形区域,仿佛灿烂的群星的背景被剪出一个方口。',
+        '伊依一行三人乘坐一艘游艇在南太平洋上做吟诗航行,他们的目的地是南极,如果几天后能顺利到达那里,他们将钻出地壳去看诗云。',
+        '很多人生来就会莫名其妙地迷上一样东西,仿佛他的出生就是要和这东西约会似的,正是这样,圆圆迷上了肥皂泡。'
+    ]
+
+
+    def begin_game_step_0(self, prompt, chatbot, history):
+        # init game at step 0
+        self.headstart = random.choice(self.story_headstart)
+        self.story = []
+        chatbot.append(["互动写故事", f"这次的故事开头是:{self.headstart}"])
+        self.sys_prompt_ = '你是一个想象力丰富的杰出作家。正在与你的朋友互动,一起写故事,因此你每次写的故事段落应少于300字(结局除外)。'
+
+
+    def generate_story_image(self, story_paragraph):
+        try:
+            from crazy_functions.图片生成 import gen_image
+            prompt_ = predict_no_ui_long_connection(inputs=story_paragraph, llm_kwargs=self.llm_kwargs, history=[], sys_prompt='你需要根据用户给出的小说段落,进行简短的环境描写。要求:80字以内。')
+            image_url, image_path = gen_image(self.llm_kwargs, prompt_, '512x512', model="dall-e-2", quality='standard', style='natural')
+            return f'<br/><div align="center"><img src="file={image_path}"></div>'
+        except:
+            return ''
+
+    def step(self, prompt, chatbot, history):
+
+        """
+        首先,处理游戏初始化等特殊情况
+        """
+        if self.step_cnt == 0:
+            self.begin_game_step_0(prompt, chatbot, history)
+            self.lock_plugin(chatbot)
+            self.cur_task = 'head_start'
+        else:
+            if prompt.strip() == 'exit' or prompt.strip() == '结束剧情':
+                # should we terminate game here?
+                self.delete_game = True
+                yield from update_ui_lastest_msg(lastmsg=f"游戏结束。", chatbot=chatbot, history=history, delay=0.)
+                return
+            if '剧情收尾' in prompt:
+                self.cur_task = 'story_terminate'
+            # # well, game resumes
+            # chatbot.append([prompt, ""])
+        # update ui, don't keep the user waiting
+        yield from update_ui(chatbot=chatbot, history=history)
+
+
+        """
+        处理游戏的主体逻辑
+        """
+        if self.cur_task == 'head_start':
+            """
+            这是游戏的第一步
+            """
+            inputs_ = prompts_hs.format(headstart=self.headstart)
+            history_ = []
+            story_paragraph = yield from request_gpt_model_in_new_thread_with_ui_alive(
+                inputs_, '故事开头', self.llm_kwargs,
+                chatbot, history_, self.sys_prompt_
+            )
+            self.story.append(story_paragraph)
+            # # 配图
+            yield from update_ui_lastest_msg(lastmsg=story_paragraph + '<br/>正在生成插图中 ...', chatbot=chatbot, history=history, delay=0.)
+            yield from update_ui_lastest_msg(lastmsg=story_paragraph + '<br/>'+ self.generate_story_image(story_paragraph), chatbot=chatbot, history=history, delay=0.)
+
+            # # 构建后续剧情引导
+            previously_on_story = ""
+            for s in self.story:
+                previously_on_story += s + '\n'
+            inputs_ = prompts_interact.format(previously_on_story=previously_on_story)
+            history_ = []
+            self.next_choices = yield from request_gpt_model_in_new_thread_with_ui_alive(
+                inputs_, '请在以下几种故事走向中,选择一种(当然,您也可以选择给出其他故事走向):', self.llm_kwargs,
+                chatbot,
+                history_,
+                self.sys_prompt_
+            )
+            self.cur_task = 'user_choice'
+
+
+        elif self.cur_task == 'user_choice':
+            """
+            根据用户的提示,确定故事的下一步
+            """
+            if '请在以下几种故事走向中,选择一种' in chatbot[-1][0]: chatbot.pop(-1)
+            previously_on_story = ""
+            for s in self.story:
+                previously_on_story += s + '\n'
+            inputs_ = prompts_resume.format(previously_on_story=previously_on_story, choice=self.next_choices, user_choice=prompt)
+            history_ = []
+            story_paragraph = yield from request_gpt_model_in_new_thread_with_ui_alive(
+                inputs_, f'下一段故事(您的选择是:{prompt})。', self.llm_kwargs,
+                chatbot, history_, self.sys_prompt_
+            )
+            self.story.append(story_paragraph)
+            # # 配图
+            yield from update_ui_lastest_msg(lastmsg=story_paragraph + '<br/>正在生成插图中 ...', chatbot=chatbot, history=history, delay=0.)
+            yield from update_ui_lastest_msg(lastmsg=story_paragraph + '<br/>'+ self.generate_story_image(story_paragraph), chatbot=chatbot, history=history, delay=0.)
+
+            # # 构建后续剧情引导
+            previously_on_story = ""
+            for s in self.story:
+                previously_on_story += s + '\n'
+            inputs_ = prompts_interact.format(previously_on_story=previously_on_story)
+            history_ = []
+            self.next_choices = yield from request_gpt_model_in_new_thread_with_ui_alive(
+                inputs_,
+                '请在以下几种故事走向中,选择一种。当然,您也可以给出您心中的其他故事走向。另外,如果您希望剧情立即收尾,请输入剧情走向,并以“剧情收尾”四个字提示程序。', self.llm_kwargs,
+                chatbot,
+                history_,
+                self.sys_prompt_
+            )
+            self.cur_task = 'user_choice'
+
+
+        elif self.cur_task == 'story_terminate':
+            """
+            根据用户的提示,确定故事的结局
+            """
+            previously_on_story = ""
+            for s in self.story:
+                previously_on_story += s + '\n'
+            inputs_ = prompts_terminate.format(previously_on_story=previously_on_story, user_choice=prompt)
+            history_ = []
+            story_paragraph = yield from request_gpt_model_in_new_thread_with_ui_alive(
+                inputs_, f'故事收尾(您的选择是:{prompt})。', self.llm_kwargs,
+                chatbot, history_, self.sys_prompt_
+            )
+            # # 配图
+            yield from update_ui_lastest_msg(lastmsg=story_paragraph + '<br/>正在生成插图中 ...', chatbot=chatbot, history=history, delay=0.)
+            yield from update_ui_lastest_msg(lastmsg=story_paragraph + '<br/>'+ self.generate_story_image(story_paragraph), chatbot=chatbot, history=history, delay=0.)
+
+            # terminate game
+            self.delete_game = True
+            return
diff --git a/crazy_functions/game_fns/game_utils.py b/crazy_functions/game_fns/game_utils.py
new file mode 100644
index 00000000..09b6f7a9
--- /dev/null
+++ b/crazy_functions/game_fns/game_utils.py
@@ -0,0 +1,35 @@
+
+from crazy_functions.json_fns.pydantic_io import GptJsonIO, JsonStringError
+from request_llms.bridge_all import predict_no_ui_long_connection
+def get_code_block(reply):
+    import re
+    pattern = r"```([\s\S]*?)```" # regex pattern to match code blocks
+    matches = re.findall(pattern, reply) # find all code blocks in text
+    if len(matches) == 1:
+        return "```" + matches[0] + "```" # code block
+    raise RuntimeError("GPT is not generating proper code.")
+
+def is_same_thing(a, b, llm_kwargs):
+    from pydantic import BaseModel, Field
+    class IsSameThing(BaseModel):
+        is_same_thing: bool = Field(description="determine whether two objects are same thing.", default=False)
+
+    def run_gpt_fn(inputs, sys_prompt, history=[]):
+        return predict_no_ui_long_connection(
+            inputs=inputs, llm_kwargs=llm_kwargs,
+            history=history, sys_prompt=sys_prompt, observe_window=[]
+        )
+
+    gpt_json_io = GptJsonIO(IsSameThing)
+    inputs_01 = "Identify whether the user input and the target is the same thing: \n target object: {a} \n user input object: {b} \n\n\n".format(a=a, b=b)
+    inputs_01 += "\n\n\n Note that the user may describe the target object with a different language, e.g. cat and 猫 are the same thing."
+    analyze_res_cot_01 = run_gpt_fn(inputs_01, "", [])
+
+    inputs_02 = inputs_01 + gpt_json_io.format_instructions
+    analyze_res = run_gpt_fn(inputs_02, "", [inputs_01, analyze_res_cot_01])
+
+    try:
+        res = gpt_json_io.generate_output_auto_repair(analyze_res, run_gpt_fn)
+        return res.is_same_thing
+    except JsonStringError as e:
+        return False
\ No newline at end of file
diff --git a/crazy_functions/ipc_fns/mp.py b/crazy_functions/ipc_fns/mp.py
new file mode 100644
index 00000000..575d47cc
--- /dev/null
+++ b/crazy_functions/ipc_fns/mp.py
@@ -0,0 +1,37 @@
+import platform
+import pickle
+import multiprocessing
+
+def run_in_subprocess_wrapper_func(v_args):
+    func, args, kwargs, return_dict, exception_dict = pickle.loads(v_args)
+    import sys
+    try:
+        result = func(*args, **kwargs)
+        return_dict['result'] = result
+    except Exception as e:
+        exc_info = sys.exc_info()
+        exception_dict['exception'] = exc_info
+
+def run_in_subprocess_with_timeout(func, timeout=60):
+    if platform.system() == 'Linux':
+        def wrapper(*args, **kwargs):
+            return_dict = multiprocessing.Manager().dict()
+            exception_dict = multiprocessing.Manager().dict()
+            v_args = pickle.dumps((func, args, kwargs, return_dict, exception_dict))
+            process = multiprocessing.Process(target=run_in_subprocess_wrapper_func, args=(v_args,))
+            process.start()
+            process.join(timeout)
+            if process.is_alive():
+                process.terminate()
+                raise TimeoutError(f'功能单元{str(func)}未能在规定时间内完成任务')
+            process.close()
+            if 'exception' in exception_dict:
+                # ooops, the subprocess ran into an exception
+                exc_info = exception_dict['exception']
+                raise exc_info[1].with_traceback(exc_info[2])
+            if 'result' in return_dict.keys():
+                # If the subprocess ran successfully, return the result
+                return return_dict['result']
+        return wrapper
+    else:
+        return func
\ No newline at end of file
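A minimal usage sketch of `run_in_subprocess_with_timeout` (assumptions: Linux, and the wrapped function plus its arguments are picklable, since they cross the process boundary via `pickle`; on other platforms the function is returned unwrapped and no timeout applies):

```python
from crazy_functions.ipc_fns.mp import run_in_subprocess_with_timeout

def tokenize(text):                     # top-level function -> picklable
    return text.split()

safe_tokenize = run_in_subprocess_with_timeout(tokenize, timeout=5)

if __name__ == '__main__':              # guard needed when multiprocessing spawns
    print(safe_tokenize("a b c"))       # ['a', 'b', 'c']; raises TimeoutError
                                        # if the child exceeds 5 seconds
```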
diff --git a/crazy_functions/pdf_fns/breakdown_txt.py b/crazy_functions/pdf_fns/breakdown_txt.py
new file mode 100644
index 00000000..a9614814
--- /dev/null
+++ b/crazy_functions/pdf_fns/breakdown_txt.py
@@ -0,0 +1,125 @@
+from crazy_functions.ipc_fns.mp import run_in_subprocess_with_timeout
+
+def force_breakdown(txt, limit, get_token_fn):
+    """ 当无法用标点、空行分割时,我们用最暴力的方法切割
+    """
+    for i in reversed(range(len(txt))):
+        if get_token_fn(txt[:i]) < limit:
+            return txt[:i], txt[i:]
+    return "Tiktoken未知错误", "Tiktoken未知错误"
+
+
+def maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage):
+    """ 为了加速计算,我们采用一个特殊的手段。当 remain_txt_to_cut > `_max` 时, 我们把 _max 后的文字转存至 remain_txt_to_cut_storage
+        当 remain_txt_to_cut < `_min` 时,我们再把 remain_txt_to_cut_storage 中的部分文字取出
+    """
+    _min = int(5e4)
+    _max = int(1e5)
+    # print(len(remain_txt_to_cut), len(remain_txt_to_cut_storage))
+    if len(remain_txt_to_cut) < _min and len(remain_txt_to_cut_storage) > 0:
+        remain_txt_to_cut = remain_txt_to_cut + remain_txt_to_cut_storage
+        remain_txt_to_cut_storage = ""
+    if len(remain_txt_to_cut) > _max:
+        remain_txt_to_cut_storage = remain_txt_to_cut[_max:] + remain_txt_to_cut_storage
+        remain_txt_to_cut = remain_txt_to_cut[:_max]
+    return remain_txt_to_cut, remain_txt_to_cut_storage
+
+
+def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=False):
+    """ 文本切分
+    """
+    res = []
+    total_len = len(txt_tocut)
+    fin_len = 0
+    remain_txt_to_cut = txt_tocut
+    remain_txt_to_cut_storage = ""
+    # 为了加速计算,我们采用一个特殊的手段。当 remain_txt_to_cut > `_max` 时, 我们把 _max 后的文字转存至 remain_txt_to_cut_storage
+    remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)
+
+    while True:
+        if get_token_fn(remain_txt_to_cut) <= limit:
+            # 如果剩余文本的token数小于限制,那么就不用切了
+            res.append(remain_txt_to_cut); fin_len+=len(remain_txt_to_cut)
+            break
+        else:
+            # 如果剩余文本的token数大于限制,那么就切
+            lines = remain_txt_to_cut.split('\n')
+
+            # 估计一个切分点
+            estimated_line_cut = limit / get_token_fn(remain_txt_to_cut) * len(lines)
+            estimated_line_cut = int(estimated_line_cut)
+
+            # 开始查找合适切分点的偏移(cnt)
+            cnt = 0
+            for cnt in reversed(range(estimated_line_cut)):
+                if must_break_at_empty_line:
+                    # 首先尝试用双空行(\n\n)作为切分点
+                    if lines[cnt] != "":
+                        continue
+                prev = "\n".join(lines[:cnt])
+                post = "\n".join(lines[cnt:])
+                if get_token_fn(prev) < limit:
+                    break
+
+            if cnt == 0:
+                # 如果没有找到合适的切分点
+                if break_anyway:
+                    # 是否允许暴力切分
+                    prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
+                else:
+                    # 不允许,直接报错
+                    raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
+
+            # 追加列表
+            res.append(prev); fin_len+=len(prev)
+            # 准备下一次迭代
+            remain_txt_to_cut = post
+            remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)
+            process = fin_len/total_len
+            print(f'正在文本切分 {int(process*100)}%')
+            if len(remain_txt_to_cut.strip()) == 0:
+                break
+    return res
+
+
+def breakdown_text_to_satisfy_token_limit_(txt, limit, llm_model="gpt-3.5-turbo"):
+    """ 使用多种方式尝试切分文本,以满足 token 限制
+    """
+    from request_llms.bridge_all import model_info
+    enc = model_info[llm_model]['tokenizer']
+    def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=()))
+    try:
+        # 第1次尝试,将双空行(\n\n)作为切分点
+        return cut(limit, get_token_fn, txt, must_break_at_empty_line=True)
+    except RuntimeError:
+        try:
+            # 第2次尝试,将单空行(\n)作为切分点
+            return cut(limit, get_token_fn, txt, must_break_at_empty_line=False)
+        except RuntimeError:
+            try:
+                # 第3次尝试,将英文句号(.)作为切分点
+                res = cut(limit, get_token_fn, txt.replace('.', '。\n'), must_break_at_empty_line=False) # 这个中文的句号是故意的,作为一个标识而存在
+                return [r.replace('。\n', '.') for r in res]
+            except RuntimeError as e:
+                try:
+                    # 第4次尝试,将中文句号(。)作为切分点
+                    res = cut(limit, get_token_fn, txt.replace('。', '。。\n'), must_break_at_empty_line=False)
+                    return [r.replace('。。\n', '。') for r in res]
+                except RuntimeError as e:
+                    # 第5次尝试,没办法了,随便切一下吧
+                    return cut(limit, get_token_fn, txt, must_break_at_empty_line=False, break_anyway=True)
+
+breakdown_text_to_satisfy_token_limit = run_in_subprocess_with_timeout(breakdown_text_to_satisfy_token_limit_, timeout=60)
+
+if __name__ == '__main__':
+    from crazy_functions.crazy_utils import read_and_clean_pdf_text
+    file_content, page_one = read_and_clean_pdf_text("build/assets/at.pdf")
+
+    from request_llms.bridge_all import model_info
+    for i in range(5):
+        file_content += file_content
+
+    print(len(file_content))
+    TOKEN_LIMIT_PER_FRAGMENT = 2500
+    res = breakdown_text_to_satisfy_token_limit(file_content, TOKEN_LIMIT_PER_FRAGMENT)
+
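For reference, a hedged call sketch of the exported wrapper — `model_info[llm_model]['tokenizer']` supplies the token counter, so this assumes a configured gpt_academic environment:

```python
from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit

long_text = "第一段。\n\n第二段。\n\n" * 5000
fragments = breakdown_text_to_satisfy_token_limit(long_text, limit=2500)
# Splitting is tried in order: blank lines, single newlines, English periods,
# Chinese periods, and finally brute-force token cuts; every fragment ends
# up below the 2500-token limit.
print(len(fragments))
```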
diff --git a/crazy_functions/vector_fns/__init__.py b/crazy_functions/vector_fns/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/crazy_functions/vector_fns/general_file_loader.py b/crazy_functions/vector_fns/general_file_loader.py
new file mode 100644
index 00000000..a512c483
--- /dev/null
+++ b/crazy_functions/vector_fns/general_file_loader.py
@@ -0,0 +1,70 @@
+# From project chatglm-langchain
+
+
+from langchain.document_loaders import UnstructuredFileLoader
+from langchain.text_splitter import CharacterTextSplitter
+import re
+from typing import List
+
+class ChineseTextSplitter(CharacterTextSplitter):
+    def __init__(self, pdf: bool = False, sentence_size: int = None, **kwargs):
+        super().__init__(**kwargs)
+        self.pdf = pdf
+        self.sentence_size = sentence_size
+
+    def split_text1(self, text: str) -> List[str]:
+        if self.pdf:
+            text = re.sub(r"\n{3,}", "\n", text)
+            text = re.sub('\s', ' ', text)
+            text = text.replace("\n\n", "")
+        sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')  # del :;
+        sent_list = []
+        for ele in sent_sep_pattern.split(text):
+            if sent_sep_pattern.match(ele) and sent_list:
+                sent_list[-1] += ele
+            elif ele:
+                sent_list.append(ele)
+        return sent_list
+
+    def split_text(self, text: str) -> List[str]:   ##此处需要进一步优化逻辑
+        if self.pdf:
+            text = re.sub(r"\n{3,}", r"\n", text)
+            text = re.sub('\s', " ", text)
+            text = re.sub("\n\n", "", text)
+
+        text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text)  # 单字符断句符
+        text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)  # 英文省略号
+        text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)  # 中文省略号
+        text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text)
+        # 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\n放到双引号后,注意前面的几句都小心保留了双引号
+        text = text.rstrip()  # 段尾如果有多余的\n就去掉它
+        # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。
+        ls = [i for i in text.split("\n") if i]
+        for ele in ls:
+            if len(ele) > self.sentence_size:
+                ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele)
+                ele1_ls = ele1.split("\n")
+                for ele_ele1 in ele1_ls:
+                    if len(ele_ele1) > self.sentence_size:
+                        ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1)
+                        ele2_ls = ele_ele2.split("\n")
+                        for ele_ele2 in ele2_ls:
+                            if len(ele_ele2) > self.sentence_size:
+                                ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2)
+                                ele2_id = ele2_ls.index(ele_ele2)
+                                ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[ele2_id + 1:]
+                        ele_id = ele1_ls.index(ele_ele1)
+                        ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:]
+
+                id = ls.index(ele)
+                ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1:]
+        return ls
+
+def load_file(filepath, sentence_size):
+    loader = UnstructuredFileLoader(filepath, mode="elements")
+    textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
+    docs = loader.load_and_split(text_splitter=textsplitter)
+    # write_check_file(filepath, docs)
+    return docs
+
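To see the splitter's behavior in isolation, a small sketch (assumes `langchain` is installed; `split_text` first breaks on sentence-ending punctuation, then recursively re-splits any piece longer than `sentence_size`):

```python
from crazy_functions.vector_fns.general_file_loader import ChineseTextSplitter

splitter = ChineseTextSplitter(pdf=False, sentence_size=100)
sample = "向量检索依赖合理的分句。句子过长时,召回的粒度会变粗!所以需要按标点逐级切分……就像这样。"
for piece in splitter.split_text(sample):
    print(piece)   # 每个句末标点处断开;超长片段会被进一步细分
```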
diff --git a/crazy_functions/vector_fns/vector_database.py b/crazy_functions/vector_fns/vector_database.py
new file mode 100644
index 00000000..cffa22cf
--- /dev/null
+++ b/crazy_functions/vector_fns/vector_database.py
@@ -0,0 +1,337 @@
+# From project chatglm-langchain
+
+import threading
+from toolbox import Singleton
+import os
+import shutil
+import uuid
+from tqdm import tqdm
+from langchain.vectorstores import FAISS
+from langchain.docstore.document import Document
+from typing import List, Tuple
+import numpy as np
+from crazy_functions.vector_fns.general_file_loader import load_file
+
+embedding_model_dict = {
+    "ernie-tiny": "nghuyong/ernie-3.0-nano-zh",
+    "ernie-base": "nghuyong/ernie-3.0-base-zh",
+    "text2vec-base": "shibing624/text2vec-base-chinese",
+    "text2vec": "GanymedeNil/text2vec-large-chinese",
+}
+
+# Embedding model name
+EMBEDDING_MODEL = "text2vec"
+
+# Embedding running device
+EMBEDDING_DEVICE = "cpu"
+
+# 基于上下文的prompt模版,请务必保留"{question}"和"{context}"
+PROMPT_TEMPLATE = """已知信息:
+{context}
+
+根据上述已知信息,简洁和专业地回答用户的问题。如果无法从中得到答案,请说 “根据已知信息无法回答该问题” 或 “没有提供足够的相关信息”,不允许在答案中添加编造成分,答案请使用中文。 问题是:{question}"""
+
+# 文本分句长度
+SENTENCE_SIZE = 100
+
+# 匹配后单段上下文长度
+CHUNK_SIZE = 250
+
+# LLM input history length
+LLM_HISTORY_LEN = 3
+
+# return top-k text chunk from vector store
+VECTOR_SEARCH_TOP_K = 5
+
+# 知识检索内容相关度 Score, 数值范围约为0-1100,如果为0,则不生效,经测试设置为小于500时,匹配结果更精准
+VECTOR_SEARCH_SCORE_THRESHOLD = 0
+
+NLTK_DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "nltk_data")
+
+FLAG_USER_NAME = uuid.uuid4().hex
+
+# 是否开启跨域,默认为False,如果需要开启,请设置为True
+# is open cross domain
+OPEN_CROSS_DOMAIN = False
+
+def similarity_search_with_score_by_vector(
+        self, embedding: List[float], k: int = 4
+) -> List[Tuple[Document, float]]:
+
+    def seperate_list(ls: List[int]) -> List[List[int]]:
+        lists = []
+        ls1 = [ls[0]]
+        for i in range(1, len(ls)):
+            if ls[i - 1] + 1 == ls[i]:
+                ls1.append(ls[i])
+            else:
+                lists.append(ls1)
+                ls1 = [ls[i]]
+        lists.append(ls1)
+        return lists
+
+    scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
+    docs = []
+    id_set = set()
+    store_len = len(self.index_to_docstore_id)
+    for j, i in enumerate(indices[0]):
+        if i == -1 or 0 < self.score_threshold < scores[0][j]:
+            # This happens when not enough docs are returned.
+            continue
+        _id = self.index_to_docstore_id[i]
+        doc = self.docstore.search(_id)
+        if not self.chunk_conent:
+            if not isinstance(doc, Document):
+                raise ValueError(f"Could not find document for id {_id}, got {doc}")
+            doc.metadata["score"] = int(scores[0][j])
+            docs.append(doc)
+            continue
+        id_set.add(i)
+        docs_len = len(doc.page_content)
+        for k in range(1, max(i, store_len - i)):
+            break_flag = False
+            for l in [i + k, i - k]:
+                if 0 <= l < len(self.index_to_docstore_id):
+                    _id0 = self.index_to_docstore_id[l]
+                    doc0 = self.docstore.search(_id0)
+                    if docs_len + len(doc0.page_content) > self.chunk_size:
+                        break_flag = True
+                        break
+                    elif doc0.metadata["source"] == doc.metadata["source"]:
+                        docs_len += len(doc0.page_content)
+                        id_set.add(l)
+            if break_flag:
+                break
+    if not self.chunk_conent:
+        return docs
+    if len(id_set) == 0 and self.score_threshold > 0:
+        return []
+    id_list = sorted(list(id_set))
+    id_lists = seperate_list(id_list)
+    for id_seq in id_lists:
+        for id in id_seq:
+            if id == id_seq[0]:
+                _id = self.index_to_docstore_id[id]
+                doc = self.docstore.search(_id)
+            else:
+                _id0 = self.index_to_docstore_id[id]
+                doc0 = self.docstore.search(_id0)
+                doc.page_content += " " + doc0.page_content
+        if not isinstance(doc, Document):
+            raise ValueError(f"Could not find document for id {_id}, got {doc}")
+        doc_score = min([scores[0][id] for id in [indices[0].tolist().index(i) for i in id_seq if i in indices[0]]])
+        doc.metadata["score"] = int(doc_score)
+        docs.append(doc)
+    return docs
+
+
+class LocalDocQA:
+    llm: object = None
+    embeddings: object = None
+    top_k: int = VECTOR_SEARCH_TOP_K
+    chunk_size: int = CHUNK_SIZE
+    chunk_conent: bool = True
+    score_threshold: int = VECTOR_SEARCH_SCORE_THRESHOLD
+
+    def init_cfg(self,
+                 top_k=VECTOR_SEARCH_TOP_K,
+                 ):
+
+        self.llm = None
+        self.top_k = top_k
+
+    def init_knowledge_vector_store(self,
+                                    filepath,
+                                    vs_path: str or os.PathLike = None,
+                                    sentence_size=SENTENCE_SIZE,
+                                    text2vec=None):
+        loaded_files = []
+        failed_files = []
+        if isinstance(filepath, str):
+            if not os.path.exists(filepath):
+                print("路径不存在")
+                return None
+            elif os.path.isfile(filepath):
+                file = os.path.split(filepath)[-1]
+                try:
+                    docs = load_file(filepath, SENTENCE_SIZE)
+                    print(f"{file} 已成功加载")
+                    loaded_files.append(filepath)
+                except Exception as e:
+                    print(e)
+                    print(f"{file} 未能成功加载")
+                    return None
+            elif os.path.isdir(filepath):
+                docs = []
+                for file in tqdm(os.listdir(filepath), desc="加载文件"):
+                    fullfilepath = os.path.join(filepath, file)
+                    try:
+                        docs += load_file(fullfilepath, SENTENCE_SIZE)
+                        loaded_files.append(fullfilepath)
+                    except Exception as e:
+                        print(e)
+                        failed_files.append(file)
+
+                if len(failed_files) > 0:
+                    print("以下文件未能成功加载:")
+                    for file in failed_files:
+                        print(f"{file}\n")
+
+        else:
+            docs = []
+            for file in filepath:
+                docs += load_file(file, SENTENCE_SIZE)
+                print(f"{file} 已成功加载")
+                loaded_files.append(file)
+
+        if len(docs) > 0:
+            print("文件加载完毕,正在生成向量库")
+            if vs_path and os.path.isdir(vs_path):
+                try:
+                    self.vector_store = FAISS.load_local(vs_path, text2vec)
+                    self.vector_store.add_documents(docs)
+                except:
+                    self.vector_store = FAISS.from_documents(docs, text2vec)
+            else:
+                self.vector_store = FAISS.from_documents(docs, text2vec)  # docs 为Document列表
+
+            self.vector_store.save_local(vs_path)
+            return vs_path, loaded_files
+        else:
+            raise RuntimeError("文件加载失败,请检查文件格式是否正确")
+
+    def get_loaded_file(self, vs_path):
+        ds = self.vector_store.docstore
+        return set([ds._dict[k].metadata['source'].split(vs_path)[-1] for k in ds._dict])
+
+
+    # query                查询内容
+    # vs_path              知识库路径
+    # chunk_conent         是否启用上下文关联
+    # score_threshold      搜索匹配score阈值
+    # vector_search_top_k  搜索知识库内容条数,默认搜索5条结果
+    # chunk_sizes          匹配单段内容的连接上下文长度
+    def get_knowledge_based_conent_test(self, query, vs_path, chunk_conent,
+                                        score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD,
+                                        vector_search_top_k=VECTOR_SEARCH_TOP_K, chunk_size=CHUNK_SIZE,
+                                        text2vec=None):
+        self.vector_store = FAISS.load_local(vs_path, text2vec)
+        self.vector_store.chunk_conent = chunk_conent
+        self.vector_store.score_threshold = score_threshold
+        self.vector_store.chunk_size = chunk_size
+
+        embedding = self.vector_store.embedding_function.embed_query(query)
+        related_docs_with_score = similarity_search_with_score_by_vector(self.vector_store, embedding, k=vector_search_top_k)
+
+        if not related_docs_with_score:
+            response = {"query": query,
+                        "source_documents": []}
+            return response, ""
+        # prompt = f"{query}. You should answer this question using information from following documents: \n\n"
+        prompt = f"{query}. 你必须利用以下文档中包含的信息回答这个问题: \n\n---\n\n"
+        prompt += "\n\n".join([f"({k}): " + doc.page_content for k, doc in enumerate(related_docs_with_score)])
+        prompt += "\n\n---\n\n"
+        prompt = prompt.encode('utf-8', 'ignore').decode()  # avoid reading non-utf8 chars
+        # print(prompt)
+        response = {"query": query, "source_documents": related_docs_with_score}
+        return response, prompt
+
+
+
+
+def construct_vector_store(vs_id, vs_path, files, sentence_size, history, one_conent, one_content_segmentation, text2vec):
+    for file in files:
+        assert os.path.exists(file), "输入文件不存在:" + file
+    import nltk
+    if NLTK_DATA_PATH not in nltk.data.path: nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
+    local_doc_qa = LocalDocQA()
+    local_doc_qa.init_cfg()
+    filelist = []
+    if not os.path.exists(os.path.join(vs_path, vs_id)):
+        os.makedirs(os.path.join(vs_path, vs_id))
+    for file in files:
+        file_name = file.name if not isinstance(file, str) else file
+        filename = os.path.split(file_name)[-1]
+        shutil.copyfile(file_name, os.path.join(vs_path, vs_id, filename))
+        filelist.append(os.path.join(vs_path, vs_id, filename))
+    vs_path, loaded_files = local_doc_qa.init_knowledge_vector_store(filelist, os.path.join(vs_path, vs_id), sentence_size, text2vec)
+
+    if len(loaded_files):
+        file_status = f"已添加 {'、'.join([os.path.split(i)[-1] for i in loaded_files if i])} 内容至知识库,并已加载知识库,请开始提问"
+    else:
+        pass
+        # file_status = "文件未成功加载,请重新上传文件"
+    # print(file_status)
+    return local_doc_qa, vs_path
+
+@Singleton
+class knowledge_archive_interface():
+    def __init__(self) -> None:
+        self.threadLock = threading.Lock()
+        self.current_id = ""
+        self.kai_path = None
+        self.qa_handle = None
+        self.text2vec_large_chinese = None
+
+    def get_chinese_text2vec(self):
+        if self.text2vec_large_chinese is None:
+            # < -------------------预热文本向量化模组--------------- >
+            from toolbox import ProxyNetworkActivate
+            print('Checking Text2vec ...')
+            from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+            with ProxyNetworkActivate('Download_LLM'):  # 临时地激活代理网络
+                self.text2vec_large_chinese = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")
+
+        return self.text2vec_large_chinese
+
+
+    def feed_archive(self, file_manifest, vs_path, id="default"):
+        self.threadLock.acquire()
+        # import uuid
+        self.current_id = id
+        self.qa_handle, self.kai_path = construct_vector_store(
+            vs_id=self.current_id,
+            vs_path=vs_path,
+            files=file_manifest,
+            sentence_size=100,
+            history=[],
+            one_conent="",
+            one_content_segmentation="",
+            text2vec = self.get_chinese_text2vec(),
+        )
+        self.threadLock.release()
+
+    def get_current_archive_id(self):
+        return self.current_id
+
+    def get_loaded_file(self, vs_path):
+        return self.qa_handle.get_loaded_file(vs_path)
+
+    def answer_with_archive_by_id(self, txt, id, vs_path):
+        self.threadLock.acquire()
+        if not self.current_id == id:
+            self.current_id = id
+            self.qa_handle, self.kai_path = construct_vector_store(
+                vs_id=self.current_id,
+                vs_path=vs_path,
+                files=[],
+                sentence_size=100,
+                history=[],
+                one_conent="",
+                one_content_segmentation="",
+                text2vec = self.get_chinese_text2vec(),
+            )
+        VECTOR_SEARCH_SCORE_THRESHOLD = 0
+        VECTOR_SEARCH_TOP_K = 4
+        CHUNK_SIZE = 512
+        resp, prompt = self.qa_handle.get_knowledge_based_conent_test(
+            query = txt,
+            vs_path = self.kai_path,
+            score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD,
+            vector_search_top_k=VECTOR_SEARCH_TOP_K,
+            chunk_conent=True,
+            chunk_size=CHUNK_SIZE,
+            text2vec = self.get_chinese_text2vec(),
+        )
+        self.threadLock.release()
+        return resp, prompt
\ No newline at end of file
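For orientation, the intended call sequence of the singleton above, as a hedged sketch (paths and the file name are illustrative; `feed_archive` copies the files next to the FAISS store and indexes them, `answer_with_archive_by_id` reloads the store and returns the retrieval hits plus a context-stuffed prompt):

```python
from crazy_functions.vector_fns.vector_database import knowledge_archive_interface

kai = knowledge_archive_interface()
# 1) ingest documents into the archive named "default"
kai.feed_archive(file_manifest=["./README.md"], vs_path="./vec_store", id="default")
# 2) query it: resp carries the matched Documents, prompt is ready for the LLM
resp, prompt = kai.answer_with_archive_by_id("如何安装?", id="default", vs_path="./vec_store")
print(prompt)  # "如何安装?. 你必须利用以下文档中包含的信息回答这个问题: ..."
```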
python -c 'import nltk; nltk.download("punkt")' +""" + +@CatchException +def 知识库文件注入(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): + """ + txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径 + llm_kwargs gpt模型参数, 如温度和top_p等, 一般原样传递下去就行 + plugin_kwargs 插件模型的参数,暂时没有用武之地 + chatbot 聊天显示框的句柄,用于显示给用户 + history 聊天历史,前情提要 + system_prompt 给gpt的静默提醒 + web_port 当前软件运行的端口号 + """ + history = [] # 清空历史,以免输入溢出 + + # < --------------------读取参数--------------- > + if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") + kai_id = plugin_kwargs.get("advanced_arg", 'default') + + chatbot.append((f"向`{kai_id}`知识库中添加文件。", "[Local Message] 从一批文件(txt, md, tex)中读取数据构建知识库, 然后进行问答。")) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + + # resolve deps + try: + # from zh_langchain import construct_vector_store + # from langchain.embeddings.huggingface import HuggingFaceEmbeddings + from crazy_functions.vector_fns.vector_database import knowledge_archive_interface + except Exception as e: + chatbot.append(["依赖不足", f"{str(e)}\n\n导入依赖失败。请用以下命令安装" + install_msg]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + # from .crazy_utils import try_install_deps + # try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain']) + # yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history) + return + + # < --------------------读取文件--------------- > + file_manifest = [] + spl = ["txt", "doc", "docx", "email", "epub", "html", "json", "md", "msg", "pdf", "ppt", "pptx", "rtf"] + for sp in spl: + _, file_manifest_tmp, _ = get_files_from_everything(txt, type=f'.{sp}') + file_manifest += file_manifest_tmp + + if len(file_manifest) == 0: + chatbot.append(["没有找到任何可读取文件", "当前支持的格式包括: txt, md, docx, pptx, pdf, json等"]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + return + + # < -------------------预热文本向量化模组--------------- > + chatbot.append(['
'.join(file_manifest), "正在预热文本向量化模组, 如果是第一次运行, 将消耗较长时间下载中文向量化模型..."]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + print('Checking Text2vec ...') + from langchain.embeddings.huggingface import HuggingFaceEmbeddings + with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络 + HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese") + + # < -------------------构建知识库--------------- > + chatbot.append(['
'.join(file_manifest), "正在构建知识库..."]) + yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 + print('Establishing knowledge archive ...') + with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络 + kai = knowledge_archive_interface() + vs_path = get_log_folder(user=get_user(chatbot), plugin_name='vec_store') + kai.feed_archive(file_manifest=file_manifest, vs_path=vs_path, id=kai_id) + kai_files = kai.get_loaded_file(vs_path=vs_path) + kai_files = '
+    # chatbot.append(['知识库构建成功', "正在将知识库存储至cookie中"])
+    # yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+    # chatbot._cookies['langchain_plugin_embedding'] = kai.get_current_archive_id()
+    # chatbot._cookies['lock_plugin'] = 'crazy_functions.知识库文件注入->读取知识库作答'
+    # chatbot.append(['完成', "“根据知识库作答”函数插件已经接管问答系统, 提问吧! 但注意, 您接下来不能再使用其他插件了,刷新页面即可以退出知识库问答模式。"])
+    chatbot.append(['构建完成', f"当前知识库内的有效文件:\n\n---\n\n{kai_files}\n\n---\n\n请切换至“知识库问答”插件进行知识库访问, 或者使用此插件继续上传更多文件。"])
+    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
+
+@CatchException
+def 读取知识库作答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port=-1):
+    # resolve deps
+    try:
+        # from zh_langchain import construct_vector_store
+        # from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+        from crazy_functions.vector_fns.vector_database import knowledge_archive_interface
+    except Exception as e:
+        chatbot.append(["依赖不足", f"{str(e)}\n\n导入依赖失败。请用以下命令安装" + install_msg])
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        # from .crazy_utils import try_install_deps
+        # try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain'])
+        # yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history)
+        return
+
+    # < ------------------- --------------- >
+    kai = knowledge_archive_interface()
+
+    if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
+    kai_id = plugin_kwargs.get("advanced_arg", 'default')
+    vs_path = get_log_folder(user=get_user(chatbot), plugin_name='vec_store')
+    resp, prompt = kai.answer_with_archive_by_id(txt, kai_id, vs_path)
+
+    chatbot.append((txt, f'[知识库 {kai_id}] ' + prompt))
+    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
+    gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
+        inputs=prompt, inputs_show_user=txt,
+        llm_kwargs=llm_kwargs, chatbot=chatbot, history=[],
+        sys_prompt=system_prompt
+    )
+    history.extend((prompt, gpt_say))
+    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
diff --git a/docs/GithubAction+AllCapacityBeta b/docs/GithubAction+AllCapacityBeta
new file mode 100644
index 00000000..d3a06ee1
--- /dev/null
+++ b/docs/GithubAction+AllCapacityBeta
@@ -0,0 +1,53 @@
+# docker build -t gpt-academic-all-capacity -f docs/GithubAction+AllCapacity --network=host --build-arg http_proxy=http://localhost:10881 --build-arg https_proxy=http://localhost:10881 .
+# docker build -t gpt-academic-all-capacity -f docs/GithubAction+AllCapacityBeta --network=host .
+# docker run -it --net=host gpt-academic-all-capacity bash
+
+# 从NVIDIA源,从而支持显卡(检查宿主的nvidia-smi中的cuda版本必须>=11.3)
+FROM fuqingxu/11.3.1-runtime-ubuntu20.04-with-texlive:latest
+
+# use python3 as the system default python
+WORKDIR /gpt
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.8
+
+# # 非必要步骤,更换pip源 (以下三行,可以删除)
+# RUN echo '[global]' > /etc/pip.conf && \
+#     echo 'index-url = https://mirrors.aliyun.com/pypi/simple/' >> /etc/pip.conf && \
+#     echo 'trusted-host = mirrors.aliyun.com' >> /etc/pip.conf
+
+# 下载pytorch
+RUN python3 -m pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu113
+# 准备pip依赖
+RUN python3 -m pip install openai numpy arxiv rich
+RUN python3 -m pip install colorama Markdown pygments pymupdf
+RUN python3 -m pip install python-docx moviepy pdfminer
+RUN python3 -m pip install zh_langchain==0.2.1 pypinyin
+RUN python3 -m pip install rarfile py7zr
+RUN python3 -m pip install aliyun-python-sdk-core==2.13.3 pyOpenSSL webrtcvad scipy git+https://github.com/aliyun/alibabacloud-nls-python-sdk.git
+# 下载分支
+WORKDIR /gpt
+RUN git clone --depth=1 https://github.com/binary-husky/gpt_academic.git
+WORKDIR /gpt/gpt_academic
+RUN git clone --depth=1 https://github.com/OpenLMLab/MOSS.git request_llms/moss
+
+RUN python3 -m pip install -r requirements.txt
+RUN python3 -m pip install -r request_llms/requirements_moss.txt
+RUN python3 -m pip install -r request_llms/requirements_qwen.txt
+RUN python3 -m pip install -r request_llms/requirements_chatglm.txt
+RUN python3 -m pip install -r request_llms/requirements_newbing.txt
+RUN python3 -m pip install nougat-ocr
+
+# 预热Tiktoken模块
+RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()'
+
+# 安装知识库插件的额外依赖
+RUN apt-get update && apt-get install libgl1 -y
+RUN pip3 install transformers protobuf langchain sentence-transformers faiss-cpu nltk beautifulsoup4 bitsandbytes tabulate icetk --upgrade
+RUN pip3 install unstructured[all-docs] --upgrade
+RUN python3 -c 'from check_proxy import warm_up_vectordb; warm_up_vectordb()'
+RUN rm -rf /usr/local/lib/python3.8/dist-packages/tests
+
+
+# COPY .cache /root/.cache
+# COPY config_private.py config_private.py
+# 启动
+CMD ["python3", "-u", "main.py"]
diff --git a/docs/GithubAction+NoLocal+Vectordb b/docs/GithubAction+NoLocal+Vectordb
new file mode 100644
index 00000000..45074d93
--- /dev/null
+++ b/docs/GithubAction+NoLocal+Vectordb
@@ -0,0 +1,26 @@
+# 此Dockerfile适用于“无本地模型”的环境构建,如果需要使用chatglm等本地模型,请参考 docs/Dockerfile+ChatGLM
+# 如何构建: 先修改 `config.py`, 然后 docker build -t gpt-academic-nolocal-vs -f docs/GithubAction+NoLocal+Vectordb .
+# 如何运行: docker run --rm -it --net=host gpt-academic-nolocal-vs
+FROM python:3.11
+
+# 指定路径
+WORKDIR /gpt
+
+# 装载项目文件
+COPY . .
+
+# 安装依赖
+RUN pip3 install -r requirements.txt
+
+# 安装知识库插件的额外依赖
+RUN apt-get update && apt-get install libgl1 -y
+RUN pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu
+RUN pip3 install transformers protobuf langchain sentence-transformers faiss-cpu nltk beautifulsoup4 bitsandbytes tabulate icetk --upgrade
+RUN pip3 install unstructured[all-docs] --upgrade
+RUN python3 -c 'from check_proxy import warm_up_vectordb; warm_up_vectordb()'
+
+# 可选步骤,用于预热模块
+RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()'
+
+# 启动
+CMD ["python3", "-u", "main.py"]
diff --git a/request_llms/bridge_qwen_local.py b/request_llms/bridge_qwen_local.py
new file mode 100644
index 00000000..e6c2dd5c
--- /dev/null
+++ b/request_llms/bridge_qwen_local.py
@@ -0,0 +1,59 @@
+model_name = "Qwen_Local"
+cmd_to_install = "`pip install -r request_llms/requirements_qwen_local.txt`"
+
+from toolbox import ProxyNetworkActivate, get_conf
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
+
+
+
+# ------------------------------------------------------------------------------------------------------------------------
+# 🔌💻 Local Model
+# ------------------------------------------------------------------------------------------------------------------------
+class GetQwenLMHandle(LocalLLMHandle):
+
+    def load_model_info(self):
+        # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行
+        self.model_name = model_name
+        self.cmd_to_install = cmd_to_install
+
+    def load_model_and_tokenizer(self):
+        # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行
+        # from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        from transformers.generation import GenerationConfig
+        with ProxyNetworkActivate('Download_LLM'):
+            model_id = get_conf('QWEN_LOCAL_MODEL_SELECTION')
+            self._tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, resume_download=True)
+            # use fp16
+            model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True).eval()
+            model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True)  # 可指定不同的生成长度、top_p等相关超参
+            self._model = model
+
+        return self._model, self._tokenizer
+
+    def llm_stream_generator(self, **kwargs):
+        # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行
+        def adaptor(kwargs):
+            query = kwargs['query']
+            max_length = kwargs['max_length']
+            top_p = kwargs['top_p']
+            temperature = kwargs['temperature']
+            history = kwargs['history']
+            return query, max_length, top_p, temperature, history
+
+        query, max_length, top_p, temperature, history = adaptor(kwargs)
+
+        for response in self._model.chat_stream(self._tokenizer, query, history=history):
+            yield response
+
+    def try_to_import_special_deps(self, **kwargs):
+        # import something that will raise error if the user does not install requirement_*.txt
+        # 🏃‍♂️🏃‍♂️🏃‍♂️ 主进程执行
+        import importlib
+        importlib.import_module('modelscope')
+
+
+# ------------------------------------------------------------------------------------------------------------------------
+# 🔌💻 GPT-Academic Interface
+# ------------------------------------------------------------------------------------------------------------------------
+predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetQwenLMHandle, model_name)
\ No newline at end of file
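The model to load is taken from the `QWEN_LOCAL_MODEL_SELECTION` key via `get_conf`; a hypothetical `config_private.py` override might look like this (both values below are assumptions, not part of this patch — any Qwen chat checkpoint that supports `chat_stream` should fit):

```python
# config_private.py (illustrative values only)
QWEN_LOCAL_MODEL_SELECTION = "Qwen/Qwen-1_8B-Chat-Int8"  # assumed checkpoint id
LLM_MODEL = "qwen-local"                                 # assumed route name for this bridge
```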
diff --git a/request_llms/com_qwenapi.py b/request_llms/com_qwenapi.py
new file mode 100644
index 00000000..5807600d
--- /dev/null
+++ b/request_llms/com_qwenapi.py
@@ -0,0 +1,94 @@
+from http import HTTPStatus
+from toolbox import get_conf
+import threading
+import logging
+
+timeout_bot_msg = '[Local Message] Request timeout. Network error.'
+
+class QwenRequestInstance():
+    def __init__(self):
+        import dashscope
+        self.time_to_yield_event = threading.Event()
+        self.time_to_exit_event = threading.Event()
+        self.result_buf = ""
+
+        def validate_key():
+            DASHSCOPE_API_KEY = get_conf("DASHSCOPE_API_KEY")
+            if DASHSCOPE_API_KEY == '': return False
+            return True
+
+        if not validate_key():
+            raise RuntimeError('请配置 DASHSCOPE_API_KEY')
+        dashscope.api_key = get_conf("DASHSCOPE_API_KEY")
+
+
+    def generate(self, inputs, llm_kwargs, history, system_prompt):
+        # import _thread as thread
+        from dashscope import Generation
+        QWEN_MODEL = {
+            'qwen-turbo': Generation.Models.qwen_turbo,
+            'qwen-plus': Generation.Models.qwen_plus,
+            'qwen-max': Generation.Models.qwen_max,
+        }[llm_kwargs['llm_model']]
+        top_p = llm_kwargs.get('top_p', 0.8)
+        if top_p == 0: top_p += 1e-5
+        if top_p == 1: top_p -= 1e-5
+
+        self.result_buf = ""
+        responses = Generation.call(
+            model=QWEN_MODEL,
+            messages=generate_message_payload(inputs, llm_kwargs, history, system_prompt),
+            top_p=top_p,
+            temperature=llm_kwargs.get('temperature', 1.0),
+            result_format='message',
+            stream=True,
+            incremental_output=True
+        )
+
+        for response in responses:
+            if response.status_code == HTTPStatus.OK:
+                if response.output.choices[0].finish_reason == 'stop':
+                    yield self.result_buf
+                    break
+                elif response.output.choices[0].finish_reason == 'length':
+                    self.result_buf += "[Local Message] 生成长度过长,后续输出被截断"
+                    yield self.result_buf
+                    break
+                else:
+                    self.result_buf += response.output.choices[0].message.content
+                    yield self.result_buf
+            else:
+                self.result_buf += f"[Local Message] 请求错误:状态码:{response.status_code},错误码:{response.code},消息:{response.message}"
+                yield self.result_buf
+                break
+        logging.info(f'[raw_input] {inputs}')
+        logging.info(f'[response] {self.result_buf}')
+        return self.result_buf
+
+
+def generate_message_payload(inputs, llm_kwargs, history, system_prompt):
+    conversation_cnt = len(history) // 2
+    if system_prompt == '': system_prompt = 'Hello!'
+    messages = [{"role": "user", "content": system_prompt}, {"role": "assistant", "content": "Certainly!"}]
+    if conversation_cnt:
+        for index in range(0, 2*conversation_cnt, 2):
+            what_i_have_asked = {}
+            what_i_have_asked["role"] = "user"
+            what_i_have_asked["content"] = history[index]
+            what_gpt_answer = {}
+            what_gpt_answer["role"] = "assistant"
+            what_gpt_answer["content"] = history[index+1]
+            if what_i_have_asked["content"] != "":
+                if what_gpt_answer["content"] == "":
+                    continue
+                if what_gpt_answer["content"] == timeout_bot_msg:
+                    continue
+                messages.append(what_i_have_asked)
+                messages.append(what_gpt_answer)
+            else:
+                messages[-1]['content'] = what_gpt_answer['content']
+    what_i_ask_now = {}
+    what_i_ask_now["role"] = "user"
+    what_i_ask_now["content"] = inputs
+    messages.append(what_i_ask_now)
+    return messages
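To make the payload shape concrete, a sketch of what `generate_message_payload` produces for a one-turn history (the leading user/assistant pair is the synthetic handshake that carries the system prompt):

```python
from request_llms.com_qwenapi import generate_message_payload

messages = generate_message_payload(
    inputs="那宋朝呢?", llm_kwargs={},
    history=["唐朝有多少年?", "唐朝共289年。"],
    system_prompt="你是历史助手。")
# [{'role': 'user',      'content': '你是历史助手。'},
#  {'role': 'assistant', 'content': 'Certainly!'},
#  {'role': 'user',      'content': '唐朝有多少年?'},
#  {'role': 'assistant', 'content': '唐朝共289年。'},
#  {'role': 'user',      'content': '那宋朝呢?'}]
```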
+ messages = [{"role": "user", "content": system_prompt}, {"role": "assistant", "content": "Certainly!"}] + if conversation_cnt: + for index in range(0, 2*conversation_cnt, 2): + what_i_have_asked = {} + what_i_have_asked["role"] = "user" + what_i_have_asked["content"] = history[index] + what_gpt_answer = {} + what_gpt_answer["role"] = "assistant" + what_gpt_answer["content"] = history[index+1] + if what_i_have_asked["content"] != "": + if what_gpt_answer["content"] == "": + continue + if what_gpt_answer["content"] == timeout_bot_msg: + continue + messages.append(what_i_have_asked) + messages.append(what_gpt_answer) + else: + messages[-1]['content'] = what_gpt_answer['content'] + what_i_ask_now = {} + what_i_ask_now["role"] = "user" + what_i_ask_now["content"] = inputs + messages.append(what_i_ask_now) + return messages diff --git a/request_llms/requirements_qwen_local.txt b/request_llms/requirements_qwen_local.txt new file mode 100644 index 00000000..de6bf3c9 --- /dev/null +++ b/request_llms/requirements_qwen_local.txt @@ -0,0 +1,5 @@ +modelscope +transformers_stream_generator +auto-gptq +optimum +urllib3<2 \ No newline at end of file diff --git a/tests/test_vector_plugins.py b/tests/test_vector_plugins.py new file mode 100644 index 00000000..9b75463b --- /dev/null +++ b/tests/test_vector_plugins.py @@ -0,0 +1,17 @@ +""" +对项目中的各个插件进行测试。运行方法:直接运行 python tests/test_plugins.py +""" + + +import os, sys +def validate_path(): dir_name = os.path.dirname(__file__); root_dir_assume = os.path.abspath(dir_name + '/..'); os.chdir(root_dir_assume); sys.path.append(root_dir_assume) +validate_path() # 返回项目根路径 + +if __name__ == "__main__": + from tests.test_utils import plugin_test + + plugin_test(plugin='crazy_functions.知识库问答->知识库文件注入', main_input="./README.md") + + plugin_test(plugin='crazy_functions.知识库问答->读取知识库作答', main_input="What is the installation method?") + + plugin_test(plugin='crazy_functions.知识库问答->读取知识库作答', main_input="远程云服务器部署?") \ No newline at end of file diff --git a/themes/cookies.py b/themes/cookies.py new file mode 100644 index 00000000..e69de29b