rag version one

binary-husky
2024-08-28 15:14:13 +00:00
parent 294716c832
commit 08c3c56f53
9 changed files with 313 additions and 79 deletions


@@ -38,6 +38,9 @@ AVAIL_LLM_MODELS = ["gpt-4-1106-preview", "gpt-4-turbo-preview", "gpt-4-vision-p
"gpt-4", "gpt-4-32k", "azure-gpt-4", "glm-4", "glm-4v", "glm-3-turbo", "gpt-4", "gpt-4-32k", "azure-gpt-4", "glm-4", "glm-4v", "glm-3-turbo",
"gemini-1.5-pro", "chatglm3" "gemini-1.5-pro", "chatglm3"
] ]
EMBEDDING_MODEL = "text-embedding-3-small"
# --- --- --- --- # --- --- --- ---
# P.S. 其他可用的模型还包括 # P.S. 其他可用的模型还包括
# AVAIL_LLM_MODELS = [ # AVAIL_LLM_MODELS = [


@@ -5,6 +5,7 @@ from toolbox import trimmed_format_exc
def get_crazy_functions():
    from crazy_functions.读文章写摘要 import 读文章写摘要
    from crazy_functions.生成函数注释 import 批量生成函数注释
+    from crazy_functions.Rag_Interface import Rag问答
    from crazy_functions.SourceCode_Analyse import 解析项目本身
    from crazy_functions.SourceCode_Analyse import 解析一个Python项目
    from crazy_functions.SourceCode_Analyse import 解析一个Matlab项目
@@ -50,6 +51,13 @@ def get_crazy_functions():
    from crazy_functions.SourceCode_Comment import 注释Python项目

    function_plugins = {
+        "Rag智能召回": {
+            "Group": "对话",
+            "Color": "stop",
+            "AsButton": False,
+            "Info": "将问答数据记录到向量库中,作为长期参考。",
+            "Function": HotReload(Rag问答),
+        },
        "虚空终端": {
            "Group": "对话|编程|学术|智能体",
            "Color": "stop",


@@ -0,0 +1,39 @@
from toolbox import CatchException, update_ui, get_conf, get_log_folder
from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from crazy_functions.rag_fns.llama_index_worker import LlamaIndexRagWorker

RAG_WORKER_REGISTER = {}

@CatchException
def Rag问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
    # first, we retrieve rag worker from global context
    user_name = chatbot.get_user()
    if user_name in RAG_WORKER_REGISTER:
        rag_worker = RAG_WORKER_REGISTER[user_name]
    else:
        rag_worker = RAG_WORKER_REGISTER[user_name] = LlamaIndexRagWorker(
            user_name,
            llm_kwargs,
            checkpoint_dir=get_log_folder(user_name, plugin_name='experimental_rag'),
            auto_load_checkpoint=True)

    # second, we search vector store and build prompts
    i_say = txt
    nodes = rag_worker.retrieve_from_store_with_query(i_say)
    prompt = rag_worker.build_prompt(query=i_say, nodes=nodes)

    # third, it is time to query llms
    gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
        inputs=prompt, inputs_show_user=i_say,
        llm_kwargs=llm_kwargs, chatbot=chatbot, history=history,
        sys_prompt=system_prompt,
        retry_times_at_unknown_error=0
    )

    # finally, remember what has been asked / answered
    rag_worker.remember_qa(i_say, gpt_say)
    history.extend([i_say, gpt_say])

    # yield, see you next time
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
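For orientation, the same flow can be exercised without the Gradio UI. The snippet below is a sketch rather than part of the commit; it assumes `get_chat_default_kwargs()` yields a usable `llm_kwargs` dict (as in the test script further down) and that setting `embed_model` by hand mirrors what `ArgsGeneralWrapper` now injects (see the toolbox changes below).

```python
# Minimal sketch (not part of this commit): driving the RAG flow outside the UI.
from toolbox import get_log_folder
from shared_utils.connect_void_terminal import get_chat_default_kwargs
from crazy_functions.rag_fns.llama_index_worker import LlamaIndexRagWorker

llm_kwargs = get_chat_default_kwargs()['llm_kwargs']
llm_kwargs.update({'embed_model': "text-embedding-3-small"})  # assumed; normally injected by ArgsGeneralWrapper

user_name = "demo_user"  # hypothetical user
rag_worker = LlamaIndexRagWorker(
    user_name, llm_kwargs,
    checkpoint_dir=get_log_folder(user_name, plugin_name='experimental_rag'),
    auto_load_checkpoint=True)

query = "What did we conclude about embeddings last time?"
nodes = rag_worker.retrieve_from_store_with_query(query)    # vector search over stored records
prompt = rag_worker.build_prompt(query=query, nodes=nodes)  # stuff retrieved context into the template
# ... send `prompt` to any chat model, then persist the exchange:
rag_worker.remember_qa(query, "<model answer>")
```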


@@ -1,34 +1,122 @@
import llama_index
+from llama_index.core import Document
+from llama_index.core.schema import TextNode
+from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel
+from shared_utils.connect_void_terminal import get_chat_default_kwargs
+from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
+from crazy_functions.rag_fns.vector_store_index import GptacVectorStoreIndex
+from llama_index.core.ingestion import run_transformations
+from llama_index.core import PromptTemplate
+from llama_index.core.response_synthesizers import TreeSummarize

-class rag_worker():
-    def __init__(self) -> None:
-        pass
+DEFAULT_QUERY_GENERATION_PROMPT = """\
+Now, you have context information as below:
+---------------------
+{context_str}
+---------------------
+Answer the user request below (use the context information if necessary, otherwise you can ignore them):
+---------------------
+{query_str}
+"""
+
+QUESTION_ANSWER_RECORD = """\
+{{
+    "type": "This is a previous conversation with the user",
+    "question": "{question}",
+    "answer": "{answer}",
+}}
+"""
+
+class SaveLoad():
+
+    def does_checkpoint_exist(self, checkpoint_dir=None):
+        import os, glob
+        if checkpoint_dir is None: checkpoint_dir = self.checkpoint_dir
+        if not os.path.exists(checkpoint_dir): return False
+        if len(glob.glob(os.path.join(checkpoint_dir, "*.json"))) == 0: return False
+        return True
+
+    def save_to_checkpoint(self, checkpoint_dir=None):
+        if checkpoint_dir is None: checkpoint_dir = self.checkpoint_dir
+        self.vs_index.storage_context.persist(persist_dir=checkpoint_dir)
+
+    def load_from_checkpoint(self, checkpoint_dir=None):
+        if checkpoint_dir is None: checkpoint_dir = self.checkpoint_dir
+        if self.does_checkpoint_exist(checkpoint_dir=checkpoint_dir):
+            print('loading checkpoint from disk')
+            from llama_index.core import StorageContext, load_index_from_storage
+            storage_context = StorageContext.from_defaults(persist_dir=checkpoint_dir)
+            self.vs_index = load_index_from_storage(storage_context, embed_model=self.embed_model)
+            return self.vs_index
+        else:
+            return self.create_new_vs()
+
+    def create_new_vs(self):
+        return GptacVectorStoreIndex.default_vector_store(embed_model=self.embed_model)
+
+class LlamaIndexRagWorker(SaveLoad):
+    def __init__(self, user_name, llm_kwargs, auto_load_checkpoint=True, checkpoint_dir=None) -> None:
+        self.debug_mode = True
+        self.embed_model = OpenAiEmbeddingModel(llm_kwargs)
+        self.user_name = user_name
+        self.checkpoint_dir = checkpoint_dir
+        if auto_load_checkpoint:
+            self.vs_index = self.load_from_checkpoint(checkpoint_dir)
+        else:
+            self.vs_index = self.create_new_vs()

    def assign_embedding_model(self):
        pass

-    def save_to_checkpoint(self):
-        pass
-
-    def load_from_checkpoint(self):
-        pass
-
-    def add_documents_to_vector_store(self, documents):
-        pass
-
-    def add_text_to_vector_store(self, documents):
-        pass
-
    def inspect_vector_store(self):
-        pass
+        # This function is for debugging
+        self.vs_index.storage_context.index_store.to_dict()
+        docstore = self.vs_index.storage_context.docstore.docs
+        vector_store_preview = "\n".join([ f"{_id} | {tn.text}" for _id, tn in docstore.items() ])
+        print('\n++ --------inspect_vector_store begin--------')
+        print(vector_store_preview)
+        print('oo --------inspect_vector_store end--------')
+        return vector_store_preview
+
+    def add_documents_to_vector_store(self, document_list):
+        documents = [Document(text=t) for t in document_list]
+        documents_nodes = run_transformations(
+            documents,  # type: ignore
+            self.vs_index._transformations,
+            show_progress=True
+        )
+        self.vs_index.insert_nodes(documents_nodes)
+        if self.debug_mode: self.inspect_vector_store()
+
+    def add_text_to_vector_store(self, text):
+        node = TextNode(text=text)
+        documents_nodes = run_transformations(
+            [node],
+            self.vs_index._transformations,
+            show_progress=True
+        )
+        self.vs_index.insert_nodes(documents_nodes)
+        if self.debug_mode: self.inspect_vector_store()
+
+    def remember_qa(self, question, answer):
+        formatted_str = QUESTION_ANSWER_RECORD.format(question=question, answer=answer)
+        self.add_text_to_vector_store(formatted_str)

    def retrieve_from_store_with_query(self, query):
-        pass
-
-    @staticmethod
-    def build_prompt(self):
-        pass
+        if self.debug_mode: self.inspect_vector_store()
+        retriever = self.vs_index.as_retriever()
+        return retriever.retrieve(query)
+
+    def build_prompt(self, query, nodes):
+        context_str = self.generate_node_array_preview(nodes)
+        return DEFAULT_QUERY_GENERATION_PROMPT.format(context_str=context_str, query_str=query)
+
+    def generate_node_array_preview(self, nodes):
+        buf = "\n".join(([f"(No.{i+1} | score {n.score:.3f}): {n.text}" for i, n in enumerate(nodes)]))
+        if self.debug_mode: print(buf)
+        return buf
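The `SaveLoad` mixin is what lets the worker survive restarts. Below is a minimal sketch of the save/load round trip, not part of the commit; it assumes the same `llm_kwargs` used elsewhere in this commit, and the directory name is hypothetical.

```python
# Sketch of the checkpoint round trip (paths and user name are hypothetical).
worker = LlamaIndexRagWorker('demo_user', llm_kwargs,
                             checkpoint_dir='./demo_vector_store',
                             auto_load_checkpoint=False)
worker.add_text_to_vector_store("gpt_academic keeps per-user Q&A records as text nodes.")
worker.save_to_checkpoint()  # persists the index as JSON files under checkpoint_dir

# A fresh worker pointed at the same directory picks the index back up.
restored = LlamaIndexRagWorker('demo_user', llm_kwargs,
                               checkpoint_dir='./demo_vector_store',
                               auto_load_checkpoint=True)
nodes = restored.retrieve_from_store_with_query("Q&A records")
print(restored.generate_node_array_preview(nodes))
```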


@@ -118,8 +118,8 @@ def main():
            choices=[
                "常规对话",
                "多模型对话",
+                "智能召回 RAG",
                # "智能上下文",
-                # "智能召回 RAG",
            ], value="常规对话",
            interactive=True, label='', show_label=False,
            elem_classes='normal_mut_select', elem_id="gpt-submit-dropdown").style(container=False)


@@ -5,11 +5,40 @@ from toolbox import CatchException, update_ui, get_conf, select_api_key, get_log
from shared_utils.key_pattern_manager import select_api_key_for_embed_models
from typing import List, Any
-class OpenAiEmbeddingModel():
+import numpy as np
+
+def mean_agg(embeddings):
+    """Mean aggregation for embeddings."""
+    return np.array(embeddings).mean(axis=0).tolist()
+
+class EmbeddingModel():
+    def get_agg_embedding_from_queries(
+        self,
+        queries: List[str],
+        agg_fn = None,
+    ):
+        """Get aggregated embedding from multiple queries."""
+        query_embeddings = [self.get_query_embedding(query) for query in queries]
+        agg_fn = agg_fn or mean_agg
+        return agg_fn(query_embeddings)
+
+    def get_text_embedding_batch(
+        self,
+        texts: List[str],
+        show_progress: bool = False,
+    ):
+        return self.compute_embedding(texts, batch_mode=True)
+
+class OpenAiEmbeddingModel(EmbeddingModel):
    def __init__(self, llm_kwargs:dict=None):
        self.llm_kwargs = llm_kwargs

+    def get_query_embedding(self, query: str):
+        return self.compute_embedding(query)
+
    def compute_embedding(self, text="这是要计算嵌入的文本", llm_kwargs:dict=None, batch_mode=False):
        from .bridge_all_embed import embed_model_info
@@ -20,9 +49,9 @@ class OpenAiEmbeddingModel():
            raise RuntimeError("llm_kwargs is not provided!")

        # setup api and req url
-        api_key = select_api_key_for_embed_models(llm_kwargs['api_key'], llm_kwargs['llm_model'])
-        embed_model = llm_kwargs['llm_model']
-        base_url = embed_model_info[llm_kwargs['llm_model']]['embed_endpoint'].replace('embeddings', '')
+        api_key = select_api_key_for_embed_models(llm_kwargs['api_key'], llm_kwargs['embed_model'])
+        embed_model = llm_kwargs['embed_model']
+        base_url = embed_model_info[llm_kwargs['embed_model']]['embed_endpoint'].replace('embeddings', '')

        # send and compute
        with ProxyNetworkActivate("Connect_OpenAI_Embedding"):
@@ -40,21 +69,11 @@ class OpenAiEmbeddingModel():
            embedding = [d.embedding for d in res.data]
        else:
            embedding = res.data[0].embedding
        return embedding

    def embedding_dimension(self, llm_kwargs):
        from .bridge_all_embed import embed_model_info
-        return embed_model_info[llm_kwargs['llm_model']]['embed_dimension']
+        return embed_model_info[llm_kwargs['embed_model']]['embed_dimension']
-
-    def get_text_embedding_batch(
-        self,
-        texts: List[str],
-        show_progress: bool = False,
-    ):
-        return self.compute_embedding(texts, batch_mode=True)

if __name__ == "__main__":
    pass
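Since the embedding model is now selected via the `embed_model` key rather than `llm_model`, a caller can pair any chat model with any embedding model. A hedged usage sketch follows, assuming "text-embedding-3-small" is registered in `bridge_all_embed.embed_model_info` and an OpenAI key is configured.

```python
# Usage sketch (not from the commit).
from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel
from shared_utils.connect_void_terminal import get_chat_default_kwargs

llm_kwargs = get_chat_default_kwargs()['llm_kwargs']
llm_kwargs.update({'embed_model': "text-embedding-3-small"})  # read from 'embed_model' now, not 'llm_model'

embedder = OpenAiEmbeddingModel(llm_kwargs)
single_vec = embedder.get_query_embedding("你好")                    # one text -> one vector
batch_vecs = embedder.get_text_embedding_batch(["hello", "world"])  # list of texts -> list of vectors
print(len(single_vec), len(batch_vecs))
```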


@@ -1,68 +1,134 @@
def validate_path():
    import os, sys
    os.path.dirname(__file__)
    root_dir_assume = os.path.abspath(os.path.dirname(__file__) + "/..")
    os.chdir(root_dir_assume)
    sys.path.append(root_dir_assume)

validate_path()  # validate path so you can run from base directory

+# # """
+# # Test 1
+# # """
+# # from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel
+# # from shared_utils.connect_void_terminal import get_chat_default_kwargs
+# # oaiem = OpenAiEmbeddingModel()
+# # chat_kwargs = get_chat_default_kwargs()
+# # llm_kwargs = chat_kwargs['llm_kwargs']
+# # llm_kwargs.update({
+# #     'llm_model': "text-embedding-3-small"
+# # })
+# # res = oaiem.compute_embedding("你好", llm_kwargs)
+# # print(res)

# """
-# Test 1
+# Test 2
# """
# from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel
-# from shared_utils.connect_void_terminal import get_chat_default_kwargs
-# oaiem = OpenAiEmbeddingModel()
-# chat_kwargs = get_chat_default_kwargs()
-# llm_kwargs = chat_kwargs['llm_kwargs']
-# llm_kwargs.update({
-#     'llm_model': "text-embedding-3-small"
-# })
-# res = oaiem.compute_embedding("你好", llm_kwargs)
-# print(res)
-"""
-Test 2
-"""
-from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel
from shared_utils.connect_void_terminal import get_chat_default_kwargs
-from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
-from crazy_functions.rag_fns.vector_store_index import GptacVectorStoreIndex
-from llama_index.core.ingestion import run_transformations
+# from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
+# from crazy_functions.rag_fns.vector_store_index import GptacVectorStoreIndex
+# from llama_index.core.ingestion import run_transformations
+# from llama_index.core import PromptTemplate
+# from llama_index.core.response_synthesizers import TreeSummarize
+
+# # NOTE: we add an extra tone_name variable here
+# DEFAULT_QUESTION_GENERATION_PROMPT = """\
+# Context information is below.
+# ---------------------
+# {context_str}
+# ---------------------
+# Given the context information and not prior knowledge.
+# generate only questions based on the below query.
+# {query_str}
+# """

chat_kwargs = get_chat_default_kwargs()
llm_kwargs = chat_kwargs['llm_kwargs']
llm_kwargs.update({
    'llm_model': "text-embedding-3-small"
})
-embed_model = OpenAiEmbeddingModel(llm_kwargs)
-## dir
-documents = SimpleDirectoryReader("private_upload/rag_test/").load_data()
-## single files
-# from llama_index.core import Document
-# text_list = [text1, text2, ...]
-# documents = [Document(text=t) for t in text_list]
-vsi = GptacVectorStoreIndex.default_vector_store(embed_model=embed_model)
-documents_nodes = run_transformations(
-    documents, # type: ignore
-    vsi._transformations,
-    show_progress=True
-)
-index = vsi.insert_nodes(documents_nodes)
-query_engine = index.as_query_engine()
-response = query_engine.query("Some question about the data should go here")
-print(response)
+# embed_model = OpenAiEmbeddingModel(llm_kwargs)
+# ## dir
+# documents = SimpleDirectoryReader("private_upload/rag_test/").load_data()
+# ## single files
+# # from llama_index.core import Document
+# # text_list = [text1, text2, ...]
+# # documents = [Document(text=t) for t in text_list]
+# vsi = GptacVectorStoreIndex.default_vector_store(embed_model=embed_model)
+# documents_nodes = run_transformations(
+#     documents, # type: ignore
+#     vsi._transformations,
+#     show_progress=True
+# )
+# index = vsi.insert_nodes(documents_nodes)
+# retriever = vsi.as_retriever()
+# query = "what is core_functional.py"
+# res = retriever.retrieve(query)
+# context_str = '\n'.join([r.text for r in res])
+# query_str = query
+# query = DEFAULT_QUESTION_GENERATION_PROMPT.format(context_str=context_str, query_str=query_str)
+# print(res)
+# print(res)
+# # response = query_engine.query("Some question about the data should go here")
+# # print(response)

+from crazy_functions.rag_fns.llama_index_worker import LlamaIndexRagWorker
+rag_worker = LlamaIndexRagWorker('good-man-user', llm_kwargs, checkpoint_dir='./good_man_vector_store')
+
+# rag_worker.add_text_to_vector_store("""
+# 熊童子Cotyledon tomentosa是景天科,银波锦属的多年生肉质草本植物,植株多分枝,茎绿色,肉质叶肥厚,交互对生,卵圆形,绿色,密生白色短毛。叶端具红色爪样齿,二歧聚伞花序,小花黄色,花期7-9月。
+# 该种原产于南非开普省。喜温暖干燥,阳光充足,通风良好的环境。夏季温度过高会休眠。忌寒冷和过分潮湿。繁殖方法有扦插。
+# 该种叶形叶色较美,花朵玲珑小巧,叶片形似小熊的脚掌,形态奇特,十分可爱,观赏价值很高。
+# 物种索引IN4679748
+# """)
+# rag_worker.add_text_to_vector_store("""
+# 碧光环是番杏科碧光玉属 [4]多年生肉质草本植物。 [5]碧光环叶表面有半透明的颗粒感,晶莹剔透;两片圆柱形的叶子,在生长初期像兔耳,非常可爱,长大后叶子会慢慢变长变粗,缺水时容易耷拉下来;具枝干,易群生。
+# 碧光环原产于南非。碧光环喜温暖和散射光充足的环境,较耐寒,忌强光暴晒,夏季高温休眠明显。 [6]碧光环的繁殖方式有扦插和播种。 [7]
+# 碧光环小巧饱满、圆滚滚的样子很可爱,长得好像长耳朵小兔,萌萌的样子让人爱不释手,而且养起来也不难,极具观赏价值。 [8]
+# 物种索引IN985654
+# """)
+# rag_worker.add_text_to_vector_store("""
+# 福娘为景天科银波锦属的肉质草本植物。对生的叶片呈短棒状,叶色灰绿,表覆白粉,叶缘外围镶着紫红色,叶片外形多有变化有短圆形、厚厚的方形等不同叶形; [5]花期夏秋。 [6]
+# 福娘原产于非洲西南部的纳米比亚,现世界多地均有栽培。性喜欢凉爽通风、日照充足的环境,较喜光照,喜肥,生长适温为15-25℃,冬季温度不低于5℃,生长期要见干见湿。 [7]在通风透气、排水良好的土壤上生长良好,一般可用泥炭土、蛭石和珍珠岩的混合土。繁殖方式一般为扦插繁殖,多用枝插,叶插的繁殖成功率不高。 [8]
+# 因福娘的叶形叶色较美,所以具有一定的观赏价值,可盆栽放置于电视、电脑旁,吸收辐射,亦可栽植于室内以吸收甲醛等物质,净化空气。 [9]
+# 物种索引IN772
+# """)
+# rag_worker.add_text_to_vector_store("""
+# 石莲( Sinocrassula indica (Decne.) A. Berger是景天科石莲属 [8]的二年生草本植物。基生叶莲座状,匙状长圆形;茎生叶互生,宽倒披针状线形或近倒卵形;花序圆锥状或近伞房状,萼片呈宽三角形,花瓣呈红色,披针形或卵形,雄蕊呈正方形;蓇葖果的喙反曲;种子平滑;花期9月;果期10月 [9]。锯叶石莲为石莲的变种,与原变种的不同处为叶上部有渐尖的锯齿。茎和花无毛,叶被毛 [10]。因叶子有棱有角,又似玉石,故而得名“石莲” [11]。
+# 物种索引IN455674
+# """)
+# rag_worker.add_text_to_vector_store("""
+# 虹之玉锦Sedum × rubrotinctum 'Aurora' [1]是景天科景天属的多肉植物,为虹之玉的锦化品种。虹之玉锦与虹之玉的叶片大小没有特别大的变化,但颜色会稍有不同,虹之玉锦一般会有粉红色、中绿色等 [2]。生长速度较虹之玉慢很多 [3]。
+# 物种索引IN88
+# """)
+
+query = '福娘的物种'
+nodes = rag_worker.retrieve_from_store_with_query(query)
+build_prompt = rag_worker.build_prompt(query, nodes)
+preview = rag_worker.generate_node_array_preview(nodes)
+print(preview)
+print(build_prompt)
+print(nodes)
+
+# vs = rag_worker.load_from_checkpoint('./good_man_vector_store')
+# rag_worker.add_text_to_vector_store(r"I see that the (0.6.0) index persisted on disk contains: docstore.json, index_store.json and vector_store.json, but they don't seem to contain file paths or title metadata from the original documents, so maybe that's not captured and stored?")
+# rag_worker.add_text_to_vector_store(r"Thanks! I'm trying to cluster (all) the vectors, then generate a description (label) for each cluster by sending (just) the vectors in each cluster to GPT to summarize, then associate the vectors with the original documents and classify each document by applying a sort of weighted sum of its cluster-labeled snippets. Not sure how useful that will be, but I want to try! I've got the vectors now (although I'm bit worried that the nested structure I'm getting them from might change without warning in the future!), and I'm able to cluster them, but I don't know how to associate the vectors (via their nodes) back to the original documents yet...")
+# res = rag_worker.retrieve_from_store_with_query('cluster')
+# rag_worker.save_to_checkpoint(checkpoint_dir = './good_man_vector_store')
+# print(vs)


@@ -1354,6 +1354,11 @@ async function multiplex_function_begin(multiplex_sel) {
        call_plugin_via_name(_align_name_in_crazy_function_py);
        return;
    }
+    if (multiplex_sel === "智能召回 RAG") {
+        let _align_name_in_crazy_function_py = "Rag智能召回";
+        call_plugin_via_name(_align_name_in_crazy_function_py);
+        return;
+    }
}

async function run_multiplex_shift(multiplex_sel){
    let key = multiplex_sel;


@@ -100,16 +100,19 @@ def ArgsGeneralWrapper(f):
        user_name = request.username
    else:
        user_name = default_user_name
+    embed_model = get_conf("EMBEDDING_MODEL")
    cookies.update({
        'top_p': top_p,
        'api_key': cookies['api_key'],
        'llm_model': llm_model,
+        'embed_model': embed_model,
        'temperature': temperature,
        'user_name': user_name,
    })
    llm_kwargs = {
        'api_key': cookies['api_key'],
        'llm_model': llm_model,
+        'embed_model': embed_model,
        'top_p': top_p,
        'max_length': max_length,
        'temperature': temperature,
@@ -621,9 +624,12 @@ def load_chat_cookies():
            }
        }
    )
+    EMBEDDING_MODEL = get_conf("EMBEDDING_MODEL")
    return {
        "api_key": API_KEY,
        "llm_model": LLM_MODEL,
+        "embed_model": EMBEDDING_MODEL,
        "customize_fn_overwrite": customize_fn_overwrite_,
    }
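The point of threading `embed_model` through the cookies and `llm_kwargs` is that downstream plugins can construct the embedder without reading the config themselves. A hypothetical snippet illustrating this, not part of the commit:

```python
# Hypothetical helper (not from the commit): any plugin that receives llm_kwargs
# can now build the embedder directly, because ArgsGeneralWrapper injects 'embed_model'.
from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel

def embed_user_input(txt, llm_kwargs):
    embedder = OpenAiEmbeddingModel(llm_kwargs)  # picks up 'api_key' and 'embed_model'
    return embedder.get_query_embedding(txt)
```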