From 294716c8321e407b9815a1deeefe6e2d66b9f0c8 Mon Sep 17 00:00:00 2001
From: binary-husky
Date: Wed, 21 Aug 2024 14:24:37 +0000
Subject: [PATCH] begin rag project with llama index

---
 config.py                                     |  2 +-
 crazy_functions/rag_fns/llama_index_worker.py | 39 ++++++++++
 crazy_functions/rag_fns/vector_store_index.py | 58 ++++++++++++++++
 request_llms/embed_models/bridge_all_embed.py | 38 ++++++++++
 request_llms/embed_models/openai_embed.py     | 63 ++++++++++++++++
 shared_utils/key_pattern_manager.py           | 16 +++++
 tests/test_embed.py                           | 71 +++++++++++++++++++
 7 files changed, 286 insertions(+), 1 deletion(-)
 create mode 100644 crazy_functions/rag_fns/llama_index_worker.py
 create mode 100644 crazy_functions/rag_fns/vector_store_index.py
 create mode 100644 request_llms/embed_models/bridge_all_embed.py
 create mode 100644 request_llms/embed_models/openai_embed.py
 create mode 100644 tests/test_embed.py

diff --git a/config.py b/config.py
index b8b84de8..6010989d 100644
--- a/config.py
+++ b/config.py
@@ -296,7 +296,7 @@ ARXIV_CACHE_DIR = "gpt_log/arxiv_cache"
 
 # 除了连接OpenAI之外,还有哪些场合允许使用代理,请尽量不要修改
 WHEN_TO_USE_PROXY = ["Download_LLM", "Download_Gradio_Theme", "Connect_Grobid",
-                     "Warmup_Modules", "Nougat_Download", "AutoGen"]
+                     "Warmup_Modules", "Nougat_Download", "AutoGen", "Connect_OpenAI_Embedding"]
 
 
 # 启用插件热加载
diff --git a/crazy_functions/rag_fns/llama_index_worker.py b/crazy_functions/rag_fns/llama_index_worker.py
new file mode 100644
index 00000000..7c1bcf36
--- /dev/null
+++ b/crazy_functions/rag_fns/llama_index_worker.py
@@ -0,0 +1,39 @@
+import llama_index
+
+
+class rag_worker():
+    """Scaffold for the RAG worker; the methods below are stubs to be implemented."""
+
+    def __init__(self) -> None:
+        pass
+
+    def assign_embedding_model(self):
+        pass
+
+    def save_to_checkpoint(self):
+        pass
+
+    def load_from_checkpoint(self):
+        pass
+
+    def add_documents_to_vector_store(self, documents):
+        pass
+
+    def add_text_to_vector_store(self, text):
+        pass
+
+    def inspect_vector_store(self):
+        pass
+
+    def retrieve_from_store_with_query(self, query):
+        pass
+
+    def build_prompt(self):
+        pass
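+
+# Rough sketch of the intended call flow (aspirational: every method above is
+# still a stub, so none of this runs yet):
+#     worker = rag_worker()
+#     worker.assign_embedding_model()
+#     worker.add_text_to_vector_store("some text")
+#     hits = worker.retrieve_from_store_with_query("a question about that text")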
diff --git a/crazy_functions/rag_fns/vector_store_index.py b/crazy_functions/rag_fns/vector_store_index.py
new file mode 100644
index 00000000..74e8b09d
--- /dev/null
+++ b/crazy_functions/rag_fns/vector_store_index.py
@@ -0,0 +1,58 @@
+from llama_index.core import VectorStoreIndex
+from typing import Any, List, Optional
+
+from llama_index.core.callbacks.base import CallbackManager
+from llama_index.core.schema import TransformComponent
+from llama_index.core.service_context import ServiceContext
+from llama_index.core.settings import (
+    Settings,
+    callback_manager_from_settings_or_context,
+    transformations_from_settings_or_context,
+)
+from llama_index.core.storage.storage_context import StorageContext
+
+
+class GptacVectorStoreIndex(VectorStoreIndex):
+
+    @classmethod
+    def default_vector_store(
+        cls,
+        storage_context: Optional[StorageContext] = None,
+        show_progress: bool = False,
+        callback_manager: Optional[CallbackManager] = None,
+        transformations: Optional[List[TransformComponent]] = None,
+        # deprecated
+        service_context: Optional[ServiceContext] = None,
+        embed_model = None,
+        **kwargs: Any,
+    ):
+        """Create an empty index backed by the default storage context.
+
+        Unlike ``VectorStoreIndex.from_documents``, no documents are ingested
+        here; nodes are added to the index later via ``insert_nodes``.
+        """
+        storage_context = storage_context or StorageContext.from_defaults()
+        callback_manager = (
+            callback_manager
+            or callback_manager_from_settings_or_context(Settings, service_context)
+        )
+        transformations = transformations or transformations_from_settings_or_context(
+            Settings, service_context
+        )
+
+        with callback_manager.as_trace("index_construction"):
+            return cls(
+                nodes=[],
+                storage_context=storage_context,
+                callback_manager=callback_manager,
+                show_progress=show_progress,
+                transformations=transformations,
+                service_context=service_context,
+                embed_model=embed_model,
+                **kwargs,
+            )
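+
+# Minimal usage sketch (assumes `embed_model` is an embedding object accepted by
+# llama-index, and `nodes` comes from e.g. run_transformations; see tests/test_embed.py):
+#     vsi = GptacVectorStoreIndex.default_vector_store(embed_model=embed_model)
+#     vsi.insert_nodes(nodes)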
diff --git a/request_llms/embed_models/bridge_all_embed.py b/request_llms/embed_models/bridge_all_embed.py
new file mode 100644
index 00000000..e3ea8a6f
--- /dev/null
+++ b/request_llms/embed_models/bridge_all_embed.py
@@ -0,0 +1,38 @@
+from toolbox import get_conf
+
+# Endpoint redirection
+API_URL_REDIRECT, AZURE_ENDPOINT, AZURE_ENGINE = get_conf("API_URL_REDIRECT", "AZURE_ENDPOINT", "AZURE_ENGINE")
+openai_endpoint = "https://api.openai.com/v1/chat/completions"
+if not AZURE_ENDPOINT.endswith('/'): AZURE_ENDPOINT += '/'
+# reserved for future Azure embedding support
+azure_endpoint = AZURE_ENDPOINT + f'openai/deployments/{AZURE_ENGINE}/chat/completions?api-version=2023-05-15'
+
+if openai_endpoint in API_URL_REDIRECT: openai_endpoint = API_URL_REDIRECT[openai_endpoint]
+
+# derive the embeddings endpoint from the (possibly redirected) chat endpoint
+openai_embed_endpoint = openai_endpoint.replace("chat/completions", "embeddings")
+
+from .openai_embed import OpenAiEmbeddingModel
+
+embed_model_info = {
+    # text-embedding-3-small | improved performance over the 2nd-generation ada model | 1,536 dims
+    "text-embedding-3-small": {
+        "embed_class": OpenAiEmbeddingModel,
+        "embed_endpoint": openai_embed_endpoint,
+        "embed_dimension": 1536,
+    },
+
+    # text-embedding-3-large | most capable embedding model for English and non-English tasks | 3,072 dims
+    "text-embedding-3-large": {
+        "embed_class": OpenAiEmbeddingModel,
+        "embed_endpoint": openai_embed_endpoint,
+        "embed_dimension": 3072,
+    },
+
+    # text-embedding-ada-002 | most capable 2nd-generation model, replacing 16 1st-generation models | 1,536 dims
+    "text-embedding-ada-002": {
+        "embed_class": OpenAiEmbeddingModel,
+        "embed_endpoint": openai_embed_endpoint,
+        "embed_dimension": 1536,
+    },
+}
diff --git a/request_llms/embed_models/openai_embed.py b/request_llms/embed_models/openai_embed.py
new file mode 100644
index 00000000..9ec2521e
--- /dev/null
+++ b/request_llms/embed_models/openai_embed.py
@@ -0,0 +1,63 @@
+from openai import OpenAI
+from toolbox import ProxyNetworkActivate
+from shared_utils.key_pattern_manager import select_api_key_for_embed_models
+from typing import List
+
+
+class OpenAiEmbeddingModel():
+
+    def __init__(self, llm_kwargs:dict=None):
+        self.llm_kwargs = llm_kwargs
+
+    def compute_embedding(self, text="the text to embed", llm_kwargs:dict=None, batch_mode=False):
+        from .bridge_all_embed import embed_model_info
+
+        # load kwargs
+        if llm_kwargs is None:
+            llm_kwargs = self.llm_kwargs
+        if llm_kwargs is None:
+            raise RuntimeError("llm_kwargs is not provided!")
+
+        # select an api-key and derive the request url
+        api_key = select_api_key_for_embed_models(llm_kwargs['api_key'], llm_kwargs['llm_model'])
+        embed_model = llm_kwargs['llm_model']
+        base_url = embed_model_info[embed_model]['embed_endpoint'].replace('embeddings', '')
+
+        # send the request and compute embeddings
+        with ProxyNetworkActivate("Connect_OpenAI_Embedding"):
+            self.oai_client = OpenAI(api_key=api_key, base_url=base_url)
+            if batch_mode:
+                assert isinstance(text, list)
+                inputs = text
+            else:
+                assert isinstance(text, str)
+                inputs = [text]
+            res = self.oai_client.embeddings.create(input=inputs, model=embed_model)
+
+        # parse the result
+        if batch_mode:
+            embedding = [d.embedding for d in res.data]
+        else:
+            embedding = res.data[0].embedding
+
+        return embedding
+
+    def embedding_dimension(self, llm_kwargs):
+        from .bridge_all_embed import embed_model_info
+        return embed_model_info[llm_kwargs['llm_model']]['embed_dimension']
+
+    def get_text_embedding_batch(
+        self,
+        texts: List[str],
+        show_progress: bool = False,  # accepted for llama-index interface compatibility; unused
+    ):
+        return self.compute_embedding(texts, batch_mode=True)
+
+
+if __name__ == "__main__":
+    pass
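+
+# Quick usage sketch (hypothetical llm_kwargs; a valid OpenAI api-key is required):
+#     llm_kwargs = {'api_key': 'sk-...', 'llm_model': 'text-embedding-3-small'}
+#     vec = OpenAiEmbeddingModel().compute_embedding("hello", llm_kwargs)
+#     assert len(vec) == 1536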
diff --git a/shared_utils/key_pattern_manager.py b/shared_utils/key_pattern_manager.py
index 138a3b09..37784cd8 100644
--- a/shared_utils/key_pattern_manager.py
+++ b/shared_utils/key_pattern_manager.py
@@ -95,3 +95,19 @@ def select_api_key(keys, llm_model):
 
     api_key = random.choice(avail_key_list) # 随机负载均衡
     return api_key
+
+
+def select_api_key_for_embed_models(keys, llm_model):
+    import random
+    avail_key_list = []
+    key_list = keys.split(',')
+
+    if llm_model.startswith('text-embedding-'):
+        for k in key_list:
+            if is_openai_api_key(k): avail_key_list.append(k)
+
+    if len(avail_key_list) == 0:
+        raise RuntimeError(f"None of the provided api-keys can be used with {llm_model}. You may have selected the wrong model or request source.")
+
+    api_key = random.choice(avail_key_list)  # random load balancing
+    return api_key
diff --git a/tests/test_embed.py b/tests/test_embed.py
new file mode 100644
index 00000000..4cda16e0
--- /dev/null
+++ b/tests/test_embed.py
@@ -0,0 +1,71 @@
+def validate_path():
+    import os, sys
+
+    root_dir_assume = os.path.abspath(os.path.dirname(__file__) + "/..")
+    os.chdir(root_dir_assume)
+    sys.path.append(root_dir_assume)
+
+
+validate_path()  # validate path so you can run from base directory
+
+
+# """
+# Test 1: compute a single embedding
+# """
+
+# from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel
+# from shared_utils.connect_void_terminal import get_chat_default_kwargs
+# oaiem = OpenAiEmbeddingModel()
+
+# chat_kwargs = get_chat_default_kwargs()
+# llm_kwargs = chat_kwargs['llm_kwargs']
+# llm_kwargs.update({
+#     'llm_model': "text-embedding-3-small"
+# })
+
+# res = oaiem.compute_embedding("你好", llm_kwargs)
+# print(res)
+
+
+"""
+Test 2: build a vector store index and query it
+"""
+
+from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel
+from shared_utils.connect_void_terminal import get_chat_default_kwargs
+from llama_index.core import SimpleDirectoryReader
+from crazy_functions.rag_fns.vector_store_index import GptacVectorStoreIndex
+from llama_index.core.ingestion import run_transformations
+
+chat_kwargs = get_chat_default_kwargs()
+llm_kwargs = chat_kwargs['llm_kwargs']
+llm_kwargs.update({
+    'llm_model': "text-embedding-3-small"
+})
+embed_model = OpenAiEmbeddingModel(llm_kwargs)
+
+## read a whole directory
+documents = SimpleDirectoryReader("private_upload/rag_test/").load_data()
+
+## or build documents from raw strings
+# from llama_index.core import Document
+# text_list = [text1, text2, ...]
+# documents = [Document(text=t) for t in text_list]
+
+vsi = GptacVectorStoreIndex.default_vector_store(embed_model=embed_model)
+documents_nodes = run_transformations(
+    documents,  # type: ignore
+    vsi._transformations,
+    show_progress=True
+)
+# insert_nodes mutates the index in place and returns None,
+# so query the index object itself afterwards
+vsi.insert_nodes(documents_nodes)
+
+query_engine = vsi.as_query_engine()
+response = query_engine.query("Some question about the data should go here")
+print(response)
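+
+# Optional follow-up (untested sketch): persist the index so it can be reloaded
+# later without re-embedding; "gpt_log/rag_test_storage" is a made-up path.
+#     vsi.storage_context.persist(persist_dir="gpt_log/rag_test_storage")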