Mirrored from https://github.com/binary-husky/gpt_academic.git
Synced 2025-12-06 14:36:48 +00:00

Commit: begin rag project with llama index
config.py:
@@ -296,7 +296,7 @@ ARXIV_CACHE_DIR = "gpt_log/arxiv_cache"
 
 # Besides connecting to OpenAI, these are the other scenarios where the proxy may be used; please avoid modifying
 WHEN_TO_USE_PROXY = ["Download_LLM", "Download_Gradio_Theme", "Connect_Grobid",
-                     "Warmup_Modules", "Nougat_Download", "AutoGen"]
+                     "Warmup_Modules", "Nougat_Download", "AutoGen", "Connect_OpenAI_Embedding"]
 
 
 # Enable plugin hot reloading
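The new "Connect_OpenAI_Embedding" entry lets the embedding request in openai_embed.py (below) route through the configured proxy. A minimal sketch of the whitelist pattern follows; the context manager, its name, and the proxy URL are illustrative assumptions, not gpt_academic's actual ProxyNetworkActivate implementation.

# Sketch only: how a per-task proxy whitelist is typically consumed.
# proxy_network_activate and the proxy URL are hypothetical stand-ins.
import os
from contextlib import contextmanager

WHEN_TO_USE_PROXY = ["Download_LLM", "Connect_OpenAI_Embedding"]

@contextmanager
def proxy_network_activate(task: str, proxy_url: str = "http://127.0.0.1:7890"):
    saved = {k: os.environ.get(k) for k in ("HTTP_PROXY", "HTTPS_PROXY")}
    if task in WHEN_TO_USE_PROXY:
        # only whitelisted tasks get the proxy environment variables
        os.environ["HTTP_PROXY"] = os.environ["HTTPS_PROXY"] = proxy_url
    try:
        yield
    finally:
        # restore whatever was there before
        for k, v in saved.items():
            if v is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = v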
@@ -0,0 +1,34 @@
import llama_index


class rag_worker():
    # skeleton for the RAG worker; all methods are stubs for now

    def __init__(self) -> None:
        pass

    def assign_embedding_model(self):
        pass

    def save_to_checkpoint(self):
        pass

    def load_from_checkpoint(self):
        pass

    def add_documents_to_vector_store(self, documents):
        pass

    def add_text_to_vector_store(self, documents):
        pass

    def inspect_vector_store(self):
        pass

    def retrieve_from_store_with_query(self, query):
        pass

    @staticmethod
    def build_prompt():  # a staticmethod takes no self argument
        pass
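Every method above is a stub. A hedged sketch of how they might be wired to the other files in this commit — the bodies below are assumptions modeled on tests/test_embed.py at the end of this diff, not the author's eventual implementation:

# Assumed wiring only — in the commit itself every method is `pass`.
from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel
from crazy_functions.rag_fns.vector_store_index import GptacVectorStoreIndex
from llama_index.core.ingestion import run_transformations

class rag_worker_sketch:
    def __init__(self, llm_kwargs: dict) -> None:
        self.embed_model = OpenAiEmbeddingModel(llm_kwargs)
        self.vsi = GptacVectorStoreIndex.default_vector_store(embed_model=self.embed_model)

    def add_documents_to_vector_store(self, documents):
        # split documents into nodes, then embed and store them
        nodes = run_transformations(documents, self.vsi._transformations)
        self.vsi.insert_nodes(nodes)

    def retrieve_from_store_with_query(self, query):
        return self.vsi.as_query_engine().query(query)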
crazy_functions/rag_fns/vector_store_index.py (new file; path per the import in tests/test_embed.py below):
@@ -0,0 +1,58 @@
from typing import Any, List, Optional

from llama_index.core import VectorStoreIndex
from llama_index.core.callbacks.base import CallbackManager
from llama_index.core.schema import TransformComponent
from llama_index.core.service_context import ServiceContext
from llama_index.core.settings import (
    Settings,
    callback_manager_from_settings_or_context,
    transformations_from_settings_or_context,
)
from llama_index.core.storage.storage_context import StorageContext


class GptacVectorStoreIndex(VectorStoreIndex):

    @classmethod
    def default_vector_store(
        cls,
        storage_context: Optional[StorageContext] = None,
        show_progress: bool = False,
        callback_manager: Optional[CallbackManager] = None,
        transformations: Optional[List[TransformComponent]] = None,
        # deprecated
        service_context: Optional[ServiceContext] = None,
        embed_model=None,
        **kwargs: Any,
    ):
        """Create an empty index backed by the default (in-memory) vector store.

        No documents are ingested here (note ``nodes=[]``); callers add
        content afterwards, e.g. via ``insert_nodes``.
        """
        storage_context = storage_context or StorageContext.from_defaults()
        callback_manager = (
            callback_manager
            or callback_manager_from_settings_or_context(Settings, service_context)
        )
        transformations = transformations or transformations_from_settings_or_context(
            Settings, service_context
        )

        with callback_manager.as_trace("index_construction"):
            return cls(
                nodes=[],
                storage_context=storage_context,
                callback_manager=callback_manager,
                show_progress=show_progress,
                transformations=transformations,
                service_context=service_context,
                embed_model=embed_model,
                **kwargs,
            )
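Intended usage, condensed from tests/test_embed.py below: create the empty index, run the ingestion transformations yourself, then insert the resulting nodes. `embed_model` and `documents` are built exactly as in that test.

# Condensed from tests/test_embed.py in this same commit.
from llama_index.core.ingestion import run_transformations

vsi = GptacVectorStoreIndex.default_vector_store(embed_model=embed_model)
nodes = run_transformations(documents, vsi._transformations, show_progress=True)
vsi.insert_nodes(nodes)  # mutates the index in place; returns None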
request_llms/embed_models/bridge_all_embed.py (new file; path per the relative import in openai_embed.py below):
@@ -0,0 +1,40 @@
import tiktoken, copy, re
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor
from toolbox import get_conf, trimmed_format_exc, apply_gpt_academic_string_mask, read_one_api_model_name

# Endpoint redirection
API_URL_REDIRECT, AZURE_ENDPOINT, AZURE_ENGINE = get_conf("API_URL_REDIRECT", "AZURE_ENDPOINT", "AZURE_ENGINE")
openai_endpoint = "https://api.openai.com/v1/chat/completions"
if not AZURE_ENDPOINT.endswith('/'): AZURE_ENDPOINT += '/'
azure_endpoint = AZURE_ENDPOINT + f'openai/deployments/{AZURE_ENGINE}/chat/completions?api-version=2023-05-15'

if openai_endpoint in API_URL_REDIRECT: openai_endpoint = API_URL_REDIRECT[openai_endpoint]

# derive the embeddings endpoint from the (possibly redirected) chat endpoint
openai_embed_endpoint = openai_endpoint.replace("chat/completions", "embeddings")

from .openai_embed import OpenAiEmbeddingModel

embed_model_info = {
    # text-embedding-3-small: increased performance over the 2nd-generation ada model | 1,536 dims
    "text-embedding-3-small": {
        "embed_class": OpenAiEmbeddingModel,
        "embed_endpoint": openai_embed_endpoint,
        "embed_dimension": 1536,
    },

    # text-embedding-3-large: most capable embedding model for both English and non-English tasks | 3,072 dims
    "text-embedding-3-large": {
        "embed_class": OpenAiEmbeddingModel,
        "embed_endpoint": openai_embed_endpoint,
        "embed_dimension": 3072,
    },

    # text-embedding-ada-002: most capable 2nd-generation model, replacing 16 first-generation models | 1,536 dims
    "text-embedding-ada-002": {
        "embed_class": OpenAiEmbeddingModel,
        "embed_endpoint": openai_embed_endpoint,
        "embed_dimension": 1536,
    },
}
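The registry is keyed by model name, so a caller can resolve the class, endpoint, and vector dimension in one lookup. A small illustrative sketch — the api_key value is a placeholder, and the llm_kwargs shape follows openai_embed.py below:

# Illustrative lookup; the api_key value is a placeholder.
model_name = "text-embedding-3-small"
info = embed_model_info[model_name]
embedder = info["embed_class"]({"llm_model": model_name, "api_key": "sk-..."})
print(info["embed_dimension"])  # 1536
print(info["embed_endpoint"])   # e.g. https://api.openai.com/v1/embeddings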
request_llms/embed_models/openai_embed.py (new file; path per the import in tests/test_embed.py below):
@@ -0,0 +1,60 @@
from llama_index.embeddings.openai import OpenAIEmbedding
from openai import OpenAI
from toolbox import CatchException, update_ui, get_conf, select_api_key, get_log_folder, ProxyNetworkActivate
from shared_utils.key_pattern_manager import select_api_key_for_embed_models
from typing import List, Any


class OpenAiEmbeddingModel():

    def __init__(self, llm_kwargs: dict = None):
        self.llm_kwargs = llm_kwargs

    def compute_embedding(self, text="the text whose embedding will be computed", llm_kwargs: dict = None, batch_mode=False):
        # imported lazily to avoid a circular import with bridge_all_embed
        from .bridge_all_embed import embed_model_info

        # load kwargs, falling back to the ones given at construction time
        if llm_kwargs is None:
            llm_kwargs = self.llm_kwargs
        if llm_kwargs is None:
            raise RuntimeError("llm_kwargs is not provided!")

        # set up the api key and request url
        api_key = select_api_key_for_embed_models(llm_kwargs['api_key'], llm_kwargs['llm_model'])
        embed_model = llm_kwargs['llm_model']
        base_url = embed_model_info[llm_kwargs['llm_model']]['embed_endpoint'].replace('embeddings', '')

        # send the request and compute the embedding
        with ProxyNetworkActivate("Connect_OpenAI_Embedding"):
            self.oai_client = OpenAI(api_key=api_key, base_url=base_url)
            if batch_mode:
                assert isinstance(text, list)
                inputs = text
            else:
                assert isinstance(text, str)
                inputs = [text]
            res = self.oai_client.embeddings.create(input=inputs, model=embed_model)

        # parse the result
        if batch_mode:
            embedding = [d.embedding for d in res.data]
        else:
            embedding = res.data[0].embedding

        return embedding

    def embedding_dimension(self, llm_kwargs):
        from .bridge_all_embed import embed_model_info
        return embed_model_info[llm_kwargs['llm_model']]['embed_dimension']

    def get_text_embedding_batch(
        self,
        texts: List[str],
        show_progress: bool = False,
    ):
        return self.compute_embedding(texts, batch_mode=True)


if __name__ == "__main__":
    pass
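A minimal call pattern, assuming a valid OpenAI key; this mirrors the commented-out Test 1 in tests/test_embed.py below, and the key is a placeholder:

# Placeholder credentials; vector lengths follow embed_model_info above.
llm_kwargs = {"llm_model": "text-embedding-3-small", "api_key": "sk-..."}
model = OpenAiEmbeddingModel(llm_kwargs)
vec = model.compute_embedding("hello")             # one vector, 1536 floats
vecs = model.get_text_embedding_batch(["a", "b"])  # list of two vectors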
shared_utils/key_pattern_manager.py (path per the import in openai_embed.py above):
@@ -95,3 +95,19 @@ def select_api_key(keys, llm_model):
     api_key = random.choice(avail_key_list) # random load balancing
     return api_key
+
+
+def select_api_key_for_embed_models(keys, llm_model):
+    import random
+    avail_key_list = []
+    key_list = keys.split(',')
+
+    if llm_model.startswith('text-embedding-'):
+        for k in key_list:
+            if is_openai_api_key(k): avail_key_list.append(k)
+
+    if len(avail_key_list) == 0:
+        raise RuntimeError(f"The api-keys you provided do not meet the requirements: none of them can be used with {llm_model}. You may have selected the wrong model or request source.")
+
+    api_key = random.choice(avail_key_list) # random load balancing
+    return api_key
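Behavior in short: only OpenAI-format keys qualify for the text-embedding-* models, and one of them is picked at random. The keys below are fake placeholders:

# Fake keys for illustration; is_openai_api_key is the project's existing validator.
keys = "sk-aaaa,sk-bbbb,not-a-valid-key"
k = select_api_key_for_embed_models(keys, "text-embedding-3-small")
# -> one of the sk-* keys at random (assuming they pass is_openai_api_key);
#    if no key matches, or the model name does not start with
#    "text-embedding-", a RuntimeError is raised.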
tests/test_embed.py · 68 lines · new file
@@ -0,0 +1,68 @@
def validate_path():
    # make the repo root the working directory and importable
    import os, sys

    root_dir_assume = os.path.abspath(os.path.dirname(__file__) + "/..")
    os.chdir(root_dir_assume)
    sys.path.append(root_dir_assume)


validate_path()  # validate path so you can run from the base directory


# """
# Test 1
# """

# from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel
# from shared_utils.connect_void_terminal import get_chat_default_kwargs
# oaiem = OpenAiEmbeddingModel()

# chat_kwargs = get_chat_default_kwargs()
# llm_kwargs = chat_kwargs['llm_kwargs']
# llm_kwargs.update({
#     'llm_model': "text-embedding-3-small"
# })

# res = oaiem.compute_embedding("hello", llm_kwargs)
# print(res)


"""
Test 2
"""

from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel
from shared_utils.connect_void_terminal import get_chat_default_kwargs
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from crazy_functions.rag_fns.vector_store_index import GptacVectorStoreIndex
from llama_index.core.ingestion import run_transformations

chat_kwargs = get_chat_default_kwargs()
llm_kwargs = chat_kwargs['llm_kwargs']
llm_kwargs.update({
    'llm_model': "text-embedding-3-small"
})
embed_model = OpenAiEmbeddingModel(llm_kwargs)

# load every document in a directory
documents = SimpleDirectoryReader("private_upload/rag_test/").load_data()

# or build documents from raw strings instead:
# from llama_index.core import Document
# text_list = [text1, text2, ...]
# documents = [Document(text=t) for t in text_list]
vsi = GptacVectorStoreIndex.default_vector_store(embed_model=embed_model)
documents_nodes = run_transformations(
    documents,  # type: ignore
    vsi._transformations,
    show_progress=True
)
vsi.insert_nodes(documents_nodes)  # insert_nodes mutates the index in place and returns None

query_engine = vsi.as_query_engine()
response = query_engine.query("Some question about the data should go here")
print(response)