Mirrored from https://github.com/binary-husky/gpt_academic.git
Synced 2025-12-06 14:36:48 +00:00

Merge branch 'frontier' into production
@@ -242,6 +242,13 @@ for model in AVAIL_LLM_MODELS:
         mi.update({"endpoint": api2d_endpoint})
         model_info.update({model: mi})
 
+# -=-=-=-=-=-=- azure alignment support -=-=-=-=-=-=-
+for model in AVAIL_LLM_MODELS:
+    if model.startswith('azure-') and (model.replace('azure-','') in model_info.keys()):
+        mi = model_info[model.replace('azure-','')]
+        mi.update({"endpoint": azure_endpoint})
+        model_info.update({model: mi})
+
 # -=-=-=-=-=-=- the models below are newly added and may carry extra dependencies -=-=-=-=-=-=-
 if "claude-1-100k" in AVAIL_LLM_MODELS or "claude-2" in AVAIL_LLM_MODELS:
     from .bridge_claude import predict_no_ui_long_connection as claude_noui
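The added `azure-` loop follows the same aliasing pattern as the `api2d-` block above it: look up the base model's metadata record and re-point its endpoint. A minimal, self-contained sketch of that pattern (the model names, endpoint URLs, and `max_token` value below are placeholders, not the project's real configuration):

```python
# Sketch of the endpoint-aliasing pattern; all data here is illustrative.
model_info = {
    "gpt-3.5-turbo": {"endpoint": "https://api.openai.com/v1/chat/completions", "max_token": 4096},
}
azure_endpoint = "https://YOUR_RESOURCE.openai.azure.com/"  # placeholder
AVAIL_LLM_MODELS = ["gpt-3.5-turbo", "azure-gpt-3.5-turbo"]

for model in AVAIL_LLM_MODELS:
    if model.startswith('azure-') and (model.replace('azure-', '') in model_info.keys()):
        # Copying first keeps the base entry's endpoint intact; the diff above
        # updates the shared dict in place instead.
        mi = dict(model_info[model.replace('azure-', '')])
        mi.update({"endpoint": azure_endpoint})
        model_info.update({model: mi})

assert model_info["azure-gpt-3.5-turbo"]["endpoint"] == azure_endpoint
```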
@@ -564,7 +571,7 @@ def LLM_CATCH_EXCEPTION(f):
     return decorated
 
 
-def predict_no_ui_long_connection(inputs, llm_kwargs, history, sys_prompt, observe_window, console_slience=False):
+def predict_no_ui_long_connection(inputs, llm_kwargs, history, sys_prompt, observe_window=[], console_slience=False):
     """
     Send to the LLM and wait for the reply, completed in one go without showing intermediate progress; internally a streaming request is used to keep the connection from being dropped midway.
     inputs:
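The signature change gives `observe_window` a default, so callers that do not monitor intermediate output can omit it; since the function only writes to the window when `len(observe_window) >= 1`, the shared empty default is never mutated. A hedged usage sketch, assuming this is the dispatcher in request_llms/bridge_all.py; the `llm_kwargs` fields and prompt text are illustrative, not the project's actual values:

```python
import time
from request_llms.bridge_all import predict_no_ui_long_connection  # assumed module path

# Convention suggested by how observe_window[0] is used elsewhere in this
# commit: slot 0 holds the streamed text so far; slot 1 can hold a watchdog
# timestamp that a monitor thread refreshes (assumption).
observe_window = ["", time.time()]

result = predict_no_ui_long_connection(
    inputs="Summarize the following paragraph ...",
    llm_kwargs={"llm_model": "gpt-3.5-turbo", "temperature": 1.0, "top_p": 1.0},  # illustrative
    history=[],
    sys_prompt="You are a helpful assistant.",
    observe_window=observe_window,
)
```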
@@ -4,14 +4,13 @@ cmd_to_install = "`pip install -r request_llms/requirements_chatglm.txt`"
 
 from transformers import AutoModel, AutoTokenizer
 from toolbox import get_conf, ProxyNetworkActivate
-from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
 
 
 
 # ------------------------------------------------------------------------------------------------------------------------
 # 🔌💻 Local Model
 # ------------------------------------------------------------------------------------------------------------------------
-@SingletonLocalLLM
 class GetGLM2Handle(LocalLLMHandle):
 
     def load_model_info(self):
@@ -4,14 +4,13 @@ cmd_to_install = "`pip install -r request_llms/requirements_chatglm.txt`"
 
 from transformers import AutoModel, AutoTokenizer
 from toolbox import get_conf, ProxyNetworkActivate
-from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
 
 
 
 # ------------------------------------------------------------------------------------------------------------------------
 # 🔌💻 Local Model
 # ------------------------------------------------------------------------------------------------------------------------
-@SingletonLocalLLM
 class GetGLM3Handle(LocalLLMHandle):
 
     def load_model_info(self):
@@ -8,7 +8,7 @@ import threading
 import importlib
 from toolbox import update_ui, get_conf
 from multiprocessing import Process, Pipe
-from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
 
 from .chatglmoonx import ChatGLMModel, chat_template
 
@@ -17,7 +17,6 @@ from .chatglmoonx import ChatGLMModel, chat_template
 # ------------------------------------------------------------------------------------------------------------------------
 # 🔌💻 Local Model
 # ------------------------------------------------------------------------------------------------------------------------
-@SingletonLocalLLM
 class GetONNXGLMHandle(LocalLLMHandle):
 
     def load_model_info(self):
@@ -7,8 +7,7 @@
 1. predict: used for normal conversation; fully interactive; not multi-threaded
 
 Functions with multi-threading capability
-2. predict_no_ui: called by advanced experimental feature modules; does not show output in the UI in real time; simple parameters; can run multi-threaded in parallel, which makes complex feature logic easy to implement
-3. predict_no_ui_long_connection: experiments showed that calling predict_no_ui on long documents tends to drop the connection to openai; this function solves that with streaming, and it also supports multi-threading
+2. predict_no_ui_long_connection: supports multi-threading
 """
 
 import json
@@ -7,8 +7,7 @@
 1. predict: used for normal conversation; fully interactive; not multi-threaded
 
 Functions with multi-threading capability
-2. predict_no_ui: called by advanced experimental feature modules; does not show output in the UI in real time; simple parameters; can run multi-threaded in parallel, which makes complex feature logic easy to implement
-3. predict_no_ui_long_connection: experiments showed that calling predict_no_ui on long documents tends to drop the connection to openai; this function solves that with streaming, and it also supports multi-threading
+2. predict_no_ui_long_connection: supports multi-threading
 """
 
 import json
@@ -7,7 +7,7 @@
 1. predict: used for normal conversation; fully interactive; not multi-threaded
 
 Functions with multi-threading capability
-2. predict_no_ui_long_connection: experiments showed that calling predict_no_ui on long documents tends to drop the connection to openai; this function solves that with streaming, and it also supports multi-threading
+2. predict_no_ui_long_connection: supports multi-threading
 """
 
 import os
@@ -5,9 +5,9 @@ from transformers import AutoModel, AutoTokenizer
 import time
 import threading
 import importlib
-from toolbox import update_ui, get_conf
+from toolbox import update_ui, get_conf, ProxyNetworkActivate
 from multiprocessing import Process, Pipe
-from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
 
 
 # ------------------------------------------------------------------------------------------------------------------------
@@ -34,7 +34,6 @@ def combine_history(prompt, hist):
 # ------------------------------------------------------------------------------------------------------------------------
 # 🔌💻 Local Model
 # ------------------------------------------------------------------------------------------------------------------------
-@SingletonLocalLLM
 class GetInternlmHandle(LocalLLMHandle):
 
     def load_model_info(self):
@@ -53,14 +52,15 @@ class GetInternlmHandle(LocalLLMHandle):
         import torch
         from transformers import AutoModelForCausalLM, AutoTokenizer
         device = get_conf('LOCAL_MODEL_DEVICE')
-        if self._model is None:
-            tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
-            if device=='cpu':
-                model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).to(torch.bfloat16)
-            else:
-                model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).to(torch.bfloat16).cuda()
+        with ProxyNetworkActivate('Download_LLM'):
+            if self._model is None:
+                tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
+                if device=='cpu':
+                    model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).to(torch.bfloat16)
+                else:
+                    model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).to(torch.bfloat16).cuda()
 
-        model = model.eval()
+            model = model.eval()
         return model, tokenizer
 
     def llm_stream_generator(self, **kwargs):
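`ProxyNetworkActivate('Download_LLM')` scopes the proxy to the weight download only; its real implementation lives in `toolbox` and is not shown in this commit. A minimal sketch of a context manager with that shape, assuming it works by temporarily exporting the standard proxy environment variables (the function name and proxy address below are hypothetical stand-ins):

```python
import os
from contextlib import contextmanager

@contextmanager
def proxy_network_activate(task="Download_LLM", proxy="http://127.0.0.1:7890"):
    """Hypothetical stand-in for toolbox.ProxyNetworkActivate: enable a proxy
    only while the with-block runs, then restore the previous environment."""
    saved = {k: os.environ.get(k) for k in ("HTTP_PROXY", "HTTPS_PROXY")}
    os.environ["HTTP_PROXY"] = os.environ["HTTPS_PROXY"] = proxy
    try:
        yield
    finally:
        for k, v in saved.items():
            if v is None:
                os.environ.pop(k, None)   # variable was unset before
            else:
                os.environ[k] = v         # restore the previous value
```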
@@ -94,8 +94,9 @@ class GetInternlmHandle(LocalLLMHandle):
 
         inputs = tokenizer([prompt], padding=True, return_tensors="pt")
         input_length = len(inputs["input_ids"][0])
+        device = get_conf('LOCAL_MODEL_DEVICE')
         for k, v in inputs.items():
-            inputs[k] = v.cuda()
+            inputs[k] = v.to(device)
         input_ids = inputs["input_ids"]
         batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
         if generation_config is None:
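Replacing the hard-coded `.cuda()` with `.to(device)` lets the same generator run on whatever device `LOCAL_MODEL_DEVICE` names. The pattern in isolation (a sketch; assumes only that `torch` is installed):

```python
import torch

def move_batch(inputs: dict, device: str) -> dict:
    # .to(device) accepts 'cpu', 'cuda', 'cuda:1', etc., whereas .cuda()
    # unconditionally requires a GPU.
    return {k: v.to(device) for k, v in inputs.items()}

batch = {"input_ids": torch.tensor([[1, 2, 3]]),
         "attention_mask": torch.ones(1, 3, dtype=torch.long)}
batch = move_batch(batch, "cpu")  # or "cuda" when available
```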
@@ -5,14 +5,13 @@ cmd_to_install = "`pip install -r request_llms/requirements_chatglm.txt`"
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from toolbox import update_ui, get_conf, ProxyNetworkActivate
 from multiprocessing import Process, Pipe
-from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
 from threading import Thread
 
 
 # ------------------------------------------------------------------------------------------------------------------------
 # 🔌💻 Local Model
 # ------------------------------------------------------------------------------------------------------------------------
-@SingletonLocalLLM
 class GetONNXGLMHandle(LocalLLMHandle):
 
     def load_model_info(self):
@@ -6,16 +6,15 @@ from transformers import AutoModel, AutoTokenizer
 import time
 import threading
 import importlib
-from toolbox import update_ui, get_conf
+from toolbox import update_ui, get_conf, ProxyNetworkActivate
 from multiprocessing import Process, Pipe
-from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
 
 
 
 # ------------------------------------------------------------------------------------------------------------------------
 # 🔌💻 Local Model
 # ------------------------------------------------------------------------------------------------------------------------
-@SingletonLocalLLM
 class GetONNXGLMHandle(LocalLLMHandle):
 
     def load_model_info(self):
@@ -30,13 +29,13 @@ class GetONNXGLMHandle(LocalLLMHandle):
         import platform
         from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
 
-        model_id = 'qwen/Qwen-7B-Chat'
-        revision = 'v1.0.1'
-        self._tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision, trust_remote_code=True)
-        # use fp16
-        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", revision=revision, trust_remote_code=True, fp16=True).eval()
-        model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True)  # different generation lengths, top_p, and other hyperparameters can be specified
-        self._model = model
+        with ProxyNetworkActivate('Download_LLM'):
+            model_id = 'qwen/Qwen-7B-Chat'
+            self._tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B-Chat', trust_remote_code=True, resume_download=True)
+            # use fp16
+            model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True, fp16=True).eval()
+            model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True)  # different generation lengths, top_p, and other hyperparameters can be specified
+            self._model = model
 
         return self._model, self._tokenizer
@@ -1,6 +1,6 @@
 import time
 import threading
-from toolbox import update_ui
+from toolbox import update_ui, Singleton
 from multiprocessing import Process, Pipe
 from contextlib import redirect_stdout
 from request_llms.queued_pipe import create_queue_pipe
@@ -26,23 +26,20 @@ class ThreadLock(object):
     def __exit__(self, type, value, traceback):
         self.release()
 
-def SingletonLocalLLM(cls):
-    """
-    Singleton Decroator for LocalLLMHandle
-    """
-    _instance = {}
-    def _singleton(*args, **kargs):
-        if cls not in _instance:
-            _instance[cls] = cls(*args, **kargs)
-            return _instance[cls]
-        elif _instance[cls].corrupted:
-            _instance[cls] = cls(*args, **kargs)
-            return _instance[cls]
-        else:
-            return _instance[cls]
-    return _singleton
+@Singleton
+class GetSingletonHandle():
+    def __init__(self):
+        self.llm_model_already_running = {}
+
+    def get_llm_model_instance(self, cls, *args, **kargs):
+        if cls not in self.llm_model_already_running:
+            self.llm_model_already_running[cls] = cls(*args, **kargs)
+            return self.llm_model_already_running[cls]
+        elif self.llm_model_already_running[cls].corrupted:
+            self.llm_model_already_running[cls] = cls(*args, **kargs)
+            return self.llm_model_already_running[cls]
+        else:
+            return self.llm_model_already_running[cls]
 
 def reset_tqdm_output():
     import sys, tqdm
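The net effect of this hunk: the per-class `SingletonLocalLLM` decorator is replaced by a single `@Singleton`-wrapped registry that caches one handle per model class and rebuilds it when the cached instance is flagged `corrupted`. A self-contained sketch of the same idea (the `singleton` decorator below is a plausible stand-in for `toolbox.Singleton`, whose implementation this commit does not show):

```python
def singleton(cls):
    # Plausible stand-in for toolbox.Singleton: one shared instance per class.
    instances = {}
    def get_instance(*args, **kwargs):
        if cls not in instances:
            instances[cls] = cls(*args, **kwargs)
        return instances[cls]
    return get_instance

@singleton
class ModelRegistry:
    def __init__(self):
        self.running = {}

    def get(self, cls, *args, **kwargs):
        # Build on first request; rebuild if the cached handle went bad.
        if cls not in self.running or self.running[cls].corrupted:
            self.running[cls] = cls(*args, **kwargs)
        return self.running[cls]

class DummyHandle:
    corrupted = False

# Both calls hit the same registry instance and return the same cached handle.
assert ModelRegistry().get(DummyHandle) is ModelRegistry().get(DummyHandle)
```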
@@ -76,7 +73,6 @@ class LocalLLMHandle(Process):
         self.parent_state, self.child_state = create_queue_pipe()
         # allow redirect_stdout
         self.std_tag = "[Subprocess Message] "
-        self.child.write = lambda x: self.child.send(self.std_tag + x)
         self.running = True
         self._model = None
         self._tokenizer = None
@@ -137,6 +133,8 @@ class LocalLLMHandle(Process):
     def run(self):
         # 🏃‍♂️🏃‍♂️🏃‍♂️ run in child process
         # first run: load the model parameters
+        self.child.flush = lambda *args: None
+        self.child.write = lambda x: self.child.send(self.std_tag + x)
         reset_tqdm_output()
         self.set_state("`尝试加载模型`")
         try:
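Assigning `write` and `flush` onto the pipe endpoint duck-types it into a file-like object, so anything the child process prints under `redirect_stdout` is forwarded to the parent tagged with `std_tag`; moving the assignment into `run()` means it happens inside the child process itself. A standalone sketch of the trick:

```python
from multiprocessing import Pipe, Process
from contextlib import redirect_stdout

def child_main(conn):
    # Duck-type the Connection as a writable stream.
    conn.write = lambda x: conn.send("[Subprocess Message] " + x)
    conn.flush = lambda *args: None
    with redirect_stdout(conn):
        print("loading weights ...")

if __name__ == "__main__":
    parent, child = Pipe()
    p = Process(target=child_main, args=(child,))
    p.start()
    print(parent.recv())  # "[Subprocess Message] loading weights ..."
    p.join()              # (print also sends a second message for the newline)
```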
@@ -220,7 +218,7 @@ def get_local_llm_predict_fns(LLMSingletonClass, model_name, history_format='cla
        """
        refer to request_llms/bridge_all.py
        """
-        _llm_handle = LLMSingletonClass()
+        _llm_handle = GetSingletonHandle().get_llm_model_instance(LLMSingletonClass)
         if len(observe_window) >= 1:
             observe_window[0] = load_message + "\n\n" + _llm_handle.get_state()
         if not _llm_handle.running:
@@ -268,7 +266,7 @@ def get_local_llm_predict_fns(LLMSingletonClass, model_name, history_format='cla
        """
        chatbot.append((inputs, ""))
 
-        _llm_handle = LLMSingletonClass()
+        _llm_handle = GetSingletonHandle().get_llm_model_instance(LLMSingletonClass)
         chatbot[-1] = (inputs, load_message + "\n\n" + _llm_handle.get_state())
         yield from update_ui(chatbot=chatbot, history=[])
         if not _llm_handle.running: