Mirrored from https://github.com/binary-husky/gpt_academic.git
Synced 2025-12-06 14:36:48 +00:00

Merge branch 'frontier' into production
@@ -242,6 +242,13 @@ for model in AVAIL_LLM_MODELS:
         mi.update({"endpoint": api2d_endpoint})
         model_info.update({model: mi})
 
+# -=-=-=-=-=-=- azure alignment support -=-=-=-=-=-=-
+for model in AVAIL_LLM_MODELS:
+    if model.startswith('azure-') and (model.replace('azure-','') in model_info.keys()):
+        mi = model_info[model.replace('azure-','')]
+        mi.update({"endpoint": azure_endpoint})
+        model_info.update({model: mi})
+
 # -=-=-=-=-=-=- the models below are newly added and may carry extra dependencies -=-=-=-=-=-=-
 if "claude-1-100k" in AVAIL_LLM_MODELS or "claude-2" in AVAIL_LLM_MODELS:
     from .bridge_claude import predict_no_ui_long_connection as claude_noui
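The added `azure-` loop follows the same aliasing pattern as the `api2d-` block above it: look up the base model's metadata record and re-point its endpoint. A minimal, self-contained sketch of that pattern (the model names, endpoint URLs, and `max_token` value below are placeholders, not the project's real configuration):

```python
# Sketch of the endpoint-aliasing pattern; all data here is illustrative.
model_info = {
    "gpt-3.5-turbo": {"endpoint": "https://api.openai.com/v1/chat/completions", "max_token": 4096},
}
azure_endpoint = "https://YOUR_RESOURCE.openai.azure.com/"  # placeholder
AVAIL_LLM_MODELS = ["gpt-3.5-turbo", "azure-gpt-3.5-turbo"]

for model in AVAIL_LLM_MODELS:
    if model.startswith('azure-') and (model.replace('azure-', '') in model_info.keys()):
        # Copying first keeps the base entry's endpoint intact; the diff above
        # updates the shared dict in place instead.
        mi = dict(model_info[model.replace('azure-', '')])
        mi.update({"endpoint": azure_endpoint})
        model_info.update({model: mi})

assert model_info["azure-gpt-3.5-turbo"]["endpoint"] == azure_endpoint
```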
@@ -564,7 +571,7 @@ def LLM_CATCH_EXCEPTION(f):
     return decorated
 
 
-def predict_no_ui_long_connection(inputs, llm_kwargs, history, sys_prompt, observe_window, console_slience=False):
+def predict_no_ui_long_connection(inputs, llm_kwargs, history, sys_prompt, observe_window=[], console_slience=False):
     """
     Send to the LLM and wait for the reply, completed in one go without showing intermediate progress; internally a streaming request is used to keep the connection from being dropped midway.
     inputs:
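The signature change gives `observe_window` a default, so callers that do not monitor intermediate output can omit it; since the function only writes to the window when `len(observe_window) >= 1`, the shared empty default is never mutated. A hedged usage sketch, assuming this is the dispatcher in request_llms/bridge_all.py; the `llm_kwargs` fields and prompt text are illustrative, not the project's actual values:

```python
import time
from request_llms.bridge_all import predict_no_ui_long_connection  # assumed module path

# Convention suggested by how observe_window[0] is used elsewhere in this
# commit: slot 0 holds the streamed text so far; slot 1 can hold a watchdog
# timestamp that a monitor thread refreshes (assumption).
observe_window = ["", time.time()]

result = predict_no_ui_long_connection(
    inputs="Summarize the following paragraph ...",
    llm_kwargs={"llm_model": "gpt-3.5-turbo", "temperature": 1.0, "top_p": 1.0},  # illustrative
    history=[],
    sys_prompt="You are a helpful assistant.",
    observe_window=observe_window,
)
```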
@@ -4,14 +4,13 @@ cmd_to_install = "`pip install -r request_llms/requirements_chatglm.txt`"
 
 from transformers import AutoModel, AutoTokenizer
 from toolbox import get_conf, ProxyNetworkActivate
-from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
 
 
 
 # ------------------------------------------------------------------------------------------------------------------------
 # 🔌💻 Local Model
 # ------------------------------------------------------------------------------------------------------------------------
-@SingletonLocalLLM
 class GetGLM2Handle(LocalLLMHandle):
 
     def load_model_info(self):
@@ -4,14 +4,13 @@ cmd_to_install = "`pip install -r request_llms/requirements_chatglm.txt`"
 
 from transformers import AutoModel, AutoTokenizer
 from toolbox import get_conf, ProxyNetworkActivate
-from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
 
 
 
 # ------------------------------------------------------------------------------------------------------------------------
 # 🔌💻 Local Model
 # ------------------------------------------------------------------------------------------------------------------------
-@SingletonLocalLLM
 class GetGLM3Handle(LocalLLMHandle):
 
     def load_model_info(self):
@@ -8,7 +8,7 @@ import threading
 import importlib
 from toolbox import update_ui, get_conf
 from multiprocessing import Process, Pipe
-from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
 
 from .chatglmoonx import ChatGLMModel, chat_template
 
@@ -17,7 +17,6 @@ from .chatglmoonx import ChatGLMModel, chat_template
 # ------------------------------------------------------------------------------------------------------------------------
 # 🔌💻 Local Model
 # ------------------------------------------------------------------------------------------------------------------------
-@SingletonLocalLLM
 class GetONNXGLMHandle(LocalLLMHandle):
 
     def load_model_info(self):
@@ -7,8 +7,7 @@
 1. predict: used for normal conversation; fully interactive; not multi-threaded
 
 Functions with multi-threading capability
-2. predict_no_ui: called by advanced experimental feature modules; does not show output in the UI in real time; simple parameters; can run multi-threaded in parallel, which makes complex feature logic easy to implement
-3. predict_no_ui_long_connection: experiments showed that calling predict_no_ui on long documents tends to drop the connection to openai; this function solves that with streaming, and it also supports multi-threading
+2. predict_no_ui_long_connection: supports multi-threading
 """
 
 import json
@@ -7,8 +7,7 @@
 1. predict: used for normal conversation; fully interactive; not multi-threaded
 
 Functions with multi-threading capability
-2. predict_no_ui: called by advanced experimental feature modules; does not show output in the UI in real time; simple parameters; can run multi-threaded in parallel, which makes complex feature logic easy to implement
-3. predict_no_ui_long_connection: experiments showed that calling predict_no_ui on long documents tends to drop the connection to openai; this function solves that with streaming, and it also supports multi-threading
+2. predict_no_ui_long_connection: supports multi-threading
 """
 
 import json
@@ -7,7 +7,7 @@
 1. predict: used for normal conversation; fully interactive; not multi-threaded
 
 Functions with multi-threading capability
-2. predict_no_ui_long_connection: experiments showed that calling predict_no_ui on long documents tends to drop the connection to openai; this function solves that with streaming, and it also supports multi-threading
+2. predict_no_ui_long_connection: supports multi-threading
 """
 
 import os
@@ -5,9 +5,9 @@ from transformers import AutoModel, AutoTokenizer
 import time
 import threading
 import importlib
-from toolbox import update_ui, get_conf
+from toolbox import update_ui, get_conf, ProxyNetworkActivate
 from multiprocessing import Process, Pipe
-from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
 
 
 # ------------------------------------------------------------------------------------------------------------------------
@@ -34,7 +34,6 @@ def combine_history(prompt, hist):
 # ------------------------------------------------------------------------------------------------------------------------
 # 🔌💻 Local Model
 # ------------------------------------------------------------------------------------------------------------------------
-@SingletonLocalLLM
 class GetInternlmHandle(LocalLLMHandle):
 
     def load_model_info(self):
@@ -53,14 +52,15 @@ class GetInternlmHandle(LocalLLMHandle):
         import torch
         from transformers import AutoModelForCausalLM, AutoTokenizer
         device = get_conf('LOCAL_MODEL_DEVICE')
-        if self._model is None:
-            tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
-            if device=='cpu':
-                model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).to(torch.bfloat16)
-            else:
-                model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).to(torch.bfloat16).cuda()
+        with ProxyNetworkActivate('Download_LLM'):
+            if self._model is None:
+                tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
+                if device=='cpu':
+                    model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).to(torch.bfloat16)
+                else:
+                    model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).to(torch.bfloat16).cuda()
 
-        model = model.eval()
+            model = model.eval()
         return model, tokenizer
 
     def llm_stream_generator(self, **kwargs):
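`ProxyNetworkActivate('Download_LLM')` scopes the proxy to the weight download only; its real implementation lives in `toolbox` and is not shown in this commit. A minimal sketch of a context manager with that shape, assuming it works by temporarily exporting the standard proxy environment variables (the function name and proxy address below are hypothetical stand-ins):

```python
import os
from contextlib import contextmanager

@contextmanager
def proxy_network_activate(task="Download_LLM", proxy="http://127.0.0.1:7890"):
    """Hypothetical stand-in for toolbox.ProxyNetworkActivate: enable a proxy
    only while the with-block runs, then restore the previous environment."""
    saved = {k: os.environ.get(k) for k in ("HTTP_PROXY", "HTTPS_PROXY")}
    os.environ["HTTP_PROXY"] = os.environ["HTTPS_PROXY"] = proxy
    try:
        yield
    finally:
        for k, v in saved.items():
            if v is None:
                os.environ.pop(k, None)   # variable was unset before
            else:
                os.environ[k] = v         # restore the previous value
```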
@@ -94,8 +94,9 @@ class GetInternlmHandle(LocalLLMHandle):
 
         inputs = tokenizer([prompt], padding=True, return_tensors="pt")
         input_length = len(inputs["input_ids"][0])
+        device = get_conf('LOCAL_MODEL_DEVICE')
         for k, v in inputs.items():
-            inputs[k] = v.cuda()
+            inputs[k] = v.to(device)
         input_ids = inputs["input_ids"]
         batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
         if generation_config is None:
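Replacing the hard-coded `.cuda()` with `.to(device)` lets the same generator run on whatever device `LOCAL_MODEL_DEVICE` names. The pattern in isolation (a sketch; assumes only that `torch` is installed):

```python
import torch

def move_batch(inputs: dict, device: str) -> dict:
    # .to(device) accepts 'cpu', 'cuda', 'cuda:1', etc., whereas .cuda()
    # unconditionally requires a GPU.
    return {k: v.to(device) for k, v in inputs.items()}

batch = {"input_ids": torch.tensor([[1, 2, 3]]),
         "attention_mask": torch.ones(1, 3, dtype=torch.long)}
batch = move_batch(batch, "cpu")  # or "cuda" when available
```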
@@ -5,14 +5,13 @@ cmd_to_install = "`pip install -r request_llms/requirements_chatglm.txt`"
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from toolbox import update_ui, get_conf, ProxyNetworkActivate
 from multiprocessing import Process, Pipe
-from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
 from threading import Thread
 
 
 # ------------------------------------------------------------------------------------------------------------------------
 # 🔌💻 Local Model
 # ------------------------------------------------------------------------------------------------------------------------
-@SingletonLocalLLM
 class GetONNXGLMHandle(LocalLLMHandle):
 
     def load_model_info(self):
@@ -6,16 +6,15 @@ from transformers import AutoModel, AutoTokenizer
 import time
 import threading
 import importlib
-from toolbox import update_ui, get_conf
+from toolbox import update_ui, get_conf, ProxyNetworkActivate
 from multiprocessing import Process, Pipe
-from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns, SingletonLocalLLM
+from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
 
 
 
 # ------------------------------------------------------------------------------------------------------------------------
 # 🔌💻 Local Model
 # ------------------------------------------------------------------------------------------------------------------------
-@SingletonLocalLLM
 class GetONNXGLMHandle(LocalLLMHandle):
 
     def load_model_info(self):
@@ -30,13 +29,13 @@ class GetONNXGLMHandle(LocalLLMHandle):
         import platform
         from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
 
-        model_id = 'qwen/Qwen-7B-Chat'
-        revision = 'v1.0.1'
-        self._tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision, trust_remote_code=True)
-        # use fp16
-        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", revision=revision, trust_remote_code=True, fp16=True).eval()
-        model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True)  # different generation lengths, top_p, and other hyperparameters can be specified
-        self._model = model
+        with ProxyNetworkActivate('Download_LLM'):
+            model_id = 'qwen/Qwen-7B-Chat'
+            self._tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B-Chat', trust_remote_code=True, resume_download=True)
+            # use fp16
+            model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True, fp16=True).eval()
+            model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True)  # different generation lengths, top_p, and other hyperparameters can be specified
+            self._model = model
 
         return self._model, self._tokenizer
@@ -1,6 +1,6 @@
 import time
 import threading
-from toolbox import update_ui
+from toolbox import update_ui, Singleton
 from multiprocessing import Process, Pipe
 from contextlib import redirect_stdout
 from request_llms.queued_pipe import create_queue_pipe
@@ -26,23 +26,20 @@ class ThreadLock(object):
     def __exit__(self, type, value, traceback):
         self.release()
 
-def SingletonLocalLLM(cls):
-    """
-    Singleton Decroator for LocalLLMHandle
-    """
-    _instance = {}
-    def _singleton(*args, **kargs):
-        if cls not in _instance:
-            _instance[cls] = cls(*args, **kargs)
-            return _instance[cls]
-        elif _instance[cls].corrupted:
-            _instance[cls] = cls(*args, **kargs)
-            return _instance[cls]
-        else:
-            return _instance[cls]
-    return _singleton
+@Singleton
+class GetSingletonHandle():
+    def __init__(self):
+        self.llm_model_already_running = {}
+
+    def get_llm_model_instance(self, cls, *args, **kargs):
+        if cls not in self.llm_model_already_running:
+            self.llm_model_already_running[cls] = cls(*args, **kargs)
+            return self.llm_model_already_running[cls]
+        elif self.llm_model_already_running[cls].corrupted:
+            self.llm_model_already_running[cls] = cls(*args, **kargs)
+            return self.llm_model_already_running[cls]
+        else:
+            return self.llm_model_already_running[cls]
 
 def reset_tqdm_output():
     import sys, tqdm
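The net effect of this hunk: the per-class `SingletonLocalLLM` decorator is replaced by a single `@Singleton`-wrapped registry that caches one handle per model class and rebuilds it when the cached instance is flagged `corrupted`. A self-contained sketch of the same idea (the `singleton` decorator below is a plausible stand-in for `toolbox.Singleton`, whose implementation this commit does not show):

```python
def singleton(cls):
    # Plausible stand-in for toolbox.Singleton: one shared instance per class.
    instances = {}
    def get_instance(*args, **kwargs):
        if cls not in instances:
            instances[cls] = cls(*args, **kwargs)
        return instances[cls]
    return get_instance

@singleton
class ModelRegistry:
    def __init__(self):
        self.running = {}

    def get(self, cls, *args, **kwargs):
        # Build on first request; rebuild if the cached handle went bad.
        if cls not in self.running or self.running[cls].corrupted:
            self.running[cls] = cls(*args, **kwargs)
        return self.running[cls]

class DummyHandle:
    corrupted = False

# Both calls hit the same registry instance and return the same cached handle.
assert ModelRegistry().get(DummyHandle) is ModelRegistry().get(DummyHandle)
```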
@@ -76,7 +73,6 @@ class LocalLLMHandle(Process):
         self.parent_state, self.child_state = create_queue_pipe()
         # allow redirect_stdout
         self.std_tag = "[Subprocess Message] "
-        self.child.write = lambda x: self.child.send(self.std_tag + x)
         self.running = True
         self._model = None
         self._tokenizer = None
@@ -137,6 +133,8 @@ class LocalLLMHandle(Process):
     def run(self):
         # 🏃‍♂️🏃‍♂️🏃‍♂️ run in child process
         # first run: load the model parameters
+        self.child.flush = lambda *args: None
+        self.child.write = lambda x: self.child.send(self.std_tag + x)
         reset_tqdm_output()
         self.set_state("`尝试加载模型`")
         try:
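Assigning `write` and `flush` onto the pipe endpoint duck-types it into a file-like object, so anything the child process prints under `redirect_stdout` is forwarded to the parent tagged with `std_tag`; moving the assignment into `run()` means it happens inside the child process itself. A standalone sketch of the trick:

```python
from multiprocessing import Pipe, Process
from contextlib import redirect_stdout

def child_main(conn):
    # Duck-type the Connection as a writable stream.
    conn.write = lambda x: conn.send("[Subprocess Message] " + x)
    conn.flush = lambda *args: None
    with redirect_stdout(conn):
        print("loading weights ...")

if __name__ == "__main__":
    parent, child = Pipe()
    p = Process(target=child_main, args=(child,))
    p.start()
    print(parent.recv())  # "[Subprocess Message] loading weights ..."
    p.join()              # (print also sends a second message for the newline)
```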
@@ -220,7 +218,7 @@ def get_local_llm_predict_fns(LLMSingletonClass, model_name, history_format='cla
        """
        refer to request_llms/bridge_all.py
        """
-        _llm_handle = LLMSingletonClass()
+        _llm_handle = GetSingletonHandle().get_llm_model_instance(LLMSingletonClass)
         if len(observe_window) >= 1:
             observe_window[0] = load_message + "\n\n" + _llm_handle.get_state()
         if not _llm_handle.running:
@@ -268,7 +266,7 @@ def get_local_llm_predict_fns(LLMSingletonClass, model_name, history_format='cla
        """
        chatbot.append((inputs, ""))
 
-        _llm_handle = LLMSingletonClass()
+        _llm_handle = GetSingletonHandle().get_llm_model_instance(LLMSingletonClass)
         chatbot[-1] = (inputs, load_message + "\n\n" + _llm_handle.get_state())
         yield from update_ui(chatbot=chatbot, history=[])
         if not _llm_handle.running: