version 3.75 (#1702)

* Update version to 3.74

* Add support for Yi Model API (#1635)

* Update to support the 01.AI (零一万物) Yi models

* Remove newbing

* Update config

---------

Co-authored-by: binary-husky <qingxu.fu@outlook.com>
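
For the Yi Model API entry above (#1635), enabling the model comes down to configuration. A minimal sketch, assuming the key name and model string below roughly match this release's config.py (they are illustrative; check the shipped config for the exact identifiers):

    # config_private.py -- illustrative names, not necessarily the exact keys of this release
    AVAIL_LLM_MODELS = ["gpt-3.5-turbo", "yi-34b-chat-0205"]  # expose a Yi model in the model dropdown
    YIMODEL_API_KEY = "your-01.ai-api-key"                    # key issued by the 01.AI (零一万物) open platform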

* Refactor function signatures in bridge files

* fix qwen api change

* rename and ref functions

* rename and move some cookie functions

* Add the haiku model and endpoint configuration instructions (#1626)

* haiku added

* Add haiku and endpoint configuration instructions

* Haiku added

* Sync the instructions to the latest endpoints

---------

Co-authored-by: binary-husky <qingxu.fu@outlook.com>
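
A hedged configuration sketch for the haiku / endpoint entries above (#1626). ANTHROPIC_API_KEY, AVAIL_LLM_MODELS and API_URL_REDIRECT are existing config.py options; the model string and redirect mapping below are illustrative and may differ from what this release actually expects:

    # config_private.py -- sketch only
    ANTHROPIC_API_KEY = "sk-ant-..."                                 # required for the claude-3 haiku model
    AVAIL_LLM_MODELS = ["claude-3-haiku-20240307", "gpt-3.5-turbo"]  # make haiku selectable in the dropdown
    # the "endpoint configuration instructions" concern rerouting a model's default endpoint, e.g.:
    API_URL_REDIRECT = {"https://api.anthropic.com/v1/messages": "https://your-proxy.example.com/v1/messages"}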

* Add file access authentication for the private_upload directory (#1596)

* Add file access authentication for the private_upload directory

* minor fastapi adjustment

* Add logging functionality to enable saving conversation records

* waiting to fix username retrieval

* support 2nd web path

* allow accessing default user dir

---------

Co-authored-by: binary-husky <qingxu.fu@outlook.com>
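
The private_upload entries above (#1596) restrict file access to the user who uploaded the files. The project's real check lives in its fastapi layer; the snippet below is only a minimal sketch of the idea, and the helper name is hypothetical:

    import os
    from fastapi import HTTPException

    def check_user_file_access(requested_path: str, username: str, base_dir: str = "private_upload"):
        # resolve symlinks and ".." so a crafted path cannot escape the user's own folder
        user_root = os.path.realpath(os.path.join(base_dir, username))
        target = os.path.realpath(requested_path)
        if not (target == user_root or target.startswith(user_root + os.sep)):
            raise HTTPException(status_code=403, detail="access to other users' files is forbidden")
        return target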

* remove yaml deps

* fix favicon

* fix abs path auth problem

* add a forgotten return statement

* add `dashscope` to deps

* fix GHSA-v9q9-xj86-953p

* Patch unauthorized access caused by overlapping usernames (#1681)

* add cohere model api access

* cohere + can_multi_thread

* fix blocking of user access (failed attempt)

* fix fastapi bug

* change cohere api endpoint

* explain version

* fix com_zhipuglm.py illegal temperature problem (#1687)

* Update com_zhipuglm.py

# fix: users hit an illegal-argument error for the temperature parameter when using the zhipuai interface

* allow storing the LLM model dropdown selection

* add a btn to reverse previous reset

* remove extra fns

* Add support for glm-4v model (#1700)

* Change the chatglm3 quantized-model loading method (#1688)

Co-authored-by: zym9804 <ren990603@gmail.com>

* save chat stage 1

* consider null cookie situation

* Activate speech when the copy button is clicked

* miss some parts

* move all to js

* done first stage

* add edge tts

* bug fix

* bug fix

* remove console log

* bug fix

* bug fix

* bug fix

* audio switch

* update tts readme

* remove tempfile when done

* disable auto audio follow

* avoid play queue update after shut up

* feat: minimizing common.js

* improve tts functionality

* determine whether the cached model is in choices

* Add support for Ollama (#1740)

* print an error when doc2x is not successful

* add icon

* adjust url for doc2x key version

* prepare merge

---------

Co-authored-by: Menghuan1918 <menghuan2003@outlook.com>
Co-authored-by: Skyzayre <120616113+Skyzayre@users.noreply.github.com>
Co-authored-by: XIao <46100050+Kilig947@users.noreply.github.com>
Co-authored-by: Yuki <903728862@qq.com>
Co-authored-by: zyren123 <91042213+zyren123@users.noreply.github.com>
Co-authored-by: zym9804 <ren990603@gmail.com>
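
Regarding the "add edge tts" / tts entries in the list above: the upstream edge-tts package is driven roughly as sketched below. The voice name is illustrative, and gpt_academic wires this behind its own TTS settings rather than calling it exactly this way:

    import asyncio
    import edge_tts

    async def speak(text: str, outfile: str = "reply.mp3"):
        # synthesize speech with Microsoft Edge's online TTS voices and save to a file
        communicate = edge_tts.Communicate(text, voice="zh-CN-XiaoxiaoNeural")
        await communicate.save(outfile)

    asyncio.run(speak("你好，世界"))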
Authored by binary-husky on 2024-04-30 20:37:41 +08:00, committed via GitHub
Parent: bd5280df1b
This commit: 5fcd02506c
19 files changed, with 1162 insertions and 66 deletions


@@ -6,7 +6,6 @@ from toolbox import get_conf, ProxyNetworkActivate
 from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
-
 # ------------------------------------------------------------------------------------------------------------------------
 # 🔌💻 Local Model
 # ------------------------------------------------------------------------------------------------------------------------
@@ -23,20 +22,45 @@ class GetGLM3Handle(LocalLLMHandle):
-        import os, glob
         import os
         import platform
-        LOCAL_MODEL_QUANT, device = get_conf('LOCAL_MODEL_QUANT', 'LOCAL_MODEL_DEVICE')
-        if LOCAL_MODEL_QUANT == "INT4": # INT4
-            _model_name_ = "THUDM/chatglm3-6b-int4"
-        elif LOCAL_MODEL_QUANT == "INT8": # INT8
-            _model_name_ = "THUDM/chatglm3-6b-int8"
-        else:
-            _model_name_ = "THUDM/chatglm3-6b" # FP16
-        with ProxyNetworkActivate('Download_LLM'):
-            chatglm_tokenizer = AutoTokenizer.from_pretrained(_model_name_, trust_remote_code=True)
-            if device=='cpu':
-                chatglm_model = AutoModel.from_pretrained(_model_name_, trust_remote_code=True, device='cpu').float()
+        LOCAL_MODEL_QUANT, device = get_conf("LOCAL_MODEL_QUANT", "LOCAL_MODEL_DEVICE")
+        _model_name_ = "THUDM/chatglm3-6b"
+        # if LOCAL_MODEL_QUANT == "INT4": # INT4
+        #     _model_name_ = "THUDM/chatglm3-6b-int4"
+        # elif LOCAL_MODEL_QUANT == "INT8": # INT8
+        #     _model_name_ = "THUDM/chatglm3-6b-int8"
+        # else:
+        #     _model_name_ = "THUDM/chatglm3-6b" # FP16
+        with ProxyNetworkActivate("Download_LLM"):
+            chatglm_tokenizer = AutoTokenizer.from_pretrained(
+                _model_name_, trust_remote_code=True
+            )
+            if device == "cpu":
+                chatglm_model = AutoModel.from_pretrained(
+                    _model_name_,
+                    trust_remote_code=True,
+                    device="cpu",
+                ).float()
+            elif LOCAL_MODEL_QUANT == "INT4": # INT4
+                chatglm_model = AutoModel.from_pretrained(
+                    pretrained_model_name_or_path=_model_name_,
+                    trust_remote_code=True,
+                    device="cuda",
+                    load_in_4bit=True,
+                )
+            elif LOCAL_MODEL_QUANT == "INT8": # INT8
+                chatglm_model = AutoModel.from_pretrained(
+                    pretrained_model_name_or_path=_model_name_,
+                    trust_remote_code=True,
+                    device="cuda",
+                    load_in_8bit=True,
+                )
             else:
-                chatglm_model = AutoModel.from_pretrained(_model_name_, trust_remote_code=True, device='cuda')
+                chatglm_model = AutoModel.from_pretrained(
+                    pretrained_model_name_or_path=_model_name_,
+                    trust_remote_code=True,
+                    device="cuda",
+                )
             chatglm_model = chatglm_model.eval()

         self._model = chatglm_model
@@ -46,32 +70,36 @@ class GetGLM3Handle(LocalLLMHandle):
     def llm_stream_generator(self, **kwargs):
         # 🏃‍♂️🏃‍♂️🏃‍♂️ 子进程执行
         def adaptor(kwargs):
-            query = kwargs['query']
-            max_length = kwargs['max_length']
-            top_p = kwargs['top_p']
-            temperature = kwargs['temperature']
-            history = kwargs['history']
+            query = kwargs["query"]
+            max_length = kwargs["max_length"]
+            top_p = kwargs["top_p"]
+            temperature = kwargs["temperature"]
+            history = kwargs["history"]
             return query, max_length, top_p, temperature, history

         query, max_length, top_p, temperature, history = adaptor(kwargs)

-        for response, history in self._model.stream_chat(self._tokenizer,
-                                                          query,
-                                                          history,
-                                                          max_length=max_length,
-                                                          top_p=top_p,
-                                                          temperature=temperature,
-                                                          ):
+        for response, history in self._model.stream_chat(
+            self._tokenizer,
+            query,
+            history,
+            max_length=max_length,
+            top_p=top_p,
+            temperature=temperature,
+        ):
             yield response

     def try_to_import_special_deps(self, **kwargs):
         # import something that will raise error if the user does not install requirement_*.txt
         # 🏃‍♂️🏃‍♂️🏃‍♂️ 主进程执行
         import importlib
         # importlib.import_module('modelscope')

 # ------------------------------------------------------------------------------------------------------------------------
 # 🔌💻 GPT-Academic Interface
 # ------------------------------------------------------------------------------------------------------------------------
-predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetGLM3Handle, model_name, history_format='chatglm3')
+predict_no_ui_long_connection, predict = get_local_llm_predict_fns(
+    GetGLM3Handle, model_name, history_format="chatglm3"
+)
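
The diff above replaces the separate chatglm3-6b-int4 / chatglm3-6b-int8 checkpoints with load_in_4bit / load_in_8bit on the base model (which relies on bitsandbytes through transformers). The branch taken is selected by the two config values read via get_conf in the patched code:

    # config.py / config_private.py
    LOCAL_MODEL_QUANT = "INT4"   # "INT4" -> load_in_4bit, "INT8" -> load_in_8bit, anything else -> un-quantized
    LOCAL_MODEL_DEVICE = "cuda"  # "cpu" uses the un-quantized .float() path instead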