Add Support for Gemini 1.5 Pro & Gemini 1.5 Flash (#1926)

* Add Support for Gemini 1.5 Pro & 1.5 Flash.

* Update bridge_all.py

Fix a spelling error in comments.

* Add Support for Gemini 1.5 Pro & Gemini 1.5 Flash
This commit is contained in:
FatShibaInu
2024-08-12 21:44:24 +08:00
Committed by GitHub
Parent 6fe5f6ee6e
Commit f9384e4e5f
4 files changed, 86 insertions and 33 deletions


@@ -7,7 +7,7 @@ import os
 import re
 import requests
 from typing import List, Dict, Tuple
-from toolbox import get_conf, encode_image, get_pictures_list, to_markdown_tabs
+from toolbox import get_conf, update_ui, encode_image, get_pictures_list, to_markdown_tabs
 
 proxies, TIMEOUT_SECONDS = get_conf("proxies", "TIMEOUT_SECONDS")
@@ -112,6 +112,14 @@ def html_local_img(__file, layout="left", max_width=None, max_height=None, md=Tr
     return a
 
+def reverse_base64_from_input(inputs):
+    pattern = re.compile(r'<br/><br/><div align="center"><img[^<>]+base64="([^"]+)"></div>')
+    base64_strings = pattern.findall(inputs)
+    return base64_strings
+
+def contain_base64(inputs):
+    base64_strings = reverse_base64_from_input(inputs)
+    return len(base64_strings) > 0
+
 
 class GoogleChatInit:
     def __init__(self, llm_kwargs):
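The two helpers added above are how the bridge later decides whether a past conversation turn already carries an image: gpt_academic embeds pictures into the chat history as `<img ... base64="...">` HTML, and `reverse_base64_from_input` pulls the payloads back out with a regex. A minimal standalone sketch of their behavior (the sample history string is made up):

```python
import re

pattern = re.compile(r'<br/><br/><div align="center"><img[^<>]+base64="([^"]+)"></div>')

# Hypothetical history entry with one embedded image:
entry = 'look at this<br/><br/><div align="center"><img width="250" base64="iVBORw0KGgo="></div>'

assert pattern.findall(entry) == ["iVBORw0KGgo="]  # reverse_base64_from_input(entry)
assert pattern.findall("plain text") == []         # so contain_base64("plain text") is False
```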
@@ -119,9 +127,9 @@ class GoogleChatInit:
         endpoint = model_info[llm_kwargs['llm_model']]['endpoint']
         self.url_gemini = endpoint + "/%m:streamGenerateContent?key=%k"
 
-    def generate_chat(self, inputs, llm_kwargs, history, system_prompt):
+    def generate_chat(self, inputs, llm_kwargs, history, system_prompt, image_base64_array:list=[], has_multimodal_capacity:bool=False):
         headers, payload = self.generate_message_payload(
-            inputs, llm_kwargs, history, system_prompt
+            inputs, llm_kwargs, history, system_prompt, image_base64_array, has_multimodal_capacity
         )
         response = requests.post(
             url=self.url_gemini,
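`generate_chat` keeps working for text-only callers, since both new parameters default to off, and simply forwards the extras to the payload builder. A hedged sketch of a multimodal call; the `llm_kwargs` values here are placeholders, not the project's real defaults:

```python
# Hypothetical caller; keys mirror those read elsewhere in this file.
llm_kwargs = {"llm_model": "gemini-1.5-flash", "temperature": 1, "top_p": 0.8}

chat = GoogleChatInit(llm_kwargs)
stream = chat.generate_chat(
    inputs="What is in this picture?",
    llm_kwargs=llm_kwargs,
    history=[],
    system_prompt="",
    image_base64_array=["iVBORw0KGgo="],  # base64-encoded image bytes (placeholder)
    has_multimodal_capacity=True,         # the caller reads this from model_info
)
for chunk in stream:  # response.iter_lines(): raw streaming chunks
    print(chunk)
```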
@@ -133,13 +141,16 @@ class GoogleChatInit:
         )
         return response.iter_lines()
 
-    def __conversation_user(self, user_input, llm_kwargs):
+    def __conversation_user(self, user_input, llm_kwargs, enable_multimodal_capacity):
         what_i_have_asked = {"role": "user", "parts": []}
-        if "vision" not in self.url_gemini:
-            from .bridge_all import model_info
+        if enable_multimodal_capacity:
+            input_, encode_img = input_encode_handler(user_input, llm_kwargs=llm_kwargs)
+        else:
             input_ = user_input
             encode_img = []
-        else:
-            input_, encode_img = input_encode_handler(user_input, llm_kwargs=llm_kwargs)
         what_i_have_asked["parts"].append({"text": input_})
         if encode_img:
             for data in encode_img:
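When `enable_multimodal_capacity` is set, each user turn ends up with text and image parts side by side. For reference, Gemini's `generateContent` REST format for an inline image looks roughly like this; the mime type shown is an assumption, and the key casing follows Google's protobuf JSON, which accepts either `inline_data` or `inlineData`:

```python
what_i_have_asked = {
    "role": "user",
    "parts": [
        {"text": "What is in this image?"},
        {
            "inline_data": {
                "mime_type": "image/jpeg",  # assumed; depends on the uploaded file
                "data": "iVBORw0KGgo=",     # base64 payload (placeholder)
            }
        },
    ],
}
```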
@@ -153,12 +164,12 @@ class GoogleChatInit:
             )
         return what_i_have_asked
 
-    def __conversation_history(self, history, llm_kwargs):
+    def __conversation_history(self, history, llm_kwargs, enable_multimodal_capacity):
         messages = []
         conversation_cnt = len(history) // 2
         if conversation_cnt:
             for index in range(0, 2 * conversation_cnt, 2):
-                what_i_have_asked = self.__conversation_user(history[index], llm_kwargs)
+                what_i_have_asked = self.__conversation_user(history[index], llm_kwargs, enable_multimodal_capacity)
                 what_gpt_answer = {
                     "role": "model",
                     "parts": [{"text": history[index + 1]}],
@@ -168,7 +179,7 @@ class GoogleChatInit:
         return messages
 
     def generate_message_payload(
-        self, inputs, llm_kwargs, history, system_prompt
+        self, inputs, llm_kwargs, history, system_prompt, image_base64_array:list=[], has_multimodal_capacity:bool=False
     ) -> Tuple[Dict, Dict]:
         messages = [
             # {"role": "system", "parts": [{"text": system_prompt}]},  # Gemini does not allow an even number of conversation turns, so this is unused; let's see if it gets supported later...
@@ -179,21 +190,29 @@ class GoogleChatInit:
             "%m", llm_kwargs["llm_model"]
         ).replace("%k", get_conf("GEMINI_API_KEY"))
         header = {"Content-Type": "application/json"}
-        if "vision" not in self.url_gemini:  # only handle history for non-vision models
+        if has_multimodal_capacity:
+            enable_multimodal_capacity = (len(image_base64_array) > 0) or any([contain_base64(h) for h in history])
+        else:
+            enable_multimodal_capacity = False
+        if not enable_multimodal_capacity:
             messages.extend(
-                self.__conversation_history(history, llm_kwargs)
+                self.__conversation_history(history, llm_kwargs, enable_multimodal_capacity)
             )  # handle history
-        messages.append(self.__conversation_user(inputs, llm_kwargs))  # handle the user's current turn
+        messages.append(self.__conversation_user(inputs, llm_kwargs, enable_multimodal_capacity))  # handle the user's current turn
         payload = {
             "contents": messages,
             "generationConfig": {
                 # "maxOutputTokens": 800,
+                # "maxOutputTokens": llm_kwargs.get("max_token", 1024),
                 "stopSequences": str(llm_kwargs.get("stop", "")).split(" "),
                 "temperature": llm_kwargs.get("temperature", 1),
                 "topP": llm_kwargs.get("top_p", 0.8),
                 "topK": 10,
             },
         }
         return header, payload
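End to end, the request is a plain JSON POST against the streaming endpoint, with `%m` and `%k` substituted into the URL template. A sketch under the assumption that the endpoint configured in bridge_all.py resolves to the public generativelanguage.googleapis.com host; the model name and API key are placeholders:

```python
import requests

url = "https://generativelanguage.googleapis.com/v1beta/models/%m:streamGenerateContent?key=%k"
url = url.replace("%m", "gemini-1.5-flash").replace("%k", "YOUR_GEMINI_API_KEY")

payload = {
    "contents": [{"role": "user", "parts": [{"text": "Hello"}]}],
    "generationConfig": {
        "temperature": 1,
        "topP": 0.8,
        "topK": 10,
    },
}

# Stream the response line by line, as generate_chat does via response.iter_lines()
response = requests.post(url, headers={"Content-Type": "application/json"}, json=payload, stream=True)
for line in response.iter_lines():
    print(line)
```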