2 changes: 1 addition & 1 deletion models/gemini/manifest.yaml
@@ -34,4 +34,4 @@ resource:
tool:
enabled: true
type: plugin
version: 0.4.0
version: 0.4.1
96 changes: 74 additions & 22 deletions models/gemini/models/llm/llm.py
@@ -235,6 +235,48 @@ def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]
],
}

@staticmethod
def _calculate_tokens_from_usage_metadata(
usage_metadata: types.GenerateContentResponseUsageMetadata | None,
) -> tuple[int, int]:
"""
Calculate prompt and completion tokens from usage metadata.

:param usage_metadata: Usage metadata from Gemini response
:return: Tuple of (prompt_tokens, completion_tokens)
"""
if not usage_metadata:
return 0, 0

# The pricing of tokens varies depending on the input modality.
prompt_tokens_standard = 0

# [ Pricing ]
# https://ai.google.dev/gemini-api/docs/pricing?hl=zh-cn#gemini-2.5-pro
# FIXME: Dify's pricing model currently cannot account for tokens from multimodal resources
# FIXME: Unable to track token usage for caching, Grounding, or the Live API
for _mtc in usage_metadata.prompt_tokens_details:
if _mtc.modality in [
types.MediaModality.TEXT,
types.MediaModality.IMAGE,
types.MediaModality.VIDEO,
types.MediaModality.MODALITY_UNSPECIFIED,
types.MediaModality.AUDIO,
types.MediaModality.DOCUMENT,
]:
prompt_tokens_standard += _mtc.token_count

# Number of tokens present in thoughts output.
thoughts_token_count = usage_metadata.thoughts_token_count or 0
# Number of tokens in the response(s).
candidates_token_count = usage_metadata.candidates_token_count or 0
# Gemini bills reasoning ("thoughts") tokens and final-answer tokens at the same rate.
completion_tokens = thoughts_token_count + candidates_token_count
# `prompt_tokens` covers the prior conversation turns plus the current input.
prompt_tokens = prompt_tokens_standard

return prompt_tokens, completion_tokens
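
# Illustration only (not part of this diff): a minimal, self-contained sketch of the
# arithmetic the helper above performs, assuming google-genai >= 1.29.0. The numbers
# reuse the usage_metadata sample quoted in the streaming handler's docstring below;
# because every MediaModality value is currently accepted, the per-modality filter
# reduces to a plain sum.
from google.genai import types as genai_types

sample_usage = genai_types.GenerateContentResponseUsageMetadata(
    candidates_token_count=58,
    prompt_token_count=24,
    prompt_tokens_details=[
        genai_types.ModalityTokenCount(
            modality=genai_types.MediaModality.TEXT, token_count=24
        ),
    ],
    thoughts_token_count=862,
    total_token_count=944,
)

# Prompt side: sum the per-modality input token counts.
prompt_tokens = sum(
    (mtc.token_count or 0) for mtc in (sample_usage.prompt_tokens_details or [])
)
# Completion side: reasoning ("thoughts") tokens plus answer tokens, billed alike.
completion_tokens = (sample_usage.thoughts_token_count or 0) + (
    sample_usage.candidates_token_count or 0
)
assert (prompt_tokens, completion_tokens) == (24, 920)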

def validate_credentials(self, model: str, credentials: dict) -> None:
"""
Validate model credentials
@@ -500,33 +542,32 @@ def _handle_generate_response(
assistant_prompt_message = AssistantPromptMessage(content=response.text)

# calculate num tokens
if response.usage_metadata:
prompt_tokens = response.usage_metadata.prompt_token_count
if prompt_tokens is None:
raise ValueError("prompt_tokens is None")
completion_tokens = response.usage_metadata.candidates_token_count
if completion_tokens is None:
raise ValueError("completion_tokens is None")
else:
prompt_tokens, completion_tokens = self._calculate_tokens_from_usage_metadata(
response.usage_metadata
)

# Fallback to manual calculation if tokens are not available
if prompt_tokens == 0 or completion_tokens == 0:
prompt_tokens = self.get_num_tokens(model, credentials, prompt_messages)
completion_tokens = self.get_num_tokens(model, credentials, [assistant_prompt_message])

# transform usage
# copy credentials to avoid modifying the original dict
usage = self._calc_response_usage(
model, dict(credentials), prompt_tokens, completion_tokens
model=model,
credentials=dict(credentials),
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)

# transform response
result = LLMResult(
return LLMResult(
model=model,
prompt_messages=prompt_messages,
message=assistant_prompt_message,
usage=usage,
)

return result

def _handle_generate_stream_response(
self,
model: str,
@@ -537,6 +578,20 @@ def _handle_generate_stream_response(
"""
Handle llm stream response

# -- Usage Sample -- #
chunk.usage_metadata=GenerateContentResponseUsageMetadata(
candidates_token_count=58,
prompt_token_count=24,
prompt_tokens_details=[
ModalityTokenCount(
modality=<MediaModality.TEXT: 'TEXT'>,
token_count=24
),
],
thoughts_token_count=862,
total_token_count=944
)

:param model: model name
:param credentials: credentials
:param response: response
@@ -547,6 +602,7 @@
prompt_tokens = 0
completion_tokens = 0
self.is_thinking = False

for chunk in response:
if not chunk.candidates:
continue
@@ -557,10 +613,6 @@
message = self._parse_parts(candidate.content.parts)
index += len(candidate.content.parts)

# TODO(QIN2DIM): Fix the issue of inaccurate counting of Gemini Tokens
if chunk.usage_metadata:
completion_tokens += chunk.usage_metadata.candidates_token_count or 0

# if the stream is not finished, yield the chunk
if not candidate.finish_reason:
yield LLMResultChunk(
@@ -573,12 +625,12 @@
# If we're still in thinking mode at the end, close it
if self.is_thinking:
message.content.append(TextPromptMessageContent(data="\n\n</think>"))
if chunk.usage_metadata:
prompt_tokens = chunk.usage_metadata.prompt_token_count or 0
if chunk.usage_metadata.thoughts_token_count:
completion_tokens = (
completion_tokens + chunk.usage_metadata.thoughts_token_count
)

prompt_tokens, completion_tokens = self._calculate_tokens_from_usage_metadata(
chunk.usage_metadata
)

# Fallback to manual calculation if tokens are not available
if prompt_tokens == 0 or completion_tokens == 0:
prompt_tokens = self.get_num_tokens(
model=model, credentials=credentials, prompt_messages=prompt_messages
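For context, a minimal end-to-end sketch of the streaming pattern the handler above relies on: keep the latest chunk's usage_metadata while consuming the stream, then split it into prompt and completion tokens the same way the new helper does. This is illustration only, not plugin code; it assumes google-genai >= 1.29.0, a GEMINI_API_KEY (or GOOGLE_API_KEY) in the environment, and an illustrative model name.

from google import genai

client = genai.Client()  # picks up the API key from the environment
last_usage = None
for chunk in client.models.generate_content_stream(
    model="gemini-2.5-pro",
    contents="Briefly explain what usage_metadata reports.",
):
    if chunk.text:
        print(chunk.text, end="")
    if chunk.usage_metadata:
        last_usage = chunk.usage_metadata  # keep the latest; the final chunk carries the totals

if last_usage is not None:
    prompt_tokens = sum(
        (mtc.token_count or 0) for mtc in (last_usage.prompt_tokens_details or [])
    )
    completion_tokens = (last_usage.thoughts_token_count or 0) + (
        last_usage.candidates_token_count or 0
    )
    print(f"\nprompt_tokens={prompt_tokens} completion_tokens={completion_tokens}")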
2 changes: 1 addition & 1 deletion models/gemini/pyproject.toml
@@ -8,7 +8,7 @@ requires-python = ">=3.12"
# uv pip compile pyproject.toml -o ./requirements.txt
dependencies = [
"dify-plugin>=0.4.3,<0.5.0",
"google-genai>=1.27.0,<2.0.0",
"google-genai>=1.29.0,<2.0.0",
"google-generativeai>=0.8.5",
"numpy>=2.3.2",
]
2 changes: 1 addition & 1 deletion models/gemini/requirements.txt
@@ -52,7 +52,7 @@ google-auth==2.40.3
# google-generativeai
google-auth-httplib2==0.2.0
# via google-api-python-client
google-genai==1.27.0
google-genai==1.29.0
# via gemini-g9cie8 (pyproject.toml)
google-generativeai==0.8.5
# via gemini-g9cie8 (pyproject.toml)