From 8398b9bf85fee7773cae74cf275745d5a23e22ce Mon Sep 17 00:00:00 2001
From: QIN2DIM <62018067+QIN2DIM@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:05:35 +0800
Subject: [PATCH 1/4] feat(gemini): enhance token usage tracking and update
 dependencies

- Improve token counting accuracy by properly handling multimodal input types in usage metadata
- Add detailed token tracking for thoughts and candidates in both streaming and non-streaming responses
- Update google-genai dependency to version 1.29.0 for improved compatibility and features
- Bump version to 0.4.1 to reflect new functionality and improvements
- Add comprehensive comments and usage examples for better code maintainability
---
 models/gemini/manifest.yaml     |   2 +-
 models/gemini/models/llm/llm.py | 102 +++++++++++++++++++++++++-------
 models/gemini/pyproject.toml    |   2 +-
 3 files changed, 81 insertions(+), 25 deletions(-)

diff --git a/models/gemini/manifest.yaml b/models/gemini/manifest.yaml
index 820db5879..7220df473 100644
--- a/models/gemini/manifest.yaml
+++ b/models/gemini/manifest.yaml
@@ -34,4 +34,4 @@ resource:
     tool:
       enabled: true
 type: plugin
-version: 0.4.0
+version: 0.4.1
diff --git a/models/gemini/models/llm/llm.py b/models/gemini/models/llm/llm.py
index e63b1fd1c..f75b444c7 100644
--- a/models/gemini/models/llm/llm.py
+++ b/models/gemini/models/llm/llm.py
@@ -350,7 +350,6 @@ def _generate(
             config.tools.append(self._convert_tools_to_gemini_tool(tools))

         # == InvokeModel == #
-
         if stream:
             response = genai_client.models.generate_content_stream(
                 model=model, contents=contents, config=config
@@ -500,33 +499,55 @@ def _handle_generate_response(
         assistant_prompt_message = AssistantPromptMessage(content=response.text)

         # calculate num tokens
-        if response.usage_metadata:
-            prompt_tokens = response.usage_metadata.prompt_token_count
-            if prompt_tokens is None:
-                raise ValueError("prompt_tokens is None")
-            completion_tokens = response.usage_metadata.candidates_token_count
-            if completion_tokens is None:
-                raise ValueError("completion_tokens is None")
-        else:
+        prompt_tokens = 0
+        completion_tokens = 0
+
+        if usage_metadata := response.usage_metadata:
+            # The pricing of tokens varies depending on the input modality.
+            prompt_tokens_standard = 0
+            for _mtc in usage_metadata.prompt_tokens_details:
+                if _mtc.modality in [
+                    types.MediaModality.TEXT,
+                    types.MediaModality.IMAGE,
+                    types.MediaModality.VIDEO,
+                    types.MediaModality.MODALITY_UNSPECIFIED,
+                ]:
+                    prompt_tokens_standard += _mtc.token_count
+                elif _mtc.modality == types.MediaModality.AUDIO:
+                    prompt_tokens_standard += _mtc.token_count
+                elif _mtc.modality == types.MediaModality.DOCUMENT:
+                    prompt_tokens_standard += _mtc.token_count
+
+            # Number of tokens present in thoughts output.
+            thoughts_token_count = usage_metadata.thoughts_token_count or 0
+            # Number of tokens in the response(s).
+            candidates_token_count = usage_metadata.candidates_token_count or 0
+            # The reasoning content and final answer of the Gemini model are priced using the same standard.
+            completion_tokens = thoughts_token_count + candidates_token_count
+            # The `prompt_tokens` includes the historical conversation QA plus the current input.
+            prompt_tokens = prompt_tokens_standard
+
+        if prompt_tokens == 0 or completion_tokens == 0:
             prompt_tokens = self.get_num_tokens(model, credentials, prompt_messages)
             completion_tokens = self.get_num_tokens(model, credentials, [assistant_prompt_message])

         # transform usage
         # copy credentials to avoid modifying the original dict
         usage = self._calc_response_usage(
-            model, dict(credentials), prompt_tokens, completion_tokens
+            model=model,
+            credentials=dict(credentials),
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
         )

         # transform response
-        result = LLMResult(
+        return LLMResult(
             model=model,
             prompt_messages=prompt_messages,
             message=assistant_prompt_message,
             usage=usage,
         )

-        return result
-
     def _handle_generate_stream_response(
         self,
         model: str,
@@ -537,6 +558,20 @@ def _handle_generate_stream_response(
         """
         Handle llm stream response

+        # -- Usage Sample -- #
+        chunk.usage_metadata=GenerateContentResponseUsageMetadata(
+            candidates_token_count=58,
+            prompt_token_count=24,
+            prompt_tokens_details=[
+                ModalityTokenCount(
+                    modality=<MediaModality.TEXT: 'TEXT'>,
+                    token_count=24
+                ),
+            ],
+            thoughts_token_count=862,
+            total_token_count=944
+        )
+
         :param model: model name
         :param credentials: credentials
         :param response: response
@@ -547,6 +582,7 @@
         prompt_tokens = 0
         completion_tokens = 0
         self.is_thinking = False
+
         for chunk in response:
             if not chunk.candidates:
                 continue
@@ -557,10 +593,6 @@
                 message = self._parse_parts(candidate.content.parts)
                 index += len(candidate.content.parts)

-                # TODO(QIN2DIM): Fix the issue of inaccurate counting of Gemini Tokens
-                if chunk.usage_metadata:
-                    completion_tokens += chunk.usage_metadata.candidates_token_count or 0
-
                 # if the stream is not finished, yield the chunk
                 if not candidate.finish_reason:
                     yield LLMResultChunk(
@@ -573,12 +605,36 @@
             # If we're still in thinking mode at the end, close it
             if self.is_thinking:
                 message.content.append(TextPromptMessageContent(data="</think>\n\n"))
-            if chunk.usage_metadata:
-                prompt_tokens = chunk.usage_metadata.prompt_token_count or 0
-                if chunk.usage_metadata.thoughts_token_count:
-                    completion_tokens = (
-                        completion_tokens + chunk.usage_metadata.thoughts_token_count
-                    )
+            # [ Pricing ]
+            # https://ai.google.dev/gemini-api/docs/pricing?hl=zh-cn#gemini-2.5-pro
+            # FIXME: Currently, Dify's pricing model cannot cover the tokens of multimodal resources
+            # FIXME: Unable to track caching, Grounding, Live API
+            if usage_metadata := chunk.usage_metadata:
+                # The pricing of tokens varies depending on the input modality.
+                prompt_tokens_standard = 0
+                for _mtc in usage_metadata.prompt_tokens_details:
+                    if _mtc.modality in [
+                        types.MediaModality.TEXT,
+                        types.MediaModality.IMAGE,
+                        types.MediaModality.VIDEO,
+                        types.MediaModality.MODALITY_UNSPECIFIED,
+                    ]:
+                        prompt_tokens_standard += _mtc.token_count
+                    elif _mtc.modality == types.MediaModality.AUDIO:
+                        prompt_tokens_standard += _mtc.token_count
+                    elif _mtc.modality == types.MediaModality.DOCUMENT:
+                        prompt_tokens_standard += _mtc.token_count
+
+                # Number of tokens present in thoughts output.
+                thoughts_token_count = usage_metadata.thoughts_token_count or 0
+                # Number of tokens in the response(s).
+                candidates_token_count = usage_metadata.candidates_token_count or 0
+                # The reasoning content and final answer of the Gemini model are priced using the same standard.
+                completion_tokens = thoughts_token_count + candidates_token_count
+                # The `prompt_tokens` includes the historical conversation QA plus the current input.
+                prompt_tokens = prompt_tokens_standard
+
+            # Fallback to the number of tokens in the prompt if the completion tokens are not available.
             if prompt_tokens == 0 or completion_tokens == 0:
                 prompt_tokens = self.get_num_tokens(
                     model=model, credentials=credentials, prompt_messages=prompt_messages
diff --git a/models/gemini/pyproject.toml b/models/gemini/pyproject.toml
index 6fe3ef911..ca35ad565 100644
--- a/models/gemini/pyproject.toml
+++ b/models/gemini/pyproject.toml
@@ -8,7 +8,7 @@ requires-python = ">=3.12"
 # uv pip compile pyproject.toml -o ./requirements.txt
 dependencies = [
     "dify-plugin>=0.4.3,<0.5.0",
-    "google-genai>=1.27.0,<2.0.0",
+    "google-genai>=1.29.0,<2.0.0",
     "google-generativeai>=0.8.5",
     "numpy>=2.3.2",
 ]

From 075bfe9866f4e4df6cfb102557c582040eeae05e Mon Sep 17 00:00:00 2001
From: QIN2DIM <62018067+QIN2DIM@users.noreply.github.com>
Date: Tue, 12 Aug 2025 13:58:24 +0800
Subject: [PATCH 2/4] refactor(llm): extract token calculation logic into
 helper method

Extracted the token calculation logic from multiple locations in the
GoogleLargeLanguageModel class into a static method
`_calculate_tokens_from_usage_metadata`. This improves code maintainability
and reduces duplication across different parts of the codebase that handle
token counting.

The new method:
- Handles all token types including text, image, video, audio, and document modalities
- Correctly calculates prompt and completion tokens based on usage metadata
- Includes proper fallback to manual token counting when metadata is unavailable

The change also removes redundant comments and improves code readability
while preserving all existing functionality.
---
 models/gemini/models/llm/llm.py | 109 +++++++++++++++++++++-----------------
 1 file changed, 52 insertions(+), 57 deletions(-)

diff --git a/models/gemini/models/llm/llm.py b/models/gemini/models/llm/llm.py
index f75b444c7..dbba3ba30 100644
--- a/models/gemini/models/llm/llm.py
+++ b/models/gemini/models/llm/llm.py
@@ -235,6 +235,48 @@ def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]
             ],
         }

+    @staticmethod
+    def _calculate_tokens_from_usage_metadata(
+        usage_metadata: types.GenerateContentResponseUsageMetadata | None,
+    ) -> tuple[int, int]:
+        """
+        Calculate prompt and completion tokens from usage metadata.
+
+        :param usage_metadata: Usage metadata from Gemini response
+        :return: Tuple of (prompt_tokens, completion_tokens)
+        """
+        if not usage_metadata:
+            return 0, 0
+
+        # The pricing of tokens varies depending on the input modality.
+        prompt_tokens_standard = 0
+
+        # [ Pricing ]
+        # https://ai.google.dev/gemini-api/docs/pricing?hl=zh-cn#gemini-2.5-pro
+        # FIXME: Currently, Dify's pricing model cannot cover the tokens of multimodal resources
+        # FIXME: Unable to track caching, Grounding, Live API
+        for _mtc in usage_metadata.prompt_tokens_details:
+            if _mtc.modality in [
+                types.MediaModality.TEXT,
+                types.MediaModality.IMAGE,
+                types.MediaModality.VIDEO,
+                types.MediaModality.MODALITY_UNSPECIFIED,
+                types.MediaModality.AUDIO,
+                types.MediaModality.DOCUMENT,
+            ]:
+                prompt_tokens_standard += _mtc.token_count
+
+        # Number of tokens present in thoughts output.
+        thoughts_token_count = usage_metadata.thoughts_token_count or 0
+        # Number of tokens in the response(s).
+        candidates_token_count = usage_metadata.candidates_token_count or 0
+        # The reasoning content and final answer of the Gemini model are priced using the same standard.
+        completion_tokens = thoughts_token_count + candidates_token_count
+        # The `prompt_tokens` includes the historical conversation QA plus the current input.
+        prompt_tokens = prompt_tokens_standard
+
+        return prompt_tokens, completion_tokens
+
     def validate_credentials(self, model: str, credentials: dict) -> None:
         """
         Validate model credentials
@@ -499,34 +541,11 @@ def _handle_generate_response(
         assistant_prompt_message = AssistantPromptMessage(content=response.text)

         # calculate num tokens
-        prompt_tokens = 0
-        completion_tokens = 0
-
-        if usage_metadata := response.usage_metadata:
-            # The pricing of tokens varies depending on the input modality.
-            prompt_tokens_standard = 0
-            for _mtc in usage_metadata.prompt_tokens_details:
-                if _mtc.modality in [
-                    types.MediaModality.TEXT,
-                    types.MediaModality.IMAGE,
-                    types.MediaModality.VIDEO,
-                    types.MediaModality.MODALITY_UNSPECIFIED,
-                ]:
-                    prompt_tokens_standard += _mtc.token_count
-                elif _mtc.modality == types.MediaModality.AUDIO:
-                    prompt_tokens_standard += _mtc.token_count
-                elif _mtc.modality == types.MediaModality.DOCUMENT:
-                    prompt_tokens_standard += _mtc.token_count
-
-            # Number of tokens present in thoughts output.
-            thoughts_token_count = usage_metadata.thoughts_token_count or 0
-            # Number of tokens in the response(s).
-            candidates_token_count = usage_metadata.candidates_token_count or 0
-            # The reasoning content and final answer of the Gemini model are priced using the same standard.
-            completion_tokens = thoughts_token_count + candidates_token_count
-            # The `prompt_tokens` includes the historical conversation QA plus the current input.
-            prompt_tokens = prompt_tokens_standard
+        prompt_tokens, completion_tokens = self._calculate_tokens_from_usage_metadata(
+            response.usage_metadata
+        )

+        # Fallback to manual calculation if tokens are not available
         if prompt_tokens == 0 or completion_tokens == 0:
             prompt_tokens = self.get_num_tokens(model, credentials, prompt_messages)
             completion_tokens = self.get_num_tokens(model, credentials, [assistant_prompt_message])
@@ -605,36 +624,12 @@ def _handle_generate_stream_response(
             # If we're still in thinking mode at the end, close it
             if self.is_thinking:
                 message.content.append(TextPromptMessageContent(data="</think>\n\n"))
-            # [ Pricing ]
-            # https://ai.google.dev/gemini-api/docs/pricing?hl=zh-cn#gemini-2.5-pro
-            # FIXME: Currently, Dify's pricing model cannot cover the tokens of multimodal resources
-            # FIXME: Unable to track caching, Grounding, Live API
-            if usage_metadata := chunk.usage_metadata:
-                # The pricing of tokens varies depending on the input modality.
-                prompt_tokens_standard = 0
-                for _mtc in usage_metadata.prompt_tokens_details:
-                    if _mtc.modality in [
-                        types.MediaModality.TEXT,
-                        types.MediaModality.IMAGE,
-                        types.MediaModality.VIDEO,
-                        types.MediaModality.MODALITY_UNSPECIFIED,
-                    ]:
-                        prompt_tokens_standard += _mtc.token_count
-                    elif _mtc.modality == types.MediaModality.AUDIO:
-                        prompt_tokens_standard += _mtc.token_count
-                    elif _mtc.modality == types.MediaModality.DOCUMENT:
-                        prompt_tokens_standard += _mtc.token_count
-
-                # Number of tokens present in thoughts output.
-                thoughts_token_count = usage_metadata.thoughts_token_count or 0
-                # Number of tokens in the response(s).
-                candidates_token_count = usage_metadata.candidates_token_count or 0
-                # The reasoning content and final answer of the Gemini model are priced using the same standard.
-                completion_tokens = thoughts_token_count + candidates_token_count
-                # The `prompt_tokens` includes the historical conversation QA plus the current input.
-                prompt_tokens = prompt_tokens_standard
-
-            # Fallback to the number of tokens in the prompt if the completion tokens are not available.
+
+            prompt_tokens, completion_tokens = self._calculate_tokens_from_usage_metadata(
+                chunk.usage_metadata
+            )
+
+            # Fallback to manual calculation if tokens are not available
             if prompt_tokens == 0 or completion_tokens == 0:
                 prompt_tokens = self.get_num_tokens(
                     model=model, credentials=credentials, prompt_messages=prompt_messages

From 0bfa25b6aa1f11daaf3433ed1f449bf218a095a7 Mon Sep 17 00:00:00 2001
From: QIN2DIM <62018067+QIN2DIM@users.noreply.github.com>
Date: Tue, 12 Aug 2025 14:01:11 +0800
Subject: [PATCH 3/4] Update llm.py

---
 models/gemini/models/llm/llm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/models/gemini/models/llm/llm.py b/models/gemini/models/llm/llm.py
index dbba3ba30..df04e9a66 100644
--- a/models/gemini/models/llm/llm.py
+++ b/models/gemini/models/llm/llm.py
@@ -392,6 +392,7 @@ def _generate(
             config.tools.append(self._convert_tools_to_gemini_tool(tools))

         # == InvokeModel == #
+
         if stream:
             response = genai_client.models.generate_content_stream(
                 model=model, contents=contents, config=config

From 0511cea78663fa2c93c9740231e55bc993582f7f Mon Sep 17 00:00:00 2001
From: QIN2DIM <62018067+QIN2DIM@users.noreply.github.com>
Date: Tue, 12 Aug 2025 14:32:40 +0800
Subject: [PATCH 4/4] fix(gemini): update google-genai to version 1.29.0

Updated the google-genai dependency from version 1.27.0 to 1.29.0 to include
the latest bug fixes and improvements. This ensures compatibility with the
latest features and security patches from the Google Generative AI library.
---
 models/gemini/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/models/gemini/requirements.txt b/models/gemini/requirements.txt
index fb84fd5ea..f52c76140 100644
--- a/models/gemini/requirements.txt
+++ b/models/gemini/requirements.txt
@@ -52,7 +52,7 @@ google-auth==2.40.3
     # via
     #   google-api-python-client
     #   google-generativeai
 google-auth-httplib2==0.2.0
     # via google-api-python-client
-google-genai==1.27.0
+google-genai==1.29.0
     # via gemini-g9cie8 (pyproject.toml)
 google-generativeai==0.8.5
     # via gemini-g9cie8 (pyproject.toml)
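
Note: the short sketch below is illustrative only and is not part of the patch series above. Assuming google-genai >= 1.29.0 is installed, it builds a usage-metadata object shaped like the "-- Usage Sample --" docstring added in patch 1 and reproduces the aggregation performed by the `_calculate_tokens_from_usage_metadata` helper extracted in patch 2: sum the per-modality prompt token counts, and count thoughts plus candidates as completion tokens. The helper enumerates every MediaModality explicitly, so the plain sum over prompt_tokens_details used here is equivalent.

    from google.genai import types

    # Usage metadata shaped like the streaming sample documented in patch 1.
    usage_metadata = types.GenerateContentResponseUsageMetadata(
        candidates_token_count=58,
        prompt_token_count=24,
        prompt_tokens_details=[
            types.ModalityTokenCount(modality=types.MediaModality.TEXT, token_count=24),
        ],
        thoughts_token_count=862,
        total_token_count=944,
    )

    # Prompt side: every entry in prompt_tokens_details is billed, regardless of modality.
    prompt_tokens = sum(
        mtc.token_count or 0 for mtc in usage_metadata.prompt_tokens_details or []
    )
    # Completion side: reasoning ("thoughts") and the final candidates are priced the same way.
    completion_tokens = (usage_metadata.thoughts_token_count or 0) + (
        usage_metadata.candidates_token_count or 0
    )

    print(prompt_tokens, completion_tokens)  # -> 24 920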