diff --git a/models/gemini/manifest.yaml b/models/gemini/manifest.yaml
index 820db5879..7220df473 100644
--- a/models/gemini/manifest.yaml
+++ b/models/gemini/manifest.yaml
@@ -34,4 +34,4 @@ resource:
     tool:
       enabled: true
 type: plugin
-version: 0.4.0
+version: 0.4.1
diff --git a/models/gemini/models/llm/llm.py b/models/gemini/models/llm/llm.py
index e63b1fd1c..df04e9a66 100644
--- a/models/gemini/models/llm/llm.py
+++ b/models/gemini/models/llm/llm.py
@@ -235,6 +235,48 @@ def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]
             ],
         }
 
+    @staticmethod
+    def _calculate_tokens_from_usage_metadata(
+        usage_metadata: types.GenerateContentResponseUsageMetadata | None,
+    ) -> tuple[int, int]:
+        """
+        Calculate prompt and completion tokens from usage metadata.
+
+        :param usage_metadata: Usage metadata from Gemini response
+        :return: Tuple of (prompt_tokens, completion_tokens)
+        """
+        if not usage_metadata:
+            return 0, 0
+
+        # The pricing of tokens varies depending on the input modality.
+        prompt_tokens_standard = 0
+
+        # [ Pricing ]
+        # https://ai.google.dev/gemini-api/docs/pricing?hl=zh-cn#gemini-2.5-pro
+        # FIXME: Currently, Dify's pricing model cannot cover the tokens of multimodal resources
+        # FIXME: Unable to track caching, Grounding, Live API
+        for _mtc in usage_metadata.prompt_tokens_details:
+            if _mtc.modality in [
+                types.MediaModality.TEXT,
+                types.MediaModality.IMAGE,
+                types.MediaModality.VIDEO,
+                types.MediaModality.MODALITY_UNSPECIFIED,
+                types.MediaModality.AUDIO,
+                types.MediaModality.DOCUMENT,
+            ]:
+                prompt_tokens_standard += _mtc.token_count
+
+        # Number of tokens present in thoughts output.
+        thoughts_token_count = usage_metadata.thoughts_token_count or 0
+        # Number of tokens in the response(s).
+        candidates_token_count = usage_metadata.candidates_token_count or 0
+        # The reasoning content and final answer of the Gemini model are priced using the same standard.
+        completion_tokens = thoughts_token_count + candidates_token_count
+        # The `prompt_tokens` includes the historical conversation QA plus the current input.
+        prompt_tokens = prompt_tokens_standard
+
+        return prompt_tokens, completion_tokens
+
     def validate_credentials(self, model: str, credentials: dict) -> None:
         """
         Validate model credentials
@@ -500,33 +542,32 @@ def _handle_generate_response(
         assistant_prompt_message = AssistantPromptMessage(content=response.text)
 
         # calculate num tokens
-        if response.usage_metadata:
-            prompt_tokens = response.usage_metadata.prompt_token_count
-            if prompt_tokens is None:
-                raise ValueError("prompt_tokens is None")
-            completion_tokens = response.usage_metadata.candidates_token_count
-            if completion_tokens is None:
-                raise ValueError("completion_tokens is None")
-        else:
+        prompt_tokens, completion_tokens = self._calculate_tokens_from_usage_metadata(
+            response.usage_metadata
+        )
+
+        # Fallback to manual calculation if tokens are not available
+        if prompt_tokens == 0 or completion_tokens == 0:
             prompt_tokens = self.get_num_tokens(model, credentials, prompt_messages)
             completion_tokens = self.get_num_tokens(model, credentials, [assistant_prompt_message])
 
         # transform usage
         # copy credentials to avoid modifying the original dict
         usage = self._calc_response_usage(
-            model, dict(credentials), prompt_tokens, completion_tokens
+            model=model,
+            credentials=dict(credentials),
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
         )
 
         # transform response
-        result = LLMResult(
+        return LLMResult(
             model=model,
             prompt_messages=prompt_messages,
             message=assistant_prompt_message,
             usage=usage,
         )
 
-        return result
-
     def _handle_generate_stream_response(
         self,
         model: str,
@@ -537,6 +578,20 @@ def _handle_generate_stream_response(
         """
         Handle llm stream response
 
+        # -- Usage Sample -- #
+        chunk.usage_metadata=GenerateContentResponseUsageMetadata(
+            candidates_token_count=58,
+            prompt_token_count=24,
+            prompt_tokens_details=[
+                ModalityTokenCount(
+                    modality=<MediaModality.TEXT: 'TEXT'>,
+                    token_count=24
+                ),
+            ],
+            thoughts_token_count=862,
+            total_token_count=944
+        )
+
         :param model: model name
         :param credentials: credentials
         :param response: response
@@ -547,6 +602,7 @@ def _handle_generate_stream_response(
         prompt_tokens = 0
         completion_tokens = 0
         self.is_thinking = False
+
         for chunk in response:
             if not chunk.candidates:
                 continue
@@ -557,10 +613,6 @@ def _handle_generate_stream_response(
             message = self._parse_parts(candidate.content.parts)
             index += len(candidate.content.parts)
 
-            # TODO(QIN2DIM): Fix the issue of inaccurate counting of Gemini Tokens
-            if chunk.usage_metadata:
-                completion_tokens += chunk.usage_metadata.candidates_token_count or 0
-
             # if the stream is not finished, yield the chunk
             if not candidate.finish_reason:
                 yield LLMResultChunk(
@@ -573,12 +625,12 @@ def _handle_generate_stream_response(
                 # If we're still in thinking mode at the end, close it
                 if self.is_thinking:
                     message.content.append(TextPromptMessageContent(data="\n\n"))
-                if chunk.usage_metadata:
-                    prompt_tokens = chunk.usage_metadata.prompt_token_count or 0
-                    if chunk.usage_metadata.thoughts_token_count:
-                        completion_tokens = (
-                            completion_tokens + chunk.usage_metadata.thoughts_token_count
-                        )
+
+                prompt_tokens, completion_tokens = self._calculate_tokens_from_usage_metadata(
+                    chunk.usage_metadata
+                )
+
+                # Fallback to manual calculation if tokens are not available
                 if prompt_tokens == 0 or completion_tokens == 0:
                     prompt_tokens = self.get_num_tokens(
                         model=model, credentials=credentials, prompt_messages=prompt_messages
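
As a quick illustration (not part of the patch): a minimal, self-contained sketch of the arithmetic the new _calculate_tokens_from_usage_metadata helper performs, driven by the usage sample quoted in the stream handler's docstring (24 TEXT prompt tokens, 58 candidate tokens, 862 thought tokens). The `or 0` / `or []` guards around optional fields are assumptions of this sketch, not lines from the diff.

from google.genai import types

# Metadata mirroring the "Usage Sample" in the docstring above.
usage_metadata = types.GenerateContentResponseUsageMetadata(
    candidates_token_count=58,
    prompt_token_count=24,
    prompt_tokens_details=[
        types.ModalityTokenCount(modality=types.MediaModality.TEXT, token_count=24),
    ],
    thoughts_token_count=862,
    total_token_count=944,
)

# Prompt-side modalities counted at the standard rate, as enumerated in the helper.
billable_modalities = {
    types.MediaModality.TEXT,
    types.MediaModality.IMAGE,
    types.MediaModality.VIDEO,
    types.MediaModality.MODALITY_UNSPECIFIED,
    types.MediaModality.AUDIO,
    types.MediaModality.DOCUMENT,
}

# Prompt tokens: sum of per-modality counts from prompt_tokens_details.
prompt_tokens = sum(
    (mtc.token_count or 0)
    for mtc in (usage_metadata.prompt_tokens_details or [])
    if mtc.modality in billable_modalities
)

# Completion tokens: reasoning ("thoughts") plus the visible answer, priced the same way.
completion_tokens = (usage_metadata.thoughts_token_count or 0) + (
    usage_metadata.candidates_token_count or 0
)

print(prompt_tokens, completion_tokens)  # 24 920

With this sample, both the blocking and streaming handlers report 24 prompt tokens and 920 completion tokens, falling back to get_num_tokens only when either count comes out as zero.
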
diff --git a/models/gemini/pyproject.toml b/models/gemini/pyproject.toml
index 6fe3ef911..ca35ad565 100644
--- a/models/gemini/pyproject.toml
+++ b/models/gemini/pyproject.toml
@@ -8,7 +8,7 @@ requires-python = ">=3.12"
 # uv pip compile pyproject.toml -o ./requirements.txt
 dependencies = [
     "dify-plugin>=0.4.3,<0.5.0",
-    "google-genai>=1.27.0,<2.0.0",
+    "google-genai>=1.29.0,<2.0.0",
     "google-generativeai>=0.8.5",
     "numpy>=2.3.2",
 ]
diff --git a/models/gemini/requirements.txt b/models/gemini/requirements.txt
index fb84fd5ea..f52c76140 100644
--- a/models/gemini/requirements.txt
+++ b/models/gemini/requirements.txt
@@ -52,7 +52,7 @@ google-auth==2.40.3
     #   google-generativeai
 google-auth-httplib2==0.2.0
     # via google-api-python-client
-google-genai==1.27.0
+google-genai==1.29.0
     # via gemini-g9cie8 (pyproject.toml)
 google-generativeai==0.8.5
     # via gemini-g9cie8 (pyproject.toml)
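
The google-genai floor moves from 1.27.0 to 1.29.0, presumably so the usage-metadata fields read above are available at runtime. A rough sanity check of a local environment, assuming (as the usage sample suggests) that these types are keyword-constructible models whose fields are all optional:

from google.genai import types

# Assumption: every field on the usage-metadata model is optional, so an empty
# instance still exposes the attributes the patched helper reads (as None).
meta = types.GenerateContentResponseUsageMetadata()
for name in ("prompt_tokens_details", "thoughts_token_count", "candidates_token_count"):
    print(name, hasattr(meta, name), getattr(meta, name, None))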