diff --git a/packages/uipath-platform/pyproject.toml b/packages/uipath-platform/pyproject.toml
index 4a5c0b83c..50dd25553 100644
--- a/packages/uipath-platform/pyproject.toml
+++ b/packages/uipath-platform/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "uipath-platform"
-version = "0.1.62"
+version = "0.1.63"
 description = "HTTP client library for programmatic access to UiPath Platform"
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"
diff --git a/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py b/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py
index ffe0bff99..cb02d8af3 100644
--- a/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py
+++ b/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py
@@ -401,7 +401,7 @@ async def chat_completions(
         presence_penalty: float = 0,
         top_p: float | None = 1,
         top_k: int | None = None,
-        tools: list[ToolDefinition] | None = None,
+        tools: list[ToolDefinition | dict[str, Any]] | None = None,
         tool_choice: ToolChoice | None = None,
         response_format: dict[str, Any] | type[BaseModel] | None = None,
         api_version: str = NORMALIZED_API_VERSION,
@@ -436,9 +436,11 @@ async def chat_completions(
                 Controls diversity by considering only the top p probability mass. Defaults to 1.
             top_k (int, optional): Nucleus sampling parameter.
                 Controls diversity by considering only the top k most probable tokens. Defaults to None.
-            tools (Optional[List[ToolDefinition]], optional): List of tool definitions that the
-                model can call. Tools enable the model to perform actions or retrieve information
-                beyond text generation. Defaults to None.
+            tools (Optional[List[ToolDefinition | dict]], optional): List of tool definitions
+                that the model can call. Tools enable the model to perform actions or retrieve
+                information beyond text generation. A tool given as a dict must already be in
+                UiPath wire format and is forwarded unchanged, which allows arbitrary nested
+                JSON schemas in its parameters. Defaults to None.
             tool_choice (Optional[ToolChoice], optional): Controls which tools the model can call.
                 Can be "auto" (model decides), "none" (no tools), or a specific tool choice.
                 Defaults to None.
@@ -583,10 +585,15 @@ class Country(BaseModel):
                 # Use provided dictionary format directly
                 request_body["response_format"] = response_format
 
-        # Add tools if provided - convert to UiPath format
+        # Add tools if provided. A tool already in UiPath wire format (a dict) is
+        # passed through unchanged so callers can supply an arbitrary JSON schema
+        # for the parameters; ToolDefinition objects are converted as before.
         if tools:
             request_body["tools"] = [
-                self._convert_tool_to_uipath_format(tool) for tool in tools
+                tool
+                if isinstance(tool, dict)
+                else self._convert_tool_to_uipath_format(tool)
+                for tool in tools
             ]
 
         # Handle tool_choice
diff --git a/packages/uipath-platform/tests/services/test_uipath_llm_integration.py b/packages/uipath-platform/tests/services/test_uipath_llm_integration.py
index 124ccad8b..9e2292c60 100644
--- a/packages/uipath-platform/tests/services/test_uipath_llm_integration.py
+++ b/packages/uipath-platform/tests/services/test_uipath_llm_integration.py
@@ -7,6 +7,7 @@
 from uipath.platform.chat import (
     AutoToolChoice,
     ChatModels,
+    RequiredToolChoice,
     SpecificToolChoice,
     ToolDefinition,
     ToolFunctionDefinition,
@@ -369,6 +370,87 @@ async def test_tool_call_required_mocked(self, mock_request, llm_service):
         assert result.choices[0].message.tool_calls[0].arguments["name"] == "John"
         assert result.choices[0].message.tool_calls[0].arguments["password"] == "1234"
 
+    @pytest.mark.asyncio
+    @patch.object(UiPathLlmChatService, "request_async")
+    async def test_raw_dict_tool_passthrough_mocked(self, mock_request, llm_service):
+        """A tool supplied as a raw dict is sent unchanged, preserving nested schema.
+
+        ToolDefinition's converter only emits flat properties, so callers that need
+        an arbitrary nested JSON schema (e.g. the eval mockers) pass the tool as a
+        dict already in UiPath wire format. It must reach the gateway verbatim.
+        """
+        mock_response = MagicMock()
+        mock_response.json.return_value = {
+            "id": "chatcmpl-raw",
+            "object": "chat.completion",
+            "created": 1677858242,
+            "model": "gpt-4o-mini-2024-07-18",
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_raw",
+                                "name": "submit_tool_response",
+                                "arguments": {"response": {"items": [{"sku": "A1"}]}},
+                            }
+                        ],
+                    },
+                    "finish_reason": "tool_calls",
+                }
+            ],
+            "usage": {
+                "prompt_tokens": 10,
+                "completion_tokens": 5,
+                "total_tokens": 15,
+                "cache_read_input_tokens": None,
+            },
+        }
+        mock_request.return_value = mock_response
+
+        nested_tool = {
+            "name": "submit_tool_response",
+            "description": "Return the simulated response matching the schema.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "response": {
+                        "type": "object",
+                        "properties": {
+                            "items": {
+                                "type": "array",
+                                "items": {
+                                    "type": "object",
+                                    "properties": {"sku": {"type": "string"}},
+                                },
+                            }
+                        },
+                    }
+                },
+                "required": ["response"],
+            },
+        }
+
+        result = await llm_service.chat_completions(
+            messages=[{"role": "user", "content": "go"}],
+            model=ChatModels.gpt_4_1_mini_2025_04_14,
+            tools=[nested_tool],
+            tool_choice=RequiredToolChoice(),
+        )
+
+        mock_request.assert_called_once()
+        _, kwargs = mock_request.call_args
+        body = kwargs["json"]
+        # The dict tool is forwarded unchanged, nested array schema intact.
+        assert body["tools"] == [nested_tool]
+        assert body["tool_choice"] == {"type": "required"}
+        assert result.choices[0].message.tool_calls[0].arguments == {
+            "response": {"items": [{"sku": "A1"}]}
+        }
+
     @pytest.mark.asyncio
     @patch.object(UiPathLlmChatService, "request_async")
     async def test_chat_with_conversation_history_mocked(
diff --git a/packages/uipath-platform/uv.lock b/packages/uipath-platform/uv.lock
index ac4761377..2f56c1df5 100644
--- a/packages/uipath-platform/uv.lock
+++ b/packages/uipath-platform/uv.lock
@@ -1095,7 +1095,7 @@ dev = [
 
 [[package]]
 name = "uipath-platform"
-version = "0.1.62"
+version = "0.1.63"
 source = { editable = "." }
 dependencies = [
     { name = "httpx" },
diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml
index 79881bbc5..ffaa7f881 100644
--- a/packages/uipath/pyproject.toml
+++ b/packages/uipath/pyproject.toml
@@ -1,13 +1,13 @@
 [project]
 name = "uipath"
-version = "2.10.80"
+version = "2.10.81"
 description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"
 dependencies = [
   "uipath-core>=0.5.17, <0.6.0",
   "uipath-runtime>=0.11.0, <0.12.0",
-  "uipath-platform>=0.1.60, <0.2.0",
+  "uipath-platform>=0.1.63, <0.2.0",
   "click>=8.3.1",
   "httpx>=0.28.1",
   "pyjwt>=2.10.1",
diff --git a/packages/uipath/src/uipath/eval/mocks/_input_mocker.py b/packages/uipath/src/uipath/eval/mocks/_input_mocker.py
index 57a727ec1..a542fc7ad 100644
--- a/packages/uipath/src/uipath/eval/mocks/_input_mocker.py
+++ b/packages/uipath/src/uipath/eval/mocks/_input_mocker.py
@@ -15,6 +15,7 @@
 from .._execution_context import eval_set_run_id_context
 from ._mock_context import cache_manager_context
 from ._mocker import UiPathInputMockingError
+from ._structured_output import generate_structured_output
 from ._types import (
     InputMockingStrategy,
 )
@@ -105,15 +106,6 @@ async def generate_llm_input(
 
         prompt = get_input_mocking_prompt(**prompt_generation_args)
 
-        response_format = {
-            "type": "json_schema",
-            "json_schema": {
-                "name": "agent_input",
-                "strict": False,
-                "schema": input_schema,
-            },
-        }
-
         model_parameters = mocking_strategy.model if mocking_strategy else None
         completion_kwargs = (
             model_parameters.model_dump(by_alias=False, exclude_none=True)
@@ -128,7 +120,7 @@ async def generate_llm_input(
 
         if cache_manager is not None:
             cache_key_data = {
-                "response_format": response_format,
+                "input_schema": input_schema,
                 "completion_kwargs": completion_kwargs,
                 "prompt_generation_args": prompt_generation_args,
             }
@@ -142,15 +134,15 @@ async def generate_llm_input(
             if cached_response is not None:
                 return cached_response
 
-        response = await llm.chat_completions(
+        result = await generate_structured_output(
+            llm,
             [{"role": "user", "content": prompt}],
-            response_format=response_format,
-            **completion_kwargs,
+            schema=input_schema,
+            response_format_name="agent_input",
+            description="Return the simulated agent input matching the required schema.",
+            completion_kwargs=completion_kwargs,
         )
 
-        generated_input_str = response.choices[0].message.content
-        result = json.loads(generated_input_str)
-
         if cache_manager is not None:
             cache_manager.set(
                 mocker_type="input_mocker",
@@ -160,10 +152,6 @@ async def generate_llm_input(
             )
 
         return result
-    except json.JSONDecodeError as e:
-        raise UiPathInputMockingError(
-            f"Failed to parse LLM response as JSON: {str(e)}"
-        ) from e
     except UiPathInputMockingError:
         raise
     except Exception as e:
diff --git a/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py b/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py
index d1fd2a1c9..a9ab7005e 100644
--- a/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py
+++ b/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py
@@ -28,6 +28,7 @@
     UiPathMockResponseGenerationError,
     UiPathNoMockFoundError,
 )
+from ._structured_output import generate_structured_output
 from ._types import (
     ExampleCall,
     LLMMockingStrategy,
@@ -125,14 +126,7 @@ async def response(
                 "output_schema", TypeAdapter(return_type).json_schema()
             )
 
-            response_format = {
-                "type": "json_schema",
-                "json_schema": {
-                    "name": "OutputSchema",
-                    "strict": False,
-                    "schema": _cleanup_schema(output_schema),
-                },
-            }
+            cleaned_schema = _cleanup_schema(output_schema)
             try:
                 # Safely pull examples from params.
                 example_calls = params.get("example_calls", [])
@@ -197,7 +191,7 @@ async def response(
                 formatted_prompt = PROMPT.format(**prompt_generation_args)
 
                 cache_key_data = {
-                    "response_format": response_format,
+                    "output_schema": cleaned_schema,
                     "completion_kwargs": completion_kwargs,
                     "prompt_generation_args": prompt_generation_args,
                 }
@@ -213,17 +207,17 @@ async def response(
                     if cached_response is not None:
                         return cached_response
 
-                response = await llm.chat_completions(
-                    [
-                        {
-                            "role": "user",
-                            "content": formatted_prompt,
-                        },
-                    ],
-                    response_format=response_format,
-                    **completion_kwargs,
+                result = await generate_structured_output(
+                    llm,
+                    [{"role": "user", "content": formatted_prompt}],
+                    schema=cleaned_schema,
+                    response_format_name="OutputSchema",
+                    description=(
+                        "Return the simulated response for tool "
+                        f"'{function_name}' matching the required schema."
+                    ),
+                    completion_kwargs=completion_kwargs,
                 )
-                result = json.loads(response.choices[0].message.content)
 
                 if cache_manager is not None:
                     cache_manager.set(
@@ -235,7 +229,7 @@ async def response(
 
                 return result
             except Exception as e:
-                raise UiPathMockResponseGenerationError() from e
+                raise UiPathMockResponseGenerationError(str(e)) from e
         else:
             raise UiPathNoMockFoundError(f"Method '{function_name}' is not simulated.")
 
diff --git a/packages/uipath/src/uipath/eval/mocks/_structured_output.py b/packages/uipath/src/uipath/eval/mocks/_structured_output.py
new file mode 100644
index 000000000..599780353
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/mocks/_structured_output.py
@@ -0,0 +1,259 @@
+"""Provider-aware structured output for the eval mockers.
+
+The normalized LLM Gateway handles OpenAI-style ``response_format``
+(json_schema) differently per provider — live-verified against the gateway:
+
+- **OpenAI**: honors ``response_format`` and returns valid JSON content,
+  including native ``$defs`` support.
+- **Anthropic (Claude)**: ignores it and answers with plain prose content.
+- **Gemini**: returns empty content.
+
+Forced function calling works across all three providers, so each provider
+gets a small strategy class: OpenAI prefers ``response_format`` (more reliable
+for it on some schemas) with a tool-call fallback; Claude and Gemini go
+straight to the forced tool call; unknown providers try ``response_format``
+first and fall back.
+"""
+
+import json
+import logging
+from typing import Any
+
+from uipath.platform.chat.llm_gateway import RequiredToolChoice
+
+RESPONSE_TOOL_NAME = "submit_tool_response"
+RESPONSE_KEY = "response"
+_DEFS_PREFIX = "#/$defs/"
+
+logger = logging.getLogger(__name__)
+
+
+def _inline_defs(
+    schema: dict[str, Any],
+) -> tuple[dict[str, Any], dict[str, Any]]:
+    """Inline ``$defs``/``$ref`` into a self-contained schema.
+
+    Nested Pydantic models and enums emit root ``$defs`` referenced by ``$ref``.
+    The normalized gateway accepts those in ``response_format`` but not inside a
+    tool's ``parameters``, so they are inlined here. Sibling keys on a ``$ref``
+    node (e.g. a field ``description``) are merged over the inlined definition.
+    Self-referential definitions cannot be inlined without looping; a ``$ref``
+    whose target is already on the current resolution path is left untouched.
+    Its definition is returned resolved the same way, so the leftover ``$defs``
+    only reference definitions that are returned too.
+
+    Returns:
+        A tuple of (inlined schema, leftover ``$defs`` needed for cyclic refs).
+    """
+    defs = schema.get("$defs", {})
+    leftover: dict[str, Any] = {}
+
+    def resolve(node: Any, active: frozenset[str]) -> Any:
+        if isinstance(node, dict):
+            ref = node.get("$ref")
+            if isinstance(ref, str) and ref.startswith(_DEFS_PREFIX):
+                name = ref[len(_DEFS_PREFIX) :]
+                if name in defs and name not in active:
+                    resolved = resolve(defs[name], active | {name})
+                    siblings = {
+                        key: resolve(value, active)
+                        for key, value in node.items()
+                        if key not in ("$ref", "$defs")
+                    }
+                    if isinstance(resolved, dict):
+                        return {**resolved, **siblings}
+                    return resolved
+                # Cyclic or unknown ref: keep it; keep its definition, resolved.
+                if name in defs and name not in leftover:
+                    leftover[name] = defs[name]  # placeholder stops re-entry
+                    leftover[name] = resolve(defs[name], frozenset({name}))
+                return dict(node)
+            return {
+                key: resolve(value, active)
+                for key, value in node.items()
+                if key != "$defs"
+            }
+        if isinstance(node, list):
+            return [resolve(item, active) for item in node]
+        return node
+
+    root = {key: value for key, value in schema.items() if key != "$defs"}
+    return resolve(root, frozenset()), leftover
+
+
+def build_response_tool(schema: dict[str, Any], description: str) -> dict[str, Any]:
+    """Build a normalized-API function tool that wraps ``schema`` under ``response``.
+
+    Tool-call arguments are always a JSON object, so an arbitrary output schema
+    (which may be a scalar, array, or object) is nested under a single required
+    ``response`` property and unwrapped after the call. ``$defs``/``$ref`` are
+    inlined because the gateway needs self-contained tool schemas (unlike
+    ``response_format``); definitions kept for cyclic refs go under ``$defs``.
+    """
+    response_schema, leftover_defs = _inline_defs(schema)
+    parameters: dict[str, Any] = {
+        "type": "object",
+        "properties": {RESPONSE_KEY: response_schema},
+        "required": [RESPONSE_KEY],
+    }
+    if leftover_defs:  # non-empty only when the schema had cyclic refs
+        parameters["$defs"] = leftover_defs
+
+    return {
+        "name": RESPONSE_TOOL_NAME,
+        "description": description,
+        "parameters": parameters,
+    }
+
+
+def extract_response(response: Any) -> Any:
+    """Extract the wrapped value from the forced tool call.
+
+    Raises:
+        ValueError: if the response carries no choices or tool calls, or the first
+            tool call's arguments are not a dict holding the ``response`` key.
+    """
+    choices = getattr(response, "choices", None)
+    if not choices:
+        raise ValueError("LLM response contained no choices")
+
+    message = choices[0].message
+    tool_calls = getattr(message, "tool_calls", None)
+    if not tool_calls:
+        raise ValueError(
+            f"LLM response contained no tool calls (content={message.content!r})"
+        )
+
+    arguments = tool_calls[0].arguments  # only the first tool call is read
+    if not isinstance(arguments, dict) or RESPONSE_KEY not in arguments:
+        raise ValueError(
+            f"Tool call arguments missing '{RESPONSE_KEY}' key: {arguments!r}"
+        )
+
+    return arguments[RESPONSE_KEY]
+
+
+class ToolCallStructuredOutput:
+    """Structured output via a forced tool call — works on every provider."""
+
+    async def generate(
+        self,
+        llm: Any,
+        messages: list[dict[str, str]],
+        *,
+        schema: dict[str, Any],
+        response_format_name: str,  # unused here; kept for a uniform signature
+        description: str,
+        completion_kwargs: dict[str, Any],
+    ) -> Any:
+        """Force a tool call wrapping ``schema``; return its ``response`` argument."""
+        tool = build_response_tool(schema, description)
+        response = await llm.chat_completions(
+            messages,
+            tools=[tool],
+            tool_choice=RequiredToolChoice(),
+            **completion_kwargs,
+        )
+        return extract_response(response)
+
+
+class ResponseFormatStructuredOutput(ToolCallStructuredOutput):
+    """Prefer ``response_format`` (json_schema); fall back to a forced tool call.
+
+    The fallback (a second ``chat_completions`` call) fires when the first call
+    raises, returns no choices or empty content, or returns content that is not
+    valid JSON (Claude's behavior on the normalized gateway is plain prose).
+    """
+
+    async def generate(
+        self,
+        llm: Any,
+        messages: list[dict[str, str]],
+        *,
+        schema: dict[str, Any],
+        response_format_name: str,
+        description: str,
+        completion_kwargs: dict[str, Any],
+    ) -> Any:
+        """Try ``response_format`` first, falling back to a forced tool call."""
+        response_format = {
+            "type": "json_schema",
+            "json_schema": {
+                "name": response_format_name,
+                "strict": False,
+                "schema": schema,
+            },
+        }
+
+        content: str | None = None  # stays None if the call raises or has no choices
+        try:
+            response = await llm.chat_completions(
+                messages, response_format=response_format, **completion_kwargs
+            )
+            choices = getattr(response, "choices", None)
+            if choices:
+                content = choices[0].message.content
+        except Exception as e:  # deliberate catch-all: any failure means "try tools"
+            logger.info("response_format path failed, falling back to tools: %s", e)
+
+        if content:  # None or "" skips straight to the tool-call fallback
+            try:
+                return json.loads(content)
+            except json.JSONDecodeError:
+                logger.info(
+                    "response_format content was not JSON, falling back to tools"
+                )
+
+        return await super().generate(
+            llm,
+            messages,
+            schema=schema,
+            response_format_name=response_format_name,
+            description=description,
+            completion_kwargs=completion_kwargs,
+        )
+
+
+class OpenAIStructuredOutput(ResponseFormatStructuredOutput):
+    """OpenAI honors ``response_format`` natively (including ``$defs``)."""
+
+
+class AnthropicStructuredOutput(ToolCallStructuredOutput):
+    """Claude answers ``response_format`` with prose; go straight to tools."""
+
+
+class GeminiStructuredOutput(ToolCallStructuredOutput):
+    """Gemini returns empty content for ``response_format``; go straight to tools."""
+
+
+def _strategy_for_model(model: str | None) -> ToolCallStructuredOutput:
+    name = (model or "").lower()  # None is treated like an unknown model name
+    if "claude" in name or name.startswith("anthropic"):
+        return AnthropicStructuredOutput()
+    if "gemini" in name:  # substring match, anywhere in the name
+        return GeminiStructuredOutput()
+    if name.startswith(("gpt", "o1", "o3", "o4")):  # prefix match, not substring
+        return OpenAIStructuredOutput()
+    # Unknown providers: try response_format, fall back to tools.
+    return ResponseFormatStructuredOutput()
+
+
+async def generate_structured_output(
+    llm: Any,
+    messages: list[dict[str, str]],
+    *,
+    schema: dict[str, Any],
+    response_format_name: str,
+    description: str,
+    completion_kwargs: dict[str, Any],
+) -> Any:
+    """Generate structured output using the strategy for the requested model."""
+    strategy = _strategy_for_model(completion_kwargs.get("model"))  # None if unset
+    return await strategy.generate(
+        llm,
+        messages,
+        schema=schema,
+        response_format_name=response_format_name,
+        description=description,
+        completion_kwargs=completion_kwargs,  # splatted into chat_completions
+    )
diff --git a/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py b/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py
index 72b3765df..a8a8a64ec 100644
--- a/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py
+++ b/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py
@@ -112,3 +112,10 @@ async def test_generate_llm_input_with_model_settings(
     assert len(chat_completion_requests) == 1, (
         "Expected exactly one chat completion request"
     )
+
+    # OpenAI returns content via response_format; no tool-call fallback needed.
+    import json
+
+    body = json.loads(chat_completion_requests[0].content.decode("utf-8"))
+    assert "response_format" in body
+    assert "tools" not in body
diff --git a/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py b/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py
index 19a432fef..d02c5d242 100644
--- a/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py
+++ b/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py
@@ -212,6 +212,14 @@ async def test_simulate_input_span_on_error(httpx_mock: HTTPXMock, monkeypatch):
                 },
             },
         )
+        # The prose content above triggers the tool-call fallback; an empty
+        # response there fails the fallback too, producing the error span.
+        httpx_mock.add_response(
+            url="https://example.com/llm/api/chat/completions"
+            "?api-version=2024-08-01-preview",
+            status_code=200,
+            json={},
+        )
 
         mocking_strategy = InputMockingStrategy(
             prompt="Generate input",
diff --git a/packages/uipath/tests/cli/eval/mocks/test_mocks.py b/packages/uipath/tests/cli/eval/mocks/test_mocks.py
index c4bc26ee3..e59b07d2f 100644
--- a/packages/uipath/tests/cli/eval/mocks/test_mocks.py
+++ b/packages/uipath/tests/cli/eval/mocks/test_mocks.py
@@ -610,12 +610,14 @@ def foofoo(*args, **kwargs):
 
     with pytest.raises(NotImplementedError):
         assert foofoo()
-    httpx_mock.add_response(
-        url="https://example.com/llm/api/chat/completions"
-        "?api-version=2024-08-01-preview",
-        status_code=200,
-        json={},
-    )
+    # Two empty responses: the response_format attempt and the tool-call fallback.
+    for _ in range(2):
+        httpx_mock.add_response(
+            url="https://example.com/llm/api/chat/completions"
+            "?api-version=2024-08-01-preview",
+            status_code=200,
+            json={},
+        )
     with pytest.raises(UiPathMockResponseGenerationError):
         assert foo()
 
@@ -720,12 +722,14 @@ async def foofoo(*args, **kwargs):
     with pytest.raises(NotImplementedError):
         assert await foofoo()
 
-    httpx_mock.add_response(
-        url="https://example.com/llm/api/chat/completions"
-        "?api-version=2024-08-01-preview",
-        status_code=200,
-        json={},
-    )
+    # Two empty responses: the response_format attempt and the tool-call fallback.
+    for _ in range(2):
+        httpx_mock.add_response(
+            url="https://example.com/llm/api/chat/completions"
+            "?api-version=2024-08-01-preview",
+            status_code=200,
+            json={},
+        )
     with pytest.raises(UiPathMockResponseGenerationError):
         assert await foo()
 
@@ -931,6 +935,106 @@ async def foo(*args, **kwargs) -> dict[str, Any]:
     }
 
 
+@pytest.mark.asyncio
+@pytest.mark.httpx_mock(assert_all_responses_were_requested=False)
+async def test_llm_mockable_uses_tool_call_directly_for_non_openai(
+    httpx_mock: HTTPXMock, monkeypatch: MonkeyPatch
+):
+    """Tool simulation works for non-OpenAI providers (AE-1646).
+
+    Non-OpenAI providers don't honor ``response_format`` on the normalized
+    gateway (Claude answers with prose, Gemini with empty content), so their
+    strategies go straight to a forced tool call — a single request.
+    """
+    monkeypatch.setenv("UIPATH_URL", "https://example.com")
+    monkeypatch.setenv("UIPATH_ACCESS_TOKEN", "1234567890")
+    monkeypatch.setattr(CacheManager, "get", lambda *args, **kwargs: None)
+    monkeypatch.setattr(CacheManager, "set", lambda *args, **kwargs: None)
+
+    @mockable()
+    async def foo(*args, **kwargs) -> str:
+        raise NotImplementedError()
+
+    evaluation_item: dict[str, Any] = {
+        "id": "evaluation-id",
+        "name": "Mock foo",
+        "inputs": {},
+        "evaluationCriterias": {
+            "ExactMatchEvaluator": None,
+        },
+        "mockingStrategy": {
+            "type": "llm",
+            "prompt": "response is 'bar1'",
+            "toolsToSimulate": [{"name": "foo"}],
+            "model": {"model": "anthropic.claude-sonnet-4-5-20250929-v1:0"},
+        },
+    }
+    evaluation = EvaluationItem(**evaluation_item)
+    assert isinstance(evaluation.mocking_strategy, LLMMockingStrategy)
+    httpx_mock.add_response(
+        url="https://example.com/agenthub_/llm/api/capabilities",
+        status_code=200,
+        json={},
+    )
+    httpx_mock.add_response(
+        url="https://example.com/orchestrator_/llm/api/capabilities",
+        status_code=200,
+        json={},
+    )
+
+    def _completion(message: dict[str, Any]) -> dict[str, Any]:
+        return {
+            "id": "response-id",
+            "object": "",
+            "created": 0,
+            "model": "anthropic.claude-sonnet-4-5-20250929-v1:0",
+            "choices": [{"index": 0, "message": message, "finish_reason": "stop"}],
+            "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2},
+        }
+
+    # Claude goes straight to function calling: one request, one response.
+    httpx_mock.add_response(
+        url="https://example.com/llm/api/chat/completions"
+        "?api-version=2024-08-01-preview",
+        status_code=200,
+        json=_completion(
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [
+                    {
+                        "id": "call_1",
+                        "name": "submit_tool_response",
+                        "arguments": {"response": "bar1"},
+                    }
+                ],
+            }
+        ),
+    )
+
+    set_execution_context(
+        MockingContext(
+            strategy=evaluation.mocking_strategy,
+            name=evaluation.name,
+            inputs=evaluation.inputs,
+        ),
+        _mock_span_collector,
+        "test-execution-id",
+    )
+
+    assert await foo() == "bar1"
+
+    requests = [
+        r for r in httpx_mock.get_requests() if "chat/completions" in str(r.url)
+    ]
+    assert len(requests) == 1
+    body = json.loads(requests[0].content.decode("utf-8"))
+    # Non-OpenAI providers use a forced tool call directly — no response_format.
+    assert body["tool_choice"] == {"type": "required"}
+    assert body["tools"][0]["name"] == "submit_tool_response"
+    assert "response_format" not in body
+
+
 class TestUiPathMockRuntime:
     """Tests for UiPathMockRuntime execute/stream/get_schema paths."""
 
diff --git a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py
new file mode 100644
index 000000000..79ad31591
--- /dev/null
+++ b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py
@@ -0,0 +1,295 @@
+"""Unit tests for the provider-agnostic structured-output helpers."""
+
+import json
+from types import SimpleNamespace
+from typing import Any
+
+import pytest
+
+from uipath.eval.mocks._structured_output import (
+    RESPONSE_KEY,
+    RESPONSE_TOOL_NAME,
+    build_response_tool,
+    extract_response,
+    generate_structured_output,
+)
+
+
+def _response(message: SimpleNamespace | None) -> SimpleNamespace:
+    choices = [] if message is None else [SimpleNamespace(message=message)]
+    return SimpleNamespace(choices=choices)
+
+
+class _FakeLLM:
+    """Records chat_completions calls and replays queued responses in order."""
+
+    def __init__(self, responses: list[Any]):
+        self._responses = list(responses)
+        self.calls: list[dict[str, Any]] = []
+
+    async def chat_completions(self, messages: Any, **kwargs: Any) -> Any:
+        self.calls.append(kwargs)
+        nxt = self._responses.pop(0)
+        if isinstance(nxt, Exception):
+            raise nxt
+        return nxt
+
+
+def test_build_response_tool_wraps_schema_under_response():
+    tool = build_response_tool({"type": "string"}, description="desc")
+    assert tool["name"] == RESPONSE_TOOL_NAME
+    assert tool["description"] == "desc"
+    assert tool["parameters"]["properties"][RESPONSE_KEY] == {"type": "string"}
+    assert tool["parameters"]["required"] == [RESPONSE_KEY]
+
+
+def test_build_response_tool_inlines_refs_into_self_contained_schema():
+    # Nested Pydantic models / enums emit $defs + $ref. The normalized gateway
+    # accepts $ref/$defs in response_format but NOT in a tool's parameters, so the
+    # schema must be inlined into a self-contained form (no $ref/$defs anywhere).
+    operator_def = {"enum": ["+", "-", "*", "/"], "type": "string"}
+    item_def = {"type": "object", "properties": {"sku": {"type": "string"}}}
+    schema = {
+        "type": "object",
+        "properties": {
+            "operator": {"$ref": "#/$defs/Operator"},
+            "items": {"type": "array", "items": {"$ref": "#/$defs/Item"}},
+        },
+        "required": ["operator"],
+        "$defs": {"Operator": operator_def, "Item": item_def},
+    }
+
+    tool = build_response_tool(schema, description="d")
+    params = tool["parameters"]
+
+    blob = json.dumps(params)
+    assert "$ref" not in blob
+    assert "$defs" not in blob
+
+    response = params["properties"][RESPONSE_KEY]
+    assert response["properties"]["operator"] == operator_def
+    assert response["properties"]["items"]["items"] == item_def
+    # the caller's schema dict is not mutated
+    assert "$defs" in schema
+
+
+def test_build_response_tool_keeps_defs_for_cyclic_refs():
+    # Self-referential schemas can't be fully inlined (expansion would never
+    # terminate); $defs is kept on the tool parameters so the remaining $ref resolves.
+    node_def = {
+        "type": "object",
+        "properties": {"child": {"$ref": "#/$defs/Node"}},
+    }
+    schema = {
+        "type": "object",
+        "properties": {"root": {"$ref": "#/$defs/Node"}},
+        "$defs": {"Node": node_def},
+    }
+
+    tool = build_response_tool(schema, description="d")
+    params = tool["parameters"]
+
+    assert "$defs" in params
+    assert "$ref" in json.dumps(params)
+    # the caller's schema dict is not mutated
+    assert "$defs" in schema
+
+
+def test_extract_response_returns_wrapped_value():
+    message = SimpleNamespace(
+        content=None,
+        tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: {"a": 1}})],
+    )
+    assert extract_response(_response(message)) == {"a": 1}
+
+
+def test_extract_response_raises_when_no_choices():
+    with pytest.raises(ValueError, match="no choices"):
+        extract_response(_response(None))
+
+
+def test_extract_response_raises_when_no_tool_calls():
+    # Non-OpenAI text response without a tool call: surface a clear error.
+    message = SimpleNamespace(content="not a tool call", tool_calls=None)
+    with pytest.raises(ValueError, match="no tool calls"):
+        extract_response(_response(message))
+
+
+def test_extract_response_raises_when_response_key_missing():
+    message = SimpleNamespace(
+        content=None, tool_calls=[SimpleNamespace(arguments={"other": 1})]
+    )
+    with pytest.raises(ValueError, match=RESPONSE_KEY):
+        extract_response(_response(message))
+
+
+@pytest.mark.asyncio
+async def test_generate_structured_output_prefers_response_format_content():
+    # OpenAI returns content via response_format; no fallback call is made.
+    llm = _FakeLLM([_response(SimpleNamespace(content='{"a": 1}', tool_calls=None))])
+    result = await generate_structured_output(
+        llm,
+        [{"role": "user", "content": "x"}],
+        schema={"type": "object"},
+        response_format_name="OutputSchema",
+        description="d",
+        completion_kwargs={},
+    )
+    assert result == {"a": 1}
+    assert len(llm.calls) == 1
+    assert "response_format" in llm.calls[0]
+    assert "tools" not in llm.calls[0]
+
+
+@pytest.mark.asyncio
+async def test_generate_structured_output_falls_back_on_prose_content():
+    # Claude on the normalized gateway answers response_format requests with
+    # plain prose (e.g. "Tokyo") — truthy but not JSON. Must fall back to tools
+    # instead of raising JSONDecodeError (AE-1646).
+    llm = _FakeLLM(
+        [
+            _response(SimpleNamespace(content="Tokyo", tool_calls=None)),
+            _response(
+                SimpleNamespace(
+                    content=None,
+                    tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: {"a": 1}})],
+                )
+            ),
+        ]
+    )
+    result = await generate_structured_output(
+        llm,
+        [{"role": "user", "content": "x"}],
+        schema={"type": "object"},
+        response_format_name="OutputSchema",
+        description="d",
+        completion_kwargs={},
+    )
+    assert result == {"a": 1}
+    assert len(llm.calls) == 2
+    assert "tools" in llm.calls[1]
+
+
+@pytest.mark.asyncio
+async def test_generate_structured_output_falls_back_on_empty_content():
+    # Non-OpenAI: response_format yields empty content -> fall back to tool call.
+    llm = _FakeLLM(
+        [
+            _response(SimpleNamespace(content=None, tool_calls=None)),
+            _response(
+                SimpleNamespace(
+                    content=None,
+                    tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: {"a": 1}})],
+                )
+            ),
+        ]
+    )
+    result = await generate_structured_output(
+        llm,
+        [{"role": "user", "content": "x"}],
+        schema={"type": "object"},
+        response_format_name="OutputSchema",
+        description="d",
+        completion_kwargs={},
+    )
+    assert result == {"a": 1}
+    assert len(llm.calls) == 2
+    assert "response_format" in llm.calls[0]
+    assert "tools" in llm.calls[1] and "tool_choice" in llm.calls[1]
+
+
+@pytest.mark.asyncio
+async def test_generate_structured_output_falls_back_when_response_format_raises():
+    # A provider that rejects response_format outright still gets a tool fallback.
+    llm = _FakeLLM(
+        [
+            RuntimeError("response_format unsupported"),
+            _response(
+                SimpleNamespace(
+                    content=None,
+                    tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: "ok"})],
+                )
+            ),
+        ]
+    )
+    result = await generate_structured_output(
+        llm,
+        [{"role": "user", "content": "x"}],
+        schema={"type": "string"},
+        response_format_name="OutputSchema",
+        description="d",
+        completion_kwargs={},
+    )
+    assert result == "ok"
+    assert len(llm.calls) == 2
+
+
+def test_build_response_tool_merges_ref_sibling_keys():
+    # Pydantic can emit sibling keys (e.g. description) next to $ref; they
+    # must survive inlining since they guide the LLM.
+    schema = {
+        "type": "object",
+        "properties": {
+            "op": {"$ref": "#/$defs/Op", "description": "the operator to use"}
+        },
+        "$defs": {"Op": {"type": "string", "enum": ["+", "-"]}},
+    }
+    tool = build_response_tool(schema, description="d")
+    op = tool["parameters"]["properties"][RESPONSE_KEY]["properties"]["op"]
+    assert op == {
+        "type": "string",
+        "enum": ["+", "-"],
+        "description": "the operator to use",
+    }
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model",
+    [
+        "anthropic.claude-sonnet-4-5-20250929-v1:0",
+        "claude-haiku-4-5",
+        "gemini-2.5-pro",
+    ],
+)
+async def test_non_openai_models_use_tool_call_directly(model: str):
+    # Claude/Gemini don't honor response_format on the normalized gateway, so
+    # their strategies skip it entirely: a single forced tool call.
+    llm = _FakeLLM(
+        [
+            _response(
+                SimpleNamespace(
+                    content=None,
+                    tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: "ok"})],
+                )
+            )
+        ]
+    )
+    result = await generate_structured_output(
+        llm,
+        [{"role": "user", "content": "x"}],
+        schema={"type": "string"},
+        response_format_name="OutputSchema",
+        description="d",
+        completion_kwargs={"model": model},
+    )
+    assert result == "ok"
+    assert len(llm.calls) == 1
+    assert "tools" in llm.calls[0] and "tool_choice" in llm.calls[0]
+    assert "response_format" not in llm.calls[0]
+
+
+@pytest.mark.asyncio
+async def test_openai_models_prefer_response_format():
+    llm = _FakeLLM([_response(SimpleNamespace(content='{"a": 1}', tool_calls=None))])
+    result = await generate_structured_output(
+        llm,
+        [{"role": "user", "content": "x"}],
+        schema={"type": "object"},
+        response_format_name="OutputSchema",
+        description="d",
+        completion_kwargs={"model": "gpt-4.1-mini-2025-04-14"},
+    )
+    assert result == {"a": 1}
+    assert len(llm.calls) == 1
+    assert "response_format" in llm.calls[0]
diff --git a/packages/uipath/uv.lock b/packages/uipath/uv.lock
index f78dd4cbe..89d6d082d 100644
--- a/packages/uipath/uv.lock
+++ b/packages/uipath/uv.lock
@@ -2552,7 +2552,7 @@ wheels = [
 
 [[package]]
 name = "uipath"
-version = "2.10.80"
+version = "2.10.81"
 source = { editable = "." }
 dependencies = [
     { name = "applicationinsights" },
@@ -2691,7 +2691,7 @@ dev = [
 
 [[package]]
 name = "uipath-platform"
-version = "0.1.62"
+version = "0.1.63"
 source = { editable = "../uipath-platform" }
 dependencies = [
     { name = "httpx" },