diff --git a/packages/uipath-platform/pyproject.toml b/packages/uipath-platform/pyproject.toml index 4a5c0b83c..50dd25553 100644 --- a/packages/uipath-platform/pyproject.toml +++ b/packages/uipath-platform/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath-platform" -version = "0.1.62" +version = "0.1.63" description = "HTTP client library for programmatic access to UiPath Platform" readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py b/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py index ffe0bff99..cb02d8af3 100644 --- a/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py +++ b/packages/uipath-platform/src/uipath/platform/chat/_llm_gateway_service.py @@ -401,7 +401,7 @@ async def chat_completions( presence_penalty: float = 0, top_p: float | None = 1, top_k: int | None = None, - tools: list[ToolDefinition] | None = None, + tools: list[ToolDefinition | dict[str, Any]] | None = None, tool_choice: ToolChoice | None = None, response_format: dict[str, Any] | type[BaseModel] | None = None, api_version: str = NORMALIZED_API_VERSION, @@ -436,9 +436,11 @@ async def chat_completions( Controls diversity by considering only the top p probability mass. Defaults to 1. top_k (int, optional): Nucleus sampling parameter. Controls diversity by considering only the top k most probable tokens. Defaults to None. - tools (Optional[List[ToolDefinition]], optional): List of tool definitions that the - model can call. Tools enable the model to perform actions or retrieve information - beyond text generation. Defaults to None. + tools (Optional[List[ToolDefinition | dict]], optional): List of tool definitions + that the model can call. Tools enable the model to perform actions or retrieve + information beyond text generation. A tool given as a dict must already be in + UiPath wire format and is forwarded unchanged, which allows arbitrary nested + JSON schemas in its parameters. Defaults to None. tool_choice (Optional[ToolChoice], optional): Controls which tools the model can call. Can be "auto" (model decides), "none" (no tools), or a specific tool choice. Defaults to None. @@ -583,10 +585,15 @@ class Country(BaseModel): # Use provided dictionary format directly request_body["response_format"] = response_format - # Add tools if provided - convert to UiPath format + # Add tools if provided. A tool already in UiPath wire format (a dict) is + # passed through unchanged so callers can supply an arbitrary JSON schema + # for the parameters; ToolDefinition objects are converted as before. if tools: request_body["tools"] = [ - self._convert_tool_to_uipath_format(tool) for tool in tools + tool + if isinstance(tool, dict) + else self._convert_tool_to_uipath_format(tool) + for tool in tools ] # Handle tool_choice diff --git a/packages/uipath-platform/tests/services/test_uipath_llm_integration.py b/packages/uipath-platform/tests/services/test_uipath_llm_integration.py index 124ccad8b..9e2292c60 100644 --- a/packages/uipath-platform/tests/services/test_uipath_llm_integration.py +++ b/packages/uipath-platform/tests/services/test_uipath_llm_integration.py @@ -7,6 +7,7 @@ from uipath.platform.chat import ( AutoToolChoice, ChatModels, + RequiredToolChoice, SpecificToolChoice, ToolDefinition, ToolFunctionDefinition, @@ -369,6 +370,87 @@ async def test_tool_call_required_mocked(self, mock_request, llm_service): assert result.choices[0].message.tool_calls[0].arguments["name"] == "John" assert result.choices[0].message.tool_calls[0].arguments["password"] == "1234" + @pytest.mark.asyncio + @patch.object(UiPathLlmChatService, "request_async") + async def test_raw_dict_tool_passthrough_mocked(self, mock_request, llm_service): + """A tool supplied as a raw dict is sent unchanged, preserving nested schema. + + ToolDefinition's converter only emits flat properties, so callers that need + an arbitrary nested JSON schema (e.g. the eval mockers) pass the tool as a + dict already in UiPath wire format. It must reach the gateway verbatim. + """ + mock_response = MagicMock() + mock_response.json.return_value = { + "id": "chatcmpl-raw", + "object": "chat.completion", + "created": 1677858242, + "model": "gpt-4o-mini-2024-07-18", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_raw", + "name": "submit_tool_response", + "arguments": {"response": {"items": [{"sku": "A1"}]}}, + } + ], + }, + "finish_reason": "tool_calls", + } + ], + "usage": { + "prompt_tokens": 10, + "completion_tokens": 5, + "total_tokens": 15, + "cache_read_input_tokens": None, + }, + } + mock_request.return_value = mock_response + + nested_tool = { + "name": "submit_tool_response", + "description": "Return the simulated response matching the schema.", + "parameters": { + "type": "object", + "properties": { + "response": { + "type": "object", + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "properties": {"sku": {"type": "string"}}, + }, + } + }, + } + }, + "required": ["response"], + }, + } + + result = await llm_service.chat_completions( + messages=[{"role": "user", "content": "go"}], + model=ChatModels.gpt_4_1_mini_2025_04_14, + tools=[nested_tool], + tool_choice=RequiredToolChoice(), + ) + + mock_request.assert_called_once() + _, kwargs = mock_request.call_args + body = kwargs["json"] + # The dict tool is forwarded byte-for-byte, nested array schema intact. + assert body["tools"] == [nested_tool] + assert body["tool_choice"] == {"type": "required"} + assert result.choices[0].message.tool_calls[0].arguments == { + "response": {"items": [{"sku": "A1"}]} + } + @pytest.mark.asyncio @patch.object(UiPathLlmChatService, "request_async") async def test_chat_with_conversation_history_mocked( diff --git a/packages/uipath-platform/uv.lock b/packages/uipath-platform/uv.lock index ac4761377..2f56c1df5 100644 --- a/packages/uipath-platform/uv.lock +++ b/packages/uipath-platform/uv.lock @@ -1095,7 +1095,7 @@ dev = [ [[package]] name = "uipath-platform" -version = "0.1.62" +version = "0.1.63" source = { editable = "." } dependencies = [ { name = "httpx" }, diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml index 79881bbc5..ffaa7f881 100644 --- a/packages/uipath/pyproject.toml +++ b/packages/uipath/pyproject.toml @@ -1,13 +1,13 @@ [project] name = "uipath" -version = "2.10.80" +version = "2.10.81" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" dependencies = [ "uipath-core>=0.5.17, <0.6.0", "uipath-runtime>=0.11.0, <0.12.0", - "uipath-platform>=0.1.60, <0.2.0", + "uipath-platform>=0.1.63, <0.2.0", "click>=8.3.1", "httpx>=0.28.1", "pyjwt>=2.10.1", diff --git a/packages/uipath/src/uipath/eval/mocks/_input_mocker.py b/packages/uipath/src/uipath/eval/mocks/_input_mocker.py index 57a727ec1..a542fc7ad 100644 --- a/packages/uipath/src/uipath/eval/mocks/_input_mocker.py +++ b/packages/uipath/src/uipath/eval/mocks/_input_mocker.py @@ -15,6 +15,7 @@ from .._execution_context import eval_set_run_id_context from ._mock_context import cache_manager_context from ._mocker import UiPathInputMockingError +from ._structured_output import generate_structured_output from ._types import ( InputMockingStrategy, ) @@ -105,15 +106,6 @@ async def generate_llm_input( prompt = get_input_mocking_prompt(**prompt_generation_args) - response_format = { - "type": "json_schema", - "json_schema": { - "name": "agent_input", - "strict": False, - "schema": input_schema, - }, - } - model_parameters = mocking_strategy.model if mocking_strategy else None completion_kwargs = ( model_parameters.model_dump(by_alias=False, exclude_none=True) @@ -128,7 +120,7 @@ async def generate_llm_input( if cache_manager is not None: cache_key_data = { - "response_format": response_format, + "input_schema": input_schema, "completion_kwargs": completion_kwargs, "prompt_generation_args": prompt_generation_args, } @@ -142,15 +134,15 @@ async def generate_llm_input( if cached_response is not None: return cached_response - response = await llm.chat_completions( + result = await generate_structured_output( + llm, [{"role": "user", "content": prompt}], - response_format=response_format, - **completion_kwargs, + schema=input_schema, + response_format_name="agent_input", + description="Return the simulated agent input matching the required schema.", + completion_kwargs=completion_kwargs, ) - generated_input_str = response.choices[0].message.content - result = json.loads(generated_input_str) - if cache_manager is not None: cache_manager.set( mocker_type="input_mocker", @@ -160,10 +152,6 @@ async def generate_llm_input( ) return result - except json.JSONDecodeError as e: - raise UiPathInputMockingError( - f"Failed to parse LLM response as JSON: {str(e)}" - ) from e except UiPathInputMockingError: raise except Exception as e: diff --git a/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py b/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py index d1fd2a1c9..a9ab7005e 100644 --- a/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py +++ b/packages/uipath/src/uipath/eval/mocks/_llm_mocker.py @@ -28,6 +28,7 @@ UiPathMockResponseGenerationError, UiPathNoMockFoundError, ) +from ._structured_output import generate_structured_output from ._types import ( ExampleCall, LLMMockingStrategy, @@ -125,14 +126,7 @@ async def response( "output_schema", TypeAdapter(return_type).json_schema() ) - response_format = { - "type": "json_schema", - "json_schema": { - "name": "OutputSchema", - "strict": False, - "schema": _cleanup_schema(output_schema), - }, - } + cleaned_schema = _cleanup_schema(output_schema) try: # Safely pull examples from params. example_calls = params.get("example_calls", []) @@ -197,7 +191,7 @@ async def response( formatted_prompt = PROMPT.format(**prompt_generation_args) cache_key_data = { - "response_format": response_format, + "output_schema": cleaned_schema, "completion_kwargs": completion_kwargs, "prompt_generation_args": prompt_generation_args, } @@ -213,17 +207,17 @@ async def response( if cached_response is not None: return cached_response - response = await llm.chat_completions( - [ - { - "role": "user", - "content": formatted_prompt, - }, - ], - response_format=response_format, - **completion_kwargs, + result = await generate_structured_output( + llm, + [{"role": "user", "content": formatted_prompt}], + schema=cleaned_schema, + response_format_name="OutputSchema", + description=( + "Return the simulated response for tool " + f"'{function_name}' matching the required schema." + ), + completion_kwargs=completion_kwargs, ) - result = json.loads(response.choices[0].message.content) if cache_manager is not None: cache_manager.set( @@ -235,7 +229,7 @@ async def response( return result except Exception as e: - raise UiPathMockResponseGenerationError() from e + raise UiPathMockResponseGenerationError(str(e)) from e else: raise UiPathNoMockFoundError(f"Method '{function_name}' is not simulated.") diff --git a/packages/uipath/src/uipath/eval/mocks/_structured_output.py b/packages/uipath/src/uipath/eval/mocks/_structured_output.py new file mode 100644 index 000000000..599780353 --- /dev/null +++ b/packages/uipath/src/uipath/eval/mocks/_structured_output.py @@ -0,0 +1,259 @@ +"""Provider-aware structured output for the eval mockers. + +The normalized LLM Gateway handles OpenAI-style ``response_format`` +(json_schema) differently per provider — live-verified against the gateway: + +- **OpenAI**: honors ``response_format`` and returns valid JSON content, + including native ``$defs`` support. +- **Anthropic (Claude)**: ignores it and answers with plain prose content. +- **Gemini**: returns empty content. + +Forced function calling works across all three providers, so each provider +gets a small strategy class: OpenAI prefers ``response_format`` (more reliable +for it on some schemas) with a tool-call fallback; Claude and Gemini go +straight to the forced tool call; unknown providers try ``response_format`` +first and fall back. +""" + +import json +import logging +from typing import Any + +from uipath.platform.chat.llm_gateway import RequiredToolChoice + +RESPONSE_TOOL_NAME = "submit_tool_response" +RESPONSE_KEY = "response" +_DEFS_PREFIX = "#/$defs/" + +logger = logging.getLogger(__name__) + + +def _inline_defs( + schema: dict[str, Any], +) -> tuple[dict[str, Any], dict[str, Any]]: + """Inline ``$defs``/``$ref`` into a self-contained schema. + + Nested Pydantic models and enums emit root ``$defs`` referenced by ``$ref``. + The normalized gateway accepts those in ``response_format`` but not inside a + tool's ``parameters``, so they are inlined here. Sibling keys on a ``$ref`` + node (e.g. a field ``description``) are merged over the inlined definition. + Self-referential definitions cannot be inlined without looping; any ``$ref`` + reached while its target is already on the current resolution path is left + untouched and its definitions are returned so the caller can keep them + reachable. + + Returns: + A tuple of (inlined schema, leftover ``$defs`` needed for cyclic refs). + """ + defs = schema.get("$defs", {}) + leftover: dict[str, Any] = {} + + def resolve(node: Any, active: frozenset[str]) -> Any: + if isinstance(node, dict): + ref = node.get("$ref") + if isinstance(ref, str) and ref.startswith(_DEFS_PREFIX): + name = ref[len(_DEFS_PREFIX) :] + if name in defs and name not in active: + resolved = resolve(defs[name], active | {name}) + siblings = { + key: resolve(value, active) + for key, value in node.items() + if key not in ("$ref", "$defs") + } + if isinstance(resolved, dict): + return {**resolved, **siblings} + return resolved + # Cyclic or unknown ref: keep it and preserve its definition. + if name in defs: + leftover[name] = defs[name] + return dict(node) + return { + key: resolve(value, active) + for key, value in node.items() + if key != "$defs" + } + if isinstance(node, list): + return [resolve(item, active) for item in node] + return node + + root = {key: value for key, value in schema.items() if key != "$defs"} + inlined = resolve(root, frozenset()) + return inlined, leftover + + +def build_response_tool(schema: dict[str, Any], description: str) -> dict[str, Any]: + """Build a normalized-API function tool that wraps ``schema`` under ``response``. + + Tool-call arguments are always a JSON object, so an arbitrary output schema + (which may be a scalar, array, or object) is nested under a single + ``response`` property and unwrapped after the call. ``$defs``/``$ref`` are + inlined so the tool parameters are self-contained, which the gateway requires + for tool schemas (unlike ``response_format``). + """ + response_schema, leftover_defs = _inline_defs(schema) + parameters: dict[str, Any] = { + "type": "object", + "properties": {RESPONSE_KEY: response_schema}, + "required": [RESPONSE_KEY], + } + if leftover_defs: + parameters["$defs"] = leftover_defs + + return { + "name": RESPONSE_TOOL_NAME, + "description": description, + "parameters": parameters, + } + + +def extract_response(response: Any) -> Any: + """Extract the wrapped value from the forced tool call. + + Raises: + ValueError: if the response carries no usable tool call or is missing the + wrapped ``response`` key. + """ + choices = getattr(response, "choices", None) + if not choices: + raise ValueError("LLM response contained no choices") + + message = choices[0].message + tool_calls = getattr(message, "tool_calls", None) + if not tool_calls: + raise ValueError( + f"LLM response contained no tool calls (content={message.content!r})" + ) + + arguments = tool_calls[0].arguments + if RESPONSE_KEY not in arguments: + raise ValueError( + f"Tool call arguments missing '{RESPONSE_KEY}' key: {arguments}" + ) + + return arguments[RESPONSE_KEY] + + +class ToolCallStructuredOutput: + """Structured output via a forced tool call — works on every provider.""" + + async def generate( + self, + llm: Any, + messages: list[dict[str, str]], + *, + schema: dict[str, Any], + response_format_name: str, + description: str, + completion_kwargs: dict[str, Any], + ) -> Any: + """Force a tool call wrapping ``schema`` and unwrap its arguments.""" + tool = build_response_tool(schema, description) + response = await llm.chat_completions( + messages, + tools=[tool], + tool_choice=RequiredToolChoice(), + **completion_kwargs, + ) + return extract_response(response) + + +class ResponseFormatStructuredOutput(ToolCallStructuredOutput): + """Prefer ``response_format`` (json_schema); fall back to a forced tool call. + + The fallback fires when the provider rejects the request, returns empty + content, or returns content that is not valid JSON (Claude's behavior on + the normalized gateway is to answer with plain prose). + """ + + async def generate( + self, + llm: Any, + messages: list[dict[str, str]], + *, + schema: dict[str, Any], + response_format_name: str, + description: str, + completion_kwargs: dict[str, Any], + ) -> Any: + """Try ``response_format`` first, falling back to a forced tool call.""" + response_format = { + "type": "json_schema", + "json_schema": { + "name": response_format_name, + "strict": False, + "schema": schema, + }, + } + + content: str | None = None + try: + response = await llm.chat_completions( + messages, response_format=response_format, **completion_kwargs + ) + choices = getattr(response, "choices", None) + if choices: + content = choices[0].message.content + except Exception as e: + logger.info("response_format path failed, falling back to tools: %s", e) + + if content: + try: + return json.loads(content) + except json.JSONDecodeError: + logger.info( + "response_format content was not JSON, falling back to tools" + ) + + return await super().generate( + llm, + messages, + schema=schema, + response_format_name=response_format_name, + description=description, + completion_kwargs=completion_kwargs, + ) + + +class OpenAIStructuredOutput(ResponseFormatStructuredOutput): + """OpenAI honors ``response_format`` natively (including ``$defs``).""" + + +class AnthropicStructuredOutput(ToolCallStructuredOutput): + """Claude answers ``response_format`` with prose; go straight to tools.""" + + +class GeminiStructuredOutput(ToolCallStructuredOutput): + """Gemini returns empty content for ``response_format``; go straight to tools.""" + + +def _strategy_for_model(model: str | None) -> ToolCallStructuredOutput: + name = (model or "").lower() + if "claude" in name or name.startswith("anthropic"): + return AnthropicStructuredOutput() + if "gemini" in name: + return GeminiStructuredOutput() + if name.startswith(("gpt", "o1", "o3", "o4")): + return OpenAIStructuredOutput() + # Unknown providers: try response_format, fall back to tools. + return ResponseFormatStructuredOutput() + + +async def generate_structured_output( + llm: Any, + messages: list[dict[str, str]], + *, + schema: dict[str, Any], + response_format_name: str, + description: str, + completion_kwargs: dict[str, Any], +) -> Any: + """Generate structured output using the strategy for the requested model.""" + strategy = _strategy_for_model(completion_kwargs.get("model")) + return await strategy.generate( + llm, + messages, + schema=schema, + response_format_name=response_format_name, + description=description, + completion_kwargs=completion_kwargs, + ) diff --git a/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py b/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py index 72b3765df..a8a8a64ec 100644 --- a/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py +++ b/packages/uipath/tests/cli/eval/mocks/test_input_mocker.py @@ -112,3 +112,10 @@ async def test_generate_llm_input_with_model_settings( assert len(chat_completion_requests) == 1, ( "Expected exactly one chat completion request" ) + + # OpenAI returns content via response_format; no tool-call fallback needed. + import json + + body = json.loads(chat_completion_requests[0].content.decode("utf-8")) + assert "response_format" in body + assert "tools" not in body diff --git a/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py b/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py index 19a432fef..d02c5d242 100644 --- a/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py +++ b/packages/uipath/tests/cli/eval/mocks/test_input_mocker_span.py @@ -212,6 +212,14 @@ async def test_simulate_input_span_on_error(httpx_mock: HTTPXMock, monkeypatch): }, }, ) + # The prose content above triggers the tool-call fallback; an empty + # response there fails the fallback too, producing the error span. + httpx_mock.add_response( + url="https://example.com/llm/api/chat/completions" + "?api-version=2024-08-01-preview", + status_code=200, + json={}, + ) mocking_strategy = InputMockingStrategy( prompt="Generate input", diff --git a/packages/uipath/tests/cli/eval/mocks/test_mocks.py b/packages/uipath/tests/cli/eval/mocks/test_mocks.py index c4bc26ee3..e59b07d2f 100644 --- a/packages/uipath/tests/cli/eval/mocks/test_mocks.py +++ b/packages/uipath/tests/cli/eval/mocks/test_mocks.py @@ -610,12 +610,14 @@ def foofoo(*args, **kwargs): with pytest.raises(NotImplementedError): assert foofoo() - httpx_mock.add_response( - url="https://example.com/llm/api/chat/completions" - "?api-version=2024-08-01-preview", - status_code=200, - json={}, - ) + # Two empty responses: the response_format attempt and the tool-call fallback. + for _ in range(2): + httpx_mock.add_response( + url="https://example.com/llm/api/chat/completions" + "?api-version=2024-08-01-preview", + status_code=200, + json={}, + ) with pytest.raises(UiPathMockResponseGenerationError): assert foo() @@ -720,12 +722,14 @@ async def foofoo(*args, **kwargs): with pytest.raises(NotImplementedError): assert await foofoo() - httpx_mock.add_response( - url="https://example.com/llm/api/chat/completions" - "?api-version=2024-08-01-preview", - status_code=200, - json={}, - ) + # Two empty responses: the response_format attempt and the tool-call fallback. + for _ in range(2): + httpx_mock.add_response( + url="https://example.com/llm/api/chat/completions" + "?api-version=2024-08-01-preview", + status_code=200, + json={}, + ) with pytest.raises(UiPathMockResponseGenerationError): assert await foo() @@ -931,6 +935,106 @@ async def foo(*args, **kwargs) -> dict[str, Any]: } +@pytest.mark.asyncio +@pytest.mark.httpx_mock(assert_all_responses_were_requested=False) +async def test_llm_mockable_uses_tool_call_directly_for_non_openai( + httpx_mock: HTTPXMock, monkeypatch: MonkeyPatch +): + """Tool simulation works for non-OpenAI providers (AE-1646). + + Non-OpenAI providers don't honor ``response_format`` on the normalized + gateway (Claude answers with prose, Gemini with empty content), so their + strategies go straight to a forced tool call — a single request. + """ + monkeypatch.setenv("UIPATH_URL", "https://example.com") + monkeypatch.setenv("UIPATH_ACCESS_TOKEN", "1234567890") + monkeypatch.setattr(CacheManager, "get", lambda *args, **kwargs: None) + monkeypatch.setattr(CacheManager, "set", lambda *args, **kwargs: None) + + @mockable() + async def foo(*args, **kwargs) -> str: + raise NotImplementedError() + + evaluation_item: dict[str, Any] = { + "id": "evaluation-id", + "name": "Mock foo", + "inputs": {}, + "evaluationCriterias": { + "ExactMatchEvaluator": None, + }, + "mockingStrategy": { + "type": "llm", + "prompt": "response is 'bar1'", + "toolsToSimulate": [{"name": "foo"}], + "model": {"model": "anthropic.claude-sonnet-4-5-20250929-v1:0"}, + }, + } + evaluation = EvaluationItem(**evaluation_item) + assert isinstance(evaluation.mocking_strategy, LLMMockingStrategy) + httpx_mock.add_response( + url="https://example.com/agenthub_/llm/api/capabilities", + status_code=200, + json={}, + ) + httpx_mock.add_response( + url="https://example.com/orchestrator_/llm/api/capabilities", + status_code=200, + json={}, + ) + + def _completion(message: dict[str, Any]) -> dict[str, Any]: + return { + "id": "response-id", + "object": "", + "created": 0, + "model": "anthropic.claude-sonnet-4-5-20250929-v1:0", + "choices": [{"index": 0, "message": message, "finish_reason": "stop"}], + "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2}, + } + + # Claude goes straight to function calling: one request, one response. + httpx_mock.add_response( + url="https://example.com/llm/api/chat/completions" + "?api-version=2024-08-01-preview", + status_code=200, + json=_completion( + { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_1", + "name": "submit_tool_response", + "arguments": {"response": "bar1"}, + } + ], + } + ), + ) + + set_execution_context( + MockingContext( + strategy=evaluation.mocking_strategy, + name=evaluation.name, + inputs=evaluation.inputs, + ), + _mock_span_collector, + "test-execution-id", + ) + + assert await foo() == "bar1" + + requests = [ + r for r in httpx_mock.get_requests() if "chat/completions" in str(r.url) + ] + assert len(requests) == 1 + body = json.loads(requests[0].content.decode("utf-8")) + # Non-OpenAI providers use a forced tool call directly — no response_format. + assert body["tool_choice"] == {"type": "required"} + assert body["tools"][0]["name"] == "submit_tool_response" + assert "response_format" not in body + + class TestUiPathMockRuntime: """Tests for UiPathMockRuntime execute/stream/get_schema paths.""" diff --git a/packages/uipath/tests/cli/eval/mocks/test_structured_output.py b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py new file mode 100644 index 000000000..79ad31591 --- /dev/null +++ b/packages/uipath/tests/cli/eval/mocks/test_structured_output.py @@ -0,0 +1,295 @@ +"""Unit tests for the provider-agnostic structured-output helpers.""" + +import json +from types import SimpleNamespace +from typing import Any + +import pytest + +from uipath.eval.mocks._structured_output import ( + RESPONSE_KEY, + RESPONSE_TOOL_NAME, + build_response_tool, + extract_response, + generate_structured_output, +) + + +def _response(message: SimpleNamespace | None) -> SimpleNamespace: + choices = [] if message is None else [SimpleNamespace(message=message)] + return SimpleNamespace(choices=choices) + + +class _FakeLLM: + """Records chat_completions calls and replays queued responses in order.""" + + def __init__(self, responses: list[Any]): + self._responses = list(responses) + self.calls: list[dict[str, Any]] = [] + + async def chat_completions(self, messages: Any, **kwargs: Any) -> Any: + self.calls.append(kwargs) + nxt = self._responses.pop(0) + if isinstance(nxt, Exception): + raise nxt + return nxt + + +def test_build_response_tool_wraps_schema_under_response(): + tool = build_response_tool({"type": "string"}, description="desc") + assert tool["name"] == RESPONSE_TOOL_NAME + assert tool["description"] == "desc" + assert tool["parameters"]["properties"][RESPONSE_KEY] == {"type": "string"} + assert tool["parameters"]["required"] == [RESPONSE_KEY] + + +def test_build_response_tool_inlines_refs_into_self_contained_schema(): + # Nested Pydantic models / enums emit $defs + $ref. The normalized gateway + # accepts $ref/$defs in response_format but NOT in a tool's parameters, so the + # schema must be inlined into a self-contained form (no $ref/$defs anywhere). + operator_def = {"enum": ["+", "-", "*", "/"], "type": "string"} + item_def = {"type": "object", "properties": {"sku": {"type": "string"}}} + schema = { + "type": "object", + "properties": { + "operator": {"$ref": "#/$defs/Operator"}, + "items": {"type": "array", "items": {"$ref": "#/$defs/Item"}}, + }, + "required": ["operator"], + "$defs": {"Operator": operator_def, "Item": item_def}, + } + + tool = build_response_tool(schema, description="d") + params = tool["parameters"] + + blob = json.dumps(params) + assert "$ref" not in blob + assert "$defs" not in blob + + response = params["properties"][RESPONSE_KEY] + assert response["properties"]["operator"] == operator_def + assert response["properties"]["items"]["items"] == item_def + # caller's schema is not mutated + assert "$defs" in schema + + +def test_build_response_tool_keeps_defs_for_cyclic_refs(): + # Self-referential schemas can't be fully inlined; keep $defs hoisted so the + # remaining $ref still resolves rather than infinite-looping. + node_def = { + "type": "object", + "properties": {"child": {"$ref": "#/$defs/Node"}}, + } + schema = { + "type": "object", + "properties": {"root": {"$ref": "#/$defs/Node"}}, + "$defs": {"Node": node_def}, + } + + tool = build_response_tool(schema, description="d") + params = tool["parameters"] + + assert "$defs" in params + assert "$ref" in json.dumps(params) + # the caller's schema dict is not mutated + assert "$defs" in schema + + +def test_extract_response_returns_wrapped_value(): + message = SimpleNamespace( + content=None, + tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: {"a": 1}})], + ) + assert extract_response(_response(message)) == {"a": 1} + + +def test_extract_response_raises_when_no_choices(): + with pytest.raises(ValueError, match="no choices"): + extract_response(_response(None)) + + +def test_extract_response_raises_when_no_tool_calls(): + # Non-OpenAI text response without a tool call: surface a clear error. + message = SimpleNamespace(content="not a tool call", tool_calls=None) + with pytest.raises(ValueError, match="no tool calls"): + extract_response(_response(message)) + + +def test_extract_response_raises_when_response_key_missing(): + message = SimpleNamespace( + content=None, tool_calls=[SimpleNamespace(arguments={"other": 1})] + ) + with pytest.raises(ValueError, match=RESPONSE_KEY): + extract_response(_response(message)) + + +@pytest.mark.asyncio +async def test_generate_structured_output_prefers_response_format_content(): + # OpenAI returns content via response_format; no fallback call is made. + llm = _FakeLLM([_response(SimpleNamespace(content='{"a": 1}', tool_calls=None))]) + result = await generate_structured_output( + llm, + [{"role": "user", "content": "x"}], + schema={"type": "object"}, + response_format_name="OutputSchema", + description="d", + completion_kwargs={}, + ) + assert result == {"a": 1} + assert len(llm.calls) == 1 + assert "response_format" in llm.calls[0] + assert "tools" not in llm.calls[0] + + +@pytest.mark.asyncio +async def test_generate_structured_output_falls_back_on_prose_content(): + # Claude on the normalized gateway answers response_format requests with + # plain prose (e.g. "Tokyo") — truthy but not JSON. Must fall back to tools + # instead of raising JSONDecodeError (AE-1646). + llm = _FakeLLM( + [ + _response(SimpleNamespace(content="Tokyo", tool_calls=None)), + _response( + SimpleNamespace( + content=None, + tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: {"a": 1}})], + ) + ), + ] + ) + result = await generate_structured_output( + llm, + [{"role": "user", "content": "x"}], + schema={"type": "object"}, + response_format_name="OutputSchema", + description="d", + completion_kwargs={}, + ) + assert result == {"a": 1} + assert len(llm.calls) == 2 + assert "tools" in llm.calls[1] + + +@pytest.mark.asyncio +async def test_generate_structured_output_falls_back_on_empty_content(): + # Non-OpenAI: response_format yields empty content -> fall back to tool call. + llm = _FakeLLM( + [ + _response(SimpleNamespace(content=None, tool_calls=None)), + _response( + SimpleNamespace( + content=None, + tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: {"a": 1}})], + ) + ), + ] + ) + result = await generate_structured_output( + llm, + [{"role": "user", "content": "x"}], + schema={"type": "object"}, + response_format_name="OutputSchema", + description="d", + completion_kwargs={}, + ) + assert result == {"a": 1} + assert len(llm.calls) == 2 + assert "response_format" in llm.calls[0] + assert "tools" in llm.calls[1] and "tool_choice" in llm.calls[1] + + +@pytest.mark.asyncio +async def test_generate_structured_output_falls_back_when_response_format_raises(): + # A provider that rejects response_format outright still gets a tool fallback. + llm = _FakeLLM( + [ + RuntimeError("response_format unsupported"), + _response( + SimpleNamespace( + content=None, + tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: "ok"})], + ) + ), + ] + ) + result = await generate_structured_output( + llm, + [{"role": "user", "content": "x"}], + schema={"type": "string"}, + response_format_name="OutputSchema", + description="d", + completion_kwargs={}, + ) + assert result == "ok" + assert len(llm.calls) == 2 + + +def test_build_response_tool_merges_ref_sibling_keys(): + # Pydantic can emit sibling keys (e.g. description) next to $ref; they + # must survive inlining since they guide the LLM. + schema = { + "type": "object", + "properties": { + "op": {"$ref": "#/$defs/Op", "description": "the operator to use"} + }, + "$defs": {"Op": {"type": "string", "enum": ["+", "-"]}}, + } + tool = build_response_tool(schema, description="d") + op = tool["parameters"]["properties"][RESPONSE_KEY]["properties"]["op"] + assert op == { + "type": "string", + "enum": ["+", "-"], + "description": "the operator to use", + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model", + [ + "anthropic.claude-sonnet-4-5-20250929-v1:0", + "claude-haiku-4-5", + "gemini-2.5-pro", + ], +) +async def test_non_openai_models_use_tool_call_directly(model: str): + # Claude/Gemini don't honor response_format on the normalized gateway, so + # their strategies skip it entirely: a single forced tool call. + llm = _FakeLLM( + [ + _response( + SimpleNamespace( + content=None, + tool_calls=[SimpleNamespace(arguments={RESPONSE_KEY: "ok"})], + ) + ) + ] + ) + result = await generate_structured_output( + llm, + [{"role": "user", "content": "x"}], + schema={"type": "string"}, + response_format_name="OutputSchema", + description="d", + completion_kwargs={"model": model}, + ) + assert result == "ok" + assert len(llm.calls) == 1 + assert "tools" in llm.calls[0] and "tool_choice" in llm.calls[0] + assert "response_format" not in llm.calls[0] + + +@pytest.mark.asyncio +async def test_openai_models_prefer_response_format(): + llm = _FakeLLM([_response(SimpleNamespace(content='{"a": 1}', tool_calls=None))]) + result = await generate_structured_output( + llm, + [{"role": "user", "content": "x"}], + schema={"type": "object"}, + response_format_name="OutputSchema", + description="d", + completion_kwargs={"model": "gpt-4.1-mini-2025-04-14"}, + ) + assert result == {"a": 1} + assert len(llm.calls) == 1 + assert "response_format" in llm.calls[0] diff --git a/packages/uipath/uv.lock b/packages/uipath/uv.lock index f78dd4cbe..89d6d082d 100644 --- a/packages/uipath/uv.lock +++ b/packages/uipath/uv.lock @@ -2552,7 +2552,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.10.80" +version = "2.10.81" source = { editable = "." } dependencies = [ { name = "applicationinsights" }, @@ -2691,7 +2691,7 @@ dev = [ [[package]] name = "uipath-platform" -version = "0.1.62" +version = "0.1.63" source = { editable = "../uipath-platform" } dependencies = [ { name = "httpx" },