diff --git a/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py b/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py index 912dcd41d..e40fbe4e3 100644 --- a/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py +++ b/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py @@ -500,25 +500,33 @@ def tool_calls_args_score( tool_key = f"{call.name}_{tool_counters[call.name]}" tool_counters[call.name] += 1 - # Check arguments based on mode - if subset: - # Subset mode: safely check if all expected args exist and match - # Capture 'call' as a default argument to bind the loop variable - args_check = ( # noqa: E731 - lambda k, v, call=call: k in call.args and call.args[k] == v - ) - else: - # Exact mode: direct access (may raise KeyError) - # Capture 'call' as a default argument to bind the loop variable - args_check = lambda k, v, call=call: call.args[k] == v # noqa: E731 - - try: - args_match = all( - args_check(k, v) for k, v in expected_tool_call.args.items() - ) - except KeyError: - # Only possible in exact mode when key is missing + # Empty expected.args with non-empty actual.args is a mismatch. + # Without this guard the generator below iterates zero times + # and `all([])` returns True, vacuously passing any actual + # call — masking the common authoring case of an evaluator + # whose expected.args was never filled in. + if not expected_tool_call.args and call.args: args_match = False + else: + # Check arguments based on mode + if subset: + # Subset mode: safely check if all expected args exist and match + # Capture 'call' as a default argument to bind the loop variable + args_check = ( # noqa: E731 + lambda k, v, call=call: k in call.args and call.args[k] == v + ) + else: + # Exact mode: direct access (may raise KeyError) + # Capture 'call' as a default argument to bind the loop variable + args_check = lambda k, v, call=call: call.args[k] == v # noqa: E731 + + try: + args_match = all( + args_check(k, v) for k, v in expected_tool_call.args.items() + ) + except KeyError: + # Only possible in exact mode when key is missing + args_match = False justifications[justification_key][tool_key] = ( f"Actual: {call.args}, Expected: {expected_tool_call.args}, Score: {float(args_match)}" diff --git a/packages/uipath/tests/evaluators/test_evaluator_helpers.py b/packages/uipath/tests/evaluators/test_evaluator_helpers.py index 061b57aab..776f7c7e4 100644 --- a/packages/uipath/tests/evaluators/test_evaluator_helpers.py +++ b/packages/uipath/tests/evaluators/test_evaluator_helpers.py @@ -282,6 +282,45 @@ def test_strict_mode_with_mismatch(self) -> None: # In strict mode, partial match should still score proportionally unless all match assert score == 0.0 # strict mode requires all to match + def test_empty_expected_args_against_non_empty_actual_fails(self) -> None: + """Empty expected.args with non-empty actual.args must score 0, not vacuously pass. + + Regression for the ``all([]) is True`` trap: the per-key generator + iterates zero times when expected.args is empty, so without an + explicit guard the result vacuously matches any actual call. + """ + actual = [ + ToolCall(name="tool1", args={"provider": "GCS", "query": "Q"}), + ] + expected = [ToolCall(name="tool1", args={})] + score, justification = tool_calls_args_score( + actual, expected, strict=False, subset=False + ) + + assert score == 0.0 + assert "Score: 0.0" in justification["explained_tool_calls_args"]["tool1_0"] + + def test_empty_expected_args_against_non_empty_actual_fails_subset_mode( + self, + ) -> None: + """Same guard applies in subset mode — empty expected ≠ ``anything goes``.""" + actual = [ToolCall(name="tool1", args={"a": 1, "b": 2})] + expected = [ToolCall(name="tool1", args={})] + score, _ = tool_calls_args_score(actual, expected, strict=False, subset=True) + + assert score == 0.0 + + def test_empty_expected_args_against_empty_actual_passes(self) -> None: + """Both args empty is a legitimate match — confirms the new guard is targeted.""" + actual = [ToolCall(name="tool1", args={})] + expected = [ToolCall(name="tool1", args={})] + score, justification = tool_calls_args_score( + actual, expected, strict=False, subset=False + ) + + assert score == 1.0 + assert "Score: 1.0" in justification["explained_tool_calls_args"]["tool1_0"] + class TestToolCallsOutputScore: """Test tool_calls_output_score helper function."""