Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 26 additions & 18 deletions packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,25 +500,33 @@ def tool_calls_args_score(
tool_key = f"{call.name}_{tool_counters[call.name]}"
tool_counters[call.name] += 1

# Check arguments based on mode
if subset:
# Subset mode: safely check if all expected args exist and match
# Capture 'call' as a default argument to bind the loop variable
args_check = ( # noqa: E731
lambda k, v, call=call: k in call.args and call.args[k] == v
)
else:
# Exact mode: direct access (may raise KeyError)
# Capture 'call' as a default argument to bind the loop variable
args_check = lambda k, v, call=call: call.args[k] == v # noqa: E731

try:
args_match = all(
args_check(k, v) for k, v in expected_tool_call.args.items()
)
except KeyError:
# Only possible in exact mode when key is missing
# Empty expected.args with non-empty actual.args is a mismatch.
# Without this guard the generator below iterates zero times
# and `all([])` returns True, vacuously passing any actual
# call — masking the common authoring case of an evaluator
# whose expected.args was never filled in.
if not expected_tool_call.args and call.args:
args_match = False
else:
# Check arguments based on mode
if subset:
# Subset mode: safely check if all expected args exist and match
# Capture 'call' as a default argument to bind the loop variable
args_check = ( # noqa: E731
lambda k, v, call=call: k in call.args and call.args[k] == v
)
else:
# Exact mode: direct access (may raise KeyError)
# Capture 'call' as a default argument to bind the loop variable
args_check = lambda k, v, call=call: call.args[k] == v # noqa: E731

try:
args_match = all(
args_check(k, v) for k, v in expected_tool_call.args.items()
)
except KeyError:
# Only possible in exact mode when key is missing
args_match = False

justifications[justification_key][tool_key] = (
f"Actual: {call.args}, Expected: {expected_tool_call.args}, Score: {float(args_match)}"
Expand Down
39 changes: 39 additions & 0 deletions packages/uipath/tests/evaluators/test_evaluator_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,45 @@ def test_strict_mode_with_mismatch(self) -> None:
# In strict mode, partial match should still score proportionally unless all match
assert score == 0.0 # strict mode requires all to match

def test_empty_expected_args_against_non_empty_actual_fails(self) -> None:
    """Score 0 when the expected call has no args but the actual call does.

    Guards against a vacuous match: with an empty expected.args there is
    nothing to compare key by key, and ``all()`` over an empty iterable
    is True, so an unguarded comparison would accept any actual call.
    """
    observed_call = ToolCall(name="tool1", args={"provider": "GCS", "query": "Q"})
    blank_expectation = ToolCall(name="tool1", args={})

    # Exact (non-subset), non-strict comparison of one actual vs. one expected call.
    score, justification = tool_calls_args_score(
        [observed_call], [blank_expectation], strict=False, subset=False
    )
    per_call_notes = justification["explained_tool_calls_args"]

    assert score == 0.0
    # The first (and only) "tool1" call is keyed as "tool1_0".
    assert "Score: 0.0" in per_call_notes["tool1_0"]

def test_empty_expected_args_against_non_empty_actual_fails_subset_mode(
    self,
) -> None:
    """Subset mode also scores 0 for empty expected.args vs. non-empty actual.args."""
    observed_calls = [ToolCall(name="tool1", args={"a": 1, "b": 2})]
    blank_expectations = [ToolCall(name="tool1", args={})]

    # Only the score is checked here; the justification payload is ignored.
    score, _justification = tool_calls_args_score(
        observed_calls, blank_expectations, strict=False, subset=True
    )

    assert score == 0.0

def test_empty_expected_args_against_empty_actual_passes(self) -> None:
    """Score 1 when both the expected and the actual call carry no args.

    Counterpart to the empty-expected/non-empty-actual cases: two empty
    argument sets are still a genuine match.
    """
    observed_call = ToolCall(name="tool1", args={})
    blank_expectation = ToolCall(name="tool1", args={})

    # Exact (non-subset), non-strict comparison of one actual vs. one expected call.
    score, justification = tool_calls_args_score(
        [observed_call], [blank_expectation], strict=False, subset=False
    )
    per_call_notes = justification["explained_tool_calls_args"]

    assert score == 1.0
    # The first (and only) "tool1" call is keyed as "tool1_0".
    assert "Score: 1.0" in per_call_notes["tool1_0"]


class TestToolCallsOutputScore:
"""Test tool_calls_output_score helper function."""
Expand Down
Loading