UiPath · ajay-kesavan · Jun 18, 2026
diff --git a/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py b/packages/uipath/src/uipath/eval/_helpers/evaluators_helpers.py
@@ -500,25 +500,33 @@ def tool_calls_args_score(
                 tool_key = f"{call.name}_{tool_counters[call.name]}"
                 tool_counters[call.name] += 1
 
-                # Check arguments based on mode
-                if subset:
-                    # Subset mode: safely check if all expected args exist and match
-                    # Capture 'call' as a default argument to bind the loop variable
-                    args_check = (  # noqa: E731
-                        lambda k, v, call=call: k in call.args and call.args[k] == v
-                    )
-                else:
-                    # Exact mode: direct access (may raise KeyError)
-                    # Capture 'call' as a default argument to bind the loop variable
-                    args_check = lambda k, v, call=call: call.args[k] == v  # noqa: E731
-
-                try:
-                    args_match = all(
-                        args_check(k, v) for k, v in expected_tool_call.args.items()
-                    )
-                except KeyError:
-                    # Only possible in exact mode when key is missing
+                # Empty expected.args with non-empty actual.args is a mismatch.
+                # Without this guard the generator below iterates zero times
+                # and `all([])` returns True, vacuously passing any actual
+                # call — masking the common authoring case of an evaluator
+                # whose expected.args was never filled in.
+                if not expected_tool_call.args and call.args:
                     args_match = False
+                else:
+                    # Check arguments based on mode
+                    if subset:
+                        # Subset mode: safely check if all expected args exist and match
+                        # Capture 'call' as a default argument to bind the loop variable
+                        args_check = (  # noqa: E731
+                            lambda k, v, call=call: k in call.args and call.args[k] == v
+                        )
+                    else:
+                        # Exact mode: direct access (may raise KeyError)
+                        # Capture 'call' as a default argument to bind the loop variable
+                        args_check = lambda k, v, call=call: call.args[k] == v  # noqa: E731
+
+                    try:
+                        args_match = all(
+                            args_check(k, v) for k, v in expected_tool_call.args.items()
+                        )
+                    except KeyError:
+                        # Only possible in exact mode when key is missing
+                        args_match = False
 
                 justifications[justification_key][tool_key] = (
                     f"Actual: {call.args}, Expected: {expected_tool_call.args}, Score: {float(args_match)}"

diff --git a/packages/uipath/tests/evaluators/test_evaluator_helpers.py b/packages/uipath/tests/evaluators/test_evaluator_helpers.py
@@ -282,6 +282,45 @@ def test_strict_mode_with_mismatch(self) -> None:
         # In strict mode, partial match should still score proportionally unless all match
         assert score == 0.0  # strict mode requires all to match
 
+    def test_empty_expected_args_against_non_empty_actual_fails(self) -> None:
+        """Empty expected.args with non-empty actual.args must score 0, not vacuously pass.
+
+        Regression for the ``all([]) is True`` trap: the per-key generator
+        iterates zero times when expected.args is empty, so without an
+        explicit guard the result vacuously matches any actual call.
+        """
+        actual = [
+            ToolCall(name="tool1", args={"provider": "GCS", "query": "Q"}),
+        ]
+        expected = [ToolCall(name="tool1", args={})]
+        score, justification = tool_calls_args_score(
+            actual, expected, strict=False, subset=False
+        )
+
+        assert score == 0.0
+        assert "Score: 0.0" in justification["explained_tool_calls_args"]["tool1_0"]
+
+    def test_empty_expected_args_against_non_empty_actual_fails_subset_mode(
+        self,
+    ) -> None:
+        """Subset mode also scores 0 for empty expected vs. non-empty actual args."""
+        actual = [ToolCall(name="tool1", args={"a": 1, "b": 2})]
+        expected = [ToolCall(name="tool1", args={})]
+        score, _ = tool_calls_args_score(actual, expected, strict=False, subset=True)
+
+        assert score == 0.0
+
+    def test_empty_expected_args_against_empty_actual_passes(self) -> None:
+        """Both args empty is a legitimate match — confirms the new guard is targeted."""
+        actual = [ToolCall(name="tool1", args={})]
+        expected = [ToolCall(name="tool1", args={})]
+        score, justification = tool_calls_args_score(
+            actual, expected, strict=False, subset=False
+        )
+
+        assert score == 1.0
+        assert "Score: 1.0" in justification["explained_tool_calls_args"]["tool1_0"]
+
 
 class TestToolCallsOutputScore:
     """Test tool_calls_output_score helper function."""