@@ -183,12 +183,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
}

binary_result = self._get_binary_result(score)
return {
self._result_key: float(score),
f"gpt_{self._result_key}": float(score),
f"{self._result_key}_result": binary_result,
f"{self._result_key}_threshold": self._threshold,
}
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)

@staticmethod
def _get_built_in_tool_definition(tool_name: str):
@@ -204,14 +204,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
}
return response_dict
# If llm_output is not a dictionary, return NaN for the score. This should never happen
if logger:
logger.warning("LLM output is not a dictionary, returning NaN for the score.")

binary_result = self._get_binary_result(score)
return {
self._result_key: float(score),
f"gpt_{self._result_key}": float(score),
f"{self._result_key}_result": binary_result,
f"{self._result_key}_threshold": self._threshold,
}
# If llm_output is not a dictionary, raise exception
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)
@@ -220,12 +220,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
f"{self._result_key}_sample_output": result.get("sample_output", ""),
}

if logger:
logger.warning("LLM output is not a dictionary, returning NaN for the score.")

binary_result = self._get_binary_result(score)
return {
self._result_key: float(score),
f"{self._result_key}_result": binary_result,
f"{self._result_key}_threshold": self._threshold,
}
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)
@@ -191,12 +191,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
f"{self._result_key}_sample_output": result.get("sample_output", ""),
}

if logger:
logger.warning("LLM output is not a dictionary, returning NaN for the score.")

binary_result = self._get_binary_result(score)
return {
self._result_key: float(score),
f"{self._result_key}_result": binary_result,
f"{self._result_key}_threshold": self._threshold,
}
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)
@@ -241,7 +241,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
}

if logger:
logger.warning("LLM output is not a dictionary, returning 0 for the success.")

return {self._result_key: 0}
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)
@@ -194,6 +194,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
}
if logger:
logger.warning("LLM output is not a dictionary, returning 0 for the success.")
return {self._result_key: 0}
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)
Comment on lines +197 to +202
Copilot AI Feb 12, 2026

This is a breaking change. The existing test at lines 230-244 in test_task_completion_evaluator.py expects this evaluator to return 0 when the LLM output is not a dictionary, but the new code raises an exception instead. The test needs to be updated to expect an EvaluationException.

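A minimal sketch of how that test assertion might be updated, assuming a pytest-asyncio style async test; the fixture names and the EvaluationException import path below are illustrative, not taken from this repository:

import pytest

from azure.ai.evaluation._exceptions import EvaluationException  # assumed import path


@pytest.mark.asyncio
async def test_task_completion_invalid_llm_output(evaluator_with_non_dict_output, eval_input):
    # evaluator_with_non_dict_output and eval_input stand in for whatever fixtures the
    # existing test already uses to force a non-dict LLM response; only the assertion
    # changes here, from checking for a returned 0 to expecting the new EvaluationException.
    with pytest.raises(EvaluationException):
        await evaluator_with_non_dict_output._do_eval(eval_input)
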
@@ -262,10 +262,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t

else:
raise EvaluationException(
message="Tool call accuracy evaluator returned invalid output.",
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
target=ErrorTarget.EVALUATE,
)

async def _real_call(self, **kwargs):
@@ -207,16 +207,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
}
if logger:
logger.warning("LLM output is not a dictionary, returning NaN for the score.")

score = math.nan
binary_result = self._get_binary_result(score)
return {
self._result_key: float(score),
f"{self._result_key}_result": binary_result,
f"{self._result_key}_threshold": self._threshold,
}
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)


def _filter_to_used_tools(tool_definitions, msgs_list, logger=None):
@@ -217,10 +217,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:

else:
raise EvaluationException(
message="Tool input accuracy evaluator returned invalid output.",
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
target=ErrorTarget.EVALUATE,
)

async def _real_call(self, **kwargs):
@@ -232,13 +232,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
}
if logger:
logger.warning("LLM output is not a dictionary, returning NaN for the score.")

score = math.nan
binary_result = self._get_binary_result(score)
return {
self._result_key: float(score),
f"{self._result_key}_result": binary_result,
f"{self._result_key}_threshold": self._threshold,
}
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)
Comment on lines +235 to +240
Copilot AI Feb 12, 2026

This is a breaking behavioral change that should be documented in the CHANGELOG under a "Breaking Changes" section. Previously, when an evaluator received invalid (non-parseable) LLM output, it returned a default value (NaN or 0); now it raises an EvaluationException. The change affects multiple evaluators: ToolSelectionEvaluator, ToolOutputUtilizationEvaluator, ToolInputAccuracyEvaluator, ToolCallSuccessEvaluator, ToolCallAccuracyEvaluator, TaskCompletionEvaluator, TaskAdherenceEvaluator, ResponseCompletenessEvaluator, RelevanceEvaluator, and IntentResolutionEvaluator.

Suggested change
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)
if logger:
logger.warning(
"LLM output is not a dictionary; returning NaN for the score and empty reason."
)
return {
f"{self._result_key}": math.nan,
f"{self._result_key}_reason": "",
f"{self._result_key}_result": "fail",
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
}

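For downstream callers, the net effect of this hunk matches the other files: the invalid-output path now surfaces as an EvaluationException instead of a NaN (or 0) score. A hedged sketch of how calling code that relied on the old soft-failure behavior might adapt, assuming the exception propagates out of the evaluator's public call (the import path and wrapper below are illustrative, not part of this PR):

from azure.ai.evaluation._exceptions import EvaluationException  # assumed import path


def run_evaluator_leniently(evaluator, **eval_kwargs):
    # Before this change, a non-dict LLM output came back as a NaN (or 0) score that callers
    # could filter out; after it, the same condition raises, so callers that want the old
    # behavior have to catch the exception and decide how to record the failed row.
    try:
        return evaluator(**eval_kwargs)
    except EvaluationException:
        return None  # e.g. log and skip this row in an aggregation pipeline
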
@@ -239,10 +239,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:

else:
raise EvaluationException(
message="Tool selection evaluator returned invalid output.",
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.TOOL_SELECTION_EVALUATOR,
target=ErrorTarget.EVALUATE,
)

async def _real_call(self, **kwargs):