@@ -183,12 +183,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
}

binary_result = self._get_binary_result(score)
return {
self._result_key: float(score),
f"gpt_{self._result_key}": float(score),
f"{self._result_key}_result": binary_result,
f"{self._result_key}_threshold": self._threshold,
}
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)

@staticmethod
def _get_built_in_tool_definition(tool_name: str):
@@ -204,14 +204,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
}
return response_dict
# If llm_output is not a dictionary, return NaN for the score. This should never happen
if logger:
logger.warning("LLM output is not a dictionary, returning NaN for the score.")

binary_result = self._get_binary_result(score)
return {
self._result_key: float(score),
f"gpt_{self._result_key}": float(score),
f"{self._result_key}_result": binary_result,
f"{self._result_key}_threshold": self._threshold,
}
# If llm_output is not a dictionary, raise exception
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)
@@ -220,12 +220,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
f"{self._result_key}_sample_output": result.get("sample_output", ""),
}

if logger:
logger.warning("LLM output is not a dictionary, returning NaN for the score.")

binary_result = self._get_binary_result(score)
return {
self._result_key: float(score),
f"{self._result_key}_result": binary_result,
f"{self._result_key}_threshold": self._threshold,
}
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)
@@ -191,12 +191,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
f"{self._result_key}_sample_output": result.get("sample_output", ""),
}

if logger:
logger.warning("LLM output is not a dictionary, returning NaN for the score.")

binary_result = self._get_binary_result(score)
return {
self._result_key: float(score),
f"{self._result_key}_result": binary_result,
f"{self._result_key}_threshold": self._threshold,
}
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)
@@ -241,7 +241,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
}

if logger:
logger.warning("LLM output is not a dictionary, returning 0 for the success.")

return {self._result_key: 0}
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)
@@ -194,6 +194,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
}
if logger:
logger.warning("LLM output is not a dictionary, returning 0 for the success.")
return {self._result_key: 0}
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)
Comment on lines +197 to +202
Copilot AI Feb 12, 2026

This is a breaking change. The existing test at lines 230-244 in test_task_completion_evaluator.py expects this evaluator to return 0 when the LLM output is not a dictionary, but the new code raises an exception instead. The test needs to be updated to expect an EvaluationException.

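A minimal sketch of how that test assertion might be updated, assuming a pytest-asyncio style async test; the fixture names and the EvaluationException import path below are illustrative, not taken from this repository:

import pytest

from azure.ai.evaluation._exceptions import EvaluationException  # assumed import path


@pytest.mark.asyncio
async def test_task_completion_invalid_llm_output(evaluator_with_non_dict_output, eval_input):
    # evaluator_with_non_dict_output and eval_input stand in for whatever fixtures the
    # existing test already uses to force a non-dict LLM response; only the assertion
    # changes here, from checking for a returned 0 to expecting the new EvaluationException.
    with pytest.raises(EvaluationException):
        await evaluator_with_non_dict_output._do_eval(eval_input)
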
@@ -262,10 +262,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t

else:
raise EvaluationException(
message="Tool call accuracy evaluator returned invalid output.",
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
target=ErrorTarget.EVALUATE,
)

async def _real_call(self, **kwargs):
@@ -207,16 +207,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
}
if logger:
logger.warning("LLM output is not a dictionary, returning NaN for the score.")

score = math.nan
binary_result = self._get_binary_result(score)
return {
self._result_key: float(score),
f"{self._result_key}_result": binary_result,
f"{self._result_key}_threshold": self._threshold,
}
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)


def _filter_to_used_tools(tool_definitions, msgs_list, logger=None):
@@ -217,10 +217,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:

else:
raise EvaluationException(
message="Tool input accuracy evaluator returned invalid output.",
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
target=ErrorTarget.EVALUATE,
)

async def _real_call(self, **kwargs):
@@ -232,13 +232,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
}
if logger:
logger.warning("LLM output is not a dictionary, returning NaN for the score.")

score = math.nan
binary_result = self._get_binary_result(score)
return {
self._result_key: float(score),
f"{self._result_key}_result": binary_result,
f"{self._result_key}_threshold": self._threshold,
}
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)
Comment on lines +235 to +240
Copilot AI Feb 12, 2026

This is a breaking behavioral change that should be documented in the CHANGELOG under a "Breaking Changes" section. Previously, when an evaluator received invalid (non-parseable) LLM output, it returned a default value (NaN or 0); now it raises an EvaluationException. The change affects multiple evaluators: ToolSelectionEvaluator, ToolOutputUtilizationEvaluator, ToolInputAccuracyEvaluator, ToolCallSuccessEvaluator, ToolCallAccuracyEvaluator, TaskCompletionEvaluator, TaskAdherenceEvaluator, ResponseCompletenessEvaluator, RelevanceEvaluator, and IntentResolutionEvaluator.

Suggested change
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)
if logger:
logger.warning(
"LLM output is not a dictionary; returning NaN for the score and empty reason."
)
return {
f"{self._result_key}": math.nan,
f"{self._result_key}_reason": "",
f"{self._result_key}_result": "fail",
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
}

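For downstream callers, the net effect of this hunk matches the other files: the invalid-output path now surfaces as an EvaluationException instead of a NaN (or 0) score. A hedged sketch of how calling code that relied on the old soft-failure behavior might adapt, assuming the exception propagates out of the evaluator's public call (the import path and wrapper below are illustrative, not part of this PR):

from azure.ai.evaluation._exceptions import EvaluationException  # assumed import path


def run_evaluator_leniently(evaluator, **eval_kwargs):
    # Before this change, a non-dict LLM output came back as a NaN (or 0) score that callers
    # could filter out; after it, the same condition raises, so callers that want the old
    # behavior have to catch the exception and decide how to record the failed row.
    try:
        return evaluator(**eval_kwargs)
    except EvaluationException:
        return None  # e.g. log and skip this row in an aggregation pipeline
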
@@ -239,10 +239,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:

else:
raise EvaluationException(
message="Tool selection evaluator returned invalid output.",
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.TOOL_SELECTION_EVALUATOR,
target=ErrorTarget.EVALUATE,
)

async def _real_call(self, **kwargs):