evals: rework the StatementEvaluator rubric

kdudka · kdudka · commit 4a599ec9797f · 2025-12-04T15:00:29.000+01:00
It turned out that the evals had been passing only because `osidb_cache` was feeding the LLM unfiltered data from OSIDB, which also contained the fields to be suggested. Fixes: commit 2b9fa21 Related: https://issues.redhat.com/browse/AEGIS-265
diff --git a/evals/features/cve/test_suggest_statement.py b/evals/features/cve/test_suggest_statement.py
@@ -20,7 +20,13 @@
 field_evaluators = {
     "suggested_statement": create_llm_judge(
         score_name="StatementEvaluator",
-        rubric="Score how much the actual suggested_statement field is semantically equivalent to the expected suggested_statement field.  If the key message is the same but the style is different, the score should not be zero.  If the style is different, the score should not be 1.0.",
+        rubric=(
+            "Score semantic equivalence between the actual suggested_statement and the expected suggested_statement. "
+            "Emphasize matching rationale (impact justification in RH context, preconditions, scope). "
+            "If style differs but the core message overlaps, the score should be > 0.0 and < 1.0 depending on overlap. "
+            "Only assign 0.0 if the actual is irrelevant to the CVE or contradicts the expected meaning. "
+            "When partially aligned but missing details, prefer a low non-zero score (e.g., 0.12–0.3) rather than 0.0."
+        ),
         include_expected_output=True,
     ),
     "suggested_mitigation": create_llm_judge(