Skip to content

Commit e4b2cdd

Browse files
dcramer and claude
committed
ci(evals): add GitHub Check Run reporting for evaluation results
Create automated check runs that report eval scores and statistics:

- Add create-eval-check.js script to parse eval-results.json and create GitHub Check Runs
- Update eval:ci to output JSON results via --reporter=json
- Set check conclusion to 'success' if avg score >= 0.5, 'failure' otherwise
- Display overall statistics and score distribution (green/yellow/red)
- Run check creation even if evals fail (continue-on-error + !cancelled)
- Add checks:write permission to eval workflow
- Ignore generated eval-results.json in version control

The check run provides:

- Overall average score with pass/fail threshold (0.50)
- Score distribution by category (>=0.75, 0.50-0.74, <0.50)
- Individual eval scores sorted by performance
- Rationale for failed or low-scoring tests

Co-Authored-By: Claude Code <[email protected]>
1 parent 74c2b4c commit e4b2cdd

File tree

4 files changed

+218
-1
lines changed

4 files changed

+218
-1
lines changed
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
#!/usr/bin/env node
2+
3+
/**
4+
* Create a GitHub Check Run for eval results based on score statistics.
5+
*
6+
* This script:
7+
* 1. Reads eval-results.json from the evals package
8+
* 2. Calculates overall statistics and score distribution
9+
* 3. Creates a GitHub Check Run via the Checks API
10+
* 4. Sets conclusion to 'success' if avg score >= 0.5, 'failure' otherwise
11+
*
12+
* Environment variables required:
13+
* - GITHUB_TOKEN: GitHub token with checks:write permission
14+
* - GITHUB_REPOSITORY: Repository in format 'owner/repo'
15+
* - GITHUB_SHA: Commit SHA to create the check for
16+
*/
17+
18+
import { readFileSync } from "node:fs";
19+
import { resolve } from "node:path";
20+
21+
/**
 * POST a completed Check Run to the GitHub Checks API.
 *
 * @param {object} params
 * @param {string} params.owner - Repository owner.
 * @param {string} params.repo - Repository name.
 * @param {string} params.token - Token with checks:write permission.
 * @param {string} params.sha - Commit SHA the check attaches to.
 * @param {string} params.name - Check run name shown in the GitHub UI.
 * @param {string} params.conclusion - "success" or "failure".
 * @param {string} params.title - Output title line.
 * @param {string} params.summary - Markdown summary section.
 * @param {string} params.text - Markdown detail body.
 * @returns {Promise<object>} Parsed JSON response from the API.
 * @throws {Error} When the API responds with a non-2xx status.
 */
async function createCheckRun({
  owner,
  repo,
  token,
  sha,
  name,
  conclusion,
  title,
  summary,
  text,
}) {
  const endpoint = `https://api.github.com/repos/${owner}/${repo}/check-runs`;

  // The run is reported as already completed — we only create it after
  // the eval results are fully known.
  const payload = {
    name,
    head_sha: sha,
    status: "completed",
    conclusion,
    output: { title, summary, text },
  };

  const response = await fetch(endpoint, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${token}`,
      Accept: "application/vnd.github+json",
      "X-GitHub-Api-Version": "2022-11-28",
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    const error = await response.text();
    throw new Error(`Failed to create check run: ${response.status} ${error}`);
  }

  return response.json();
}
63+
64+
/**
 * Render a 0–1 score as a two-decimal string prefixed with a
 * traffic-light emoji: 🟢 for >= 0.75, 🟡 for >= 0.50, 🔴 otherwise.
 *
 * @param {number} score - Average eval score.
 * @returns {string} e.g. "🟢 0.82"
 */
function formatScore(score) {
  const label = score.toFixed(2);
  if (score >= 0.75) {
    return `🟢 ${label}`;
  }
  return score >= 0.5 ? `🟡 ${label}` : `🔴 ${label}`;
}
70+
71+
/**
 * Collect per-test eval records from vitest's JSON reporter output.
 * Only assertion results carrying a `meta.eval` payload are included.
 *
 * @param {object} vitestResults - Parsed document from vitest --reporter=json.
 * @returns {Array<{name: string, file: string, avgScore: (number|null),
 *   scores: Array, passed: boolean, duration: (number|undefined)}>}
 */
function extractEvalResults(vitestResults) {
  const evalResults = [];
  for (const testFile of vitestResults.testResults || []) {
    for (const test of testFile.assertionResults || []) {
      if (test.meta?.eval) {
        evalResults.push({
          name: test.fullName || test.title,
          file: testFile.name,
          avgScore: test.meta.eval.avgScore ?? null,
          scores: test.meta.eval.scores || [],
          passed: test.status === "passed",
          duration: test.duration,
        });
      }
    }
  }
  return evalResults;
}

/**
 * Read eval-results.json, compute score statistics, and publish a GitHub
 * Check Run whose conclusion is "success" when the average score is at
 * least 0.5 and "failure" otherwise.
 *
 * Environment variables required: GITHUB_TOKEN, GITHUB_REPOSITORY
 * (owner/repo), GITHUB_SHA.
 *
 * @throws {Error} On missing environment variables, an unreadable or
 *   malformed results file, or a failed Checks API request.
 */
async function main() {
  // Validate environment
  const token = process.env.GITHUB_TOKEN;
  const repository = process.env.GITHUB_REPOSITORY;
  const sha = process.env.GITHUB_SHA;

  if (!token || !repository || !sha) {
    throw new Error(
      "Missing required environment variables: GITHUB_TOKEN, GITHUB_REPOSITORY, GITHUB_SHA",
    );
  }

  const [owner, repo] = repository.split("/");

  // Read eval results (vitest JSON format)
  const resultsPath = resolve(
    process.cwd(),
    "packages/mcp-server-evals/eval-results.json",
  );
  console.log(`Reading eval results from: ${resultsPath}`);

  const vitestResults = JSON.parse(readFileSync(resultsPath, "utf-8"));
  const evalResults = extractEvalResults(vitestResults);

  // Calculate statistics. Tests without a numeric avgScore are excluded
  // from both the average and the distribution buckets.
  const totalTests = evalResults.length;
  const validScores = evalResults
    .map((r) => r.avgScore)
    .filter((score) => score !== null);

  // NOTE: with zero valid scores the average is 0, which yields a
  // "failure" conclusion — a run that produced no eval data is treated
  // as failed.
  const avgScore =
    validScores.length > 0
      ? validScores.reduce((sum, score) => sum + score, 0) / validScores.length
      : 0;

  const green = validScores.filter((s) => s >= 0.75).length;
  const yellow = validScores.filter((s) => s >= 0.5 && s < 0.75).length;
  const red = validScores.filter((s) => s < 0.5).length;

  // Determine conclusion based on 0.5 threshold
  const conclusion = avgScore >= 0.5 ? "success" : "failure";

  // Format title
  const title = `Eval Score: ${avgScore.toFixed(2)} (${green} green, ${yellow} yellow, ${red} red)`;

  // Format summary
  const summary = [
    `## Overall Statistics`,
    ``,
    `- **Total Evaluations**: ${totalTests}`,
    `- **Average Score**: ${formatScore(avgScore)}`,
    `- **Pass Threshold**: 0.50 (catastrophic failure)`,
    ``,
    `### Score Distribution`,
    `- 🟢 Green (≥0.75): ${green} evals`,
    `- 🟡 Yellow (0.50-0.74): ${yellow} evals`,
    `- 🔴 Red (<0.50): ${red} evals`,
    ``,
  ].join("\n");

  // Format detailed results, highest score first
  const detailsByScore = [...evalResults].sort(
    (a, b) => (b.avgScore || 0) - (a.avgScore || 0),
  );

  const details = [
    `## Individual Eval Scores`,
    ``,
    ...detailsByScore.map((result) => {
      const score = result.avgScore !== null ? result.avgScore : 0;
      const statusIcon = result.passed ? "✅" : "❌";
      const scoreDisplay = formatScore(score);

      let line = `${statusIcon} **${result.name}**: ${scoreDisplay}`;

      // Add rationale for failed or low-scoring tests
      if (!result.passed || score < 0.75) {
        const firstScore = result.scores[0];
        if (firstScore?.metadata?.rationale) {
          line += `\n - ${firstScore.metadata.rationale}`;
        }
      }

      return line;
    }),
    ``,
    `---`,
    ``,
    `### Conclusion`,
    ``,
    conclusion === "success"
      ? `✅ **Passed**: Average score (${avgScore.toFixed(2)}) is above the catastrophic failure threshold (0.50)`
      : `❌ **Failed**: Average score (${avgScore.toFixed(2)}) is below the catastrophic failure threshold (0.50)`,
  ].join("\n");

  // Create check run
  console.log(`Creating check run with conclusion: ${conclusion}`);
  const checkRun = await createCheckRun({
    owner,
    repo,
    token,
    sha,
    name: "Evaluation Results",
    conclusion,
    title,
    summary,
    text: details,
  });

  console.log(`✅ Check run created: ${checkRun.html_url}`);
  console.log(` Conclusion: ${conclusion}`);
  console.log(` Average Score: ${avgScore.toFixed(2)}`);
}
202+
203+
main().catch((error) => {
204+
console.error("❌ Error creating check run:", error);
205+
process.exit(1);
206+
});

.github/workflows/eval.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ jobs:
2020
eval:
2121
environment: Actions
2222
runs-on: ubuntu-latest
23+
permissions:
24+
checks: write
25+
contents: read
2326
steps:
2427
- uses: actions/checkout@v4
2528

@@ -55,9 +58,16 @@ jobs:
5558

5659
- name: Run evals
5760
run: pnpm eval:ci evals
61+
continue-on-error: true
5862
env:
5963
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
6064

65+
- name: Create eval status check
66+
if: ${{ !cancelled() }}
67+
run: node .github/scripts/create-eval-check.js
68+
env:
69+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
70+
6171
- name: Upload coverage reports to Codecov
6272
uses: codecov/codecov-action@v4
6373
env:

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,4 @@ coverage
6565
# Generated files
6666
packages/mcp-server/src/toolDefinitions.json
6767
packages/mcp-server/src/skillDefinitions.json
68+
packages/mcp-server-evals/eval-results.json

packages/mcp-server-evals/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"dev": "tsc -w",
1313
"start": "tsx src/bin/start-mock-stdio.ts",
1414
"eval": "vitest --config=vitest.config.ts",
15-
"eval:ci": "vitest run --coverage --reporter=vitest-evals/reporter --reporter=junit --outputFile=eval.junit.xml"
15+
"eval:ci": "vitest run --coverage --reporter=vitest-evals/reporter --reporter=junit --reporter=json --outputFile.json=eval-results.json --outputFile.junit=eval.junit.xml"
1616
},
1717
"dependencies": {
1818
"@ai-sdk/openai": "catalog:",

0 commit comments

Comments
 (0)