Commit c135c75

dcramer and claude committed
ci(evals): add GitHub Check Run reporting for evaluation results
Create automated check runs that report eval scores and statistics:

- Add create-eval-check.js script to parse eval-results.json and create GitHub Check Runs
- Update eval:ci to output JSON results via --reporter=json
- Set check conclusion to 'success' if avg score >= 0.5, 'failure' otherwise
- Display overall statistics and score distribution (green/yellow/red)
- Run check creation even if evals fail (continue-on-error + !cancelled)
- Add checks:write permission to eval workflow
- Ignore generated eval-results.json in version control

The check run provides:

- Overall average score with pass/fail threshold (0.50)
- Score distribution by category (>=0.75, 0.50-0.74, <0.50)
- Individual eval scores sorted by performance
- Rationale for failed or low-scoring tests

Co-Authored-By: Claude Code <[email protected]>
1 parent 6beb840 · commit c135c75

File tree: 4 files changed (+228, -8 lines)
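For context on what the new check displays: the title and summary are assembled from the format strings in the script below, so with hypothetical scores the rendered check run would look roughly like this (all numbers invented for illustration):

Eval Score: 0.78 (9 green, 3 yellow, 1 red)

## Overall Statistics

- **Total Evaluations**: 13
- **Average Score**: 🟢 0.78
- **Pass Threshold**: 0.50 (catastrophic failure)

### Score Distribution
- 🟢 Green (≥0.75): 9 evals
- 🟡 Yellow (0.50-0.74): 3 evals
- 🔴 Red (<0.50): 1 evals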
.github/scripts/create-eval-check.js

Lines changed: 216 additions & 0 deletions

@@ -0,0 +1,216 @@
+#!/usr/bin/env node
+
+/**
+ * Create a GitHub Check Run for eval results based on score statistics.
+ *
+ * This script:
+ * 1. Reads eval-results.json from the evals package
+ * 2. Calculates overall statistics and score distribution
+ * 3. Creates a GitHub Check Run via the Checks API
+ * 4. Sets conclusion to 'success' if avg score >= 0.5, 'failure' otherwise
+ *
+ * Environment variables required:
+ * - GITHUB_TOKEN: GitHub token with checks:write permission
+ * - GITHUB_REPOSITORY: Repository in format 'owner/repo'
+ * - GITHUB_SHA: Commit SHA to create the check for
+ */
+
+import { readFileSync } from "node:fs";
+import { resolve } from "node:path";
+
+// GitHub API helper
+async function createCheckRun({
+  owner,
+  repo,
+  token,
+  sha,
+  name,
+  conclusion,
+  title,
+  summary,
+  text,
+}) {
+  const url = `https://api.github.com/repos/${owner}/${repo}/check-runs`;
+
+  const response = await fetch(url, {
+    method: "POST",
+    headers: {
+      Authorization: `Bearer ${token}`,
+      Accept: "application/vnd.github+json",
+      "X-GitHub-Api-Version": "2022-11-28",
+      "Content-Type": "application/json",
+    },
+    body: JSON.stringify({
+      name,
+      head_sha: sha,
+      status: "completed",
+      conclusion,
+      output: {
+        title,
+        summary,
+        text,
+      },
+    }),
+  });
+
+  if (!response.ok) {
+    const error = await response.text();
+    throw new Error(`Failed to create check run: ${response.status} ${error}`);
+  }
+
+  return response.json();
+}
+
+// Format score with color emoji
+function formatScore(score) {
+  if (score >= 0.75) return `🟢 ${score.toFixed(2)}`;
+  if (score >= 0.5) return `🟡 ${score.toFixed(2)}`;
+  return `🔴 ${score.toFixed(2)}`;
+}
+
+// Main execution
+async function main() {
+  // Validate environment
+  const token = process.env.GITHUB_TOKEN;
+  const repository = process.env.GITHUB_REPOSITORY;
+  const sha = process.env.GITHUB_SHA;
+
+  if (!token || !repository || !sha) {
+    throw new Error(
+      "Missing required environment variables: GITHUB_TOKEN, GITHUB_REPOSITORY, GITHUB_SHA",
+    );
+  }
+
+  const [owner, repo] = repository.split("/");
+
+  // Read eval results (vitest JSON format)
+  const resultsPath = resolve(
+    process.cwd(),
+    "packages/mcp-server-evals/eval-results.json",
+  );
+  console.log(`Reading eval results from: ${resultsPath}`);
+
+  let vitestResults;
+  try {
+    vitestResults = JSON.parse(readFileSync(resultsPath, "utf-8"));
+  } catch (error) {
+    if (error.code === "ENOENT") {
+      throw new Error(
+        `Eval results file not found at ${resultsPath}. The eval run likely failed before producing results. Check the "Run evals" step logs for errors.`,
+      );
+    }
+    throw new Error(`Failed to read/parse eval results: ${error.message}`);
+  }
+
+  // Extract eval results from vitest format
+  const evalResults = [];
+  for (const testFile of vitestResults.testResults || []) {
+    for (const test of testFile.assertionResults || []) {
+      if (test.meta?.eval) {
+        evalResults.push({
+          name: test.fullName || test.title,
+          file: testFile.name,
+          avgScore: test.meta.eval.avgScore ?? null,
+          scores: test.meta.eval.scores || [],
+          passed: test.status === "passed",
+          duration: test.duration,
+        });
+      }
+    }
+  }
+
+  // Calculate statistics
+  const totalTests = evalResults.length;
+  const validScores = evalResults
+    .map((r) => r.avgScore)
+    .filter((score) => score !== null);
+
+  const avgScore =
+    validScores.length > 0
+      ? validScores.reduce((sum, score) => sum + score, 0) / validScores.length
+      : 0;
+
+  const green = validScores.filter((s) => s >= 0.75).length;
+  const yellow = validScores.filter((s) => s >= 0.5 && s < 0.75).length;
+  const red = validScores.filter((s) => s < 0.5).length;
+  const scoreDistribution = { green, yellow, red };
+
+  // Determine conclusion based on 0.5 threshold
+  const conclusion = avgScore >= 0.5 ? "success" : "failure";
+
+  // Format title
+  const title = `Eval Score: ${avgScore.toFixed(2)} (${green} green, ${yellow} yellow, ${red} red)`;
+
+  // Format summary
+  const summary = [
+    `## Overall Statistics`,
+    ``,
+    `- **Total Evaluations**: ${totalTests}`,
+    `- **Average Score**: ${formatScore(avgScore)}`,
+    `- **Pass Threshold**: 0.50 (catastrophic failure)`,
+    ``,
+    `### Score Distribution`,
+    `- 🟢 Green (≥0.75): ${green} evals`,
+    `- 🟡 Yellow (0.50-0.74): ${yellow} evals`,
+    `- 🔴 Red (<0.50): ${red} evals`,
+    ``,
+  ].join("\n");
+
+  // Format detailed results
+  const detailsByScore = [...evalResults].sort(
+    (a, b) => (b.avgScore || 0) - (a.avgScore || 0),
+  );
+
+  const details = [
+    `## Individual Eval Scores`,
+    ``,
+    ...detailsByScore.map((result) => {
+      const score = result.avgScore !== null ? result.avgScore : 0;
+      const statusIcon = result.passed ? "✅" : "❌";
+      const scoreDisplay = formatScore(score);
+
+      let line = `${statusIcon} **${result.name}**: ${scoreDisplay}`;
+
+      // Add rationale for failed or low-scoring tests
+      if (!result.passed || score < 0.75) {
+        const firstScore = result.scores[0];
+        if (firstScore?.metadata?.rationale) {
+          line += `\n - ${firstScore.metadata.rationale}`;
+        }
+      }

+      return line;
+    }),
+    ``,
+    `---`,
+    ``,
+    `### Conclusion`,
+    ``,
+    conclusion === "success"
+      ? `✅ **Passed**: Average score (${avgScore.toFixed(2)}) is above the catastrophic failure threshold (0.50)`
+      : `❌ **Failed**: Average score (${avgScore.toFixed(2)}) is below the catastrophic failure threshold (0.50)`,
+  ].join("\n");
+
+  // Create check run
+  console.log(`Creating check run with conclusion: ${conclusion}`);
+  const checkRun = await createCheckRun({
+    owner,
+    repo,
+    token,
+    sha,
+    name: "Evaluation Results",
+    conclusion,
+    title,
+    summary,
+    text: details,
+  });
+
+  console.log(`✅ Check run created: ${checkRun.html_url}`);
+  console.log(`  Conclusion: ${conclusion}`);
+  console.log(`  Average Score: ${avgScore.toFixed(2)}`);
+}
+
+main().catch((error) => {
+  console.error("❌ Error creating check run:", error);
+  process.exit(1);
+});
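The parser above reads only a handful of fields from vitest's JSON reporter output: testResults, assertionResults, and a meta.eval block (presumably populated by vitest-evals). A minimal eval-results.json that would satisfy it looks roughly like this — the file path, test names, score, and rationale are invented for illustration:

{
  "testResults": [
    {
      "name": "src/evals/search.eval.ts",
      "assertionResults": [
        {
          "title": "finds issues by query",
          "fullName": "search > finds issues by query",
          "status": "passed",
          "duration": 4123,
          "meta": {
            "eval": {
              "avgScore": 0.82,
              "scores": [
                {
                  "score": 0.82,
                  "metadata": { "rationale": "Output matched the expected tool calls" }
                }
              ]
            }
          }
        }
      ]
    }
  ]
}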

.github/workflows/eval.yml

Lines changed: 10 additions & 7 deletions

@@ -20,6 +20,9 @@ jobs:
   eval:
     environment: Actions
     runs-on: ubuntu-latest
+    permissions:
+      checks: write
+      contents: read
     steps:
       - uses: actions/checkout@v4

@@ -55,9 +58,16 @@

       - name: Run evals
         run: pnpm eval:ci evals
+        continue-on-error: true
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

+      - name: Create eval status check
+        if: ${{ !cancelled() }}
+        run: node .github/scripts/create-eval-check.js
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v4
         env:

@@ -72,10 +82,3 @@
         uses: codecov/test-results-action@v1
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-
-      - name: Publish Test Report
-        uses: mikepenz/action-junit-report@cf701569b05ccdd861a76b8607a66d76f6fd4857
-        if: ${{ !cancelled() }}
-        with:
-          report_paths: "**/*.junit.xml"
-        comment: false
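Because the new step is gated on !cancelled() rather than success(), it still runs when the eval step soft-fails under continue-on-error. To smoke-test the script outside CI, the same three environment variables can be supplied by hand — a sketch with placeholder values; the token needs the checks:write scope:

GITHUB_TOKEN=<token> \
GITHUB_REPOSITORY=<owner>/<repo> \
GITHUB_SHA=$(git rev-parse HEAD) \
node .github/scripts/create-eval-check.js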

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -65,3 +65,4 @@ coverage
 # Generated files
 packages/mcp-server/src/toolDefinitions.json
 packages/mcp-server/src/skillDefinitions.json
+packages/mcp-server-evals/eval-results.json

packages/mcp-server-evals/package.json

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@
     "dev": "tsc -w",
     "start": "tsx src/bin/start-mock-stdio.ts",
     "eval": "vitest --config=vitest.config.ts",
-    "eval:ci": "vitest run --coverage --reporter=vitest-evals/reporter --reporter=junit --outputFile=eval.junit.xml"
+    "eval:ci": "vitest run --coverage --reporter=vitest-evals/reporter --reporter=junit --reporter=json --outputFile.json=eval-results.json --outputFile.junit=eval.junit.xml"
   },
   "dependencies": {
     "@ai-sdk/openai": "catalog:",
