Skip to content

Commit e4b2cdd

Browse files
dcramer and claude
committed
ci(evals): add GitHub Check Run reporting for evaluation results
Create automated check runs that report eval scores and statistics:

- Add create-eval-check.js script to parse eval-results.json and create GitHub Check Runs
- Update eval:ci to output JSON results via --reporter=json
- Set check conclusion to 'success' if avg score >= 0.5, 'failure' otherwise
- Display overall statistics and score distribution (green/yellow/red)
- Run check creation even if evals fail (continue-on-error + !cancelled)
- Add checks:write permission to eval workflow
- Ignore generated eval-results.json in version control

The check run provides:

- Overall average score with pass/fail threshold (0.50)
- Score distribution by category (>=0.75, 0.50-0.74, <0.50)
- Individual eval scores sorted by performance
- Rationale for failed or low-scoring tests

Co-Authored-By: Claude Code <[email protected]>
1 parent 74c2b4c commit e4b2cdd

File tree

4 files changed

+218
-1
lines changed

4 files changed

+218
-1
lines changed
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
#!/usr/bin/env node
2+
3+
/**
4+
* Create a GitHub Check Run for eval results based on score statistics.
5+
*
6+
* This script:
7+
* 1. Reads eval-results.json from the evals package
8+
* 2. Calculates overall statistics and score distribution
9+
* 3. Creates a GitHub Check Run via the Checks API
10+
* 4. Sets conclusion to 'success' if avg score >= 0.5, 'failure' otherwise
11+
*
12+
* Environment variables required:
13+
* - GITHUB_TOKEN: GitHub token with checks:write permission
14+
* - GITHUB_REPOSITORY: Repository in format 'owner/repo'
15+
* - GITHUB_SHA: Commit SHA to create the check for
16+
*/
17+
18+
import { readFileSync } from "node:fs";
19+
import { resolve } from "node:path";
20+
21+
/**
 * POST a completed Check Run to the GitHub Checks API.
 *
 * @param {object} params
 * @param {string} params.owner - Repository owner.
 * @param {string} params.repo - Repository name.
 * @param {string} params.token - Token with checks:write permission.
 * @param {string} params.sha - Commit SHA the check attaches to.
 * @param {string} params.name - Check run name shown in the GitHub UI.
 * @param {string} params.conclusion - "success" or "failure".
 * @param {string} params.title - Output title line.
 * @param {string} params.summary - Markdown summary section.
 * @param {string} params.text - Markdown detail body.
 * @returns {Promise<object>} Parsed JSON response from the API.
 * @throws {Error} When the API responds with a non-2xx status.
 */
async function createCheckRun({
  owner,
  repo,
  token,
  sha,
  name,
  conclusion,
  title,
  summary,
  text,
}) {
  const endpoint = `https://api.github.com/repos/${owner}/${repo}/check-runs`;

  // The run is reported as already completed — we only create it after
  // the eval results are fully known.
  const payload = {
    name,
    head_sha: sha,
    status: "completed",
    conclusion,
    output: { title, summary, text },
  };

  const response = await fetch(endpoint, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${token}`,
      Accept: "application/vnd.github+json",
      "X-GitHub-Api-Version": "2022-11-28",
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    const error = await response.text();
    throw new Error(`Failed to create check run: ${response.status} ${error}`);
  }

  return response.json();
}
63+
64+
/**
 * Render a 0–1 score as a two-decimal string prefixed with a
 * traffic-light emoji: 🟢 for >= 0.75, 🟡 for >= 0.50, 🔴 otherwise.
 *
 * @param {number} score - Average eval score.
 * @returns {string} e.g. "🟢 0.82"
 */
function formatScore(score) {
  const label = score.toFixed(2);
  if (score >= 0.75) {
    return `🟢 ${label}`;
  }
  return score >= 0.5 ? `🟡 ${label}` : `🔴 ${label}`;
}
70+
71+
/**
 * Collect per-test eval records from vitest's JSON reporter output.
 * Only assertion results carrying a `meta.eval` payload are included.
 *
 * @param {object} vitestResults - Parsed document from vitest --reporter=json.
 * @returns {Array<{name: string, file: string, avgScore: (number|null),
 *   scores: Array, passed: boolean, duration: (number|undefined)}>}
 */
function extractEvalResults(vitestResults) {
  const evalResults = [];
  for (const testFile of vitestResults.testResults || []) {
    for (const test of testFile.assertionResults || []) {
      if (test.meta?.eval) {
        evalResults.push({
          name: test.fullName || test.title,
          file: testFile.name,
          avgScore: test.meta.eval.avgScore ?? null,
          scores: test.meta.eval.scores || [],
          passed: test.status === "passed",
          duration: test.duration,
        });
      }
    }
  }
  return evalResults;
}

/**
 * Read eval-results.json, compute score statistics, and publish a GitHub
 * Check Run whose conclusion is "success" when the average score is at
 * least 0.5 and "failure" otherwise.
 *
 * Environment variables required: GITHUB_TOKEN, GITHUB_REPOSITORY
 * (owner/repo), GITHUB_SHA.
 *
 * @throws {Error} On missing environment variables, an unreadable or
 *   malformed results file, or a failed Checks API request.
 */
async function main() {
  // Validate environment
  const token = process.env.GITHUB_TOKEN;
  const repository = process.env.GITHUB_REPOSITORY;
  const sha = process.env.GITHUB_SHA;

  if (!token || !repository || !sha) {
    throw new Error(
      "Missing required environment variables: GITHUB_TOKEN, GITHUB_REPOSITORY, GITHUB_SHA",
    );
  }

  const [owner, repo] = repository.split("/");

  // Read eval results (vitest JSON format)
  const resultsPath = resolve(
    process.cwd(),
    "packages/mcp-server-evals/eval-results.json",
  );
  console.log(`Reading eval results from: ${resultsPath}`);

  const vitestResults = JSON.parse(readFileSync(resultsPath, "utf-8"));
  const evalResults = extractEvalResults(vitestResults);

  // Calculate statistics. Tests without a numeric avgScore are excluded
  // from both the average and the distribution buckets.
  const totalTests = evalResults.length;
  const validScores = evalResults
    .map((r) => r.avgScore)
    .filter((score) => score !== null);

  // NOTE: with zero valid scores the average is 0, which yields a
  // "failure" conclusion — a run that produced no eval data is treated
  // as failed.
  const avgScore =
    validScores.length > 0
      ? validScores.reduce((sum, score) => sum + score, 0) / validScores.length
      : 0;

  const green = validScores.filter((s) => s >= 0.75).length;
  const yellow = validScores.filter((s) => s >= 0.5 && s < 0.75).length;
  const red = validScores.filter((s) => s < 0.5).length;

  // Determine conclusion based on 0.5 threshold
  const conclusion = avgScore >= 0.5 ? "success" : "failure";

  // Format title
  const title = `Eval Score: ${avgScore.toFixed(2)} (${green} green, ${yellow} yellow, ${red} red)`;

  // Format summary
  const summary = [
    `## Overall Statistics`,
    ``,
    `- **Total Evaluations**: ${totalTests}`,
    `- **Average Score**: ${formatScore(avgScore)}`,
    `- **Pass Threshold**: 0.50 (catastrophic failure)`,
    ``,
    `### Score Distribution`,
    `- 🟢 Green (≥0.75): ${green} evals`,
    `- 🟡 Yellow (0.50-0.74): ${yellow} evals`,
    `- 🔴 Red (<0.50): ${red} evals`,
    ``,
  ].join("\n");

  // Format detailed results, highest score first
  const detailsByScore = [...evalResults].sort(
    (a, b) => (b.avgScore || 0) - (a.avgScore || 0),
  );

  const details = [
    `## Individual Eval Scores`,
    ``,
    ...detailsByScore.map((result) => {
      const score = result.avgScore !== null ? result.avgScore : 0;
      const statusIcon = result.passed ? "✅" : "❌";
      const scoreDisplay = formatScore(score);

      let line = `${statusIcon} **${result.name}**: ${scoreDisplay}`;

      // Add rationale for failed or low-scoring tests
      if (!result.passed || score < 0.75) {
        const firstScore = result.scores[0];
        if (firstScore?.metadata?.rationale) {
          line += `\n - ${firstScore.metadata.rationale}`;
        }
      }

      return line;
    }),
    ``,
    `---`,
    ``,
    `### Conclusion`,
    ``,
    conclusion === "success"
      ? `✅ **Passed**: Average score (${avgScore.toFixed(2)}) is above the catastrophic failure threshold (0.50)`
      : `❌ **Failed**: Average score (${avgScore.toFixed(2)}) is below the catastrophic failure threshold (0.50)`,
  ].join("\n");

  // Create check run
  console.log(`Creating check run with conclusion: ${conclusion}`);
  const checkRun = await createCheckRun({
    owner,
    repo,
    token,
    sha,
    name: "Evaluation Results",
    conclusion,
    title,
    summary,
    text: details,
  });

  console.log(`✅ Check run created: ${checkRun.html_url}`);
  console.log(` Conclusion: ${conclusion}`);
  console.log(` Average Score: ${avgScore.toFixed(2)}`);
}
202+
203+
main().catch((error) => {
204+
console.error("❌ Error creating check run:", error);
205+
process.exit(1);
206+
});

.github/workflows/eval.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ jobs:
2020
eval:
2121
environment: Actions
2222
runs-on: ubuntu-latest
23+
permissions:
24+
checks: write
25+
contents: read
2326
steps:
2427
- uses: actions/checkout@v4
2528

@@ -55,9 +58,16 @@ jobs:
5558

5659
- name: Run evals
5760
run: pnpm eval:ci evals
61+
continue-on-error: true
5862
env:
5963
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
6064

65+
- name: Create eval status check
66+
if: ${{ !cancelled() }}
67+
run: node .github/scripts/create-eval-check.js
68+
env:
69+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
70+
6171
- name: Upload coverage reports to Codecov
6272
uses: codecov/codecov-action@v4
6373
env:

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,4 @@ coverage
6565
# Generated files
6666
packages/mcp-server/src/toolDefinitions.json
6767
packages/mcp-server/src/skillDefinitions.json
68+
packages/mcp-server-evals/eval-results.json

packages/mcp-server-evals/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"dev": "tsc -w",
1313
"start": "tsx src/bin/start-mock-stdio.ts",
1414
"eval": "vitest --config=vitest.config.ts",
15-
"eval:ci": "vitest run --coverage --reporter=vitest-evals/reporter --reporter=junit --outputFile=eval.junit.xml"
15+
"eval:ci": "vitest run --coverage --reporter=vitest-evals/reporter --reporter=junit --reporter=json --outputFile.json=eval-results.json --outputFile.junit=eval.junit.xml"
1616
},
1717
"dependencies": {
1818
"@ai-sdk/openai": "catalog:",

0 commit comments

Comments
 (0)