2020 eval :
2121 environment : Actions
2222 runs-on : ubuntu-latest
23+ permissions :
24+ checks : write
25+ contents : read
2326 steps :
2427 - uses : actions/checkout@v4
2528
@@ -55,9 +58,141 @@ jobs:
5558
5659 - name : Run evals
5760 run : pnpm eval:ci evals
61+ continue-on-error : true
5862 env :
5963 OPENAI_API_KEY : ${{ secrets.OPENAI_API_KEY }}
6064
65+ - name : Create eval status check
66+ uses : actions/github-script@v7
67+ if : ${{ !cancelled() }}
68+ with :
69+ script : |
const fs = require('fs');
const path = require('path');

// Read the vitest JSON output produced by the "Run evals" step.
const resultsPath = path.resolve(process.cwd(), 'packages/mcp-server-evals/eval-results.json');
console.log(`Reading eval results from: ${resultsPath}`);

let vitestResults;
try {
  vitestResults = JSON.parse(fs.readFileSync(resultsPath, 'utf-8'));
} catch (error) {
  if (error.code === 'ENOENT') {
    // readFileSync sets `code` on filesystem errors; JSON.parse does not.
    throw new Error(
      `Eval results file not found at ${resultsPath}. The eval run likely failed before producing results. Check the "Run evals" step logs for errors.`
    );
  }
  throw new Error(`Failed to read/parse eval results: ${error.message}`);
}

// Extract eval entries from the vitest JSON reporter format. Only tests
// that attached `meta.eval` are treated as evals.
const evalResults = [];
for (const testFile of vitestResults.testResults || []) {
  for (const test of testFile.assertionResults || []) {
    if (test.meta?.eval) {
      evalResults.push({
        name: test.fullName || test.title,
        file: testFile.name,
        avgScore: test.meta.eval.avgScore ?? null,
        scores: test.meta.eval.scores || [],
        passed: test.status === 'passed',
        duration: test.duration,
      });
    }
  }
}

// Aggregate statistics. Null scores count as 0.0 so they land in the
// red bucket instead of silently inflating the average.
const totalTests = evalResults.length;
const scores = evalResults.map((r) => r.avgScore ?? 0);

const avgScore = scores.length > 0
  ? scores.reduce((sum, score) => sum + score, 0) / scores.length
  : 0;

const green = scores.filter((s) => s >= 0.75).length;
const yellow = scores.filter((s) => s >= 0.5 && s < 0.75).length;
const red = scores.filter((s) => s < 0.5).length;

// Only a catastrophic overall average fails the check.
const conclusion = avgScore >= 0.5 ? 'success' : 'failure';

// Render a score with a traffic-light emoji.
function formatScore(score) {
  if (score >= 0.75) return `🟢 ${score.toFixed(2)}`;
  if (score >= 0.5) return `🟡 ${score.toFixed(2)}`;
  return `🔴 ${score.toFixed(2)}`;
}

// Build title
const title = `Eval Score: ${avgScore.toFixed(2)} (${green} green, ${yellow} yellow, ${red} red)`;

// Build summary
const summary = [
  `## Overall Statistics`,
  ``,
  `- **Total Evaluations**: ${totalTests}`,
  `- **Average Score**: ${formatScore(avgScore)}`,
  `- **Pass Threshold**: 0.50 (catastrophic failure)`,
  ``,
  `### Score Distribution`,
  `- 🟢 Green (≥0.75): ${green} evals`,
  `- 🟡 Yellow (0.50-0.74): ${yellow} evals`,
  `- 🔴 Red (<0.50): ${red} evals`,
].join('\n');

// Build detailed results, highest score first (nulls sort as 0).
const detailsByScore = [...evalResults].sort((a, b) => (b.avgScore ?? 0) - (a.avgScore ?? 0));
const details = [
  `## Individual Eval Scores`,
  ``,
  ...detailsByScore.map((result) => {
    const score = result.avgScore ?? 0;
    const statusIcon = result.passed ? '✅' : '❌';
    const scoreDisplay = formatScore(score);

    let line = `${statusIcon} **${result.name}**: ${scoreDisplay}`;

    // Add rationale for failed or low-scoring tests
    if (!result.passed || score < 0.75) {
      const firstScore = result.scores[0];
      if (firstScore?.metadata?.rationale) {
        line += `\n - ${firstScore.metadata.rationale}`;
      }
    }

    return line;
  }),
  ``,
  `---`,
  ``,
  `### Conclusion`,
  ``,
  conclusion === 'success'
    ? `✅ **Passed**: Average score (${avgScore.toFixed(2)}) is above the catastrophic failure threshold (0.50)`
    : `❌ **Failed**: Average score (${avgScore.toFixed(2)}) is below the catastrophic failure threshold (0.50)`,
].join('\n');

// On pull_request events context.sha is the ephemeral merge commit; the
// check run must be attached to the PR head SHA for it to appear on the
// pull request's Checks tab. Fall back to context.sha for push events.
const headSha = context.payload.pull_request?.head?.sha ?? context.sha;

// The Checks API caps output.text at 65535 characters — truncate rather
// than letting the whole API call fail on a large eval suite.
const MAX_TEXT = 65000;
const text = details.length > MAX_TEXT
  ? `${details.slice(0, MAX_TEXT)}\n\n…(truncated)`
  : details;

// Create check run
await github.rest.checks.create({
  owner: context.repo.owner,
  repo: context.repo.repo,
  name: 'Evaluation Results',
  head_sha: headSha,
  status: 'completed',
  conclusion: conclusion,
  output: {
    title: title,
    summary: summary,
    text: text,
  },
});

console.log(`✅ Check run created with conclusion: ${conclusion}`);
console.log(`   Average Score: ${avgScore.toFixed(2)}`);
195+
61196 - name : Upload coverage reports to Codecov
62197 uses : codecov/codecov-action@v4
63198 env :
@@ -72,10 +207,3 @@ jobs:
72207 uses : codecov/test-results-action@v1
73208 with :
74209 token : ${{ secrets.CODECOV_TOKEN }}
75-
76- - name : Publish Test Report
77- uses : mikepenz/action-junit-report@cf701569b05ccdd861a76b8607a66d76f6fd4857
78- if : ${{ !cancelled() }}
79- with :
80- report_paths : " **/*.junit.xml"
81- comment : false
0 commit comments