2020 eval :
2121 environment : Actions
2222 runs-on : ubuntu-latest
23+ permissions :
24+ checks : write
25+ contents : read
2326 steps :
2427 - uses : actions/checkout@v4
2528
@@ -55,9 +58,142 @@ jobs:
5558
5659 - name : Run evals
5760 run : pnpm eval:ci evals
61+ continue-on-error : true
5862 env :
5963 OPENAI_API_KEY : ${{ secrets.OPENAI_API_KEY }}
6064
65+ - name : Create eval status check
66+ uses : actions/github-script@v7
67+ if : ${{ !cancelled() }}
68+ with :
69+ script : |
const fs = require('fs');
const path = require('path');

// Read the eval results produced by the "Run evals" step (vitest JSON reporter output).
const resultsPath = path.resolve(process.cwd(), 'packages/mcp-server-evals/eval-results.json');
console.log(`Reading eval results from: ${resultsPath}`);

let vitestResults;
try {
  vitestResults = JSON.parse(fs.readFileSync(resultsPath, 'utf-8'));
} catch (error) {
  if (error.code === 'ENOENT') {
    throw new Error(
      `Eval results file not found at ${resultsPath}. The eval run likely failed before producing results. Check the "Run evals" step logs for errors.`
    );
  }
  throw new Error(`Failed to read/parse eval results: ${error.message}`);
}

// Extract eval results from the vitest JSON format. Only tests that attached
// an `eval` object to their reporter meta are treated as evals.
const evalResults = [];
for (const testFile of vitestResults.testResults || []) {
  for (const test of testFile.assertionResults || []) {
    if (test.meta?.eval) {
      evalResults.push({
        name: test.fullName || test.title,
        file: testFile.name,
        avgScore: test.meta.eval.avgScore ?? null,
        scores: test.meta.eval.scores || [],
        passed: test.status === 'passed',
        duration: test.duration,
      });
    }
  }
}

// Aggregate statistics. Evals without a numeric avgScore are excluded from the
// average and the traffic-light buckets.
const totalTests = evalResults.length;
const validScores = evalResults
  .map((r) => r.avgScore)
  .filter((score) => score !== null);

const avgScore = validScores.length > 0
  ? validScores.reduce((sum, score) => sum + score, 0) / validScores.length
  : 0;

const green = validScores.filter((s) => s >= 0.75).length;
const yellow = validScores.filter((s) => s >= 0.5 && s < 0.75).length;
const red = validScores.filter((s) => s < 0.5).length;

if (totalTests === 0) {
  // Zero eval entries usually means no eval tests ran or the reporter format
  // changed; surface that loudly instead of silently reporting a 0.00 failure.
  console.warn('No eval results found in the results file — verify the vitest reporter output format and that eval tests actually ran.');
}

// 0.50 is the "catastrophic failure" threshold for the overall check.
const conclusion = avgScore >= 0.5 ? 'success' : 'failure';

// Render a score with a traffic-light emoji (green >= 0.75, yellow >= 0.50, red below).
function formatScore(score) {
  if (score >= 0.75) return `🟢 ${score.toFixed(2)}`;
  if (score >= 0.5) return `🟡 ${score.toFixed(2)}`;
  return `🔴 ${score.toFixed(2)}`;
}

// Check-run title shown in the PR checks list.
const title = `Eval Score: ${avgScore.toFixed(2)} (${green} green, ${yellow} yellow, ${red} red)`;

// Markdown summary: overall stats and score distribution.
const summary = [
  `## Overall Statistics`,
  ``,
  `- **Total Evaluations**: ${totalTests}`,
  `- **Average Score**: ${formatScore(avgScore)}`,
  `- **Pass Threshold**: 0.50 (catastrophic failure)`,
  ``,
  `### Score Distribution`,
  `- 🟢 Green (≥0.75): ${green} evals`,
  `- 🟡 Yellow (0.50-0.74): ${yellow} evals`,
  `- 🔴 Red (<0.50): ${red} evals`,
].join('\n');

// Detailed per-eval breakdown, best scores first.
const detailsByScore = [...evalResults].sort((a, b) => (b.avgScore || 0) - (a.avgScore || 0));
const details = [
  `## Individual Eval Scores`,
  ``,
  ...detailsByScore.map((result) => {
    const score = result.avgScore !== null ? result.avgScore : 0;
    const statusIcon = result.passed ? '✅' : '❌';
    const scoreDisplay = formatScore(score);

    let line = `${statusIcon} **${result.name}**: ${scoreDisplay}`;

    // Add the scorer's rationale for failed or low-scoring tests, when available.
    if (!result.passed || score < 0.75) {
      const firstScore = result.scores[0];
      if (firstScore?.metadata?.rationale) {
        line += `\n  - ${firstScore.metadata.rationale}`;
      }
    }

    return line;
  }),
  ``,
  `---`,
  ``,
  `### Conclusion`,
  ``,
  conclusion === 'success'
    ? `✅ **Passed**: Average score (${avgScore.toFixed(2)}) is above the catastrophic failure threshold (0.50)`
    : `❌ **Failed**: Average score (${avgScore.toFixed(2)}) is below the catastrophic failure threshold (0.50)`,
].join('\n');

// On pull_request events, context.sha is the ephemeral merge commit — a check
// created against it never shows up on the PR. Use the PR head SHA when present
// and fall back to context.sha for push/other events.
const headSha = context.payload.pull_request?.head?.sha ?? context.sha;

// Create the check run via the Checks API (requires `checks: write` permission).
await github.rest.checks.create({
  owner: context.repo.owner,
  repo: context.repo.repo,
  name: 'Evaluation Results',
  head_sha: headSha,
  status: 'completed',
  conclusion: conclusion,
  output: {
    title: title,
    summary: summary,
    text: details,
  },
});

console.log(`✅ Check run created with conclusion: ${conclusion}`);
console.log(`   Average Score: ${avgScore.toFixed(2)}`);
196+
61197 - name : Upload coverage reports to Codecov
62198 uses : codecov/codecov-action@v4
63199 env :
@@ -72,10 +208,3 @@ jobs:
72208 uses : codecov/test-results-action@v1
73209 with :
74210 token : ${{ secrets.CODECOV_TOKEN }}
75-
76- - name : Publish Test Report
77- uses : mikepenz/action-junit-report@cf701569b05ccdd861a76b8607a66d76f6fd4857
78- if : ${{ !cancelled() }}
79- with :
80- report_paths : " **/*.junit.xml"
81- comment : false