diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
index 292a9438..096d955f 100644
--- a/.github/workflows/eval.yml
+++ b/.github/workflows/eval.yml
@@ -20,6 +20,9 @@ jobs:
   eval:
     environment: Actions
     runs-on: ubuntu-latest
+    permissions:
+      checks: write
+      contents: read
 
     steps:
       - uses: actions/checkout@v4
@@ -55,9 +58,141 @@ jobs:
 
       - name: Run evals
         run: pnpm eval:ci evals
+        continue-on-error: true
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
 
+      - name: Create eval status check
+        uses: actions/github-script@v7
+        if: ${{ !cancelled() }}
+        with:
+          script: |
+            const fs = require('fs');
+            const path = require('path');
+
+            // Read eval results
+            const resultsPath = path.resolve(process.cwd(), 'packages/mcp-server-evals/eval-results.json');
+            console.log(`Reading eval results from: ${resultsPath}`);
+
+            let vitestResults;
+            try {
+              vitestResults = JSON.parse(fs.readFileSync(resultsPath, 'utf-8'));
+            } catch (error) {
+              if (error.code === 'ENOENT') {
+                throw new Error(
+                  `Eval results file not found at ${resultsPath}. The eval run likely failed before producing results. Check the "Run evals" step logs for errors.`
+                );
+              }
+              throw new Error(`Failed to read/parse eval results: ${error.message}`);
+            }
+
+            // Extract eval results from vitest format
+            const evalResults = [];
+            for (const testFile of vitestResults.testResults || []) {
+              for (const test of testFile.assertionResults || []) {
+                if (test.meta?.eval) {
+                  evalResults.push({
+                    name: test.fullName || test.title,
+                    file: testFile.name,
+                    avgScore: test.meta.eval.avgScore ?? null,
+                    scores: test.meta.eval.scores || [],
+                    passed: test.status === 'passed',
+                    duration: test.duration,
+                  });
+                }
+              }
+            }
+
+            // Calculate statistics
+            const totalTests = evalResults.length;
+            // Treat null scores as 0.0 for consistent categorization
+            const scores = evalResults.map(r => r.avgScore !== null ? r.avgScore : 0);
+
+            const avgScore = scores.length > 0
+              ? scores.reduce((sum, score) => sum + score, 0) / scores.length
+              : 0;
+
+            const green = scores.filter(s => s >= 0.75).length;
+            const yellow = scores.filter(s => s >= 0.5 && s < 0.75).length;
+            const red = scores.filter(s => s < 0.5).length;
+
+            // Determine conclusion
+            const conclusion = avgScore >= 0.5 ? 'success' : 'failure';
+
+            // Format score helper
+            function formatScore(score) {
+              if (score >= 0.75) return `🟢 ${score.toFixed(2)}`;
+              if (score >= 0.5) return `🟡 ${score.toFixed(2)}`;
+              return `🔴 ${score.toFixed(2)}`;
+            }
+
+            // Build title
+            const title = `Eval Score: ${avgScore.toFixed(2)} (${green} green, ${yellow} yellow, ${red} red)`;
+
+            // Build summary
+            const summary = [
+              `## Overall Statistics`,
+              ``,
+              `- **Total Evaluations**: ${totalTests}`,
+              `- **Average Score**: ${formatScore(avgScore)}`,
+              `- **Pass Threshold**: 0.50 (catastrophic failure)`,
+              ``,
+              `### Score Distribution`,
+              `- 🟢 Green (≥0.75): ${green} evals`,
+              `- 🟡 Yellow (0.50-0.74): ${yellow} evals`,
+              `- 🔴 Red (<0.50): ${red} evals`,
+            ].join('\n');
+
+            // Build detailed results
+            const detailsByScore = [...evalResults].sort((a, b) => (b.avgScore || 0) - (a.avgScore || 0));
+            const details = [
+              `## Individual Eval Scores`,
+              ``,
+              ...detailsByScore.map(result => {
+                const score = result.avgScore !== null ? result.avgScore : 0;
+                const statusIcon = result.passed ? '✅' : '❌';
+                const scoreDisplay = formatScore(score);
+
+                let line = `${statusIcon} **${result.name}**: ${scoreDisplay}`;
+
+                // Add rationale for failed or low-scoring tests
+                if (!result.passed || score < 0.75) {
+                  const firstScore = result.scores[0];
+                  if (firstScore?.metadata?.rationale) {
+                    line += `\n  - ${firstScore.metadata.rationale}`;
+                  }
+                }
+
+                return line;
+              }),
+              ``,
+              `---`,
+              ``,
+              `### Conclusion`,
+              ``,
+              conclusion === 'success'
+                ? `✅ **Passed**: Average score (${avgScore.toFixed(2)}) is above the catastrophic failure threshold (0.50)`
+                : `❌ **Failed**: Average score (${avgScore.toFixed(2)}) is below the catastrophic failure threshold (0.50)`,
+            ].join('\n');
+
+            // Create check run
+            await github.rest.checks.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              name: 'Evaluation Results',
+              head_sha: context.sha,
+              status: 'completed',
+              conclusion: conclusion,
+              output: {
+                title: title,
+                summary: summary,
+                text: details,
+              },
+            });
+
+            console.log(`✅ Check run created with conclusion: ${conclusion}`);
+            console.log(`   Average Score: ${avgScore.toFixed(2)}`);
+
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v4
         env:
@@ -72,10 +207,3 @@ jobs:
         uses: codecov/test-results-action@v1
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-
-      - name: Publish Test Report
-        uses: mikepenz/action-junit-report@cf701569b05ccdd861a76b8607a66d76f6fd4857
-        if: ${{ !cancelled() }}
-        with:
-          report_paths: "**/*.junit.xml"
-          comment: false
diff --git a/.gitignore b/.gitignore
index 520ba095..6d8e77a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,3 +65,4 @@ coverage
 # Generated files
 packages/mcp-server/src/toolDefinitions.json
 packages/mcp-server/src/skillDefinitions.json
+packages/mcp-server-evals/eval-results.json
diff --git a/packages/mcp-server-evals/package.json b/packages/mcp-server-evals/package.json
index 2c43baca..a95d8c77 100644
--- a/packages/mcp-server-evals/package.json
+++ b/packages/mcp-server-evals/package.json
@@ -12,7 +12,7 @@
     "dev": "tsc -w",
     "start": "tsx src/bin/start-mock-stdio.ts",
     "eval": "vitest --config=vitest.config.ts",
-    "eval:ci": "vitest run --coverage --reporter=vitest-evals/reporter --reporter=junit --outputFile=eval.junit.xml"
+    "eval:ci": "vitest run --coverage --reporter=vitest-evals/reporter --reporter=junit --reporter=json --outputFile.json=eval-results.json --outputFile.junit=eval.junit.xml"
   },
   "dependencies": {
     "@ai-sdk/openai": "catalog:",
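
Note: the check-run script in the workflow above reads vitest's Jest-compatible JSON reporter output plus the `meta.eval` data that vitest-evals attaches to each assertion. The sketch below is a hypothetical fragment of `eval-results.json` showing only the fields the script accesses; the file name, test names, and values are illustrative, not taken from a real run.

// Hypothetical shape of eval-results.json (illustrative values only).
// Only the fields the workflow script accesses are shown.
const exampleResults = {
  testResults: [
    {
      name: "src/evals/example.eval.ts",
      assertionResults: [
        {
          title: "answers the question",
          fullName: "example eval > answers the question",
          status: "passed",
          duration: 1234,
          meta: {
            eval: {
              avgScore: 0.82,
              scores: [
                { score: 0.82, metadata: { rationale: "Response used the expected tool." } },
              ],
            },
          },
        },
      ],
    },
  ],
};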