142 changes: 135 additions & 7 deletions .github/workflows/eval.yml
@@ -20,6 +20,9 @@ jobs:
   eval:
     environment: Actions
     runs-on: ubuntu-latest
+    permissions:
+      checks: write
+      contents: read
     steps:
       - uses: actions/checkout@v4

@@ -55,9 +58,141 @@ jobs:

       - name: Run evals
         run: pnpm eval:ci evals
+        continue-on-error: true
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

+      - name: Create eval status check
+        uses: actions/github-script@v7
+        if: ${{ !cancelled() }}
+        with:
+          script: |
+            const fs = require('fs');
+            const path = require('path');
+
+            // Read eval results
+            const resultsPath = path.resolve(process.cwd(), 'packages/mcp-server-evals/eval-results.json');
+            console.log(`Reading eval results from: ${resultsPath}`);
+
+            let vitestResults;
+            try {
+              vitestResults = JSON.parse(fs.readFileSync(resultsPath, 'utf-8'));
+            } catch (error) {
+              if (error.code === 'ENOENT') {
+                throw new Error(
+                  `Eval results file not found at ${resultsPath}. The eval run likely failed before producing results. Check the "Run evals" step logs for errors.`
+                );
+              }
+              throw new Error(`Failed to read/parse eval results: ${error.message}`);
+            }
+
+            // Extract eval results from vitest format
+            const evalResults = [];
+            for (const testFile of vitestResults.testResults || []) {
+              for (const test of testFile.assertionResults || []) {
+                if (test.meta?.eval) {
+                  evalResults.push({
+                    name: test.fullName || test.title,
+                    file: testFile.name,
+                    avgScore: test.meta.eval.avgScore ?? null,
+                    scores: test.meta.eval.scores || [],
+                    passed: test.status === 'passed',
+                    duration: test.duration,
+                  });
+                }
+              }
+            }
+
+            // Calculate statistics
+            const totalTests = evalResults.length;
+            // Treat null scores as 0.0 for consistent categorization
+            const scores = evalResults.map(r => r.avgScore !== null ? r.avgScore : 0);
+
+            const avgScore = scores.length > 0
+              ? scores.reduce((sum, score) => sum + score, 0) / scores.length
+              : 0;
+
+            const green = scores.filter(s => s >= 0.75).length;
+            const yellow = scores.filter(s => s >= 0.5 && s < 0.75).length;
+            const red = scores.filter(s => s < 0.5).length;
+
+            // Determine conclusion
+            const conclusion = avgScore >= 0.5 ? 'success' : 'failure';
+
+            // Format score helper
+            function formatScore(score) {
+              if (score >= 0.75) return `🟢 ${score.toFixed(2)}`;
+              if (score >= 0.5) return `🟡 ${score.toFixed(2)}`;
+              return `🔴 ${score.toFixed(2)}`;
+            }
+
+            // Build title
+            const title = `Eval Score: ${avgScore.toFixed(2)} (${green} green, ${yellow} yellow, ${red} red)`;
+
+            // Build summary
+            const summary = [
+              `## Overall Statistics`,
+              ``,
+              `- **Total Evaluations**: ${totalTests}`,
+              `- **Average Score**: ${formatScore(avgScore)}`,
+              `- **Pass Threshold**: 0.50 (catastrophic failure)`,
+              ``,
+              `### Score Distribution`,
+              `- 🟢 Green (≥0.75): ${green} evals`,
+              `- 🟡 Yellow (0.50-0.74): ${yellow} evals`,
+              `- 🔴 Red (<0.50): ${red} evals`,
+            ].join('\n');
+
+            // Build detailed results
+            const detailsByScore = [...evalResults].sort((a, b) => (b.avgScore || 0) - (a.avgScore || 0));
+            const details = [
+              `## Individual Eval Scores`,
+              ``,
+              ...detailsByScore.map(result => {
+                const score = result.avgScore !== null ? result.avgScore : 0;
+                const statusIcon = result.passed ? '✅' : '❌';
+                const scoreDisplay = formatScore(score);
+
+                let line = `${statusIcon} **${result.name}**: ${scoreDisplay}`;
+
+                // Add rationale for failed or low-scoring tests
+                if (!result.passed || score < 0.75) {
+                  const firstScore = result.scores[0];
+                  if (firstScore?.metadata?.rationale) {
+                    line += `\n - ${firstScore.metadata.rationale}`;
+                  }
+                }
+
+                return line;
+              }),
+              ``,
+              `---`,
+              ``,
+              `### Conclusion`,
+              ``,
+              conclusion === 'success'
+                ? `✅ **Passed**: Average score (${avgScore.toFixed(2)}) is above the catastrophic failure threshold (0.50)`
+                : `❌ **Failed**: Average score (${avgScore.toFixed(2)}) is below the catastrophic failure threshold (0.50)`,
+            ].join('\n');
+
+            // Create check run
+            await github.rest.checks.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              name: 'Evaluation Results',
+              head_sha: context.sha,
+              status: 'completed',
+              conclusion: conclusion,
+              output: {
+                title: title,
+                summary: summary,
+                text: details,
+              },
+            });
+
+            console.log(`✅ Check run created with conclusion: ${conclusion}`);
+            console.log(`   Average Score: ${avgScore.toFixed(2)}`);
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v4
         env:
@@ -72,10 +207,3 @@ jobs:
         uses: codecov/test-results-action@v1
         with:
           token: ${{ secrets.CODECOV_TOKEN }}

-      - name: Publish Test Report
-        uses: mikepenz/action-junit-report@cf701569b05ccdd861a76b8607a66d76f6fd4857
-        if: ${{ !cancelled() }}
-        with:
-          report_paths: "**/*.junit.xml"
-          comment: false
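
Note on the parsing in the new step above: vitest's `--reporter=json` emits a Jest-style results document (`testResults` → `assertionResults`), and the `meta.eval` field is attached per test by vitest-evals. A minimal sketch of an eval-results.json payload the extraction loop would accept — all names and values are illustrative, and only the fields the script actually reads are shown (a real reporter output carries more):

const exampleResults = {
  testResults: [
    {
      name: 'src/evals/search.eval.ts',   // test file path → `file`
      assertionResults: [
        {
          title: 'returns relevant docs',
          fullName: 'search > returns relevant docs',
          status: 'passed',               // anything but 'passed' renders ❌
          duration: 4213,                 // milliseconds
          meta: {
            eval: {
              avgScore: 0.62,             // 🟡 bucket (0.50–0.74)
              scores: [
                { score: 0.62, metadata: { rationale: 'Partial match on expected tool calls.' } },
              ],
            },
          },
        },
      ],
    },
  ],
};

Run through the loop, this yields one `evalResults` entry (`passed: true`, `avgScore: 0.62`); because 0.62 < 0.75, the first score's rationale is appended beneath its line in the details text, and the check still concludes `success` since the 0.50 average threshold is met.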
1 change: 1 addition & 0 deletions .gitignore
@@ -65,3 +65,4 @@ coverage
 # Generated files
 packages/mcp-server/src/toolDefinitions.json
 packages/mcp-server/src/skillDefinitions.json
+packages/mcp-server-evals/eval-results.json
2 changes: 1 addition & 1 deletion packages/mcp-server-evals/package.json
@@ -12,7 +12,7 @@
"dev": "tsc -w",
"start": "tsx src/bin/start-mock-stdio.ts",
"eval": "vitest --config=vitest.config.ts",
"eval:ci": "vitest run --coverage --reporter=vitest-evals/reporter --reporter=junit --outputFile=eval.junit.xml"
"eval:ci": "vitest run --coverage --reporter=vitest-evals/reporter --reporter=junit --reporter=json --outputFile.json=eval-results.json --outputFile.junit=eval.junit.xml"
},
"dependencies": {
"@ai-sdk/openai": "catalog:",
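
A note on the `eval:ci` change: vitest accepts repeated `--reporter` flags, and the dotted `--outputFile.<reporter>=<path>` form routes each reporter's output to its own file, so one run now produces both eval.junit.xml (uploaded by the Codecov test-results step) and eval-results.json (read by the new check-run step). A sketch of the local equivalent, assuming it is run from packages/mcp-server-evals with OPENAI_API_KEY exported:

pnpm eval:ci   # writes coverage plus eval-results.json and eval.junit.xml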