Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

- 📋 **A metadata schema** ([`eval.schema.json`](eval.schema.json)) that defines the information needed for meaningful comparison of evaluation results, including [instance-level data](instance_level_eval.schema.json)
- 🔧 **Validation** that checks data against the schema before it enters the repository
- 🔌 **Converters** for [Inspect AI](every_eval_ever/converters/inspect/), [HELM](every_eval_ever/converters/helm/), and [lm-eval-harness](every_eval_ever/converters/lm_eval/), so you can transform your existing evaluation logs into the standard format
- 🔌 **Converters** for [Inspect AI](every_eval_ever/converters/inspect/), [HELM](every_eval_ever/converters/helm/), [lm-eval-harness](every_eval_ever/converters/lm_eval/), [AlpacaEval](every_eval_ever/converters/alpaca_eval/), and [LEXam](every_eval_ever/converters/lexam/), so you can transform your existing evaluation logs into the standard format

Install the package:

Expand Down Expand Up @@ -327,13 +327,15 @@ uv run python post_codegen.py

## 🔌 Eval Converters

We have prepared converters to make adapting to our schema as easy as possible. At the moment, we support converting local evaluation harness logs from `Inspect AI`, `HELM` and `lm-evaluation-harness` into our unified schema. Each converter produces aggregate JSON and optionally instance-level JSONL output.
We have prepared converters to make adapting to our schema as easy as possible. At the moment, we support converting local evaluation harness logs from `Inspect AI`, `HELM` and `lm-evaluation-harness`, plus public leaderboard scrapes from `AlpacaEval` and `LEXam`, into our unified schema. Every converter produces aggregate JSON; the log-based converters can additionally produce instance-level JSONL output, as summarized in the table below.

| Framework | Command | Instance-Level JSONL |
|---|---|---|
| [Inspect AI](every_eval_ever/converters/inspect/) | `every_eval_ever convert inspect --log_path <path>` | Yes, if samples in log |
| [HELM](every_eval_ever/converters/helm/) | `every_eval_ever convert helm --log_path <path>` | Always |
| [lm-evaluation-harness](every_eval_ever/converters/lm_eval/) | `every_eval_ever convert lm_eval --log_path <path> --include_samples` | With `--include_samples` |
| [AlpacaEval](every_eval_ever/converters/alpaca_eval/) | `every_eval_ever convert alpaca_eval --output_dir data` | No |
| [LEXam](every_eval_ever/converters/lexam/) | `every_eval_ever convert lexam --output_dir data` | No |

For full CLI usage and required input files, see the [Eval Converters README](every_eval_ever/converters/README.md).

Expand Down
43 changes: 40 additions & 3 deletions every_eval_ever/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,40 @@ def _cmd_convert_helm(args: argparse.Namespace) -> int:
return 0


def _cmd_convert_lexam(args: argparse.Namespace) -> int:
    """Convert the LEXam leaderboard and write one log file per entry.

    The adapter's ``fetch_leaderboard()`` result is iterated; each log has
    any CLI-supplied metadata applied and is then handed to ``_write_log``
    together with the output directory. The path returned by ``_write_log``
    is printed for every log, followed by a final count.

    Args:
        args: Parsed CLI namespace. Reads ``output_dir``,
            ``source_organization_name``, ``source_organization_url``,
            ``evaluator_relationship``, ``eval_library_name`` and
            ``eval_library_version``.

    Returns:
        Always ``0``.
    """
    from every_eval_ever.converters.lexam.adapter import LEXamAdapter

    def _apply_cli_overrides(record) -> None:
        """Copy CLI metadata onto ``record`` where it differs from a sentinel."""
        # NOTE(review): the sentinel values compared against below are
        # presumably the parser defaults for this subcommand -- confirm
        # against build_parser.
        org_name = args.source_organization_name
        if org_name != 'unknown':
            record.source_metadata.source_organization_name = org_name

        org_url = args.source_organization_url
        if org_url is not None:
            record.source_metadata.source_organization_url = org_url

        relationship = args.evaluator_relationship
        if relationship != 'collaborative':
            # Imported lazily: only needed when the relationship is changed.
            from every_eval_ever.eval_types import EvaluatorRelationship

            record.source_metadata.evaluator_relationship = (
                EvaluatorRelationship(relationship)
            )

        library_name = args.eval_library_name
        if library_name != 'lexam':
            record.eval_library.name = library_name

        library_version = args.eval_library_version
        if library_version != 'unknown':
            record.eval_library.version = library_version

    converter = LEXamAdapter()
    destination = Path(args.output_dir)
    records = converter.fetch_leaderboard()

    for record in records:
        _apply_cli_overrides(record)
        written_path = _write_log(record, destination)
        print(f' {written_path}')

    print(f'\nConverted {len(records)} model evaluation(s).')
    return 0


def _cmd_convert_alpaca_eval(args: argparse.Namespace) -> int:
from every_eval_ever.converters.alpaca_eval.adapter import (
LEADERBOARDS,
Expand Down Expand Up @@ -215,6 +249,7 @@ def _cmd_convert_alpaca_eval(args: argparse.Namespace) -> int:
)
if args.evaluator_relationship != 'third_party':
from every_eval_ever.eval_types import EvaluatorRelationship

log.source_metadata.evaluator_relationship = (
EvaluatorRelationship(args.evaluator_relationship)
)
Expand Down Expand Up @@ -298,7 +333,7 @@ def build_parser() -> argparse.ArgumentParser:
dest='source', required=True
)

for source in ['lm_eval', 'inspect', 'helm', 'alpaca_eval']:
for source in ['lm_eval', 'inspect', 'helm', 'alpaca_eval', 'lexam']:
source_parser = convert_subparsers.add_parser(
source,
help=f'Convert {source} logs',
Expand All @@ -307,7 +342,7 @@ def build_parser() -> argparse.ArgumentParser:
source_parser.add_argument(
'--log_path',
'--log-path',
required=(source != 'alpaca_eval'),
required=(source not in {'alpaca_eval', 'lexam'}),
help='Path to source log file or directory to convert.',
)
source_parser.add_argument(
Expand All @@ -325,7 +360,7 @@ def build_parser() -> argparse.ArgumentParser:
source_parser.add_argument(
'--evaluator_relationship',
'--evaluator-relationship',
default='third_party',
default=('collaborative' if source == 'lexam' else 'third_party'),
choices=EVALUATOR_RELATIONSHIP_CHOICES,
help='Relationship between evaluator and model developer.',
)
Expand Down Expand Up @@ -421,6 +456,8 @@ def main(argv: list[str] | None = None) -> int:
return _cmd_convert_helm(args)
if args.source == 'alpaca_eval':
return _cmd_convert_alpaca_eval(args)
if args.source == 'lexam':
return _cmd_convert_lexam(args)

parser.print_help()
return 1
Expand Down
35 changes: 34 additions & 1 deletion every_eval_ever/converters/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
## Automatic Evaluation Log Converters
A collection of scripts to convert evaluation logs from local evaluation frameworks (e.g., `Inspect AI` and `lm-eval-harness`) and public leaderboards (e.g., AlpacaEval) into the unified Every Eval Ever schema.
This directory contains a collection of scripts that convert evaluation logs from local evaluation frameworks (e.g., `Inspect AI` and `lm-eval-harness`) and public leaderboards (e.g., AlpacaEval and LEXam) into the unified Every Eval Ever schema.

### Installation

Expand Down Expand Up @@ -271,3 +271,36 @@ options:
--version {v1,v2} Which leaderboard to convert. Omit to convert both (default).
--output_dir OUTPUT_DIR Base output directory (default: data).
```

## LEXam

The LEXam converter fetches the public leaderboard HTML from the LEXam project
website repository and converts all model entries into the unified schema.
No local log files are required.

Metrics converted per model:

| Metric | Description |
|---|---|
| Open Question Judge Score | Mean expert-validated LLM-judge score on open-ended law exam questions (0-100) |
| Multiple-Choice Accuracy | Accuracy across all LEXam MCQ configs (0-100) |

### Usage

```bash
uv run every_eval_ever convert lexam --output_dir data
```

Full argument list:

```
usage: every_eval_ever convert lexam [-h] [--output_dir OUTPUT_DIR]
[--source_organization_name ...]
[--evaluator_relationship ...]
[--source_organization_url ...]
[--eval_library_name ...]
[--eval_library_version ...]

options:
--output_dir OUTPUT_DIR Base output directory (default: data).
```
Empty file.
56 changes: 56 additions & 0 deletions every_eval_ever/converters/lexam/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""CLI for converting LEXam leaderboard data to every_eval_ever format."""

import argparse
import json
import sys
import uuid
from pathlib import Path

from .adapter import LEXamAdapter


def main() -> None:
    """Fetch the LEXam leaderboard and dump each entry as a JSON file.

    Parses ``--output_dir`` (default ``data``), asks ``LEXamAdapter`` for
    the leaderboard logs, and writes every log to
    ``<output_dir>/lexam/<developer>/<model_name>/<uuid4>.json``. Each
    written path is printed, followed by a final count.

    Raises:
        SystemExit: With status 1 when fetching the leaderboard raises; the
            error is reported on stderr first.
    """
    cli = argparse.ArgumentParser(
        description=(
            'Fetch LEXam leaderboard data from GitHub and convert it '
            'to Every Eval Ever schema JSON files.'
        )
    )
    cli.add_argument(
        '--output_dir',
        default='data',
        help='Base output directory (default: data).',
    )
    options = cli.parse_args()

    converter = LEXamAdapter()
    base_dir = Path(options.output_dir)

    # Top-level boundary: any fetch failure becomes a clean non-zero exit.
    try:
        records = converter.fetch_leaderboard()
    except Exception as exc:
        print(f'ERROR: {exc}', file=sys.stderr)
        raise SystemExit(1) from exc

    for record in records:
        # Model ids of the form "<developer>/<name>" are split on the first
        # slash; ids without a slash are filed under "unknown".
        model_id = record.model_info.id
        developer, slash, model_name = model_id.partition('/')
        if not slash:
            developer, model_name = 'unknown', model_id

        target_dir = base_dir / 'lexam' / developer / model_name
        target_dir.mkdir(parents=True, exist_ok=True)
        target = target_dir / f'{uuid.uuid4()}.json'

        with target.open('w', encoding='utf-8') as handle:
            json.dump(
                record.model_dump(mode='json', exclude_none=True),
                handle,
                indent=2,
            )
        print(target)

    print(f'\nConverted {len(records)} model evaluation(s).')


# Run the converter only when this module is executed as a script.
if __name__ == '__main__':
    main()
Loading