diff --git a/README.md b/README.md index 2af8b4280..35cbd5fe4 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ - 📋 **A metadata schema** ([`eval.schema.json`](eval.schema.json)) that defines the information needed for meaningful comparison of evaluation results, including [instance-level data](instance_level_eval.schema.json) - 🔧 **Validation** that checks data against the schema before it enters the repository -- 🔌 **Converters** for [Inspect AI](every_eval_ever/converters/inspect/), [HELM](every_eval_ever/converters/helm/), and [lm-eval-harness](every_eval_ever/converters/lm_eval/), so you can transform your existing evaluation logs into the standard format +- 🔌 **Converters** for [Inspect AI](every_eval_ever/converters/inspect/), [HELM](every_eval_ever/converters/helm/), [lm-eval-harness](every_eval_ever/converters/lm_eval/), [AlpacaEval](every_eval_ever/converters/alpaca_eval/), and [LEXam](every_eval_ever/converters/lexam/), so you can transform your existing evaluation logs into the standard format Install the package: @@ -327,13 +327,15 @@ uv run python post_codegen.py ## 🔌 Eval Converters -We have prepared converters to make adapting to our schema as easy as possible. At the moment, we support converting local evaluation harness logs from `Inspect AI`, `HELM` and `lm-evaluation-harness` into our unified schema. Each converter produces aggregate JSON and optionally instance-level JSONL output. +We have prepared converters to make adapting to our schema as easy as possible. At the moment, we support converting local evaluation harness logs from `Inspect AI`, `HELM` and `lm-evaluation-harness`, plus public leaderboard scrapes from `AlpacaEval` and `LEXam`, into our unified schema. Each converter produces aggregate JSON and optionally instance-level JSONL output. 
| Framework | Command | Instance-Level JSONL | |---|---|---| | [Inspect AI](every_eval_ever/converters/inspect/) | `every_eval_ever convert inspect --log_path ` | Yes, if samples in log | | [HELM](every_eval_ever/converters/helm/) | `every_eval_ever convert helm --log_path ` | Always | | [lm-evaluation-harness](every_eval_ever/converters/lm_eval/) | `every_eval_ever convert lm_eval --log_path --include_samples` | With `--include_samples` | +| [AlpacaEval](every_eval_ever/converters/alpaca_eval/) | `every_eval_ever convert alpaca_eval --output_dir data` | No | +| [LEXam](every_eval_ever/converters/lexam/) | `every_eval_ever convert lexam --output_dir data` | No | For full CLI usage and required input files, see the [Eval Converters README](every_eval_ever/converters/README.md). diff --git a/every_eval_ever/cli.py b/every_eval_ever/cli.py index c215c89cd..77a04d116 100644 --- a/every_eval_ever/cli.py +++ b/every_eval_ever/cli.py @@ -188,6 +188,40 @@ def _cmd_convert_helm(args: argparse.Namespace) -> int: return 0 +def _cmd_convert_lexam(args: argparse.Namespace) -> int: + from every_eval_ever.converters.lexam.adapter import LEXamAdapter + + adapter = LEXamAdapter() + output_dir = Path(args.output_dir) + logs = adapter.fetch_leaderboard() + + for log in logs: + if args.source_organization_name != 'unknown': + log.source_metadata.source_organization_name = ( + args.source_organization_name + ) + if args.source_organization_url is not None: + log.source_metadata.source_organization_url = ( + args.source_organization_url + ) + if args.evaluator_relationship != 'collaborative': + from every_eval_ever.eval_types import EvaluatorRelationship + + log.source_metadata.evaluator_relationship = EvaluatorRelationship( + args.evaluator_relationship + ) + if args.eval_library_name != 'lexam': + log.eval_library.name = args.eval_library_name + if args.eval_library_version != 'unknown': + log.eval_library.version = args.eval_library_version + + out_file = _write_log(log, output_dir) 
+ print(f' {out_file}') + + print(f'\nConverted {len(logs)} model evaluation(s).') + return 0 + + def _cmd_convert_alpaca_eval(args: argparse.Namespace) -> int: from every_eval_ever.converters.alpaca_eval.adapter import ( LEADERBOARDS, @@ -215,6 +249,7 @@ def _cmd_convert_alpaca_eval(args: argparse.Namespace) -> int: ) if args.evaluator_relationship != 'third_party': from every_eval_ever.eval_types import EvaluatorRelationship + log.source_metadata.evaluator_relationship = ( EvaluatorRelationship(args.evaluator_relationship) ) @@ -298,7 +333,7 @@ def build_parser() -> argparse.ArgumentParser: dest='source', required=True ) - for source in ['lm_eval', 'inspect', 'helm', 'alpaca_eval']: + for source in ['lm_eval', 'inspect', 'helm', 'alpaca_eval', 'lexam']: source_parser = convert_subparsers.add_parser( source, help=f'Convert {source} logs', @@ -307,7 +342,7 @@ def build_parser() -> argparse.ArgumentParser: source_parser.add_argument( '--log_path', '--log-path', - required=(source != 'alpaca_eval'), + required=(source not in {'alpaca_eval', 'lexam'}), help='Path to source log file or directory to convert.', ) source_parser.add_argument( @@ -325,7 +360,7 @@ def build_parser() -> argparse.ArgumentParser: source_parser.add_argument( '--evaluator_relationship', '--evaluator-relationship', - default='third_party', + default=('collaborative' if source == 'lexam' else 'third_party'), choices=EVALUATOR_RELATIONSHIP_CHOICES, help='Relationship between evaluator and model developer.', ) @@ -421,6 +456,8 @@ def main(argv: list[str] | None = None) -> int: return _cmd_convert_helm(args) if args.source == 'alpaca_eval': return _cmd_convert_alpaca_eval(args) + if args.source == 'lexam': + return _cmd_convert_lexam(args) parser.print_help() return 1 diff --git a/every_eval_ever/converters/README.md b/every_eval_ever/converters/README.md index 50c94f453..e0b8a405f 100644 --- a/every_eval_ever/converters/README.md +++ b/every_eval_ever/converters/README.md @@ -1,5 +1,5 @@ ## Automatic 
Evaluation Log Converters -A collection of scripts to convert evaluation logs from local evaluation frameworks (e.g., `Inspect AI` and `lm-eval-harness`) and public leaderboards (e.g., AlpacaEval) into the unified Every Eval Ever schema. +A collection of scripts to convert evaluation logs from local evaluation frameworks (e.g., `Inspect AI` and `lm-eval-harness`) and public leaderboards (e.g., AlpacaEval and LEXam) into the unified Every Eval Ever schema. ### Installation @@ -271,3 +271,36 @@ options: --version {v1,v2} Which leaderboard to convert. Omit to convert both (default). --output_dir OUTPUT_DIR Base output directory (default: data). ``` + +## LEXam + +The LEXam converter fetches the public leaderboard HTML from the LEXam project +website repository and converts all model entries into the unified schema. +No local log files are required. + +Metrics converted per model: + +| Metric | Description | +|---|---| +| Open Question Judge Score | Mean expert-validated LLM-judge score on open-ended law exam questions (0-100) | +| Multiple-Choice Accuracy | Accuracy across all LEXam MCQ configs (0-100) | + +### Usage + +```bash +uv run every_eval_ever convert lexam --output_dir data +``` + +Full argument list: + +``` +usage: every_eval_ever convert lexam [-h] [--output_dir OUTPUT_DIR] + [--source_organization_name ...] + [--evaluator_relationship ...] + [--source_organization_url ...] + [--eval_library_name ...] + [--eval_library_version ...] + +options: + --output_dir OUTPUT_DIR Base output directory (default: data). 
+``` diff --git a/every_eval_ever/converters/lexam/__init__.py b/every_eval_ever/converters/lexam/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/every_eval_ever/converters/lexam/__main__.py b/every_eval_ever/converters/lexam/__main__.py new file mode 100644 index 000000000..94d638042 --- /dev/null +++ b/every_eval_ever/converters/lexam/__main__.py @@ -0,0 +1,56 @@ +"""CLI for converting LEXam leaderboard data to every_eval_ever format.""" + +import argparse +import json +import sys +import uuid +from pathlib import Path + +from .adapter import LEXamAdapter + + +def main() -> None: + parser = argparse.ArgumentParser( + description=( + 'Fetch LEXam leaderboard data from GitHub and convert it ' + 'to Every Eval Ever schema JSON files.' + ) + ) + parser.add_argument( + '--output_dir', + default='data', + help='Base output directory (default: data).', + ) + args = parser.parse_args() + + adapter = LEXamAdapter() + output_dir = Path(args.output_dir) + + try: + logs = adapter.fetch_leaderboard() + except Exception as exc: + print(f'ERROR: {exc}', file=sys.stderr) + raise SystemExit(1) from exc + + for log in logs: + parts = log.model_info.id.split('/', 1) + developer = parts[0] if len(parts) == 2 else 'unknown' + model_name = parts[1] if len(parts) == 2 else log.model_info.id + + out_dir = output_dir / 'lexam' / developer / model_name + out_dir.mkdir(parents=True, exist_ok=True) + out_file = out_dir / f'{uuid.uuid4()}.json' + + with out_file.open('w', encoding='utf-8') as file: + json.dump( + log.model_dump(mode='json', exclude_none=True), + file, + indent=2, + ) + print(out_file) + + print(f'\nConverted {len(logs)} model evaluation(s).') + + +if __name__ == '__main__': + main() diff --git a/every_eval_ever/converters/lexam/adapter.py b/every_eval_ever/converters/lexam/adapter.py new file mode 100644 index 000000000..386d5eb21 --- /dev/null +++ b/every_eval_ever/converters/lexam/adapter.py @@ -0,0 +1,388 @@ +"""Adapter for converting LEXam public 
leaderboard HTML to every_eval_ever format.""" + +from __future__ import annotations + +import logging +import re +from dataclasses import dataclass + +import requests + +from every_eval_ever.converters import SCHEMA_VERSION +from every_eval_ever.converters.common.utils import get_current_unix_timestamp +from every_eval_ever.eval_types import ( + AggregationMethod, + EvalLibrary, + EvaluationLog, + EvaluationResult, + EvaluatorRelationship, + JudgeConfig, + LlmScoring, + MetricConfig, + ModelInfo, + ScoreDetails, + ScoreType, + SourceDataHf, + SourceMetadata, + SourceType, + Uncertainty, +) + +logger = logging.getLogger(__name__) + +LEADERBOARD_URL = ( + 'https://raw.githubusercontent.com/LEXam-Benchmark/' + 'lexam-benchmark.github.io/main/index.html' +) +LEADERBOARD_PAGE_URL = 'https://lexam-benchmark.github.io/' +HF_REPO = 'LEXam-Benchmark/LEXam' +BENCHMARK_KEY = 'lexam' +OPEN_QUESTIONS_SAMPLES = 2541 +MCQ_SAMPLES = 4696 +MCQ_CONFIGS = 'mcq_4_choices,mcq_8_choices,mcq_16_choices,mcq_32_choices' + +OPEN_SECTION_TITLE = 'Leaderboard on LEXam – Open Questions' +MCQ_SECTION_TITLE = 'Leaderboard on LEXam – Multiple-Choice Questions' + +_MEDAL_RE = re.compile(r'[\U0001f947-\U0001f949]') + + +@dataclass(frozen=True) +class LeaderboardRow: + """A single model row from a LEXam leaderboard table.""" + + model_name: str + score: float + + +@dataclass(frozen=True) +class ModelIdentity: + """Canonical model identity used for Every Eval Ever output paths.""" + + developer: str + model_id: str + + +_MODEL_IDENTITIES = { + 'Apertus-70B': ModelIdentity('swiss-ai', 'swiss-ai/Apertus-70B'), + 'Apertus-8B': ModelIdentity('swiss-ai', 'swiss-ai/Apertus-8B'), + 'Claude-3.7-Sonnet': ModelIdentity( + 'anthropic', 'anthropic/Claude-3.7-Sonnet' + ), + 'Claude-4.5-Sonnet': ModelIdentity( + 'anthropic', 'anthropic/Claude-4.5-Sonnet' + ), + 'DeepSeek-R1': ModelIdentity('deepseek-ai', 'deepseek-ai/DeepSeek-R1'), + 'DeepSeek-V3': ModelIdentity('deepseek-ai', 'deepseek-ai/DeepSeek-V3'), + 
'DeepSeek-V3.2-Exp': ModelIdentity( + 'deepseek-ai', 'deepseek-ai/DeepSeek-V3.2-Exp' + ), + 'DeepSeek-V3.2-chat': ModelIdentity( + 'deepseek-ai', 'deepseek-ai/DeepSeek-V3.2-chat' + ), + 'DeepSeek-V3.2-reasoner': ModelIdentity( + 'deepseek-ai', 'deepseek-ai/DeepSeek-V3.2-reasoner' + ), + 'EuroLLM-9B-it': ModelIdentity( + 'utter-project', 'utter-project/EuroLLM-9B-it' + ), + 'GPT-4.1': ModelIdentity('openai', 'openai/GPT-4.1'), + 'GPT-4.1-mini': ModelIdentity('openai', 'openai/GPT-4.1-mini'), + 'GPT-4.1-nano': ModelIdentity('openai', 'openai/GPT-4.1-nano'), + 'GPT-4o': ModelIdentity('openai', 'openai/GPT-4o'), + 'GPT-4o-mini': ModelIdentity('openai', 'openai/GPT-4o-mini'), + 'GPT-5': ModelIdentity('openai', 'openai/GPT-5'), + 'GPT-5-mini': ModelIdentity('openai', 'openai/GPT-5-mini'), + 'GPT-5-nano': ModelIdentity('openai', 'openai/GPT-5-nano'), + 'GPT-OSS-120B': ModelIdentity('openai', 'openai/GPT-OSS-120B'), + 'GPT-OSS-20B': ModelIdentity('openai', 'openai/GPT-OSS-20B'), + 'Gemini-2.5-Pro': ModelIdentity('google', 'google/Gemini-2.5-Pro'), + 'Gemini-3-Pro-preview': ModelIdentity( + 'google', 'google/Gemini-3-Pro-preview' + ), + 'Gemma-2-9B-it': ModelIdentity('google', 'google/Gemma-2-9B-it'), + 'Gemma-3-12B-it': ModelIdentity('google', 'google/Gemma-3-12B-it'), + 'Llama-3.1-405B-it': ModelIdentity( + 'meta-llama', 'meta-llama/Llama-3.1-405B-it' + ), + 'Llama-3.1-8B-it': ModelIdentity( + 'meta-llama', 'meta-llama/Llama-3.1-8B-it' + ), + 'Llama-3.3-70B-it': ModelIdentity( + 'meta-llama', 'meta-llama/Llama-3.3-70B-it' + ), + 'Llama-4-Maverick': ModelIdentity( + 'meta-llama', 'meta-llama/Llama-4-Maverick' + ), + 'Ministral-8B-it': ModelIdentity('mistralai', 'mistralai/Ministral-8B-it'), + 'O3-mini': ModelIdentity('openai', 'openai/O3-mini'), + 'Phi-4': ModelIdentity('microsoft', 'microsoft/Phi-4'), + 'QwQ-32B': ModelIdentity('qwen', 'qwen/QwQ-32B'), + 'Qwen-2.5-7B-it': ModelIdentity('qwen', 'qwen/Qwen-2.5-7B-it'), + 'Qwen3-235B': ModelIdentity('qwen', 
'qwen/Qwen3-235B'), + 'Qwen3-32B': ModelIdentity('qwen', 'qwen/Qwen3-32B'), + 'Qwen3-Next': ModelIdentity('qwen', 'qwen/Qwen3-Next'), +} + + +def _fetch_html(url: str = LEADERBOARD_URL) -> str: + """Download leaderboard HTML from *url*.""" + resp = requests.get(url, timeout=30) + resp.raise_for_status() + return resp.text + + +def _clean_model_name(raw_name: str) -> str: + """Strip medal glyphs and whitespace from a leaderboard model name.""" + return _MEDAL_RE.sub('', raw_name).strip() + + +def _model_identity(model_name: str) -> ModelIdentity: + """Return the explicit model identity for a LEXam leaderboard name.""" + if model_name not in _MODEL_IDENTITIES: + raise ValueError( + f'No model identity mapping for LEXam leaderboard model: {model_name}' + ) + return _MODEL_IDENTITIES[model_name] + + +def _extract_section_rows( + html: str, section_title: str +) -> list[LeaderboardRow]: + """Parse model/score rows from the table under *section_title*.""" + title_idx = html.find(section_title) + if title_idx == -1: + raise ValueError(f'Leaderboard section not found: {section_title}') + + table_start = html.find('', table_start) + if table_end == -1: + raise ValueError(f'Unclosed table for section: {section_title}') + + table_html = html[table_start:table_end] + row_re = re.compile( + r']*>\s*' + r']*>(?:)?(\d+)(?:)?\s*' + r']*>(.*?)\s*' + r']*>(?:)?([\d.]+)(?:)?\s*' + r'', + re.DOTALL | re.IGNORECASE, + ) + + rows: list[LeaderboardRow] = [] + for row_match in row_re.finditer(table_html): + _, model_cell, score_text = row_match.groups() + model_name = re.sub(r'<[^>]+>', '', model_cell) + model_name = _clean_model_name(model_name) + if not model_name: + continue + rows.append( + LeaderboardRow( + model_name=model_name, + score=float(score_text), + ) + ) + if not rows: + raise ValueError(f'No leaderboard rows found for: {section_title}') + return rows + + +def _open_question_source() -> SourceDataHf: + return SourceDataHf( + dataset_name=BENCHMARK_KEY, + 
source_type='hf_dataset', + hf_repo=HF_REPO, + hf_split='test', + samples_number=OPEN_QUESTIONS_SAMPLES, + additional_details={ + 'benchmark_section': 'open_questions', + 'config': 'open_question', + }, + ) + + +def _mcq_source() -> SourceDataHf: + return SourceDataHf( + dataset_name=BENCHMARK_KEY, + source_type='hf_dataset', + hf_repo=HF_REPO, + hf_split='test', + samples_number=MCQ_SAMPLES, + additional_details={ + 'benchmark_section': 'multiple_choice_questions', + 'configs': MCQ_CONFIGS, + }, + ) + + +def _open_question_judge_scoring() -> LlmScoring: + return LlmScoring( + judges=[ + JudgeConfig( + model_info=ModelInfo( + name='gpt-4o', + id='openai/gpt-4o-2024-11-20', + developer='openai', + ), + ), + JudgeConfig( + model_info=ModelInfo( + name='DeepSeek-V3', + id='deepseek-ai/DeepSeek-V3', + developer='deepseek-ai', + ), + ), + JudgeConfig( + model_info=ModelInfo( + name='Qwen3-32B', + id='qwen/Qwen3-32B', + developer='qwen', + ), + ), + ], + input_prompt=( + 'Expert-validated LLM-as-a-Judge ensemble scoring open-ended ' + 'law exam answers against reference answers.' + ), + aggregation_method=AggregationMethod.average, + additional_details={'validation': 'human expert validated'}, + ) + + +def _build_open_question_result(score: float) -> EvaluationResult: + return EvaluationResult( + evaluation_name='Open Question Judge Score', + metric_config=MetricConfig( + metric_id='lexam.open_question_judge_score', + metric_name='Open Question Judge Score', + metric_kind='accuracy', + metric_unit='percent', + evaluation_description=( + 'Mean LLM-judge score on open-ended law exam questions ' + '(0-100 scale).' 
+ ), + lower_is_better=False, + score_type=ScoreType.continuous, + min_score=0.0, + max_score=100.0, + llm_scoring=_open_question_judge_scoring(), + ), + score_details=ScoreDetails( + score=round(score, 2), + uncertainty=Uncertainty(num_samples=OPEN_QUESTIONS_SAMPLES), + ), + source_data=_open_question_source(), + ) + + +def _build_mcq_result(score: float) -> EvaluationResult: + return EvaluationResult( + evaluation_name='Multiple-Choice Accuracy', + metric_config=MetricConfig( + metric_id='lexam.mcq_accuracy', + metric_name='Multiple-Choice Accuracy', + metric_kind='accuracy', + metric_unit='percent', + evaluation_description=( + 'Accuracy on LEXam multiple-choice law exam questions ' + 'across all MCQ configs (0-100 scale).' + ), + lower_is_better=False, + score_type=ScoreType.continuous, + min_score=0.0, + max_score=100.0, + ), + score_details=ScoreDetails( + score=round(score, 2), + uncertainty=Uncertainty(num_samples=MCQ_SAMPLES), + ), + source_data=_mcq_source(), + ) + + +class LEXamAdapter: + """Converts LEXam public leaderboard rows into EvaluationLog objects.""" + + def fetch_leaderboard( + self, + html: str | None = None, + url: str = LEADERBOARD_URL, + ) -> list[EvaluationLog]: + """Fetch the LEXam leaderboard and return one log per model. + + Args: + html: Optional pre-fetched HTML (used in tests). + url: Leaderboard HTML URL when *html* is not provided. + + Returns: + One EvaluationLog per model, combining open and MCQ metrics when + both are available. 
+ """ + page_html = html if html is not None else _fetch_html(url) + open_rows = _extract_section_rows(page_html, OPEN_SECTION_TITLE) + mcq_rows = _extract_section_rows(page_html, MCQ_SECTION_TITLE) + + open_scores = {row.model_name: row.score for row in open_rows} + mcq_scores = {row.model_name: row.score for row in mcq_rows} + model_names = sorted(set(open_scores) | set(mcq_scores)) + + retrieved_ts = get_current_unix_timestamp() + logs: list[EvaluationLog] = [] + + for model_name in model_names: + evaluation_results: list[EvaluationResult] = [] + if model_name in open_scores: + evaluation_results.append( + _build_open_question_result(open_scores[model_name]) + ) + if model_name in mcq_scores: + evaluation_results.append( + _build_mcq_result(mcq_scores[model_name]) + ) + if not evaluation_results: + continue + + identity = _model_identity(model_name) + + logs.append( + EvaluationLog( + schema_version=SCHEMA_VERSION, + evaluation_id=( + f'{BENCHMARK_KEY}/{identity.model_id}/{retrieved_ts}' + ), + retrieved_timestamp=retrieved_ts, + eval_library=EvalLibrary( + name='lexam', + version='1.0', + additional_details={ + 'leaderboard_url': LEADERBOARD_PAGE_URL, + 'github': 'https://github.com/LEXam-Benchmark/LEXam', + }, + ), + source_metadata=SourceMetadata( + source_name='LEXam Leaderboard', + source_type=SourceType.documentation, + source_organization_name='LEXam-Benchmark', + source_organization_url=( + 'https://github.com/LEXam-Benchmark/LEXam' + ), + evaluator_relationship=EvaluatorRelationship.collaborative, + additional_details={ + 'leaderboard_page': LEADERBOARD_PAGE_URL, + 'source_html': url, + }, + ), + model_info=ModelInfo( + name=model_name, + id=identity.model_id, + developer=identity.developer, + ), + evaluation_results=evaluation_results, + ) + ) + + logger.info('Converted %d LEXam leaderboard model(s).', len(logs)) + return logs diff --git a/tests/data/lexam/leaderboard.html b/tests/data/lexam/leaderboard.html new file mode 100644 index 
000000000..6989cbe00 --- /dev/null +++ b/tests/data/lexam/leaderboard.html @@ -0,0 +1,26 @@ + + + +

+ Leaderboard on LEXam – Open Questions +

+ + + + + + +
1GPT-5🥇70.20
2GPT-4o-mini42.55
3Gemini-3-Pro-preview30.00
+ +

+ Leaderboard on LEXam – Multiple-Choice Questions +

+ + + + + + +
1GPT-5🥇62.65
2GPT-4o-mini40.96
3Phi-425.00
+ + diff --git a/tests/test_lexam_adapter.py b/tests/test_lexam_adapter.py new file mode 100644 index 000000000..ded4dfcaf --- /dev/null +++ b/tests/test_lexam_adapter.py @@ -0,0 +1,143 @@ +"""Unit tests for the LEXam adapter.""" + +from pathlib import Path + +import pytest + +from every_eval_ever.converters.lexam.adapter import ( + LEXamAdapter, + _clean_model_name, + _extract_section_rows, + _model_identity, +) +from every_eval_ever.eval_types import EvaluationLog + +FIXTURE_HTML = ( + Path(__file__).parent / 'data' / 'lexam' / 'leaderboard.html' +).read_text(encoding='utf-8') + + +def test_clean_model_name_strips_medals() -> None: + assert _clean_model_name('GPT-5πŸ₯‡') == 'GPT-5' + assert _clean_model_name('Claude-3.7-SonnetπŸ₯‰') == 'Claude-3.7-Sonnet' + + +def test_extract_section_rows_open_questions() -> None: + rows = _extract_section_rows( + FIXTURE_HTML, + 'Leaderboard on LEXam – Open Questions', + ) + assert len(rows) == 3 + assert rows[0].model_name == 'GPT-5' + assert rows[0].score == 70.20 + + +def test_extract_section_rows_mcq() -> None: + rows = _extract_section_rows( + FIXTURE_HTML, + 'Leaderboard on LEXam – Multiple-Choice Questions', + ) + assert len(rows) == 3 + assert rows[2].model_name == 'Phi-4' + assert rows[2].score == 25.0 + + +def test_extract_section_rows_missing_section_raises() -> None: + with pytest.raises(ValueError, match='Leaderboard section not found'): + _extract_section_rows(FIXTURE_HTML, 'Missing Section') + + +def test_fetch_leaderboard_combines_metrics_per_model() -> None: + logs = LEXamAdapter().fetch_leaderboard(html=FIXTURE_HTML) + by_name = {log.model_info.name: log for log in logs} + + assert len(logs) == 4 + assert len(by_name['GPT-5'].evaluation_results) == 2 + assert len(by_name['Gemini-3-Pro-preview'].evaluation_results) == 1 + assert len(by_name['Phi-4'].evaluation_results) == 1 + + +def test_fetch_leaderboard_open_question_score() -> None: + logs = LEXamAdapter().fetch_leaderboard(html=FIXTURE_HTML) + gpt5 = 
next(log for log in logs if log.model_info.name == 'GPT-5') + results = {r.evaluation_name: r for r in gpt5.evaluation_results} + + assert results['Open Question Judge Score'].score_details.score == 70.20 + assert results['Multiple-Choice Accuracy'].score_details.score == 62.65 + + +def test_fetch_leaderboard_source_metadata_is_documentation() -> None: + logs = LEXamAdapter().fetch_leaderboard(html=FIXTURE_HTML) + assert logs[0].source_metadata.source_type.value == 'documentation' + assert logs[0].source_metadata.source_name == 'LEXam Leaderboard' + + +def test_fetch_leaderboard_uses_hf_dataset_source() -> None: + logs = LEXamAdapter().fetch_leaderboard(html=FIXTURE_HTML) + gpt5 = next(log for log in logs if log.model_info.name == 'GPT-5') + open_result = next( + r + for r in gpt5.evaluation_results + if r.evaluation_name == 'Open Question Judge Score' + ) + mcq_result = next( + r + for r in gpt5.evaluation_results + if r.evaluation_name == 'Multiple-Choice Accuracy' + ) + + assert open_result.source_data.hf_repo == 'LEXam-Benchmark/LEXam' + assert open_result.source_data.hf_split == 'test' + assert open_result.source_data.samples_number == 2541 + assert mcq_result.source_data.samples_number == 4696 + + +def test_fetch_leaderboard_metric_ids() -> None: + logs = LEXamAdapter().fetch_leaderboard(html=FIXTURE_HTML) + gpt5 = next(log for log in logs if log.model_info.name == 'GPT-5') + by_name = {r.evaluation_name: r for r in gpt5.evaluation_results} + + assert ( + by_name['Open Question Judge Score'].metric_config.metric_id + == 'lexam.open_question_judge_score' + ) + assert ( + by_name['Multiple-Choice Accuracy'].metric_config.metric_id + == 'lexam.mcq_accuracy' + ) + + +def test_fetch_leaderboard_open_metric_has_llm_scoring() -> None: + logs = LEXamAdapter().fetch_leaderboard(html=FIXTURE_HTML) + gpt5 = next(log for log in logs if log.model_info.name == 'GPT-5') + open_result = next( + r + for r in gpt5.evaluation_results + if r.evaluation_name == 'Open Question 
Judge Score' + ) + + llm_scoring = open_result.metric_config.llm_scoring + assert llm_scoring is not None + assert len(llm_scoring.judges) == 3 + + +def test_fetch_leaderboard_model_developer_inference() -> None: + logs = LEXamAdapter().fetch_leaderboard(html=FIXTURE_HTML) + gpt5 = next(log for log in logs if log.model_info.name == 'GPT-5') + + assert gpt5.model_info.developer == 'openai' + assert gpt5.model_info.id == 'openai/GPT-5' + + +def test_unknown_model_identity_raises() -> None: + with pytest.raises(ValueError, match='No model identity mapping'): + _model_identity('New-Unmapped-Model') + + +def test_fetch_leaderboard_output_validates_as_evaluation_log() -> None: + logs = LEXamAdapter().fetch_leaderboard(html=FIXTURE_HTML) + for log in logs: + validated = EvaluationLog.model_validate( + log.model_dump(mode='json', exclude_none=True) + ) + assert validated.schema_version == log.schema_version