From 7437221d7b5479c5a77e2816eec17429d91b9b5f Mon Sep 17 00:00:00 2001 From: nelaturuharsha Date: Sat, 13 Jun 2026 21:52:11 +0200 Subject: [PATCH 1/3] adding hf community evals converter --- pyproject.toml | 1 + .../community_evals_converter/aggregate.jsonl | 1 + .../676f4465-ce78-411a-9f5a-c97b3d2eac4f.json | 590 ++++ tests/test_community_evals_converter.py | 1521 +++++++++ tools/hf-community-evals/README.md | 126 + .../community_evals_converter.py | 2945 +++++++++++++++++ uv.lock | 2 + 7 files changed, 5186 insertions(+) create mode 100644 tests/data/community_evals_converter/aggregate.jsonl create mode 100644 tests/data/community_evals_converter/datastore/flat/objects/67/6f/676f4465-ce78-411a-9f5a-c97b3d2eac4f.json create mode 100644 tests/test_community_evals_converter.py create mode 100644 tools/hf-community-evals/README.md create mode 100644 tools/hf-community-evals/community_evals_converter.py diff --git a/pyproject.toml b/pyproject.toml index aa31551ac..450f30f30 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "numpy>=2.4.1", "pandas>=2.3.3", "pydantic>=2.12.5,<3.0.0", + "pyyaml>=6.0.3", "requests>=2.32.5,<3.0.0", "rich>=14.0.0,<15.0.0", "seaborn>=0.13.2", diff --git a/tests/data/community_evals_converter/aggregate.jsonl b/tests/data/community_evals_converter/aggregate.jsonl new file mode 100644 index 000000000..5adeea54b --- /dev/null +++ b/tests/data/community_evals_converter/aggregate.jsonl @@ -0,0 +1 @@ +{"benchmark":"MMLU-Pro","eval_schema_version":"0.2.2","legacy_path":"data/MMLU-Pro/01-ai/yi-1.5-34b-chat/676f4465-ce78-411a-9f5a-c97b3d2eac4f.json","object_path":"flat/objects/67/6f/676f4465-ce78-411a-9f5a-c97b3d2eac4f.json","object_uuid":"676f4465-ce78-411a-9f5a-c97b3d2eac4f","record_type":"aggregate","sha256":"a9cc2e4399f182f2e8d1a6198248e124ceafee6f70cbf5ddf31e76d1e74e6f94","size_bytes":23648} diff --git a/tests/data/community_evals_converter/datastore/flat/objects/67/6f/676f4465-ce78-411a-9f5a-c97b3d2eac4f.json 
b/tests/data/community_evals_converter/datastore/flat/objects/67/6f/676f4465-ce78-411a-9f5a-c97b3d2eac4f.json new file mode 100644 index 000000000..62372ec22 --- /dev/null +++ b/tests/data/community_evals_converter/datastore/flat/objects/67/6f/676f4465-ce78-411a-9f5a-c97b3d2eac4f.json @@ -0,0 +1,590 @@ +{ + "schema_version": "0.2.2", + "evaluation_id": "mmlu-pro/01-ai_yi-1.5-34b-chat/tiger-lab/1777613486.918081", + "retrieved_timestamp": "1777613486.918081", + "source_metadata": { + "source_name": "MMLU-Pro Leaderboard", + "source_type": "documentation", + "source_organization_name": "TIGER-Lab", + "source_organization_url": "https://tiger-ai-lab.github.io", + "evaluator_relationship": "third_party", + "additional_details": { + "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro", + "leaderboard_data_source": "TIGER-Lab" + } + }, + "eval_library": { + "name": "MMLU-Pro leaderboard (TIGER-Lab)", + "version": "unknown" + }, + "model_info": { + "name": "Yi-1.5-34B-Chat", + "id": "01-ai/yi-1.5-34b-chat", + "developer": "01-ai", + "additional_details": { + "raw_model_name": "Yi-1.5-34B-Chat", + "size_billions_parameters": "34.0", + "leaderboard_data_source": "TIGER-Lab" + } + }, + "evaluation_results": [ + { + "evaluation_result_id": "mmlu_pro/overall", + "evaluation_name": "MMLU-Pro (overall)", + "source_data": { + "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission", + "hf_split": "train", + "additional_details": { + "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + 
"benchmark_hf_repo": "TIGER-Lab/MMLU-Pro", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro", + "dataset_total_questions": "12000", + "prompt_style": "5-shot CoT" + } + }, + "metric_config": { + "evaluation_description": "Overall accuracy across the ~12,000-question MMLU-Pro benchmark, evaluated 5-shot with chain-of-thought.", + "metric_id": "mmlu_pro/overall", + "metric_name": "MMLU-Pro (overall)", + "metric_kind": "accuracy", + "metric_unit": "proportion", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0, + "additional_details": { + "aggregation": "accuracy_over_subset", + "prompt_style": "5-shot CoT" + } + }, + "score_details": { + "score": 0.5229 + } + }, + { + "evaluation_result_id": "mmlu_pro/biology", + "evaluation_name": "MMLU-Pro (Biology)", + "source_data": { + "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission", + "hf_split": "train", + "additional_details": { + "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro", + "dataset_total_questions": "12000", + "prompt_style": "5-shot CoT" + } + }, + "metric_config": { + "evaluation_description": "Accuracy on the MMLU-Pro Biology subset, evaluated 5-shot with chain-of-thought.", + "metric_id": "mmlu_pro/biology", + "metric_name": "MMLU-Pro (Biology)", + "metric_kind": "accuracy", + "metric_unit": "proportion", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0, + "additional_details": { + "aggregation": "accuracy_over_subset", + "prompt_style": "5-shot CoT" + } + }, 
+ "score_details": { + "score": 0.7141 + } + }, + { + "evaluation_result_id": "mmlu_pro/business", + "evaluation_name": "MMLU-Pro (Business)", + "source_data": { + "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission", + "hf_split": "train", + "additional_details": { + "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro", + "dataset_total_questions": "12000", + "prompt_style": "5-shot CoT" + } + }, + "metric_config": { + "evaluation_description": "Accuracy on the MMLU-Pro Business subset, evaluated 5-shot with chain-of-thought.", + "metric_id": "mmlu_pro/business", + "metric_name": "MMLU-Pro (Business)", + "metric_kind": "accuracy", + "metric_unit": "proportion", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0, + "additional_details": { + "aggregation": "accuracy_over_subset", + "prompt_style": "5-shot CoT" + } + }, + "score_details": { + "score": 0.5843 + } + }, + { + "evaluation_result_id": "mmlu_pro/chemistry", + "evaluation_name": "MMLU-Pro (Chemistry)", + "source_data": { + "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission", + "hf_split": "train", + "additional_details": { + "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": 
"https://github.com/TIGER-AI-Lab/MMLU-Pro", + "dataset_total_questions": "12000", + "prompt_style": "5-shot CoT" + } + }, + "metric_config": { + "evaluation_description": "Accuracy on the MMLU-Pro Chemistry subset, evaluated 5-shot with chain-of-thought.", + "metric_id": "mmlu_pro/chemistry", + "metric_name": "MMLU-Pro (Chemistry)", + "metric_kind": "accuracy", + "metric_unit": "proportion", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0, + "additional_details": { + "aggregation": "accuracy_over_subset", + "prompt_style": "5-shot CoT" + } + }, + "score_details": { + "score": 0.4753 + } + }, + { + "evaluation_result_id": "mmlu_pro/computer_science", + "evaluation_name": "MMLU-Pro (Computer Science)", + "source_data": { + "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission", + "hf_split": "train", + "additional_details": { + "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro", + "dataset_total_questions": "12000", + "prompt_style": "5-shot CoT" + } + }, + "metric_config": { + "evaluation_description": "Accuracy on the MMLU-Pro Computer Science subset, evaluated 5-shot with chain-of-thought.", + "metric_id": "mmlu_pro/computer_science", + "metric_name": "MMLU-Pro (Computer Science)", + "metric_kind": "accuracy", + "metric_unit": "proportion", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0, + "additional_details": { + "aggregation": "accuracy_over_subset", + "prompt_style": "5-shot CoT" + } + }, + "score_details": { + "score": 0.539 + } + }, + { + "evaluation_result_id": 
"mmlu_pro/economics", + "evaluation_name": "MMLU-Pro (Economics)", + "source_data": { + "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission", + "hf_split": "train", + "additional_details": { + "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro", + "dataset_total_questions": "12000", + "prompt_style": "5-shot CoT" + } + }, + "metric_config": { + "evaluation_description": "Accuracy on the MMLU-Pro Economics subset, evaluated 5-shot with chain-of-thought.", + "metric_id": "mmlu_pro/economics", + "metric_name": "MMLU-Pro (Economics)", + "metric_kind": "accuracy", + "metric_unit": "proportion", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0, + "additional_details": { + "aggregation": "accuracy_over_subset", + "prompt_style": "5-shot CoT" + } + }, + "score_details": { + "score": 0.6457 + } + }, + { + "evaluation_result_id": "mmlu_pro/engineering", + "evaluation_name": "MMLU-Pro (Engineering)", + "source_data": { + "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission", + "hf_split": "train", + "additional_details": { + "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro", + "dataset_total_questions": "12000", + "prompt_style": "5-shot 
CoT" + } + }, + "metric_config": { + "evaluation_description": "Accuracy on the MMLU-Pro Engineering subset, evaluated 5-shot with chain-of-thought.", + "metric_id": "mmlu_pro/engineering", + "metric_name": "MMLU-Pro (Engineering)", + "metric_kind": "accuracy", + "metric_unit": "proportion", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0, + "additional_details": { + "aggregation": "accuracy_over_subset", + "prompt_style": "5-shot CoT" + } + }, + "score_details": { + "score": 0.3437 + } + }, + { + "evaluation_result_id": "mmlu_pro/health", + "evaluation_name": "MMLU-Pro (Health)", + "source_data": { + "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission", + "hf_split": "train", + "additional_details": { + "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro", + "dataset_total_questions": "12000", + "prompt_style": "5-shot CoT" + } + }, + "metric_config": { + "evaluation_description": "Accuracy on the MMLU-Pro Health subset, evaluated 5-shot with chain-of-thought.", + "metric_id": "mmlu_pro/health", + "metric_name": "MMLU-Pro (Health)", + "metric_kind": "accuracy", + "metric_unit": "proportion", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0, + "additional_details": { + "aggregation": "accuracy_over_subset", + "prompt_style": "5-shot CoT" + } + }, + "score_details": { + "score": 0.5819 + } + }, + { + "evaluation_result_id": "mmlu_pro/history", + "evaluation_name": "MMLU-Pro (History)", + "source_data": { + "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)", + 
"source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission", + "hf_split": "train", + "additional_details": { + "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro", + "dataset_total_questions": "12000", + "prompt_style": "5-shot CoT" + } + }, + "metric_config": { + "evaluation_description": "Accuracy on the MMLU-Pro History subset, evaluated 5-shot with chain-of-thought.", + "metric_id": "mmlu_pro/history", + "metric_name": "MMLU-Pro (History)", + "metric_kind": "accuracy", + "metric_unit": "proportion", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0, + "additional_details": { + "aggregation": "accuracy_over_subset", + "prompt_style": "5-shot CoT" + } + }, + "score_details": { + "score": 0.5276 + } + }, + { + "evaluation_result_id": "mmlu_pro/law", + "evaluation_name": "MMLU-Pro (Law)", + "source_data": { + "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission", + "hf_split": "train", + "additional_details": { + "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro", + "dataset_total_questions": "12000", + "prompt_style": "5-shot CoT" + } + }, + "metric_config": { + "evaluation_description": "Accuracy on the MMLU-Pro Law subset, evaluated 5-shot with chain-of-thought.", + "metric_id": "mmlu_pro/law", + 
"metric_name": "MMLU-Pro (Law)", + "metric_kind": "accuracy", + "metric_unit": "proportion", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0, + "additional_details": { + "aggregation": "accuracy_over_subset", + "prompt_style": "5-shot CoT" + } + }, + "score_details": { + "score": 0.3479 + } + }, + { + "evaluation_result_id": "mmlu_pro/math", + "evaluation_name": "MMLU-Pro (Math)", + "source_data": { + "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission", + "hf_split": "train", + "additional_details": { + "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro", + "dataset_total_questions": "12000", + "prompt_style": "5-shot CoT" + } + }, + "metric_config": { + "evaluation_description": "Accuracy on the MMLU-Pro Math subset, evaluated 5-shot with chain-of-thought.", + "metric_id": "mmlu_pro/math", + "metric_name": "MMLU-Pro (Math)", + "metric_kind": "accuracy", + "metric_unit": "proportion", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0, + "additional_details": { + "aggregation": "accuracy_over_subset", + "prompt_style": "5-shot CoT" + } + }, + "score_details": { + "score": 0.5618 + } + }, + { + "evaluation_result_id": "mmlu_pro/philosophy", + "evaluation_name": "MMLU-Pro (Philosophy)", + "source_data": { + "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission", + "hf_split": "train", + "additional_details": { + "results_csv_url": 
"https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro", + "dataset_total_questions": "12000", + "prompt_style": "5-shot CoT" + } + }, + "metric_config": { + "evaluation_description": "Accuracy on the MMLU-Pro Philosophy subset, evaluated 5-shot with chain-of-thought.", + "metric_id": "mmlu_pro/philosophy", + "metric_name": "MMLU-Pro (Philosophy)", + "metric_kind": "accuracy", + "metric_unit": "proportion", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0, + "additional_details": { + "aggregation": "accuracy_over_subset", + "prompt_style": "5-shot CoT" + } + }, + "score_details": { + "score": 0.4629 + } + }, + { + "evaluation_result_id": "mmlu_pro/physics", + "evaluation_name": "MMLU-Pro (Physics)", + "source_data": { + "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission", + "hf_split": "train", + "additional_details": { + "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro", + "dataset_total_questions": "12000", + "prompt_style": "5-shot CoT" + } + }, + "metric_config": { + "evaluation_description": "Accuracy on the MMLU-Pro Physics subset, evaluated 5-shot with chain-of-thought.", + "metric_id": "mmlu_pro/physics", + "metric_name": "MMLU-Pro (Physics)", + "metric_kind": "accuracy", + "metric_unit": "proportion", + "lower_is_better": false, + 
"score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0, + "additional_details": { + "aggregation": "accuracy_over_subset", + "prompt_style": "5-shot CoT" + } + }, + "score_details": { + "score": 0.4935 + } + }, + { + "evaluation_result_id": "mmlu_pro/psychology", + "evaluation_name": "MMLU-Pro (Psychology)", + "source_data": { + "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission", + "hf_split": "train", + "additional_details": { + "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro", + "dataset_total_questions": "12000", + "prompt_style": "5-shot CoT" + } + }, + "metric_config": { + "evaluation_description": "Accuracy on the MMLU-Pro Psychology subset, evaluated 5-shot with chain-of-thought.", + "metric_id": "mmlu_pro/psychology", + "metric_name": "MMLU-Pro (Psychology)", + "metric_kind": "accuracy", + "metric_unit": "proportion", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0, + "additional_details": { + "aggregation": "accuracy_over_subset", + "prompt_style": "5-shot CoT" + } + }, + "score_details": { + "score": 0.6429 + } + }, + { + "evaluation_result_id": "mmlu_pro/other", + "evaluation_name": "MMLU-Pro (Other)", + "source_data": { + "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)", + "source_type": "hf_dataset", + "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission", + "hf_split": "train", + "additional_details": { + "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv", + "leaderboard_space_url": 
"https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro", + "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro", + "paper_url": "https://arxiv.org/abs/2406.01574", + "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro", + "dataset_total_questions": "12000", + "prompt_style": "5-shot CoT" + } + }, + "metric_config": { + "evaluation_description": "Accuracy on the MMLU-Pro Other subset, evaluated 5-shot with chain-of-thought.", + "metric_id": "mmlu_pro/other", + "metric_name": "MMLU-Pro (Other)", + "metric_kind": "accuracy", + "metric_unit": "proportion", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0, + "additional_details": { + "aggregation": "accuracy_over_subset", + "prompt_style": "5-shot CoT" + } + }, + "score_details": { + "score": 0.5162 + } + } + ] +} \ No newline at end of file diff --git a/tests/test_community_evals_converter.py b/tests/test_community_evals_converter.py new file mode 100644 index 000000000..299f60ebe --- /dev/null +++ b/tests/test_community_evals_converter.py @@ -0,0 +1,1521 @@ +from __future__ import annotations + +import hashlib +import io +import json +from pathlib import Path + +import pytest +import yaml +from rich.console import Console +from rich.progress import Progress + +from every_eval_ever import cli +from tools import community_evals_converter + +FIXTURE_DIR = Path(__file__).parent / 'data' / 'community_evals_converter' + + +class FakeRepoInfo: + def __init__(self, *, sha: str) -> None: + self.sha = sha + + +class FakeHfApi: + def __init__( + self, + *, + datastore_sha: str = 'abc123', + missing_models: set[str] | None = None, + repo_files_by_revision: dict[tuple[str, str], list[str]] | None = None, + dataset_files_by_revision: dict[tuple[str, str], list[str]] | None = None, + discussions: dict[str, list[FakeDiscussion]] | None = None, + ) -> None: + self.datastore_sha = datastore_sha + self.missing_models = missing_models or set() + self.repo_files_by_revision = repo_files_by_revision or 
{} + self.dataset_files_by_revision = dataset_files_by_revision or {} + self.discussions = discussions or {} + self.model_info_calls: list[str] = [] + self.repo_info_calls: list[dict] = [] + self.discussion_calls: list[str] = [] + self.commits: list[dict] = [] + + def repo_info(self, **kwargs): + self.repo_info_calls.append(kwargs) + assert kwargs['repo_type'] == 'dataset' + assert kwargs['revision'] == 'main' + return FakeRepoInfo(sha=self.datastore_sha) + + def model_info(self, repo_id: str): + self.model_info_calls.append(repo_id) + if repo_id in self.missing_models: + raise RuntimeError('missing model') + return {'id': repo_id} + + def list_repo_files( + self, + repo_id: str, + repo_type: str = 'model', + revision: str | None = None, + ): + if repo_type == 'dataset': + return self.dataset_files_by_revision.get( + (repo_id, revision or 'main'), [] + ) + assert repo_type == 'model' + return self.repo_files_by_revision.get((repo_id, revision or 'main'), []) + + def list_repo_tree( + self, + repo_id: str, + path_in_repo: str | None = None, + *, + recursive: bool = False, + expand: bool = False, + revision: str | None = None, + repo_type: str = 'model', + token: bool | str | None = None, + ): + assert path_in_repo == '.eval_results' + assert recursive is True + assert expand is False + assert repo_type == 'model' + assert token is True + for path in self.repo_files_by_revision.get((repo_id, revision or 'main'), []): + yield FakeRepoFile(path=path, blob_id=f'{revision}:{path}') + + def get_repo_discussions(self, repo_id: str, **_kwargs): + self.discussion_calls.append(repo_id) + return self.discussions.get(repo_id, []) + + def create_commit(self, **kwargs): + self.commits.append(kwargs) + return FakeCommitInfo( + pr_url=f'https://huggingface.co/{kwargs["repo_id"]}/discussions/1', + commit_url=f'https://huggingface.co/{kwargs["repo_id"]}/commit/abc', + ) + + +class FakeCommitInfo: + def __init__(self, *, pr_url: str, commit_url: str) -> None: + self.pr_url = pr_url + 
self.commit_url = commit_url + + def __str__(self) -> str: + return self.pr_url + + +class FakeRepoFile: + def __init__(self, *, path: str, blob_id: str) -> None: + self.path = path + self.rfilename = path + self.blob_id = blob_id + self.size = 1 + + +class FakeDiscussion: + def __init__( + self, + *, + title: str = 'Add EvalEval community eval results', + git_reference: str = 'refs/pr/1', + url: str = 'https://huggingface.co/google/gemma-2b-it/discussions/1', + num: int = 1, + ) -> None: + self.title = title + self.git_reference = git_reference + self.url = url + self.num = num + + +class RecordingProgress(community_evals_converter.ReviewProgress): + def __init__(self) -> None: + self.descriptions: list[str] = [] + self.advances: list[int] = [] + self.task_initial_descriptions: dict[int, str] = {} + self.advance_by_task: dict[int, int] = {} + + def add_task(self, description: str, total: int | None = None) -> int: + task_id = len(self.task_initial_descriptions) + 1 + self.task_initial_descriptions[task_id] = description + self.advance_by_task[task_id] = 0 + self.descriptions.append(description) + return task_id + + def update( + self, + task_id: int, + *, + advance: int = 0, + description: str | None = None, + total: int | None = None, + ) -> None: + self.advances.append(advance) + self.advance_by_task[task_id] = ( + self.advance_by_task.get(task_id, 0) + advance + ) + if description is not None: + self.descriptions.append(description) + + +def _aggregate( + *, + model_id: str = 'google/gemma-2b-it', + score: float = 0.641, +) -> dict: + return { + 'schema_version': '0.2.2', + 'evaluation_id': 'openeval/google_gemma-2b-it/123', + 'evaluation_timestamp': '2024-07-16T00:00:00Z', + 'retrieved_timestamp': '1234567890', + 'source_metadata': { + 'source_type': 'evaluation_run', + 'source_organization_name': 'EvalEval', + 'evaluator_relationship': 'third_party', + }, + 'eval_library': {'name': 'openeval', 'version': 'unknown'}, + 'model_info': { + 'name': 
model_id.rsplit('/', 1)[-1], + 'id': model_id, + 'developer': model_id.split('/', 1)[0], + 'inference_platform': 'huggingface', + }, + 'evaluation_results': [ + { + 'evaluation_result_id': 'mmlu-pro::chain-of-thought-correctness', + 'evaluation_name': 'MMLU-Pro', + 'source_data': { + 'dataset_name': 'MMLU-Pro', + 'source_type': 'hf_dataset', + 'hf_repo': 'TIGER-Lab/MMLU-Pro', + }, + 'metric_config': { + 'lower_is_better': False, + 'score_type': 'binary', + 'metric_unit': 'proportion', + 'min_score': 0.0, + 'max_score': 1.0, + }, + 'score_details': {'score': score}, + } + ], + } + + +def _gpqa_aggregate(*, dataset_name: str = 'GPQA') -> dict: + record = _aggregate() + record['evaluation_results'] = [ + { + 'evaluation_result_id': 'gpqa::chain-of-thought-correctness', + 'evaluation_name': dataset_name, + 'source_data': { + 'dataset_name': dataset_name, + 'source_type': 'hf_dataset', + 'hf_repo': 'Idavidrein/gpqa', + }, + 'metric_config': { + 'lower_is_better': False, + 'score_type': 'binary', + 'metric_unit': 'proportion', + 'min_score': 0.0, + 'max_score': 1.0, + }, + 'score_details': {'score': 0.5}, + } + ] + return record + + +def _write_index_row( + tmp_path: Path, + record: dict, + *, + object_uuid: str = '676f4465-ce78-411a-9f5a-c97b3d2eac4f', + row_overrides: dict | None = None, +) -> tuple[Path, Path]: + datastore = tmp_path / 'datastore' + object_path = ( + datastore + / 'flat' + / 'objects' + / object_uuid[:2] + / object_uuid[2:4] + / f'{object_uuid}.json' + ) + object_path.parent.mkdir(parents=True, exist_ok=True) + data = json.dumps(record).encode('utf-8') + object_path.write_bytes(data) + + index_jsonl = tmp_path / 'aggregate.jsonl' + row = { + 'benchmark': 'MMLU-Pro', + 'eval_schema_version': record['schema_version'], + 'legacy_path': f'data/MMLU-Pro/google/gemma-2b-it/{object_uuid}.json', + 'object_path': object_path.relative_to(datastore).as_posix(), + 'object_uuid': object_uuid, + 'record_type': 'aggregate', + 'sha256': 
hashlib.sha256(data).hexdigest(), + 'size_bytes': len(data), + } + if row_overrides: + row.update(row_overrides) + index_jsonl.write_text(json.dumps(row) + '\n', encoding='utf-8') + return datastore, index_jsonl + + +def _fake_download(datastore: Path): + def download_file(**kwargs) -> str: + assert kwargs['repo_id'] == 'evaleval/EEE_datastore' + assert kwargs['repo_type'] == 'dataset' + assert kwargs['revision'] == 'abc123' + path = datastore / kwargs['filename'] + if not path.exists(): + raise FileNotFoundError(path) + return path.as_posix() + + return download_file + + +def _write_collection_rows( + tmp_path: Path, + records: list[dict], + *, + collection_name: str = 'MMLU-Pro', + include_instance_level: bool = False, +) -> tuple[Path, Path]: + datastore = tmp_path / 'datastore' + rows = [] + for index, record in enumerate(records): + object_uuid = f'676f4465-ce78-411a-9f5a-c97b3d2eac{index:03d}' + object_path = ( + datastore + / 'flat' + / 'objects' + / object_uuid[:2] + / object_uuid[2:4] + / f'{object_uuid}.json' + ) + object_path.parent.mkdir(parents=True, exist_ok=True) + data = json.dumps(record).encode('utf-8') + object_path.write_bytes(data) + row = { + 'benchmark': 'collection-benchmark', + 'eval_schema_version': record['schema_version'], + 'legacy_path': ( + f'data/{collection_name}/{record["model_info"]["id"]}/' + f'{object_uuid}.json' + ), + 'object_path': object_path.relative_to(datastore).as_posix(), + 'object_uuid': object_uuid, + 'record_type': 'aggregate', + 'sha256': hashlib.sha256(data).hexdigest(), + 'size_bytes': len(data), + 'instance_level_available': False, + } + if include_instance_level: + instance_path = object_path.with_name(f'{object_uuid}_samples.jsonl') + instance_data = ( + json.dumps( + { + 'schema_version': 'instance_level_eval_0.2.2', + 'evaluation_id': record['evaluation_id'], + 'model_id': record['model_info']['id'], + } + ) + + '\n' + ).encode('utf-8') + instance_path.write_bytes(instance_data) + row.update( + { + 
'instance_level_available': True, + 'instance_level_path': ( + instance_path.relative_to(datastore).as_posix() + ), + 'instance_level_size_bytes': len(instance_data), + 'instance_sha': hashlib.sha256(instance_data).hexdigest(), + } + ) + rows.append(row) + + collection_jsonl = ( + datastore + / 'flat' + / 'indexes' + / 'by_collection' + / f'{collection_name}.jsonl' + ) + collection_jsonl.parent.mkdir(parents=True, exist_ok=True) + collection_jsonl.write_text( + ''.join(json.dumps(row) + '\n' for row in rows), + encoding='utf-8', + ) + return datastore, collection_jsonl + + +def _fake_download_with_model_files( + datastore: Path, + model_files: dict[tuple[str, str, str], Path], +): + def download_file(**kwargs) -> str: + if kwargs['repo_type'] == 'dataset': + assert kwargs['repo_id'] == 'evaleval/EEE_datastore' + assert kwargs['revision'] == 'abc123' + path = datastore / kwargs['filename'] + if not path.exists(): + raise FileNotFoundError(path) + return path.as_posix() + if kwargs['repo_type'] == 'model': + key = (kwargs['repo_id'], kwargs['revision'], kwargs['filename']) + return model_files[key].as_posix() + raise AssertionError(f'unexpected repo_type {kwargs["repo_type"]}') + + return download_file + + +def test_parse_benchmarks_aliases_and_rejects_unknown() -> None: + assert community_evals_converter.parse_benchmarks('gpqa-diamond,mmlu_pro') == [ + 'gpqa', + 'mmlu_pro', + ] + + with pytest.raises(community_evals_converter.HFEvalsError, match='Unsupported benchmark'): + community_evals_converter.parse_benchmarks('alphaxiv') + + +def test_parse_datastore_locator_accepts_optional_revision() -> None: + assert community_evals_converter.parse_datastore_locator( + 'evaleval/EEE_datastore@abc123' + ) == ('evaleval/EEE_datastore', 'abc123') + assert community_evals_converter.parse_datastore_locator('evaleval/EEE_datastore') == ( + 'evaleval/EEE_datastore', + None, + ) + + with pytest.raises( + community_evals_converter.HFEvalsError, match='\\[@\\]' + ): + 
community_evals_converter.parse_datastore_locator('bad@repo@abc123') + + +def test_resolve_datastore_locator_uses_latest_commit_for_bare_repo() -> None: + api = FakeHfApi(datastore_sha='resolvedabc') + + assert community_evals_converter.resolve_datastore_locator( + 'evaleval/EEE_datastore', api=api + ) == ('evaleval/EEE_datastore', 'resolvedabc') + assert api.repo_info_calls == [ + { + 'repo_id': 'evaleval/EEE_datastore', + 'repo_type': 'dataset', + 'revision': 'main', + } + ] + + +def test_build_collection_manifest_downloads_collection_jsonl_and_scans_results( + tmp_path: Path, +) -> None: + record = _aggregate() + record['evaluation_results'].append( + { + 'evaluation_result_id': 'gsm8k/exact_match', + 'evaluation_name': 'GSM8K', + 'source_data': { + 'dataset_name': 'GSM8K', + 'source_type': 'hf_dataset', + 'hf_repo': 'openai/gsm8k', + }, + 'metric_config': { + 'lower_is_better': False, + 'score_type': 'binary', + 'metric_unit': 'proportion', + 'min_score': 0.0, + 'max_score': 1.0, + }, + 'score_details': {'score': 0.72}, + } + ) + datastore, _collection_jsonl = _write_collection_rows( + tmp_path, + [record], + collection_name='MMLU-Pro', + include_instance_level=True, + ) + + manifest = community_evals_converter.build_collection_manifest( + collection_name='MMLU-Pro', + datastore='evaleval/EEE_datastore', + api=FakeHfApi(datastore_sha='abc123'), + download_file=_fake_download(datastore), + ) + + assert manifest['source_url_mode'] == 'online_collection_index_jsonl' + assert manifest['collection_jsonl'] == ( + 'flat/indexes/by_collection/MMLU-Pro.jsonl' + ) + assert {entry['benchmark'] for entry in manifest['entries']} == { + 'mmlu_pro', + 'gsm8k', + } + assert {entry['target_path'] for entry in manifest['entries']} == { + '.eval_results/mmlu_pro.yaml', + '.eval_results/gsm8k.yaml', + } + assert all(entry['instance_level_available'] is True for entry in manifest['entries']) + assert all('instance_sha' in entry for entry in manifest['entries']) + + +def 
test_build_collection_manifest_requires_collection_jsonl( + tmp_path: Path, +) -> None: + datastore, aggregate_jsonl = _write_index_row(tmp_path, _aggregate()) + aggregate_dir = ( + datastore / 'flat' / 'indexes' / 'by_collection' / 'MMLU-Pro' + ) + aggregate_dir.mkdir(parents=True) + (aggregate_dir / 'aggregate.jsonl').write_text( + aggregate_jsonl.read_text(encoding='utf-8'), + encoding='utf-8', + ) + + with pytest.raises( + community_evals_converter.HFEvalsError, + match='flat/indexes/by_collection/MMLU-Pro\\.jsonl', + ): + community_evals_converter.build_collection_manifest( + collection_name='MMLU-Pro', + datastore='evaleval/EEE_datastore@abc123', + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + +def test_build_collection_manifest_suggests_nearby_collection_stems( + tmp_path: Path, +) -> None: + datastore, _collection_jsonl = _write_collection_rows( + tmp_path, + [_aggregate()], + collection_name='fibble_arena', + ) + + with pytest.raises( + community_evals_converter.HFEvalsError, + match='Nearby collection stems: fibble_arena', + ): + community_evals_converter.build_collection_manifest( + collection_name='fibbl_arena', + datastore='evaleval/EEE_datastore@abc123', + api=FakeHfApi( + dataset_files_by_revision={ + ( + 'evaleval/EEE_datastore', + 'abc123', + ): [ + 'flat/indexes/by_collection/fibble_arena.jsonl', + 'flat/indexes/by_collection/MMLU-Pro.jsonl', + ] + } + ), + download_file=_fake_download(datastore), + ) + + +def test_build_collection_manifest_rejects_malformed_instance_provenance( + tmp_path: Path, +) -> None: + datastore, collection_jsonl = _write_collection_rows( + tmp_path, + [_aggregate()], + include_instance_level=True, + ) + row = json.loads(collection_jsonl.read_text(encoding='utf-8')) + row.pop('instance_sha') + collection_jsonl.write_text(json.dumps(row) + '\n', encoding='utf-8') + + with pytest.raises(community_evals_converter.HFEvalsError, match='missing instance_sha'): + 
community_evals_converter.build_collection_manifest( + collection_name='MMLU-Pro', + datastore='evaleval/EEE_datastore@abc123', + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + +def test_build_collection_manifest_rejects_path_like_collection_name( + tmp_path: Path, +) -> None: + datastore, _collection_jsonl = _write_collection_rows(tmp_path, [_aggregate()]) + + with pytest.raises(community_evals_converter.HFEvalsError, match='without the \\.jsonl'): + community_evals_converter.build_collection_manifest( + collection_name='MMLU-Pro.jsonl', + datastore='evaleval/EEE_datastore@abc123', + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + with pytest.raises(community_evals_converter.HFEvalsError, match='single by_collection'): + community_evals_converter.build_collection_manifest( + collection_name='MMLU-Pro/records', + datastore='evaleval/EEE_datastore@abc123', + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + +def test_build_collection_manifest_records_url_only_result_as_skipped( + tmp_path: Path, +) -> None: + record = _aggregate() + record['evaluation_results'][0]['source_data'] = { + 'dataset_name': 'External Benchmark', + 'source_type': 'url', + 'url': ['https://example.com/not-a-hf-benchmark'], + } + datastore, _collection_jsonl = _write_collection_rows( + tmp_path, [record], collection_name='external' + ) + + manifest = community_evals_converter.build_collection_manifest( + collection_name='external', + datastore='evaleval/EEE_datastore@abc123', + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + assert manifest['entries'] == [] + assert manifest['skipped'][0]['reason'] == 'no_supported_hf_dataset_result' + + +def test_build_index_manifest_downloads_online_record_and_links_source( + tmp_path: Path, +) -> None: + datastore, index_jsonl = _write_index_row(tmp_path, _aggregate()) + api = FakeHfApi(datastore_sha='abc123') + + manifest = community_evals_converter.build_index_manifest( + 
index_jsonl=index_jsonl, + datastore='evaleval/EEE_datastore', + benchmarks=['mmlu_pro'], + api=api, + download_file=_fake_download(datastore), + ) + + assert manifest['source_url_mode'] == 'online_flat_index_jsonl' + assert manifest['datastore'] == 'evaleval/EEE_datastore@abc123' + assert manifest['datastore_input'] == 'evaleval/EEE_datastore' + assert api.repo_info_calls + assert manifest['entries'][0]['target_path'] == '.eval_results/mmlu_pro.yaml' + assert manifest['entries'][0]['yaml_entry']['value'] == 64.1 + assert manifest['entries'][0]['yaml_entry']['source']['url'].startswith( + 'https://huggingface.co/datasets/evaleval/EEE_datastore/blob/abc123/flat/objects/' + ) + + +def test_build_index_manifest_accepts_index_directory(tmp_path: Path) -> None: + datastore, index_jsonl = _write_index_row(tmp_path, _aggregate()) + index_dir = tmp_path / 'flat' / 'indexes' / 'by_benchmark' / 'MMLU-Pro' + index_dir.mkdir(parents=True) + (index_dir / 'aggregate.jsonl').write_text( + index_jsonl.read_text(encoding='utf-8'), + encoding='utf-8', + ) + + manifest = community_evals_converter.build_index_manifest( + index_jsonl=index_dir, + datastore='evaleval/EEE_datastore@abc123', + benchmarks=['mmlu_pro'], + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + assert manifest['index_jsonl'] == (index_dir / 'aggregate.jsonl').as_posix() + assert manifest['entries'][0]['flat_object_path'].startswith( + 'flat/objects/' + ) + + +def test_build_index_manifest_rejects_index_directory_without_aggregate_jsonl( + tmp_path: Path, +) -> None: + index_dir = tmp_path / 'flat' / 'indexes' / 'by_benchmark' / 'MMLU-Pro' + index_dir.mkdir(parents=True) + + with pytest.raises( + community_evals_converter.HFEvalsError, match='must contain aggregate\\.jsonl' + ): + community_evals_converter.build_index_manifest( + index_jsonl=index_dir, + datastore='evaleval/EEE_datastore@abc123', + benchmarks=['mmlu_pro'], + api=FakeHfApi(), + ) + + +def 
test_build_index_manifest_rejects_direct_url_row(tmp_path: Path) -> None: + datastore, index_jsonl = _write_index_row( + tmp_path, + _aggregate(), + row_overrides={ + 'object_path': None, + 'url': 'https://huggingface.co/datasets/evaleval/EEE_datastore/blob/main/flat/objects/test.json', + }, + ) + + with pytest.raises(community_evals_converter.HFEvalsError, match='unsupported.*url'): + community_evals_converter.build_index_manifest( + index_jsonl=index_jsonl, + datastore='evaleval/EEE_datastore@abc123', + benchmarks=['mmlu_pro'], + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + +def test_build_index_manifest_rejects_local_path_row(tmp_path: Path) -> None: + datastore, index_jsonl = _write_index_row( + tmp_path, + _aggregate(), + row_overrides={'object_path': None}, + ) + aggregate_path = next((datastore / 'flat' / 'objects').rglob('*.json')) + row = json.loads(index_jsonl.read_text(encoding='utf-8')) + row['local_path'] = aggregate_path.relative_to(tmp_path).as_posix() + index_jsonl.write_text(json.dumps(row) + '\n', encoding='utf-8') + + with pytest.raises(community_evals_converter.HFEvalsError, match='unsupported.*local_path'): + community_evals_converter.build_index_manifest( + index_jsonl=index_jsonl, + datastore='evaleval/EEE_datastore@abc123', + benchmarks=['mmlu_pro'], + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + +def test_build_index_manifest_rejects_url_even_with_object_path( + tmp_path: Path, +) -> None: + datastore, index_jsonl = _write_index_row( + tmp_path, + _aggregate(), + row_overrides={ + 'url': 'https://huggingface.co/datasets/evaleval/EEE_datastore/blob/main/flat/objects/test.json', + }, + ) + + with pytest.raises(community_evals_converter.HFEvalsError, match='unsupported.*url'): + community_evals_converter.build_index_manifest( + index_jsonl=index_jsonl, + datastore='evaleval/EEE_datastore@abc123', + benchmarks=['mmlu_pro'], + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + +def 
test_build_index_manifest_preserves_subset_from_index_row( + tmp_path: Path, +) -> None: + datastore, index_jsonl = _write_index_row( + tmp_path, + _aggregate(), + row_overrides={'subset': 'overall'}, + ) + + manifest = community_evals_converter.build_index_manifest( + index_jsonl=index_jsonl, + datastore='evaleval/EEE_datastore@abc123', + benchmarks=['mmlu_pro'], + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + assert manifest['entries'][0]['subset'] == 'overall' + + +def test_build_index_manifest_uses_gpqa_subset_for_task_id( + tmp_path: Path, +) -> None: + datastore, index_jsonl = _write_index_row( + tmp_path, + _gpqa_aggregate(), + row_overrides={'benchmark': 'gpqa', 'subset': 'main'}, + ) + + manifest = community_evals_converter.build_index_manifest( + index_jsonl=index_jsonl, + datastore='evaleval/EEE_datastore@abc123', + benchmarks=['gpqa'], + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + yaml_entry = manifest['entries'][0]['yaml_entry'] + assert manifest['entries'][0]['subset'] == 'main' + assert yaml_entry['dataset'] == { + 'id': 'Idavidrein/gpqa', + 'task_id': 'main', + } + assert yaml_entry['notes'] == 'GPQA chain-of-thought' + + +def test_build_index_manifest_rejects_invalid_subset_type( + tmp_path: Path, +) -> None: + datastore, index_jsonl = _write_index_row( + tmp_path, + _aggregate(), + row_overrides={'subset': {'name': 'overall'}}, + ) + + with pytest.raises(community_evals_converter.HFEvalsError, match='subset'): + community_evals_converter.build_index_manifest( + index_jsonl=index_jsonl, + datastore='evaleval/EEE_datastore@abc123', + benchmarks=['mmlu_pro'], + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + +def test_build_index_manifest_accepts_persistent_fixture() -> None: + manifest = community_evals_converter.build_index_manifest( + index_jsonl=FIXTURE_DIR / 'aggregate.jsonl', + datastore='evaleval/EEE_datastore@abc123', + benchmarks=['mmlu_pro'], + api=FakeHfApi(), + 
download_file=_fake_download(FIXTURE_DIR / 'datastore'), + ) + + assert len(manifest['entries']) == 1 + entry = manifest['entries'][0] + assert entry['flat_object_path'] == ( + 'flat/objects/67/6f/676f4465-ce78-411a-9f5a-c97b3d2eac4f.json' + ) + assert entry['yaml_entry']['value'] == 52.29 + assert entry['yaml_entry']['dataset'] == { + 'id': 'TIGER-Lab/MMLU-Pro', + 'task_id': 'mmlu_pro', + } + + +def test_build_index_manifest_fails_on_hash_mismatch(tmp_path: Path) -> None: + datastore, index_jsonl = _write_index_row(tmp_path, _aggregate()) + row = json.loads(index_jsonl.read_text(encoding='utf-8')) + row['sha256'] = '0' * 64 + index_jsonl.write_text(json.dumps(row) + '\n', encoding='utf-8') + + with pytest.raises(community_evals_converter.HFEvalsError, match='sha256 mismatch'): + community_evals_converter.build_index_manifest( + index_jsonl=index_jsonl, + datastore='evaleval/EEE_datastore@abc123', + benchmarks=['mmlu_pro'], + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + +def test_review_index_writes_yaml_and_review(tmp_path: Path) -> None: + datastore, index_jsonl = _write_index_row(tmp_path, _aggregate()) + review = community_evals_converter.review_index_for_hf_evals( + index_jsonl=index_jsonl, + datastore='evaleval/EEE_datastore@abc123', + benchmarks=['mmlu_pro'], + manifest_output_path=tmp_path / 'manifest.json', + yaml_output_dir=tmp_path / 'yamls', + review_output_path=tmp_path / 'review.json', + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + yaml_path = ( + tmp_path + / 'yamls' + / 'google' + / 'gemma-2b-it' + / '.eval_results' + / 'mmlu_pro.yaml' + ) + loaded_yaml = yaml.safe_load(yaml_path.read_text(encoding='utf-8')) + loaded_review = json.loads((tmp_path / 'review.json').read_text(encoding='utf-8')) + + assert review['can_open_prs'] is True + assert loaded_review['can_open_prs'] is True + assert loaded_yaml[0]['dataset'] == { + 'id': 'TIGER-Lab/MMLU-Pro', + 'task_id': 'mmlu_pro', + } + + +def 
test_review_index_writes_yaml_without_reloading_manifest( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + datastore, index_jsonl = _write_index_row(tmp_path, _aggregate()) + + def fail_load_manifest(_path: Path) -> dict: + raise AssertionError('review flow should use the in-memory manifest') + + monkeypatch.setattr(community_evals_converter, 'load_manifest', fail_load_manifest) + + review = community_evals_converter.review_index_for_hf_evals( + index_jsonl=index_jsonl, + datastore='evaleval/EEE_datastore@abc123', + benchmarks=['mmlu_pro'], + manifest_output_path=tmp_path / 'manifest.json', + yaml_output_dir=tmp_path / 'yamls', + review_output_path=tmp_path / 'review.json', + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + assert review['yaml_count'] == 1 + assert (tmp_path / 'manifest.json').exists() + assert (tmp_path / 'review.json').exists() + + +def test_review_index_reports_missing_model_without_aliasing(tmp_path: Path) -> None: + record = _aggregate(model_id='local/missing-model') + datastore, index_jsonl = _write_index_row(tmp_path, record) + + review = community_evals_converter.review_index_for_hf_evals( + index_jsonl=index_jsonl, + datastore='evaleval/EEE_datastore@abc123', + benchmarks=['mmlu_pro'], + manifest_output_path=tmp_path / 'manifest.json', + yaml_output_dir=tmp_path / 'yamls', + review_output_path=tmp_path / 'review.json', + api=FakeHfApi(missing_models={'local/missing-model'}), + download_file=_fake_download(datastore), + ) + + assert review['can_open_prs'] is False + assert review['yaml_count'] == 0 + assert len(review['missing_hf_models']) == 1 + missing = review['missing_hf_models'][0] + assert missing['model_repo'] == 'local/missing-model' + assert missing['status'] == 'missing_hf_model' + assert 'model_repo_alias_from' not in missing + + +def test_review_collection_suppresses_existing_same_score_from_any_yaml_name( + tmp_path: Path, +) -> None: + datastore, _collection_jsonl = _write_collection_rows( 
+ tmp_path, [_aggregate()], collection_name='MMLU-Pro' + ) + model_yaml = tmp_path / 'model_main.yaml' + model_yaml.write_text( + yaml.safe_dump( + [ + { + 'dataset': { + 'id': 'TIGER-Lab/MMLU-Pro', + 'task_id': 'mmlu_pro', + }, + 'value': 64.1, + } + ], + sort_keys=False, + ), + encoding='utf-8', + ) + api = FakeHfApi( + repo_files_by_revision={ + ('google/gemma-2b-it', 'main'): ['.eval_results/not_the_name.yaml'] + } + ) + + review = community_evals_converter.review_collection_for_hf_evals( + collection_name='MMLU-Pro', + datastore='evaleval/EEE_datastore@abc123', + manifest_output_path=tmp_path / 'manifest.json', + yaml_output_dir=tmp_path / 'yamls', + review_output_path=tmp_path / 'review.json', + api=api, + download_file=_fake_download_with_model_files( + datastore, + { + ( + 'google/gemma-2b-it', + 'main', + '.eval_results/not_the_name.yaml', + ): model_yaml + }, + ), + ) + + assert review['can_open_prs'] is False + assert review['yaml_count'] == 0 + assert review['manifest']['entries'][0]['status'] == 'already_present' + assert review['duplicate_audit']['findings'][0]['status'] == 'already_present' + + +def test_review_collection_reports_progress_phases(tmp_path: Path) -> None: + datastore, _collection_jsonl = _write_collection_rows( + tmp_path, + [_aggregate()], + collection_name='MMLU-Pro', + ) + progress = RecordingProgress() + + review = community_evals_converter.review_collection_for_hf_evals( + collection_name='MMLU-Pro', + datastore='evaleval/EEE_datastore@abc123', + manifest_output_path=tmp_path / 'manifest.json', + yaml_output_dir=tmp_path / 'yamls', + review_output_path=tmp_path / 'review.json', + api=FakeHfApi(), + download_file=_fake_download(datastore), + progress=progress, + ) + + joined = '\n'.join(progress.descriptions) + assert review['can_open_prs'] is True + assert 'Downloading collection index MMLU-Pro.jsonl' in joined + assert 'Processing 1 aggregate rows' in joined + assert 'row 1/1: downloading flat/objects/' in joined + assert 'row 
1/1: checking google/gemma-2b-it' in joined + assert 'Auditing 1 ready candidates' in joined + + +def test_rich_review_progress_uses_one_visible_task() -> None: + console = Console(file=io.StringIO(), force_terminal=True) + progress = Progress(console=console) + review_progress = community_evals_converter.RichReviewProgress(progress) + + with progress: + setup_task = review_progress.add_task('Resolving datastore revision', total=4) + review_progress.update(setup_task, advance=4, description='Built manifest') + row_task = review_progress.add_task('Processing 2 aggregate rows', total=2) + review_progress.update(row_task, advance=2, description='Processed 2 rows') + audit_task = review_progress.add_task('Auditing 1 ready candidates', total=1) + review_progress.update(audit_task, advance=1, description='Audit complete') + + assert len(progress.tasks) == 1 + task = progress.tasks[0] + assert task.total == 1 + assert task.completed == 1 + + +def test_review_collection_progress_advances_api_only_rows( + tmp_path: Path, +) -> None: + api_only_record = _aggregate(model_id='anthropic/claude-3-opus') + api_only_record['model_info']['developer'] = 'anthropic' + api_only_record['model_info']['inference_platform'] = 'anthropic' + datastore, _collection_jsonl = _write_collection_rows( + tmp_path, + [_aggregate(), api_only_record], + collection_name='MMLU-Pro', + ) + progress = RecordingProgress() + + review = community_evals_converter.review_collection_for_hf_evals( + collection_name='MMLU-Pro', + datastore='evaleval/EEE_datastore@abc123', + manifest_output_path=tmp_path / 'manifest.json', + yaml_output_dir=tmp_path / 'yamls', + review_output_path=tmp_path / 'review.json', + api=FakeHfApi(), + download_file=_fake_download(datastore), + progress=progress, + ) + + row_task = next( + task_id + for task_id, description in progress.task_initial_descriptions.items() + if description == 'Processing 2 aggregate rows' + ) + joined = '\n'.join(progress.descriptions) + assert 
review['can_open_prs'] is True + assert progress.advance_by_task[row_task] == 2 + assert 'Processed 2 aggregate rows' in joined + + +def test_review_collection_reuses_cached_review_without_downloads( + tmp_path: Path, +) -> None: + datastore, _collection_jsonl = _write_collection_rows( + tmp_path, + [_aggregate()], + collection_name='MMLU-Pro', + ) + manifest_path = tmp_path / 'manifest.json' + yaml_dir = tmp_path / 'yamls' + review_path = tmp_path / 'review.json' + first_review = community_evals_converter.review_collection_for_hf_evals( + collection_name='MMLU-Pro', + datastore='evaleval/EEE_datastore@abc123', + manifest_output_path=manifest_path, + yaml_output_dir=yaml_dir, + review_output_path=review_path, + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + def fail_download(**_kwargs) -> str: + raise AssertionError('cached review should not download anything') + + api = FakeHfApi(missing_models={'google/gemma-2b-it'}) + second_review = community_evals_converter.review_collection_for_hf_evals( + collection_name='MMLU-Pro', + datastore='evaleval/EEE_datastore@abc123', + manifest_output_path=manifest_path, + yaml_output_dir=yaml_dir, + review_output_path=review_path, + api=api, + download_file=fail_download, + ) + + assert second_review['created_at'] == first_review['created_at'] + assert second_review['yaml_count'] == 1 + assert api.model_info_calls == [] + + +def test_review_collection_force_ignores_cached_review( + tmp_path: Path, +) -> None: + datastore, _collection_jsonl = _write_collection_rows( + tmp_path, + [_aggregate()], + collection_name='MMLU-Pro', + ) + manifest_path = tmp_path / 'manifest.json' + yaml_dir = tmp_path / 'yamls' + review_path = tmp_path / 'review.json' + community_evals_converter.review_collection_for_hf_evals( + collection_name='MMLU-Pro', + datastore='evaleval/EEE_datastore@abc123', + manifest_output_path=manifest_path, + yaml_output_dir=yaml_dir, + review_output_path=review_path, + api=FakeHfApi(), + 
download_file=_fake_download(datastore), + ) + + api = FakeHfApi(missing_models={'google/gemma-2b-it'}) + forced_review = community_evals_converter.review_collection_for_hf_evals( + collection_name='MMLU-Pro', + datastore='evaleval/EEE_datastore@abc123', + manifest_output_path=manifest_path, + yaml_output_dir=yaml_dir, + review_output_path=review_path, + api=api, + download_file=_fake_download(datastore), + force=True, + ) + + assert forced_review['can_open_prs'] is False + assert forced_review['yaml_count'] == 0 + assert api.model_info_calls == ['google/gemma-2b-it'] + + +def test_review_collection_resumes_cached_manifest_without_datastore_downloads( + tmp_path: Path, +) -> None: + datastore, _collection_jsonl = _write_collection_rows( + tmp_path, + [_aggregate()], + collection_name='MMLU-Pro', + ) + manifest_path = tmp_path / 'manifest.json' + community_evals_converter.build_collection_manifest( + collection_name='MMLU-Pro', + datastore='evaleval/EEE_datastore@abc123', + output_path=manifest_path, + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + + def fail_download(**_kwargs) -> str: + raise AssertionError('cached manifest should skip datastore downloads') + + api = FakeHfApi(missing_models={'google/gemma-2b-it'}) + review = community_evals_converter.review_collection_for_hf_evals( + collection_name='MMLU-Pro', + datastore='evaleval/EEE_datastore@abc123', + manifest_output_path=manifest_path, + yaml_output_dir=tmp_path / 'yamls', + review_output_path=tmp_path / 'review.json', + api=api, + download_file=fail_download, + ) + + assert review['can_open_prs'] is True + assert review['yaml_count'] == 1 + assert api.model_info_calls == [] + + +def test_review_details_use_clear_headers_and_aggregate_existing_scores() -> None: + console = Console(record=True, width=200) + review = { + 'duplicate_audit': { + 'errors': [ + { + 'entry_index': 0, + 'model_repo': 'google/gemma-blocked', + 'stage': 'read_open_pr_eval_results', + 'path': 
'.eval_results/mmlu_pro.yaml', + 'error': 'Unable to download eval results YAML', + } + ], + 'findings': [ + { + 'status': 'score_conflict', + 'model_repo': 'nexusflow/athene-v2-chat', + 'existing_value': 73.11, + 'candidate_value': 70.21, + 'pr_url': 'https://huggingface.co/example/repo/discussions/1', + 'paths': ['.eval_results/mmlu_pro.yaml'], + }, + {'status': 'already_present'}, + {'status': 'already_present'}, + ], + }, + 'missing_hf_models': [ + { + 'model_repo': 'missing/model', + 'hf_check_error': 'HF model repo does not exist: missing/model', + 'eee_record_path': 'flat/objects/aa/bb/record.json', + 'yaml_entry': { + 'source': { + 'url': ( + 'https://huggingface.co/datasets/evaleval/' + 'EEE_datastore/blob/abc123/flat/objects/aa/bb/' + 'record.json' + ) + } + }, + } + ], + 'manifest': { + 'datastore_repo': 'evaleval/EEE_datastore', + 'datastore_revision': 'abc123', + 'skipped': [ + { + 'model_id': 'api/model', + 'reason': 'api_only_or_closed_provider:gemini', + 'eee_record_path': 'flat/objects/cc/dd/skipped.json', + } + ], + }, + } + + community_evals_converter._render_review_details(console, review) + + output = console.export_text() + assert 'Needs Attention' in output + assert 'Issue' in output + assert 'Where' in output + assert 'Details' in output + assert 'Candidate' not in output + assert 'Context' not in output + assert 'Score' not in output + assert '2 models' in output + assert 'https://huggingface.co/example/repo/discussions/1' in output + assert ( + 'https://huggingface.co/datasets/evaleval/EEE_datastore/blob/abc123/' + 'flat/objects/aa/bb/record.json' + ) in output + assert ( + 'https://huggingface.co/datasets/evaleval/EEE_datastore/blob/abc123/' + 'flat/objects/cc/dd/skipped.json' + ) in output + + +def test_review_collection_submits_clean_records_despite_open_pr_conflict( + tmp_path: Path, +) -> None: + conflict_record = _aggregate(model_id='google/gemma-2b-it') + clean_record = _aggregate(model_id='google/gemma-clean') + datastore, 
_collection_jsonl = _write_collection_rows( + tmp_path, + [conflict_record, clean_record], + collection_name='MMLU-Pro', + ) + pr_yaml = tmp_path / 'model_pr.yaml' + pr_yaml.write_text( + yaml.safe_dump( + [ + { + 'dataset': { + 'id': 'TIGER-Lab/MMLU-Pro', + 'task_id': 'mmlu_pro', + }, + 'value': 12.3, + } + ], + sort_keys=False, + ), + encoding='utf-8', + ) + api = FakeHfApi( + repo_files_by_revision={ + ('google/gemma-2b-it', 'refs/pr/7'): [ + '.eval_results/random.yaml' + ], + }, + discussions={ + 'google/gemma-2b-it': [ + FakeDiscussion(git_reference='refs/pr/7', num=7) + ], + }, + ) + + review = community_evals_converter.review_collection_for_hf_evals( + collection_name='MMLU-Pro', + datastore='evaleval/EEE_datastore@abc123', + manifest_output_path=tmp_path / 'manifest.json', + yaml_output_dir=tmp_path / 'yamls', + review_output_path=tmp_path / 'review.json', + api=api, + download_file=_fake_download_with_model_files( + datastore, + { + ( + 'google/gemma-2b-it', + 'refs/pr/7', + '.eval_results/random.yaml', + ): pr_yaml + }, + ), + ) + + assert review['can_open_prs'] is True + assert review['yaml_count'] == 1 + statuses = { + entry['model_repo']: entry['status'] + for entry in review['manifest']['entries'] + } + assert statuses == { + 'google/gemma-2b-it': 'score_conflict', + 'google/gemma-clean': 'ready', + } + assert review['duplicate_audit']['findings'][0]['status'] == 'score_conflict' + + +def test_review_collection_blocks_only_candidate_with_audit_error( + tmp_path: Path, +) -> None: + blocked_record = _aggregate(model_id='google/gemma-blocked') + clean_record = _aggregate(model_id='google/gemma-clean') + datastore, _collection_jsonl = _write_collection_rows( + tmp_path, + [blocked_record, clean_record], + collection_name='MMLU-Pro', + ) + api = FakeHfApi( + repo_files_by_revision={ + ('google/gemma-blocked', 'refs/pr/7'): [ + '.eval_results/mmlu_pro.yaml' + ], + }, + discussions={ + 'google/gemma-blocked': [ + FakeDiscussion(git_reference='refs/pr/7', 
num=7) + ], + }, + ) + + review = community_evals_converter.review_collection_for_hf_evals( + collection_name='MMLU-Pro', + datastore='evaleval/EEE_datastore@abc123', + manifest_output_path=tmp_path / 'manifest.json', + yaml_output_dir=tmp_path / 'yamls', + review_output_path=tmp_path / 'review.json', + api=api, + download_file=_fake_download_with_model_files(datastore, {}), + ) + + statuses = { + entry['model_repo']: entry['status'] + for entry in review['manifest']['entries'] + } + assert review['can_open_prs'] is True + assert statuses == { + 'google/gemma-blocked': 'audit_error', + 'google/gemma-clean': 'ready', + } + assert review['duplicate_audit']['error_count'] == 1 + assert review['duplicate_audit']['errors'][0]['entry_index'] == 0 + assert review['audit_blocked_entries'][0]['model_repo'] == ( + 'google/gemma-blocked' + ) + assert review['yaml_count'] == 2 + assert ( + tmp_path + / 'yamls' + / 'google' + / 'gemma-blocked' + / '.eval_results' + / 'mmlu_pro.yaml' + ).exists() + + submit_api = FakeHfApi() + result = community_evals_converter.create_prs_from_manifest( + manifest_path=tmp_path / 'manifest.json', + limit=None, + yes_i_reviewed=True, + commit_message='Add EvalEval result', + api=submit_api, + ) + + assert result['count'] == 1 + assert submit_api.commits[0]['repo_id'] == 'google/gemma-clean' + + +def test_create_prs_from_manifest_creates_fresh_pr_only(tmp_path: Path) -> None: + datastore, index_jsonl = _write_index_row(tmp_path, _aggregate()) + manifest_path = tmp_path / 'manifest.json' + community_evals_converter.build_index_manifest( + index_jsonl=index_jsonl, + datastore='evaleval/EEE_datastore@abc123', + benchmarks=['mmlu_pro'], + output_path=manifest_path, + api=FakeHfApi(), + download_file=_fake_download(datastore), + ) + api = FakeHfApi( + discussions={ + 'google/gemma-2b-it': [ + FakeDiscussion(git_reference='refs/pr/123'), + ] + } + ) + + result = community_evals_converter.create_prs_from_manifest( + manifest_path=manifest_path, + 
limit=None, + yes_i_reviewed=True, + commit_message='Add EvalEval result', + api=api, + ) + + assert result['count'] == 1 + assert api.discussion_calls == [] + assert len(api.commits) == 1 + commit = api.commits[0] + assert commit['repo_id'] == 'google/gemma-2b-it' + assert commit['revision'] == 'main' + assert commit['create_pr'] is True + assert [op.__class__.__name__ for op in commit['operations']] == [ + 'CommitOperationAdd' + ] + + +def test_tui_approval_requires_exact_phrase(monkeypatch) -> None: + console = Console(record=True) + review = { + 'manifest': { + 'entries': [ + { + 'status': 'ready', + 'model_repo': 'google/gemma-2b-it', + 'target_path': '.eval_results/mmlu_pro.yaml', + } + ] + } + } + + monkeypatch.setattr(community_evals_converter.Prompt, 'ask', lambda *_args, **_kwargs: 'yes') + + assert community_evals_converter._approve_pr_submission(console, review) is False + + +def test_tui_approval_accepts_open_prs(monkeypatch) -> None: + console = Console(record=True) + review = { + 'manifest': { + 'entries': [ + { + 'status': 'ready', + 'model_repo': 'google/gemma-2b-it', + 'target_path': '.eval_results/mmlu_pro.yaml', + } + ] + } + } + + monkeypatch.setattr( + community_evals_converter.Prompt, + 'ask', + lambda *_args, **_kwargs: community_evals_converter.APPROVAL_PHRASE, + ) + + assert community_evals_converter._approve_pr_submission(console, review) is True + + +def test_prompt_commit_message_requires_non_empty(monkeypatch) -> None: + console = Console(record=True) + + monkeypatch.setattr(community_evals_converter.Prompt, 'ask', lambda *_args, **_kwargs: ' ') + + assert community_evals_converter._prompt_commit_message(console) is None + + +def test_prompt_commit_message_returns_typed_message(monkeypatch) -> None: + console = Console(record=True) + + monkeypatch.setattr( + community_evals_converter.Prompt, + 'ask', + lambda *_args, **_kwargs: 'Add verified EvalEval result', + ) + + assert ( + community_evals_converter._prompt_commit_message(console) 
+ == 'Add verified EvalEval result' + ) + + +def test_parser_rejects_removed_open_prs_flag() -> None: + parser = community_evals_converter.build_parser() + + with pytest.raises(SystemExit): + parser.parse_args( + [ + 'aggregate.jsonl', + '--datastore', + 'evaleval/EEE_datastore@abc123', + '--open-prs', + ] + ) + + +def test_parser_rejects_old_index_workflow_flags() -> None: + parser = community_evals_converter.build_parser() + + with pytest.raises(SystemExit): + parser.parse_args(['MMLU-Pro', '--benchmarks', 'mmlu_pro']) + + with pytest.raises(SystemExit): + parser.parse_args(['MMLU-Pro', '--manifest-output', 'manifest.json']) + + +def test_parser_defaults_to_datastore_repo() -> None: + parser = community_evals_converter.build_parser() + + args = parser.parse_args(['MMLU-Pro']) + + assert args.collection_name == 'MMLU-Pro' + assert args.datastore == 'evaleval/EEE_datastore' + assert args.force is False + + +def test_parser_accepts_force() -> None: + parser = community_evals_converter.build_parser() + + args = parser.parse_args(['MMLU-Pro', '--force']) + + assert args.force is True + + +def test_every_eval_ever_cli_no_longer_exposes_hf_evals() -> None: + parser = cli.build_parser() + + with pytest.raises(SystemExit): + parser.parse_args(['hf-evals']) diff --git a/tools/hf-community-evals/README.md b/tools/hf-community-evals/README.md new file mode 100644 index 000000000..5730a0dad --- /dev/null +++ b/tools/hf-community-evals/README.md @@ -0,0 +1,126 @@ +# EEE -> HF Community Evals + +Built by Harsha Nelaturu, June 2026. + +Use `tools/community_evals_converter.py` to review one EEE datastore collection, generate +local HF Community Evals YAML previews, audit existing scores/open PRs, and +optionally open PRs after explicit approval. + +## Quick Start + +Use `uv run` for all commands. 
+ +```bash +uv run tools/hf-community-evals/community_evals_converter.py MMLU-Pro \ + --datastore evaleval/EEE_datastore@main +``` + +Results are cached per collection. To ignore the cache and force a fresh rebuild: + +```bash +uv run tools/hf-community-evals/community_evals_converter.py MMLU-Pro \ + --datastore evaleval/EEE_datastore@main \ + --force +``` + +The positional argument is a collection stem. It must resolve exactly to: + +```text +https://huggingface.co/datasets/evaleval/EEE_datastore/flat/indexes/by_collection/<collection>.jsonl +``` + +## Outputs + +For `MMLU-Pro`, outputs are written under: + +```text +outputs/community_evals_converter_MMLU-Pro/ +``` + +Important output files: + +- `manifest.json`: converted candidate records plus skipped/error metadata. +- `review.json`: full review result, duplicate audit findings, audit errors, + and PR readiness. +- `yamls/<org>/<model>/.eval_results/<benchmark>.yaml`: local YAML previews. + +`outputs/` is ignored by git. Use these files for inspection, not as merge +inputs. + +## Review Behavior + +The tool: + +- downloads the collection JSONL and referenced aggregate objects from the HF + datastore; +- validates object hashes and optional sizes; +- scans each aggregate record for supported HF benchmark datasets; +- writes YAML entries using the datastore object HF URL as `source.url`; +- keeps flat datastore provenance, including instance-level references when + present; +- checks model repo existence on Hugging Face; +- audits every existing `.eval_results/*.yaml` file on model `main`; +- audits changed `.eval_results/*.yaml` files in open PR refs; +- compares by dataset/task content, not YAML filename.
+ +Supported benchmarks in this workflow are: + +- `mmlu_pro` +- `gpqa` +- `hle` +- `gsm8k` + +## Resume And Force + +Default reruns reuse exact-match local outputs: + +- matching completed `review.json`: skips collection downloads, model checks, + and duplicate audit; +- matching pre-audit `manifest.json`: skips collection downloads and model + checks, then resumes at duplicate audit. + +The cache must match collection name, datastore input, and HF-check mode. +Invalid exact-match cache files are hard errors. Use `--force` when you want to +ignore the cache and rebuild from the datastore. + +## TUI +The final report has: + +- `Community Evals Converter`: summary counts. +- `Needs Attention`: capped triage table for blockers and exclusions. + +`Needs Attention` uses: + +- `Issue`: `audit_error`, `score_conflict`, `already_present`, + `missing_hf_model`, or `skipped`. +- `Model`: model repo or aggregate model id. +- `Details`: reason or score comparison. +- `Action`: `exclude`, `block entry`, `block all`, or source line. +- `Where`: terminal hyperlink to the HF model PR/file or HF datastore blob URL. + +Repeated same-score `already_present` findings are summarized as one count row. +Full details remain in `review.json`. + +## PR Submission + +The tool only opens PRs after both prompts succeed: + +1. Type exactly: + + ```text + OPEN PRS + ``` + +2. Enter a non-empty commit message. + +Only `status = ready` entries are submitted. + +Excluded statuses: + +- `already_present`: same score already exists. +- `score_conflict`: different score already exists. +- `missing_hf_model`: model repo does not resolve on HF. +- `audit_error`: candidate-scoped audit failure. + +Candidate-scoped audit errors block only that candidate. Audit errors without a +manifest entry block all PR submission. 
\ No newline at end of file diff --git a/tools/hf-community-evals/community_evals_converter.py b/tools/hf-community-evals/community_evals_converter.py new file mode 100644 index 000000000..c37562069 --- /dev/null +++ b/tools/hf-community-evals/community_evals_converter.py @@ -0,0 +1,2945 @@ +from __future__ import annotations + +import argparse +import hashlib +import json +import sys +from dataclasses import dataclass +from datetime import UTC, datetime +from difflib import get_close_matches +from math import isfinite +from pathlib import Path +from typing import Any, Callable +from urllib.parse import quote + +import requests +import yaml +from huggingface_hub import ( + CommitOperationAdd, + HfApi, + hf_hub_download, +) +from huggingface_hub.errors import EntryNotFoundError +from rich.console import Console +from rich.panel import Panel +from rich.progress import ( + BarColumn, + Progress, + SpinnerColumn, + TaskID, + TextColumn, + TimeElapsedColumn, +) +from rich.prompt import Prompt +from rich.table import Column, Table +from rich.text import Text + +from every_eval_ever.eval_types import EvaluationLog, EvaluationResult + +SOURCE_NAME = 'EvalEval' +MANIFEST_VERSION = 1 +DEFAULT_DATASTORE_REVISION = 'main' +DEFAULT_DATASTORE_REPO = 'evaleval/EEE_datastore' +DEFAULT_BENCHMARKS = ('gpqa', 'hle', 'mmlu_pro', 'gsm8k') +DEFAULT_PR_COMMIT_DESCRIPTION = ( + 'Adds EvalEval Community Evals YAML entries with source backlinks to EEE ' + 'aggregate records.\n\n' + 'Contributor: evaleval' +) +AUDIT_ERROR_STATUS = 'audit_error' + + +class HFEvalsError(RuntimeError): + """Raised when HF Community Evals export cannot proceed safely.""" + + +class ReviewProgress: + def add_task(self, description: str, total: int | None = None) -> int: + return 0 + + def update( + self, + task_id: int, + *, + advance: int = 0, + description: str | None = None, + total: int | None = None, + ) -> None: + return None + + +class RichReviewProgress(ReviewProgress): + def __init__(self, progress: 
Progress) -> None: + self.progress = progress + self.rich_task_id: TaskID | None = None + self.next_task_id = 0 + self.total_by_task: dict[int, int] = {} + self.completed_by_task: dict[int, int] = {} + self.active_task_id: int | None = None + + def add_task(self, description: str, total: int | None = None) -> int: + self.next_task_id += 1 + task_id = self.next_task_id + task_total = total or 0 + self.total_by_task[task_id] = task_total + self.completed_by_task[task_id] = 0 + self.active_task_id = task_id + if self.rich_task_id is None: + self.rich_task_id = self.progress.add_task( + description, + total=task_total, + ) + else: + self.progress.update( + self.rich_task_id, + description=description, + completed=0, + total=task_total, + ) + return task_id + + def update( + self, + task_id: int, + *, + advance: int = 0, + description: str | None = None, + total: int | None = None, + ) -> None: + if self.rich_task_id is None: + self.rich_task_id = self.progress.add_task( + description or 'Reviewing', + total=0, + ) + if total is not None: + self.total_by_task[task_id] = total + self.active_task_id = task_id + self.completed_by_task[task_id] = ( + self.completed_by_task.get(task_id, 0) + advance + ) + kwargs: dict[str, Any] = { + 'completed': self.completed_by_task[task_id], + 'total': self.total_by_task.get(task_id, 0), + } + if description is not None: + kwargs['description'] = description + self.progress.update(self.rich_task_id, **kwargs) + + +@dataclass(frozen=True) +class BenchmarkConfig: + dataset_id: str + task_id: str + yaml_name: str + dataset_aliases: tuple[str, ...] = () + preferred_metric_ids: tuple[str, ...] 
= () + + +BENCHMARK_CONFIGS: dict[str, BenchmarkConfig] = { + 'gpqa': BenchmarkConfig( + 'Idavidrein/gpqa', + 'diamond', + 'gpqa_diamond', + ('reasoningMIA/gpqa_diamond',), + ), + 'hle': BenchmarkConfig( + 'cais/hle', + 'default', + 'hle', + preferred_metric_ids=('hle.accuracy', 'hle/accuracy'), + ), + 'mmlu_pro': BenchmarkConfig( + 'TIGER-Lab/MMLU-Pro', + 'mmlu_pro', + 'mmlu_pro', + preferred_metric_ids=( + 'mmlu_pro/overall', + 'mmlu-pro::chain-of-thought-correctness', + ), + ), + 'gsm8k': BenchmarkConfig('openai/gsm8k', 'gsm8k', 'gsm8k', ('gsm8k',)), +} +BENCHMARK_ALIASES = { + 'gpqa_diamond': 'gpqa', +} +HF_TIMEOUT_SECONDS = 10 +OPEN_WEIGHT_MODEL_PREFIXES = ('openai/gpt-oss',) +API_ONLY_PROVIDER_PREFIXES = ( + 'anthropic', + 'gemini', + 'grok', + 'mistral', + 'openai', + 'xai', +) +GPQA_SUBSET_NOTES = { + 'diamond': 'GPQA Diamond', + 'gpqa_diamond': 'GPQA Diamond', + 'main': 'GPQA chain-of-thought', + 'chain_of_thought': 'GPQA chain-of-thought', + 'cot': 'GPQA chain-of-thought', +} +EVAL_RESULT_PATH_FAMILIES = { + 'gpqa': ( + '.eval_results/gpqa_diamond.yaml', + '.eval_results/gpqa-diamond.yaml', + '.eval_results/gpqa.yaml', + ), + 'gsm8k': ('.eval_results/gsm8k.yaml',), + 'hle': ('.eval_results/hle.yaml',), + 'mmlu_pro': ( + '.eval_results/mmlu_pro.yaml', + '.eval_results/mmlu-pro.yaml', + ), +} + + +def normalize_benchmark(value: str) -> str: + return value.strip().lower().replace('-', '_') + + +def parse_benchmarks(raw: str | None) -> list[str]: + if raw is None: + return list(DEFAULT_BENCHMARKS) + benchmarks = [ + BENCHMARK_ALIASES.get(normalize_benchmark(item), normalize_benchmark(item)) + for item in raw.split(',') + ] + benchmarks = [item for item in benchmarks if item] + unknown = sorted(set(benchmarks) - set(BENCHMARK_CONFIGS)) + if unknown: + raise HFEvalsError(f'Unsupported benchmark(s): {", ".join(unknown)}') + if not benchmarks: + raise HFEvalsError('At least one benchmark is required.') + return benchmarks + + +def parse_datastore_locator(value: 
str) -> tuple[str, str | None]: + raw = value.strip() + if not raw: + raise HFEvalsError('Datastore must be [@].') + if raw.count('@') > 1: + raise HFEvalsError('Datastore must be [@].') + if '@' in raw: + repo_id, revision = ( + part.strip().strip('/') for part in raw.split('@', 1) + ) + else: + repo_id = raw.strip().strip('/') + revision = None + if not repo_id or '/' not in repo_id: + raise HFEvalsError('Datastore repo must look like org/dataset.') + if revision is not None and not revision: + raise HFEvalsError('Datastore revision must not be empty.') + return repo_id, revision + + +def resolve_datastore_locator(value: str, *, api: HfApi) -> tuple[str, str]: + repo_id, revision = parse_datastore_locator(value) + if revision is not None: + return repo_id, revision + + try: + info = api.repo_info( + repo_id=repo_id, + repo_type='dataset', + revision=DEFAULT_DATASTORE_REVISION, + ) + except Exception as exc: # noqa: BLE001 + raise HFEvalsError( + f'Unable to resolve latest datastore commit for {repo_id}' + ) from exc + + sha = getattr(info, 'sha', None) + if sha is None and isinstance(info, dict): + sha = info.get('sha') + if not isinstance(sha, str) or not sha.strip(): + raise HFEvalsError( + f'HF dataset repo did not return a commit sha for {repo_id}' + ) + return repo_id, sha.strip() + + +def dump_yaml_entries(entries: list[dict[str, Any]]) -> str: + return yaml.safe_dump( + entries, + sort_keys=False, + allow_unicode=False, + width=88, + ) + + +def _is_real_hf_api(api: HfApi) -> bool: + return api.__class__ is HfApi + + +def _hf_model_api_url(repo_id: str) -> str: + return f'https://huggingface.co/api/models/{quote(repo_id, safe="/")}' + + +def _http_model_info(repo_id: str) -> dict[str, Any]: + try: + response = requests.get( + _hf_model_api_url(repo_id), timeout=HF_TIMEOUT_SECONDS + ) + except requests.RequestException as exc: + raise HFEvalsError(f'Unable to check HF model repo: {repo_id}') from exc + if response.status_code == 404: + raise 
HFEvalsError(f'HF model repo does not exist: {repo_id}') + try: + response.raise_for_status() + except requests.HTTPError as exc: + raise HFEvalsError(f'Unable to check HF model repo: {repo_id}') from exc + loaded = response.json() + if not isinstance(loaded, dict): + raise HFEvalsError(f'HF model API returned invalid data: {repo_id}') + return loaded + + +def _repo_exists(api: HfApi, repo_id: str) -> None: + if _is_real_hf_api(api): + _http_model_info(repo_id) + return + try: + api.model_info(repo_id) + except Exception as exc: # noqa: BLE001 - preserve HF client details + raise HFEvalsError(f'HF model repo does not exist: {repo_id}') from exc + + +def _datastore_blob_url( + path: str, + *, + datastore_revision: str, + datastore_repo: str = DEFAULT_DATASTORE_REPO, +) -> str: + repo = datastore_repo.strip().strip('/') + if not repo: + raise HFEvalsError('Datastore repo must not be empty.') + revision = datastore_revision.strip() + if not revision: + raise HFEvalsError('Datastore revision must not be empty.') + return ( + f'https://huggingface.co/datasets/{quote(repo, safe="/")}/blob/' + f'{quote(revision, safe="")}/' + f'{quote(path, safe="/")}' + ) + + +def _date_from_result(log: EvaluationLog, result: EvaluationResult) -> str | None: + value = result.evaluation_timestamp or log.evaluation_timestamp + if value is None: + return None + try: + if value.replace('.', '', 1).isdigit(): + return datetime.fromtimestamp(float(value), tz=UTC).date().isoformat() + except ValueError: + pass + if len(value) >= 10: + return value[:10] + raise HFEvalsError(f'Invalid evaluation timestamp: {value!r}') + + +def _dataset_ids_for_config(config: BenchmarkConfig) -> set[str]: + return { + config.dataset_id.strip().lower(), + *(alias.strip().lower() for alias in config.dataset_aliases), + } + + +def _result_matches_dataset( + result: EvaluationResult, config: BenchmarkConfig +) -> bool: + if result.source_data.source_type == 'hf_dataset': + hf_repo = (result.source_data.hf_repo or 
'').strip().lower() + if hf_repo in _dataset_ids_for_config(config): + return True + additional_details = result.source_data.additional_details or {} + if isinstance(additional_details, dict): + benchmark_hf_repo = ( + str(additional_details.get('benchmark_hf_repo') or '') + .strip() + .lower() + ) + if benchmark_hf_repo in _dataset_ids_for_config(config): + return True + if hf_repo: + return False + dataset_name = normalize_benchmark(result.source_data.dataset_name) + return dataset_name == normalize_benchmark(config.task_id) + + if result.source_data.source_type == 'url': + dataset_urls = [ + url.strip().lower().rstrip('/') + for url in result.source_data.url + if isinstance(url, str) + ] + return any( + url.endswith(f'/datasets/{dataset_id}') + for dataset_id in _dataset_ids_for_config(config) + for url in dataset_urls + ) + + return False + + +def _result_matches_preferred_metric( + result: EvaluationResult, config: BenchmarkConfig +) -> bool: + if not config.preferred_metric_ids: + return True + allowed = {item.strip().lower() for item in config.preferred_metric_ids} + result_ids = { + item.strip().lower() + for item in ( + result.evaluation_result_id, + result.metric_config.metric_id, + ) + if isinstance(item, str) + } + return bool(allowed & result_ids) + + +def _result_for_dataset( + log: EvaluationLog, config: BenchmarkConfig +) -> EvaluationResult | None: + matches = [ + result + for result in log.evaluation_results + if _result_matches_dataset(result, config) + and _result_matches_preferred_metric(result, config) + ] + if not matches: + return None + if len(matches) != 1: + ids = [ + result.evaluation_result_id or result.evaluation_name + for result in matches + ] + raise HFEvalsError( + f'{log.evaluation_id} has {len(matches)} matching ' + f'evaluation_results for {config.dataset_id}: {ids}' + ) + return matches[0] + + +def _results_for_supported_datasets( + log: EvaluationLog, +) -> list[tuple[str, BenchmarkConfig, EvaluationResult]]: + results: 
list[tuple[str, BenchmarkConfig, EvaluationResult]] = [] + for benchmark, config in BENCHMARK_CONFIGS.items(): + matches = [ + result + for result in log.evaluation_results + if _result_matches_dataset(result, config) + and _result_matches_preferred_metric(result, config) + ] + if len(matches) > 1: + ids = [ + result.evaluation_result_id or result.evaluation_name + for result in matches + ] + raise HFEvalsError( + f'{log.evaluation_id} has {len(matches)} matching ' + f'evaluation_results for {config.dataset_id}: {ids}' + ) + if matches: + results.append((benchmark, config, matches[0])) + return results + + +def _community_eval_entry( + *, + config: BenchmarkConfig, + task_id: str, + value: float | int, + date: str | None, + source_url: str, + notes: str | None = None, +) -> dict[str, Any]: + entry: dict[str, Any] = { + 'dataset': {'id': config.dataset_id, 'task_id': task_id}, + 'value': value, + 'source': {'url': source_url, 'name': SOURCE_NAME}, + } + if date is not None: + entry['date'] = date + if notes is not None: + entry['notes'] = notes + return entry + + +def _gpqa_variant_notes(result: EvaluationResult) -> str | None: + source_data = result.source_data + hf_repo = (source_data.hf_repo or '').strip().lower() + dataset_name = normalize_benchmark(source_data.dataset_name).replace(' ', '_') + result_id = (result.evaluation_result_id or '').strip().lower() + metric_name = (result.metric_config.metric_name or '').strip().lower() + + if ( + hf_repo == 'human-centered-eval/openeval' + and dataset_name == 'gpqa' + and ( + result_id == 'gpqa::chain-of-thought-correctness' + or metric_name == 'chain_of_thought_correctness' + ) + ): + return 'GPQA chain-of-thought' + + if dataset_name == 'gpqa_diamond' or result_id.startswith('gpqa_diamond/'): + return 'GPQA Diamond' + + return None + + +def _community_eval_notes(benchmark: str, result: EvaluationResult) -> str | None: + if benchmark == 'gpqa': + return _gpqa_variant_notes(result) + return None + + +def 
_community_eval_notes_for_subset( + benchmark: str, + subset: str | None, +) -> str | None: + if subset is None: + return None + if benchmark != 'gpqa': + return None + normalized_subset = normalize_benchmark(subset) + try: + return GPQA_SUBSET_NOTES[normalized_subset] + except KeyError as exc: + allowed = ', '.join(sorted(GPQA_SUBSET_NOTES)) + raise HFEvalsError( + f'Unsupported subset for gpqa: {subset!r}; expected one of {allowed}.' + ) from exc + + +def _community_eval_task_id( + benchmark: str, + config: BenchmarkConfig, + result: EvaluationResult, + notes: str | None, +) -> str: + if benchmark == 'gpqa': + if notes == 'GPQA chain-of-thought': + return 'main' + if notes == 'GPQA Diamond': + return 'diamond' + return config.task_id + + +def _community_eval_value(result: EvaluationResult) -> float | int: + score = result.score_details.score + if ( + isinstance(score, bool) + or not isinstance(score, (int, float)) + or not isfinite(float(score)) + ): + raise HFEvalsError('score_details.score must be numeric') + + value: float + metric_unit = (result.metric_config.metric_unit or '').strip().lower() + max_score = result.metric_config.max_score + if metric_unit in {'percent', 'percentage'} or max_score == 100: + value = float(score) + elif metric_unit == 'proportion' or max_score == 1: + value = float(score) * 100 + else: + raise HFEvalsError( + 'Cannot convert score to 0-100 Community Evals value without ' + f'metric_unit=proportion/percent or max_score=1/100 for ' + f'{result.evaluation_result_id!r}.' + ) + + if value < 0 or value > 100: + raise HFEvalsError( + f'Community Evals value for {result.evaluation_result_id!r} ' + f'must be in the 0-100 range, got {value}.' 
+ ) + return round(value, 10) + + +def _target_path(config: BenchmarkConfig) -> str: + return f'.eval_results/{config.yaml_name}.yaml' + + +def _entry_is_ready(entry: dict[str, Any]) -> bool: + return entry.get('status', 'ready') == 'ready' + + +def _entry_has_yaml_preview(entry: dict[str, Any]) -> bool: + return entry.get('status', 'ready') in {'ready', AUDIT_ERROR_STATUS} + + +def _api_only_skip_reason(log: EvaluationLog) -> str | None: + platform = (log.model_info.inference_platform or '').strip().lower() + developer = (log.model_info.developer or '').strip().lower() + model_id = log.model_info.id.strip().lower() + model_name = log.model_info.name.strip().lower() + if any( + model_id == prefix or model_id.startswith(prefix) + for prefix in OPEN_WEIGHT_MODEL_PREFIXES + ): + return None + if any( + model_name == prefix or model_name.startswith(prefix) + for prefix in OPEN_WEIGHT_MODEL_PREFIXES + ): + return None + values = (platform, developer, model_id, model_name) + for prefix in API_ONLY_PROVIDER_PREFIXES: + if any(value == prefix or value.startswith(f'{prefix}/') for value in values): + return f'api_only_or_closed_provider:{prefix}' + if 'gemini' in model_id or 'gemini' in model_name: + return 'api_only_or_closed_provider:gemini' + return None + + +def _safe_index_path(row: dict[str, Any], field: str, *, line_ref: str) -> str: + value = row.get(field) + if not isinstance(value, str) or not value.strip(): + raise HFEvalsError(f'{line_ref}: missing {field}') + path = Path(value) + if path.is_absolute() or '..' 
in path.parts: + raise HFEvalsError(f'{line_ref}: unsafe {field}: {value}') + return value + + +def _index_subset(row: dict[str, Any], *, line_ref: str) -> str | None: + value = row.get('subset') + if value is None: + return None + if not isinstance(value, str) or not value.strip(): + raise HFEvalsError(f'{line_ref}: subset must be a non-empty string') + return value.strip() + + +def _reject_unsupported_row_sources(row: dict[str, Any], *, line_ref: str) -> None: + unsupported = [ + field for field in ('url', 'local_path') if row.get(field) is not None + ] + if unsupported: + fields = ', '.join(unsupported) + raise HFEvalsError( + f'{line_ref}: unsupported aggregate row source field(s): {fields}; ' + 'use object_path from the datastore index' + ) + + +def _validate_instance_level_reference( + row: dict[str, Any], + *, + line_ref: str, +) -> None: + available = row.get('instance_level_available') + if not isinstance(available, bool): + raise HFEvalsError(f'{line_ref}: instance_level_available must be boolean') + if not available: + unexpected = [ + field + for field in ( + 'instance_level_path', + 'instance_level_size_bytes', + 'instance_sha', + ) + if row.get(field) is not None + ] + if unexpected: + raise HFEvalsError( + f'{line_ref}: instance_level_available is false but ' + f'instance provenance is present: {", ".join(unexpected)}' + ) + return + + _safe_index_path(row, 'instance_level_path', line_ref=line_ref) + size = row.get('instance_level_size_bytes') + if not isinstance(size, int): + raise HFEvalsError(f'{line_ref}: instance_level_size_bytes must be an integer') + instance_sha = row.get('instance_sha') + if not isinstance(instance_sha, str) or not instance_sha: + raise HFEvalsError(f'{line_ref}: missing instance_sha') + if len(instance_sha) != 64 or any( + char not in '0123456789abcdef' for char in instance_sha.lower() + ): + raise HFEvalsError(f'{line_ref}: instance_sha must be a sha256 hex digest') + + +def _index_trace_fields(row: dict[str, Any]) -> 
dict[str, Any]: + fields = {} + for field in ('legacy_path', 'object_path', 'subset'): + value = row.get(field) + if value is not None: + fields[field] = value + return fields + + +def _resolve_index_jsonl_path(index_path: Path) -> Path: + resolved = index_path.resolve() + if not resolved.is_dir(): + return resolved + + aggregate_jsonl = resolved / 'aggregate.jsonl' + if not aggregate_jsonl.exists(): + raise HFEvalsError( + f'Index directory must contain aggregate.jsonl: {resolved}' + ) + if not aggregate_jsonl.is_file(): + raise HFEvalsError( + f'Index directory aggregate.jsonl must be a file: {aggregate_jsonl}' + ) + return aggregate_jsonl + + +def _load_index_rows(index_jsonl: Path) -> list[dict[str, Any]]: + if not index_jsonl.exists(): + raise HFEvalsError(f'Index JSONL does not exist: {index_jsonl}') + if not index_jsonl.is_file(): + raise HFEvalsError(f'Index JSONL must be a file: {index_jsonl}') + + rows: list[dict[str, Any]] = [] + with index_jsonl.open(encoding='utf-8') as file: + for line_number, line in enumerate(file, start=1): + line = line.strip() + if not line: + continue + try: + row = json.loads(line) + except json.JSONDecodeError as exc: + raise HFEvalsError( + f'{index_jsonl}:{line_number}: invalid JSONL row: {exc}' + ) from exc + if not isinstance(row, dict): + raise HFEvalsError( + f'{index_jsonl}:{line_number}: JSONL row must be an object' + ) + row['_index_line'] = line_number + rows.append(row) + if not rows: + raise HFEvalsError(f'Index JSONL has no rows: {index_jsonl}') + return rows + + +def _safe_collection_name(value: str) -> str: + name = value.strip() + if not name: + raise HFEvalsError('Collection name must not be empty.') + if name != value: + raise HFEvalsError('Collection name must not include surrounding whitespace.') + if name.endswith('.jsonl'): + raise HFEvalsError('Pass the collection name without the .jsonl suffix.') + if '/' in name or '\\' in name: + raise HFEvalsError( + 'Collection name must be a single by_collection 
file stem.' + ) + parts = Path(name).parts + if parts != (name,) or name in {'.', '..'}: + raise HFEvalsError( + 'Collection name must be a single by_collection file stem.' + ) + return name + + +def _collection_index_path(collection_name: str) -> str: + collection_name = _safe_collection_name(collection_name) + return f'flat/indexes/by_collection/{collection_name}.jsonl' + + +def _collection_stems_from_repo_files(paths: list[str]) -> list[str]: + prefix = 'flat/indexes/by_collection/' + suffix = '.jsonl' + stems = set() + for path in paths: + if not isinstance(path, str): + continue + if not path.startswith(prefix) or not path.endswith(suffix): + continue + filename = path[len(prefix) :] + if '/' in filename: + continue + stem = filename[: -len(suffix)] + if stem: + stems.add(stem) + return sorted(stems, key=str.lower) + + +def _normalized_collection_stem(value: str) -> str: + return ( + value.lower() + .replace('-', '') + .replace('_', '') + .replace(' ', '') + .replace('.', '') + ) + + +def _nearby_collection_stems(collection_name: str, stems: list[str]) -> list[str]: + normalized_requested = _normalized_collection_stem(collection_name) + suggestions = [] + for stem in stems: + normalized_stem = _normalized_collection_stem(stem) + if ( + normalized_requested in normalized_stem + or normalized_stem in normalized_requested + ): + suggestions.append(stem) + for stem in get_close_matches(collection_name, stems, n=5, cutoff=0.55): + if stem not in suggestions: + suggestions.append(stem) + return suggestions[:5] + + +def _collection_suggestion_text( + *, + api: HfApi, + datastore_repo: str, + datastore_revision: str, + collection_name: str, +) -> str: + try: + paths = api.list_repo_files( + repo_id=datastore_repo, + repo_type='dataset', + revision=datastore_revision, + ) + except Exception as exc: # noqa: BLE001 + return f'Unable to list available collection stems: {exc}' + stems = _collection_stems_from_repo_files(list(paths)) + if not stems: + return 'No collection 
JSONL files were found under flat/indexes/by_collection.' + suggestions = _nearby_collection_stems(collection_name, stems) + if not suggestions: + return 'No nearby collection stems found.' + return f'Nearby collection stems: {", ".join(suggestions)}' + + +def _download_collection_index_jsonl( + *, + api: HfApi, + datastore_repo: str, + datastore_revision: str, + collection_name: str, + download_file: Callable[..., str] | None = None, +) -> tuple[str, Path]: + download_file = download_file or hf_hub_download + collection_index_path = _collection_index_path(collection_name) + try: + local_path = download_file( + repo_id=datastore_repo, + repo_type='dataset', + filename=collection_index_path, + revision=datastore_revision, + ) + except Exception as exc: # noqa: BLE001 + suggestion_text = _collection_suggestion_text( + api=api, + datastore_repo=datastore_repo, + datastore_revision=datastore_revision, + collection_name=collection_name, + ) + raise HFEvalsError( + f'Unable to download required collection index file ' + f'{collection_index_path} from {datastore_repo}@{datastore_revision}. 
' + f'{suggestion_text}' + ) from exc + return collection_index_path, Path(local_path) + + +def _candidate_duplicate_key(entry: dict[str, Any]) -> tuple[str, str, str]: + dataset = entry['yaml_entry']['dataset'] + return ( + str(entry['model_repo']).strip().lower(), + str(dataset['id']).strip().lower(), + str(dataset['task_id']).strip(), + ) + + +def _numeric_score(value: Any, *, context: str) -> float: + if isinstance(value, bool) or not isinstance(value, (int, float)): + raise HFEvalsError(f'{context}: score value must be numeric') + score = float(value) + if not isfinite(score): + raise HFEvalsError(f'{context}: score value must be finite') + return score + + +def _scores_equal(left: Any, right: Any) -> bool: + return abs( + _numeric_score(left, context='left score') + - _numeric_score(right, context='right score') + ) <= 1e-9 + + +def _read_online_indexed_record( + *, + datastore_repo: str, + datastore_revision: str, + object_path: str, + row: dict[str, Any], + line_ref: str, + download_file: Callable[..., str] | None = None, +) -> EvaluationLog: + download_file = download_file or hf_hub_download + try: + local_path = download_file( + repo_id=datastore_repo, + repo_type='dataset', + filename=object_path, + revision=datastore_revision, + ) + except Exception as exc: # noqa: BLE001 + raise HFEvalsError( + f'{line_ref}: unable to download {object_path} from ' + f'{datastore_repo}@{datastore_revision}' + ) from exc + + return _parse_indexed_record_bytes( + data=Path(local_path).read_bytes(), + row=row, + line_ref=line_ref, + source_ref=object_path, + ) + + +def _parse_indexed_record_bytes( + *, + data: bytes, + row: dict[str, Any], + line_ref: str, + source_ref: str, +) -> EvaluationLog: + expected_size = row.get('size_bytes') + if expected_size is not None: + if not isinstance(expected_size, int): + raise HFEvalsError(f'{line_ref}: size_bytes must be an integer') + if len(data) != expected_size: + raise HFEvalsError( + f'{line_ref}: size_bytes mismatch for 
{source_ref}: ' + f'expected {expected_size}, got {len(data)}' + ) + + expected_sha = row.get('sha256') + if not isinstance(expected_sha, str) or not expected_sha: + raise HFEvalsError(f'{line_ref}: missing sha256') + actual_sha = hashlib.sha256(data).hexdigest() + if actual_sha != expected_sha: + raise HFEvalsError( + f'{line_ref}: sha256 mismatch for {source_ref}: ' + f'expected {expected_sha}, got {actual_sha}' + ) + + try: + raw = json.loads(data.decode('utf-8')) + log = EvaluationLog.model_validate(raw) + except Exception as exc: # noqa: BLE001 + raise HFEvalsError(f'{line_ref}: invalid EEE aggregate JSON: {exc}') from exc + return log + + +def _candidate_from_record_result( + *, + benchmark: str, + config: BenchmarkConfig, + record_path: str, + log: EvaluationLog, + result: EvaluationResult, + model_repo: str, + source_url: str, + source: str, + status: str, + hf_check_error: str | None, + subset: str | None = None, +) -> dict[str, Any]: + value = _community_eval_value(result) + notes = _community_eval_notes(benchmark, result) + subset_notes = _community_eval_notes_for_subset(benchmark, subset) + if subset_notes is not None: + if notes is not None and notes != subset_notes: + raise HFEvalsError( + f'Index subset {subset!r} conflicts with aggregate variant ' + f'{notes!r}.' 
+ ) + notes = subset_notes + task_id = _community_eval_task_id(benchmark, config, result, notes) + yaml_entry = _community_eval_entry( + config=config, + task_id=task_id, + value=value, + date=_date_from_result(log, result), + source_url=source_url, + notes=notes, + ) + entry = { + 'status': status, + 'benchmark': benchmark, + 'model_repo': model_repo, + 'target_path': _target_path(config), + 'eee_evaluation_id': log.evaluation_id, + 'eee_evaluation_result_id': result.evaluation_result_id, + 'eee_record_path': record_path, + 'source': source, + 'yaml_entry': yaml_entry, + 'pr_title': f'Add EvalEval {task_id} result for {model_repo}', + 'pr_body': ( + 'Adds a Hugging Face Community Evals result from ' + f'{SOURCE_NAME} with a backlink to the source EEE record.' + ), + } + if hf_check_error is not None: + entry['hf_check_error'] = hf_check_error + return entry + + +def build_index_manifest( + *, + index_jsonl: Path, + datastore: str, + benchmarks: list[str], + output_path: Path | None = None, + api: HfApi | None = None, + check_hf: bool = True, + download_file: Callable[..., str] | None = None, +) -> dict[str, Any]: + """Build HF Community Evals candidates from online flat datastore rows.""" + + api = api or HfApi() + index_jsonl = _resolve_index_jsonl_path(index_jsonl) + rows = _load_index_rows(index_jsonl) + entries: list[dict[str, Any]] = [] + skipped: list[dict[str, Any]] = [] + errors: list[str] = [] + seen_keys: dict[tuple[str, str, str, str | None], dict[str, Any]] = {} + repo_check_cache: dict[str, HFEvalsError | None] = {} + datastore_repo, datastore_revision = resolve_datastore_locator( + datastore, api=api + ) + + def cached_repo_error(repo_id: str) -> HFEvalsError | None: + if repo_id not in repo_check_cache: + try: + _repo_exists(api, repo_id) + repo_check_cache[repo_id] = None + except HFEvalsError as exc: + repo_check_cache[repo_id] = exc + return repo_check_cache[repo_id] + + for row in rows: + line_number = row['_index_line'] + line_ref = 
f'{index_jsonl}:{line_number}' + + raw_benchmark = row.get('benchmark') + if not isinstance(raw_benchmark, str) or not raw_benchmark.strip(): + errors.append(f'{line_ref}: missing benchmark') + continue + try: + _reject_unsupported_row_sources(row, line_ref=line_ref) + subset = _index_subset(row, line_ref=line_ref) + except HFEvalsError as exc: + errors.append(str(exc)) + continue + normalized_benchmark = BENCHMARK_ALIASES.get( + normalize_benchmark(raw_benchmark), + normalize_benchmark(raw_benchmark), + ) + if normalized_benchmark not in BENCHMARK_CONFIGS: + skipped.append( + { + 'reason': 'unsupported_index_benchmark', + 'benchmark': raw_benchmark, + 'index_path': index_jsonl.as_posix(), + 'index_line': line_number, + **_index_trace_fields(row), + } + ) + continue + if normalized_benchmark not in benchmarks: + skipped.append( + { + 'reason': 'benchmark_not_selected', + 'benchmark': raw_benchmark, + 'index_path': index_jsonl.as_posix(), + 'index_line': line_number, + **_index_trace_fields(row), + } + ) + continue + + record_type = row.get('record_type') + if record_type != 'aggregate': + skipped.append( + { + 'reason': 'non_aggregate_index_row', + 'record_type': record_type, + 'benchmark': raw_benchmark, + 'index_path': index_jsonl.as_posix(), + 'index_line': line_number, + **_index_trace_fields(row), + } + ) + continue + + try: + object_path = _safe_index_path(row, 'object_path', line_ref=line_ref) + record_ref = object_path + source_url = _datastore_blob_url( + object_path, + datastore_repo=datastore_repo, + datastore_revision=datastore_revision, + ) + source_mode = 'online_flat_index_jsonl' + log = _read_online_indexed_record( + datastore_repo=datastore_repo, + datastore_revision=datastore_revision, + object_path=object_path, + row=row, + line_ref=line_ref, + download_file=download_file, + ) + except HFEvalsError as exc: + errors.append(str(exc)) + continue + + api_only_reason = _api_only_skip_reason(log) + if api_only_reason is not None: + skipped.append( + { 
+ 'reason': api_only_reason, + 'benchmark': raw_benchmark, + 'eee_evaluation_id': log.evaluation_id, + 'eee_record_path': record_ref, + 'index_path': index_jsonl.as_posix(), + 'index_line': line_number, + **_index_trace_fields(row), + 'model_id': log.model_info.id, + } + ) + continue + + raw_model_repo = log.model_info.id + if not isinstance(raw_model_repo, str) or not raw_model_repo.strip(): + errors.append(f'{line_ref}: record has no model_info.id') + continue + model_repo = raw_model_repo.strip() + + status = 'ready' + hf_check_error: str | None = None + if check_hf: + error = cached_repo_error(model_repo) + if error is not None: + status = 'missing_hf_model' + hf_check_error = str(error) + + config = BENCHMARK_CONFIGS[normalized_benchmark] + try: + result = _result_for_dataset(log, config) + except HFEvalsError as exc: + errors.append(f'{line_ref}: {exc}') + continue + if result is None: + skipped.append( + { + 'reason': 'no_matching_evaluation_result', + 'benchmark': raw_benchmark, + 'eee_evaluation_id': log.evaluation_id, + 'eee_record_path': record_ref, + 'index_path': index_jsonl.as_posix(), + 'index_line': line_number, + **_index_trace_fields(row), + } + ) + continue + + try: + entry = _candidate_from_record_result( + benchmark=normalized_benchmark, + config=config, + record_path=record_ref, + log=log, + result=result, + model_repo=model_repo, + source_url=source_url, + source=source_mode, + status=status, + hf_check_error=hf_check_error, + subset=subset, + ) + except HFEvalsError as exc: + errors.append(f'{line_ref}: {exc}') + continue + + entry['index_path'] = index_jsonl.as_posix() + entry['index_line'] = line_number + entry['flat_object_path'] = record_ref + for field in ( + 'legacy_path', + 'object_uuid', + 'subset', + 'sha256', + 'size_bytes', + 'eval_schema_version', + 'instance_object_path', + 'instance_sha256', + 'instance_size_bytes', + ): + value = row.get(field) + if value is not None: + entry[field] = value + + dataset = 
entry['yaml_entry']['dataset'] + duplicate_key = ( + model_repo.lower(), + dataset['id'], + dataset['task_id'], + entry['yaml_entry'].get('notes'), + ) + existing_entry = seen_keys.get(duplicate_key) + if existing_entry is not None: + if existing_entry['yaml_entry'] == entry['yaml_entry']: + skipped.append( + { + 'reason': 'duplicate_candidate_same_entry', + 'model_repo': model_repo, + 'eee_evaluation_id': log.evaluation_id, + 'eee_record_path': record_ref, + 'index_path': index_jsonl.as_posix(), + 'index_line': line_number, + **_index_trace_fields(row), + } + ) + continue + errors.append( + f'{line_ref}: duplicate candidate for {model_repo} ' + f'{dataset["id"]}/{dataset["task_id"]} with different ' + 'YAML values.' + ) + continue + seen_keys[duplicate_key] = entry + entries.append(entry) + + manifest = { + 'version': MANIFEST_VERSION, + 'created_at': datetime.now(tz=UTC).isoformat(), + 'benchmarks': benchmarks, + 'hf_checks': check_hf, + 'source_url_mode': 'online_flat_index_jsonl', + 'datastore': f'{datastore_repo}@{datastore_revision}', + 'datastore_input': datastore, + 'datastore_repo': datastore_repo, + 'datastore_revision': datastore_revision, + 'index_jsonl': index_jsonl.as_posix(), + 'entries': entries, + 'skipped': skipped, + 'errors': errors, + } + + if output_path is not None: + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text( + json.dumps(manifest, indent=2, sort_keys=True) + '\n', + encoding='utf-8', + ) + + if errors: + raise HFEvalsError('\n'.join(errors)) + + return manifest + + +def build_collection_manifest( + *, + collection_name: str, + datastore: str, + output_path: Path | None = None, + api: HfApi | None = None, + check_hf: bool = True, + download_file: Callable[..., str] | None = None, + progress: ReviewProgress | None = None, +) -> dict[str, Any]: + """Build HF Community Evals candidates from a datastore collection.""" + + progress = progress or ReviewProgress() + api = api or HfApi() + collection_name = 
_safe_collection_name(collection_name) + entries: list[dict[str, Any]] = [] + skipped: list[dict[str, Any]] = [] + errors: list[str] = [] + seen_keys: dict[tuple[str, str, str], dict[str, Any]] = {} + repo_check_cache: dict[str, HFEvalsError | None] = {} + setup_task = progress.add_task('Resolving datastore revision', total=4) + datastore_repo, datastore_revision = resolve_datastore_locator( + datastore, api=api + ) + progress.update( + setup_task, + advance=1, + description=f'Downloading collection index {collection_name}.jsonl', + ) + collection_index_path, collection_index_jsonl = _download_collection_index_jsonl( + api=api, + datastore_repo=datastore_repo, + datastore_revision=datastore_revision, + collection_name=collection_name, + download_file=download_file, + ) + progress.update(setup_task, advance=1, description='Reading collection rows') + rows = _load_index_rows(collection_index_jsonl) + progress.update( + setup_task, + advance=1, + description=f'Loaded {len(rows)} collection rows', + ) + + def cached_repo_error(repo_id: str) -> HFEvalsError | None: + if repo_id not in repo_check_cache: + try: + _repo_exists(api, repo_id) + repo_check_cache[repo_id] = None + except HFEvalsError as exc: + repo_check_cache[repo_id] = exc + return repo_check_cache[repo_id] + + row_task = progress.add_task( + f'Processing {len(rows)} aggregate rows', + total=len(rows), + ) + for row_number, row in enumerate(rows, start=1): + line_number = row['_index_line'] + line_ref = f'{collection_index_path}:{line_number}' + raw_benchmark = row.get('benchmark') + row_label = f'row {row_number}/{len(rows)}' + + try: + _reject_unsupported_row_sources(row, line_ref=line_ref) + subset = _index_subset(row, line_ref=line_ref) + _validate_instance_level_reference(row, line_ref=line_ref) + except HFEvalsError as exc: + errors.append(str(exc)) + progress.update(row_task, advance=1) + continue + + record_type = row.get('record_type') + if record_type != 'aggregate': + skipped.append( + { + 
'reason': 'non_aggregate_collection_row', + 'record_type': record_type, + 'benchmark': raw_benchmark, + 'collection_index_path': collection_index_path, + 'collection_index_line': line_number, + **_index_trace_fields(row), + } + ) + progress.update(row_task, advance=1) + continue + + try: + object_path = _safe_index_path(row, 'object_path', line_ref=line_ref) + progress.update( + row_task, + description=f'{row_label}: downloading {object_path}', + ) + source_url = _datastore_blob_url( + object_path, + datastore_repo=datastore_repo, + datastore_revision=datastore_revision, + ) + log = _read_online_indexed_record( + datastore_repo=datastore_repo, + datastore_revision=datastore_revision, + object_path=object_path, + row=row, + line_ref=line_ref, + download_file=download_file, + ) + except HFEvalsError as exc: + errors.append(str(exc)) + progress.update(row_task, advance=1) + continue + + api_only_reason = _api_only_skip_reason(log) + if api_only_reason is not None: + skipped.append( + { + 'reason': api_only_reason, + 'benchmark': raw_benchmark, + 'eee_evaluation_id': log.evaluation_id, + 'eee_record_path': object_path, + 'collection_index_path': collection_index_path, + 'collection_index_line': line_number, + **_index_trace_fields(row), + 'model_id': log.model_info.id, + } + ) + progress.update(row_task, advance=1) + continue + + raw_model_repo = log.model_info.id + if not isinstance(raw_model_repo, str) or not raw_model_repo.strip(): + errors.append(f'{line_ref}: record has no model_info.id') + progress.update(row_task, advance=1) + continue + model_repo = raw_model_repo.strip() + progress.update(row_task, description=f'{row_label}: checking {model_repo}') + + status = 'ready' + hf_check_error: str | None = None + if check_hf: + error = cached_repo_error(model_repo) + if error is not None: + status = 'missing_hf_model' + hf_check_error = str(error) + + try: + supported_results = _results_for_supported_datasets(log) + except HFEvalsError as exc: + 
errors.append(f'{line_ref}: {exc}') + progress.update(row_task, advance=1) + continue + if not supported_results: + skipped.append( + { + 'reason': 'no_supported_hf_dataset_result', + 'benchmark': raw_benchmark, + 'eee_evaluation_id': log.evaluation_id, + 'eee_record_path': object_path, + 'collection_index_path': collection_index_path, + 'collection_index_line': line_number, + **_index_trace_fields(row), + } + ) + progress.update(row_task, advance=1) + continue + + for benchmark, config, result in supported_results: + try: + entry = _candidate_from_record_result( + benchmark=benchmark, + config=config, + record_path=object_path, + log=log, + result=result, + model_repo=model_repo, + source_url=source_url, + source='online_collection_index_jsonl', + status=status, + hf_check_error=hf_check_error, + subset=subset, + ) + except HFEvalsError as exc: + errors.append(f'{line_ref}: {exc}') + continue + + entry['collection'] = collection_name + entry['collection_index_path'] = collection_index_path + entry['collection_index_line'] = line_number + entry['flat_object_path'] = object_path + for field in ( + 'legacy_path', + 'object_uuid', + 'subset', + 'sha256', + 'size_bytes', + 'eval_schema_version', + 'instance_level_available', + 'instance_level_path', + 'instance_level_sha256', + 'instance_level_size_bytes', + 'instance_sha', + 'instance_object_path', + 'instance_sha256', + 'instance_size_bytes', + ): + value = row.get(field) + if value is not None: + entry[field] = value + + duplicate_key = _candidate_duplicate_key(entry) + existing_entry = seen_keys.get(duplicate_key) + if existing_entry is not None: + if _scores_equal( + existing_entry['yaml_entry']['value'], + entry['yaml_entry']['value'], + ): + skipped.append( + { + 'reason': 'duplicate_candidate_same_score', + 'model_repo': model_repo, + 'eee_evaluation_id': log.evaluation_id, + 'eee_record_path': object_path, + 'collection_index_path': collection_index_path, + 'collection_index_line': line_number, + 
def _path_family_for_entry(entry: dict[str, Any]) -> tuple[str, tuple[str, ...]]:
    """Return the entry's benchmark and the result paths associated with it.

    Benchmarks registered in ``EVAL_RESULT_PATH_FAMILIES`` yield the registered
    tuple of paths; any other benchmark falls back to a one-element tuple
    holding the entry's own ``target_path``.

    Raises:
        HFEvalsError: when the entry's ``benchmark`` is not a string.
    """
    name = entry.get('benchmark')
    if isinstance(name, str):
        family = EVAL_RESULT_PATH_FAMILIES.get(name)
        if family is not None:
            return name, family
        return name, (entry['target_path'],)
    raise HFEvalsError('Manifest entry benchmark must be a string.')
noqa: BLE001 + if exc.__class__.__name__ == 'EntryNotFoundError': + return {} + raise HFEvalsError( + f'Unable to list .eval_results for {repo_id}@{revision}' + ) from exc + + tree: dict[str, dict[str, Any]] = {} + for item in items: + path = getattr(item, 'path', None) or getattr(item, 'rfilename', None) + if not path: + continue + tree[path] = { + 'blob_id': getattr(item, 'blob_id', None) or path, + 'size': getattr(item, 'size', None), + } + return tree + + +def _discussion_number(discussion: Any) -> int | None: + value = getattr(discussion, 'num', None) + if value is None: + url = getattr(discussion, 'url', '') + if isinstance(url, str) and '/discussions/' in url: + value = url.rsplit('/discussions/', 1)[-1].strip('/') + try: + return int(value) + except (TypeError, ValueError): + return None + + +def _discussion_url(repo_id: str, discussion: Any) -> str: + url = getattr(discussion, 'url', None) + if isinstance(url, str) and url: + return url + number = _discussion_number(discussion) + if number is None: + return f'https://huggingface.co/{repo_id}/discussions' + return f'https://huggingface.co/{repo_id}/discussions/{number}' + + +def _discussion_revision(discussion: Any) -> str | None: + revision = getattr(discussion, 'git_reference', None) + if isinstance(revision, str) and revision: + return revision + number = _discussion_number(discussion) + if number is None: + return None + return f'refs/pr/{number}' + + +def _open_pull_requests(api: HfApi, repo_id: str) -> list[Any]: + try: + return list( + api.get_repo_discussions( + repo_id, + repo_type='model', + discussion_type='pull_request', + discussion_status='open', + token=True, + ) + ) + except Exception as exc: # noqa: BLE001 + raise HFEvalsError(f'Unable to list open PRs for {repo_id}') from exc + + +def _candidate_comment(entry: dict[str, Any]) -> str: + yaml_entry = entry['yaml_entry'] + dataset = yaml_entry['dataset'] + source = yaml_entry['source'] + benchmark = f'{dataset["id"]}/{dataset["task_id"]}' + 
source_name = source.get('name') or SOURCE_NAME + source_url = source['url'] + value = yaml_entry['value'] + return ( + f'This model scores {value} on {benchmark} run by {source_name}, ' + 'but it is different from the currently posted score. ' + f'See {source_url} for full details.' + ) + + +def _already_present_comment(entry: dict[str, Any]) -> str: + yaml_entry = entry['yaml_entry'] + dataset = yaml_entry['dataset'] + return ( + 'Already present, will not open PR: ' + f'{entry["model_repo"]} has {dataset["id"]}/{dataset["task_id"]} ' + f'with score {yaml_entry["value"]}.' + ) + + +def _eval_yaml_paths(tree: dict[str, dict[str, Any]]) -> list[str]: + return sorted( + path + for path in tree + if path.startswith('.eval_results/') + and path.rsplit('.', 1)[-1].lower() in {'yaml', 'yml'} + ) + + +def _download_model_file_text( + *, + repo_id: str, + revision: str, + path: str, + download_file: Callable[..., str] | None = None, +) -> str: + download_file = download_file or hf_hub_download + try: + local_path = download_file( + repo_id=repo_id, + repo_type='model', + filename=path, + revision=revision, + ) + except Exception as exc: # noqa: BLE001 + raise HFEvalsError( + f'Unable to download {path} from {repo_id}@{revision}' + ) from exc + return Path(local_path).read_text(encoding='utf-8') + + +def _load_eval_yaml_entries( + *, + repo_id: str, + revision: str, + path: str, + download_file: Callable[..., str] | None = None, +) -> list[dict[str, Any]]: + text = _download_model_file_text( + repo_id=repo_id, + revision=revision, + path=path, + download_file=download_file, + ) + try: + loaded = yaml.safe_load(text) + except yaml.YAMLError as exc: + raise HFEvalsError( + f'Invalid YAML in {repo_id}@{revision}:{path}: {exc}' + ) from exc + if not isinstance(loaded, list): + raise HFEvalsError( + f'Eval results YAML must be a list in {repo_id}@{revision}:{path}' + ) + entries: list[dict[str, Any]] = [] + for index, item in enumerate(loaded, start=1): + if not 
isinstance(item, dict): + raise HFEvalsError( + f'Eval results item {index} must be an object in ' + f'{repo_id}@{revision}:{path}' + ) + entries.append(item) + return entries + + +def _yaml_dataset_key(yaml_entry: dict[str, Any]) -> tuple[str, str] | None: + dataset = yaml_entry.get('dataset') + if not isinstance(dataset, dict): + return None + dataset_id = dataset.get('id') + task_id = dataset.get('task_id') + if not isinstance(dataset_id, str) or not isinstance(task_id, str): + return None + return dataset_id.strip().lower(), task_id.strip() + + +def _candidate_yaml_dataset_key(entry: dict[str, Any]) -> tuple[str, str]: + dataset = entry['yaml_entry']['dataset'] + return str(dataset['id']).strip().lower(), str(dataset['task_id']).strip() + + +def _classify_existing_yaml_entries( + *, + candidate: dict[str, Any], + yaml_entries: list[dict[str, Any]], + context: str, +) -> dict[str, Any] | None: + candidate_key = _candidate_yaml_dataset_key(candidate) + candidate_value = candidate['yaml_entry']['value'] + for item in yaml_entries: + if _yaml_dataset_key(item) != candidate_key: + continue + if 'value' not in item: + raise HFEvalsError(f'{context}: matching entry is missing value') + if _scores_equal(item['value'], candidate_value): + return { + 'status': 'already_present', + 'existing_value': item['value'], + 'comment': _already_present_comment(candidate), + } + return { + 'status': 'score_conflict', + 'existing_value': item['value'], + 'comment': _candidate_comment(candidate), + } + return None + + +def audit_manifest_for_hf_eval_duplicates( + manifest: dict[str, Any], + *, + api: HfApi | None = None, + download_file: Callable[..., str] | None = None, + progress: ReviewProgress | None = None, +) -> dict[str, Any]: + """Check candidate YAML entries against main .eval_results and open PRs.""" + + progress = progress or ReviewProgress() + api = api or HfApi() + entries = [ + (entry_index, entry) + for entry_index, entry in enumerate(manifest.get('entries', [])) + if 
_entry_is_ready(entry) + ] + main_tree_cache: dict[str, dict[str, dict[str, Any]]] = {} + main_yaml_cache: dict[tuple[str, str], list[dict[str, Any]]] = {} + open_pr_cache: dict[str, list[Any]] = {} + pr_tree_cache: dict[tuple[str, str], dict[str, dict[str, Any]]] = {} + pr_yaml_cache: dict[tuple[str, str, str], list[dict[str, Any]]] = {} + findings: list[dict[str, Any]] = [] + errors: list[dict[str, Any]] = [] + audit_task = progress.add_task( + f'Auditing {len(entries)} ready candidates', + total=len(entries), + ) + + def cached_main_tree(repo_id: str) -> dict[str, dict[str, Any]]: + if repo_id not in main_tree_cache: + main_tree_cache[repo_id] = _repo_eval_tree( + api, repo_id, DEFAULT_DATASTORE_REVISION + ) + return main_tree_cache[repo_id] + + def cached_prs(repo_id: str) -> list[Any]: + if repo_id not in open_pr_cache: + open_pr_cache[repo_id] = _open_pull_requests(api, repo_id) + return open_pr_cache[repo_id] + + def cached_pr_tree( + repo_id: str, revision: str + ) -> dict[str, dict[str, Any]]: + key = (repo_id, revision) + if key not in pr_tree_cache: + pr_tree_cache[key] = _repo_eval_tree(api, repo_id, revision) + return pr_tree_cache[key] + + def cached_yaml( + repo_id: str, + revision: str, + path: str, + ) -> list[dict[str, Any]]: + if revision == DEFAULT_DATASTORE_REVISION: + key = (repo_id, path) + if key not in main_yaml_cache: + main_yaml_cache[key] = _load_eval_yaml_entries( + repo_id=repo_id, + revision=revision, + path=path, + download_file=download_file, + ) + return main_yaml_cache[key] + key = (repo_id, revision, path) + if key not in pr_yaml_cache: + pr_yaml_cache[key] = _load_eval_yaml_entries( + repo_id=repo_id, + revision=revision, + path=path, + download_file=download_file, + ) + return pr_yaml_cache[key] + + for entry_index, entry in entries: + repo_id = entry['model_repo'] + benchmark = entry.get('benchmark') + progress.update( + audit_task, + description=f'Auditing {repo_id} {entry["target_path"]}', + ) + try: + main_tree = 
cached_main_tree(repo_id) + except HFEvalsError as exc: + errors.append( + { + 'entry_index': entry_index, + 'model_repo': repo_id, + 'benchmark': benchmark, + 'target_path': entry['target_path'], + 'stage': 'list_main_eval_results', + 'error': str(exc), + } + ) + progress.update(audit_task, advance=1) + continue + + for path in _eval_yaml_paths(main_tree): + try: + yaml_entries = cached_yaml( + repo_id, DEFAULT_DATASTORE_REVISION, path + ) + match = _classify_existing_yaml_entries( + candidate=entry, + yaml_entries=yaml_entries, + context=f'{repo_id}@main:{path}', + ) + except HFEvalsError as exc: + errors.append( + { + 'entry_index': entry_index, + 'model_repo': repo_id, + 'benchmark': benchmark, + 'target_path': entry['target_path'], + 'stage': 'read_main_eval_results', + 'path': path, + 'error': str(exc), + } + ) + continue + if match is None: + continue + findings.append( + { + 'type': f'existing_eval_results_{match["status"]}', + 'status': match['status'], + 'entry_index': entry_index, + 'model_repo': repo_id, + 'benchmark': benchmark, + 'target_path': entry['target_path'], + 'candidate_path': entry['target_path'], + 'existing_path': path, + 'existing_value': match['existing_value'], + 'candidate_value': entry['yaml_entry']['value'], + 'candidate_source_url': entry['yaml_entry']['source']['url'], + 'comment': match['comment'], + } + ) + + try: + discussions = cached_prs(repo_id) + except HFEvalsError as exc: + errors.append( + { + 'entry_index': entry_index, + 'model_repo': repo_id, + 'benchmark': benchmark, + 'target_path': entry['target_path'], + 'stage': 'list_open_prs', + 'error': str(exc), + } + ) + progress.update(audit_task, advance=1) + continue + + for discussion in discussions: + revision = _discussion_revision(discussion) + if revision is None: + errors.append( + { + 'entry_index': entry_index, + 'model_repo': repo_id, + 'benchmark': benchmark, + 'target_path': entry['target_path'], + 'stage': 'resolve_pr_revision', + 'error': f'No PR revision for 
{_discussion_url(repo_id, discussion)}', + } + ) + continue + try: + pr_tree = cached_pr_tree(repo_id, revision) + except HFEvalsError as exc: + errors.append( + { + 'entry_index': entry_index, + 'model_repo': repo_id, + 'benchmark': benchmark, + 'target_path': entry['target_path'], + 'stage': 'list_open_pr_eval_results', + 'pr_url': _discussion_url(repo_id, discussion), + 'error': str(exc), + } + ) + continue + + changed_paths = [] + for path in _eval_yaml_paths(pr_tree): + main_blob = main_tree.get(path, {}).get('blob_id') + pr_blob = pr_tree[path].get('blob_id') + if main_blob != pr_blob: + changed_paths.append(path) + for path in changed_paths: + try: + yaml_entries = cached_yaml(repo_id, revision, path) + match = _classify_existing_yaml_entries( + candidate=entry, + yaml_entries=yaml_entries, + context=f'{repo_id}@{revision}:{path}', + ) + except HFEvalsError as exc: + errors.append( + { + 'entry_index': entry_index, + 'model_repo': repo_id, + 'benchmark': benchmark, + 'target_path': entry['target_path'], + 'stage': 'read_open_pr_eval_results', + 'pr_url': _discussion_url(repo_id, discussion), + 'path': path, + 'error': str(exc), + } + ) + continue + if match is None: + continue + findings.append( + { + 'type': f'open_pr_eval_results_{match["status"]}', + 'status': match['status'], + 'entry_index': entry_index, + 'model_repo': repo_id, + 'benchmark': benchmark, + 'target_path': entry['target_path'], + 'candidate_path': entry['target_path'], + 'pr_url': _discussion_url(repo_id, discussion), + 'pr_title': getattr(discussion, 'title', None), + 'paths': [path], + 'existing_value': match['existing_value'], + 'candidate_value': entry['yaml_entry']['value'], + 'candidate_source_url': entry['yaml_entry']['source']['url'], + 'comment': match['comment'], + } + ) + + progress.update(audit_task, advance=1) + + return { + 'created_at': datetime.now(tz=UTC).isoformat(), + 'candidate_count': len(entries), + 'finding_count': len(findings), + 'error_count': len(errors), + 
def _apply_duplicate_audit_to_manifest(
    manifest: dict[str, Any],
    duplicate_audit: dict[str, Any],
) -> None:
    """Fold duplicate-audit findings and errors into the manifest in place.

    For every entry that has findings, the most severe status wins
    (``score_conflict`` over ``already_present``) and all of that entry's
    findings are attached as ``duplicate_audit_findings``.  Entries with audit
    errors get ``duplicate_audit_errors`` and, if still ready, the
    ``AUDIT_ERROR_STATUS`` status.  Entries that are not ready keep their
    status.

    Raises:
        HFEvalsError: when ``entries`` is not a list, or the audit refers to
            an entry index that is out of range or not a mapping.
    """
    severity = {'already_present': 1, 'score_conflict': 2}
    findings_by_entry: dict[int, list[dict[str, Any]]] = {}
    winning_status: dict[int, str] = {}
    for finding in duplicate_audit.get('findings', []):
        index = finding.get('entry_index')
        status = finding.get('status')
        if not isinstance(index, int) or status not in severity:
            continue
        findings_by_entry.setdefault(index, []).append(finding)
        current = winning_status.get(index)
        # On equal severity the status seen first is kept.
        if current is None or severity[status] > severity[current]:
            winning_status[index] = status

    entries = manifest.get('entries', [])
    if not isinstance(entries, list):
        raise HFEvalsError('Manifest entries must be a list.')

    def entry_at(index: int) -> dict[str, Any]:
        # Shared validation for indices coming from findings and from errors.
        if not 0 <= index < len(entries):
            raise HFEvalsError(
                f'Duplicate audit referenced missing manifest entry {index}.'
            )
        found = entries[index]
        if not isinstance(found, dict):
            raise HFEvalsError(
                f'Manifest entry {index} must be an object.'
            )
        return found

    for index, related_findings in findings_by_entry.items():
        entry = entry_at(index)
        if _entry_is_ready(entry):
            entry['status'] = winning_status[index]
            entry['duplicate_audit_findings'] = related_findings

    errors_by_entry: dict[int, list[dict[str, Any]]] = {}
    for audit_error in duplicate_audit.get('errors', []):
        index = audit_error.get('entry_index')
        if isinstance(index, int):
            errors_by_entry.setdefault(index, []).append(audit_error)

    for index, related_errors in errors_by_entry.items():
        entry = entry_at(index)
        # Errors are always recorded; only a still-ready entry is demoted.
        entry['duplicate_audit_errors'] = related_errors
        if _entry_is_ready(entry):
            entry['status'] = AUDIT_ERROR_STATUS


def _write_manifest(manifest: dict[str, Any], output_path: Path) -> None:
    """Write ``manifest`` as key-sorted, indented JSON with a trailing newline.

    Missing parent directories of ``output_path`` are created first.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(manifest, indent=2, sort_keys=True)
    output_path.write_text(f'{payload}\n', encoding='utf-8')
def review_index_for_hf_evals(
    *,
    index_jsonl: Path,
    datastore: str,
    benchmarks: list[str],
    manifest_output_path: Path,
    yaml_output_dir: Path,
    review_output_path: Path,
    api: HfApi | None = None,
    check_hf: bool = True,
    download_file: Callable[..., str] | None = None,
) -> dict[str, Any]:
    """Build, audit and review candidates taken from a flat index JSONL.

    Runs three steps with one shared ``HfApi`` client: build the index
    manifest, audit it for duplicates, then hand both to
    ``_review_from_manifest`` together with the three output locations.

    Returns the review dict produced by ``_review_from_manifest``.
    """
    # A single client is created up front so that both steps share it.
    api = api or HfApi()
    # output_path=None: the build step writes nothing; the manifest path is
    # passed to the review step below instead.
    manifest = build_index_manifest(
        index_jsonl=index_jsonl,
        datastore=datastore,
        benchmarks=benchmarks,
        output_path=None,
        api=api,
        check_hf=check_hf,
        download_file=download_file,
    )
    duplicate_audit = audit_manifest_for_hf_eval_duplicates(
        manifest,
        api=api,
        download_file=download_file,
    )
    return _review_from_manifest(
        manifest=manifest,
        manifest_output_path=manifest_output_path,
        yaml_output_dir=yaml_output_dir,
        review_output_path=review_output_path,
        duplicate_audit=duplicate_audit,
    )
check_hf=check_hf, + ) + if manifest is not None: + cache_task = progress.add_task('Using cached manifest', total=1) + progress.update( + cache_task, + advance=1, + description='Used cached manifest; starting audit', + ) + + if manifest is None: + manifest = build_collection_manifest( + collection_name=collection_name, + datastore=datastore, + output_path=manifest_output_path, + api=api, + check_hf=check_hf, + download_file=download_file, + progress=progress, + ) + duplicate_audit = audit_manifest_for_hf_eval_duplicates( + manifest, + api=api, + download_file=download_file, + progress=progress, + ) + return _review_from_manifest( + manifest=manifest, + manifest_output_path=manifest_output_path, + yaml_output_dir=yaml_output_dir, + review_output_path=review_output_path, + duplicate_audit=duplicate_audit, + ) + + +def _validate_manifest(manifest: dict[str, Any]) -> dict[str, Any]: + if manifest.get('version') != MANIFEST_VERSION: + raise HFEvalsError( + f'Unsupported manifest version: {manifest.get("version")!r}' + ) + entries = manifest.get('entries') + if not isinstance(entries, list): + raise HFEvalsError('Manifest entries must be a list.') + errors = manifest.get('errors') or [] + if errors: + raise HFEvalsError('Manifest contains errors; rebuild it first.') + return manifest + + +def load_manifest(path: Path) -> dict[str, Any]: + manifest = json.loads(path.read_text(encoding='utf-8')) + if not isinstance(manifest, dict): + raise HFEvalsError('Manifest must be a JSON object.') + return _validate_manifest(manifest) + + +def _collection_cache_matches( + manifest: dict[str, Any], + *, + collection_name: str, + datastore: str, + check_hf: bool, +) -> bool: + return ( + manifest.get('version') == MANIFEST_VERSION + and manifest.get('collection') == collection_name + and manifest.get('datastore_input') == datastore + and manifest.get('hf_checks') is check_hf + and manifest.get('source_url_mode') == 'online_collection_index_jsonl' + ) + + +def 
def _load_cached_collection_manifest(
    *,
    manifest_output_path: Path,
    collection_name: str,
    datastore: str,
    check_hf: bool,
) -> dict[str, Any] | None:
    """Return the cached pre-audit manifest for this run, or ``None``.

    ``None`` is returned when the file does not exist or when it belongs to
    a different collection / datastore input / HF-check mode (see
    ``_collection_cache_matches``).

    Raises:
        HFEvalsError: If the file cannot be read or decoded, or if a
            matching manifest already carries duplicate-audit state.
            Validation errors raised by ``load_manifest`` itself propagate
            unchanged.
    """
    if not manifest_output_path.exists():
        return None
    try:
        manifest = load_manifest(manifest_output_path)
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as exc:
        # UnicodeDecodeError (file is not valid UTF-8) is a ValueError, not
        # an OSError or JSONDecodeError; without it a corrupt cache would
        # surface as a raw traceback instead of an HFEvalsError.
        raise HFEvalsError(
            f'Cached manifest is not readable: {manifest_output_path}'
        ) from exc
    # NOTE(review): load_manifest validates (version, errors) before the
    # match check below, so a non-matching but invalid manifest raises
    # rather than being ignored -- confirm that is intended.
    if not _collection_cache_matches(
        manifest,
        collection_name=collection_name,
        datastore=datastore,
        check_hf=check_hf,
    ):
        return None
    if _manifest_has_duplicate_audit_state(manifest):
        # A post-audit manifest is only reached here when no matching cached
        # review was returned earlier, so the outputs are inconsistent.
        raise HFEvalsError(
            f'Cached manifest is post-audit but {manifest_output_path.parent / "review.json"} '
            'is missing or does not match. Move the cached output directory aside '
            'before rebuilding.'
        )
    return manifest


def _load_cached_collection_review(
    *,
    review_output_path: Path,
    yaml_output_dir: Path,
    collection_name: str,
    datastore: str,
    check_hf: bool,
) -> dict[str, Any] | None:
    """Return the cached review for this run, or ``None``.

    ``None`` is returned when the file does not exist or when its embedded
    manifest belongs to a different collection / datastore input / HF-check
    mode. On a hit the YAML previews are regenerated under
    ``yaml_output_dir``, the ``yaml_*`` fields of the review are refreshed,
    and the review file is rewritten.

    Raises:
        HFEvalsError: If the file cannot be read or decoded, is not a JSON
            object, lacks its manifest, or lacks one of the required review
            fields.
    """
    if not review_output_path.exists():
        return None
    try:
        review = json.loads(review_output_path.read_text(encoding='utf-8'))
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as exc:
        # See _load_cached_collection_manifest: undecodable bytes must also
        # be reported as an unreadable cache.
        raise HFEvalsError(
            f'Cached review is not readable: {review_output_path}'
        ) from exc
    if not isinstance(review, dict):
        raise HFEvalsError(f'Cached review must be an object: {review_output_path}')
    manifest = review.get('manifest')
    if not isinstance(manifest, dict):
        raise HFEvalsError(
            f'Cached review is missing its manifest: {review_output_path}'
        )
    if not _collection_cache_matches(
        manifest,
        collection_name=collection_name,
        datastore=datastore,
        check_hf=check_hf,
    ):
        return None
    # A matching review must be complete; a partial one is a hard error.
    for field in (
        'duplicate_audit',
        'can_open_prs',
        'audit_blocked_entries',
        'global_audit_errors',
        'missing_hf_models',
    ):
        if field not in review:
            raise HFEvalsError(
                f'Cached review is missing {field}: {review_output_path}'
            )
    # Regenerate the previews so the YAML directory matches the cached
    # manifest even if it was deleted or moved since the review was written.
    yaml_result = _write_yaml_from_manifest(manifest, yaml_output_dir)
    review['yaml_output_dir'] = yaml_output_dir.as_posix()
    review['yaml_count'] = yaml_result['count']
    review['yaml_files'] = yaml_result['written']
    review_output_path.write_text(
        json.dumps(review, indent=2, sort_keys=True) + '\n',
        encoding='utf-8',
    )
    return review
def create_prs_from_manifest(
    manifest_path: Path,
    *,
    limit: int | None,
    yes_i_reviewed: bool,
    commit_message: str,
    api: HfApi | None = None,
    commit_description: str = DEFAULT_PR_COMMIT_DESCRIPTION,
    stream: bool = False,
) -> dict[str, Any]:
    """Open one Hub pull request per model repo for the ready entries.

    Ready entries (``_entry_is_ready``) are grouped by model repo, compared
    case-insensitively with the first-seen spelling kept, and then by target
    path. Each repo receives a single commit with ``create_pr=True`` that
    adds one YAML file per target path. Repos are processed in
    case-insensitive name order.

    Args:
        manifest_path: Manifest file, loaded and validated by
            ``load_manifest``.
        limit: Maximum number of repos to process; ``None`` means all.
        yes_i_reviewed: Must be true, otherwise nothing is attempted.
        commit_message: Non-blank commit message.
        api: Hub client; a fresh ``HfApi()`` is created when omitted.
        commit_description: Non-blank commit description.
        stream: Print one progress line per step to stdout.

    Returns:
        A dict with the ``created`` records and their ``count``. ``failed``
        and ``skipped`` are always empty here because the first failure
        aborts the run.

    Raises:
        HFEvalsError: On a missing confirmation, blank message or
            description, or the first commit that fails. PRs created before
            the failure are not reported.
    """
    if not yes_i_reviewed:
        raise HFEvalsError('Refusing to create PRs without --yes-i-reviewed.')
    if not commit_message.strip():
        raise HFEvalsError('Commit message must not be empty.')
    if not commit_description.strip():
        raise HFEvalsError('Commit description must not be empty.')
    manifest = load_manifest(manifest_path)
    api = api or HfApi()

    # lower-cased repo id -> (first-seen spelling, {target path -> entries})
    repos: dict[str, tuple[str, dict[str, list[dict[str, Any]]]]] = {}
    for entry in manifest['entries']:
        if _entry_is_ready(entry):
            repo_name = entry['model_repo']
            _, files = repos.setdefault(repo_name.lower(), (repo_name, {}))
            files.setdefault(entry['target_path'], []).append(
                entry['yaml_entry']
            )

    repo_total = len(repos)
    ordered = sorted(repos.values(), key=lambda pair: pair[0].lower())
    opened: list[dict[str, Any]] = []
    for position, (repo_name, files) in enumerate(ordered):
        if limit is not None and position >= limit:
            break
        # The denominator is the total repo count, not the limited count.
        tag = f'[{position + 1}/{repo_total}]'
        if stream:
            print(f'{tag} preparing {repo_name}', flush=True)

        operations: list[CommitOperationAdd] = [
            CommitOperationAdd(
                path_in_repo=path,
                path_or_fileobj=dump_yaml_entries(yaml_entries).encode('utf-8'),
            )
            for path, yaml_entries in sorted(files.items())
        ]

        if not operations:
            if stream:
                print(f'{tag} no changes {repo_name}', flush=True)
            continue

        try:
            # NOTE(review): the commit base reuses the datastore revision
            # constant for a *model* repo -- confirm this is intended.
            info = api.create_commit(
                repo_id=repo_name,
                repo_type='model',
                operations=operations,
                commit_message=commit_message,
                commit_description=commit_description,
                revision=DEFAULT_DATASTORE_REVISION,
                create_pr=True,
            )
        except Exception as exc:  # noqa: BLE001
            raise HFEvalsError(f'Unable to create PR for {repo_name}') from exc

        pr_url = getattr(info, 'pr_url', None)
        commit_url = getattr(info, 'commit_url', None)
        opened.append(
            {
                'model_repo': repo_name,
                'commit': str(info),
                'commit_url': commit_url,
                'pr_url': pr_url,
                'updated_existing_pr': False,
            }
        )
        if stream:
            print(
                f'{tag} created {repo_name}: {pr_url or commit_url or info}',
                flush=True,
            )

    return {
        'created': opened,
        'count': len(opened),
        'failed': [],
        'failed_count': 0,
        'skipped': [],
        'skipped_count': 0,
    }
def _render_review_details(console: Console, review: dict) -> None:
    """Print the capped "Needs Attention" table for a review.

    Rows are collected in a fixed priority order: audit errors, score
    conflicts, one summary row for all same-score duplicates, missing HF
    models, then skipped records. At most 20 rows are kept. When more items
    exist, a caption states how many are shown. Nothing is printed when
    there are no rows.
    """
    row_cap = 20
    table_rows: list[tuple[str, str, str, str, str | Text]] = []

    def _record_url(record_path: object) -> object:
        # Only datastore-relative paths ("flat/...") can be turned into a
        # blob URL, and only when the manifest names the repo and revision.
        as_text = str(record_path or '')
        if as_text.startswith('flat/'):
            source = review['manifest']
            repo = source.get('datastore_repo')
            revision = source.get('datastore_revision')
            if isinstance(repo, str) and isinstance(revision, str):
                return _datastore_blob_url(
                    as_text,
                    datastore_repo=repo,
                    datastore_revision=revision,
                )
        return record_path

    def _linkify(value: object) -> str | Text:
        # URLs become clickable terminal links; everything else stays text.
        rendered = str(value or '')
        if not rendered.startswith(('http://', 'https://')):
            return rendered
        return Text(rendered, style=f'link {rendered}')

    def _push(
        issue: str,
        model: object,
        details: object,
        action: str,
        where: object,
    ) -> None:
        # Rows beyond the cap are dropped; the caption reports the total.
        if len(table_rows) < row_cap:
            table_rows.append(
                (
                    str(issue or ''),
                    str(model or ''),
                    str(details or ''),
                    action,
                    _linkify(where),
                )
            )

    audit = review['duplicate_audit']
    audit_errors = audit['errors']
    for problem in audit_errors:
        # An error tied to a manifest entry blocks only that entry.
        if isinstance(problem.get('entry_index'), int):
            scope = 'block entry'
        else:
            scope = 'block all'
        _push(
            'audit_error',
            problem.get('model_repo'),
            problem.get('error'),
            scope,
            problem.get('pr_url') or problem.get('path') or problem.get('stage'),
        )

    conflicts = []
    duplicates = []
    for finding in audit['findings']:
        status = finding.get('status')
        if status == 'score_conflict':
            conflicts.append(finding)
        elif status == 'already_present':
            duplicates.append(finding)

    for finding in conflicts:
        location = finding.get('existing_path') or finding.get('pr_url') or ''
        changed_paths = finding.get('paths')
        comparison = (
            f'{finding.get("existing_value")} -> {finding.get("candidate_value")}; '
        )
        if changed_paths:
            summary = (
                comparison
                + f'existing score differs from EvalEval; {", ".join(changed_paths)}'
            )
        else:
            summary = comparison + 'existing score differs from EvalEval.'
        _push(
            'score_conflict',
            finding.get('model_repo'),
            summary,
            'exclude',
            location,
        )

    if duplicates:
        # Same-score duplicates are collapsed into a single count row.
        _push(
            'already_present',
            f'{len(duplicates)} models',
            'Same-score result already exists; excluded from PRs.',
            'exclude',
            '.eval_results',
        )

    missing_models = review['missing_hf_models']
    for entry in missing_models:
        _push(
            'missing_hf_model',
            entry.get('model_repo'),
            entry.get('hf_check_error'),
            'exclude',
            entry.get('yaml_entry', {}).get('source', {}).get('url')
            or _record_url(entry.get('eee_record_path')),
        )

    skipped_records = review['manifest']['skipped']
    for record in skipped_records:
        index_line = (
            record.get('collection_index_line') or record.get('index_line') or ''
        )
        _push(
            'skipped',
            record.get('model_id'),
            record.get('reason'),
            f'line {index_line}' if index_line else 'skip',
            _record_url(
                record.get('eee_record_path') or record.get('object_path')
            ),
        )

    if not table_rows:
        return

    attention_total = (
        len(audit_errors)
        + len(conflicts)
        + (1 if duplicates else 0)
        + len(missing_models)
        + len(skipped_records)
    )
    table = Table(
        title='Needs Attention',
        show_header=True,
        header_style='bold cyan',
        show_lines=True,
    )
    table.add_column('Issue', no_wrap=True)
    table.add_column('Model', overflow='fold', ratio=2, max_width=30)
    table.add_column('Details', overflow='fold', ratio=4)
    table.add_column('Action', no_wrap=True)
    table.add_column('Where', no_wrap=True, overflow='ellipsis', ratio=4)
    for cells in table_rows:
        table.add_row(*cells)
    if attention_total > len(table_rows):
        table.caption = (
            f'Showing {len(table_rows)} of {attention_total} attention items. '
            'Full data is in review JSON.'
        )
    console.print(table)
def _ready_entries_by_repo(review: dict) -> dict[str, list[str]]:
    """Map each model repo with ready entries to its sorted target paths.

    Repos are grouped case-insensitively and reported under their first-seen
    spelling. This mirrors the grouping in ``create_prs_from_manifest``, so
    the approval table lists exactly one row per PR target. Entries without
    a ``status`` key count as ready.

    Args:
        review: Review dict containing ``review['manifest']['entries']``.

    Returns:
        ``{model_repo: sorted unique target paths}``, ordered by
        lower-cased repo name.
    """
    # NOTE(review): PR creation filters with ``_entry_is_ready`` while this
    # function checks ``status`` directly -- confirm the two always agree.
    grouped: dict[str, tuple[str, set[str]]] = {}
    for entry in review['manifest']['entries']:
        if entry.get('status', 'ready') != 'ready':
            continue
        repo = str(entry['model_repo'])
        # Key on the lower-cased id; keep the first spelling for display.
        _, paths = grouped.setdefault(repo.lower(), (repo, set()))
        paths.add(str(entry['target_path']))
    return {
        repo: sorted(paths)
        for _, (repo, paths) in sorted(
            grouped.items(), key=lambda item: item[0]
        )
    }
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the collection review tool.

    Arguments:
        collection_name: Positional collection stem.
        --datastore: Dataset locator; defaults to ``DEFAULT_DATASTORE_REPO``.
        --force: Flag that ignores cached review/manifest outputs.
    """
    parser = argparse.ArgumentParser(
        description='Review an EEE datastore collection for HF Community Evals.',
    )
    # The help strings name their placeholders explicitly; without them the
    # text ("in the form or @.") does not tell the user what to pass.
    parser.add_argument(
        'collection_name',
        help=(
            'Collection file stem under '
            'flat/indexes/by_collection/<collection_name>.jsonl.'
        ),
    )
    parser.add_argument(
        '--datastore',
        default=DEFAULT_DATASTORE_REPO,
        help=(
            'Online HF dataset locator in the form <repo_id> or '
            '<repo_id>@<revision>. Defaults to evaleval/EEE_datastore and '
            'resolves the current main commit.'
        ),
    )
    parser.add_argument(
        '--force',
        action='store_true',
        help='Ignore cached review/manifest outputs and rebuild from datastore.',
    )
    return parser
specifier = ">=2.3.3" }, { name = "pydantic", specifier = ">=2.12.5,<3.0.0" }, + { name = "pyyaml", specifier = ">=6.0.3" }, { name = "requests", specifier = ">=2.32.5,<3.0.0" }, { name = "rich", specifier = ">=14.0.0,<15.0.0" }, { name = "seaborn", specifier = ">=0.13.2" }, From fa2331341820a8d129573df2bef0ea8a7094e1d8 Mon Sep 17 00:00:00 2001 From: nelaturuharsha Date: Sat, 13 Jun 2026 22:08:16 +0200 Subject: [PATCH 2/3] fixes to make tests pass + add hf community evals to docs --- docs/getting-started/index.md | 1 + docs/hf-community-evals/index.md | 132 ++++++++++++++++++ tools/hf-community-evals/README.md | 10 +- .../community_evals_converter.py | 4 - 4 files changed, 138 insertions(+), 9 deletions(-) create mode 100644 docs/hf-community-evals/index.md diff --git a/docs/getting-started/index.md b/docs/getting-started/index.md index 136c14409..cab549c23 100644 --- a/docs/getting-started/index.md +++ b/docs/getting-started/index.md @@ -31,3 +31,4 @@ uv run python -m every_eval_ever --help - See [Data Structure](../data-structure/) - See [Eval Converters](../eval-converters/) - See [Contributing](../contributing/) +- See [HF Community Evals](../hf-community-evals/) diff --git a/docs/hf-community-evals/index.md b/docs/hf-community-evals/index.md new file mode 100644 index 000000000..6ea2b1477 --- /dev/null +++ b/docs/hf-community-evals/index.md @@ -0,0 +1,132 @@ +--- +layout: default +title: HF Community Evals +nav_order: 6 +--- + +# EEE -> HF Community Evals + +Built and maintained by Harsha Nelaturu · EvalEval Coalition · June 2026. + +Use `tools/hf-community-evals/community_evals_converter.py` to review one EEE datastore collection, generate +local HF Community Evals YAML previews, audit existing scores/open PRs, and +optionally open PRs after explicit approval. + +## Quick Start + +Use `uv run` for all commands. 
+ +```bash +uv run tools/hf-community-evals/community_evals_converter.py MMLU-Pro \ + --datastore evaleval/EEE_datastore@main +``` + +This caches the results for the collection. To force a fresh rebuild, run: + +```bash +uv run tools/hf-community-evals/community_evals_converter.py MMLU-Pro \ + --datastore evaleval/EEE_datastore@main \ + --force +``` + +The positional argument is a collection stem. It must resolve exactly to: + +```text +https://huggingface.co/datasets/evaleval/EEE_datastore/flat/indexes/by_collection/<collection_name>.jsonl +``` + +## Outputs + +For `MMLU-Pro`, outputs are written under: + +```text +outputs/community_evals_converter_MMLU-Pro/ +``` + +Important output files: + +- `manifest.json`: converted candidate records plus skipped/error metadata. +- `review.json`: full review result, duplicate audit findings, audit errors, + and PR readiness. +- `yamls/<org>/<model>/.eval_results/<benchmark>.yaml`: local YAML previews. + +`outputs/` is ignored by git. Use these files for inspection, not as merge +inputs. + +## Review Behavior + +The tool: + +- downloads the collection JSONL and referenced aggregate objects from the HF + datastore; +- validates object hashes and optional sizes; +- scans each aggregate record for supported HF benchmark datasets; +- writes YAML entries using the datastore object HF URL as `source.url`; +- keeps flat datastore provenance, including instance-level references when + present; +- checks model repo existence on Hugging Face; +- audits every existing `.eval_results/*.yaml` file on model `main`; +- audits changed `.eval_results/*.yaml` files in open PR refs; +- compares by dataset/task content, not YAML filename.
+ +Supported benchmarks in this workflow are: + +- `mmlu_pro` +- `gpqa` +- `hle` +- `gsm8k` + +## Resume And Force + +Default reruns reuse exact-match local outputs: + +- matching completed `review.json`: skips collection downloads, model checks, + and duplicate audit; +- matching pre-audit `manifest.json`: skips collection downloads and model + checks, then resumes at duplicate audit. + +The cache must match collection name, datastore input, and HF-check mode. +Invalid exact-match cache files are hard errors. Use `--force` when you want to +ignore the cache and rebuild from the datastore. + +## TUI +The final report has: + +- `Community Evals Converter`: summary counts. +- `Needs Attention`: capped triage table for blockers and exclusions. + +`Needs Attention` uses: + +- `Issue`: `audit_error`, `score_conflict`, `already_present`, + `missing_hf_model`, or `skipped`. +- `Model`: model repo or aggregate model id. +- `Details`: reason or score comparison. +- `Action`: `exclude`, `block entry`, `block all`, or source line. +- `Where`: terminal hyperlink to the HF model PR/file or HF datastore blob URL. + +Repeated same-score `already_present` findings are summarized as one count row. +Full details remain in `review.json`. + +## PR Submission + +The tool only opens PRs after both prompts succeed: + +1. Type exactly: + + ```text + OPEN PRS + ``` + +2. Enter a non-empty commit message. + +Only `status = ready` entries are submitted. + +Excluded statuses: + +- `already_present`: same score already exists. +- `score_conflict`: different score already exists. +- `missing_hf_model`: model repo does not resolve on HF. +- `audit_error`: candidate-scoped audit failure. + +Candidate-scoped audit errors block only that candidate. Audit errors without a +manifest entry block all PR submission. 
diff --git a/tools/hf-community-evals/README.md b/tools/hf-community-evals/README.md index 5730a0dad..7e1d8c85c 100644 --- a/tools/hf-community-evals/README.md +++ b/tools/hf-community-evals/README.md @@ -1,8 +1,8 @@ # EEE -> HF Community Evals -Built by Harsha Nelaturu, June 2026. +Built and maintained by Harsha Nelaturu · EvalEval Coalition · June 2026. -Use `tools/community_evals_converter.py` to review one EEE datastore collection, generate +Use `tools/hf-community-evals/community_evals_converter.py` to review one EEE datastore collection, generate local HF Community Evals YAML previews, audit existing scores/open PRs, and optionally open PRs after explicit approval. @@ -11,14 +11,14 @@ optionally open PRs after explicit approval. Use `uv run` for all commands. ```bash -uv run tools/community_evals_converter.py MMLU-Pro \ +uv run tools/hf-community-evals/community_evals_converter.py MMLU-Pro \ --datastore evaleval/EEE_datastore@main ``` This will cache the results for this particular collection and if you would like to force a fresh rebuild: ```bash -uv run tools/community_evals_converter.py MMLU-Pro \ +uv run tools/hf-community-evals/community_evals_converter.py MMLU-Pro \ --datastore evaleval/EEE_datastore@main \ --force ``` @@ -123,4 +123,4 @@ Excluded statuses: - `audit_error`: candidate-scoped audit failure. Candidate-scoped audit errors block only that candidate. Audit errors without a -manifest entry block all PR submission. \ No newline at end of file +manifest entry block all PR submission. 
def _load_community_evals_converter():
    """Import the converter script by file path and return the module.

    The script lives under ``tools/hf-community-evals/``. That directory
    name contains hyphens, so it cannot be imported as a package; the module
    is loaded from its path under a dedicated module name instead.

    Raises:
        ImportError: If no import spec or loader can be created for the
            script path.
    """
    repo_root = Path(__file__).resolve().parents[1]
    script = repo_root.joinpath(
        'tools',
        'hf-community-evals',
        'community_evals_converter.py',
    )
    module_spec = importlib.util.spec_from_file_location(
        'community_evals_converter_under_test',
        script,
    )
    loader = None if module_spec is None else module_spec.loader
    if loader is None:
        raise ImportError(f'Unable to load {script}')
    loaded = importlib.util.module_from_spec(module_spec)
    # Registered before execution; presumably so code that looks the module
    # up by name while it is being executed can find it -- TODO confirm.
    sys.modules[module_spec.name] = loaded
    loader.exec_module(loaded)
    return loaded
_load_community_evals_converter() FIXTURE_DIR = Path(__file__).parent / 'data' / 'community_evals_converter'