From 7437221d7b5479c5a77e2816eec17429d91b9b5f Mon Sep 17 00:00:00 2001
From: nelaturuharsha <nelaturu.harsha@gmail.com>
Date: Sat, 13 Jun 2026 21:52:11 +0200
Subject: [PATCH 1/3] adding hf community evals converter

---
 pyproject.toml                                |    1 +
 .../community_evals_converter/aggregate.jsonl |    1 +
 .../676f4465-ce78-411a-9f5a-c97b3d2eac4f.json |  590 ++++
 tests/test_community_evals_converter.py       | 1521 +++++++++
 tools/hf-community-evals/README.md            |  126 +
 .../community_evals_converter.py              | 2945 +++++++++++++++++
 uv.lock                                       |    2 +
 7 files changed, 5186 insertions(+)
 create mode 100644 tests/data/community_evals_converter/aggregate.jsonl
 create mode 100644 tests/data/community_evals_converter/datastore/flat/objects/67/6f/676f4465-ce78-411a-9f5a-c97b3d2eac4f.json
 create mode 100644 tests/test_community_evals_converter.py
 create mode 100644 tools/hf-community-evals/README.md
 create mode 100644 tools/hf-community-evals/community_evals_converter.py

diff --git a/pyproject.toml b/pyproject.toml
index aa31551ac..450f30f30 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
     "numpy>=2.4.1",
     "pandas>=2.3.3",
     "pydantic>=2.12.5,<3.0.0",
+    "pyyaml>=6.0.3",
     "requests>=2.32.5,<3.0.0",
     "rich>=14.0.0,<15.0.0",
     "seaborn>=0.13.2",
diff --git a/tests/data/community_evals_converter/aggregate.jsonl b/tests/data/community_evals_converter/aggregate.jsonl
new file mode 100644
index 000000000..5adeea54b
--- /dev/null
+++ b/tests/data/community_evals_converter/aggregate.jsonl
@@ -0,0 +1 @@
+{"benchmark":"MMLU-Pro","eval_schema_version":"0.2.2","legacy_path":"data/MMLU-Pro/01-ai/yi-1.5-34b-chat/676f4465-ce78-411a-9f5a-c97b3d2eac4f.json","object_path":"flat/objects/67/6f/676f4465-ce78-411a-9f5a-c97b3d2eac4f.json","object_uuid":"676f4465-ce78-411a-9f5a-c97b3d2eac4f","record_type":"aggregate","sha256":"a9cc2e4399f182f2e8d1a6198248e124ceafee6f70cbf5ddf31e76d1e74e6f94","size_bytes":23648}
diff --git a/tests/data/community_evals_converter/datastore/flat/objects/67/6f/676f4465-ce78-411a-9f5a-c97b3d2eac4f.json b/tests/data/community_evals_converter/datastore/flat/objects/67/6f/676f4465-ce78-411a-9f5a-c97b3d2eac4f.json
new file mode 100644
index 000000000..62372ec22
--- /dev/null
+++ b/tests/data/community_evals_converter/datastore/flat/objects/67/6f/676f4465-ce78-411a-9f5a-c97b3d2eac4f.json
@@ -0,0 +1,590 @@
+{
+  "schema_version": "0.2.2",
+  "evaluation_id": "mmlu-pro/01-ai_yi-1.5-34b-chat/tiger-lab/1777613486.918081",
+  "retrieved_timestamp": "1777613486.918081",
+  "source_metadata": {
+    "source_name": "MMLU-Pro Leaderboard",
+    "source_type": "documentation",
+    "source_organization_name": "TIGER-Lab",
+    "source_organization_url": "https://tiger-ai-lab.github.io",
+    "evaluator_relationship": "third_party",
+    "additional_details": {
+      "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+      "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+      "paper_url": "https://arxiv.org/abs/2406.01574",
+      "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+      "leaderboard_data_source": "TIGER-Lab"
+    }
+  },
+  "eval_library": {
+    "name": "MMLU-Pro leaderboard (TIGER-Lab)",
+    "version": "unknown"
+  },
+  "model_info": {
+    "name": "Yi-1.5-34B-Chat",
+    "id": "01-ai/yi-1.5-34b-chat",
+    "developer": "01-ai",
+    "additional_details": {
+      "raw_model_name": "Yi-1.5-34B-Chat",
+      "size_billions_parameters": "34.0",
+      "leaderboard_data_source": "TIGER-Lab"
+    }
+  },
+  "evaluation_results": [
+    {
+      "evaluation_result_id": "mmlu_pro/overall",
+      "evaluation_name": "MMLU-Pro (overall)",
+      "source_data": {
+        "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)",
+        "source_type": "hf_dataset",
+        "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission",
+        "hf_split": "train",
+        "additional_details": {
+          "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+          "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+          "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro",
+          "paper_url": "https://arxiv.org/abs/2406.01574",
+          "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+          "dataset_total_questions": "12000",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "metric_config": {
+        "evaluation_description": "Overall accuracy across the ~12,000-question MMLU-Pro benchmark, evaluated 5-shot with chain-of-thought.",
+        "metric_id": "mmlu_pro/overall",
+        "metric_name": "MMLU-Pro (overall)",
+        "metric_kind": "accuracy",
+        "metric_unit": "proportion",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0,
+        "additional_details": {
+          "aggregation": "accuracy_over_subset",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "score_details": {
+        "score": 0.5229
+      }
+    },
+    {
+      "evaluation_result_id": "mmlu_pro/biology",
+      "evaluation_name": "MMLU-Pro (Biology)",
+      "source_data": {
+        "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)",
+        "source_type": "hf_dataset",
+        "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission",
+        "hf_split": "train",
+        "additional_details": {
+          "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+          "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+          "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro",
+          "paper_url": "https://arxiv.org/abs/2406.01574",
+          "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+          "dataset_total_questions": "12000",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "metric_config": {
+        "evaluation_description": "Accuracy on the MMLU-Pro Biology subset, evaluated 5-shot with chain-of-thought.",
+        "metric_id": "mmlu_pro/biology",
+        "metric_name": "MMLU-Pro (Biology)",
+        "metric_kind": "accuracy",
+        "metric_unit": "proportion",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0,
+        "additional_details": {
+          "aggregation": "accuracy_over_subset",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "score_details": {
+        "score": 0.7141
+      }
+    },
+    {
+      "evaluation_result_id": "mmlu_pro/business",
+      "evaluation_name": "MMLU-Pro (Business)",
+      "source_data": {
+        "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)",
+        "source_type": "hf_dataset",
+        "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission",
+        "hf_split": "train",
+        "additional_details": {
+          "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+          "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+          "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro",
+          "paper_url": "https://arxiv.org/abs/2406.01574",
+          "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+          "dataset_total_questions": "12000",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "metric_config": {
+        "evaluation_description": "Accuracy on the MMLU-Pro Business subset, evaluated 5-shot with chain-of-thought.",
+        "metric_id": "mmlu_pro/business",
+        "metric_name": "MMLU-Pro (Business)",
+        "metric_kind": "accuracy",
+        "metric_unit": "proportion",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0,
+        "additional_details": {
+          "aggregation": "accuracy_over_subset",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "score_details": {
+        "score": 0.5843
+      }
+    },
+    {
+      "evaluation_result_id": "mmlu_pro/chemistry",
+      "evaluation_name": "MMLU-Pro (Chemistry)",
+      "source_data": {
+        "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)",
+        "source_type": "hf_dataset",
+        "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission",
+        "hf_split": "train",
+        "additional_details": {
+          "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+          "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+          "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro",
+          "paper_url": "https://arxiv.org/abs/2406.01574",
+          "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+          "dataset_total_questions": "12000",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "metric_config": {
+        "evaluation_description": "Accuracy on the MMLU-Pro Chemistry subset, evaluated 5-shot with chain-of-thought.",
+        "metric_id": "mmlu_pro/chemistry",
+        "metric_name": "MMLU-Pro (Chemistry)",
+        "metric_kind": "accuracy",
+        "metric_unit": "proportion",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0,
+        "additional_details": {
+          "aggregation": "accuracy_over_subset",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "score_details": {
+        "score": 0.4753
+      }
+    },
+    {
+      "evaluation_result_id": "mmlu_pro/computer_science",
+      "evaluation_name": "MMLU-Pro (Computer Science)",
+      "source_data": {
+        "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)",
+        "source_type": "hf_dataset",
+        "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission",
+        "hf_split": "train",
+        "additional_details": {
+          "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+          "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+          "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro",
+          "paper_url": "https://arxiv.org/abs/2406.01574",
+          "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+          "dataset_total_questions": "12000",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "metric_config": {
+        "evaluation_description": "Accuracy on the MMLU-Pro Computer Science subset, evaluated 5-shot with chain-of-thought.",
+        "metric_id": "mmlu_pro/computer_science",
+        "metric_name": "MMLU-Pro (Computer Science)",
+        "metric_kind": "accuracy",
+        "metric_unit": "proportion",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0,
+        "additional_details": {
+          "aggregation": "accuracy_over_subset",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "score_details": {
+        "score": 0.539
+      }
+    },
+    {
+      "evaluation_result_id": "mmlu_pro/economics",
+      "evaluation_name": "MMLU-Pro (Economics)",
+      "source_data": {
+        "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)",
+        "source_type": "hf_dataset",
+        "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission",
+        "hf_split": "train",
+        "additional_details": {
+          "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+          "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+          "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro",
+          "paper_url": "https://arxiv.org/abs/2406.01574",
+          "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+          "dataset_total_questions": "12000",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "metric_config": {
+        "evaluation_description": "Accuracy on the MMLU-Pro Economics subset, evaluated 5-shot with chain-of-thought.",
+        "metric_id": "mmlu_pro/economics",
+        "metric_name": "MMLU-Pro (Economics)",
+        "metric_kind": "accuracy",
+        "metric_unit": "proportion",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0,
+        "additional_details": {
+          "aggregation": "accuracy_over_subset",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "score_details": {
+        "score": 0.6457
+      }
+    },
+    {
+      "evaluation_result_id": "mmlu_pro/engineering",
+      "evaluation_name": "MMLU-Pro (Engineering)",
+      "source_data": {
+        "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)",
+        "source_type": "hf_dataset",
+        "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission",
+        "hf_split": "train",
+        "additional_details": {
+          "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+          "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+          "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro",
+          "paper_url": "https://arxiv.org/abs/2406.01574",
+          "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+          "dataset_total_questions": "12000",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "metric_config": {
+        "evaluation_description": "Accuracy on the MMLU-Pro Engineering subset, evaluated 5-shot with chain-of-thought.",
+        "metric_id": "mmlu_pro/engineering",
+        "metric_name": "MMLU-Pro (Engineering)",
+        "metric_kind": "accuracy",
+        "metric_unit": "proportion",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0,
+        "additional_details": {
+          "aggregation": "accuracy_over_subset",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "score_details": {
+        "score": 0.3437
+      }
+    },
+    {
+      "evaluation_result_id": "mmlu_pro/health",
+      "evaluation_name": "MMLU-Pro (Health)",
+      "source_data": {
+        "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)",
+        "source_type": "hf_dataset",
+        "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission",
+        "hf_split": "train",
+        "additional_details": {
+          "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+          "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+          "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro",
+          "paper_url": "https://arxiv.org/abs/2406.01574",
+          "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+          "dataset_total_questions": "12000",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "metric_config": {
+        "evaluation_description": "Accuracy on the MMLU-Pro Health subset, evaluated 5-shot with chain-of-thought.",
+        "metric_id": "mmlu_pro/health",
+        "metric_name": "MMLU-Pro (Health)",
+        "metric_kind": "accuracy",
+        "metric_unit": "proportion",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0,
+        "additional_details": {
+          "aggregation": "accuracy_over_subset",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "score_details": {
+        "score": 0.5819
+      }
+    },
+    {
+      "evaluation_result_id": "mmlu_pro/history",
+      "evaluation_name": "MMLU-Pro (History)",
+      "source_data": {
+        "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)",
+        "source_type": "hf_dataset",
+        "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission",
+        "hf_split": "train",
+        "additional_details": {
+          "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+          "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+          "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro",
+          "paper_url": "https://arxiv.org/abs/2406.01574",
+          "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+          "dataset_total_questions": "12000",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "metric_config": {
+        "evaluation_description": "Accuracy on the MMLU-Pro History subset, evaluated 5-shot with chain-of-thought.",
+        "metric_id": "mmlu_pro/history",
+        "metric_name": "MMLU-Pro (History)",
+        "metric_kind": "accuracy",
+        "metric_unit": "proportion",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0,
+        "additional_details": {
+          "aggregation": "accuracy_over_subset",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "score_details": {
+        "score": 0.5276
+      }
+    },
+    {
+      "evaluation_result_id": "mmlu_pro/law",
+      "evaluation_name": "MMLU-Pro (Law)",
+      "source_data": {
+        "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)",
+        "source_type": "hf_dataset",
+        "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission",
+        "hf_split": "train",
+        "additional_details": {
+          "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+          "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+          "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro",
+          "paper_url": "https://arxiv.org/abs/2406.01574",
+          "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+          "dataset_total_questions": "12000",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "metric_config": {
+        "evaluation_description": "Accuracy on the MMLU-Pro Law subset, evaluated 5-shot with chain-of-thought.",
+        "metric_id": "mmlu_pro/law",
+        "metric_name": "MMLU-Pro (Law)",
+        "metric_kind": "accuracy",
+        "metric_unit": "proportion",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0,
+        "additional_details": {
+          "aggregation": "accuracy_over_subset",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "score_details": {
+        "score": 0.3479
+      }
+    },
+    {
+      "evaluation_result_id": "mmlu_pro/math",
+      "evaluation_name": "MMLU-Pro (Math)",
+      "source_data": {
+        "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)",
+        "source_type": "hf_dataset",
+        "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission",
+        "hf_split": "train",
+        "additional_details": {
+          "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+          "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+          "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro",
+          "paper_url": "https://arxiv.org/abs/2406.01574",
+          "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+          "dataset_total_questions": "12000",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "metric_config": {
+        "evaluation_description": "Accuracy on the MMLU-Pro Math subset, evaluated 5-shot with chain-of-thought.",
+        "metric_id": "mmlu_pro/math",
+        "metric_name": "MMLU-Pro (Math)",
+        "metric_kind": "accuracy",
+        "metric_unit": "proportion",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0,
+        "additional_details": {
+          "aggregation": "accuracy_over_subset",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "score_details": {
+        "score": 0.5618
+      }
+    },
+    {
+      "evaluation_result_id": "mmlu_pro/philosophy",
+      "evaluation_name": "MMLU-Pro (Philosophy)",
+      "source_data": {
+        "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)",
+        "source_type": "hf_dataset",
+        "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission",
+        "hf_split": "train",
+        "additional_details": {
+          "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+          "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+          "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro",
+          "paper_url": "https://arxiv.org/abs/2406.01574",
+          "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+          "dataset_total_questions": "12000",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "metric_config": {
+        "evaluation_description": "Accuracy on the MMLU-Pro Philosophy subset, evaluated 5-shot with chain-of-thought.",
+        "metric_id": "mmlu_pro/philosophy",
+        "metric_name": "MMLU-Pro (Philosophy)",
+        "metric_kind": "accuracy",
+        "metric_unit": "proportion",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0,
+        "additional_details": {
+          "aggregation": "accuracy_over_subset",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "score_details": {
+        "score": 0.4629
+      }
+    },
+    {
+      "evaluation_result_id": "mmlu_pro/physics",
+      "evaluation_name": "MMLU-Pro (Physics)",
+      "source_data": {
+        "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)",
+        "source_type": "hf_dataset",
+        "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission",
+        "hf_split": "train",
+        "additional_details": {
+          "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+          "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+          "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro",
+          "paper_url": "https://arxiv.org/abs/2406.01574",
+          "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+          "dataset_total_questions": "12000",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "metric_config": {
+        "evaluation_description": "Accuracy on the MMLU-Pro Physics subset, evaluated 5-shot with chain-of-thought.",
+        "metric_id": "mmlu_pro/physics",
+        "metric_name": "MMLU-Pro (Physics)",
+        "metric_kind": "accuracy",
+        "metric_unit": "proportion",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0,
+        "additional_details": {
+          "aggregation": "accuracy_over_subset",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "score_details": {
+        "score": 0.4935
+      }
+    },
+    {
+      "evaluation_result_id": "mmlu_pro/psychology",
+      "evaluation_name": "MMLU-Pro (Psychology)",
+      "source_data": {
+        "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)",
+        "source_type": "hf_dataset",
+        "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission",
+        "hf_split": "train",
+        "additional_details": {
+          "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+          "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+          "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro",
+          "paper_url": "https://arxiv.org/abs/2406.01574",
+          "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+          "dataset_total_questions": "12000",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "metric_config": {
+        "evaluation_description": "Accuracy on the MMLU-Pro Psychology subset, evaluated 5-shot with chain-of-thought.",
+        "metric_id": "mmlu_pro/psychology",
+        "metric_name": "MMLU-Pro (Psychology)",
+        "metric_kind": "accuracy",
+        "metric_unit": "proportion",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0,
+        "additional_details": {
+          "aggregation": "accuracy_over_subset",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "score_details": {
+        "score": 0.6429
+      }
+    },
+    {
+      "evaluation_result_id": "mmlu_pro/other",
+      "evaluation_name": "MMLU-Pro (Other)",
+      "source_data": {
+        "dataset_name": "MMLU-Pro leaderboard submissions (TIGER-Lab)",
+        "source_type": "hf_dataset",
+        "hf_repo": "TIGER-Lab/mmlu_pro_leaderboard_submission",
+        "hf_split": "train",
+        "additional_details": {
+          "results_csv_url": "https://huggingface.co/datasets/TIGER-Lab/mmlu_pro_leaderboard_submission/resolve/main/results.csv",
+          "leaderboard_space_url": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
+          "benchmark_hf_repo": "TIGER-Lab/MMLU-Pro",
+          "paper_url": "https://arxiv.org/abs/2406.01574",
+          "github_url": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+          "dataset_total_questions": "12000",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "metric_config": {
+        "evaluation_description": "Accuracy on the MMLU-Pro Other subset, evaluated 5-shot with chain-of-thought.",
+        "metric_id": "mmlu_pro/other",
+        "metric_name": "MMLU-Pro (Other)",
+        "metric_kind": "accuracy",
+        "metric_unit": "proportion",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0.0,
+        "max_score": 1.0,
+        "additional_details": {
+          "aggregation": "accuracy_over_subset",
+          "prompt_style": "5-shot CoT"
+        }
+      },
+      "score_details": {
+        "score": 0.5162
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tests/test_community_evals_converter.py b/tests/test_community_evals_converter.py
new file mode 100644
index 000000000..299f60ebe
--- /dev/null
+++ b/tests/test_community_evals_converter.py
@@ -0,0 +1,1521 @@
+from __future__ import annotations
+
+import hashlib
+import io
+import json
+from pathlib import Path
+
+import pytest
+import yaml
+from rich.console import Console
+from rich.progress import Progress
+
+from every_eval_ever import cli
+from tools import community_evals_converter
+
+FIXTURE_DIR = Path(__file__).parent / 'data' / 'community_evals_converter'
+
+
+class FakeRepoInfo:  # Minimal repo-info stand-in that exposes only a commit `sha`.
+    def __init__(self, *, sha: str) -> None:
+        self.sha = sha
+
+
+class FakeHfApi:  # In-memory fake of the Hub API methods these tests call; logs each call.
+    def __init__(
+        self,
+        *,
+        datastore_sha: str = 'abc123',  # sha handed back by repo_info()
+        missing_models: set[str] | None = None,  # repo ids for which model_info() raises
+        repo_files_by_revision: dict[tuple[str, str], list[str]] | None = None,  # keyed by (repo_id, revision)
+        dataset_files_by_revision: dict[tuple[str, str], list[str]] | None = None,  # keyed by (repo_id, revision)
+        discussions: dict[str, list[FakeDiscussion]] | None = None,  # keyed by repo_id
+    ) -> None:
+        self.datastore_sha = datastore_sha
+        self.missing_models = missing_models or set()
+        self.repo_files_by_revision = repo_files_by_revision or {}
+        self.dataset_files_by_revision = dataset_files_by_revision or {}
+        self.discussions = discussions or {}
+        self.model_info_calls: list[str] = []  # repo ids passed to model_info()
+        self.repo_info_calls: list[dict] = []  # kwargs of every repo_info() call
+        self.discussion_calls: list[str] = []  # repo ids passed to get_repo_discussions()
+        self.commits: list[dict] = []  # kwargs of every create_commit() call
+
+    def repo_info(self, **kwargs):  # Record the call and return the configured sha.
+        self.repo_info_calls.append(kwargs)
+        assert kwargs['repo_type'] == 'dataset'  # only dataset lookups are expected
+        assert kwargs['revision'] == 'main'  # only lookups on 'main' are expected
+        return FakeRepoInfo(sha=self.datastore_sha)
+
+    def model_info(self, repo_id: str):  # Record the call; raise for repos listed as missing.
+        self.model_info_calls.append(repo_id)
+        if repo_id in self.missing_models:
+            raise RuntimeError('missing model')
+        return {'id': repo_id}
+
+    def list_repo_files(  # Return the configured file list for a dataset or model repo.
+        self,
+        repo_id: str,
+        repo_type: str = 'model',
+        revision: str | None = None,
+    ):
+        if repo_type == 'dataset':
+            return self.dataset_files_by_revision.get(
+                (repo_id, revision or 'main'), []  # a missing revision falls back to 'main'
+            )
+        assert repo_type == 'model'  # any other repo type is unexpected
+        return self.repo_files_by_revision.get((repo_id, revision or 'main'), [])
+
+    def list_repo_tree(  # Yield FakeRepoFile entries for a model repo's configured files.
+        self,
+        repo_id: str,
+        path_in_repo: str | None = None,
+        *,
+        recursive: bool = False,
+        expand: bool = False,
+        revision: str | None = None,
+        repo_type: str = 'model',
+        token: bool | str | None = None,
+    ):
+        assert path_in_repo == '.eval_results'  # the asserts below pin the exact call shape expected
+        assert recursive is True
+        assert expand is False
+        assert repo_type == 'model'
+        assert token is True
+        for path in self.repo_files_by_revision.get((repo_id, revision or 'main'), []):
+            yield FakeRepoFile(path=path, blob_id=f'{revision}:{path}')  # NOTE(review): raw `revision` here, so None gives 'None:<path>' while the lookup uses 'main'
+
+    def get_repo_discussions(self, repo_id: str, **_kwargs):  # Extra keyword arguments are accepted and ignored.
+        self.discussion_calls.append(repo_id)
+        return self.discussions.get(repo_id, [])
+
+    def create_commit(self, **kwargs):  # Record the kwargs and return canned PR/commit URLs.
+        self.commits.append(kwargs)
+        return FakeCommitInfo(
+            pr_url=f'https://huggingface.co/{kwargs["repo_id"]}/discussions/1',
+            commit_url=f'https://huggingface.co/{kwargs["repo_id"]}/commit/abc',
+        )
+
+
+class FakeCommitInfo:  # Stand-in for the object create_commit() returns.
+    def __init__(self, *, pr_url: str, commit_url: str) -> None:
+        self.pr_url = pr_url
+        self.commit_url = commit_url
+
+    def __str__(self) -> str:  # str() of the fake yields the PR URL.
+        return self.pr_url
+
+
+class FakeRepoFile:  # Fake repo tree entry yielded by FakeHfApi.list_repo_tree().
+    def __init__(self, *, path: str, blob_id: str) -> None:
+        self.path = path
+        self.rfilename = path  # same value exposed under a second attribute name
+        self.blob_id = blob_id
+        self.size = 1  # fixed placeholder size
+
+
+class FakeDiscussion:  # Fake discussion record; defaults describe PR 1 on google/gemma-2b-it.
+    def __init__(
+        self,
+        *,
+        title: str = 'Add EvalEval community eval results',
+        git_reference: str = 'refs/pr/1',
+        url: str = 'https://huggingface.co/google/gemma-2b-it/discussions/1',
+        num: int = 1,
+    ) -> None:
+        self.title = title
+        self.git_reference = git_reference
+        self.url = url
+        self.num = num
+
+
+class RecordingProgress(community_evals_converter.ReviewProgress):  # Progress double that records what it is told.
+    def __init__(self) -> None:  # NOTE(review): super().__init__() is not called; confirm ReviewProgress needs no setup
+        self.descriptions: list[str] = []  # every description seen, from add_task() and update()
+        self.advances: list[int] = []  # every `advance` passed to update(), in call order
+        self.task_initial_descriptions: dict[int, str] = {}  # task_id -> description given at creation
+        self.advance_by_task: dict[int, int] = {}  # task_id -> running sum of advances
+
+    def add_task(self, description: str, total: int | None = None) -> int:  # `total` is accepted but not stored
+        task_id = len(self.task_initial_descriptions) + 1  # ids are sequential, starting at 1
+        self.task_initial_descriptions[task_id] = description
+        self.advance_by_task[task_id] = 0
+        self.descriptions.append(description)
+        return task_id
+
+    def update(  # Log the advance and, when given, the new description; `total` is ignored.
+        self,
+        task_id: int,
+        *,
+        advance: int = 0,
+        description: str | None = None,
+        total: int | None = None,
+    ) -> None:
+        self.advances.append(advance)  # logged even when it is the default 0
+        self.advance_by_task[task_id] = (
+            self.advance_by_task.get(task_id, 0) + advance  # tolerates ids never passed to add_task()
+        )
+        if description is not None:
+            self.descriptions.append(description)
+
+
+def _aggregate(  # Build an aggregate record dict holding a single MMLU-Pro result.
+    *,
+    model_id: str = 'google/gemma-2b-it',
+    score: float = 0.641,
+) -> dict:
+    return {
+        'schema_version': '0.2.2',
+        'evaluation_id': 'openeval/google_gemma-2b-it/123',  # fixed; does not follow `model_id`
+        'evaluation_timestamp': '2024-07-16T00:00:00Z',
+        'retrieved_timestamp': '1234567890',
+        'source_metadata': {
+            'source_type': 'evaluation_run',
+            'source_organization_name': 'EvalEval',
+            'evaluator_relationship': 'third_party',
+        },
+        'eval_library': {'name': 'openeval', 'version': 'unknown'},
+        'model_info': {
+            'name': model_id.rsplit('/', 1)[-1],  # text after the last '/'
+            'id': model_id,
+            'developer': model_id.split('/', 1)[0],  # text before the first '/'
+            'inference_platform': 'huggingface',
+        },
+        'evaluation_results': [
+            {
+                'evaluation_result_id': 'mmlu-pro::chain-of-thought-correctness',
+                'evaluation_name': 'MMLU-Pro',
+                'source_data': {
+                    'dataset_name': 'MMLU-Pro',
+                    'source_type': 'hf_dataset',
+                    'hf_repo': 'TIGER-Lab/MMLU-Pro',
+                },
+                'metric_config': {
+                    'lower_is_better': False,
+                    'score_type': 'binary',
+                    'metric_unit': 'proportion',
+                    'min_score': 0.0,
+                    'max_score': 1.0,
+                },
+                'score_details': {'score': score},  # the only field driven by `score`
+            }
+        ],
+    }
+
+
+def _gpqa_aggregate(*, dataset_name: str = 'GPQA') -> dict:  # _aggregate() with its results swapped for one GPQA result.
+    record = _aggregate()  # everything outside 'evaluation_results' keeps the _aggregate() defaults
+    record['evaluation_results'] = [
+        {
+            'evaluation_result_id': 'gpqa::chain-of-thought-correctness',
+            'evaluation_name': dataset_name,  # `dataset_name` feeds both name fields
+            'source_data': {
+                'dataset_name': dataset_name,
+                'source_type': 'hf_dataset',
+                'hf_repo': 'Idavidrein/gpqa',
+            },
+            'metric_config': {
+                'lower_is_better': False,
+                'score_type': 'binary',
+                'metric_unit': 'proportion',
+                'min_score': 0.0,
+                'max_score': 1.0,
+            },
+            'score_details': {'score': 0.5},
+        }
+    ]
+    return record
+
+
+def _write_index_row(  # Write `record` as a flat object plus a one-row index; returns (datastore, index_jsonl).
+    tmp_path: Path,
+    record: dict,
+    *,
+    object_uuid: str = '676f4465-ce78-411a-9f5a-c97b3d2eac4f',
+    row_overrides: dict | None = None,
+) -> tuple[Path, Path]:
+    datastore = tmp_path / 'datastore'
+    object_path = (
+        datastore
+        / 'flat'
+        / 'objects'
+        / object_uuid[:2]  # two directory levels taken from the first four uuid characters
+        / object_uuid[2:4]
+        / f'{object_uuid}.json'
+    )
+    object_path.parent.mkdir(parents=True, exist_ok=True)
+    data = json.dumps(record).encode('utf-8')
+    object_path.write_bytes(data)
+
+    index_jsonl = tmp_path / 'aggregate.jsonl'  # the index sits beside, not inside, the datastore dir
+    row = {
+        'benchmark': 'MMLU-Pro',
+        'eval_schema_version': record['schema_version'],
+        'legacy_path': f'data/MMLU-Pro/google/gemma-2b-it/{object_uuid}.json',
+        'object_path': object_path.relative_to(datastore).as_posix(),
+        'object_uuid': object_uuid,
+        'record_type': 'aggregate',
+        'sha256': hashlib.sha256(data).hexdigest(),  # hash and size describe the exact bytes written above
+        'size_bytes': len(data),
+    }
+    if row_overrides:
+        row.update(row_overrides)  # applied last so a test can replace or add any field
+    index_jsonl.write_text(json.dumps(row) + '\n', encoding='utf-8')
+    return datastore, index_jsonl
+
+
+def _fake_download(datastore: Path):  # Return a download callable that serves files from the local `datastore` dir.
+    def download_file(**kwargs) -> str:
+        assert kwargs['repo_id'] == 'evaleval/EEE_datastore'  # only this pinned dataset repo is expected
+        assert kwargs['repo_type'] == 'dataset'
+        assert kwargs['revision'] == 'abc123'  # matches FakeHfApi's default datastore_sha
+        path = datastore / kwargs['filename']
+        if not path.exists():
+            raise FileNotFoundError(path)
+        return path.as_posix()  # returns the local path as a string
+
+    return download_file
+
+
+def _write_collection_rows(  # Write one object per record plus a by_collection index; returns (datastore, collection_jsonl).
+    tmp_path: Path,
+    records: list[dict],
+    *,
+    collection_name: str = 'MMLU-Pro',
+    include_instance_level: bool = False,
+) -> tuple[Path, Path]:
+    datastore = tmp_path / 'datastore'
+    rows = []
+    for index, record in enumerate(records):
+        object_uuid = f'676f4465-ce78-411a-9f5a-c97b3d2eac{index:03d}'  # NOTE(review): final group is 13 chars, not the canonical 12
+        object_path = (
+            datastore
+            / 'flat'
+            / 'objects'
+            / object_uuid[:2]  # same two-level layout as _write_index_row
+            / object_uuid[2:4]
+            / f'{object_uuid}.json'
+        )
+        object_path.parent.mkdir(parents=True, exist_ok=True)
+        data = json.dumps(record).encode('utf-8')
+        object_path.write_bytes(data)
+        row = {
+            'benchmark': 'collection-benchmark',  # fixed placeholder, unrelated to `collection_name`
+            'eval_schema_version': record['schema_version'],
+            'legacy_path': (
+                f'data/{collection_name}/{record["model_info"]["id"]}/'
+                f'{object_uuid}.json'
+            ),
+            'object_path': object_path.relative_to(datastore).as_posix(),
+            'object_uuid': object_uuid,
+            'record_type': 'aggregate',
+            'sha256': hashlib.sha256(data).hexdigest(),
+            'size_bytes': len(data),
+            'instance_level_available': False,  # flipped below when instance data is written
+        }
+        if include_instance_level:
+            instance_path = object_path.with_name(f'{object_uuid}_samples.jsonl')  # sibling of the object file
+            instance_data = (
+                json.dumps(
+                    {
+                        'schema_version': 'instance_level_eval_0.2.2',
+                        'evaluation_id': record['evaluation_id'],
+                        'model_id': record['model_info']['id'],
+                    }
+                )
+                + '\n'
+            ).encode('utf-8')
+            instance_path.write_bytes(instance_data)
+            row.update(
+                {
+                    'instance_level_available': True,
+                    'instance_level_path': (
+                        instance_path.relative_to(datastore).as_posix()
+                    ),
+                    'instance_level_size_bytes': len(instance_data),
+                    'instance_sha': hashlib.sha256(instance_data).hexdigest(),  # sha256 of the samples file bytes
+                }
+            )
+        rows.append(row)
+
+    collection_jsonl = (
+        datastore
+        / 'flat'
+        / 'indexes'
+        / 'by_collection'
+        / f'{collection_name}.jsonl'
+    )
+    collection_jsonl.parent.mkdir(parents=True, exist_ok=True)
+    collection_jsonl.write_text(
+        ''.join(json.dumps(row) + '\n' for row in rows),  # one JSON row per line
+        encoding='utf-8',
+    )
+    return datastore, collection_jsonl
+
+
+def _fake_download_with_model_files(  # Like _fake_download, but also serves model-repo files from a mapping.
+    datastore: Path,
+    model_files: dict[tuple[str, str, str], Path],  # keyed by (repo_id, revision, filename)
+):
+    def download_file(**kwargs) -> str:
+        if kwargs['repo_type'] == 'dataset':
+            assert kwargs['repo_id'] == 'evaleval/EEE_datastore'
+            assert kwargs['revision'] == 'abc123'
+            path = datastore / kwargs['filename']
+            if not path.exists():
+                raise FileNotFoundError(path)
+            return path.as_posix()
+        if kwargs['repo_type'] == 'model':
+            key = (kwargs['repo_id'], kwargs['revision'], kwargs['filename'])
+            return model_files[key].as_posix()  # an unregistered key raises KeyError
+        raise AssertionError(f'unexpected repo_type {kwargs["repo_type"]}')
+
+    return download_file
+
+
+def test_parse_benchmarks_aliases_and_rejects_unknown() -> None:  # Alias names map to canonical ones; unknown names raise.
+    assert community_evals_converter.parse_benchmarks('gpqa-diamond,mmlu_pro') == [
+        'gpqa',  # 'gpqa-diamond' is expected to come back as 'gpqa'
+        'mmlu_pro',
+    ]
+
+    with pytest.raises(community_evals_converter.HFEvalsError, match='Unsupported benchmark'):
+        community_evals_converter.parse_benchmarks('alphaxiv')
+
+
+def test_parse_datastore_locator_accepts_optional_revision() -> None:  # '<repo>@<rev>' and bare '<repo>' both parse.
+    assert community_evals_converter.parse_datastore_locator(
+        'evaleval/EEE_datastore@abc123'
+    ) == ('evaleval/EEE_datastore', 'abc123')
+    assert community_evals_converter.parse_datastore_locator('evaleval/EEE_datastore') == (
+        'evaleval/EEE_datastore',
+        None,  # a bare repo yields no revision
+    )
+
+    with pytest.raises(
+        community_evals_converter.HFEvalsError, match='<hf_dataset_repo>\\[@<revision>\\]'
+    ):
+        community_evals_converter.parse_datastore_locator('bad@repo@abc123')  # a second '@' must be rejected
+
+
+def test_resolve_datastore_locator_uses_latest_commit_for_bare_repo() -> None:  # A bare repo resolves to the sha from repo_info().
+    api = FakeHfApi(datastore_sha='resolvedabc')
+
+    assert community_evals_converter.resolve_datastore_locator(
+        'evaleval/EEE_datastore', api=api
+    ) == ('evaleval/EEE_datastore', 'resolvedabc')
+    assert api.repo_info_calls == [  # exactly one lookup, made against 'main'
+        {
+            'repo_id': 'evaleval/EEE_datastore',
+            'repo_type': 'dataset',
+            'revision': 'main',
+        }
+    ]
+
+
+def test_build_collection_manifest_downloads_collection_jsonl_and_scans_results(  # One record with two results yields two entries.
+    tmp_path: Path,
+) -> None:
+    record = _aggregate()
+    record['evaluation_results'].append(  # add a second, GSM8K result next to the MMLU-Pro one
+        {
+            'evaluation_result_id': 'gsm8k/exact_match',
+            'evaluation_name': 'GSM8K',
+            'source_data': {
+                'dataset_name': 'GSM8K',
+                'source_type': 'hf_dataset',
+                'hf_repo': 'openai/gsm8k',
+            },
+            'metric_config': {
+                'lower_is_better': False,
+                'score_type': 'binary',
+                'metric_unit': 'proportion',
+                'min_score': 0.0,
+                'max_score': 1.0,
+            },
+            'score_details': {'score': 0.72},
+        }
+    )
+    datastore, _collection_jsonl = _write_collection_rows(
+        tmp_path,
+        [record],
+        collection_name='MMLU-Pro',
+        include_instance_level=True,
+    )
+
+    manifest = community_evals_converter.build_collection_manifest(
+        collection_name='MMLU-Pro',
+        datastore='evaleval/EEE_datastore',  # bare repo: the revision comes from the fake's repo_info()
+        api=FakeHfApi(datastore_sha='abc123'),
+        download_file=_fake_download(datastore),
+    )
+
+    assert manifest['source_url_mode'] == 'online_collection_index_jsonl'
+    assert manifest['collection_jsonl'] == (
+        'flat/indexes/by_collection/MMLU-Pro.jsonl'
+    )
+    assert {entry['benchmark'] for entry in manifest['entries']} == {
+        'mmlu_pro',
+        'gsm8k',
+    }
+    assert {entry['target_path'] for entry in manifest['entries']} == {
+        '.eval_results/mmlu_pro.yaml',
+        '.eval_results/gsm8k.yaml',
+    }
+    assert all(entry['instance_level_available'] is True for entry in manifest['entries'])  # row-level provenance reaches every entry
+    assert all('instance_sha' in entry for entry in manifest['entries'])
+
+
+def test_build_collection_manifest_requires_collection_jsonl(  # A <name>/aggregate.jsonl directory is not accepted.
+    tmp_path: Path,
+) -> None:
+    datastore, aggregate_jsonl = _write_index_row(tmp_path, _aggregate())
+    aggregate_dir = (
+        datastore / 'flat' / 'indexes' / 'by_collection' / 'MMLU-Pro'  # a directory, not MMLU-Pro.jsonl
+    )
+    aggregate_dir.mkdir(parents=True)
+    (aggregate_dir / 'aggregate.jsonl').write_text(
+        aggregate_jsonl.read_text(encoding='utf-8'),
+        encoding='utf-8',
+    )
+
+    with pytest.raises(
+        community_evals_converter.HFEvalsError,
+        match='flat/indexes/by_collection/MMLU-Pro\\.jsonl',  # the error must name the expected file
+    ):
+        community_evals_converter.build_collection_manifest(
+            collection_name='MMLU-Pro',
+            datastore='evaleval/EEE_datastore@abc123',
+            api=FakeHfApi(),
+            download_file=_fake_download(datastore),
+        )
+
+
+def test_build_collection_manifest_suggests_nearby_collection_stems(  # A misspelt name produces a suggestion in the error.
+    tmp_path: Path,
+) -> None:
+    datastore, _collection_jsonl = _write_collection_rows(
+        tmp_path,
+        [_aggregate()],
+        collection_name='fibble_arena',
+    )
+
+    with pytest.raises(
+        community_evals_converter.HFEvalsError,
+        match='Nearby collection stems: fibble_arena',
+    ):
+        community_evals_converter.build_collection_manifest(
+            collection_name='fibbl_arena',  # misspelt on purpose (missing 'e')
+            datastore='evaleval/EEE_datastore@abc123',
+            api=FakeHfApi(
+                dataset_files_by_revision={  # the file listing the fake returns for the dataset repo
+                    (
+                        'evaleval/EEE_datastore',
+                        'abc123',
+                    ): [
+                        'flat/indexes/by_collection/fibble_arena.jsonl',
+                        'flat/indexes/by_collection/MMLU-Pro.jsonl',
+                    ]
+                }
+            ),
+            download_file=_fake_download(datastore),
+        )
+
+
+def test_build_collection_manifest_rejects_malformed_instance_provenance(  # Instance data without a hash is an error.
+    tmp_path: Path,
+) -> None:
+    datastore, collection_jsonl = _write_collection_rows(
+        tmp_path,
+        [_aggregate()],
+        include_instance_level=True,
+    )
+    row = json.loads(collection_jsonl.read_text(encoding='utf-8'))  # the file holds exactly one row
+    row.pop('instance_sha')  # leave instance_level_available True but drop its hash
+    collection_jsonl.write_text(json.dumps(row) + '\n', encoding='utf-8')
+
+    with pytest.raises(community_evals_converter.HFEvalsError, match='missing instance_sha'):
+        community_evals_converter.build_collection_manifest(
+            collection_name='MMLU-Pro',
+            datastore='evaleval/EEE_datastore@abc123',
+            api=FakeHfApi(),
+            download_file=_fake_download(datastore),
+        )
+
+
+def test_build_collection_manifest_rejects_path_like_collection_name(  # Names must be bare stems, not files or paths.
+    tmp_path: Path,
+) -> None:
+    datastore, _collection_jsonl = _write_collection_rows(tmp_path, [_aggregate()])
+
+    with pytest.raises(community_evals_converter.HFEvalsError, match='without the \\.jsonl'):
+        community_evals_converter.build_collection_manifest(
+            collection_name='MMLU-Pro.jsonl',  # carries the file extension
+            datastore='evaleval/EEE_datastore@abc123',
+            api=FakeHfApi(),
+            download_file=_fake_download(datastore),
+        )
+
+    with pytest.raises(community_evals_converter.HFEvalsError, match='single by_collection'):
+        community_evals_converter.build_collection_manifest(
+            collection_name='MMLU-Pro/records',  # contains a path separator
+            datastore='evaleval/EEE_datastore@abc123',
+            api=FakeHfApi(),
+            download_file=_fake_download(datastore),
+        )
+
+
+def test_build_collection_manifest_records_url_only_result_as_skipped(  # A url-sourced result is skipped, not converted.
+    tmp_path: Path,
+) -> None:
+    record = _aggregate()
+    record['evaluation_results'][0]['source_data'] = {  # replace the hf_dataset source with a plain URL
+        'dataset_name': 'External Benchmark',
+        'source_type': 'url',
+        'url': ['https://example.com/not-a-hf-benchmark'],
+    }
+    datastore, _collection_jsonl = _write_collection_rows(
+        tmp_path, [record], collection_name='external'
+    )
+
+    manifest = community_evals_converter.build_collection_manifest(
+        collection_name='external',
+        datastore='evaleval/EEE_datastore@abc123',
+        api=FakeHfApi(),
+        download_file=_fake_download(datastore),
+    )
+
+    assert manifest['entries'] == []
+    assert manifest['skipped'][0]['reason'] == 'no_supported_hf_dataset_result'
+
+
+def test_build_index_manifest_downloads_online_record_and_links_source(  # A bare datastore is pinned to the resolved sha.
+    tmp_path: Path,
+) -> None:
+    datastore, index_jsonl = _write_index_row(tmp_path, _aggregate())
+    api = FakeHfApi(datastore_sha='abc123')
+
+    manifest = community_evals_converter.build_index_manifest(
+        index_jsonl=index_jsonl,
+        datastore='evaleval/EEE_datastore',  # no '@revision' given
+        benchmarks=['mmlu_pro'],
+        api=api,
+        download_file=_fake_download(datastore),
+    )
+
+    assert manifest['source_url_mode'] == 'online_flat_index_jsonl'
+    assert manifest['datastore'] == 'evaleval/EEE_datastore@abc123'  # resolved form
+    assert manifest['datastore_input'] == 'evaleval/EEE_datastore'  # the caller's original string
+    assert api.repo_info_calls
+    assert manifest['entries'][0]['target_path'] == '.eval_results/mmlu_pro.yaml'
+    assert manifest['entries'][0]['yaml_entry']['value'] == 64.1  # the fixture score is 0.641
+    assert manifest['entries'][0]['yaml_entry']['source']['url'].startswith(
+        'https://huggingface.co/datasets/evaleval/EEE_datastore/blob/abc123/flat/objects/'
+    )
+
+
+def test_build_index_manifest_accepts_index_directory(tmp_path: Path) -> None:  # A directory holding aggregate.jsonl is accepted.
+    datastore, index_jsonl = _write_index_row(tmp_path, _aggregate())
+    index_dir = tmp_path / 'flat' / 'indexes' / 'by_benchmark' / 'MMLU-Pro'
+    index_dir.mkdir(parents=True)
+    (index_dir / 'aggregate.jsonl').write_text(  # copy the generated index into the directory
+        index_jsonl.read_text(encoding='utf-8'),
+        encoding='utf-8',
+    )
+
+    manifest = community_evals_converter.build_index_manifest(
+        index_jsonl=index_dir,  # pass the directory rather than the file
+        datastore='evaleval/EEE_datastore@abc123',
+        benchmarks=['mmlu_pro'],
+        api=FakeHfApi(),
+        download_file=_fake_download(datastore),
+    )
+
+    assert manifest['index_jsonl'] == (index_dir / 'aggregate.jsonl').as_posix()
+    assert manifest['entries'][0]['flat_object_path'].startswith(
+        'flat/objects/'
+    )
+
+
+def test_build_index_manifest_rejects_index_directory_without_aggregate_jsonl(  # An empty index directory is an error.
+    tmp_path: Path,
+) -> None:
+    index_dir = tmp_path / 'flat' / 'indexes' / 'by_benchmark' / 'MMLU-Pro'
+    index_dir.mkdir(parents=True)  # created empty on purpose
+
+    with pytest.raises(
+        community_evals_converter.HFEvalsError, match='must contain aggregate\\.jsonl'
+    ):
+        community_evals_converter.build_index_manifest(  # no download_file is supplied to this call
+            index_jsonl=index_dir,
+            datastore='evaleval/EEE_datastore@abc123',
+            benchmarks=['mmlu_pro'],
+            api=FakeHfApi(),
+        )
+
+
+def test_build_index_manifest_rejects_direct_url_row(tmp_path: Path) -> None:  # A row located only by 'url' is rejected.
+    datastore, index_jsonl = _write_index_row(
+        tmp_path,
+        _aggregate(),
+        row_overrides={
+            'object_path': None,  # blank the normal locator
+            'url': 'https://huggingface.co/datasets/evaleval/EEE_datastore/blob/main/flat/objects/test.json',
+        },
+    )
+
+    with pytest.raises(community_evals_converter.HFEvalsError, match='unsupported.*url'):
+        community_evals_converter.build_index_manifest(
+            index_jsonl=index_jsonl,
+            datastore='evaleval/EEE_datastore@abc123',
+            benchmarks=['mmlu_pro'],
+            api=FakeHfApi(),
+            download_file=_fake_download(datastore),
+        )
+
+
+def test_build_index_manifest_rejects_local_path_row(tmp_path: Path) -> None:  # A row located by 'local_path' is rejected.
+    datastore, index_jsonl = _write_index_row(
+        tmp_path,
+        _aggregate(),
+        row_overrides={'object_path': None},  # blank the normal locator
+    )
+    aggregate_path = next((datastore / 'flat' / 'objects').rglob('*.json'))  # the single object file written above
+    row = json.loads(index_jsonl.read_text(encoding='utf-8'))
+    row['local_path'] = aggregate_path.relative_to(tmp_path).as_posix()
+    index_jsonl.write_text(json.dumps(row) + '\n', encoding='utf-8')
+
+    with pytest.raises(community_evals_converter.HFEvalsError, match='unsupported.*local_path'):
+        community_evals_converter.build_index_manifest(
+            index_jsonl=index_jsonl,
+            datastore='evaleval/EEE_datastore@abc123',
+            benchmarks=['mmlu_pro'],
+            api=FakeHfApi(),
+            download_file=_fake_download(datastore),
+        )
+
+
+def test_build_index_manifest_rejects_url_even_with_object_path(  # A 'url' field is rejected even beside a valid object_path.
+    tmp_path: Path,
+) -> None:
+    datastore, index_jsonl = _write_index_row(
+        tmp_path,
+        _aggregate(),
+        row_overrides={  # 'object_path' keeps the value the helper generated
+            'url': 'https://huggingface.co/datasets/evaleval/EEE_datastore/blob/main/flat/objects/test.json',
+        },
+    )
+
+    with pytest.raises(community_evals_converter.HFEvalsError, match='unsupported.*url'):
+        community_evals_converter.build_index_manifest(
+            index_jsonl=index_jsonl,
+            datastore='evaleval/EEE_datastore@abc123',
+            benchmarks=['mmlu_pro'],
+            api=FakeHfApi(),
+            download_file=_fake_download(datastore),
+        )
+
+
+def test_build_index_manifest_preserves_subset_from_index_row(  # The row's 'subset' is carried onto the manifest entry.
+    tmp_path: Path,
+) -> None:
+    datastore, index_jsonl = _write_index_row(
+        tmp_path,
+        _aggregate(),
+        row_overrides={'subset': 'overall'},  # extra field added to the index row
+    )
+
+    manifest = community_evals_converter.build_index_manifest(
+        index_jsonl=index_jsonl,
+        datastore='evaleval/EEE_datastore@abc123',
+        benchmarks=['mmlu_pro'],
+        api=FakeHfApi(),
+        download_file=_fake_download(datastore),
+    )
+
+    assert manifest['entries'][0]['subset'] == 'overall'
+
+
+def test_build_index_manifest_uses_gpqa_subset_for_task_id(  # For GPQA the row's subset becomes the YAML task_id.
+    tmp_path: Path,
+) -> None:
+    datastore, index_jsonl = _write_index_row(
+        tmp_path,
+        _gpqa_aggregate(),
+        row_overrides={'benchmark': 'gpqa', 'subset': 'main'},  # replace the helper's 'MMLU-Pro' benchmark
+    )
+
+    manifest = community_evals_converter.build_index_manifest(
+        index_jsonl=index_jsonl,
+        datastore='evaleval/EEE_datastore@abc123',
+        benchmarks=['gpqa'],
+        api=FakeHfApi(),
+        download_file=_fake_download(datastore),
+    )
+
+    yaml_entry = manifest['entries'][0]['yaml_entry']
+    assert manifest['entries'][0]['subset'] == 'main'
+    assert yaml_entry['dataset'] == {
+        'id': 'Idavidrein/gpqa',
+        'task_id': 'main',  # equals the subset set on the index row
+    }
+    assert yaml_entry['notes'] == 'GPQA chain-of-thought'
+
+
+def test_build_index_manifest_rejects_invalid_subset_type(  # A dict-valued 'subset' is rejected.
+    tmp_path: Path,
+) -> None:
+    datastore, index_jsonl = _write_index_row(
+        tmp_path,
+        _aggregate(),
+        row_overrides={'subset': {'name': 'overall'}},  # a dict where other tests pass a string
+    )
+
+    with pytest.raises(community_evals_converter.HFEvalsError, match='subset'):
+        community_evals_converter.build_index_manifest(
+            index_jsonl=index_jsonl,
+            datastore='evaleval/EEE_datastore@abc123',
+            benchmarks=['mmlu_pro'],
+            api=FakeHfApi(),
+            download_file=_fake_download(datastore),
+        )
+
+
+def test_build_index_manifest_accepts_persistent_fixture() -> None:  # Runs against the checked-in fixture under FIXTURE_DIR.
+    manifest = community_evals_converter.build_index_manifest(
+        index_jsonl=FIXTURE_DIR / 'aggregate.jsonl',
+        datastore='evaleval/EEE_datastore@abc123',
+        benchmarks=['mmlu_pro'],
+        api=FakeHfApi(),
+        download_file=_fake_download(FIXTURE_DIR / 'datastore'),  # serve objects from the fixture tree
+    )
+
+    assert len(manifest['entries']) == 1
+    entry = manifest['entries'][0]
+    assert entry['flat_object_path'] == (
+        'flat/objects/67/6f/676f4465-ce78-411a-9f5a-c97b3d2eac4f.json'
+    )
+    assert entry['yaml_entry']['value'] == 52.29  # NOTE(review): presumably the fixture's overall score; confirm against the JSON
+    assert entry['yaml_entry']['dataset'] == {
+        'id': 'TIGER-Lab/MMLU-Pro',
+        'task_id': 'mmlu_pro',
+    }
+
+
+def test_build_index_manifest_fails_on_hash_mismatch(tmp_path: Path) -> None:  # A wrong sha256 in the row is an error.
+    datastore, index_jsonl = _write_index_row(tmp_path, _aggregate())
+    row = json.loads(index_jsonl.read_text(encoding='utf-8'))
+    row['sha256'] = '0' * 64  # replace the real digest with 64 zeros
+    index_jsonl.write_text(json.dumps(row) + '\n', encoding='utf-8')
+
+    with pytest.raises(community_evals_converter.HFEvalsError, match='sha256 mismatch'):
+        community_evals_converter.build_index_manifest(
+            index_jsonl=index_jsonl,
+            datastore='evaleval/EEE_datastore@abc123',
+            benchmarks=['mmlu_pro'],
+            api=FakeHfApi(),
+            download_file=_fake_download(datastore),
+        )
+
+
+def test_review_index_writes_yaml_and_review(tmp_path: Path) -> None:  # The review flow writes a per-model YAML and a review JSON.
+    datastore, index_jsonl = _write_index_row(tmp_path, _aggregate())
+    review = community_evals_converter.review_index_for_hf_evals(
+        index_jsonl=index_jsonl,
+        datastore='evaleval/EEE_datastore@abc123',
+        benchmarks=['mmlu_pro'],
+        manifest_output_path=tmp_path / 'manifest.json',
+        yaml_output_dir=tmp_path / 'yamls',
+        review_output_path=tmp_path / 'review.json',
+        api=FakeHfApi(),
+        download_file=_fake_download(datastore),
+    )
+
+    yaml_path = (  # expected layout: <yaml_output_dir>/<org>/<model>/.eval_results/<benchmark>.yaml
+        tmp_path
+        / 'yamls'
+        / 'google'
+        / 'gemma-2b-it'
+        / '.eval_results'
+        / 'mmlu_pro.yaml'
+    )
+    loaded_yaml = yaml.safe_load(yaml_path.read_text(encoding='utf-8'))
+    loaded_review = json.loads((tmp_path / 'review.json').read_text(encoding='utf-8'))
+
+    assert review['can_open_prs'] is True
+    assert loaded_review['can_open_prs'] is True  # the file on disk agrees with the returned dict
+    assert loaded_yaml[0]['dataset'] == {
+        'id': 'TIGER-Lab/MMLU-Pro',
+        'task_id': 'mmlu_pro',
+    }
+
+
+def test_review_index_writes_yaml_without_reloading_manifest(  # The review flow must not call load_manifest().
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    datastore, index_jsonl = _write_index_row(tmp_path, _aggregate())
+
+    def fail_load_manifest(_path: Path) -> dict:  # replacement that fails the test if it is ever called
+        raise AssertionError('review flow should use the in-memory manifest')
+
+    monkeypatch.setattr(community_evals_converter, 'load_manifest', fail_load_manifest)
+
+    review = community_evals_converter.review_index_for_hf_evals(
+        index_jsonl=index_jsonl,
+        datastore='evaleval/EEE_datastore@abc123',
+        benchmarks=['mmlu_pro'],
+        manifest_output_path=tmp_path / 'manifest.json',
+        yaml_output_dir=tmp_path / 'yamls',
+        review_output_path=tmp_path / 'review.json',
+        api=FakeHfApi(),
+        download_file=_fake_download(datastore),
+    )
+
+    assert review['yaml_count'] == 1
+    assert (tmp_path / 'manifest.json').exists()  # the manifest is still written to disk
+    assert (tmp_path / 'review.json').exists()
+
+
+def test_review_index_reports_missing_model_without_aliasing(tmp_path: Path) -> None:  # A missing model blocks PRs and is reported as-is.
+    record = _aggregate(model_id='local/missing-model')
+    datastore, index_jsonl = _write_index_row(tmp_path, record)
+
+    review = community_evals_converter.review_index_for_hf_evals(
+        index_jsonl=index_jsonl,
+        datastore='evaleval/EEE_datastore@abc123',
+        benchmarks=['mmlu_pro'],
+        manifest_output_path=tmp_path / 'manifest.json',
+        yaml_output_dir=tmp_path / 'yamls',
+        review_output_path=tmp_path / 'review.json',
+        api=FakeHfApi(missing_models={'local/missing-model'}),  # model_info() raises for this repo
+        download_file=_fake_download(datastore),
+    )
+
+    assert review['can_open_prs'] is False
+    assert review['yaml_count'] == 0
+    assert len(review['missing_hf_models']) == 1
+    missing = review['missing_hf_models'][0]
+    assert missing['model_repo'] == 'local/missing-model'
+    assert missing['status'] == 'missing_hf_model'
+    assert 'model_repo_alias_from' not in missing  # no alias field may appear on the report
+
+
+def test_review_collection_suppresses_existing_same_score_from_any_yaml_name(  # An existing identical score is found under any YAML filename.
+    tmp_path: Path,
+) -> None:
+    datastore, _collection_jsonl = _write_collection_rows(
+        tmp_path, [_aggregate()], collection_name='MMLU-Pro'
+    )
+    model_yaml = tmp_path / 'model_main.yaml'  # local file served as the model repo's existing YAML
+    model_yaml.write_text(
+        yaml.safe_dump(
+            [
+                {
+                    'dataset': {
+                        'id': 'TIGER-Lab/MMLU-Pro',
+                        'task_id': 'mmlu_pro',
+                    },
+                    'value': 64.1,  # the value other tests expect for the 0.641 fixture score
+                }
+            ],
+            sort_keys=False,
+        ),
+        encoding='utf-8',
+    )
+    api = FakeHfApi(
+        repo_files_by_revision={
+            ('google/gemma-2b-it', 'main'): ['.eval_results/not_the_name.yaml']  # deliberately not 'mmlu_pro.yaml'
+        }
+    )
+
+    review = community_evals_converter.review_collection_for_hf_evals(
+        collection_name='MMLU-Pro',
+        datastore='evaleval/EEE_datastore@abc123',
+        manifest_output_path=tmp_path / 'manifest.json',
+        yaml_output_dir=tmp_path / 'yamls',
+        review_output_path=tmp_path / 'review.json',
+        api=api,
+        download_file=_fake_download_with_model_files(
+            datastore,
+            {
+                (
+                    'google/gemma-2b-it',
+                    'main',
+                    '.eval_results/not_the_name.yaml',
+                ): model_yaml
+            },
+        ),
+    )
+
+    assert review['can_open_prs'] is False
+    assert review['yaml_count'] == 0  # nothing new to write
+    assert review['manifest']['entries'][0]['status'] == 'already_present'
+    assert review['duplicate_audit']['findings'][0]['status'] == 'already_present'
+
+
+def test_review_collection_reports_progress_phases(tmp_path: Path) -> None:
+    """Each stage of a collection review is announced through `progress`."""
+    datastore, _collection_jsonl = _write_collection_rows(
+        tmp_path, [_aggregate()], collection_name='MMLU-Pro'
+    )
+    recorder = RecordingProgress()
+
+    review = community_evals_converter.review_collection_for_hf_evals(
+        collection_name='MMLU-Pro',
+        datastore='evaleval/EEE_datastore@abc123',
+        manifest_output_path=tmp_path / 'manifest.json',
+        yaml_output_dir=tmp_path / 'yamls',
+        review_output_path=tmp_path / 'review.json',
+        api=FakeHfApi(),
+        download_file=_fake_download(datastore),
+        progress=recorder,
+    )
+
+    # Substring checks against every description the recorder collected.
+    log = '\n'.join(recorder.descriptions)
+    assert review['can_open_prs'] is True
+    assert 'Downloading collection index MMLU-Pro.jsonl' in log
+    assert 'Processing 1 aggregate rows' in log
+    assert 'row 1/1: downloading flat/objects/' in log
+    assert 'row 1/1: checking google/gemma-2b-it' in log
+    assert 'Auditing 1 ready candidates' in log
+
+
+def test_rich_review_progress_uses_one_visible_task() -> None:
+    """Three consecutive phases must share a single Rich task row."""
+    progress = Progress(console=Console(file=io.StringIO(), force_terminal=True))
+    review_progress = community_evals_converter.RichReviewProgress(progress)
+    phases = (
+        ('Resolving datastore revision', 4, 'Built manifest'),
+        ('Processing 2 aggregate rows', 2, 'Processed 2 rows'),
+        ('Auditing 1 ready candidates', 1, 'Audit complete'),
+    )
+    with progress:
+        for opening, size, closing in phases:
+            phase_id = review_progress.add_task(opening, total=size)
+            review_progress.update(phase_id, advance=size, description=closing)
+
+    assert len(progress.tasks) == 1
+    assert progress.tasks[0].total == 1
+    assert progress.tasks[0].completed == 1
+
+
+def test_review_collection_progress_advances_api_only_rows(
+    tmp_path: Path,
+) -> None:
+    """The row task advances for every row, including an API-only provider row."""
+    api_only_record = _aggregate(model_id='anthropic/claude-3-opus')
+    for field in ('developer', 'inference_platform'):
+        api_only_record['model_info'][field] = 'anthropic'
+    datastore, _collection_jsonl = _write_collection_rows(
+        tmp_path, [_aggregate(), api_only_record], collection_name='MMLU-Pro'
+    )
+    recorder = RecordingProgress()
+
+    review = community_evals_converter.review_collection_for_hf_evals(
+        collection_name='MMLU-Pro',
+        datastore='evaleval/EEE_datastore@abc123',
+        manifest_output_path=tmp_path / 'manifest.json',
+        yaml_output_dir=tmp_path / 'yamls',
+        review_output_path=tmp_path / 'review.json',
+        api=FakeHfApi(),
+        download_file=_fake_download(datastore),
+        progress=recorder,
+    )
+
+    # Locate the task that was registered for the per-row phase.
+    row_task = next(
+        tid
+        for tid, opening in recorder.task_initial_descriptions.items()
+        if opening == 'Processing 2 aggregate rows'
+    )
+    log = '\n'.join(recorder.descriptions)
+    assert review['can_open_prs'] is True
+    assert recorder.advance_by_task[row_task] == 2
+    assert 'Processed 2 aggregate rows' in log
+
+
+def test_review_collection_reuses_cached_review_without_downloads(
+    tmp_path: Path,
+) -> None:
+    """Serve a rerun with identical arguments from the saved review.
+
+    The second call gets a downloader that fails when used and an API fake
+    that would report the model as missing; neither may be consulted.
+    """
+    datastore, _collection_jsonl = _write_collection_rows(
+        tmp_path, [_aggregate()], collection_name='MMLU-Pro'
+    )
+    shared = {
+        'collection_name': 'MMLU-Pro',
+        'datastore': 'evaleval/EEE_datastore@abc123',
+        'manifest_output_path': tmp_path / 'manifest.json',
+        'yaml_output_dir': tmp_path / 'yamls',
+        'review_output_path': tmp_path / 'review.json',
+    }
+    first_review = community_evals_converter.review_collection_for_hf_evals(
+        **shared,
+        api=FakeHfApi(),
+        download_file=_fake_download(datastore),
+    )
+
+    # Any download attempted during the rerun fails the test.
+    def fail_download(**_kwargs) -> str:
+        raise AssertionError('cached review should not download anything')
+
+    api = FakeHfApi(missing_models={'google/gemma-2b-it'})
+    second_review = community_evals_converter.review_collection_for_hf_evals(
+        **shared,
+        api=api,
+        download_file=fail_download,
+    )
+
+    assert second_review['created_at'] == first_review['created_at']
+    assert second_review['yaml_count'] == 1
+    assert api.model_info_calls == []
+
+
+def test_review_collection_force_ignores_cached_review(
+    tmp_path: Path,
+) -> None:
+    """Rebuild from scratch when `force=True`, even if a saved review exists.
+
+    The first call leaves its outputs on disk. The forced rerun uses an API
+    fake that reports the model as missing, so a fresh result must differ
+    from the saved one: no YAML preview and no PR submission.
+    """
+    datastore, _collection_jsonl = _write_collection_rows(
+        tmp_path, [_aggregate()], collection_name='MMLU-Pro'
+    )
+    shared = {
+        'collection_name': 'MMLU-Pro',
+        'datastore': 'evaleval/EEE_datastore@abc123',
+        'manifest_output_path': tmp_path / 'manifest.json',
+        'yaml_output_dir': tmp_path / 'yamls',
+        'review_output_path': tmp_path / 'review.json',
+    }
+    community_evals_converter.review_collection_for_hf_evals(
+        **shared,
+        api=FakeHfApi(),
+        download_file=_fake_download(datastore),
+    )
+
+    api = FakeHfApi(missing_models={'google/gemma-2b-it'})
+    forced_review = community_evals_converter.review_collection_for_hf_evals(
+        **shared,
+        api=api,
+        download_file=_fake_download(datastore),
+        force=True,
+    )
+
+    assert forced_review['can_open_prs'] is False
+    assert forced_review['yaml_count'] == 0
+    assert api.model_info_calls == ['google/gemma-2b-it']
+
+
+def test_review_collection_resumes_cached_manifest_without_datastore_downloads(
+    tmp_path: Path,
+) -> None:
+    """A saved manifest lets the review skip datastore downloads and model checks."""
+    datastore, _collection_jsonl = _write_collection_rows(
+        tmp_path, [_aggregate()], collection_name='MMLU-Pro'
+    )
+    locator = 'evaleval/EEE_datastore@abc123'
+    manifest_path = tmp_path / 'manifest.json'
+    community_evals_converter.build_collection_manifest(
+        collection_name='MMLU-Pro',
+        datastore=locator,
+        output_path=manifest_path,
+        api=FakeHfApi(),
+        download_file=_fake_download(datastore),
+    )
+
+    def fail_download(**_kwargs) -> str:
+        raise AssertionError('cached manifest should skip datastore downloads')
+
+    hf_api = FakeHfApi(missing_models={'google/gemma-2b-it'})
+    review = community_evals_converter.review_collection_for_hf_evals(
+        collection_name='MMLU-Pro',
+        datastore=locator,
+        manifest_output_path=manifest_path,
+        yaml_output_dir=tmp_path / 'yamls',
+        review_output_path=tmp_path / 'review.json',
+        api=hf_api,
+        download_file=fail_download,
+    )
+
+    assert review['can_open_prs'] is True
+    assert review['yaml_count'] == 1
+    assert hf_api.model_info_calls == []
+
+
+def test_review_details_use_clear_headers_and_aggregate_existing_scores() -> None:
+    """Render the review details and check their headers, rollup and links.
+
+    The fixture holds one audit error, one score conflict, two bare
+    `already_present` findings, one missing model and one skipped record.
+    The output must show `Issue`/`Where`/`Details` but none of
+    `Candidate`/`Context`/`Score`, a `2 models` count, and full URLs.
+    """
+    blob_base = 'https://huggingface.co/datasets/evaleval/EEE_datastore/blob/abc123/'
+    record_path = 'flat/objects/aa/bb/record.json'
+    skipped_path = 'flat/objects/cc/dd/skipped.json'
+    pr_url = 'https://huggingface.co/example/repo/discussions/1'
+    # Audit failure tied to manifest entry 0.
+    audit_error = {
+        'entry_index': 0,
+        'model_repo': 'google/gemma-blocked',
+        'stage': 'read_open_pr_eval_results',
+        'path': '.eval_results/mmlu_pro.yaml',
+        'error': 'Unable to download eval results YAML',
+    }
+    # Finding whose existing and candidate values differ, with a PR link.
+    conflict = {
+        'status': 'score_conflict',
+        'model_repo': 'nexusflow/athene-v2-chat',
+        'existing_value': 73.11,
+        'candidate_value': 70.21,
+        'pr_url': pr_url,
+        'paths': ['.eval_results/mmlu_pro.yaml'],
+    }
+    # Model repo for which the HF existence check failed.
+    missing_model = {
+        'model_repo': 'missing/model',
+        'hf_check_error': 'HF model repo does not exist: missing/model',
+        'eee_record_path': record_path,
+        'yaml_entry': {
+            'source': {
+                'url': blob_base + record_path,
+            },
+        },
+    }
+    # Skipped record together with its reason.
+    skipped = {
+        'model_id': 'api/model',
+        'reason': 'api_only_or_closed_provider:gemini',
+        'eee_record_path': skipped_path,
+    }
+    review = {
+        'duplicate_audit': {
+            'errors': [audit_error],
+            'findings': [
+                conflict,
+                {'status': 'already_present'},
+                {'status': 'already_present'},
+            ],
+        },
+        'missing_hf_models': [missing_model],
+        'manifest': {
+            'datastore_repo': 'evaleval/EEE_datastore',
+            'datastore_revision': 'abc123',
+            'skipped': [skipped],
+        },
+    }
+    console = Console(record=True, width=200)
+
+    community_evals_converter._render_review_details(console, review)
+    rendered = console.export_text()
+
+    for header in ('Needs Attention', 'Issue', 'Where', 'Details'):
+        assert header in rendered
+    for retired in ('Candidate', 'Context', 'Score'):
+        assert retired not in rendered
+    assert '2 models' in rendered
+    assert pr_url in rendered
+    assert blob_base + record_path in rendered
+    assert blob_base + skipped_path in rendered
+
+
+def test_review_collection_submits_clean_records_despite_open_pr_conflict(
+    tmp_path: Path,
+) -> None:
+    """Keep submitting clean candidates when another one conflicts with a PR.
+
+    `google/gemma-2b-it` has an open PR (`refs/pr/7`) whose eval-results YAML
+    already carries an MMLU-Pro value (12.3); the review is expected to mark
+    that entry `score_conflict`. `google/gemma-clean` has no competing
+    result and must stay `ready`, leaving one YAML preview and PR submission
+    enabled.
+    """
+    conflict_repo = 'google/gemma-2b-it'
+    pr_ref = 'refs/pr/7'
+    pr_path = '.eval_results/random.yaml'
+    # One record per repo: the first conflicts, the second is clean.
+    records = [
+        _aggregate(model_id=conflict_repo),
+        _aggregate(model_id='google/gemma-clean'),
+    ]
+    datastore, _collection_jsonl = _write_collection_rows(
+        tmp_path, records, collection_name='MMLU-Pro'
+    )
+    # Result that the open PR already proposes for the conflicting repo.
+    pr_entry = {
+        'dataset': {
+            'id': 'TIGER-Lab/MMLU-Pro',
+            'task_id': 'mmlu_pro',
+        },
+        'value': 12.3,
+    }
+    pr_yaml = tmp_path / 'model_pr.yaml'
+    pr_yaml.write_text(
+        yaml.safe_dump([pr_entry], sort_keys=False),
+        encoding='utf-8',
+    )
+    # The PR ref lists one eval-results file and is exposed as a discussion.
+    api = FakeHfApi(
+        repo_files_by_revision={(conflict_repo, pr_ref): [pr_path]},
+        discussions={
+            conflict_repo: [FakeDiscussion(git_reference=pr_ref, num=7)],
+        },
+    )
+    # Keyed by (repo, revision, path), pointing at the local stand-in file.
+    model_files = {(conflict_repo, pr_ref, pr_path): pr_yaml}
+
+    review = community_evals_converter.review_collection_for_hf_evals(
+        collection_name='MMLU-Pro',
+        datastore='evaleval/EEE_datastore@abc123',
+        manifest_output_path=tmp_path / 'manifest.json',
+        yaml_output_dir=tmp_path / 'yamls',
+        review_output_path=tmp_path / 'review.json',
+        api=api,
+        download_file=_fake_download_with_model_files(
+            datastore,
+            model_files,
+        ),
+    )
+
+    assert review['can_open_prs'] is True
+    assert review['yaml_count'] == 1
+    statuses = {
+        entry['model_repo']: entry['status']
+        for entry in review['manifest']['entries']
+    }
+    assert statuses == {
+        conflict_repo: 'score_conflict',
+        'google/gemma-clean': 'ready',
+    }
+    assert review['duplicate_audit']['findings'][0]['status'] == 'score_conflict'
+
+
+def test_review_collection_blocks_only_candidate_with_audit_error(
+    tmp_path: Path,
+) -> None:
+    """Exclude only the candidate whose audit failed and still submit the other.
+
+    The blocked repo's open PR lists an eval-results YAML, but the fake
+    downloader is given no model files, so that audit is expected to fail.
+    """
+    blocked_repo = 'google/gemma-blocked'
+    pr_ref = 'refs/pr/7'
+    records = [
+        _aggregate(model_id=blocked_repo),
+        _aggregate(model_id='google/gemma-clean'),
+    ]
+    datastore, _collection_jsonl = _write_collection_rows(
+        tmp_path, records, collection_name='MMLU-Pro'
+    )
+    api = FakeHfApi(
+        repo_files_by_revision={
+            (blocked_repo, pr_ref): ['.eval_results/mmlu_pro.yaml'],
+        },
+        discussions={
+            blocked_repo: [FakeDiscussion(git_reference=pr_ref, num=7)],
+        },
+    )
+    manifest_path = tmp_path / 'manifest.json'
+    yaml_dir = tmp_path / 'yamls'
+
+    review = community_evals_converter.review_collection_for_hf_evals(
+        collection_name='MMLU-Pro',
+        datastore='evaleval/EEE_datastore@abc123',
+        manifest_output_path=manifest_path,
+        yaml_output_dir=yaml_dir,
+        review_output_path=tmp_path / 'review.json',
+        api=api,
+        download_file=_fake_download_with_model_files(datastore, {}),
+    )
+
+    statuses = {
+        entry['model_repo']: entry['status']
+        for entry in review['manifest']['entries']
+    }
+    assert review['can_open_prs'] is True
+    assert statuses == {
+        blocked_repo: 'audit_error',
+        'google/gemma-clean': 'ready',
+    }
+    audit = review['duplicate_audit']
+    assert audit['error_count'] == 1
+    assert audit['errors'][0]['entry_index'] == 0
+    assert review['audit_blocked_entries'][0]['model_repo'] == blocked_repo
+    assert review['yaml_count'] == 2
+    blocked_preview = yaml_dir.joinpath(
+        'google', 'gemma-blocked', '.eval_results', 'mmlu_pro.yaml'
+    )
+    assert blocked_preview.exists()
+
+    submit_api = FakeHfApi()
+    result = community_evals_converter.create_prs_from_manifest(
+        manifest_path=manifest_path,
+        limit=None,
+        yes_i_reviewed=True,
+        commit_message='Add EvalEval result',
+        api=submit_api,
+    )
+
+    assert result['count'] == 1
+    assert submit_api.commits[0]['repo_id'] == 'google/gemma-clean'
+
+
+def test_create_prs_from_manifest_creates_fresh_pr_only(tmp_path: Path) -> None:
+    """Open a new PR against `main` even though the repo already has a PR ref.
+
+    The submit step must not list discussions and must create exactly one
+    commit that only adds a file.
+    """
+    datastore, index_jsonl = _write_index_row(tmp_path, _aggregate())
+    manifest_path = tmp_path / 'manifest.json'
+    community_evals_converter.build_index_manifest(
+        index_jsonl=index_jsonl,
+        datastore='evaleval/EEE_datastore@abc123',
+        benchmarks=['mmlu_pro'],
+        output_path=manifest_path,
+        api=FakeHfApi(),
+        download_file=_fake_download(datastore),
+    )
+    # A PR already exists on the repo; submission must not inspect it.
+    existing_pr = FakeDiscussion(git_reference='refs/pr/123')
+    api = FakeHfApi(discussions={'google/gemma-2b-it': [existing_pr]})
+
+    result = community_evals_converter.create_prs_from_manifest(
+        manifest_path=manifest_path,
+        limit=None,
+        yes_i_reviewed=True,
+        commit_message='Add EvalEval result',
+        api=api,
+    )
+
+    assert result['count'] == 1
+    assert api.discussion_calls == []
+    assert len(api.commits) == 1
+    commit = api.commits[0]
+    assert commit['repo_id'] == 'google/gemma-2b-it'
+    assert commit['revision'] == 'main'
+    assert commit['create_pr'] is True
+    operation_names = [op.__class__.__name__ for op in commit['operations']]
+    assert operation_names == ['CommitOperationAdd']
+
+
+def test_tui_approval_requires_exact_phrase(monkeypatch) -> None:
+    """Answering `yes` instead of the approval phrase declines submission."""
+    ready_entry = {
+        'status': 'ready',
+        'model_repo': 'google/gemma-2b-it',
+        'target_path': '.eval_results/mmlu_pro.yaml',
+    }
+    review = {'manifest': {'entries': [ready_entry]}}
+    monkeypatch.setattr(
+        community_evals_converter.Prompt, 'ask', lambda *_args, **_kwargs: 'yes'
+    )
+
+    approved = community_evals_converter._approve_pr_submission(
+        Console(record=True), review
+    )
+
+    assert approved is False
+
+
+def test_tui_approval_accepts_open_prs(monkeypatch) -> None:
+    """Approve submission when the prompt returns `APPROVAL_PHRASE`.
+
+    `Prompt.ask` is patched so no terminal input is needed.
+    """
+    ready_entry = {
+        'status': 'ready',
+        'model_repo': 'google/gemma-2b-it',
+        'target_path': '.eval_results/mmlu_pro.yaml',
+    }
+    review = {'manifest': {'entries': [ready_entry]}}
+    phrase = community_evals_converter.APPROVAL_PHRASE
+    monkeypatch.setattr(
+        community_evals_converter.Prompt, 'ask', lambda *_args, **_kwargs: phrase
+    )
+
+    approved = community_evals_converter._approve_pr_submission(
+        Console(record=True), review
+    )
+
+    assert approved is True
+
+
+def test_prompt_commit_message_requires_non_empty(monkeypatch) -> None:
+    """A whitespace-only answer is rejected: the prompt helper returns `None`."""
+    console, prompt = Console(record=True), community_evals_converter.Prompt
+    monkeypatch.setattr(prompt, 'ask', lambda *_args, **_kwargs: ' ')
+
+    assert community_evals_converter._prompt_commit_message(console) is None
+
+
+def test_prompt_commit_message_returns_typed_message(monkeypatch) -> None:
+    """Return the text entered at the commit-message prompt.
+
+    `Prompt.ask` is patched to supply the message.
+    """
+    typed = 'Add verified EvalEval result'
+    monkeypatch.setattr(
+        community_evals_converter.Prompt, 'ask', lambda *_args, **_kwargs: typed
+    )
+
+    message = community_evals_converter._prompt_commit_message(Console(record=True))
+
+    assert message == typed
+
+
+def test_parser_rejects_removed_open_prs_flag() -> None:
+    """`--open-prs` is no longer a CLI flag; parsing it must exit."""
+    argv = [
+        'aggregate.jsonl',
+        '--datastore',
+        'evaleval/EEE_datastore@abc123',
+        '--open-prs',
+    ]
+    parser = community_evals_converter.build_parser()
+
+    with pytest.raises(SystemExit):
+        parser.parse_args(argv)
+
+
+def test_parser_rejects_old_index_workflow_flags() -> None:
+    """`--benchmarks` and `--manifest-output` are no longer accepted."""
+    parser = community_evals_converter.build_parser()
+    removed = (['--benchmarks', 'mmlu_pro'], ['--manifest-output', 'manifest.json'])
+
+    for flag_args in removed:
+        with pytest.raises(SystemExit):
+            parser.parse_args(['MMLU-Pro', *flag_args])
+
+
+def test_parser_defaults_to_datastore_repo() -> None:
+    """With only a collection name, the parser fills in the default datastore."""
+    parser = community_evals_converter.build_parser()
+    namespace = parser.parse_args(['MMLU-Pro'])
+
+    assert namespace.collection_name == 'MMLU-Pro'
+    assert namespace.datastore == 'evaleval/EEE_datastore'
+    assert namespace.force is False
+
+
+def test_parser_accepts_force() -> None:
+    """`--force` is accepted and sets `force` to `True`."""
+    parser = community_evals_converter.build_parser()
+    namespace = parser.parse_args(['MMLU-Pro', '--force'])
+
+    assert namespace.force is True
+
+
+def test_every_eval_ever_cli_no_longer_exposes_hf_evals() -> None:
+    """The `cli` parser must reject the `hf-evals` subcommand."""
+    top_level_parser = cli.build_parser()
+    with pytest.raises(SystemExit):
+        top_level_parser.parse_args(['hf-evals'])
diff --git a/tools/hf-community-evals/README.md b/tools/hf-community-evals/README.md
new file mode 100644
index 000000000..5730a0dad
--- /dev/null
+++ b/tools/hf-community-evals/README.md
@@ -0,0 +1,126 @@
+# EEE -> HF Community Evals
+
+Built by Harsha Nelaturu, June 2026.
+
+Use `tools/hf-community-evals/community_evals_converter.py` to review one EEE
+datastore collection, generate local HF Community Evals YAML previews, audit
+existing scores/open PRs, and optionally open PRs after explicit approval.
+
+## Quick Start
+
+Use `uv run` for all commands.
+
+```bash
+uv run tools/hf-community-evals/community_evals_converter.py MMLU-Pro \
+  --datastore evaleval/EEE_datastore@main
+```
+
+Results are cached per collection. To ignore the cache and force a fresh rebuild:
+
+```bash
+uv run tools/hf-community-evals/community_evals_converter.py MMLU-Pro \
+  --datastore evaleval/EEE_datastore@main \
+  --force
+```
+
+The positional argument is a collection stem. It must resolve exactly to:
+
+```text
+https://huggingface.co/datasets/evaleval/EEE_datastore/blob/<revision>/flat/indexes/by_collection/<collection>.jsonl
+```
+
+## Outputs
+
+For `MMLU-Pro`, outputs are written under:
+
+```text
+outputs/community_evals_converter_MMLU-Pro/
+```
+
+Important output files:
+
+- `manifest.json`: converted candidate records plus skipped/error metadata.
+- `review.json`: full review result, duplicate audit findings, audit errors,
+  and PR readiness.
+- `yamls/<owner>/<model>/.eval_results/<benchmark>.yaml`: local YAML previews.
+
+`outputs/` is ignored by git. Use these files for inspection, not as merge
+inputs.
+
+## Review Behavior
+
+The tool:
+
+- downloads the collection JSONL and referenced aggregate objects from the HF
+  datastore;
+- validates object hashes and optional sizes;
+- scans each aggregate record for supported HF benchmark datasets;
+- writes YAML entries using the datastore object HF URL as `source.url`;
+- keeps flat datastore provenance, including instance-level references when
+  present;
+- checks model repo existence on Hugging Face;
+- audits every existing `.eval_results/*.yaml` file on model `main`;
+- audits changed `.eval_results/*.yaml` files in open PR refs;
+- compares by dataset/task content, not YAML filename.
+
+Supported benchmarks in this workflow are:
+
+- `mmlu_pro`
+- `gpqa`
+- `hle`
+- `gsm8k`
+
+## Resume And Force
+
+Default reruns reuse exact-match local outputs:
+
+- matching completed `review.json`: skips collection downloads, model checks,
+  and duplicate audit;
+- matching pre-audit `manifest.json`: skips collection downloads and model
+  checks, then resumes at duplicate audit.
+
+The cache must match collection name, datastore input, and HF-check mode.
+Invalid exact-match cache files are hard errors. Use `--force` when you want to
+ignore the cache and rebuild from the datastore.
+
+## TUI
+The final report has:
+
+- `Community Evals Converter`: summary counts.
+- `Needs Attention`: capped triage table for blockers and exclusions.
+
+`Needs Attention` uses:
+
+- `Issue`: `audit_error`, `score_conflict`, `already_present`,
+  `missing_hf_model`, or `skipped`.
+- `Model`: model repo or aggregate model id.
+- `Details`: reason or score comparison.
+- `Action`: `exclude`, `block entry`, `block all`, or source line.
+- `Where`: terminal hyperlink to the HF model PR/file or HF datastore blob URL.
+
+Repeated same-score `already_present` findings are summarized as one count row.
+Full details remain in `review.json`.
+
+## PR Submission
+
+The tool only opens PRs after both prompts succeed:
+
+1. Type exactly:
+
+   ```text
+   OPEN PRS
+   ```
+
+2. Enter a non-empty commit message.
+
+Only `status = ready` entries are submitted.
+
+Excluded statuses:
+
+- `already_present`: same score already exists.
+- `score_conflict`: different score already exists.
+- `missing_hf_model`: model repo does not resolve on HF.
+- `audit_error`: candidate-scoped audit failure.
+
+Candidate-scoped audit errors block only that candidate. Audit errors without a
+manifest entry block all PR submission.
\ No newline at end of file
diff --git a/tools/hf-community-evals/community_evals_converter.py b/tools/hf-community-evals/community_evals_converter.py
new file mode 100644
index 000000000..c37562069
--- /dev/null
+++ b/tools/hf-community-evals/community_evals_converter.py
@@ -0,0 +1,2945 @@
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import sys
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from difflib import get_close_matches
+from math import isfinite
+from pathlib import Path
+from typing import Any, Callable
+from urllib.parse import quote
+
+import requests
+import yaml
+from huggingface_hub import (
+    CommitOperationAdd,
+    HfApi,
+    hf_hub_download,
+)
+from huggingface_hub.errors import EntryNotFoundError
+from rich.console import Console
+from rich.panel import Panel
+from rich.progress import (
+    BarColumn,
+    Progress,
+    SpinnerColumn,
+    TaskID,
+    TextColumn,
+    TimeElapsedColumn,
+)
+from rich.prompt import Prompt
+from rich.table import Column, Table
+from rich.text import Text
+
+from every_eval_ever.eval_types import EvaluationLog, EvaluationResult
+
+SOURCE_NAME = 'EvalEval'
+MANIFEST_VERSION = 1
+DEFAULT_DATASTORE_REVISION = 'main'  # looked up when a locator carries no @revision
+DEFAULT_DATASTORE_REPO = 'evaleval/EEE_datastore'
+DEFAULT_BENCHMARKS = ('gpqa', 'hle', 'mmlu_pro', 'gsm8k')  # used when none are given
+DEFAULT_PR_COMMIT_DESCRIPTION = (
+    'Adds EvalEval Community Evals YAML entries with source backlinks to EEE '
+    'aggregate records.\n\n'
+    'Contributor: evaleval'
+)
+AUDIT_ERROR_STATUS = 'audit_error'  # presumably a manifest entry status — confirm
+
+
+class HFEvalsError(RuntimeError):
+    """Raised for invalid input or failed HF lookups that make the export unsafe."""
+
+
+class ReviewProgress:  # no-op progress reporter; RichReviewProgress subclasses it
+    def add_task(self, description: str, total: int | None = None) -> int:
+        return 0  # fixed id: this implementation tracks nothing
+
+    def update(
+        self,
+        task_id: int,
+        *,
+        advance: int = 0,
+        description: str | None = None,
+        total: int | None = None,
+    ) -> None:
+        """Accept a progress update and discard it."""
+
+
+class RichReviewProgress(ReviewProgress):
+    """Show every review phase on one shared Rich progress row."""
+
+    def __init__(self, progress: Progress) -> None:
+        self.progress = progress
+        self.rich_task_id: TaskID | None = None
+        self.next_task_id = 0
+        self.total_by_task: dict[int, int] = {}
+        self.completed_by_task: dict[int, int] = {}
+        self.active_task_id: int | None = None
+
+    def add_task(self, description: str, total: int | None = None) -> int:
+        """Start a phase, resetting the shared row, and return its logical id."""
+        self.next_task_id += 1
+        phase_id = self.active_task_id = self.next_task_id
+        phase_total = total if total else 0
+        self.total_by_task[phase_id] = phase_total
+        self.completed_by_task[phase_id] = 0
+        if self.rich_task_id is not None:
+            self.progress.update(
+                self.rich_task_id,
+                description=description,
+                completed=0,
+                total=phase_total,
+            )
+            return phase_id
+        # First phase: create the single Rich task that later phases reuse.
+        self.rich_task_id = self.progress.add_task(description, total=phase_total)
+        return phase_id
+
+    def update(
+        self,
+        task_id: int,
+        *,
+        advance: int = 0,
+        description: str | None = None,
+        total: int | None = None,
+    ) -> None:
+        """Advance phase `task_id` and mirror its counters onto the shared row."""
+        if self.rich_task_id is None:
+            # update() before any add_task(): create the row on demand.
+            self.rich_task_id = self.progress.add_task(
+                description or 'Reviewing', total=0
+            )
+        if total is not None:
+            self.total_by_task[task_id] = total
+        self.active_task_id = task_id
+        done = self.completed_by_task.get(task_id, 0) + advance
+        self.completed_by_task[task_id] = done
+        changes: dict[str, Any] = {
+            'completed': done,
+            'total': self.total_by_task.get(task_id, 0),
+        }
+        if description is not None:
+            changes['description'] = description
+        self.progress.update(self.rich_task_id, **changes)
+
+
+@dataclass(frozen=True)
+class BenchmarkConfig:  # immutable per-benchmark settings; see BENCHMARK_CONFIGS
+    dataset_id: str  # e.g. 'TIGER-Lab/MMLU-Pro'
+    task_id: str  # e.g. 'mmlu_pro'
+    yaml_name: str  # e.g. 'gpqa_diamond'; presumably a YAML file stem — confirm
+    dataset_aliases: tuple[str, ...] = ()  # e.g. ('reasoningMIA/gpqa_diamond',)
+    preferred_metric_ids: tuple[str, ...] = ()  # e.g. ('hle.accuracy', 'hle/accuracy')
+
+
+BENCHMARK_CONFIGS: dict[str, BenchmarkConfig] = {  # keyed by normalised benchmark name
+    'gpqa': BenchmarkConfig(
+        'Idavidrein/gpqa',
+        'diamond',
+        'gpqa_diamond',
+        ('reasoningMIA/gpqa_diamond',),
+    ),
+    'hle': BenchmarkConfig(
+        'cais/hle',
+        'default',
+        'hle',
+        preferred_metric_ids=('hle.accuracy', 'hle/accuracy'),
+    ),
+    'mmlu_pro': BenchmarkConfig(
+        'TIGER-Lab/MMLU-Pro',
+        'mmlu_pro',
+        'mmlu_pro',
+        preferred_metric_ids=(
+            'mmlu_pro/overall',
+            'mmlu-pro::chain-of-thought-correctness',
+        ),
+    ),
+    'gsm8k': BenchmarkConfig('openai/gsm8k', 'gsm8k', 'gsm8k', ('gsm8k',)),
+}
+BENCHMARK_ALIASES = {  # alternate spelling -> key of BENCHMARK_CONFIGS
+    'gpqa_diamond': 'gpqa',
+}
+HF_TIMEOUT_SECONDS = 10  # timeout passed to requests.get in _http_model_info
+OPEN_WEIGHT_MODEL_PREFIXES = ('openai/gpt-oss',)  # presumably exempt from list below
+API_ONLY_PROVIDER_PREFIXES = (  # presumably providers without HF weights — confirm
+    'anthropic',
+    'gemini',
+    'grok',
+    'mistral',
+    'openai',
+    'xai',
+)
+GPQA_SUBSET_NOTES = {  # subset spelling -> human-readable label
+    'diamond': 'GPQA Diamond',
+    'gpqa_diamond': 'GPQA Diamond',
+    'main': 'GPQA chain-of-thought',
+    'chain_of_thought': 'GPQA chain-of-thought',
+    'cot': 'GPQA chain-of-thought',
+}
+EVAL_RESULT_PATH_FAMILIES = {  # benchmark key -> `.eval_results/` path spellings
+    'gpqa': (
+        '.eval_results/gpqa_diamond.yaml',
+        '.eval_results/gpqa-diamond.yaml',
+        '.eval_results/gpqa.yaml',
+    ),
+    'gsm8k': ('.eval_results/gsm8k.yaml',),
+    'hle': ('.eval_results/hle.yaml',),
+    'mmlu_pro': (
+        '.eval_results/mmlu_pro.yaml',
+        '.eval_results/mmlu-pro.yaml',
+    ),
+}
+
+
+def normalize_benchmark(value: str) -> str:  # trim, lower-case, map '-' to '_'
+    return value.strip().lower().translate(str.maketrans('-', '_'))
+
+
+def parse_benchmarks(raw: str | None) -> list[str]:
+    """Parse comma-separated names into unique canonical keys, first-seen order.
+
+    `None` yields `DEFAULT_BENCHMARKS`; unknown or empty input raises `HFEvalsError`.
+    """
+    if raw is None:
+        return list(DEFAULT_BENCHMARKS)
+    names = [name for item in raw.split(',') if (name := normalize_benchmark(item))]
+    benchmarks = list(dict.fromkeys(BENCHMARK_ALIASES.get(n, n) for n in names))
+    if unknown := sorted(set(benchmarks) - set(BENCHMARK_CONFIGS)):
+        raise HFEvalsError(f'Unsupported benchmark(s): {", ".join(unknown)}')
+    if not benchmarks:
+        raise HFEvalsError('At least one benchmark is required.')
+    return benchmarks
+
+
+def parse_datastore_locator(value: str) -> tuple[str, str | None]:
+    """Split `<hf_dataset_repo>[@<revision>]` into `(repo_id, revision)`.
+
+    `revision` is `None` when no `@` is present. Raises `HFEvalsError` for
+    blank input, more than one `@`, a repo without `/`, or an empty revision.
+    """
+    usage = 'Datastore must be <hf_dataset_repo>[@<revision>].'
+    raw = value.strip()
+    if not raw or raw.count('@') > 1:
+        raise HFEvalsError(usage)
+    head, at_sign, tail = raw.partition('@')
+    repo_id = head.strip().strip('/')
+    revision = tail.strip().strip('/') if at_sign else None
+    if '/' not in repo_id:
+        raise HFEvalsError('Datastore repo must look like org/dataset.')
+    if revision == '':
+        raise HFEvalsError('Datastore revision must not be empty.')
+    return repo_id, revision
+
+
+def resolve_datastore_locator(value: str, *, api: HfApi) -> tuple[str, str]:
+    """Return ``(repo_id, revision)`` with the revision always filled in.
+
+    A revision given in the locator is returned unchanged. Otherwise
+    ``api.repo_info`` is queried at ``DEFAULT_DATASTORE_REVISION`` and the
+    stripped ``sha`` it reports is used.
+
+    Raises:
+        HFEvalsError: if the locator is malformed, the lookup fails, or no
+            non-blank string sha is returned.
+    """
+    repo_id, pinned_revision = parse_datastore_locator(value)
+    if pinned_revision is not None:
+        return repo_id, pinned_revision
+
+    try:
+        repo_details = api.repo_info(
+            repo_id=repo_id,
+            repo_type='dataset',
+            revision=DEFAULT_DATASTORE_REVISION,
+        )
+    except Exception as exc:  # noqa: BLE001
+        raise HFEvalsError(
+            f'Unable to resolve latest datastore commit for {repo_id}'
+        ) from exc
+
+    # Accept both attribute-style and dict-style responses.
+    commit_sha = getattr(repo_details, 'sha', None)
+    if commit_sha is None and isinstance(repo_details, dict):
+        commit_sha = repo_details.get('sha')
+    if isinstance(commit_sha, str) and commit_sha.strip():
+        return repo_id, commit_sha.strip()
+    raise HFEvalsError(
+        f'HF dataset repo did not return a commit sha for {repo_id}'
+    )
+
+
+def dump_yaml_entries(entries: list[dict[str, Any]]) -> str:
+    """Serialize ``entries`` with ``yaml.safe_dump``.
+
+    Keys are not sorted, ``allow_unicode`` is off and the width is 88.
+    """
+    dump_options = {'sort_keys': False, 'allow_unicode': False, 'width': 88}
+    return yaml.safe_dump(entries, **dump_options)
+
+
+def _is_real_hf_api(api: HfApi) -> bool:
+    """Return True only when ``api.__class__`` is exactly ``HfApi``.
+
+    The check is by identity rather than ``isinstance``, so instances of
+    subclasses report False.
+    """
+    return api.__class__ is HfApi
+
+
+def _hf_model_api_url(repo_id: str) -> str:
+    """Return the huggingface.co model API URL for ``repo_id``.
+
+    The repo id is percent-encoded with '/' left intact.
+    """
+    encoded_repo = quote(repo_id, safe='/')
+    return f'https://huggingface.co/api/models/{encoded_repo}'
+
+
+def _http_model_info(repo_id: str) -> dict[str, Any]:
+    """Fetch the model API document for ``repo_id`` over HTTP.
+
+    Returns:
+        The decoded JSON object.
+
+    Raises:
+        HFEvalsError: if the request fails, the response is 404 or another
+            HTTP error, the body is not valid JSON, or the JSON is not an
+            object.
+    """
+    try:
+        response = requests.get(
+            _hf_model_api_url(repo_id), timeout=HF_TIMEOUT_SECONDS
+        )
+    except requests.RequestException as exc:
+        raise HFEvalsError(f'Unable to check HF model repo: {repo_id}') from exc
+    if response.status_code == 404:
+        raise HFEvalsError(f'HF model repo does not exist: {repo_id}')
+    try:
+        response.raise_for_status()
+    except requests.HTTPError as exc:
+        raise HFEvalsError(f'Unable to check HF model repo: {repo_id}') from exc
+    try:
+        loaded = response.json()
+    except ValueError as exc:
+        # A non-JSON body (for example an HTML error page served with 200) must
+        # surface as HFEvalsError; callers only handle that exception type.
+        raise HFEvalsError(
+            f'HF model API returned invalid data: {repo_id}'
+        ) from exc
+    if not isinstance(loaded, dict):
+        raise HFEvalsError(f'HF model API returned invalid data: {repo_id}')
+    return loaded
+
+
+def _repo_exists(api: HfApi, repo_id: str) -> None:
+    """Raise ``HFEvalsError`` unless the model repo ``repo_id`` can be found.
+
+    An exact ``HfApi`` instance is checked via ``_http_model_info``; any other
+    ``api`` object is asked through its own ``model_info`` method.
+    """
+    if not _is_real_hf_api(api):
+        try:
+            api.model_info(repo_id)
+        except Exception as exc:  # noqa: BLE001 - preserve HF client details
+            raise HFEvalsError(
+                f'HF model repo does not exist: {repo_id}'
+            ) from exc
+        return
+    _http_model_info(repo_id)
+
+
+def _datastore_blob_url(
+    path: str,
+    *,
+    datastore_revision: str,
+    datastore_repo: str = DEFAULT_DATASTORE_REPO,
+) -> str:
+    """Build the huggingface.co ``/blob/`` URL of ``path`` at a revision.
+
+    Repo and path are percent-encoded with '/' kept; the revision is encoded
+    with no safe characters.
+
+    Raises:
+        HFEvalsError: if the repo or the revision is blank after stripping.
+    """
+    repo = datastore_repo.strip().strip('/')
+    if not repo:
+        raise HFEvalsError('Datastore repo must not be empty.')
+    revision = datastore_revision.strip()
+    if not revision:
+        raise HFEvalsError('Datastore revision must not be empty.')
+    url_parts = (
+        'https://huggingface.co/datasets',
+        quote(repo, safe='/'),
+        'blob',
+        quote(revision, safe=''),
+        quote(path, safe='/'),
+    )
+    return '/'.join(url_parts)
+
+
+def _date_from_result(log: EvaluationLog, result: EvaluationResult) -> str | None:
+    """Return the evaluation date as a ten-character string, or None.
+
+    The result-level timestamp takes precedence over the log-level one. A
+    value made only of digits with at most one '.' is read as epoch seconds and
+    converted to a UTC ISO date. Any other value of at least ten characters
+    contributes its first ten characters unchanged.
+
+    Raises:
+        HFEvalsError: if the value is shorter than ten characters, or looks
+            like an epoch but cannot be converted to a date.
+    """
+    value = result.evaluation_timestamp or log.evaluation_timestamp
+    if value is None:
+        return None
+    if value.replace('.', '', 1).isdigit():
+        try:
+            return datetime.fromtimestamp(float(value), tz=UTC).date().isoformat()
+        except (ValueError, OverflowError, OSError) as exc:
+            # Out-of-range epochs raise OverflowError/OSError as well as
+            # ValueError. Report them instead of crashing or slicing the digit
+            # string into something that merely looks like a date.
+            raise HFEvalsError(
+                f'Invalid evaluation timestamp: {value!r}'
+            ) from exc
+    if len(value) >= 10:
+        return value[:10]
+    raise HFEvalsError(f'Invalid evaluation timestamp: {value!r}')
+
+
+def _dataset_ids_for_config(config: BenchmarkConfig) -> set[str]:
+    """Return the config's dataset id and aliases, each trimmed and lowercased."""
+    identifiers = [config.dataset_id, *config.dataset_aliases]
+    return {identifier.strip().lower() for identifier in identifiers}
+
+
+def _result_matches_dataset(
+    result: EvaluationResult, config: BenchmarkConfig
+) -> bool:
+    """Tell whether ``result``'s source data refers to ``config``'s dataset.
+
+    For ``hf_dataset`` sources the ``hf_repo`` or the ``benchmark_hf_repo``
+    entry of ``additional_details`` must be one of the config's dataset ids.
+    Only when ``hf_repo`` is empty is the normalized dataset name compared
+    with the normalized task id. For ``url`` sources some URL must end in
+    ``/datasets/<dataset id>``. Every other source type never matches.
+    """
+    source = result.source_data
+
+    if source.source_type == 'hf_dataset':
+        known_ids = _dataset_ids_for_config(config)
+        hf_repo = (source.hf_repo or '').strip().lower()
+        if hf_repo in known_ids:
+            return True
+        details = source.additional_details or {}
+        if isinstance(details, dict):
+            declared_repo = str(details.get('benchmark_hf_repo') or '')
+            if declared_repo.strip().lower() in known_ids:
+                return True
+        if hf_repo:
+            # A different, explicit repo rules the result out.
+            return False
+        by_name = normalize_benchmark(source.dataset_name)
+        return by_name == normalize_benchmark(config.task_id)
+
+    if source.source_type == 'url':
+        cleaned_urls = [
+            item.strip().lower().rstrip('/')
+            for item in source.url
+            if isinstance(item, str)
+        ]
+        suffixes = tuple(
+            f'/datasets/{dataset_id}'
+            for dataset_id in _dataset_ids_for_config(config)
+        )
+        return any(item.endswith(suffixes) for item in cleaned_urls)
+
+    return False
+
+
+def _result_matches_preferred_metric(
+    result: EvaluationResult, config: BenchmarkConfig
+) -> bool:
+    """Tell whether ``result`` uses one of the config's preferred metric ids.
+
+    A config without preferred ids accepts every result. Otherwise the
+    result's ``evaluation_result_id`` or its ``metric_config.metric_id`` must
+    equal a preferred id after trimming and lowercasing.
+    """
+    preferred = config.preferred_metric_ids
+    if not preferred:
+        return True
+    allowed = {metric_id.strip().lower() for metric_id in preferred}
+    candidates = (result.evaluation_result_id, result.metric_config.metric_id)
+    return any(
+        candidate.strip().lower() in allowed
+        for candidate in candidates
+        if isinstance(candidate, str)
+    )
+
+
+def _result_for_dataset(
+    log: EvaluationLog, config: BenchmarkConfig
+) -> EvaluationResult | None:
+    """Return the single result in ``log`` that matches ``config``.
+
+    A result matches when both the dataset and the preferred-metric checks
+    pass. ``None`` is returned when nothing matches.
+
+    Raises:
+        HFEvalsError: if more than one result matches.
+    """
+    matches = []
+    for candidate in log.evaluation_results:
+        if not _result_matches_dataset(candidate, config):
+            continue
+        if _result_matches_preferred_metric(candidate, config):
+            matches.append(candidate)
+
+    if not matches:
+        return None
+    if len(matches) == 1:
+        return matches[0]
+
+    ids = [
+        match.evaluation_result_id or match.evaluation_name for match in matches
+    ]
+    raise HFEvalsError(
+        f'{log.evaluation_id} has {len(matches)} matching '
+        f'evaluation_results for {config.dataset_id}: {ids}'
+    )
+
+
+def _results_for_supported_datasets(
+    log: EvaluationLog,
+) -> list[tuple[str, BenchmarkConfig, EvaluationResult]]:
+    """Collect the matching result of ``log`` for every supported benchmark.
+
+    Benchmarks are visited in ``BENCHMARK_CONFIGS`` order; those without a
+    matching result are left out.
+
+    Raises:
+        HFEvalsError: if a benchmark has more than one matching result.
+    """
+    supported: list[tuple[str, BenchmarkConfig, EvaluationResult]] = []
+    for benchmark, config in BENCHMARK_CONFIGS.items():
+        # _result_for_dataset applies the same filters and raises the same
+        # error for ambiguous matches.
+        match = _result_for_dataset(log, config)
+        if match is not None:
+            supported.append((benchmark, config, match))
+    return supported
+
+
+def _community_eval_entry(
+    *,
+    config: BenchmarkConfig,
+    task_id: str,
+    value: float | int,
+    date: str | None,
+    source_url: str,
+    notes: str | None = None,
+) -> dict[str, Any]:
+    """Assemble one YAML entry dict.
+
+    ``dataset``, ``value`` and ``source`` are always present; ``date`` and
+    ``notes`` are appended, in that order, only when they are not None.
+    """
+    entry: dict[str, Any] = {
+        'dataset': {'id': config.dataset_id, 'task_id': task_id},
+        'value': value,
+        'source': {'url': source_url, 'name': SOURCE_NAME},
+    }
+    optional_fields = (('date', date), ('notes', notes))
+    entry.update(
+        {key: item for key, item in optional_fields if item is not None}
+    )
+    return entry
+
+
+def _gpqa_variant_notes(result: EvaluationResult) -> str | None:
+    """Infer the GPQA variant note from the result's own metadata.
+
+    Returns 'GPQA chain-of-thought' for the openeval ``gpqa`` dataset scored by
+    the chain-of-thought correctness metric, 'GPQA Diamond' when the dataset
+    name or the result id points at ``gpqa_diamond``, and None otherwise.
+    """
+    source = result.source_data
+    hf_repo = (source.hf_repo or '').strip().lower()
+    dataset_name = normalize_benchmark(source.dataset_name).replace(' ', '_')
+    result_id = (result.evaluation_result_id or '').strip().lower()
+    metric_name = (result.metric_config.metric_name or '').strip().lower()
+
+    is_openeval_gpqa = (
+        hf_repo == 'human-centered-eval/openeval' and dataset_name == 'gpqa'
+    )
+    is_chain_of_thought = (
+        result_id == 'gpqa::chain-of-thought-correctness'
+        or metric_name == 'chain_of_thought_correctness'
+    )
+    if is_openeval_gpqa and is_chain_of_thought:
+        return 'GPQA chain-of-thought'
+
+    is_diamond = dataset_name == 'gpqa_diamond' or result_id.startswith(
+        'gpqa_diamond/'
+    )
+    return 'GPQA Diamond' if is_diamond else None
+
+
+def _community_eval_notes(benchmark: str, result: EvaluationResult) -> str | None:
+    """Return the variant note for ``result``; only 'gpqa' produces one."""
+    return _gpqa_variant_notes(result) if benchmark == 'gpqa' else None
+
+
+def _community_eval_notes_for_subset(
+    benchmark: str,
+    subset: str | None,
+) -> str | None:
+    """Translate an index ``subset`` into its note text.
+
+    Returns None when no subset is given or the benchmark is not 'gpqa'.
+
+    Raises:
+        HFEvalsError: if the normalized subset is not a key of
+            ``GPQA_SUBSET_NOTES``.
+    """
+    if subset is None or benchmark != 'gpqa':
+        return None
+    subset_key = normalize_benchmark(subset)
+    try:
+        return GPQA_SUBSET_NOTES[subset_key]
+    except KeyError as exc:
+        allowed = ', '.join(sorted(GPQA_SUBSET_NOTES))
+        message = (
+            f'Unsupported subset for gpqa: {subset!r}; expected one of {allowed}.'
+        )
+        raise HFEvalsError(message) from exc
+
+
+def _community_eval_task_id(
+    benchmark: str,
+    config: BenchmarkConfig,
+    result: EvaluationResult,
+    notes: str | None,
+) -> str:
+    """Pick the task id for an entry.
+
+    For 'gpqa' the two known notes select 'main' or 'diamond'; every other
+    case uses ``config.task_id``. ``result`` is accepted but not read.
+    """
+    if benchmark == 'gpqa':
+        gpqa_task_ids = {
+            'GPQA chain-of-thought': 'main',
+            'GPQA Diamond': 'diamond',
+        }
+        if notes in gpqa_task_ids:
+            return gpqa_task_ids[notes]
+    return config.task_id
+
+
+def _community_eval_value(result: EvaluationResult) -> float | int:
+    """Convert the result's score to a 0-100 value rounded to 10 decimals.
+
+    Scores in percent units (or with ``max_score`` 100) are used as they are;
+    scores in proportion units (or with ``max_score`` 1) are multiplied by 100.
+
+    Raises:
+        HFEvalsError: if the score is not a finite non-bool number, the scale
+            cannot be determined, or the converted value is outside 0-100.
+    """
+    score = result.score_details.score
+    is_number = isinstance(score, (int, float)) and not isinstance(score, bool)
+    if not is_number or not isfinite(float(score)):
+        raise HFEvalsError('score_details.score must be numeric')
+
+    metric = result.metric_config
+    unit = (metric.metric_unit or '').strip().lower()
+    max_score = metric.max_score
+    percent: float
+    if unit in {'percent', 'percentage'} or max_score == 100:
+        percent = float(score)
+    elif unit == 'proportion' or max_score == 1:
+        percent = float(score) * 100
+    else:
+        raise HFEvalsError(
+            'Cannot convert score to 0-100 Community Evals value without '
+            f'metric_unit=proportion/percent or max_score=1/100 for '
+            f'{result.evaluation_result_id!r}.'
+        )
+
+    if not 0 <= percent <= 100:
+        raise HFEvalsError(
+            f'Community Evals value for {result.evaluation_result_id!r} '
+            f'must be in the 0-100 range, got {percent}.'
+        )
+    return round(percent, 10)
+
+
+def _target_path(config: BenchmarkConfig) -> str:
+    """Return ``.eval_results/<yaml_name>.yaml`` for ``config``."""
+    yaml_filename = f'{config.yaml_name}.yaml'
+    return f'.eval_results/{yaml_filename}'
+
+
+def _entry_is_ready(entry: dict[str, Any]) -> bool:
+    """Tell whether ``entry`` has status 'ready'; a missing status counts."""
+    status = entry.get('status', 'ready')
+    return status == 'ready'
+
+
+def _entry_has_yaml_preview(entry: dict[str, Any]) -> bool:
+    """Tell whether ``entry``'s status is 'ready' or ``AUDIT_ERROR_STATUS``.
+
+    A missing status is treated as 'ready'.
+    """
+    previewable = {'ready', AUDIT_ERROR_STATUS}
+    return entry.get('status', 'ready') in previewable
+
+
+def _api_only_skip_reason(log: EvaluationLog) -> str | None:
+    """Return a skip reason when the log's model looks API-only, else None.
+
+    A model whose id or name starts with an ``OPEN_WEIGHT_MODEL_PREFIXES``
+    entry is never skipped. Otherwise the first ``API_ONLY_PROVIDER_PREFIXES``
+    entry that equals the platform, developer, model id or model name, or
+    prefixes one of them as ``<prefix>/``, yields a reason. Any remaining
+    model with 'gemini' in its id or name is attributed to 'gemini'.
+    """
+    info = log.model_info
+    platform = (info.inference_platform or '').strip().lower()
+    developer = (info.developer or '').strip().lower()
+    model_id = info.id.strip().lower()
+    model_name = info.name.strip().lower()
+
+    open_weight_prefixes = tuple(OPEN_WEIGHT_MODEL_PREFIXES)
+    if model_id.startswith(open_weight_prefixes):
+        return None
+    if model_name.startswith(open_weight_prefixes):
+        return None
+
+    candidates = (platform, developer, model_id, model_name)
+    for prefix in API_ONLY_PROVIDER_PREFIXES:
+        provider_path = f'{prefix}/'
+        for candidate in candidates:
+            if candidate == prefix or candidate.startswith(provider_path):
+                return f'api_only_or_closed_provider:{prefix}'
+
+    if 'gemini' in model_id or 'gemini' in model_name:
+        return 'api_only_or_closed_provider:gemini'
+    return None
+
+
+def _safe_index_path(row: dict[str, Any], field: str, *, line_ref: str) -> str:
+    """Return ``row[field]`` unchanged after checking it is a safe path.
+
+    Raises:
+        HFEvalsError: if the value is missing, not a string, or blank, or if
+            it is an absolute path or contains a '..' component.
+    """
+    candidate = row.get(field)
+    if not (isinstance(candidate, str) and candidate.strip()):
+        raise HFEvalsError(f'{line_ref}: missing {field}')
+    parsed = Path(candidate)
+    escapes_root = parsed.is_absolute() or '..' in parsed.parts
+    if escapes_root:
+        raise HFEvalsError(f'{line_ref}: unsafe {field}: {candidate}')
+    return candidate
+
+
+def _index_subset(row: dict[str, Any], *, line_ref: str) -> str | None:
+    """Return the row's trimmed ``subset``, or None when it is absent.
+
+    Raises:
+        HFEvalsError: if ``subset`` is present but not a non-blank string.
+    """
+    subset = row.get('subset')
+    if subset is None:
+        return None
+    cleaned = subset.strip() if isinstance(subset, str) else ''
+    if not cleaned:
+        raise HFEvalsError(f'{line_ref}: subset must be a non-empty string')
+    return cleaned
+
+
+def _reject_unsupported_row_sources(row: dict[str, Any], *, line_ref: str) -> None:
+    """Raise ``HFEvalsError`` when the row sets ``url`` or ``local_path``."""
+    present = ', '.join(
+        name for name in ('url', 'local_path') if row.get(name) is not None
+    )
+    if not present:
+        return
+    raise HFEvalsError(
+        f'{line_ref}: unsupported aggregate row source field(s): {present}; '
+        'use object_path from the datastore index'
+    )
+
+
+def _validate_instance_level_reference(
+    row: dict[str, Any],
+    *,
+    line_ref: str,
+) -> None:
+    """Validate the instance-level provenance fields of an index row.
+
+    When ``instance_level_available`` is false, none of the provenance fields
+    may be set. When it is true, ``instance_level_path`` must be a safe path,
+    ``instance_level_size_bytes`` an integer and ``instance_sha`` a 64-character
+    hex digest (either letter case).
+
+    Raises:
+        HFEvalsError: on the first violated rule.
+    """
+    provenance_fields = (
+        'instance_level_path',
+        'instance_level_size_bytes',
+        'instance_sha',
+    )
+    available = row.get('instance_level_available')
+    if not isinstance(available, bool):
+        raise HFEvalsError(f'{line_ref}: instance_level_available must be boolean')
+    if not available:
+        unexpected = [
+            field for field in provenance_fields if row.get(field) is not None
+        ]
+        if unexpected:
+            raise HFEvalsError(
+                f'{line_ref}: instance_level_available is false but '
+                f'instance provenance is present: {", ".join(unexpected)}'
+            )
+        return
+
+    _safe_index_path(row, 'instance_level_path', line_ref=line_ref)
+    size = row.get('instance_level_size_bytes')
+    # bool is a subclass of int, so JSON true/false must be rejected explicitly.
+    if isinstance(size, bool) or not isinstance(size, int):
+        raise HFEvalsError(f'{line_ref}: instance_level_size_bytes must be an integer')
+    instance_sha = row.get('instance_sha')
+    if not isinstance(instance_sha, str) or not instance_sha:
+        raise HFEvalsError(f'{line_ref}: missing instance_sha')
+    if len(instance_sha) != 64 or any(
+        char not in '0123456789abcdef' for char in instance_sha.lower()
+    ):
+        raise HFEvalsError(f'{line_ref}: instance_sha must be a sha256 hex digest')
+
+
+def _index_trace_fields(row: dict[str, Any]) -> dict[str, Any]:
+    """Copy the non-None ``legacy_path``, ``object_path`` and ``subset`` values."""
+    traced = ('legacy_path', 'object_path', 'subset')
+    return {name: row[name] for name in traced if row.get(name) is not None}
+
+
+def _resolve_index_jsonl_path(index_path: Path) -> Path:
+    """Resolve ``index_path`` to the JSONL file to read.
+
+    A directory is replaced by the ``aggregate.jsonl`` inside it; anything
+    else is returned resolved without further checks.
+
+    Raises:
+        HFEvalsError: if a directory has no ``aggregate.jsonl`` or that entry
+            is not a file.
+    """
+    resolved = index_path.resolve()
+    if resolved.is_dir():
+        candidate = resolved / 'aggregate.jsonl'
+        if not candidate.exists():
+            raise HFEvalsError(
+                f'Index directory must contain aggregate.jsonl: {resolved}'
+            )
+        if not candidate.is_file():
+            raise HFEvalsError(
+                f'Index directory aggregate.jsonl must be a file: {candidate}'
+            )
+        return candidate
+    return resolved
+
+
+def _load_index_rows(index_jsonl: Path) -> list[dict[str, Any]]:
+    """Read every non-blank line of ``index_jsonl`` as a JSON object.
+
+    Each returned row gains an ``_index_line`` key holding its 1-based line
+    number in the file.
+
+    Raises:
+        HFEvalsError: if the path is missing or not a file, a line is not
+            valid JSON or not an object, or the file has no rows.
+    """
+    if not index_jsonl.exists():
+        raise HFEvalsError(f'Index JSONL does not exist: {index_jsonl}')
+    if not index_jsonl.is_file():
+        raise HFEvalsError(f'Index JSONL must be a file: {index_jsonl}')
+
+    rows: list[dict[str, Any]] = []
+    with index_jsonl.open(encoding='utf-8') as handle:
+        for line_number, raw_line in enumerate(handle, start=1):
+            text = raw_line.strip()
+            if not text:
+                continue
+            location = f'{index_jsonl}:{line_number}'
+            try:
+                parsed = json.loads(text)
+            except json.JSONDecodeError as exc:
+                raise HFEvalsError(
+                    f'{location}: invalid JSONL row: {exc}'
+                ) from exc
+            if not isinstance(parsed, dict):
+                raise HFEvalsError(f'{location}: JSONL row must be an object')
+            parsed['_index_line'] = line_number
+            rows.append(parsed)
+
+    if not rows:
+        raise HFEvalsError(f'Index JSONL has no rows: {index_jsonl}')
+    return rows
+
+
+def _safe_collection_name(value: str) -> str:
+    """Return ``value`` when it is usable as a single collection file stem.
+
+    Raises:
+        HFEvalsError: if the name is blank, has surrounding whitespace, ends in
+            ``.jsonl``, contains a path separator, or is '.' or '..'.
+    """
+    if not value.strip():
+        raise HFEvalsError('Collection name must not be empty.')
+    if value.strip() != value:
+        raise HFEvalsError('Collection name must not include surrounding whitespace.')
+    if value.endswith('.jsonl'):
+        raise HFEvalsError('Pass the collection name without the .jsonl suffix.')
+
+    has_separator = '/' in value or '\\' in value
+    if has_separator:
+        raise HFEvalsError(
+            'Collection name must be a single by_collection file stem.'
+        )
+    is_single_component = Path(value).parts == (value,)
+    if not is_single_component or value in {'.', '..'}:
+        raise HFEvalsError(
+            'Collection name must be a single by_collection file stem.'
+        )
+    return value
+
+
+def _collection_index_path(collection_name: str) -> str:
+    """Return ``flat/indexes/by_collection/<name>.jsonl`` for a validated name."""
+    stem = _safe_collection_name(collection_name)
+    return 'flat/indexes/by_collection/' + stem + '.jsonl'
+
+
+def _collection_stems_from_repo_files(paths: list[str]) -> list[str]:
+    """List the collection stems found among repository file paths.
+
+    Only ``.jsonl`` files directly under ``flat/indexes/by_collection/`` with a
+    non-empty stem count. The result is de-duplicated and sorted
+    case-insensitively.
+    """
+    prefix = 'flat/indexes/by_collection/'
+    suffix = '.jsonl'
+    stems: set[str] = set()
+    for path in paths:
+        if not isinstance(path, str):
+            continue
+        if not (path.startswith(prefix) and path.endswith(suffix)):
+            continue
+        stem = path.removeprefix(prefix).removesuffix(suffix)
+        # Nested paths are not collection files; an empty stem is not a name.
+        if stem and '/' not in stem:
+            stems.add(stem)
+    return sorted(stems, key=str.lower)
+
+
+def _normalized_collection_stem(value: str) -> str:
+    """Lowercase ``value`` and delete every '-', '_', ' ' and '.'."""
+    without_separators = str.maketrans('', '', '-_ .')
+    return value.lower().translate(without_separators)
+
+
+def _nearby_collection_stems(collection_name: str, stems: list[str]) -> list[str]:
+    """Suggest up to five stems that resemble ``collection_name``.
+
+    Stems whose normalized form contains, or is contained in, the normalized
+    request come first in their input order. ``get_close_matches`` results not
+    already listed follow.
+    """
+    wanted = _normalized_collection_stem(collection_name)
+
+    def overlaps(stem: str) -> bool:
+        normalized = _normalized_collection_stem(stem)
+        return wanted in normalized or normalized in wanted
+
+    suggestions = [stem for stem in stems if overlaps(stem)]
+    fuzzy_matches = get_close_matches(collection_name, stems, n=5, cutoff=0.55)
+    for stem in fuzzy_matches:
+        if stem not in suggestions:
+            suggestions.append(stem)
+    return suggestions[:5]
+
+
+def _collection_suggestion_text(
+    *,
+    api: HfApi,
+    datastore_repo: str,
+    datastore_revision: str,
+    collection_name: str,
+) -> str:
+    """Describe which collection stems exist near ``collection_name``.
+
+    Never raises for listing failures: a failed ``api.list_repo_files`` call is
+    reported inside the returned text.
+    """
+    try:
+        repo_files = api.list_repo_files(
+            repo_id=datastore_repo,
+            repo_type='dataset',
+            revision=datastore_revision,
+        )
+    except Exception as exc:  # noqa: BLE001
+        return f'Unable to list available collection stems: {exc}'
+
+    available = _collection_stems_from_repo_files(list(repo_files))
+    if not available:
+        return 'No collection JSONL files were found under flat/indexes/by_collection.'
+    nearby = _nearby_collection_stems(collection_name, available)
+    if nearby:
+        return f'Nearby collection stems: {", ".join(nearby)}'
+    return 'No nearby collection stems found.'
+
+
+def _download_collection_index_jsonl(
+    *,
+    api: HfApi,
+    datastore_repo: str,
+    datastore_revision: str,
+    collection_name: str,
+    download_file: Callable[..., str] | None = None,
+) -> tuple[str, Path]:
+    """Download the index file of a collection from the datastore repo.
+
+    ``download_file`` defaults to ``hf_hub_download``.
+
+    Returns:
+        The in-repo index path and the local path of the downloaded file.
+
+    Raises:
+        HFEvalsError: if the collection name is invalid or the download fails;
+            the download error message includes collection suggestions.
+    """
+    fetch = download_file or hf_hub_download
+    index_path = _collection_index_path(collection_name)
+    try:
+        downloaded = fetch(
+            repo_id=datastore_repo,
+            repo_type='dataset',
+            filename=index_path,
+            revision=datastore_revision,
+        )
+    except Exception as exc:  # noqa: BLE001
+        hint = _collection_suggestion_text(
+            api=api,
+            datastore_repo=datastore_repo,
+            datastore_revision=datastore_revision,
+            collection_name=collection_name,
+        )
+        message = (
+            f'Unable to download required collection index file '
+            f'{index_path} from {datastore_repo}@{datastore_revision}. '
+            f'{hint}'
+        )
+        raise HFEvalsError(message) from exc
+    return index_path, Path(downloaded)
+
+
+def _candidate_duplicate_key(entry: dict[str, Any]) -> tuple[str, str, str]:
+    """Build the ``(model repo, dataset id, task id)`` key of a candidate.
+
+    Model repo and dataset id are trimmed and lowercased; the task id is only
+    trimmed.
+    """
+    dataset = entry['yaml_entry']['dataset']
+    model_key = str(entry['model_repo']).strip().lower()
+    dataset_key = str(dataset['id']).strip().lower()
+    task_key = str(dataset['task_id']).strip()
+    return model_key, dataset_key, task_key
+
+
+def _numeric_score(value: Any, *, context: str) -> float:
+    """Return ``value`` as a finite float.
+
+    Raises:
+        HFEvalsError: if ``value`` is a bool, not an int or float, or not
+            finite; the message is prefixed with ``context``.
+    """
+    is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
+    if not is_number:
+        raise HFEvalsError(f'{context}: score value must be numeric')
+    as_float = float(value)
+    if isfinite(as_float):
+        return as_float
+    raise HFEvalsError(f'{context}: score value must be finite')
+
+
+def _scores_equal(left: Any, right: Any) -> bool:
+    """Tell whether two numeric scores differ by at most 1e-9."""
+    left_score = _numeric_score(left, context='left score')
+    right_score = _numeric_score(right, context='right score')
+    return abs(left_score - right_score) <= 1e-9
+
+
+def _read_online_indexed_record(
+    *,
+    datastore_repo: str,
+    datastore_revision: str,
+    object_path: str,
+    row: dict[str, Any],
+    line_ref: str,
+    download_file: Callable[..., str] | None = None,
+) -> EvaluationLog:
+    """Download one indexed object and parse it into an ``EvaluationLog``.
+
+    ``download_file`` defaults to ``hf_hub_download``. The downloaded bytes
+    are checked and parsed by ``_parse_indexed_record_bytes``.
+
+    Raises:
+        HFEvalsError: if the download fails or the bytes are rejected.
+    """
+    fetch = download_file or hf_hub_download
+    try:
+        downloaded = fetch(
+            repo_id=datastore_repo,
+            repo_type='dataset',
+            filename=object_path,
+            revision=datastore_revision,
+        )
+    except Exception as exc:  # noqa: BLE001
+        message = (
+            f'{line_ref}: unable to download {object_path} from '
+            f'{datastore_repo}@{datastore_revision}'
+        )
+        raise HFEvalsError(message) from exc
+
+    payload = Path(downloaded).read_bytes()
+    return _parse_indexed_record_bytes(
+        data=payload,
+        row=row,
+        line_ref=line_ref,
+        source_ref=object_path,
+    )
+
+
+def _parse_indexed_record_bytes(
+    *,
+    data: bytes,
+    row: dict[str, Any],
+    line_ref: str,
+    source_ref: str,
+) -> EvaluationLog:
+    """Verify ``data`` against its index row and parse it as an EEE aggregate.
+
+    ``size_bytes`` is checked only when the row provides it; ``sha256`` is
+    mandatory and compared without regard to letter case.
+
+    Raises:
+        HFEvalsError: if the size or digest is invalid or does not match, or if
+            the bytes are not valid UTF-8 JSON accepted by ``EvaluationLog``.
+    """
+    expected_size = row.get('size_bytes')
+    if expected_size is not None:
+        # bool is a subclass of int; JSON true/false is not a byte count.
+        if isinstance(expected_size, bool) or not isinstance(expected_size, int):
+            raise HFEvalsError(f'{line_ref}: size_bytes must be an integer')
+        if len(data) != expected_size:
+            raise HFEvalsError(
+                f'{line_ref}: size_bytes mismatch for {source_ref}: '
+                f'expected {expected_size}, got {len(data)}'
+            )
+
+    expected_sha = row.get('sha256')
+    if not isinstance(expected_sha, str) or not expected_sha.strip():
+        raise HFEvalsError(f'{line_ref}: missing sha256')
+    actual_sha = hashlib.sha256(data).hexdigest()
+    # hexdigest() is lowercase; normalize the index value so an upper-case
+    # spelling of the same digest is not reported as a mismatch.
+    if actual_sha != expected_sha.strip().lower():
+        raise HFEvalsError(
+            f'{line_ref}: sha256 mismatch for {source_ref}: '
+            f'expected {expected_sha}, got {actual_sha}'
+        )
+
+    try:
+        raw = json.loads(data.decode('utf-8'))
+        log = EvaluationLog.model_validate(raw)
+    except Exception as exc:  # noqa: BLE001
+        raise HFEvalsError(f'{line_ref}: invalid EEE aggregate JSON: {exc}') from exc
+    return log
+
+
+def _candidate_from_record_result(
+    *,
+    benchmark: str,
+    config: BenchmarkConfig,
+    record_path: str,
+    log: EvaluationLog,
+    result: EvaluationResult,
+    model_repo: str,
+    source_url: str,
+    source: str,
+    status: str,
+    hf_check_error: str | None,
+    subset: str | None = None,
+) -> dict[str, Any]:
+    """Build the manifest candidate for one evaluation result.
+
+    The note derived from the index ``subset`` replaces the one inferred from
+    the result, provided the two do not disagree. ``hf_check_error`` is added
+    to the candidate only when it is not None.
+
+    Raises:
+        HFEvalsError: if the score, subset or timestamp is rejected, or the
+            subset note conflicts with the inferred variant note.
+    """
+    value = _community_eval_value(result)
+    variant_notes = _community_eval_notes(benchmark, result)
+    subset_notes = _community_eval_notes_for_subset(benchmark, subset)
+    if subset_notes is None:
+        notes = variant_notes
+    elif variant_notes is None or variant_notes == subset_notes:
+        notes = subset_notes
+    else:
+        raise HFEvalsError(
+            f'Index subset {subset!r} conflicts with aggregate variant '
+            f'{variant_notes!r}.'
+        )
+
+    task_id = _community_eval_task_id(benchmark, config, result, notes)
+    evaluation_date = _date_from_result(log, result)
+    yaml_entry = _community_eval_entry(
+        config=config,
+        task_id=task_id,
+        value=value,
+        date=evaluation_date,
+        source_url=source_url,
+        notes=notes,
+    )
+
+    pr_body = (
+        'Adds a Hugging Face Community Evals result from '
+        f'{SOURCE_NAME} with a backlink to the source EEE record.'
+    )
+    candidate = {
+        'status': status,
+        'benchmark': benchmark,
+        'model_repo': model_repo,
+        'target_path': _target_path(config),
+        'eee_evaluation_id': log.evaluation_id,
+        'eee_evaluation_result_id': result.evaluation_result_id,
+        'eee_record_path': record_path,
+        'source': source,
+        'yaml_entry': yaml_entry,
+        'pr_title': f'Add EvalEval {task_id} result for {model_repo}',
+        'pr_body': pr_body,
+    }
+    if hf_check_error is not None:
+        candidate['hf_check_error'] = hf_check_error
+    return candidate
+
+
+def build_index_manifest(
+    *,
+    index_jsonl: Path,
+    datastore: str,
+    benchmarks: list[str],
+    output_path: Path | None = None,
+    api: HfApi | None = None,
+    check_hf: bool = True,
+    download_file: Callable[..., str] | None = None,
+) -> dict[str, Any]:
+    """Build HF Community Evals candidates from online flat datastore rows."""
+
+    api = api or HfApi()
+    index_jsonl = _resolve_index_jsonl_path(index_jsonl)
+    rows = _load_index_rows(index_jsonl)
+    entries: list[dict[str, Any]] = []
+    skipped: list[dict[str, Any]] = []
+    errors: list[str] = []
+    seen_keys: dict[tuple[str, str, str, str | None], dict[str, Any]] = {}
+    repo_check_cache: dict[str, HFEvalsError | None] = {}
+    datastore_repo, datastore_revision = resolve_datastore_locator(
+        datastore, api=api
+    )
+
+    def cached_repo_error(repo_id: str) -> HFEvalsError | None:
+        if repo_id not in repo_check_cache:
+            try:
+                _repo_exists(api, repo_id)
+                repo_check_cache[repo_id] = None
+            except HFEvalsError as exc:
+                repo_check_cache[repo_id] = exc
+        return repo_check_cache[repo_id]
+
+    for row in rows:
+        line_number = row['_index_line']
+        line_ref = f'{index_jsonl}:{line_number}'
+
+        raw_benchmark = row.get('benchmark')
+        if not isinstance(raw_benchmark, str) or not raw_benchmark.strip():
+            errors.append(f'{line_ref}: missing benchmark')
+            continue
+        try:
+            _reject_unsupported_row_sources(row, line_ref=line_ref)
+            subset = _index_subset(row, line_ref=line_ref)
+        except HFEvalsError as exc:
+            errors.append(str(exc))
+            continue
+        normalized_benchmark = BENCHMARK_ALIASES.get(
+            normalize_benchmark(raw_benchmark),
+            normalize_benchmark(raw_benchmark),
+        )
+        if normalized_benchmark not in BENCHMARK_CONFIGS:
+            skipped.append(
+                {
+                    'reason': 'unsupported_index_benchmark',
+                    'benchmark': raw_benchmark,
+                    'index_path': index_jsonl.as_posix(),
+                    'index_line': line_number,
+                    **_index_trace_fields(row),
+                }
+            )
+            continue
+        if normalized_benchmark not in benchmarks:
+            skipped.append(
+                {
+                    'reason': 'benchmark_not_selected',
+                    'benchmark': raw_benchmark,
+                    'index_path': index_jsonl.as_posix(),
+                    'index_line': line_number,
+                    **_index_trace_fields(row),
+                }
+            )
+            continue
+
+        record_type = row.get('record_type')
+        if record_type != 'aggregate':
+            skipped.append(
+                {
+                    'reason': 'non_aggregate_index_row',
+                    'record_type': record_type,
+                    'benchmark': raw_benchmark,
+                    'index_path': index_jsonl.as_posix(),
+                    'index_line': line_number,
+                    **_index_trace_fields(row),
+                }
+            )
+            continue
+
+        try:
+            object_path = _safe_index_path(row, 'object_path', line_ref=line_ref)
+            record_ref = object_path
+            source_url = _datastore_blob_url(
+                object_path,
+                datastore_repo=datastore_repo,
+                datastore_revision=datastore_revision,
+            )
+            source_mode = 'online_flat_index_jsonl'
+            log = _read_online_indexed_record(
+                datastore_repo=datastore_repo,
+                datastore_revision=datastore_revision,
+                object_path=object_path,
+                row=row,
+                line_ref=line_ref,
+                download_file=download_file,
+            )
+        except HFEvalsError as exc:
+            errors.append(str(exc))
+            continue
+
+        api_only_reason = _api_only_skip_reason(log)
+        if api_only_reason is not None:
+            skipped.append(
+                {
+                    'reason': api_only_reason,
+                    'benchmark': raw_benchmark,
+                    'eee_evaluation_id': log.evaluation_id,
+                    'eee_record_path': record_ref,
+                    'index_path': index_jsonl.as_posix(),
+                    'index_line': line_number,
+                    **_index_trace_fields(row),
+                    'model_id': log.model_info.id,
+                }
+            )
+            continue
+
+        raw_model_repo = log.model_info.id
+        if not isinstance(raw_model_repo, str) or not raw_model_repo.strip():
+            errors.append(f'{line_ref}: record has no model_info.id')
+            continue
+        model_repo = raw_model_repo.strip()
+
+        status = 'ready'
+        hf_check_error: str | None = None
+        if check_hf:
+            error = cached_repo_error(model_repo)
+            if error is not None:
+                status = 'missing_hf_model'
+                hf_check_error = str(error)
+
+        config = BENCHMARK_CONFIGS[normalized_benchmark]
+        try:
+            result = _result_for_dataset(log, config)
+        except HFEvalsError as exc:
+            errors.append(f'{line_ref}: {exc}')
+            continue
+        if result is None:
+            skipped.append(
+                {
+                    'reason': 'no_matching_evaluation_result',
+                    'benchmark': raw_benchmark,
+                    'eee_evaluation_id': log.evaluation_id,
+                    'eee_record_path': record_ref,
+                    'index_path': index_jsonl.as_posix(),
+                    'index_line': line_number,
+                    **_index_trace_fields(row),
+                }
+            )
+            continue
+
+        try:
+            entry = _candidate_from_record_result(
+                benchmark=normalized_benchmark,
+                config=config,
+                record_path=record_ref,
+                log=log,
+                result=result,
+                model_repo=model_repo,
+                source_url=source_url,
+                source=source_mode,
+                status=status,
+                hf_check_error=hf_check_error,
+                subset=subset,
+            )
+        except HFEvalsError as exc:
+            errors.append(f'{line_ref}: {exc}')
+            continue
+
+        entry['index_path'] = index_jsonl.as_posix()
+        entry['index_line'] = line_number
+        entry['flat_object_path'] = record_ref
+        for field in (
+            'legacy_path',
+            'object_uuid',
+            'subset',
+            'sha256',
+            'size_bytes',
+            'eval_schema_version',
+            'instance_object_path',
+            'instance_sha256',
+            'instance_size_bytes',
+        ):
+            value = row.get(field)
+            if value is not None:
+                entry[field] = value
+
+        dataset = entry['yaml_entry']['dataset']
+        duplicate_key = (
+            model_repo.lower(),
+            dataset['id'],
+            dataset['task_id'],
+            entry['yaml_entry'].get('notes'),
+        )
+        existing_entry = seen_keys.get(duplicate_key)
+        if existing_entry is not None:
+            if existing_entry['yaml_entry'] == entry['yaml_entry']:
+                skipped.append(
+                    {
+                        'reason': 'duplicate_candidate_same_entry',
+                        'model_repo': model_repo,
+                        'eee_evaluation_id': log.evaluation_id,
+                        'eee_record_path': record_ref,
+                        'index_path': index_jsonl.as_posix(),
+                        'index_line': line_number,
+                        **_index_trace_fields(row),
+                    }
+                )
+                continue
+            errors.append(
+                f'{line_ref}: duplicate candidate for {model_repo} '
+                f'{dataset["id"]}/{dataset["task_id"]} with different '
+                'YAML values.'
+            )
+            continue
+        seen_keys[duplicate_key] = entry
+        entries.append(entry)
+
+    manifest = {
+        'version': MANIFEST_VERSION,
+        'created_at': datetime.now(tz=UTC).isoformat(),
+        'benchmarks': benchmarks,
+        'hf_checks': check_hf,
+        'source_url_mode': 'online_flat_index_jsonl',
+        'datastore': f'{datastore_repo}@{datastore_revision}',
+        'datastore_input': datastore,
+        'datastore_repo': datastore_repo,
+        'datastore_revision': datastore_revision,
+        'index_jsonl': index_jsonl.as_posix(),
+        'entries': entries,
+        'skipped': skipped,
+        'errors': errors,
+    }
+
+    if output_path is not None:
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        output_path.write_text(
+            json.dumps(manifest, indent=2, sort_keys=True) + '\n',
+            encoding='utf-8',
+        )
+
+    if errors:
+        raise HFEvalsError('\n'.join(errors))
+
+    return manifest
+
+
+def build_collection_manifest(
+    *,
+    collection_name: str,
+    datastore: str,
+    output_path: Path | None = None,
+    api: HfApi | None = None,
+    check_hf: bool = True,
+    download_file: Callable[..., str] | None = None,
+    progress: ReviewProgress | None = None,
+) -> dict[str, Any]:
+    """Build HF Community Evals candidates from a datastore collection.
+
+    The datastore locator is resolved to a repo and revision, the collection
+    index JSONL is downloaded and every row in it is processed in order. Each
+    aggregate row's record is downloaded and turned into one candidate entry
+    per supported dataset result. Rows that cannot be used are recorded under
+    ``skipped``; problems are collected under ``errors``.
+
+    Args:
+        collection_name: Collection to read; passed through
+            ``_safe_collection_name`` before use.
+        datastore: Datastore locator handed to ``resolve_datastore_locator``.
+        output_path: When given, the manifest is written there. The write
+            happens before errors are raised, so the file exists even when
+            this function fails.
+        api: Hub client; a fresh ``HfApi()`` is created when omitted.
+        check_hf: When true, each model repo is checked with ``_repo_exists``
+            and a failing check marks the entry ``missing_hf_model`` instead
+            of producing an error.
+        download_file: Optional downloader forwarded to the index and record
+            download helpers.
+        progress: Progress reporter; a fresh ``ReviewProgress()`` is created
+            when omitted.
+
+    Returns:
+        The manifest dict (entries, skipped rows, datastore metadata).
+
+    Raises:
+        HFEvalsError: If any row produced an error; the message is all
+            collected errors joined by newlines.
+    """
+
+    progress = progress or ReviewProgress()
+    api = api or HfApi()
+    collection_name = _safe_collection_name(collection_name)
+    entries: list[dict[str, Any]] = []
+    skipped: list[dict[str, Any]] = []
+    errors: list[str] = []
+    # Keys come from _candidate_duplicate_key; the 3-tuple annotation is
+    # presumably its return shape -- TODO confirm against that helper.
+    seen_keys: dict[tuple[str, str, str], dict[str, Any]] = {}
+    # Maps repo id -> error from the existence check (None means it exists).
+    repo_check_cache: dict[str, HFEvalsError | None] = {}
+    # Four setup steps: resolve, download index, read rows, final summary.
+    setup_task = progress.add_task('Resolving datastore revision', total=4)
+    datastore_repo, datastore_revision = resolve_datastore_locator(
+        datastore, api=api
+    )
+    progress.update(
+        setup_task,
+        advance=1,
+        description=f'Downloading collection index {collection_name}.jsonl',
+    )
+    collection_index_path, collection_index_jsonl = _download_collection_index_jsonl(
+        api=api,
+        datastore_repo=datastore_repo,
+        datastore_revision=datastore_revision,
+        collection_name=collection_name,
+        download_file=download_file,
+    )
+    progress.update(setup_task, advance=1, description='Reading collection rows')
+    rows = _load_index_rows(collection_index_jsonl)
+    progress.update(
+        setup_task,
+        advance=1,
+        description=f'Loaded {len(rows)} collection rows',
+    )
+
+    def cached_repo_error(repo_id: str) -> HFEvalsError | None:
+        # Both outcomes are cached, so each repo is checked at most once.
+        if repo_id not in repo_check_cache:
+            try:
+                _repo_exists(api, repo_id)
+                repo_check_cache[repo_id] = None
+            except HFEvalsError as exc:
+                repo_check_cache[repo_id] = exc
+        return repo_check_cache[repo_id]
+
+    row_task = progress.add_task(
+        f'Processing {len(rows)} aggregate rows',
+        total=len(rows),
+    )
+    # Every exit path of the loop body advances row_task exactly once.
+    for row_number, row in enumerate(rows, start=1):
+        line_number = row['_index_line']
+        line_ref = f'{collection_index_path}:{line_number}'
+        raw_benchmark = row.get('benchmark')
+        row_label = f'row {row_number}/{len(rows)}'
+
+        # Row-level validation; these helpers raise messages that are
+        # appended as-is (no extra line_ref prefix is added here).
+        try:
+            _reject_unsupported_row_sources(row, line_ref=line_ref)
+            subset = _index_subset(row, line_ref=line_ref)
+            _validate_instance_level_reference(row, line_ref=line_ref)
+        except HFEvalsError as exc:
+            errors.append(str(exc))
+            progress.update(row_task, advance=1)
+            continue
+
+        # Only aggregate rows become candidates; anything else is skipped.
+        record_type = row.get('record_type')
+        if record_type != 'aggregate':
+            skipped.append(
+                {
+                    'reason': 'non_aggregate_collection_row',
+                    'record_type': record_type,
+                    'benchmark': raw_benchmark,
+                    'collection_index_path': collection_index_path,
+                    'collection_index_line': line_number,
+                    **_index_trace_fields(row),
+                }
+            )
+            progress.update(row_task, advance=1)
+            continue
+
+        # Resolve the object path, build its blob URL and fetch the record.
+        try:
+            object_path = _safe_index_path(row, 'object_path', line_ref=line_ref)
+            progress.update(
+                row_task,
+                description=f'{row_label}: downloading {object_path}',
+            )
+            source_url = _datastore_blob_url(
+                object_path,
+                datastore_repo=datastore_repo,
+                datastore_revision=datastore_revision,
+            )
+            log = _read_online_indexed_record(
+                datastore_repo=datastore_repo,
+                datastore_revision=datastore_revision,
+                object_path=object_path,
+                row=row,
+                line_ref=line_ref,
+                download_file=download_file,
+            )
+        except HFEvalsError as exc:
+            errors.append(str(exc))
+            progress.update(row_task, advance=1)
+            continue
+
+        # A non-None reason from _api_only_skip_reason means the record is
+        # skipped rather than treated as an error.
+        api_only_reason = _api_only_skip_reason(log)
+        if api_only_reason is not None:
+            skipped.append(
+                {
+                    'reason': api_only_reason,
+                    'benchmark': raw_benchmark,
+                    'eee_evaluation_id': log.evaluation_id,
+                    'eee_record_path': object_path,
+                    'collection_index_path': collection_index_path,
+                    'collection_index_line': line_number,
+                    **_index_trace_fields(row),
+                    'model_id': log.model_info.id,
+                }
+            )
+            progress.update(row_task, advance=1)
+            continue
+
+        # The model id is used as the target repo and must be non-blank.
+        raw_model_repo = log.model_info.id
+        if not isinstance(raw_model_repo, str) or not raw_model_repo.strip():
+            errors.append(f'{line_ref}: record has no model_info.id')
+            progress.update(row_task, advance=1)
+            continue
+        model_repo = raw_model_repo.strip()
+        progress.update(row_task, description=f'{row_label}: checking {model_repo}')
+
+        # A failed repo check downgrades the status but still emits the entry.
+        status = 'ready'
+        hf_check_error: str | None = None
+        if check_hf:
+            error = cached_repo_error(model_repo)
+            if error is not None:
+                status = 'missing_hf_model'
+                hf_check_error = str(error)
+
+        try:
+            supported_results = _results_for_supported_datasets(log)
+        except HFEvalsError as exc:
+            errors.append(f'{line_ref}: {exc}')
+            progress.update(row_task, advance=1)
+            continue
+        if not supported_results:
+            skipped.append(
+                {
+                    'reason': 'no_supported_hf_dataset_result',
+                    'benchmark': raw_benchmark,
+                    'eee_evaluation_id': log.evaluation_id,
+                    'eee_record_path': object_path,
+                    'collection_index_path': collection_index_path,
+                    'collection_index_line': line_number,
+                    **_index_trace_fields(row),
+                }
+            )
+            progress.update(row_task, advance=1)
+            continue
+
+        # One record may yield several candidates (one per supported result).
+        # A failure for one result does not stop the remaining results.
+        for benchmark, config, result in supported_results:
+            try:
+                entry = _candidate_from_record_result(
+                    benchmark=benchmark,
+                    config=config,
+                    record_path=object_path,
+                    log=log,
+                    result=result,
+                    model_repo=model_repo,
+                    source_url=source_url,
+                    source='online_collection_index_jsonl',
+                    status=status,
+                    hf_check_error=hf_check_error,
+                    subset=subset,
+                )
+            except HFEvalsError as exc:
+                errors.append(f'{line_ref}: {exc}')
+                continue
+
+            entry['collection'] = collection_name
+            entry['collection_index_path'] = collection_index_path
+            entry['collection_index_line'] = line_number
+            entry['flat_object_path'] = object_path
+            # Copy provenance fields from the index row when they are set.
+            for field in (
+                'legacy_path',
+                'object_uuid',
+                'subset',
+                'sha256',
+                'size_bytes',
+                'eval_schema_version',
+                'instance_level_available',
+                'instance_level_path',
+                'instance_level_sha256',
+                'instance_level_size_bytes',
+                'instance_sha',
+                'instance_object_path',
+                'instance_sha256',
+                'instance_size_bytes',
+            ):
+                value = row.get(field)
+                if value is not None:
+                    entry[field] = value
+
+            # Duplicates with an equal score are skipped quietly; duplicates
+            # with a different score are reported as errors. The first entry
+            # seen for a key is the one that is kept.
+            duplicate_key = _candidate_duplicate_key(entry)
+            existing_entry = seen_keys.get(duplicate_key)
+            if existing_entry is not None:
+                if _scores_equal(
+                    existing_entry['yaml_entry']['value'],
+                    entry['yaml_entry']['value'],
+                ):
+                    skipped.append(
+                        {
+                            'reason': 'duplicate_candidate_same_score',
+                            'model_repo': model_repo,
+                            'eee_evaluation_id': log.evaluation_id,
+                            'eee_record_path': object_path,
+                            'collection_index_path': collection_index_path,
+                            'collection_index_line': line_number,
+                            **_index_trace_fields(row),
+                        }
+                    )
+                    continue
+                dataset = entry['yaml_entry']['dataset']
+                errors.append(
+                    f'{line_ref}: duplicate candidate for {model_repo} '
+                    f'{dataset["id"]}/{dataset["task_id"]} with different '
+                    'scores.'
+                )
+                continue
+            seen_keys[duplicate_key] = entry
+            entries.append(entry)
+        progress.update(row_task, advance=1)
+
+    progress.update(row_task, description=f'Processed {len(rows)} aggregate rows')
+
+    manifest = {
+        'version': MANIFEST_VERSION,
+        'created_at': datetime.now(tz=UTC).isoformat(),
+        'collection': collection_name,
+        'benchmarks': list(DEFAULT_BENCHMARKS),
+        'hf_checks': check_hf,
+        'source_url_mode': 'online_collection_index_jsonl',
+        'datastore': f'{datastore_repo}@{datastore_revision}',
+        'datastore_input': datastore,
+        'datastore_repo': datastore_repo,
+        'datastore_revision': datastore_revision,
+        'collection_jsonl': collection_index_path,
+        'entries': entries,
+        'skipped': skipped,
+        'errors': errors,
+    }
+
+    # Written before the error check so the manifest (including its
+    # 'errors' list) is on disk even when the call fails.
+    if output_path is not None:
+        _write_manifest(manifest, output_path)
+
+    if errors:
+        raise HFEvalsError('\n'.join(errors))
+
+    # Only reached when there were no errors, so the error count shown in
+    # this summary is always zero.
+    progress.update(
+        setup_task,
+        advance=1,
+        description=(
+            f'Built manifest: {len(entries)} entries, {len(skipped)} skipped, '
+            f'{len(errors)} errors'
+        ),
+    )
+    return manifest
+
+
+def _path_family_for_entry(entry: dict[str, Any]) -> tuple[str, tuple[str, ...]]:
+    benchmark = entry.get('benchmark')
+    if not isinstance(benchmark, str):
+        raise HFEvalsError('Manifest entry benchmark must be a string.')
+    paths = EVAL_RESULT_PATH_FAMILIES.get(benchmark)
+    if paths is None:
+        return benchmark, (entry['target_path'],)
+    return benchmark, paths
+
+
+def _repo_eval_tree(
+    api: HfApi,
+    repo_id: str,
+    revision: str,
+) -> dict[str, dict[str, Any]]:
+    try:
+        items = list(
+            api.list_repo_tree(
+                repo_id,
+                '.eval_results',
+                recursive=True,
+                expand=False,
+                revision=revision,
+                repo_type='model',
+                token=True,
+            )
+        )
+    except EntryNotFoundError:
+        return {}
+    except Exception as exc:  # noqa: BLE001
+        if exc.__class__.__name__ == 'EntryNotFoundError':
+            return {}
+        raise HFEvalsError(
+            f'Unable to list .eval_results for {repo_id}@{revision}'
+        ) from exc
+
+    tree: dict[str, dict[str, Any]] = {}
+    for item in items:
+        path = getattr(item, 'path', None) or getattr(item, 'rfilename', None)
+        if not path:
+            continue
+        tree[path] = {
+            'blob_id': getattr(item, 'blob_id', None) or path,
+            'size': getattr(item, 'size', None),
+        }
+    return tree
+
+
+def _discussion_number(discussion: Any) -> int | None:
+    value = getattr(discussion, 'num', None)
+    if value is None:
+        url = getattr(discussion, 'url', '')
+        if isinstance(url, str) and '/discussions/' in url:
+            value = url.rsplit('/discussions/', 1)[-1].strip('/')
+    try:
+        return int(value)
+    except (TypeError, ValueError):
+        return None
+
+
+def _discussion_url(repo_id: str, discussion: Any) -> str:
+    url = getattr(discussion, 'url', None)
+    if isinstance(url, str) and url:
+        return url
+    number = _discussion_number(discussion)
+    if number is None:
+        return f'https://huggingface.co/{repo_id}/discussions'
+    return f'https://huggingface.co/{repo_id}/discussions/{number}'
+
+
+def _discussion_revision(discussion: Any) -> str | None:
+    revision = getattr(discussion, 'git_reference', None)
+    if isinstance(revision, str) and revision:
+        return revision
+    number = _discussion_number(discussion)
+    if number is None:
+        return None
+    return f'refs/pr/{number}'
+
+
+def _open_pull_requests(api: HfApi, repo_id: str) -> list[Any]:
+    try:
+        return list(
+            api.get_repo_discussions(
+                repo_id,
+                repo_type='model',
+                discussion_type='pull_request',
+                discussion_status='open',
+                token=True,
+            )
+        )
+    except Exception as exc:  # noqa: BLE001
+        raise HFEvalsError(f'Unable to list open PRs for {repo_id}') from exc
+
+
+def _candidate_comment(entry: dict[str, Any]) -> str:
+    yaml_entry = entry['yaml_entry']
+    dataset = yaml_entry['dataset']
+    source = yaml_entry['source']
+    benchmark = f'{dataset["id"]}/{dataset["task_id"]}'
+    source_name = source.get('name') or SOURCE_NAME
+    source_url = source['url']
+    value = yaml_entry['value']
+    return (
+        f'This model scores {value} on {benchmark} run by {source_name}, '
+        'but it is different from the currently posted score. '
+        f'See {source_url} for full details.'
+    )
+
+
+def _already_present_comment(entry: dict[str, Any]) -> str:
+    yaml_entry = entry['yaml_entry']
+    dataset = yaml_entry['dataset']
+    return (
+        'Already present, will not open PR: '
+        f'{entry["model_repo"]} has {dataset["id"]}/{dataset["task_id"]} '
+        f'with score {yaml_entry["value"]}.'
+    )
+
+
+def _eval_yaml_paths(tree: dict[str, dict[str, Any]]) -> list[str]:
+    return sorted(
+        path
+        for path in tree
+        if path.startswith('.eval_results/')
+        and path.rsplit('.', 1)[-1].lower() in {'yaml', 'yml'}
+    )
+
+
+def _download_model_file_text(
+    *,
+    repo_id: str,
+    revision: str,
+    path: str,
+    download_file: Callable[..., str] | None = None,
+) -> str:
+    download_file = download_file or hf_hub_download
+    try:
+        local_path = download_file(
+            repo_id=repo_id,
+            repo_type='model',
+            filename=path,
+            revision=revision,
+        )
+    except Exception as exc:  # noqa: BLE001
+        raise HFEvalsError(
+            f'Unable to download {path} from {repo_id}@{revision}'
+        ) from exc
+    return Path(local_path).read_text(encoding='utf-8')
+
+
+def _load_eval_yaml_entries(
+    *,
+    repo_id: str,
+    revision: str,
+    path: str,
+    download_file: Callable[..., str] | None = None,
+) -> list[dict[str, Any]]:
+    text = _download_model_file_text(
+        repo_id=repo_id,
+        revision=revision,
+        path=path,
+        download_file=download_file,
+    )
+    try:
+        loaded = yaml.safe_load(text)
+    except yaml.YAMLError as exc:
+        raise HFEvalsError(
+            f'Invalid YAML in {repo_id}@{revision}:{path}: {exc}'
+        ) from exc
+    if not isinstance(loaded, list):
+        raise HFEvalsError(
+            f'Eval results YAML must be a list in {repo_id}@{revision}:{path}'
+        )
+    entries: list[dict[str, Any]] = []
+    for index, item in enumerate(loaded, start=1):
+        if not isinstance(item, dict):
+            raise HFEvalsError(
+                f'Eval results item {index} must be an object in '
+                f'{repo_id}@{revision}:{path}'
+            )
+        entries.append(item)
+    return entries
+
+
+def _yaml_dataset_key(yaml_entry: dict[str, Any]) -> tuple[str, str] | None:
+    dataset = yaml_entry.get('dataset')
+    if not isinstance(dataset, dict):
+        return None
+    dataset_id = dataset.get('id')
+    task_id = dataset.get('task_id')
+    if not isinstance(dataset_id, str) or not isinstance(task_id, str):
+        return None
+    return dataset_id.strip().lower(), task_id.strip()
+
+
+def _candidate_yaml_dataset_key(entry: dict[str, Any]) -> tuple[str, str]:
+    dataset = entry['yaml_entry']['dataset']
+    return str(dataset['id']).strip().lower(), str(dataset['task_id']).strip()
+
+
+def _classify_existing_yaml_entries(
+    *,
+    candidate: dict[str, Any],
+    yaml_entries: list[dict[str, Any]],
+    context: str,
+) -> dict[str, Any] | None:
+    candidate_key = _candidate_yaml_dataset_key(candidate)
+    candidate_value = candidate['yaml_entry']['value']
+    for item in yaml_entries:
+        if _yaml_dataset_key(item) != candidate_key:
+            continue
+        if 'value' not in item:
+            raise HFEvalsError(f'{context}: matching entry is missing value')
+        if _scores_equal(item['value'], candidate_value):
+            return {
+                'status': 'already_present',
+                'existing_value': item['value'],
+                'comment': _already_present_comment(candidate),
+            }
+        return {
+            'status': 'score_conflict',
+            'existing_value': item['value'],
+            'comment': _candidate_comment(candidate),
+        }
+    return None
+
+
+def audit_manifest_for_hf_eval_duplicates(
+    manifest: dict[str, Any],
+    *,
+    api: HfApi | None = None,
+    download_file: Callable[..., str] | None = None,
+    progress: ReviewProgress | None = None,
+) -> dict[str, Any]:
+    """Check candidate YAML entries against main .eval_results and open PRs.
+
+    Only manifest entries for which ``_entry_is_ready`` is true are audited;
+    each keeps its original position in ``manifest['entries']`` as
+    ``entry_index``. For every such entry the eval-results YAML files on the
+    repo's base revision are compared first, then the YAML files changed by
+    each open pull request.
+
+    Args:
+        manifest: Manifest whose ``entries`` are audited. It is not modified.
+        api: Hub client; a fresh ``HfApi()`` is created when omitted.
+        download_file: Optional downloader forwarded to the YAML loader.
+        progress: Progress reporter; a fresh ``ReviewProgress()`` is created
+            when omitted.
+
+    Returns:
+        A dict with ``created_at``, ``candidate_count``, ``finding_count``,
+        ``error_count``, ``findings`` and ``errors``. ``HFEvalsError``
+        failures are recorded in ``errors`` (tagged with a ``stage``) rather
+        than raised.
+    """
+
+    progress = progress or ReviewProgress()
+    api = api or HfApi()
+    entries = [
+        (entry_index, entry)
+        for entry_index, entry in enumerate(manifest.get('entries', []))
+        if _entry_is_ready(entry)
+    ]
+    # Per-call caches so each repo / PR / file is fetched once on success.
+    # Failures are not stored, so a failing lookup is retried for every entry
+    # that needs it.
+    main_tree_cache: dict[str, dict[str, dict[str, Any]]] = {}
+    main_yaml_cache: dict[tuple[str, str], list[dict[str, Any]]] = {}
+    open_pr_cache: dict[str, list[Any]] = {}
+    pr_tree_cache: dict[tuple[str, str], dict[str, dict[str, Any]]] = {}
+    pr_yaml_cache: dict[tuple[str, str, str], list[dict[str, Any]]] = {}
+    findings: list[dict[str, Any]] = []
+    errors: list[dict[str, Any]] = []
+    audit_task = progress.add_task(
+        f'Auditing {len(entries)} ready candidates',
+        total=len(entries),
+    )
+
+    def cached_main_tree(repo_id: str) -> dict[str, dict[str, Any]]:
+        # NOTE(review): the datastore's default revision constant is reused
+        # as the model repo's base revision; the context strings below
+        # hard-code '@main', so it is presumably 'main' -- confirm.
+        if repo_id not in main_tree_cache:
+            main_tree_cache[repo_id] = _repo_eval_tree(
+                api, repo_id, DEFAULT_DATASTORE_REVISION
+            )
+        return main_tree_cache[repo_id]
+
+    def cached_prs(repo_id: str) -> list[Any]:
+        if repo_id not in open_pr_cache:
+            open_pr_cache[repo_id] = _open_pull_requests(api, repo_id)
+        return open_pr_cache[repo_id]
+
+    def cached_pr_tree(
+        repo_id: str, revision: str
+    ) -> dict[str, dict[str, Any]]:
+        key = (repo_id, revision)
+        if key not in pr_tree_cache:
+            pr_tree_cache[key] = _repo_eval_tree(api, repo_id, revision)
+        return pr_tree_cache[key]
+
+    def cached_yaml(
+        repo_id: str,
+        revision: str,
+        path: str,
+    ) -> list[dict[str, Any]]:
+        # Base-revision files and PR files live in separate caches; the base
+        # cache key omits the revision.
+        if revision == DEFAULT_DATASTORE_REVISION:
+            key = (repo_id, path)
+            if key not in main_yaml_cache:
+                main_yaml_cache[key] = _load_eval_yaml_entries(
+                    repo_id=repo_id,
+                    revision=revision,
+                    path=path,
+                    download_file=download_file,
+                )
+            return main_yaml_cache[key]
+        key = (repo_id, revision, path)
+        if key not in pr_yaml_cache:
+            pr_yaml_cache[key] = _load_eval_yaml_entries(
+                repo_id=repo_id,
+                revision=revision,
+                path=path,
+                download_file=download_file,
+            )
+        return pr_yaml_cache[key]
+
+    # Every exit path of the loop body advances audit_task exactly once.
+    for entry_index, entry in entries:
+        repo_id = entry['model_repo']
+        benchmark = entry.get('benchmark')
+        progress.update(
+            audit_task,
+            description=f'Auditing {repo_id} {entry["target_path"]}',
+        )
+        # Stage 1: list the base revision. Without it neither the main check
+        # nor the PR diff below can run, so the entry is abandoned.
+        try:
+            main_tree = cached_main_tree(repo_id)
+        except HFEvalsError as exc:
+            errors.append(
+                {
+                    'entry_index': entry_index,
+                    'model_repo': repo_id,
+                    'benchmark': benchmark,
+                    'target_path': entry['target_path'],
+                    'stage': 'list_main_eval_results',
+                    'error': str(exc),
+                }
+            )
+            progress.update(audit_task, advance=1)
+            continue
+
+        # Stage 2: compare against every YAML file on the base revision.
+        # A file that fails to load is recorded and the next file is tried.
+        for path in _eval_yaml_paths(main_tree):
+            try:
+                yaml_entries = cached_yaml(
+                    repo_id, DEFAULT_DATASTORE_REVISION, path
+                )
+                match = _classify_existing_yaml_entries(
+                    candidate=entry,
+                    yaml_entries=yaml_entries,
+                    context=f'{repo_id}@main:{path}',
+                )
+            except HFEvalsError as exc:
+                errors.append(
+                    {
+                        'entry_index': entry_index,
+                        'model_repo': repo_id,
+                        'benchmark': benchmark,
+                        'target_path': entry['target_path'],
+                        'stage': 'read_main_eval_results',
+                        'path': path,
+                        'error': str(exc),
+                    }
+                )
+                continue
+            if match is None:
+                continue
+            findings.append(
+                {
+                    'type': f'existing_eval_results_{match["status"]}',
+                    'status': match['status'],
+                    'entry_index': entry_index,
+                    'model_repo': repo_id,
+                    'benchmark': benchmark,
+                    'target_path': entry['target_path'],
+                    'candidate_path': entry['target_path'],
+                    'existing_path': path,
+                    'existing_value': match['existing_value'],
+                    'candidate_value': entry['yaml_entry']['value'],
+                    'candidate_source_url': entry['yaml_entry']['source']['url'],
+                    'comment': match['comment'],
+                }
+            )
+
+        # Stage 3: list open PRs. On failure the PR checks are skipped but
+        # findings already recorded for the base revision are kept.
+        try:
+            discussions = cached_prs(repo_id)
+        except HFEvalsError as exc:
+            errors.append(
+                {
+                    'entry_index': entry_index,
+                    'model_repo': repo_id,
+                    'benchmark': benchmark,
+                    'target_path': entry['target_path'],
+                    'stage': 'list_open_prs',
+                    'error': str(exc),
+                }
+            )
+            progress.update(audit_task, advance=1)
+            continue
+
+        for discussion in discussions:
+            revision = _discussion_revision(discussion)
+            if revision is None:
+                errors.append(
+                    {
+                        'entry_index': entry_index,
+                        'model_repo': repo_id,
+                        'benchmark': benchmark,
+                        'target_path': entry['target_path'],
+                        'stage': 'resolve_pr_revision',
+                        'error': f'No PR revision for {_discussion_url(repo_id, discussion)}',
+                    }
+                )
+                continue
+            try:
+                pr_tree = cached_pr_tree(repo_id, revision)
+            except HFEvalsError as exc:
+                errors.append(
+                    {
+                        'entry_index': entry_index,
+                        'model_repo': repo_id,
+                        'benchmark': benchmark,
+                        'target_path': entry['target_path'],
+                        'stage': 'list_open_pr_eval_results',
+                        'pr_url': _discussion_url(repo_id, discussion),
+                        'error': str(exc),
+                    }
+                )
+                continue
+
+            # Only YAML files whose blob id differs from the base revision
+            # (or that are absent there) are read from the PR.
+            changed_paths = []
+            for path in _eval_yaml_paths(pr_tree):
+                main_blob = main_tree.get(path, {}).get('blob_id')
+                pr_blob = pr_tree[path].get('blob_id')
+                if main_blob != pr_blob:
+                    changed_paths.append(path)
+            for path in changed_paths:
+                try:
+                    yaml_entries = cached_yaml(repo_id, revision, path)
+                    match = _classify_existing_yaml_entries(
+                        candidate=entry,
+                        yaml_entries=yaml_entries,
+                        context=f'{repo_id}@{revision}:{path}',
+                    )
+                except HFEvalsError as exc:
+                    errors.append(
+                        {
+                            'entry_index': entry_index,
+                            'model_repo': repo_id,
+                            'benchmark': benchmark,
+                            'target_path': entry['target_path'],
+                            'stage': 'read_open_pr_eval_results',
+                            'pr_url': _discussion_url(repo_id, discussion),
+                            'path': path,
+                            'error': str(exc),
+                        }
+                    )
+                    continue
+                if match is None:
+                    continue
+                findings.append(
+                    {
+                        'type': f'open_pr_eval_results_{match["status"]}',
+                        'status': match['status'],
+                        'entry_index': entry_index,
+                        'model_repo': repo_id,
+                        'benchmark': benchmark,
+                        'target_path': entry['target_path'],
+                        'candidate_path': entry['target_path'],
+                        'pr_url': _discussion_url(repo_id, discussion),
+                        'pr_title': getattr(discussion, 'title', None),
+                        'paths': [path],
+                        'existing_value': match['existing_value'],
+                        'candidate_value': entry['yaml_entry']['value'],
+                        'candidate_source_url': entry['yaml_entry']['source']['url'],
+                        'comment': match['comment'],
+                    }
+                )
+
+        progress.update(audit_task, advance=1)
+
+    return {
+        'created_at': datetime.now(tz=UTC).isoformat(),
+        'candidate_count': len(entries),
+        'finding_count': len(findings),
+        'error_count': len(errors),
+        'findings': findings,
+        'errors': errors,
+    }
+
+
+def _apply_duplicate_audit_to_manifest(
+    manifest: dict[str, Any],
+    duplicate_audit: dict[str, Any],
+) -> None:
+    """Fold duplicate-audit findings and errors into the manifest, in place.
+
+    Findings are grouped per ``entry_index``. The status applied to an entry
+    is the most severe one seen (``score_conflict`` outranks
+    ``already_present``; the first finding wins ties) and every finding for
+    that entry is attached as ``duplicate_audit_findings``. Entries that are
+    not ready are left alone. Audit errors are then attached as
+    ``duplicate_audit_errors`` and flip still-ready entries to
+    ``AUDIT_ERROR_STATUS``.
+
+    Raises:
+        HFEvalsError: if ``manifest['entries']`` is not a list, or the audit
+            references an index that is out of range or not an object.
+    """
+    severity = {'already_present': 1, 'score_conflict': 2}
+
+    def entry_at(index: int, entries: list[Any]) -> dict[str, Any]:
+        # Shared bounds/type validation for both the findings and errors pass.
+        if not 0 <= index < len(entries):
+            raise HFEvalsError(
+                f'Duplicate audit referenced missing manifest entry {index}.'
+            )
+        candidate = entries[index]
+        if not isinstance(candidate, dict):
+            raise HFEvalsError(
+                f'Manifest entry {index} must be an object.'
+            )
+        return candidate
+
+    # Per entry index: the winning status and every finding that mentioned it.
+    worst_status: dict[int, str] = {}
+    collected: dict[int, list[dict[str, Any]]] = {}
+    for finding in duplicate_audit.get('findings', []):
+        index = finding.get('entry_index')
+        status = finding.get('status')
+        if not isinstance(index, int) or status not in severity:
+            continue
+        collected.setdefault(index, []).append(finding)
+        current = worst_status.get(index)
+        if current is None or severity[status] > severity[current]:
+            worst_status[index] = status
+
+    entries = manifest.get('entries', [])
+    if not isinstance(entries, list):
+        raise HFEvalsError('Manifest entries must be a list.')
+    for index, entry_findings in collected.items():
+        entry = entry_at(index, entries)
+        if _entry_is_ready(entry):
+            entry['status'] = worst_status[index]
+            entry['duplicate_audit_findings'] = entry_findings
+
+    errors_by_entry: dict[int, list[dict[str, Any]]] = {}
+    for error in duplicate_audit.get('errors', []):
+        index = error.get('entry_index')
+        if isinstance(index, int):
+            errors_by_entry.setdefault(index, []).append(error)
+
+    for index, audit_errors in errors_by_entry.items():
+        entry = entry_at(index, entries)
+        # Errors are always recorded; only still-ready entries get blocked.
+        entry['duplicate_audit_errors'] = audit_errors
+        if _entry_is_ready(entry):
+            entry['status'] = AUDIT_ERROR_STATUS
+
+
+def _write_manifest(manifest: dict[str, Any], output_path: Path) -> None:
+    """Write ``manifest`` to ``output_path`` as key-sorted, indented JSON.
+
+    Parent directories are created as needed and the file ends with a
+    single trailing newline.
+    """
+    target_dir = output_path.parent
+    target_dir.mkdir(parents=True, exist_ok=True)
+    serialized = json.dumps(manifest, indent=2, sort_keys=True)
+    output_path.write_text(f'{serialized}\n', encoding='utf-8')
+
+
+def _review_from_manifest(
+    *,
+    manifest: dict[str, Any],
+    manifest_output_path: Path,
+    yaml_output_dir: Path,
+    review_output_path: Path,
+    duplicate_audit: dict[str, Any],
+) -> dict[str, Any]:
+    """Apply the audit, persist manifest/YAML previews, and write the review.
+
+    The manifest is mutated with the audit outcome, written to
+    ``manifest_output_path``, rendered to YAML under ``yaml_output_dir``, and
+    summarized into a review dict that is written to ``review_output_path``
+    and returned. ``can_open_prs`` is true only when at least one entry is
+    still ready and no audit error lacks an integer ``entry_index``.
+    """
+    _apply_duplicate_audit_to_manifest(manifest, duplicate_audit)
+    _write_manifest(manifest, manifest_output_path)
+    yaml_result = _write_yaml_from_manifest(manifest, yaml_output_dir)
+
+    entries = manifest['entries']
+
+    def with_status(wanted: str) -> list[Any]:
+        return [entry for entry in entries if entry.get('status') == wanted]
+
+    ready_entries = [entry for entry in entries if _entry_is_ready(entry)]
+    audit_blocked_entries = with_status(AUDIT_ERROR_STATUS)
+    # Errors without an integer entry_index cannot be pinned to one entry,
+    # so they block PR submission for the whole run.
+    global_audit_errors = [
+        error
+        for error in duplicate_audit.get('errors', [])
+        if not isinstance(error.get('entry_index'), int)
+    ]
+    missing_hf_models = with_status('missing_hf_model')
+
+    review = {
+        'created_at': datetime.now(tz=UTC).isoformat(),
+        'manifest_path': manifest_output_path.as_posix(),
+        'yaml_output_dir': yaml_output_dir.as_posix(),
+        'yaml_count': yaml_result['count'],
+        'yaml_files': yaml_result['written'],
+        'can_open_prs': bool(ready_entries) and not global_audit_errors,
+        'audit_blocked_entries': audit_blocked_entries,
+        'global_audit_errors': global_audit_errors,
+        'missing_hf_models': missing_hf_models,
+        'manifest': manifest,
+        'duplicate_audit': duplicate_audit,
+    }
+    review_output_path.parent.mkdir(parents=True, exist_ok=True)
+    payload = json.dumps(review, indent=2, sort_keys=True)
+    review_output_path.write_text(f'{payload}\n', encoding='utf-8')
+    return review
+
+
+def review_index_for_hf_evals(
+    *,
+    index_jsonl: Path,
+    datastore: str,
+    benchmarks: list[str],
+    manifest_output_path: Path,
+    yaml_output_dir: Path,
+    review_output_path: Path,
+    api: HfApi | None = None,
+    check_hf: bool = True,
+    download_file: Callable[..., str] | None = None,
+) -> dict[str, Any]:
+    """Build a manifest from an index JSONL, audit it, and write the review.
+
+    The manifest is built in memory (``output_path=None``); it is written to
+    ``manifest_output_path`` only after the duplicate audit has been applied.
+    Returns the review dict produced by ``_review_from_manifest``.
+    """
+    client = api or HfApi()
+    built_manifest = build_index_manifest(
+        index_jsonl=index_jsonl,
+        datastore=datastore,
+        benchmarks=benchmarks,
+        output_path=None,
+        api=client,
+        check_hf=check_hf,
+        download_file=download_file,
+    )
+    audit = audit_manifest_for_hf_eval_duplicates(
+        built_manifest,
+        api=client,
+        download_file=download_file,
+    )
+    return _review_from_manifest(
+        manifest=built_manifest,
+        manifest_output_path=manifest_output_path,
+        yaml_output_dir=yaml_output_dir,
+        review_output_path=review_output_path,
+        duplicate_audit=audit,
+    )
+
+
+def review_collection_for_hf_evals(
+    *,
+    collection_name: str,
+    datastore: str,
+    manifest_output_path: Path,
+    yaml_output_dir: Path,
+    review_output_path: Path,
+    api: HfApi | None = None,
+    check_hf: bool = True,
+    download_file: Callable[..., str] | None = None,
+    progress: ReviewProgress | None = None,
+    force: bool = False,
+) -> dict[str, Any]:
+    """Review one datastore collection, reusing cached outputs when allowed.
+
+    Unless ``force`` is set, a matching cached review is returned directly,
+    and otherwise a matching cached manifest is reused and only re-audited.
+    When neither cache applies the manifest is rebuilt from the datastore.
+    Returns the review dict.
+    """
+    progress = progress or ReviewProgress()
+    api = api or HfApi()
+    collection_name = _safe_collection_name(collection_name)
+
+    def report_cache_hit(label: str, finished_label: str) -> None:
+        # A one-step task so the cache hit is visible in the progress output.
+        task = progress.add_task(label, total=1)
+        progress.update(task, advance=1, description=finished_label)
+
+    cache_identity = {
+        'collection_name': collection_name,
+        'datastore': datastore,
+        'check_hf': check_hf,
+    }
+
+    manifest = None
+    if not force:
+        cached_review = _load_cached_collection_review(
+            review_output_path=review_output_path,
+            yaml_output_dir=yaml_output_dir,
+            **cache_identity,
+        )
+        if cached_review is not None:
+            report_cache_hit('Using cached review', 'Used cached review')
+            return cached_review
+
+        manifest = _load_cached_collection_manifest(
+            manifest_output_path=manifest_output_path,
+            **cache_identity,
+        )
+        if manifest is not None:
+            report_cache_hit(
+                'Using cached manifest',
+                'Used cached manifest; starting audit',
+            )
+
+    if manifest is None:
+        manifest = build_collection_manifest(
+            collection_name=collection_name,
+            datastore=datastore,
+            output_path=manifest_output_path,
+            api=api,
+            check_hf=check_hf,
+            download_file=download_file,
+            progress=progress,
+        )
+
+    audit = audit_manifest_for_hf_eval_duplicates(
+        manifest,
+        api=api,
+        download_file=download_file,
+        progress=progress,
+    )
+    return _review_from_manifest(
+        manifest=manifest,
+        manifest_output_path=manifest_output_path,
+        yaml_output_dir=yaml_output_dir,
+        review_output_path=review_output_path,
+        duplicate_audit=audit,
+    )
+
+
+def _validate_manifest(manifest: dict[str, Any]) -> dict[str, Any]:
+    """Return ``manifest`` unchanged after checking it is usable.
+
+    Raises:
+        HFEvalsError: if the version is not ``MANIFEST_VERSION``, if
+            ``entries`` is not a list, or if ``errors`` is non-empty.
+    """
+    version = manifest.get('version')
+    if version != MANIFEST_VERSION:
+        raise HFEvalsError(
+            f'Unsupported manifest version: {version!r}'
+        )
+    if not isinstance(manifest.get('entries'), list):
+        raise HFEvalsError('Manifest entries must be a list.')
+    # A missing, None, or empty 'errors' value all count as "no errors".
+    if manifest.get('errors'):
+        raise HFEvalsError('Manifest contains errors; rebuild it first.')
+    return manifest
+
+
+def load_manifest(path: Path) -> dict[str, Any]:
+    """Read a manifest JSON file from ``path`` and validate it.
+
+    Raises:
+        HFEvalsError: if the top-level JSON value is not an object, or if
+            ``_validate_manifest`` rejects it.
+    """
+    raw_text = path.read_text(encoding='utf-8')
+    parsed = json.loads(raw_text)
+    if isinstance(parsed, dict):
+        return _validate_manifest(parsed)
+    raise HFEvalsError('Manifest must be a JSON object.')
+
+
+def _collection_cache_matches(
+    manifest: dict[str, Any],
+    *,
+    collection_name: str,
+    datastore: str,
+    check_hf: bool,
+) -> bool:
+    """Tell whether a cached manifest was built for the requested inputs.
+
+    The manifest must carry the current ``MANIFEST_VERSION``, the same
+    collection and datastore input, the very same ``hf_checks`` boolean, and
+    the ``online_collection_index_jsonl`` source URL mode.
+    """
+    expected_fields = {
+        'version': MANIFEST_VERSION,
+        'collection': collection_name,
+        'datastore_input': datastore,
+        'source_url_mode': 'online_collection_index_jsonl',
+    }
+    # Identity comparison: 1/0 or a missing value must not pass for a bool.
+    if manifest.get('hf_checks') is not check_hf:
+        return False
+    return all(
+        manifest.get(field) == wanted
+        for field, wanted in expected_fields.items()
+    )
+
+
+def _manifest_has_duplicate_audit_state(manifest: dict[str, Any]) -> bool:
+    """Tell whether any manifest entry already carries duplicate-audit output.
+
+    An entry counts when its status is one of the audit statuses or when it
+    has a ``duplicate_audit_findings`` / ``duplicate_audit_errors`` key.
+    Non-dict entries are ignored.
+    """
+    audit_statuses = {'already_present', 'score_conflict', AUDIT_ERROR_STATUS}
+    audit_keys = ('duplicate_audit_findings', 'duplicate_audit_errors')
+    return any(
+        entry.get('status') in audit_statuses
+        or any(key in entry for key in audit_keys)
+        for entry in manifest.get('entries', [])
+        if isinstance(entry, dict)
+    )
+
+
+def _load_cached_collection_manifest(
+    *,
+    manifest_output_path: Path,
+    collection_name: str,
+    datastore: str,
+    check_hf: bool,
+) -> dict[str, Any] | None:
+    """Load a reusable pre-audit manifest from ``manifest_output_path``.
+
+    Returns:
+        The cached manifest when it exists and was built for the same
+        collection, datastore input and HF-check setting; ``None`` when the
+        file is absent or was built for different inputs.
+
+    Raises:
+        HFEvalsError: if the file cannot be read, decoded or parsed, if
+            ``load_manifest`` rejects it, or if the matching manifest already
+            contains duplicate-audit state (it cannot be audited again).
+    """
+    if not manifest_output_path.exists():
+        return None
+    try:
+        manifest = load_manifest(manifest_output_path)
+    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as exc:
+        # UnicodeDecodeError: a cache file that is not valid UTF-8 must be
+        # reported like any other unreadable cache, not as a raw traceback.
+        raise HFEvalsError(
+            f'Cached manifest is not readable: {manifest_output_path}'
+        ) from exc
+    if not _collection_cache_matches(
+        manifest,
+        collection_name=collection_name,
+        datastore=datastore,
+        check_hf=check_hf,
+    ):
+        return None
+    if _manifest_has_duplicate_audit_state(manifest):
+        raise HFEvalsError(
+            f'Cached manifest is post-audit but {manifest_output_path.parent / "review.json"} '
+            'is missing or does not match. Move the cached output directory aside '
+            'before rebuilding.'
+        )
+    return manifest
+
+
+def _load_cached_collection_review(
+    *,
+    review_output_path: Path,
+    yaml_output_dir: Path,
+    collection_name: str,
+    datastore: str,
+    check_hf: bool,
+) -> dict[str, Any] | None:
+    """Load a cached review and refresh its YAML previews.
+
+    On a cache hit the YAML previews are regenerated from the review's
+    embedded manifest, the YAML-related fields are updated, and the review
+    file is rewritten before the review is returned.
+
+    Returns:
+        The cached review, or ``None`` when the file is absent or its
+        embedded manifest was built for different inputs.
+
+    Raises:
+        HFEvalsError: if the file cannot be read, decoded or parsed, is not
+            a JSON object, lacks its manifest, or lacks a required field.
+    """
+    if not review_output_path.exists():
+        return None
+    try:
+        review = json.loads(review_output_path.read_text(encoding='utf-8'))
+    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as exc:
+        # UnicodeDecodeError: a cache file that is not valid UTF-8 must be
+        # reported like any other unreadable cache, not as a raw traceback.
+        raise HFEvalsError(
+            f'Cached review is not readable: {review_output_path}'
+        ) from exc
+    if not isinstance(review, dict):
+        raise HFEvalsError(f'Cached review must be an object: {review_output_path}')
+    manifest = review.get('manifest')
+    if not isinstance(manifest, dict):
+        raise HFEvalsError(
+            f'Cached review is missing its manifest: {review_output_path}'
+        )
+    if not _collection_cache_matches(
+        manifest,
+        collection_name=collection_name,
+        datastore=datastore,
+        check_hf=check_hf,
+    ):
+        return None
+    for field in (
+        'duplicate_audit',
+        'can_open_prs',
+        'audit_blocked_entries',
+        'global_audit_errors',
+        'missing_hf_models',
+    ):
+        if field not in review:
+            raise HFEvalsError(
+                f'Cached review is missing {field}: {review_output_path}'
+            )
+    # Regenerate previews so the YAML directory matches the cached review.
+    yaml_result = _write_yaml_from_manifest(manifest, yaml_output_dir)
+    review['yaml_output_dir'] = yaml_output_dir.as_posix()
+    review['yaml_count'] = yaml_result['count']
+    review['yaml_files'] = yaml_result['written']
+    review_output_path.write_text(
+        json.dumps(review, indent=2, sort_keys=True) + '\n',
+        encoding='utf-8',
+    )
+    return review
+
+
+def _write_yaml_from_manifest(
+    manifest: dict[str, Any],
+    output_dir: Path,
+) -> dict[str, Any]:
+    """Write one YAML preview file per (model repo, target path) pair.
+
+    Only entries accepted by ``_entry_has_yaml_preview`` contribute. Repos
+    are grouped case-insensitively, and the first-seen spelling is used for
+    the output directory. Returns ``{'written': [...posix paths], 'count': n}``.
+    """
+    validated = _validate_manifest(manifest)
+
+    # key: (lowercased repo, target path) -> (repo as first seen, path, entries)
+    buckets: dict[tuple[str, str], tuple[str, str, list[dict[str, Any]]]] = {}
+    for entry in validated['entries']:
+        if _entry_has_yaml_preview(entry):
+            repo = entry['model_repo']
+            target = entry['target_path']
+            bucket = buckets.setdefault((repo.lower(), target), (repo, target, []))
+            bucket[2].append(entry['yaml_entry'])
+
+    written: list[str] = []
+    for repo, target, yaml_entries in sorted(buckets.values()):
+        destination = output_dir / repo / target
+        destination.parent.mkdir(parents=True, exist_ok=True)
+        destination.write_text(dump_yaml_entries(yaml_entries), encoding='utf-8')
+        written.append(destination.as_posix())
+
+    return {'written': written, 'count': len(written)}
+
+
+def write_yaml_from_manifest(manifest_path: Path, output_dir: Path) -> dict[str, Any]:
+    """Load the manifest at ``manifest_path`` and write its YAML previews."""
+    loaded = load_manifest(manifest_path)
+    return _write_yaml_from_manifest(loaded, output_dir)
+
+
+def create_prs_from_manifest(
+    manifest_path: Path,
+    *,
+    limit: int | None,
+    yes_i_reviewed: bool,
+    commit_message: str,
+    api: HfApi | None = None,
+    commit_description: str = DEFAULT_PR_COMMIT_DESCRIPTION,
+    stream: bool = False,
+) -> dict[str, Any]:
+    """Open one pull request per model repo for the manifest's ready entries.
+
+    Args:
+        manifest_path: Manifest JSON file to load.
+        limit: Maximum number of repos to process, in case-insensitive
+            name order; ``None`` processes all of them.
+        yes_i_reviewed: Must be true, otherwise the call is refused.
+        commit_message: Commit message; must not be blank.
+        api: Hub client; a fresh ``HfApi()`` is used when omitted.
+        commit_description: Commit description; must not be blank.
+        stream: When true, print one progress line per repo to stdout.
+
+    Returns:
+        A dict with ``created`` (one record per commit made) and ``count``.
+        ``failed`` and ``skipped`` are always empty here because the first
+        failure raises instead of being recorded.
+
+    Raises:
+        HFEvalsError: if the confirmation flag is missing, the message or
+            description is blank, the manifest is invalid, or any
+            ``create_commit`` call fails.
+    """
+    if not yes_i_reviewed:
+        raise HFEvalsError('Refusing to create PRs without --yes-i-reviewed.')
+    if not commit_message.strip():
+        raise HFEvalsError('Commit message must not be empty.')
+    if not commit_description.strip():
+        raise HFEvalsError('Commit description must not be empty.')
+    manifest = load_manifest(manifest_path)
+    api = api or HfApi()
+
+    # lowercased repo -> (repo as first seen, {target path -> yaml entries})
+    grouped: dict[str, tuple[str, dict[str, list[dict[str, Any]]]]] = {}
+    for entry in manifest['entries']:
+        if not _entry_is_ready(entry):
+            continue
+        model_repo = entry['model_repo']
+        repo_key = model_repo.lower()
+        if repo_key not in grouped:
+            grouped[repo_key] = (model_repo, {})
+        by_path = grouped[repo_key][1]
+        by_path.setdefault(entry['target_path'], []).append(entry['yaml_entry'])
+
+    created: list[dict[str, Any]] = []
+    # NOTE: this total ignores `limit`, so streamed "[i/total]" lines can
+    # show a larger total than the number of repos actually processed.
+    total_repos = len(grouped)
+    for repo_index, (model_repo, by_path) in enumerate(
+        sorted(grouped.values(), key=lambda item: item[0].lower())
+    ):
+        if limit is not None and repo_index >= limit:
+            break
+        if stream:
+            print(
+                f'[{repo_index + 1}/{total_repos}] preparing {model_repo}',
+                flush=True,
+            )
+
+        # One file-add operation per target path, all in a single commit.
+        operations: list[CommitOperationAdd] = []
+        for target_path, new_entries in sorted(by_path.items()):
+            operations.append(
+                CommitOperationAdd(
+                    path_in_repo=target_path,
+                    path_or_fileobj=dump_yaml_entries(new_entries).encode('utf-8'),
+                )
+            )
+
+        # Defensive: every grouped repo has at least one path, so this branch
+        # is not expected to trigger.
+        if not operations:
+            if stream:
+                print(
+                    f'[{repo_index + 1}/{total_repos}] no changes {model_repo}',
+                    flush=True,
+                )
+            continue
+
+        try:
+            # NOTE(review): `revision` reuses the *datastore* default revision
+            # although the target here is a model repo - confirm intended.
+            info = api.create_commit(
+                repo_id=model_repo,
+                repo_type='model',
+                operations=operations,
+                commit_message=commit_message,
+                commit_description=commit_description,
+                revision=DEFAULT_DATASTORE_REVISION,
+                create_pr=True,
+            )
+        except Exception as exc:  # noqa: BLE001
+            # NOTE(review): raising here discards `created`, so PRs already
+            # opened for earlier repos are not reported to the caller.
+            raise HFEvalsError(f'Unable to create PR for {model_repo}') from exc
+
+        # getattr with a default: tolerate result objects lacking these fields.
+        pr_url = getattr(info, 'pr_url', None)
+        commit_url = getattr(info, 'commit_url', None)
+        created.append(
+            {
+                'model_repo': model_repo,
+                'commit': str(info),
+                'commit_url': commit_url,
+                'pr_url': pr_url,
+                'updated_existing_pr': False,
+            }
+        )
+        if stream:
+            print(
+                f'[{repo_index + 1}/{total_repos}] '
+                f'created {model_repo}: '
+                f'{pr_url or commit_url or info}',
+                flush=True,
+            )
+
+    return {
+        'created': created,
+        'count': len(created),
+        'failed': [],
+        'failed_count': 0,
+        'skipped': [],
+        'skipped_count': 0,
+    }
+
+
+# Exact text the operator must type at the approval prompt before PRs are
+# submitted; any other answer cancels the submission.
+APPROVAL_PHRASE = 'OPEN PRS'
+
+
+def _panel(
+    renderable: object,
+    *,
+    title: str | None = None,
+    border_style: str = 'yellow',
+) -> Panel:
+    """Wrap ``renderable`` in a non-expanding panel (yellow border by default)."""
+    panel_options = {
+        'title': title,
+        'border_style': border_style,
+        'expand': False,
+    }
+    return Panel(renderable, **panel_options)
+
+
+def _render_interrupted_prompt(console: Console) -> None:
+    """Print a blank line and a cancellation notice after an aborted prompt."""
+    console.line()
+    cancelled_notice = _panel('PR submission cancelled.', border_style='yellow')
+    console.print(cancelled_notice)
+
+
+def _default_paths(collection_name: str) -> tuple[Path, Path, Path]:
+    """Return the default (manifest, YAML dir, review) paths for a collection.
+
+    All three live under ``outputs/community_evals_converter_<stem>``, where
+    the stem is the sanitized collection name with spaces turned into
+    underscores.
+    """
+    safe_stem = _safe_collection_name(collection_name).replace(' ', '_')
+    output_root = Path('outputs') / ('community_evals_converter_' + safe_stem)
+    manifest_path = output_root / 'manifest.json'
+    yaml_dir = output_root / 'yamls'
+    review_path = output_root / 'review.json'
+    return manifest_path, yaml_dir, review_path
+
+
+def _render_summary(console: Console, review: dict) -> None:
+    """Print the headline counts table and the manifest / YAML locations."""
+    manifest = review['manifest']
+    audit = review['duplicate_audit']
+    missing_models = review['missing_hf_models']
+    entries = manifest['entries']
+
+    def status_total(wanted: str) -> int:
+        return sum(1 for entry in entries if entry.get('status') == wanted)
+
+    # (label, count) pairs in display order.
+    counts = [
+        ('records converted', len(entries)),
+        ('ready records', sum(1 for entry in entries if _entry_is_ready(entry))),
+        ('already present', status_total('already_present')),
+        ('score conflicts', status_total('score_conflict')),
+        ('audit-blocked records', len(review.get('audit_blocked_entries', []))),
+        ('preview YAML files', review['yaml_count']),
+        ('skipped records', len(manifest['skipped'])),
+        ('missing HF models', len(missing_models)),
+        ('existing score findings', audit['finding_count']),
+        ('audit errors', audit['error_count']),
+    ]
+
+    summary = Table(
+        title='Community Evals Converter',
+        show_header=True,
+        header_style='bold',
+    )
+    summary.add_column('Item')
+    summary.add_column('Count', justify='right')
+    for label, count in counts:
+        summary.add_row(label, str(count))
+    console.print(summary)
+
+    console.print(f'Manifest: {review["manifest_path"]}')
+    console.print(f'YAML dir:  {review["yaml_output_dir"]}')
+
+
+def _render_review_details(console: Console, review: dict) -> None:
+    """Print a 'Needs Attention' table of problems found during the review.
+
+    Rows are added in this order: audit errors, score conflicts, a single
+    summary row for all already-present findings, missing HF models, and
+    skipped records. At most ``max_rows`` rows are shown; when more items
+    exist a caption points at the review JSON. Nothing is printed when
+    there are no rows.
+    """
+    max_rows = 20
+    rows: list[tuple[str, str, str, str, str | Text]] = []
+
+    def datastore_record_url(path: object) -> object:
+        """Turn a ``flat/...`` datastore path into a blob URL when possible."""
+        raw_path = str(path or '')
+        if not raw_path.startswith('flat/'):
+            return path
+        manifest = review['manifest']
+        datastore_repo = manifest.get('datastore_repo')
+        datastore_revision = manifest.get('datastore_revision')
+        # Without both repo and revision no URL can be built; show the path.
+        if not isinstance(datastore_repo, str) or not isinstance(
+            datastore_revision,
+            str,
+        ):
+            return path
+        return _datastore_blob_url(
+            raw_path,
+            datastore_repo=datastore_repo,
+            datastore_revision=datastore_revision,
+        )
+
+    def where_cell(value: object) -> str | Text:
+        """Render http(s) values as link-styled text, anything else as str."""
+        text = str(value or '')
+        if text.startswith(('http://', 'https://')):
+            return Text(text, style=f'link {text}')
+        return text
+
+    def add(
+        issue: str,
+        model: object,
+        details: object,
+        action: str,
+        where: object,
+    ) -> None:
+        """Append one table row unless the ``max_rows`` cap is reached."""
+        if len(rows) >= max_rows:
+            return
+        rows.append(
+            (
+                str(issue or ''),
+                str(model or ''),
+                str(details or ''),
+                action,
+                where_cell(where),
+            )
+        )
+
+    for error in review['duplicate_audit']['errors']:
+        entry_index = error.get('entry_index')
+        # Errors tied to an entry block only that entry; others block the run.
+        action = 'block entry' if isinstance(entry_index, int) else 'block all'
+        add(
+            'audit_error',
+            error.get('model_repo'),
+            error.get('error'),
+            action,
+            error.get('pr_url') or error.get('path') or error.get('stage'),
+        )
+
+    findings = review['duplicate_audit']['findings']
+    score_conflicts = [
+        item for item in findings if item.get('status') == 'score_conflict'
+    ]
+    already_present = [
+        item for item in findings if item.get('status') == 'already_present'
+    ]
+    for item in score_conflicts:
+        where = item.get('existing_path') or item.get('pr_url') or ''
+        paths = item.get('paths')
+        if paths:
+            details = (
+                f'{item.get("existing_value")} -> {item.get("candidate_value")}; '
+                f'existing score differs from EvalEval; {", ".join(paths)}'
+            )
+        else:
+            details = (
+                f'{item.get("existing_value")} -> {item.get("candidate_value")}; '
+                'existing score differs from EvalEval.'
+            )
+        add(
+            'score_conflict',
+            item.get('model_repo'),
+            details,
+            'exclude',
+            where,
+        )
+
+    # Already-present findings are collapsed into one summary row.
+    if already_present:
+        add(
+            'already_present',
+            f'{len(already_present)} models',
+            'Same-score result already exists; excluded from PRs.',
+            'exclude',
+            '.eval_results',
+        )
+
+    for entry in review['missing_hf_models']:
+        add(
+            'missing_hf_model',
+            entry.get('model_repo'),
+            entry.get('hf_check_error'),
+            'exclude',
+            entry.get('yaml_entry', {}).get('source', {}).get('url')
+            or datastore_record_url(entry.get('eee_record_path')),
+        )
+
+    for item in review['manifest']['skipped']:
+        line = item.get('collection_index_line') or item.get('index_line') or ''
+        add(
+            'skipped',
+            item.get('model_id'),
+            item.get('reason'),
+            f'line {line}' if line else 'skip',
+            datastore_record_url(
+                item.get('eee_record_path') or item.get('object_path')
+            ),
+        )
+
+    if not rows:
+        return
+
+    # Number of rows there would be without the cap (already-present is 1).
+    total = (
+        len(review['duplicate_audit']['errors'])
+        + len(score_conflicts)
+        + (1 if already_present else 0)
+        + len(review['missing_hf_models'])
+        + len(review['manifest']['skipped'])
+    )
+    table = Table(
+        title='Needs Attention',
+        show_header=True,
+        header_style='bold cyan',
+        show_lines=True,
+    )
+    table.add_column('Issue', no_wrap=True)
+    table.add_column('Model', overflow='fold', ratio=2, max_width=30)
+    table.add_column('Details', overflow='fold', ratio=4)
+    table.add_column('Action', no_wrap=True)
+    table.add_column('Where', no_wrap=True, overflow='ellipsis', ratio=4)
+    for row in rows:
+        table.add_row(*row)
+    if total > len(rows):
+        table.caption = (
+            f'Showing {len(rows)} of {total} attention items. '
+            'Full data is in review JSON.'
+        )
+    console.print(table)
+
+
+def _render_not_ready(console: Console, review: dict) -> None:
+    """Explain in a panel why no PRs were submitted.
+
+    Global audit errors take precedence over per-entry audit errors; a
+    generic message is used when neither is present.
+    """
+    blocked_total = len(review.get('audit_blocked_entries', []))
+    global_error_total = len(review.get('global_audit_errors', []))
+
+    if global_error_total:
+        explanation = (
+            f'{global_error_total} global audit error(s) blocked PR '
+            'submission. Local YAML previews were still written when possible.'
+        )
+    elif blocked_total:
+        explanation = (
+            f'{blocked_total} candidate(s) had audit errors, and no '
+            'clean ready entries remain. Local YAML previews were still '
+            'written for inspection.'
+        )
+    else:
+        explanation = 'No clean ready entries are available. PRs were not submitted.'
+
+    notice = _panel(explanation, title='PRs Not Submitted', border_style='yellow')
+    console.print(notice)
+
+
+def _render_ready(console: Console, review: dict) -> None:
+    """Announce in a green panel that clean entries are ready to submit.
+
+    When some candidates were blocked by audit errors, a second paragraph
+    says how many and where their YAML previews are.
+    """
+    blocked_total = len(review.get('audit_blocked_entries', []))
+    paragraphs = [
+        'Clean ready entries are available. Existing same-score duplicates '
+        'and score conflicts have been excluded from submission.'
+    ]
+    if blocked_total:
+        paragraphs.append(
+            f'{blocked_total} candidate(s) had audit errors and '
+            'will not be submitted. Their local YAML previews remain under '
+            f'{review["yaml_output_dir"]}.'
+        )
+    ready_notice = _panel(
+        '\n\n'.join(paragraphs),
+        title='Ready',
+        border_style='green',
+    )
+    console.print(ready_notice)
+
+
+def _prompt_commit_message(console: Console) -> str | None:
+    """Ask for a commit message; return it stripped, or ``None`` to abort.
+
+    ``None`` is returned when the prompt is interrupted (EOF / Ctrl-C) or
+    when the answer is blank; a panel explains which case occurred.
+    """
+    try:
+        answer = Prompt.ask('Commit message').strip()
+    except (EOFError, KeyboardInterrupt):
+        _render_interrupted_prompt(console)
+        return None
+    if answer:
+        return answer
+    rejection = _panel('Commit message is required.', title='PRs Not Submitted')
+    console.print(rejection)
+    return None
+
+
+def _submit_prs(
+    console: Console,
+    manifest_output: Path,
+    *,
+    commit_message: str,
+) -> int:
+    """Create PRs for the manifest and return a process exit code.
+
+    Returns 0 after printing the result as JSON, or 1 after printing a red
+    failure panel when PR creation raises ``HFEvalsError``.
+    """
+    try:
+        outcome = create_prs_from_manifest(
+            manifest_path=manifest_output,
+            limit=None,
+            yes_i_reviewed=True,
+            commit_message=commit_message,
+            stream=True,
+        )
+    except HFEvalsError as failure:
+        failure_panel = _panel(
+            str(failure),
+            title='PR Creation Failed',
+            border_style='red',
+        )
+        console.print(failure_panel)
+        return 1
+    else:
+        console.print(json.dumps(outcome, indent=2, sort_keys=True))
+        return 0
+
+
+def _maybe_submit_prs(
+    console: Console,
+    review: dict,
+    manifest_output: Path,
+) -> int:
+    """Run the interactive submission flow and return a process exit code.
+
+    PRs are submitted only when the review allows it, the operator approves,
+    and a commit message is supplied. Every declined or blocked path
+    returns 0; otherwise the code from ``_submit_prs`` is returned.
+    """
+    if review['can_open_prs']:
+        _render_ready(console, review)
+        approved = _approve_pr_submission(console, review)
+        # Only ask for a commit message once submission has been approved.
+        message = _prompt_commit_message(console) if approved else None
+        if message is None:
+            return 0
+        return _submit_prs(console, manifest_output, commit_message=message)
+
+    _render_not_ready(console, review)
+    return 0
+
+
+def _ready_entries_by_repo(review: dict) -> dict[str, list[str]]:
+    """Map each model repo to the sorted target paths that would be submitted.
+
+    The selection mirrors ``create_prs_from_manifest`` so the approval table
+    shows what will actually be sent: readiness is decided by
+    ``_entry_is_ready`` and repos are grouped case-insensitively, keeping the
+    first-seen spelling. Repos are returned in case-insensitive name order.
+    """
+    # lowercased repo -> (repo as first seen, set of target paths)
+    grouped: dict[str, tuple[str, set[str]]] = {}
+    for entry in review['manifest']['entries']:
+        if not _entry_is_ready(entry):
+            continue
+        repo = str(entry['model_repo'])
+        _first_seen, paths = grouped.setdefault(repo.lower(), (repo, set()))
+        paths.add(str(entry['target_path']))
+    return {
+        repo: sorted(paths)
+        for repo, paths in sorted(
+            grouped.values(), key=lambda item: item[0].lower()
+        )
+    }
+
+
+def _approve_pr_submission(console: Console, review: dict) -> bool:
+    """Show what would be submitted and ask the operator to confirm.
+
+    Returns True only when the answer, stripped of surrounding whitespace,
+    equals ``APPROVAL_PHRASE``. Returns False when nothing is ready, when
+    the prompt is interrupted, or on any other answer.
+    """
+    ready_by_repo = _ready_entries_by_repo(review)
+    if not ready_by_repo:
+        console.print(_panel('No ready entries to submit.', border_style='yellow'))
+        return False
+
+    approval_table = Table(
+        title='PR Submission Approval',
+        show_header=True,
+        header_style='bold',
+        show_lines=True,
+    )
+    for heading in ('Model repo', 'Files'):
+        approval_table.add_column(heading)
+    for repo, paths in ready_by_repo.items():
+        approval_table.add_row(repo, '\n'.join(paths))
+    console.print(approval_table)
+
+    instructions = (
+        f'Type {APPROVAL_PHRASE!r} to submit these PRs. '
+        'Anything else cancels.'
+    )
+    console.print(
+        _panel(instructions, title='Approval Required', border_style='yellow')
+    )
+
+    try:
+        typed = Prompt.ask('Approval').strip()
+    except (EOFError, KeyboardInterrupt):
+        _render_interrupted_prompt(console)
+        return False
+    if typed == APPROVAL_PHRASE:
+        return True
+    console.print(_panel('PR submission cancelled.', border_style='yellow'))
+    return False
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Build the CLI parser: a collection name plus --datastore and --force."""
+    datastore_help = (
+        'Online HF dataset locator in the form <repo> or '
+        '<repo>@<revision>. Defaults to evaleval/EEE_datastore and '
+        'resolves the current main commit.'
+    )
+    cli = argparse.ArgumentParser(
+        description='Review an EEE datastore collection for HF Community Evals.',
+    )
+    cli.add_argument(
+        'collection_name',
+        help='Collection file stem under flat/indexes/by_collection/<name>.jsonl.',
+    )
+    cli.add_argument(
+        '--datastore',
+        default=DEFAULT_DATASTORE_REPO,
+        help=datastore_help,
+    )
+    cli.add_argument(
+        '--force',
+        action='store_true',
+        help='Ignore cached review/manifest outputs and rebuild from datastore.',
+    )
+    return cli
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: review a collection, print results, offer to open PRs.
+
+    Returns 1 when the review fails with ``HFEvalsError``; otherwise the
+    exit code of the interactive submission flow.
+    """
+    args = build_parser().parse_args(argv)
+    console = Console()
+
+    try:
+        collection = _safe_collection_name(args.collection_name)
+        manifest_path, yaml_dir, review_path = _default_paths(collection)
+        console.rule(
+            '[bold cyan]EEE -> HF Community Evals[/] '
+            '[dim](built by Harsha Nelaturu, June 2026)[/]'
+        )
+        # Fixed-width columns keep the progress display from jumping around.
+        progress_columns = (
+            SpinnerColumn(),
+            TextColumn(
+                '[bold blue]{task.description}',
+                table_column=Column(width=48, no_wrap=True, overflow='ellipsis'),
+            ),
+            BarColumn(bar_width=28),
+            TextColumn(
+                '{task.completed:>4.0f}/{task.total:<4.0f}',
+                table_column=Column(width=10, no_wrap=True),
+            ),
+            TimeElapsedColumn(),
+        )
+        with Progress(
+            *progress_columns,
+            console=console,
+            expand=False,
+        ) as live_progress:
+            review = review_collection_for_hf_evals(
+                collection_name=collection,
+                datastore=args.datastore,
+                manifest_output_path=manifest_path,
+                yaml_output_dir=yaml_dir,
+                review_output_path=review_path,
+                progress=RichReviewProgress(live_progress),
+                force=args.force,
+            )
+    except HFEvalsError as failure:
+        console.print(
+            _panel(str(failure), title='Review Failed', border_style='red')
+        )
+        return 1
+
+    _render_summary(console, review)
+    _render_review_details(console, review)
+    console.print(f'Review JSON: {review_path.as_posix()}')
+
+    return _maybe_submit_prs(console, review, manifest_path)
+
+
+# Script entry point: run the CLI and use its return value as the exit code.
+if __name__ == '__main__':
+    raise SystemExit(main(sys.argv[1:]))
diff --git a/uv.lock b/uv.lock
index c04019bd3..01f0357dc 100644
--- a/uv.lock
+++ b/uv.lock
@@ -855,6 +855,7 @@ dependencies = [
     { name = "numpy" },
     { name = "pandas" },
     { name = "pydantic" },
+    { name = "pyyaml" },
     { name = "requests" },
     { name = "rich" },
     { name = "seaborn" },
@@ -895,6 +896,7 @@ requires-dist = [
     { name = "numpy", specifier = ">=2.4.1" },
     { name = "pandas", specifier = ">=2.3.3" },
     { name = "pydantic", specifier = ">=2.12.5,<3.0.0" },
+    { name = "pyyaml", specifier = ">=6.0.3" },
     { name = "requests", specifier = ">=2.32.5,<3.0.0" },
     { name = "rich", specifier = ">=14.0.0,<15.0.0" },
     { name = "seaborn", specifier = ">=0.13.2" },

From fa2331341820a8d129573df2bef0ea8a7094e1d8 Mon Sep 17 00:00:00 2001
From: nelaturuharsha <nelaturu.harsha@gmail.com>
Date: Sat, 13 Jun 2026 22:08:16 +0200
Subject: [PATCH 2/3] fixes to make tests pass + add hf community evals to docs

---
 docs/getting-started/index.md                 |   1 +
 docs/hf-community-evals/index.md              | 132 ++++++++++++++++++
 tools/hf-community-evals/README.md            |  10 +-
 .../community_evals_converter.py              |   4 -
 4 files changed, 138 insertions(+), 9 deletions(-)
 create mode 100644 docs/hf-community-evals/index.md

diff --git a/docs/getting-started/index.md b/docs/getting-started/index.md
index 136c14409..cab549c23 100644
--- a/docs/getting-started/index.md
+++ b/docs/getting-started/index.md
@@ -31,3 +31,4 @@ uv run python -m every_eval_ever --help
 - See [Data Structure](../data-structure/)
 - See [Eval Converters](../eval-converters/)
 - See [Contributing](../contributing/)
+- See [HF Community Evals](../hf-community-evals/)
diff --git a/docs/hf-community-evals/index.md b/docs/hf-community-evals/index.md
new file mode 100644
index 000000000..6ea2b1477
--- /dev/null
+++ b/docs/hf-community-evals/index.md
@@ -0,0 +1,132 @@
+---
+layout: default
+title: HF Community Evals
+nav_order: 6
+---
+
+# EEE -> HF Community Evals
+
+Built and maintained by Harsha Nelaturu · EvalEval Coalition · June 2026.
+
+Use `tools/hf-community-evals/community_evals_converter.py` to review one EEE datastore collection, generate
+local HF Community Evals YAML previews, audit existing scores/open PRs, and
+optionally open PRs after explicit approval.
+
+## Quick Start
+
+Use `uv run` for all commands.
+
+```bash
+uv run tools/hf-community-evals/community_evals_converter.py MMLU-Pro \
+  --datastore evaleval/EEE_datastore@main
+```
+
+Results are cached per collection. To ignore the cache and force a fresh rebuild, run:
+
+```bash
+uv run tools/hf-community-evals/community_evals_converter.py MMLU-Pro \
+  --datastore evaleval/EEE_datastore@main \
+  --force
+```
+
+The positional argument is a collection stem. It must resolve exactly to:
+
+```text
+https://huggingface.co/datasets/evaleval/EEE_datastore/flat/indexes/by_collection/<collection>.jsonl
+```
+
+## Outputs
+
+For `MMLU-Pro`, outputs are written under:
+
+```text
+outputs/community_evals_converter_MMLU-Pro/
+```
+
+Important output files:
+
+- `manifest.json`: converted candidate records plus skipped/error metadata.
+- `review.json`: full review result, duplicate audit findings, audit errors,
+  and PR readiness.
+- `yamls/<owner>/<model>/.eval_results/<benchmark>.yaml`: local YAML previews.
+
+`outputs/` is ignored by git. Use these files for inspection, not as merge
+inputs.
+
+## Review Behavior
+
+The tool:
+
+- downloads the collection JSONL and referenced aggregate objects from the HF
+  datastore;
+- validates object hashes and optional sizes;
+- scans each aggregate record for supported HF benchmark datasets;
+- writes YAML entries using the datastore object HF URL as `source.url`;
+- keeps flat datastore provenance, including instance-level references when
+  present;
+- checks model repo existence on Hugging Face;
+- audits every existing `.eval_results/*.yaml` file on model `main`;
+- audits changed `.eval_results/*.yaml` files in open PR refs;
+- compares by dataset/task content, not YAML filename.
+
+Supported benchmarks in this workflow are:
+
+- `mmlu_pro`
+- `gpqa`
+- `hle`
+- `gsm8k`
+
+## Resume And Force
+
+Default reruns reuse exact-match local outputs:
+
+- matching completed `review.json`: skips collection downloads, model checks,
+  and duplicate audit;
+- matching pre-audit `manifest.json`: skips collection downloads and model
+  checks, then resumes at duplicate audit.
+
+The cache must match collection name, datastore input, and HF-check mode.
+Invalid exact-match cache files are hard errors. Use `--force` when you want to
+ignore the cache and rebuild from the datastore.
+
+## TUI
+The final report has:
+
+- `Community Evals Converter`: summary counts.
+- `Needs Attention`: capped triage table for blockers and exclusions.
+
+`Needs Attention` uses:
+
+- `Issue`: `audit_error`, `score_conflict`, `already_present`,
+  `missing_hf_model`, or `skipped`.
+- `Model`: model repo or aggregate model id.
+- `Details`: reason or score comparison.
+- `Action`: `exclude`, `block entry`, `block all`, or source line.
+- `Where`: terminal hyperlink to the HF model PR/file or HF datastore blob URL.
+
+Repeated same-score `already_present` findings are summarized as one count row.
+Full details remain in `review.json`.
+
+## PR Submission
+
+The tool only opens PRs after both prompts succeed:
+
+1. Type exactly:
+
+   ```text
+   OPEN PRS
+   ```
+
+2. Enter a non-empty commit message.
+
+Only `status = ready` entries are submitted.
+
+Excluded statuses:
+
+- `already_present`: same score already exists.
+- `score_conflict`: different score already exists.
+- `missing_hf_model`: model repo does not resolve on HF.
+- `audit_error`: candidate-scoped audit failure.
+
+Candidate-scoped audit errors block only that candidate. Audit errors without a
+manifest entry block all PR submission.
diff --git a/tools/hf-community-evals/README.md b/tools/hf-community-evals/README.md
index 5730a0dad..7e1d8c85c 100644
--- a/tools/hf-community-evals/README.md
+++ b/tools/hf-community-evals/README.md
@@ -1,8 +1,8 @@
 # EEE -> HF Community Evals
 
-Built by Harsha Nelaturu, June 2026.
+Built and maintained by Harsha Nelaturu · EvalEval Coalition · June 2026.
 
-Use `tools/community_evals_converter.py` to review one EEE datastore collection, generate
+Use `tools/hf-community-evals/community_evals_converter.py` to review one EEE datastore collection, generate
 local HF Community Evals YAML previews, audit existing scores/open PRs, and
 optionally open PRs after explicit approval.
 
@@ -11,14 +11,14 @@ optionally open PRs after explicit approval.
 Use `uv run` for all commands.
 
 ```bash
-uv run tools/community_evals_converter.py MMLU-Pro \
+uv run tools/hf-community-evals/community_evals_converter.py MMLU-Pro \
   --datastore evaleval/EEE_datastore@main
 ```
 
 This will cache the results for this particular collection and if you would like to force a fresh rebuild:
 
 ```bash
-uv run tools/community_evals_converter.py MMLU-Pro \
+uv run tools/hf-community-evals/community_evals_converter.py MMLU-Pro \
   --datastore evaleval/EEE_datastore@main \
   --force
 ```
@@ -123,4 +123,4 @@ Excluded statuses:
 - `audit_error`: candidate-scoped audit failure.
 
 Candidate-scoped audit errors block only that candidate. Audit errors without a
-manifest entry block all PR submission.
\ No newline at end of file
+manifest entry block all PR submission.
diff --git a/tools/hf-community-evals/community_evals_converter.py b/tools/hf-community-evals/community_evals_converter.py
index c37562069..273bac3e6 100644
--- a/tools/hf-community-evals/community_evals_converter.py
+++ b/tools/hf-community-evals/community_evals_converter.py
@@ -2902,10 +2902,6 @@ def main(argv: list[str] | None = None) -> int:
     try:
         collection_name = _safe_collection_name(args.collection_name)
         manifest_output, yaml_dir, review_output = _default_paths(collection_name)
-        console.rule(
-            '[bold cyan]EEE -> HF Community Evals[/] '
-            '[dim](built by Harsha Nelaturu, June 2026)[/]'
-        )
         with Progress(
             SpinnerColumn(),
             TextColumn(

From c2dde1ab2ed6078b12b1ed68155d377ad7c3c9e2 Mon Sep 17 00:00:00 2001
From: nelaturuharsha <nelaturu.harsha@gmail.com>
Date: Sat, 13 Jun 2026 22:14:15 +0200
Subject: [PATCH 3/3] fixing the test

---
 tests/test_community_evals_converter.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/tests/test_community_evals_converter.py b/tests/test_community_evals_converter.py
index 299f60ebe..602833dec 100644
--- a/tests/test_community_evals_converter.py
+++ b/tests/test_community_evals_converter.py
@@ -1,8 +1,10 @@
 from __future__ import annotations
 
 import hashlib
+import importlib.util
 import io
 import json
+import sys
 from pathlib import Path
 
 import pytest
@@ -11,7 +13,28 @@
 from rich.progress import Progress
 
 from every_eval_ever import cli
-from tools import community_evals_converter
+
+
+def _load_community_evals_converter():
+    source = (
+        Path(__file__).resolve().parents[1]  # repo root: parent of tests/
+        / 'tools'
+        / 'hf-community-evals'  # hyphen rules out a plain import statement
+        / 'community_evals_converter.py'
+    )
+    spec = importlib.util.spec_from_file_location(
+        'community_evals_converter_under_test',  # name used in sys.modules
+        source,
+    )
+    if spec is None or spec.loader is None:
+        raise ImportError(f'Unable to load {source}')
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[spec.name] = module  # registered before the body executes
+    spec.loader.exec_module(module)  # runs the script's top-level code
+    return module
+
+
+community_evals_converter = _load_community_evals_converter()
 
 FIXTURE_DIR = Path(__file__).parent / 'data' / 'community_evals_converter'