IBM · yoavkatz · May 20, 2026 · May 14, 2026 · May 14, 2026 · May 18, 2026
diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml
@@ -17,17 +17,19 @@ jobs:
     timeout-minutes: 30
     env:
       OS: ubuntu-latest
+      PYTHONUNBUFFERED: "1"
       UNITXT_DEFAULT_VERBOSITY: error
       DATASETS_VERBOSITY: error
       HF_HUB_VERBOSITY: error
       HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
       HF_HUB_DOWNLOAD_TIMEOUT: 60
       HF_HUB_ETAG_TIMEOUT: 60
       TQDM_DISABLE: "True"
+      HF_TOKEN: ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}
 
     strategy:
       matrix:
-        modulo: [0,1,2,3,4,5,6,7]
+        modulo: [0,1,2,3,4,5,6,7,8,9]
 
     steps:
     - uses: actions/checkout@v5
@@ -44,16 +46,11 @@ jobs:
       with:
         ssh-private-key: ${{ secrets.LLMEVALKIT_SSH_KEY }}
 
-    - name:  Hugging Face Login
-      run: |
-        for i in {1..5}; do
-          huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
-        done
     - name: Run Tests
       run: |
         modulo="${{ matrix.modulo }}"
         echo "modulo=${modulo}" >> $GITHUB_STEP_SUMMARY
-        echo "sed -i 's/^num_par = 1 /num_par = 8 /' tests/catalog/test_preparation.py" > sedit.sh
+        echo "sed -i 's/^num_par = 1 /num_par = 10 /' tests/catalog/test_preparation.py" > sedit.sh
         echo "sed -i 's/^modulo = 0/modulo = ${modulo}/' tests/catalog/test_preparation.py" >> sedit.sh
         sh sedit.sh
         python -m unittest tests.catalog.test_preparation

diff --git a/prepare/cards/arena_hard/common.py b/prepare/cards/arena_hard/common.py
@@ -5,6 +5,7 @@
     Cast,
     Copy,
     FilterByCondition,
+    RemoveFields,
     Rename,
     SelectFields,
     Set,
@@ -18,18 +19,22 @@
 arena_hard_hf_space_processing_steps = SequentialOperator(
     steps=[
         # region Question file
-        Rename(field_to_field={"cluster": "group"}, apply_to_streams=["questions"]),
+        Rename(
+            field_to_field={"uid": "question_id", "cluster": "category"},
+            apply_to_streams=["questions"],
+        ),
         Copy(
-            field_to_field={"turns/0/content": "model_input"},
+            field_to_field={"prompt": "model_input"},
             apply_to_streams=["questions"],
         ),
         # endregion
         # region Answers file processing
+        Rename(
+            field_to_field={"uid": "question_id", "model": "model_id"},
+            apply_to_streams=["model_answer"],
+        ),
         Copy(
-            field_to_field={
-                "choices/0/turns/0/content": "model_output",
-                "choices/0/turns/0/token_len": "model_output_token_len",
-            },
+            field_to_field={"messages/1/content/answer": "model_output"},
             apply_to_streams=["model_answer"],
         ),
         Apply(
@@ -52,9 +57,14 @@
             apply_to_streams=["judgment"],
         ),
         Rename(
-            field_to_field={"model": "model_2", "judge": "judge_model_id"},
+            field_to_field={
+                "uid": "question_id",
+                "model": "model_2",
+                "judge": "judge_model_id",
+            },
             apply_to_streams=["judgment"],
         ),
+        RemoveFields(fields=["category"], apply_to_streams=["judgment"]),
         Set(fields={"model_1": "gpt-4-0314"}, apply_to_streams=["judgment"]),
         Cast(
             field="judge_input_model_1_ordered_first",

diff --git a/...cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py b/...cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py
@@ -15,8 +15,8 @@
 
 card = TaskCard(
     loader=LoadFromHFSpace(
-        space_name="lmsys/arena-hard-browser",
-        revision="03b91ca",  # May 26, 2024
+        space_name="lmarena-ai/arena-hard-viewer",
+        revision="56c7614",  # Apr 23, 2025 - first commit with v0.1 data in new space
         data_files={
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",

diff --git a/...rd/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py b/...rd/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py
@@ -16,8 +16,8 @@
 
 card = TaskCard(
     loader=LoadFromHFSpace(
-        space_name="lmsys/arena-hard-browser",
-        revision="03b91ca",  # May 26, 2024
+        space_name="lmarena-ai/arena-hard-viewer",
+        revision="56c7614",  # Apr 23, 2025 - first commit with v0.1 data in new space
         data_files={
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",

diff --git a/.../arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py b/.../arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py
@@ -13,8 +13,8 @@
 
 card = TaskCard(
     loader=LoadFromHFSpace(
-        space_name="lmsys/arena-hard-browser",
-        revision="03b91ca",  # May 26, 2024
+        space_name="lmarena-ai/arena-hard-viewer",
+        revision="56c7614",  # Apr 23, 2025 - first commit with v0.1 data in new space
         data_files={
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",

diff --git a/prepare/cards/mtrag.py b/prepare/cards/mtrag.py
@@ -20,7 +20,7 @@
 card = TaskCard(
     loader=LoadJsonFile(
         files={
-            "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl"
+            "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/mtrag-human/generation_tasks/reference+RAG.jsonl"
         },
         lines=True,
         data_classification_policy=["public"],

diff --git a/src/unitxt/api.py b/src/unitxt/api.py
@@ -221,9 +221,7 @@ def _source_to_dataset(
         if streaming:
             return ds_builder.as_streaming_dataset(split=split)
 
-        return ds_builder.as_dataset(
-            split=split, run_post_process=False, verification_mode="no_checks"
-        )
+        return ds_builder.as_dataset(split=split)
 
     except DatasetGenerationError as e:
         raise e.__cause__

diff --git a/...ds/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json b/...ds/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json
@@ -2,8 +2,8 @@
     "__type__": "task_card",
     "loader": {
         "__type__": "load_from_hf_space",
-        "space_name": "lmsys/arena-hard-browser",
-        "revision": "03b91ca",
+        "space_name": "lmarena-ai/arena-hard-viewer",
+        "revision": "56c7614",
         "data_files": {
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",

diff --git a/.../response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json b/.../response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json
@@ -2,8 +2,8 @@
     "__type__": "task_card",
     "loader": {
         "__type__": "load_from_hf_space",
-        "space_name": "lmsys/arena-hard-browser",
-        "revision": "03b91ca",
+        "space_name": "lmarena-ai/arena-hard-viewer",
+        "revision": "56c7614",
         "data_files": {
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",

diff --git a/...ena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json b/...ena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json
@@ -2,8 +2,8 @@
     "__type__": "task_card",
     "loader": {
         "__type__": "load_from_hf_space",
-        "space_name": "lmsys/arena-hard-browser",
-        "revision": "03b91ca",
+        "space_name": "lmarena-ai/arena-hard-viewer",
+        "revision": "56c7614",
         "data_files": {
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",

diff --git a/src/unitxt/catalog/cards/rag/mtrag.json b/src/unitxt/catalog/cards/rag/mtrag.json
@@ -3,7 +3,7 @@
     "loader": {
         "__type__": "load_json_file",
         "files": {
-            "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl"
+            "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/mtrag-human/generation_tasks/reference+RAG.jsonl"
         },
         "lines": true,
         "data_classification_policy": [

diff --git a/src/unitxt/catalog/operators/arena_hard_hf_space_processing_steps.json b/src/unitxt/catalog/operators/arena_hard_hf_space_processing_steps.json
@@ -4,7 +4,8 @@
         {
             "__type__": "rename",
             "field_to_field": {
-                "cluster": "group"
+                "uid": "question_id",
+                "cluster": "category"
             },
             "apply_to_streams": [
                 "questions"
@@ -13,17 +14,26 @@
         {
             "__type__": "copy",
             "field_to_field": {
-                "turns/0/content": "model_input"
+                "prompt": "model_input"
             },
             "apply_to_streams": [
                 "questions"
             ]
         },
+        {
+            "__type__": "rename",
+            "field_to_field": {
+                "uid": "question_id",
+                "model": "model_id"
+            },
+            "apply_to_streams": [
+                "model_answer"
+            ]
+        },
         {
             "__type__": "copy",
             "field_to_field": {
-                "choices/0/turns/0/content": "model_output",
-                "choices/0/turns/0/token_len": "model_output_token_len"
+                "messages/1/content/answer": "model_output"
             },
             "apply_to_streams": [
                 "model_answer"
@@ -57,13 +67,23 @@
         {
             "__type__": "rename",
             "field_to_field": {
+                "uid": "question_id",
                 "model": "model_2",
                 "judge": "judge_model_id"
             },
             "apply_to_streams": [
                 "judgment"
             ]
         },
+        {
+            "__type__": "remove_fields",
+            "fields": [
+                "category"
+            ],
+            "apply_to_streams": [
+                "judgment"
+            ]
+        },
         {
             "__type__": "set",
             "fields": {

diff --git a/src/unitxt/dataset.py b/src/unitxt/dataset.py
@@ -126,21 +126,13 @@ def as_streaming_dataset(
     def as_dataset(
         self,
         split: Optional[datasets.Split] = None,
-        run_post_process=True,
-        verification_mode: Optional[Union[datasets.VerificationMode, str]] = None,
         in_memory=False,
     ) -> Union[datasets.Dataset, datasets.DatasetDict]:
         """Return a Dataset for the specified split.
 
         Args:
             split (`datasets.Split`):
                 Which subset of the data to return.
-            run_post_process (`bool`, defaults to `True`):
-                Whether to run post-processing dataset transforms and/or add
-                indexes.
-            verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
-                Verification mode determining the checks to run on the
-                downloaded/processed dataset information (checksums/size/splits/...).
             in_memory (`bool`, defaults to `False`):
                 Whether to copy the data in-memory.
 
@@ -164,6 +156,6 @@ def as_dataset(
         """
         return (
             super()
-            .as_dataset(split, run_post_process, verification_mode, in_memory)
+            .as_dataset(split=split, in_memory=in_memory)
             .with_transform(loads_batch)
         )
diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py
@@ -1929,10 +1929,12 @@ def compute(
             pred_df_win_rate, ref_df_win_rate, on="model", suffixes=("_pred", "_ref")
         )
         pearson_corr, _ = pearsonr(
-            merged_df["win_rate_pred"], merged_df["win_rate_ref"]
+            merged_df["win_rate_pred"].astype(float),
+            merged_df["win_rate_ref"].astype(float),
         )
         spearman_corr, _ = spearmanr(
-            merged_df["win_rate_pred"], merged_df["win_rate_ref"]
+            merged_df["win_rate_pred"].astype(float),
+            merged_df["win_rate_ref"].astype(float),
         )
 
         return {"pearson_corr": pearson_corr, "spearman_corr": spearman_corr}
@@ -4427,32 +4429,50 @@ def prepare(self):
         super().prepare()
         self.bertscore = None
 
+    def _get_scorer(self):
+        """Return the cached BERTScorer, building it on the first call."""
+        from bert_score import BERTScorer
+
+        if self.bertscore is not None:
+            return self.bertscore
+        scorer = self.bertscore = BERTScorer(
+            model_type=self.model_name,
+            num_layers=self.model_layer,
+            batch_size=self.batch_size,
+            device=self.get_device(),
+        )
+        # A tokenizer advertising a huge sentinel model_max_length (seen with
+        # DeBERTa) overflows the Rust tokenizers backend, so clamp it to the
+        # config's max_position_embeddings (512 when the config lacks it).
+        if scorer._tokenizer.model_max_length > 1_000_000:
+            from transformers import AutoConfig
+
+            model_config = AutoConfig.from_pretrained(self.model_name)
+            limit = getattr(model_config, "max_position_embeddings", 512)
+            scorer._tokenizer.model_max_length = limit
+        return scorer
+
     def map_stream(
         self, evaluation_inputs_stream: Generator[EvaluationInput[str], None, None]
     ):
-        from evaluate import load
-
-        if self.bertscore is None:
-            self.bertscore = load("bertscore", experiment_id=str(uuid.uuid4()))
+        scorer = self._get_scorer()
 
         predictions = []
         references = []
         for prediction, reference, _ in evaluation_inputs_stream:
             predictions.append(prediction)
             references.append(reference)
 
-        results = self.bertscore.compute(
-            predictions=predictions,
-            references=references,
+        (precisions, recalls, f1s) = scorer.score(
+            cands=predictions,
+            refs=references,
             batch_size=self.batch_size,
-            device=self.get_device(),
-            model_type=self.model_name,
-            num_layers=self.model_layer,
+            verbose=True,
         )
 
         intermediates = []
         for precision, recall, f1 in zip(
-            results["precision"], results["recall"], results["f1"]
+            precisions.tolist(), recalls.tolist(), f1s.tolist()
         ):
             intermediates.append(
                 {
@@ -5103,7 +5123,11 @@ def __init__(self, model_name, single_token_mode):
             model_path = self.model_name
             if settings.hf_offline_models_path is not None:
                 model_path = os.path.join(settings.hf_offline_models_path, model_path)
-            self.model = self.model_class().from_pretrained(model_path).to(self.device)
+            self.model = (
+                self.model_class()
+                .from_pretrained(model_path, dtype=torch.float32)
+                .to(self.device)
+            )
             self.tokenizer = AutoTokenizer.from_pretrained(model_path)
             if self.tokenizer.pad_token_id is None:
                 self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

diff --git a/tests/catalog/test_preparation.py b/tests/catalog/test_preparation.py
@@ -51,7 +51,7 @@ def test_preparations(self):
         for file in all_preparation_files:
             passed = True
             error = None
-            logger.info(
+            logger.critical(
                 "\n_____________________________________________\n"
                 f"  Testing preparation file:\n  {file}."
                 "\n_____________________________________________\n"