Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
152e4cb
fix: Replace huggingface-cli login with HF_TOKEN env var in catalog_p…
yoavkatz May 14, 2026
9649da8
fix: Use keyword arguments in DatasetBuilder.as_dataset() call
yoavkatz May 14, 2026
0f500a0
fix: Migrate arena-hard card to lmarena-ai/arena-hard-viewer and fix …
yoavkatz May 18, 2026
9f9cf09
fix: Force float32 in Perplexity metric to prevent NaN with float16 m…
yoavkatz May 18, 2026
777fd44
fix: Remove run_post_process and verification_mode params for dataset…
yoavkatz May 18, 2026
44f06a6
fix: Replace assertWarns with catch_warnings to avoid transformers im…
yoavkatz May 18, 2026
ee59098
fix: Cap tokenizer model_max_length in BertScore to prevent OverflowE…
yoavkatz May 18, 2026
62c56e9
fix: Migrate remaining arena-hard cards to lmarena-ai/arena-hard-viewer
yoavkatz May 18, 2026
01f7c4f
fix: Log each preparation file at CRITICAL level for CI visibility
yoavkatz May 18, 2026
77defee
fix: Enable verbose logging in BertScore to debug CI hang
yoavkatz May 18, 2026
7e352dd
fix: Disable test_card for numeric_nlg and coqa to unblock CI
yoavkatz May 18, 2026
c85d02d
debug: Enable progress bars in catalog_preparation CI to diagnose hang
yoavkatz May 19, 2026
4534314
fix: Disable test_card for ffqa_filtered to unblock CI
yoavkatz May 19, 2026
5b8da91
fix: Increase catalog_preparation partitions to 10 and re-enable prog…
yoavkatz May 19, 2026
e2177a5
debug: Remove pip cache from catalog_preparation to test if it affect…
yoavkatz May 19, 2026
a363516
debug: Limit catalog_preparation to 4 parallel jobs to test HF rate l…
yoavkatz May 20, 2026
b592cd1
debug: Add PYTHONUNBUFFERED=1 to see output before hang
yoavkatz May 20, 2026
fcbfb43
revert: Remove max-parallel limit since it didn't help
yoavkatz May 20, 2026
6616199
fix: Update mtrag card URL after repo restructure
yoavkatz May 20, 2026
995ca6d
fix: Restore pip cache in catalog_preparation workflow
yoavkatz May 20, 2026
8e098b7
fix: Update mtrag catalog JSON with new URL
yoavkatz May 20, 2026
2cd9877
fix: Re-enable test_card for numeric_nlg, coqa, and ffqa_filtered
yoavkatz May 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions .github/workflows/catalog_preparation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,19 @@ jobs:
timeout-minutes: 30
env:
OS: ubuntu-latest
PYTHONUNBUFFERED: "1"
UNITXT_DEFAULT_VERBOSITY: error
DATASETS_VERBOSITY: error
HF_HUB_VERBOSITY: error
HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
HF_HUB_DOWNLOAD_TIMEOUT: 60
HF_HUB_ETAG_TIMEOUT: 60
TQDM_DISABLE: "True"
HF_TOKEN: ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}

strategy:
matrix:
modulo: [0,1,2,3,4,5,6,7]
modulo: [0,1,2,3,4,5,6,7,8,9]

steps:
- uses: actions/checkout@v5
Expand All @@ -44,16 +46,11 @@ jobs:
with:
ssh-private-key: ${{ secrets.LLMEVALKIT_SSH_KEY }}

- name: Hugging Face Login
run: |
for i in {1..5}; do
huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
done
- name: Run Tests
run: |
modulo="${{ matrix.modulo }}"
echo "modulo=${modulo}" >> $GITHUB_STEP_SUMMARY
echo "sed -i 's/^num_par = 1 /num_par = 8 /' tests/catalog/test_preparation.py" > sedit.sh
echo "sed -i 's/^num_par = 1 /num_par = 10 /' tests/catalog/test_preparation.py" > sedit.sh
echo "sed -i 's/^modulo = 0/modulo = ${modulo}/' tests/catalog/test_preparation.py" >> sedit.sh
sh sedit.sh
python -m unittest tests.catalog.test_preparation
Expand Down
24 changes: 17 additions & 7 deletions prepare/cards/arena_hard/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Cast,
Copy,
FilterByCondition,
RemoveFields,
Rename,
SelectFields,
Set,
Expand All @@ -18,18 +19,22 @@
arena_hard_hf_space_processing_steps = SequentialOperator(
steps=[
# region Question file
Rename(field_to_field={"cluster": "group"}, apply_to_streams=["questions"]),
Rename(
field_to_field={"uid": "question_id", "cluster": "category"},
apply_to_streams=["questions"],
),
Copy(
field_to_field={"turns/0/content": "model_input"},
field_to_field={"prompt": "model_input"},
apply_to_streams=["questions"],
),
# endregion
# region Answers file processing
Rename(
field_to_field={"uid": "question_id", "model": "model_id"},
apply_to_streams=["model_answer"],
),
Copy(
field_to_field={
"choices/0/turns/0/content": "model_output",
"choices/0/turns/0/token_len": "model_output_token_len",
},
field_to_field={"messages/1/content/answer": "model_output"},
apply_to_streams=["model_answer"],
),
Apply(
Expand All @@ -52,9 +57,14 @@
apply_to_streams=["judgment"],
),
Rename(
field_to_field={"model": "model_2", "judge": "judge_model_id"},
field_to_field={
"uid": "question_id",
"model": "model_2",
"judge": "judge_model_id",
},
apply_to_streams=["judgment"],
),
RemoveFields(fields=["category"], apply_to_streams=["judgment"]),
Set(fields={"model_1": "gpt-4-0314"}, apply_to_streams=["judgment"]),
Cast(
field="judge_input_model_1_ordered_first",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@

card = TaskCard(
loader=LoadFromHFSpace(
space_name="lmsys/arena-hard-browser",
revision="03b91ca", # May 26, 2024
space_name="lmarena-ai/arena-hard-viewer",
revision="56c7614", # Apr 23, 2025 - first commit with v0.1 data in new space
data_files={
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@

card = TaskCard(
loader=LoadFromHFSpace(
space_name="lmsys/arena-hard-browser",
revision="03b91ca", # May 26, 2024
space_name="lmarena-ai/arena-hard-viewer",
revision="56c7614", # Apr 23, 2025 - first commit with v0.1 data in new space
data_files={
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@

card = TaskCard(
loader=LoadFromHFSpace(
space_name="lmsys/arena-hard-browser",
revision="03b91ca", # May 26, 2024
space_name="lmarena-ai/arena-hard-viewer",
revision="56c7614", # Apr 23, 2025 - first commit with v0.1 data in new space
data_files={
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
Expand Down
2 changes: 1 addition & 1 deletion prepare/cards/mtrag.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
card = TaskCard(
loader=LoadJsonFile(
files={
"test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl"
"test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/mtrag-human/generation_tasks/reference+RAG.jsonl"
},
lines=True,
data_classification_policy=["public"],
Expand Down
4 changes: 1 addition & 3 deletions src/unitxt/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,9 +221,7 @@ def _source_to_dataset(
if streaming:
return ds_builder.as_streaming_dataset(split=split)

return ds_builder.as_dataset(
split=split, run_post_process=False, verification_mode="no_checks"
)
return ds_builder.as_dataset(split=split)

except DatasetGenerationError as e:
raise e.__cause__
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
"__type__": "task_card",
"loader": {
"__type__": "load_from_hf_space",
"space_name": "lmsys/arena-hard-browser",
"revision": "03b91ca",
"space_name": "lmarena-ai/arena-hard-viewer",
"revision": "56c7614",
"data_files": {
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
"__type__": "task_card",
"loader": {
"__type__": "load_from_hf_space",
"space_name": "lmsys/arena-hard-browser",
"revision": "03b91ca",
"space_name": "lmarena-ai/arena-hard-viewer",
"revision": "56c7614",
"data_files": {
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
"__type__": "task_card",
"loader": {
"__type__": "load_from_hf_space",
"space_name": "lmsys/arena-hard-browser",
"revision": "03b91ca",
"space_name": "lmarena-ai/arena-hard-viewer",
"revision": "56c7614",
"data_files": {
"questions": "data/arena-hard-v0.1/question.jsonl",
"model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",
Expand Down
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/rag/mtrag.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"loader": {
"__type__": "load_json_file",
"files": {
"test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl"
"test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/mtrag-human/generation_tasks/reference+RAG.jsonl"
},
"lines": true,
"data_classification_policy": [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
{
"__type__": "rename",
"field_to_field": {
"cluster": "group"
"uid": "question_id",
"cluster": "category"
},
"apply_to_streams": [
"questions"
Expand All @@ -13,17 +14,26 @@
{
"__type__": "copy",
"field_to_field": {
"turns/0/content": "model_input"
"prompt": "model_input"
},
"apply_to_streams": [
"questions"
]
},
{
"__type__": "rename",
"field_to_field": {
"uid": "question_id",
"model": "model_id"
},
"apply_to_streams": [
"model_answer"
]
},
{
"__type__": "copy",
"field_to_field": {
"choices/0/turns/0/content": "model_output",
"choices/0/turns/0/token_len": "model_output_token_len"
"messages/1/content/answer": "model_output"
},
"apply_to_streams": [
"model_answer"
Expand Down Expand Up @@ -57,13 +67,23 @@
{
"__type__": "rename",
"field_to_field": {
"uid": "question_id",
"model": "model_2",
"judge": "judge_model_id"
},
"apply_to_streams": [
"judgment"
]
},
{
"__type__": "remove_fields",
"fields": [
"category"
],
"apply_to_streams": [
"judgment"
]
},
{
"__type__": "set",
"fields": {
Expand Down
10 changes: 1 addition & 9 deletions src/unitxt/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,21 +126,13 @@ def as_streaming_dataset(
def as_dataset(
self,
split: Optional[datasets.Split] = None,
run_post_process=True,
verification_mode: Optional[Union[datasets.VerificationMode, str]] = None,
in_memory=False,
) -> Union[datasets.Dataset, datasets.DatasetDict]:
"""Return a Dataset for the specified split.

Args:
split (`datasets.Split`):
Which subset of the data to return.
run_post_process (`bool`, defaults to `True`):
Whether to run post-processing dataset transforms and/or add
indexes.
verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
Verification mode determining the checks to run on the
downloaded/processed dataset information (checksums/size/splits/...).
in_memory (`bool`, defaults to `False`):
Whether to copy the data in-memory.

Expand All @@ -164,6 +156,6 @@ def as_dataset(
"""
return (
super()
.as_dataset(split, run_post_process, verification_mode, in_memory)
.as_dataset(split=split, in_memory=in_memory)
.with_transform(loads_batch)
)
52 changes: 38 additions & 14 deletions src/unitxt/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1929,10 +1929,12 @@ def compute(
pred_df_win_rate, ref_df_win_rate, on="model", suffixes=("_pred", "_ref")
)
pearson_corr, _ = pearsonr(
merged_df["win_rate_pred"], merged_df["win_rate_ref"]
merged_df["win_rate_pred"].astype(float),
merged_df["win_rate_ref"].astype(float),
)
spearman_corr, _ = spearmanr(
merged_df["win_rate_pred"], merged_df["win_rate_ref"]
merged_df["win_rate_pred"].astype(float),
merged_df["win_rate_ref"].astype(float),
)

return {"pearson_corr": pearson_corr, "spearman_corr": spearman_corr}
Expand Down Expand Up @@ -4427,32 +4429,50 @@ def prepare(self):
super().prepare()
self.bertscore = None

def _get_scorer(self):
from bert_score import BERTScorer

if self.bertscore is None:
self.bertscore = BERTScorer(
model_type=self.model_name,
num_layers=self.model_layer,
batch_size=self.batch_size,
device=self.get_device(),
)
# Some models (e.g. DeBERTa) report an absurdly large
# model_max_length that overflows the tokenizers Rust backend.
# Cap it to the model's actual max_position_embeddings.
tokenizer = self.bertscore._tokenizer
if tokenizer.model_max_length > 1_000_000:
from transformers import AutoConfig

config = AutoConfig.from_pretrained(self.model_name)
tokenizer.model_max_length = getattr(
config, "max_position_embeddings", 512
)
return self.bertscore

def map_stream(
self, evaluation_inputs_stream: Generator[EvaluationInput[str], None, None]
):
from evaluate import load

if self.bertscore is None:
self.bertscore = load("bertscore", experiment_id=str(uuid.uuid4()))
scorer = self._get_scorer()

predictions = []
references = []
for prediction, reference, _ in evaluation_inputs_stream:
predictions.append(prediction)
references.append(reference)

results = self.bertscore.compute(
predictions=predictions,
references=references,
(precisions, recalls, f1s) = scorer.score(
cands=predictions,
refs=references,
batch_size=self.batch_size,
device=self.get_device(),
model_type=self.model_name,
num_layers=self.model_layer,
verbose=True,
)

intermediates = []
for precision, recall, f1 in zip(
results["precision"], results["recall"], results["f1"]
precisions.tolist(), recalls.tolist(), f1s.tolist()
):
intermediates.append(
{
Expand Down Expand Up @@ -5103,7 +5123,11 @@ def __init__(self, model_name, single_token_mode):
model_path = self.model_name
if settings.hf_offline_models_path is not None:
model_path = os.path.join(settings.hf_offline_models_path, model_path)
self.model = self.model_class().from_pretrained(model_path).to(self.device)
self.model = (
self.model_class()
.from_pretrained(model_path, dtype=torch.float32)
.to(self.device)
)
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
if self.tokenizer.pad_token_id is None:
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
Expand Down
2 changes: 1 addition & 1 deletion tests/catalog/test_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def test_preparations(self):
for file in all_preparation_files:
passed = True
error = None
logger.info(
logger.critical(
"\n_____________________________________________\n"
f" Testing preparation file:\n {file}."
"\n_____________________________________________\n"
Expand Down
Loading
Loading