
Commit 9995e4a

feat: add streaming ds (#778)
Signed-off-by: HuiyingLi <[email protected]>
1 parent a198c9d commit 9995e4a

5 files changed: +445 additions, -2 deletions

nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py

Lines changed: 1 addition & 1 deletion
@@ -199,7 +199,7 @@ def __init__(
         self.tokenizer = tokenizer
         if getattr(self.tokenizer, "pad_token", None) is None:
             if hasattr(self.tokenizer, "eos_token"):
-                self.tokenizer.pad_token = self.tokenizer
+                self.tokenizer.pad_token = self.tokenizer.eos_token
             else:
                 logger.warning("Setting tokenizer pad_token to ' '. tokenizer does not have `eos_token`.")
                 self.tokenizer.pad_token = " "
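
The fix above makes the fallback assign the tokenizer's eos_token string as the padding token instead of accidentally assigning the tokenizer object itself. A minimal sketch of the corrected fallback, using a hypothetical tokenizer stub that is not part of this commit:

    class _Tok:
        # Hypothetical stub: has an eos_token but no pad_token, like many causal-LM tokenizers.
        eos_token = "</s>"
        pad_token = None

    tok = _Tok()
    if getattr(tok, "pad_token", None) is None:
        if hasattr(tok, "eos_token"):
            tok.pad_token = tok.eos_token  # previously `tok.pad_token = tok`, i.e. the object, not a string
        else:
            tok.pad_token = " "
    assert tok.pad_token == "</s>"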
nemo_automodel/components/datasets/llm/column_mapped_text_instruction_iterable_dataset.py

Lines changed: 135 additions & 0 deletions

@@ -0,0 +1,135 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Dict, Iterator, List, Optional, Union

from torch.utils.data import IterableDataset

from nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset import (
    ColumnMappedTextInstructionDataset,
    ColumnTypes,
    _check_all_values_equal_length,
    _load_dataset,
)

logger = logging.getLogger(__name__)


class ColumnMappedTextInstructionIterableDataset(IterableDataset, ColumnMappedTextInstructionDataset):
    """Streaming iterable variant that reuses the column-mapping/tokenization logic.

    This wraps a Hugging Face streaming dataset (IterableDataset from `datasets`)
    and yields tokenized samples compatible with the non-streaming variant, while
    supporting sharding and epoch-setting for deterministic shuffles upstream.
    """

    def __init__(
        self,
        path_or_dataset_id: Union[str, List[str]],
        column_mapping: Dict[str, str],
        tokenizer,
        *,
        split: Optional[str] = None,
        name: Optional[str] = None,
        answer_only_loss_mask: bool = True,
        seq_length: Optional[int] = None,
        padding: Union[str, bool] = "do_not_pad",
        truncation: Union[str, bool] = "do_not_truncate",
        start_of_turn_token: Optional[str] = None,
        limit_dataset_samples: Optional[int] = None,
        repeat_on_exhaustion: bool = True,
    ) -> None:
        if tokenizer is None:
            raise ValueError("Tokenizer is required")
        self.tokenizer = tokenizer
        if getattr(self.tokenizer, "pad_token", None) is None:
            if hasattr(self.tokenizer, "eos_token"):
                self.tokenizer.pad_token = self.tokenizer.eos_token
            else:
                logger.warning("Setting tokenizer pad_token to ' '. tokenizer does not have `eos_token`.")
                self.tokenizer.pad_token = " "

        if ColumnTypes.Answer.value not in column_mapping:
            raise AssertionError(("Expected answer to be in column_mapping", column_mapping))
        if len(column_mapping) == 3:
            if ColumnTypes.Context.value not in column_mapping:
                raise AssertionError(("Expected context to be in column_mapping", column_mapping))
            if ColumnTypes.Question.value not in column_mapping:
                raise AssertionError(("Expected question to be in column_mapping", column_mapping))
        elif len(column_mapping) == 2:
            if ColumnTypes.Context.value not in column_mapping and ColumnTypes.Question.value not in column_mapping:
                raise AssertionError(("Expected context or question to be in column_mapping", column_mapping))
        else:
            raise ValueError(f"Expected 2 or 3 columns in column_mapping, got {len(column_mapping)}")

        self.column_mapping = column_mapping
        self.answer_only_loss_mask = answer_only_loss_mask
        self.start_of_turn_token = start_of_turn_token
        self.seq_length = seq_length
        self.padding = padding
        self.truncation = truncation
        self.num_shards = getattr(self, "num_shards", 1)
        self._current_epoch_for_repeat = 0
        self.repeat_on_exhaustion = bool(repeat_on_exhaustion)

        # Always load in streaming mode
        ds = _load_dataset(path_or_dataset_id, split=split, streaming=True, name=name)
        if limit_dataset_samples is not None:
            try:
                ds = ds.take(limit_dataset_samples)
            except Exception as e:
                logger.warning("limit_dataset_samples ignored; 'take' not supported on this dataset: %s", e)

        self.dataset = ds

    def __iter__(self) -> Iterator[Dict[str, List[int]]]:
        while True:
            for row in self.dataset:
                mapped = {dest: row[src] for dest, src in self.column_mapping.items() if src in row}
                # Skip rows missing required fields
                if ColumnTypes.Answer.value not in mapped:
                    continue
                tokenized = self._apply_tokenizer(mapped)  # provided by ColumnMappedTextInstructionDataset
                # Skip samples with no valid labels (aligns with non-iterable behavior)
                if not any(label != -100 for label in tokenized.get("labels", [])):
                    continue
                if not _check_all_values_equal_length(tokenized):
                    continue
                yield tokenized

            if not self.repeat_on_exhaustion:
                return
            # Wrap-around: advance epoch for deterministic reshuffle if supported and iterate again
            try:
                self._current_epoch_for_repeat += 1
                self.set_epoch(self._current_epoch_for_repeat)
            except Exception:
                pass

    def set_epoch(self, epoch: int) -> None:
        ds = getattr(self, "dataset", None)
        if ds is not None and hasattr(ds, "set_epoch"):
            ds.set_epoch(epoch)

    def shard(self, num_shards: int, index: int):
        ds = getattr(self, "dataset", None)
        if ds is not None and hasattr(ds, "shard"):
            self.dataset = ds.shard(num_shards, index)
        return self

    def shuffle(self, buffer_size: int, seed: int):
        ds = getattr(self, "dataset", None)
        if ds is not None and hasattr(ds, "shuffle"):
            self.dataset = ds.shuffle(buffer_size=buffer_size, seed=seed)
        return self
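
As a rough usage sketch (not part of the commit), the streaming variant can be constructed like the non-streaming one, then sharded and shuffled before iteration. The model id, file name, and column names below are placeholders, and answer_only_loss_mask is disabled to keep the example self-contained:

    from transformers import AutoTokenizer

    from nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset import (
        ColumnMappedTextInstructionIterableDataset,
    )

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model id; has eos_token but no pad_token

    ds = ColumnMappedTextInstructionIterableDataset(
        path_or_dataset_id="my_corpus.jsonl",             # placeholder JSONL with "q"/"a" columns
        column_mapping={"question": "q", "answer": "a"},  # map dataset columns onto question/answer roles
        tokenizer=tokenizer,
        answer_only_loss_mask=False,
        repeat_on_exhaustion=False,
    )
    ds = ds.shard(num_shards=2, index=0).shuffle(buffer_size=1000, seed=42)

    for i, sample in enumerate(ds):
        # Each sample mirrors the non-streaming variant: input_ids, labels, attention_mask.
        print({k: len(v) for k, v in sample.items()})
        if i == 2:
            break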

nemo_automodel/recipes/llm/train_ft.py

Lines changed: 32 additions & 0 deletions
@@ -462,6 +462,22 @@ def build_dataloader(
     with FirstRankPerNode():
         ds = cfg_ds.instantiate(**kwargs)

+    # If using an IterableDataset, per-rank sharding for unique samples
+    if isinstance(ds, IterableDataset):
+        try:
+            if ds.num_shards >= dp_world_size:
+                ds = ds.shard(dp_world_size, dp_rank)
+                logging.info(
+                    f"Sharded IterableDataset via dataset.shard: world_size={dp_world_size}, rank={dp_rank}"
+                )
+            else:
+                from datasets.distributed import split_dataset_by_node
+
+                ds.dataset = split_dataset_by_node(ds.dataset, world_size=dp_world_size, rank=dp_rank)
+                logging.info(f"Sharded dataset via split_dataset_by_node: world_size={dp_world_size}")
+        except Exception as e:
+            logging.warning(f"IterableDataset sharding skipped due to error: {e}")
+
     packed_sequence_size = getattr(cfg_ps, "packed_sequence_size", 0)
     # check if packed sequence is supported
     if packed_sequence_size > 0 and not supports_seq_lens:
@@ -518,6 +534,22 @@ def build_dataloader(
         dl_kwargs["drop_last"] = True
     else:
         logging.info("Using IterableDataset; skipping sampler.")
+        # Optional shuffle for streaming IterableDataset (uses HF dataset shuffle if available)
+        shuffle = cfg_dl.get("shuffle", False)
+        shuffle_buffer_size = cfg_dl.get("shuffle_buffer_size", 10000)
+        # Do not pass shuffle-related kwargs to the DataLoader when using IterableDataset
+        # But leave them in dl config to be consistent
+        if hasattr(cfg_dl, "shuffle"):
+            del cfg_dl.shuffle
+        if hasattr(cfg_dl, "shuffle_buffer_size"):
+            del cfg_dl.shuffle_buffer_size
+
+        if shuffle and hasattr(ds, "shuffle"):
+            try:
+                ds = ds.shuffle(buffer_size=shuffle_buffer_size, seed=seed)
+                logging.info(f"Shuffling IterableDataset with buffer_size={shuffle_buffer_size}, seed={seed}")
+            except Exception as e:
+                logging.warning(f"IterableDataset shuffle skipped due to error: {e}")
         dl_kwargs = {}

     # Handle collate_fn with optional mask precomputation for pipeline parallelism
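
The first hunk prefers dataset.shard when the stream exposes at least one shard per data-parallel rank and otherwise falls back to datasets.distributed.split_dataset_by_node. A standalone sketch of that decision against a plain Hugging Face streaming dataset; dp_world_size, dp_rank, and the data file are stand-ins, not values from the recipe:

    from datasets import load_dataset
    from datasets.distributed import split_dataset_by_node

    dp_world_size, dp_rank = 4, 0  # stand-in values; the recipe derives these from the distributed setup

    stream = load_dataset("json", data_files="my_corpus.jsonl", split="train", streaming=True)
    if stream.num_shards >= dp_world_size:
        # Enough file-level shards: give each rank a disjoint subset of shards.
        stream = stream.shard(dp_world_size, dp_rank)
    else:
        # Fewer shards than ranks: split at the example level so ranks still see unique samples.
        stream = split_dataset_by_node(stream, world_size=dp_world_size, rank=dp_rank)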
Lines changed: 185 additions & 0 deletions
@@ -0,0 +1,185 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from pathlib import Path

import pytest

from nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset import (
    ColumnMappedTextInstructionIterableDataset,
)


class _DummyTokenizer:  # noqa: D401
    """Minimal tokenizer stub sufficient for dataset tokenization paths."""

    def __init__(self):
        self.pad_token = "<pad>"
        self.pad_token_id = 0
        self.eos_token_id = 1
        self.bos_token_id = 2
        self._counter = 3

    def __call__(
        self,
        text: str,
        add_special_tokens: bool = True,
        padding=None,
        truncation=None,
        max_length=None,
    ):
        tokens = text.split()
        input_ids = list(range(self._counter, self._counter + len(tokens)))
        if add_special_tokens:
            input_ids = [self.bos_token_id] + input_ids + [self.eos_token_id]
        # Advance counter so successive calls yield distinct id ranges
        self._counter += len(tokens) + (2 if add_special_tokens else 0)
        return {"input_ids": input_ids}


def _write_jsonl(path: Path, rows):
    with path.open("w", encoding="utf-8") as fp:
        for row in rows:
            fp.write(json.dumps(row) + "\n")

def test_iterable_dataset_shard_and_shuffle_smoke(monkeypatch, tmp_path: Path):
    class _StubHFIterable:
        def __init__(self, rows):
            self._rows = rows
            self._shard = None
            self._shuffled = False

        def __iter__(self):
            it = self._rows
            if self._shard is not None:
                n, idx = self._shard
                it = [r for i, r in enumerate(it) if i % n == idx]
            if self._shuffled:
                it = list(reversed(it))
            for r in it:
                yield r

        def shard(self, num_shards, index):
            self._shard = (num_shards, index)
            return self

        def shuffle(self, buffer_size, seed):
            self._shuffled = True
            return self

    rows = [
        {"q": "Q0?", "a": "A0"},
        {"q": "Q1?", "a": "A1"},
        {"q": "Q2?", "a": "A2"},
    ]

    def _fake_load_dataset(*args, **kwargs):
        return _StubHFIterable(rows)

    monkeypatch.setattr(
        "nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset._load_dataset",
        _fake_load_dataset,
    )

    ds = ColumnMappedTextInstructionIterableDataset(
        path_or_dataset_id="ignored.jsonl",
        column_mapping={"question": "q", "answer": "a"},
        tokenizer=_DummyTokenizer(),
        answer_only_loss_mask=False,
        repeat_on_exhaustion=False,
    ).shard(2, 1).shuffle(buffer_size=2, seed=0)

    first = next(iter(ds))
    assert {"input_ids", "attention_mask", "labels"}.issubset(first.keys())


def test_iterable_dataset_pad_token_fallback_with_eos(tmp_path: Path):
    class _TokNoPadWithEos:
        eos_token = "</s>"
        pad_token = None

    rows = [{"q": "Q?", "a": "A"}]
    jsonl_path = tmp_path / "toy_pad_eos.jsonl"
    _write_jsonl(jsonl_path, rows)

    tok = _TokNoPadWithEos()
    _ = ColumnMappedTextInstructionIterableDataset(
        path_or_dataset_id=str(jsonl_path),
        column_mapping={"question": "q", "answer": "a"},
        tokenizer=tok,
        answer_only_loss_mask=False,
        repeat_on_exhaustion=False,
    )
    assert tok.pad_token == tok.eos_token


def test_iterable_dataset_pad_token_fallback_without_eos(tmp_path: Path):
    class _TokNoPadNoEos:
        pad_token = None

    rows = [{"q": "Q?", "a": "A"}]
    jsonl_path = tmp_path / "toy_pad_noeos.jsonl"
    _write_jsonl(jsonl_path, rows)

    tok = _TokNoPadNoEos()
    _ = ColumnMappedTextInstructionIterableDataset(
        path_or_dataset_id=str(jsonl_path),
        column_mapping={"question": "q", "answer": "a"},
        tokenizer=tok,
        answer_only_loss_mask=False,
        repeat_on_exhaustion=False,
    )
    assert tok.pad_token == " "


def test_iterable_dataset_mapping_checks_missing_answer(tmp_path: Path):
    rows = [{"q": "Q?", "a": "A"}]
    jsonl_path = tmp_path / "toy_missing_answer.jsonl"
    _write_jsonl(jsonl_path, rows)

    with pytest.raises(AssertionError):
        _ = ColumnMappedTextInstructionIterableDataset(
            path_or_dataset_id=str(jsonl_path),
            column_mapping={"question": "q"},  # missing answer
            tokenizer=_DummyTokenizer(),
        )


def test_iterable_dataset_mapping_checks_two_keys_missing_both_context_and_question(tmp_path: Path):
    rows = [{"q": "Q?", "a": "A"}]
    jsonl_path = tmp_path / "toy_two_keys_invalid.jsonl"
    _write_jsonl(jsonl_path, rows)

    with pytest.raises(AssertionError, match="Expected context or question"):
        _ = ColumnMappedTextInstructionIterableDataset(
            path_or_dataset_id=str(jsonl_path),
            column_mapping={"answer": "a", "foo": "bar"},
            tokenizer=_DummyTokenizer(),
        )


def test_iterable_dataset_mapping_checks_invalid_num_columns(tmp_path: Path):
    rows = [{"q": "Q?", "a": "A"}]
    jsonl_path = tmp_path / "toy_invalid_cols.jsonl"
    _write_jsonl(jsonl_path, rows)

    with pytest.raises(ValueError, match="Expected 2 or 3 columns"):
        _ = ColumnMappedTextInstructionIterableDataset(
            path_or_dataset_id=str(jsonl_path),
            column_mapping={"answer": "a"},  # only 1 key
            tokenizer=_DummyTokenizer(),
        )
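
Since build_dataloader skips the sampler for IterableDataset inputs, one way to consume the streaming dataset in a plain PyTorch loop is sketched below. The padding collate is illustrative only (not the recipe's collate_fn), and ds is assumed to be a ColumnMappedTextInstructionIterableDataset like the one constructed in the earlier usage sketch:

    import torch
    from torch.utils.data import DataLoader

    def pad_collate(batch, pad_id=0):
        # Illustrative right-padding collate: pad input_ids/attention_mask with pad_id/0 and labels with -100.
        max_len = max(len(sample["input_ids"]) for sample in batch)

        def pad(seq, value):
            return seq + [value] * (max_len - len(seq))

        return {
            "input_ids": torch.tensor([pad(s["input_ids"], pad_id) for s in batch]),
            "attention_mask": torch.tensor([pad(s["attention_mask"], 0) for s in batch]),
            "labels": torch.tensor([pad(s["labels"], -100) for s in batch]),
        }

    # No sampler: sharding and shuffling are handled by the dataset itself, as in train_ft.py.
    loader = DataLoader(ds, batch_size=4, collate_fn=pad_collate)
    batch = next(iter(loader))
    print(batch["input_ids"].shape)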
