
Commit 6eca242

feat: introduce ScopedModuleOffloading in KD to reduce memory usage (#774)
Signed-off-by: Alexandros Koumparoulis <[email protected]>
1 parent 9995e4a commit 6eca242

3 files changed (+110, -7 lines)


nemo_automodel/components/training/utils.py

Lines changed: 26 additions & 0 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import gc
 import math
 from typing import Iterable

@@ -299,3 +300,28 @@ def scale_grads_and_clip_grad_norm(
         pp_axis_name=pp_axis_name,
         foreach=foreach,
     )
+
+
+def move_to_device(model, device):
+    # FSDP modules do not move buffers to the device automatically
+    for v in model.buffers():
+        v.data = v.data.to(device)
+    model.to(device)
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+class ScopedModuleOffloading:
+    def __init__(self, model, enabled=False):
+        self.model = model
+        self.enabled = enabled
+
+    def __enter__(self):
+        if self.enabled:
+            move_to_device(self.model, "cuda")
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.enabled:
+            move_to_device(self.model, "cpu")
+        return False  # Re-raise exceptions by default
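
For orientation, a minimal usage sketch of the new context manager (not part of the commit; the toy nn.Linear stands in for a real model, and the behavior shown follows the implementation above):

    import torch
    import torch.nn as nn

    from nemo_automodel.components.training.utils import ScopedModuleOffloading

    model = nn.Linear(8, 8)  # toy stand-in for a large model; starts on the CPU

    with ScopedModuleOffloading(model, enabled=torch.cuda.is_available()):
        # With offloading enabled, parameters and buffers live on the GPU here.
        x = torch.randn(2, 8, device=model.weight.device)
        y = model(x)

    # On exit (if enabled) the weights are moved back to the CPU and the CUDA
    # cache is emptied; with enabled=False the context manager is a no-op.
    assert model.weight.device.type == "cpu"

Exceptions raised inside the scope still propagate (__exit__ returns False), but the model is moved back to the CPU first.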

nemo_automodel/recipes/llm/kd.py

Lines changed: 12 additions & 7 deletions
@@ -51,7 +51,7 @@
 from nemo_automodel.components.loggers.metric_logger import MetricsSample
 from nemo_automodel.components.loss.linear_ce import FusedLinearCrossEntropy
 from nemo_automodel.components.training.rng import ScopedRNG
-from nemo_automodel.components.training.utils import count_tail_padding
+from nemo_automodel.components.training.utils import ScopedModuleOffloading, count_tail_padding
 from nemo_automodel.recipes.llm.train_ft import (
     TrainFinetuneRecipeForNextTokenPrediction,
     calculate_loss,
@@ -134,12 +134,14 @@ def setup(self): # noqa: C901 – same complexity as parent
         if self.pp_enabled:
             raise ValueError("Pipeline parallelism support will be added in the future for knowledge distillation")

+        self._offload_teacher_model = self.cfg.get("offload_teacher_model", False)
         # teacher specific
+        teacher_device = self.dist_env.device if not self._offload_teacher_model else "cpu"
         self.teacher_model = _build_teacher_model(
             self.cfg.get("teacher_model", None),
             self.cfg.get("seed", 42),
             self.cfg.get("packed_sequence.packed_sequence_size", 0) > 0,
-            self.dist_env.device,
+            teacher_device,
             self.model_wrapper,
             self.device_mesh,
         )
@@ -173,6 +175,14 @@ def _forward_backward_step(
         model = self.model_parts[0]
         sync_ctx = get_sync_ctx(model, idx == num_batches - 1) if is_train else nullcontext()
         with train_ctx(), sync_ctx:
+            # No grad for teacher forward
+            with (
+                ScopedModuleOffloading(self.teacher_model, enabled=self._offload_teacher_model),
+                torch.inference_mode(),
+            ):
+                teacher_logits = self.teacher_model(**batch)
+                teacher_logits = getattr(teacher_logits, "logits", teacher_logits).detach().clone()
+
             # Student forward
             student_keep_last = isinstance(self.loss_fn, FusedLinearCrossEntropy)
             if student_keep_last:
@@ -191,11 +201,6 @@ def _forward_backward_step(
                 hidden_states=student_out.hidden_states[-1] if "hidden_states" in student_out else None,
                 num_label_tokens=num_label_tokens,
             )
-            # No grad for teacher forward
-            with torch.no_grad():
-                teacher_logits = self.teacher_model(**batch)
-                teacher_logits = getattr(teacher_logits, "logits", teacher_logits).detach()
-
             # Reminder: kd_loss is normalized by num_label_tokens,
             # which typically is larger than the number of labels in this batch,
             # because it contains the total number of labels for all batches contained
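
Net effect: with the new offload_teacher_model config key enabled (YAML spelling assumed), the teacher is built on the CPU, moved to the GPU only for its no-grad forward pass, and moved back before the student's forward/backward, so teacher weights and student activations never need to fit in GPU memory at the same time. A self-contained sketch of that per-step pattern (not the recipe's code; the toy modules, batch, and mse_loss below are illustrative stand-ins, and the clone is taken outside inference mode so the logits are ordinary autograd-usable tensors):

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    from nemo_automodel.components.training.utils import ScopedModuleOffloading

    offload = torch.cuda.is_available()  # plays the role of cfg.get("offload_teacher_model", False)
    device = "cuda" if offload else "cpu"

    teacher = nn.Linear(16, 16)               # stays on the CPU between steps when offloading
    student = nn.Linear(16, 16).to(device)
    batch = torch.randn(4, 16, device=device)

    # Teacher forward: the teacher's weights occupy the GPU only inside this block.
    with ScopedModuleOffloading(teacher, enabled=offload), torch.inference_mode():
        raw = teacher(batch)

    # Clone outside inference mode to get a normal tensor for the loss graph.
    teacher_logits = raw.clone()

    # Student forward/backward: the teacher's weights are back on the CPU by now.
    loss = F.mse_loss(student(batch), teacher_logits)
    loss.backward()

The price is two host-device copies of the teacher weights per step; per the commit title, the goal is lower peak GPU memory, not throughput.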

tests/unit_tests/training/test_train_utils.py

Lines changed: 72 additions & 0 deletions
@@ -17,6 +17,11 @@
 import pytest
 import torch

+import pytest
+import torch
+import torch.nn as nn
+
+from nemo_automodel.components.training.utils import move_to_device, ScopedModuleOffloading
 from nemo_automodel.components.training.utils import clip_grad_norm, count_tail_padding


@@ -127,3 +132,70 @@ def test_clip_grad_norm_returns_zero_when_max_grad_norm_is_none():
     )

     assert grad_norm == 0
+
+
+class _TinyModule(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = nn.Linear(4, 2, bias=False)
+        self.register_buffer("scale", torch.ones(1))
+
+
+def _all_tensors_on_device(module: nn.Module, device_type: str) -> bool:
+    for p in module.parameters():
+        if p.device.type != device_type:
+            return False
+    for b in module.buffers():
+        if b.device.type != device_type:
+            return False
+    return True
+
+
+def test_move_to_device_cpu():
+    model = _TinyModule()
+    # Ensure starts on CPU
+    assert _all_tensors_on_device(model, "cpu")
+
+    # Move to CPU (idempotent)
+    move_to_device(model, "cpu")
+    assert _all_tensors_on_device(model, "cpu")
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_move_to_device_cuda():
+    model = _TinyModule()
+    # Move to CUDA
+    move_to_device(model, "cuda")
+    assert _all_tensors_on_device(model, "cuda")
+
+    # Move back to CPU to leave environment clean
+    move_to_device(model, "cpu")
+    assert _all_tensors_on_device(model, "cpu")
+
+
+def test_scoped_offloading_disabled_noop_and_reraises():
+    model = _TinyModule()
+    assert _all_tensors_on_device(model, "cpu")
+
+    with pytest.raises(ValueError):
+        with ScopedModuleOffloading(model, enabled=False):
+            # Should not move devices and should re-raise exceptions
+            assert _all_tensors_on_device(model, "cpu")
+            raise ValueError("boom")
+
+    # After context, still on CPU
+    assert _all_tensors_on_device(model, "cpu")
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_scoped_offloading_enabled_moves_and_reraises():
+    model = _TinyModule()
+    assert _all_tensors_on_device(model, "cpu")
+
+    # Enter moves to CUDA, exit moves back to CPU and re-raises exceptions
+    with pytest.raises(RuntimeError):
+        with ScopedModuleOffloading(model, enabled=True):
+            assert _all_tensors_on_device(model, "cuda")
+            raise RuntimeError("fail inside context")
+
+    assert _all_tensors_on_device(model, "cpu")
