From 1d775b2f248e5d20fbb0518b2f87649f2145e06d Mon Sep 17 00:00:00 2001
From: Boxiang Wang
Date: Wed, 22 Oct 2025 16:39:09 -0700
Subject: [PATCH 1/2] Train working

Signed-off-by: Boxiang Wang
---
 llama3_8b.py                                | 28 +++++++++++++++++++
 nemo/collections/llm/recipes/llama3_8b.py   |  2 +-
 .../pytorch/strategies/megatron_strategy.py | 11 ++++----
 3 files changed, 35 insertions(+), 6 deletions(-)
 create mode 100644 llama3_8b.py

diff --git a/llama3_8b.py b/llama3_8b.py
new file mode 100644
index 000000000000..159fb404db72
--- /dev/null
+++ b/llama3_8b.py
@@ -0,0 +1,28 @@
+import nemo_run as run
+
+from nemo.collections.llm.recipes import llama3_8b
+
+if __name__ == "__main__":
+    pretrain = llama3_8b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8, performance_mode=True)
+
+    pretrain.trainer.strategy.context_parallel_size=1
+    pretrain.trainer.log_every_n_steps=1
+    pretrain.data.global_batch_size=16
+    pretrain.data.seq_length=64
+    pretrain.trainer.max_steps=10
+
+    pretrain.trainer.strategy.fsdp='megatron'
+    pretrain.trainer.strategy.ddp.average_in_collective=False
+    pretrain.trainer.strategy.ddp.use_megatron_fsdp=True
+    pretrain.trainer.strategy.save_ckpt_format='fsdp_dtensor'
+    # pretrain.trainer.strategy.gradient_accumulation_fusion=False
+
+    # # included in the performance mode but not normal mode
+    
+    pretrain.trainer.strategy.ddp.grad_reduce_in_fp32=False
+    pretrain.trainer.plugins.grad_reduce_in_fp32 = False
+    pretrain.optim.config.use_precision_aware_optimizer = False
+    pretrain.optim.config.use_megatron_fsdp = True
+    # pretrain.data.seq_length = 4096
+
+    run.run(pretrain)
\ No newline at end of file
diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py
index 31723bf6aafc..85fc29eddf97 100644
--- a/nemo/collections/llm/recipes/llama3_8b.py
+++ b/nemo/collections/llm/recipes/llama3_8b.py
@@ -54,7 +54,7 @@ def model() -> run.Config[pl.LightningModule]:
            >>> model_config = model()
            >>> print(model_config)
    """
-    return run.Config(LlamaModel, config=run.Config(Llama3Config8B))
+    return run.Config(LlamaModel, config=run.Config(Llama3Config8B, gradient_accumulation_fusion=False))
 
 
 def trainer(
diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py
index a2507dd36fdc..965e8e7806cb 100644
--- a/nemo/lightning/pytorch/strategies/megatron_strategy.py
+++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -373,10 +373,10 @@ def __init__(
                 "Setting FSDP option to megatron"
             )
             fsdp = 'megatron'
-            if use_megatron_fsdp and self.save_ckpt_format != "fsdp_dtensor":
-                raise NotImplementedError(
-                    f"Megatron-FSDP checkpointing is not supported with {self.save_ckpt_format}."
-                )
+        if use_megatron_fsdp and self.save_ckpt_format != "fsdp_dtensor":
+            raise NotImplementedError(
+                f"Megatron-FSDP checkpointing is not supported with {self.save_ckpt_format}."
+            )
 
         if fsdp == "pytorch":
             raise NotImplementedError("PyTorch FSDP2 is not supported with MegatronParallel.")
@@ -1052,7 +1052,8 @@ def should_restore_optimizer_states(self, selective_restore: bool = False) -> bo
     def _save_fsdp_dtensor_common_state(self, state_dict, ckpt_dir):
         state_dict = state_dict.copy()
         del state_dict["model"]
-        del state_dict["optimizer_states"]
+        if "optimizer_states" in state_dict:
+            del state_dict["optimizer_states"]
         torch.save(state_dict, os.path.join(ckpt_dir, "common.pt"))
 
     def _load_fsdp_dtensor_common_state(self, ckpt_dir):

From 271c986db284d20f73cc1b1e4784bf1b813ccf52 Mon Sep 17 00:00:00 2001
From: BoxiangW
Date: Wed, 22 Oct 2025 23:41:08 +0000
Subject: [PATCH 2/2] Apply isort and black reformatting

Signed-off-by: BoxiangW
---
 llama3_8b.py                                | 24 +++++++++----------
 .../pytorch/strategies/megatron_strategy.py |  4 +---
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/llama3_8b.py b/llama3_8b.py
index 159fb404db72..f129913aa95d 100644
--- a/llama3_8b.py
+++ b/llama3_8b.py
@@ -5,24 +5,24 @@
 if __name__ == "__main__":
     pretrain = llama3_8b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8, performance_mode=True)
 
-    pretrain.trainer.strategy.context_parallel_size=1
-    pretrain.trainer.log_every_n_steps=1
-    pretrain.data.global_batch_size=16
-    pretrain.data.seq_length=64
-    pretrain.trainer.max_steps=10
+    pretrain.trainer.strategy.context_parallel_size = 1
+    pretrain.trainer.log_every_n_steps = 1
+    pretrain.data.global_batch_size = 16
+    pretrain.data.seq_length = 64
+    pretrain.trainer.max_steps = 10
 
-    pretrain.trainer.strategy.fsdp='megatron'
-    pretrain.trainer.strategy.ddp.average_in_collective=False
-    pretrain.trainer.strategy.ddp.use_megatron_fsdp=True
-    pretrain.trainer.strategy.save_ckpt_format='fsdp_dtensor'
+    pretrain.trainer.strategy.fsdp = 'megatron'
+    pretrain.trainer.strategy.ddp.average_in_collective = False
+    pretrain.trainer.strategy.ddp.use_megatron_fsdp = True
+    pretrain.trainer.strategy.save_ckpt_format = 'fsdp_dtensor'
     # pretrain.trainer.strategy.gradient_accumulation_fusion=False
 
     # # included in the performance mode but not normal mode
-    
-    pretrain.trainer.strategy.ddp.grad_reduce_in_fp32=False
+
+    pretrain.trainer.strategy.ddp.grad_reduce_in_fp32 = False
     pretrain.trainer.plugins.grad_reduce_in_fp32 = False
     pretrain.optim.config.use_precision_aware_optimizer = False
     pretrain.optim.config.use_megatron_fsdp = True
     # pretrain.data.seq_length = 4096
 
-    run.run(pretrain)
\ No newline at end of file
+    run.run(pretrain)
diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py
index 965e8e7806cb..5ab71dbab9da 100644
--- a/nemo/lightning/pytorch/strategies/megatron_strategy.py
+++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -374,9 +374,7 @@ def __init__(
             )
             fsdp = 'megatron'
         if use_megatron_fsdp and self.save_ckpt_format != "fsdp_dtensor":
-            raise NotImplementedError(
-                f"Megatron-FSDP checkpointing is not supported with {self.save_ckpt_format}."
-            )
+            raise NotImplementedError(f"Megatron-FSDP checkpointing is not supported with {self.save_ckpt_format}.")
 
         if fsdp == "pytorch":
             raise NotImplementedError("PyTorch FSDP2 is not supported with MegatronParallel.")