From 1d775b2f248e5d20fbb0518b2f87649f2145e06d Mon Sep 17 00:00:00 2001
From: Boxiang Wang
Date: Wed, 22 Oct 2025 16:39:09 -0700
Subject: [PATCH 1/2] Train working

Signed-off-by: Boxiang Wang
---
 llama3_8b.py                                | 28 +++++++++++++++++++
 nemo/collections/llm/recipes/llama3_8b.py   |  2 +-
 .../pytorch/strategies/megatron_strategy.py | 11 ++++----
 3 files changed, 35 insertions(+), 6 deletions(-)
 create mode 100644 llama3_8b.py

diff --git a/llama3_8b.py b/llama3_8b.py
new file mode 100644
index 000000000000..159fb404db72
--- /dev/null
+++ b/llama3_8b.py
@@ -0,0 +1,28 @@
+import nemo_run as run
+
+from nemo.collections.llm.recipes import llama3_8b
+
+if __name__ == "__main__":
+    pretrain = llama3_8b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8, performance_mode=True)
+
+    pretrain.trainer.strategy.context_parallel_size=1
+    pretrain.trainer.log_every_n_steps=1
+    pretrain.data.global_batch_size=16
+    pretrain.data.seq_length=64
+    pretrain.trainer.max_steps=10
+
+    pretrain.trainer.strategy.fsdp='megatron'
+    pretrain.trainer.strategy.ddp.average_in_collective=False
+    pretrain.trainer.strategy.ddp.use_megatron_fsdp=True
+    pretrain.trainer.strategy.save_ckpt_format='fsdp_dtensor'
+    # pretrain.trainer.strategy.gradient_accumulation_fusion=False
+
+    # # included in the performance mode but not normal mode
+    
+    pretrain.trainer.strategy.ddp.grad_reduce_in_fp32=False
+    pretrain.trainer.plugins.grad_reduce_in_fp32 = False
+    pretrain.optim.config.use_precision_aware_optimizer = False
+    pretrain.optim.config.use_megatron_fsdp = True
+    # pretrain.data.seq_length = 4096
+
+    run.run(pretrain)
\ No newline at end of file
diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py
index 31723bf6aafc..85fc29eddf97 100644
--- a/nemo/collections/llm/recipes/llama3_8b.py
+++ b/nemo/collections/llm/recipes/llama3_8b.py
@@ -54,7 +54,7 @@ def model() -> run.Config[pl.LightningModule]:
            >>> model_config = model()
            >>> print(model_config)
    """
-    return run.Config(LlamaModel, config=run.Config(Llama3Config8B))
+    return run.Config(LlamaModel, config=run.Config(Llama3Config8B, gradient_accumulation_fusion=False))
 
 
 def trainer(
diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py
index a2507dd36fdc..965e8e7806cb 100644
--- a/nemo/lightning/pytorch/strategies/megatron_strategy.py
+++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -373,10 +373,10 @@ def __init__(
                 "Setting FSDP option to megatron"
             )
             fsdp = 'megatron'
-            if use_megatron_fsdp and self.save_ckpt_format != "fsdp_dtensor":
-                raise NotImplementedError(
-                    f"Megatron-FSDP checkpointing is not supported with {self.save_ckpt_format}."
-                )
+        if use_megatron_fsdp and self.save_ckpt_format != "fsdp_dtensor":
+            raise NotImplementedError(
+                f"Megatron-FSDP checkpointing is not supported with {self.save_ckpt_format}."
+            )
 
         if fsdp == "pytorch":
             raise NotImplementedError("PyTorch FSDP2 is not supported with MegatronParallel.")
@@ -1052,7 +1052,8 @@ def should_restore_optimizer_states(self, selective_restore: bool = False) -> bo
     def _save_fsdp_dtensor_common_state(self, state_dict, ckpt_dir):
         state_dict = state_dict.copy()
         del state_dict["model"]
-        del state_dict["optimizer_states"]
+        if "optimizer_states" in state_dict:
+            del state_dict["optimizer_states"]
         torch.save(state_dict, os.path.join(ckpt_dir, "common.pt"))
 
     def _load_fsdp_dtensor_common_state(self, ckpt_dir):

From 271c986db284d20f73cc1b1e4784bf1b813ccf52 Mon Sep 17 00:00:00 2001
From: BoxiangW
Date: Wed, 22 Oct 2025 23:41:08 +0000
Subject: [PATCH 2/2] Apply isort and black reformatting

Signed-off-by: BoxiangW
---
 llama3_8b.py                                | 24 +++++++++----------
 .../pytorch/strategies/megatron_strategy.py |  4 +---
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/llama3_8b.py b/llama3_8b.py
index 159fb404db72..f129913aa95d 100644
--- a/llama3_8b.py
+++ b/llama3_8b.py
@@ -5,24 +5,24 @@
 if __name__ == "__main__":
     pretrain = llama3_8b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8, performance_mode=True)
 
-    pretrain.trainer.strategy.context_parallel_size=1
-    pretrain.trainer.log_every_n_steps=1
-    pretrain.data.global_batch_size=16
-    pretrain.data.seq_length=64
-    pretrain.trainer.max_steps=10
+    pretrain.trainer.strategy.context_parallel_size = 1
+    pretrain.trainer.log_every_n_steps = 1
+    pretrain.data.global_batch_size = 16
+    pretrain.data.seq_length = 64
+    pretrain.trainer.max_steps = 10
 
-    pretrain.trainer.strategy.fsdp='megatron'
-    pretrain.trainer.strategy.ddp.average_in_collective=False
-    pretrain.trainer.strategy.ddp.use_megatron_fsdp=True
-    pretrain.trainer.strategy.save_ckpt_format='fsdp_dtensor'
+    pretrain.trainer.strategy.fsdp = 'megatron'
+    pretrain.trainer.strategy.ddp.average_in_collective = False
+    pretrain.trainer.strategy.ddp.use_megatron_fsdp = True
+    pretrain.trainer.strategy.save_ckpt_format = 'fsdp_dtensor'
     # pretrain.trainer.strategy.gradient_accumulation_fusion=False
 
     # # included in the performance mode but not normal mode
-    
-    pretrain.trainer.strategy.ddp.grad_reduce_in_fp32=False
+
+    pretrain.trainer.strategy.ddp.grad_reduce_in_fp32 = False
     pretrain.trainer.plugins.grad_reduce_in_fp32 = False
     pretrain.optim.config.use_precision_aware_optimizer = False
     pretrain.optim.config.use_megatron_fsdp = True
     # pretrain.data.seq_length = 4096
 
-    run.run(pretrain)
\ No newline at end of file
+    run.run(pretrain)
diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py
index 965e8e7806cb..5ab71dbab9da 100644
--- a/nemo/lightning/pytorch/strategies/megatron_strategy.py
+++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -374,9 +374,7 @@ def __init__(
             )
             fsdp = 'megatron'
         if use_megatron_fsdp and self.save_ckpt_format != "fsdp_dtensor":
-            raise NotImplementedError(
-                f"Megatron-FSDP checkpointing is not supported with {self.save_ckpt_format}."
-            )
+            raise NotImplementedError(f"Megatron-FSDP checkpointing is not supported with {self.save_ckpt_format}.")
 
         if fsdp == "pytorch":
             raise NotImplementedError("PyTorch FSDP2 is not supported with MegatronParallel.")