
Commit 4873cc3

pthombre authored and akoumpa committed
Integrate with StepScheduler and checkpointing code
Signed-off-by: Pranav Prashant Thombre <[email protected]>
1 parent e515e5c commit 4873cc3

File tree

3 files changed: +130 −238 lines changed


examples/diffusion/finetune/wan2_1_t2v_flow.yaml

Lines changed: 9 additions & 3 deletions
@@ -22,7 +22,7 @@ data:
   num_nodes: 1

 batch:
-  batch_size_per_node: 1
+  batch_size_per_node: 8

 training:
   num_epochs: 20
@@ -43,13 +43,19 @@ flow_matching:

 fsdp:
   cpu_offload: true
+  tp_size: 1
+  cp_size: 1
+  pp_size: 1

 logging:
   save_every: 50
   log_every: 2

 checkpoint:
-  output_dir: /opt/Automodel/wan_t2v_flow_outputs_updated/
-  resume: null
+  enabled: true
+  checkpoint_dir: /opt/Automodel/wan_t2v_flow_outputs_base_recipe_checkpoint_NEW_new/
+  model_save_format: torch_save
+  save_consolidated: false
+  restore_from: null

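For reference, the full checkpoint section the patch produces looks as follows (values copied from the diff; the per-key comments are my reading of the option names, not confirmed against the recipe docs):

```yaml
checkpoint:
  enabled: true
  checkpoint_dir: /opt/Automodel/wan_t2v_flow_outputs_base_recipe_checkpoint_NEW_new/
  model_save_format: torch_save   # assumed: save with torch-native serialization
  save_consolidated: false        # assumed: skip writing a single consolidated_model.bin
  restore_from: null              # assumed: set to a checkpoint path to resume training
```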

nemo_automodel/components/_diffusers/utils/validate_t2v.py

Lines changed: 3 additions & 3 deletions
@@ -139,7 +139,7 @@ def main():
     # Try EMA checkpoint first (best quality)
     ema_path = os.path.join(args.checkpoint, "ema_shadow.pt")
     consolidated_path = os.path.join(args.checkpoint, "consolidated_model.bin")
-    sharded_dir = os.path.join(args.checkpoint, "transformer_model")
+    sharded_dir = os.path.join(args.checkpoint, "model")

     if os.path.exists(ema_path):
         print(f"[INFO] Loading EMA checkpoint (best quality)...")
@@ -183,9 +183,9 @@ def main():
     )

     # Load shards into the FSDP-wrapped model
-    model_state = {"model": fsdp_transformer.state_dict()}
+    model_state = fsdp_transformer.state_dict()
     dist_load(state_dict=model_state, storage_reader=FileSystemReader(sharded_dir))
-    fsdp_transformer.load_state_dict(model_state["model"])
+    fsdp_transformer.load_state_dict(model_state)

     # Unwrap back to the original module for inference
     pipe.transformer = fsdp_transformer.module
