 
 """Functional tests for QAT (Quantization Aware Training) workflow."""
 
+import os
 import subprocess
 from pathlib import Path
 
 import pytest
 
+from megatron.bridge.training.utils.checkpoint_utils import (
+    TRACKER_PREFIX,
+    get_checkpoint_name,
+    get_checkpoint_tracker_filename,
+    get_checkpoint_train_state_filename,
+)
 from tests.functional_tests.utils import clear_directories
 
 
@@ -98,6 +105,8 @@ def _run_pretrain_from_quantized_checkpoint(
         tp: int = 1,
         pp: int = 1,
         cp: int = 2,
+        train_iters: int = 10,
+        save_interval: int = 10,
     ):
         """
         Run pre-training from a quantized checkpoint using subprocess.
@@ -109,9 +118,12 @@ def _run_pretrain_from_quantized_checkpoint(
             tp: Tensor parallelism size
             pp: Pipeline parallelism size
             cp: Context parallelism size (default: 2)
+            train_iters: Number of training iterations
+            save_interval: Interval, in iterations, between checkpoint saves
 
         Returns:
-            subprocess.CompletedProcess: The result of the subprocess run
+            tuple: (subprocess.CompletedProcess, final_iteration),
+                where final_iteration is the iteration of the last checkpoint that is saved
         """
         # Calculate total number of processes needed (tp * pp * cp)
         total_procs = tp * pp * cp
@@ -121,6 +133,10 @@ def _run_pretrain_from_quantized_checkpoint(
 
         python_executable = sys.executable
 
+        # Calculate the final iteration (the last checkpoint that will be saved).
+        # Checkpoints are saved every save_interval iterations, so the last one lands on the largest multiple of save_interval that does not exceed train_iters.
+        final_iteration = (train_iters // save_interval) * save_interval
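+        # For example: train_iters=10, save_interval=10 -> 10; train_iters=25, save_interval=10 -> 20.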
+
         # Base command for pre-training from quantized checkpoint
         cmd = [
             python_executable,
@@ -139,13 +155,13 @@ def _run_pretrain_from_quantized_checkpoint(
             "model.gradient_accumulation_fusion=False",
             f"checkpoint.pretrained_checkpoint={quantized_checkpoint_path}",
             f"checkpoint.save={checkpoint_save_dir}",
-            "checkpoint.save_interval=10",
-            "train.train_iters=10",
+            f"checkpoint.save_interval={save_interval}",
+            f"train.train_iters={train_iters}",
             "train.eval_interval=5",
             "train.eval_iters=2",
             "train.global_batch_size=8",
             "scheduler.lr_warmup_iters=2",
-            "scheduler.lr_decay_iters=10",
+            f"scheduler.lr_decay_iters={train_iters}",
         ]
 
         # Always add parallelism arguments to override script defaults
@@ -154,7 +170,7 @@ def _run_pretrain_from_quantized_checkpoint(
         cmd.append(f"model.context_parallel_size={cp}")
 
         result = subprocess.run(cmd, capture_output=True, text=True, cwd=Path(__file__).parent.parent.parent.parent)
-        return result
+        return result, final_iteration
 
     @pytest.mark.run_only_on("GPU")
     @pytest.mark.parametrize("recipe_name,parallelism_overrides", QAT_WORKFLOW_CONFIGS)
@@ -212,13 +228,17 @@ def test_qat_workflow(self, recipe_name, parallelism_overrides, tmp_path):
 
         print(f"=== STEP 2: Running pre-training from quantized checkpoint for {recipe_name} ===")
         # Step 2: Run pre-training from the quantized checkpoint
-        pretrain_result = self._run_pretrain_from_quantized_checkpoint(
+        train_iters = 10
+        save_interval = 10
+        pretrain_result, expected_iteration = self._run_pretrain_from_quantized_checkpoint(
             quantized_checkpoint_path=str(quantized_checkpoint_dir),
             checkpoint_save_dir=str(checkpoint_save_dir),
             hf_model_id="meta-llama/Llama-3.2-1B",
             tp=tensor_model_parallel_size or 1,
             pp=pipeline_model_parallel_size or 1,
             cp=context_parallel_size or 2,  # Default context parallelism is 2
+            train_iters=train_iters,
+            save_interval=save_interval,
         )
 
         if pretrain_result.returncode != 0:
@@ -227,12 +247,63 @@ def test_qat_workflow(self, recipe_name, parallelism_overrides, tmp_path):
             assert False, f"Pre-training step failed with return code {pretrain_result.returncode}"
 
         print("✓ Pre-training from quantized checkpoint completed successfully")
+        print(f"  Training ran for {train_iters} iterations, saving every {save_interval} iterations")
+        print(f"  Expected final checkpoint iteration: {expected_iteration}")
 
-        # Verify checkpoint files were created (simple existence check, not full distributed verification)
+        # Verify checkpoint files were created with comprehensive checks
+        # (adapted from verify_checkpoint_files but without requiring torch.distributed)
         assert checkpoint_save_dir.exists(), f"Checkpoint save directory not found at {checkpoint_save_dir}"
-        checkpoint_dirs = list(checkpoint_save_dir.iterdir())
-        assert len(checkpoint_dirs) > 0, f"No checkpoints saved in {checkpoint_save_dir}"
-        print(f"✓ Checkpoint files verified: {[d.name for d in checkpoint_dirs]}")
+
+        # Verify Megatron-Bridge tracker file
+        latest_tracker_file = get_checkpoint_train_state_filename(str(checkpoint_save_dir), prefix=TRACKER_PREFIX)
+        assert os.path.exists(latest_tracker_file), (
+            f"Latest checkpoint tracker file not found at {latest_tracker_file}"
+        )
+        print(f"✓ Megatron-Bridge tracker file found: {latest_tracker_file}")
+
+        # Verify Megatron-LM compatibility tracker file
+        megatron_lm_tracker = get_checkpoint_tracker_filename(str(checkpoint_save_dir))
+        assert os.path.exists(megatron_lm_tracker), f"Megatron-LM tracker file not found at {megatron_lm_tracker}"
+        print(f"✓ Megatron-LM tracker file found: {megatron_lm_tracker}")
+
+        # Verify the tracker file contains the correct iteration
+        with open(megatron_lm_tracker, "r") as f:
+            saved_iteration = f.read().strip()
+        assert saved_iteration == str(expected_iteration), (
+            f"Megatron-LM tracker file contains '{saved_iteration}', expected '{expected_iteration}'"
+        )
+        print(f"✓ Tracker file contains correct iteration: {expected_iteration}")
+
+        # Verify final checkpoint directory exists
+        final_iter_dir = get_checkpoint_name(str(checkpoint_save_dir), expected_iteration, release=False)
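+        # The directory layout (e.g. an iter_0000010-style folder name) is assumed to follow the
+        # convention implemented by get_checkpoint_name; only the directory's existence is asserted below.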
+        assert os.path.exists(final_iter_dir), f"Final checkpoint directory not found at {final_iter_dir}"
+        print(f"✓ Final checkpoint directory found: {final_iter_dir}")
+
+        # Verify metadata file exists
+        metadata_file = os.path.join(final_iter_dir, ".metadata")
+        assert os.path.exists(metadata_file), f"Checkpoint metadata file not found at {metadata_file}"
+        print(f"✓ Metadata file found: {metadata_file}")
+
+        # Verify .distcp files (torch.distributed.checkpoint format)
+        distcp_files = [f for f in os.listdir(final_iter_dir) if f.endswith(".distcp")]
+
+        # Calculate expected world size from parallelism settings
+        tp = tensor_model_parallel_size or 1
+        pp = pipeline_model_parallel_size or 1
+        cp = context_parallel_size or 2
+        world_size = tp * pp * cp
+
+        # For torch_dist format, expect 2 * world_size .distcp files
+        # (one for model state, one for optimizer state per rank)
+        expected_distcp_files = 2 * world_size
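+        # e.g. tp=2, pp=1, cp=2 -> world_size=4 -> 8 expected .distcp files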
+        assert len(distcp_files) == expected_distcp_files, (
+            f"Expected {expected_distcp_files} .distcp files (2 * {world_size} world_size), "
+            f"found {len(distcp_files)}: {distcp_files}"
+        )
+        print(
+            f"✓ Correct number of .distcp files: {len(distcp_files)} "
+            f"(world_size={world_size}, tp={tp}, pp={pp}, cp={cp})"
+        )
 
         print(f"SUCCESS: Complete QAT workflow test passed for {recipe_name}")
 