Skip to content

Commit 3d4342b

Browse files
fix: safely handle edge case of uneven vocab shards in tensor-parallel logprobs
1 parent 352175e commit 3d4342b

File tree

2 files changed

+83
-3
lines changed

2 files changed

+83
-3
lines changed

src/forge/util/ops.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,20 @@ def compute_logprobs_parallel(
155155
return target_logits - log_normalizer
156156

157157

158+
def _get_vocab_shard_bounds(
159+
vocab_size: int, tp_rank: int, tp_size: int
160+
) -> tuple[int, int, int]:
161+
"""
162+
Return (start, end, width) for a shard when vocab dimension is unevenly split.
163+
"""
164+
base_shard = vocab_size // tp_size
165+
remainder = vocab_size % tp_size
166+
shard_width = base_shard + (1 if tp_rank < remainder else 0)
167+
vocab_start = tp_rank * base_shard + min(tp_rank, remainder)
168+
vocab_end = vocab_start + shard_width
169+
return vocab_start, vocab_end, shard_width
170+
171+
158172
def get_vocab_shard_info(
159173
logits: DTensor,
160174
) -> tuple[dist.ProcessGroup | None, int, int, int, int]:
@@ -171,19 +185,26 @@ def get_vocab_shard_info(
171185
local_logits = logits._local_tensor
172186
placements = logits.placements
173187
device_mesh = logits.device_mesh
188+
global_vocab_size = logits.shape[-1]
174189

175190
for i, p in enumerate(placements):
176191
if isinstance(p, Shard) and p.dim == 2: # vocab dimension
177192
tp_group = device_mesh.get_group(mesh_dim=i)
178193
tp_size = dist.get_world_size(tp_group)
179194
tp_rank = dist.get_rank(tp_group)
195+
vocab_start, vocab_end, shard_width = _get_vocab_shard_bounds(
196+
global_vocab_size, tp_rank, tp_size
197+
)
180198
local_vocab_size = local_logits.shape[-1]
181-
vocab_start = tp_rank * local_vocab_size
182-
vocab_end = vocab_start + local_vocab_size
199+
if local_vocab_size != shard_width:
200+
raise ValueError(
201+
"DTensor local shard width does not match inferred shard size "
202+
f"(rank={tp_rank}, local={local_vocab_size}, expected={shard_width})"
203+
)
183204
return tp_group, tp_rank, tp_size, vocab_start, vocab_end
184205

185206
# Not sharded
186-
return None, 0, 1, 0, local_logits.shape[-1]
207+
return None, 0, 1, 0, global_vocab_size
187208

188209

189210
def _distributed_log_normalizer(

tests/unit_tests/util/test_ops.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,65 @@ def test_parallel_logprobs_align_false(self):
350350
msg="Parallel logprobs with align=False should match",
351351
)
352352

353+
@gpu_test(gpu_count=2)
def test_parallel_logprobs_uneven_vocab_shards(self):
    """Ensure uneven vocab shards still produce correct logprobs."""
    torch.manual_seed(321)

    batch_size = 2
    seq_len = 12
    vocab_size = 1001  # Not divisible by world_size
    target_len = 6

    rank = dist.get_rank()
    device = torch.device(f"cuda:{rank}")

    # Rank 0 generates the reference data; other ranks allocate empty
    # buffers of matching shape/dtype to receive it via broadcast.
    if rank == 0:
        full_logits = torch.randn(
            batch_size, seq_len, vocab_size, dtype=torch.float32, device=device
        )
        target_ids = torch.randint(
            0, vocab_size, (batch_size, target_len), device=device
        )
    else:
        full_logits = torch.empty(
            batch_size, seq_len, vocab_size, dtype=torch.float32, device=device
        )
        target_ids = torch.empty(
            batch_size, target_len, dtype=torch.int64, device=device
        )

    # Every rank now holds identical logits/targets, so the single-device
    # reference result below is computable on each rank.
    dist.broadcast(full_logits, src=0)
    dist.broadcast(target_ids, src=0)

    expected = compute_logprobs(full_logits, target_ids, align=True)

    mesh = init_device_mesh("cuda", (self.world_size,), mesh_dim_names=("tp",))
    # Deliberately uneven shard bounds: the first `remainder` ranks take
    # one extra vocab entry (mirrors _get_vocab_shard_bounds in ops.py).
    base_shard = vocab_size // self.world_size
    remainder = vocab_size % self.world_size
    extra = 1 if rank < remainder else 0
    vocab_start = rank * base_shard + min(rank, remainder)
    vocab_end = vocab_start + base_shard + extra
    local_slice = full_logits[:, :, vocab_start:vocab_end].contiguous()

    # Explicit global shape/stride are required because the local shards
    # have different widths; DTensor cannot infer them from local tensors.
    dtensor_logits = DTensor.from_local(
        local_slice,
        mesh,
        placements=[Shard(2)],
        shape=torch.Size((batch_size, seq_len, vocab_size)),
        stride=full_logits.stride(),
    )

    result = compute_logprobs_parallel(dtensor_logits, target_ids, align=True)

    torch.testing.assert_close(
        result,
        expected,
        atol=1e-5,
        rtol=1e-5,
        msg="Parallel logprobs should support uneven vocab shards",
    )
411+
353412
@gpu_test(gpu_count=2)
354413
def test_parallel_logprobs_numerical_stability(self):
355414
"""Test parallel logprobs handles extreme values correctly."""

0 commit comments

Comments
 (0)