
Commit faef876

Rifur13 authored and Google-ML-Automation committed

[Pallas MGPU] Add support for squeezed block dims in the pipeline BlockSpecs. They can be specified with None or pl.Squeezed.

PiperOrigin-RevId: 839975575

1 parent: 10a43df
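For orientation before the diff, here is a minimal usage sketch condensed from the tests added in this commit (the names and shapes are illustrative, and the surrounding pallas_call harness is omitted). A block dimension written as None or pl.Squeezed() is squeezed away, so each pipeline step sees a (256,) SMEM window instead of a (1, 256) one:

import jax
import jax.numpy as jnp
from jax.experimental import pallas as pl
from jax.experimental.pallas import mosaic_gpu as plgpu

shape = (16, 256)

def body(_, in_smem, o_smem):
  # The leading dimension is squeezed away, so the SMEM refs are 1D: (256,).
  o_smem[...] = in_smem[...] + 1

def kernel(x_gmem, o_gmem):
  plgpu.emit_pipeline(
      body,
      # None (or pl.Squeezed()) marks the leading block dim as squeezed.
      in_specs=[pl.BlockSpec((None, shape[1]), lambda i: (i, 0))],
      out_specs=[pl.BlockSpec((None, shape[1]), lambda i: (i, 0))],
      grid=(shape[0],),
      max_concurrent_steps=2,
  )(x_gmem, o_gmem)

The index_map still returns an index for the squeezed dimension; it simply selects which row of GMEM each pipeline step operates on.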

2 files changed: +90 −11 lines

jax/_src/pallas/mosaic_gpu/pipeline.py

Lines changed: 31 additions & 11 deletions
@@ -59,8 +59,15 @@ def _get_block_size(
       raise NotImplementedError(f"Unsupported block size type: {type(bd)}")

 def _get_block_shape(spec: pallas_core.BlockSpec):
-  assert spec.block_shape is not None
-  return tuple(_get_block_size(bd) for bd in spec.block_shape)
+  if spec.block_shape is None:
+    raise ValueError("Block shape must be specified.")
+
+  block_shape = tuple(
+      _get_block_size(bd)
+      for bd in spec.block_shape
+      if not (bd is None or isinstance(bd, pl.Squeezed))
+  )
+  return block_shape


 map_brefs = functools.partial(
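To make the effect of the new _get_block_shape concrete, here is a small standalone sketch (a mirror of the logic above written for illustration only, not the library helper, and covering only plain int block sizes): squeezed dims are filtered out, so they never reach the SMEM allocation.

from jax.experimental import pallas as pl

def block_shape_without_squeezed(block_shape):
  # Mirror of the filtering above: dims given as None or pl.Squeezed() are
  # dropped; plain int sizes are kept as-is.
  return tuple(
      bd for bd in block_shape
      if not (bd is None or isinstance(bd, pl.Squeezed))
  )

# (None, 256) -> (256,), so the SMEM buffer allocated later in this diff is
# (slots, 256) rather than (slots, 1, 256).
assert block_shape_without_squeezed((None, 256)) == (256,)
assert block_shape_without_squeezed((pl.Squeezed(), 256)) == (256,)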
@@ -84,18 +91,27 @@ def get_ref_for_slot(
       return self.gmem_ref
     return self.smem_ref.at[slot]

-  def compute_gmem_slice(self, grid_indices) -> tuple[pl.Slice, ...]:
+  def compute_gmem_slice(self, grid_indices) -> tuple[pl.Slice | jax.Array, ...]:
     index_map = self.spec.index_map
     assert index_map is not None
     # We don't allow Python scalars here, because they are interpreted
     # differently depending on the x32/x64 mode.
     assert all(i.dtype == jnp.dtype(jnp.int32) for i in grid_indices)
-    sizes = _get_block_shape(self.spec)
+
+    def _make_block_slice(block_index: jax.Array, bd: pl.BlockDim):
+      match bd:
+        case int():
+          return pl.Slice(block_index * bd, bd)
+        case pl.Blocked(block_size):
+          return pl.Slice(block_index * block_size, block_size)
+        case None | pl.Squeezed():
+          return block_index
+        case _:
+          raise ValueError(f"Unsupported block dimension type: {bd}")
+
     return tuple(
-        pl.Slice(idx * size, size)  # type: ignore[arg-type]
-        for idx, size in zip(
-            index_map(*grid_indices), sizes  # type: ignore[arg-type]
-        )
+        _make_block_slice(bd, idx)
+        for bd, idx in zip(index_map(*grid_indices), self.spec.block_shape)
     )

   def copy_in(self, slot, grid_indices, barrier_ref, barrier_slot=None):
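With squeezed dims, compute_gmem_slice no longer returns only pl.Slice objects: a squeezed entry is the bare block index. A hedged worked example, assuming block_shape=(None, 256) and index_map=lambda i: (i, 0) as in the tests added below (example_step_entries is a hypothetical name used only for this sketch):

from jax.experimental import pallas as pl

# At pipeline step i, index_map yields block indices (i, 0), so the resulting
# GMEM slice entries would be:
#   dim 0 (squeezed): the scalar index i, not a pl.Slice
#   dim 1 (blocked, size 256): pl.Slice(0 * 256, 256)
# i.e. the copy reads x[i, 0:256] and the SMEM window has shape (256,).
def example_step_entries(i):
  return (i, pl.Slice(0 * 256, 256))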
@@ -177,6 +193,10 @@ class _Slice:
   start: int | jax.Array
   size: int | jax.Array

+  @classmethod
+  def from_val(cls, s: pl.Slice | jax.Array):
+    return cls(s.start, s.size) if isinstance(s, pl.Slice) else cls(s, 1)
+
   def __eq__(self, other: _Slice) -> jax.Array:  # type: ignore
     return lax.bitwise_and(self.start == other.start, self.size == other.size)

@@ -372,7 +392,7 @@ def loop_body(step, carry):
         continue
       assert last_store_slices[idx] is not None
       new_store_slices[idx] = tuple(
-          _Slice(s.start, s.size) for s in bref.compute_gmem_slice(indices)
+          _Slice.from_val(s) for s in bref.compute_gmem_slice(indices)
       )
       are_same_slices = map(
           lambda old, new: old == new,
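Because a squeezed entry is a bare index rather than a pl.Slice, the store-slice comparison above now normalizes both forms through _Slice.from_val. A standalone sketch of that normalization (an illustrative mirror, not the private helper): a bare index is treated as a size-1 slice starting at that index.

from jax.experimental import pallas as pl

def slice_start_and_size(s):
  # Mirror of _Slice.from_val above: keep (start, size) for a pl.Slice,
  # treat a bare index from a squeezed dim as a size-1 slice.
  return (s.start, s.size) if isinstance(s, pl.Slice) else (s, 1)

assert slice_start_and_size(pl.Slice(32, 16)) == (32, 16)
assert slice_start_and_size(7) == (7, 1)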
@@ -690,7 +710,7 @@ def _get_scoped_allocs(*gmem_refs: AbstractRefPytree):
     slots = max_concurrent_steps if has_seq_dim else 1
     smem_allocs.append(
         gpu_core.SMEM(
-            (slots, *spec.block_shape),  # type: ignore
+            (slots, *_get_block_shape(spec)),  # type: ignore
            gmem_ref.dtype,
            transforms=getattr(spec, "transforms", ()),
        )
@@ -880,7 +900,7 @@ def compute_loop_body(step, carry):
         continue
       assert last_store_slices[idx] is not None
       new_store_slices[idx] = tuple(
-          _Slice(s.start, s.size) for s in bref.compute_gmem_slice(indices)
+          _Slice.from_val(s) for s in bref.compute_gmem_slice(indices)
       )
       are_same_slices = map(
           lambda old, new: old == new,

tests/pallas/mosaic_gpu_test.py

Lines changed: 59 additions & 0 deletions
@@ -4734,6 +4734,33 @@ def kernel_body(_, o_smem, carry):
         kernel_fn(), jnp.tile(jnp.repeat(jnp.arange(num_steps), 64), (64, 1))
     )

+  @parameterized.parameters((pl.Squeezed(),), (None,))
+  def test_emit_with_squeezed_dim(self, squeezed_dim):
+
+    shape = (16, 256)
+    num_steps = shape[0]
+
+    def kernel(x_gmem, o_gmem):
+      plgpu.emit_pipeline(
+          kernel_body,
+          in_specs=[pl.BlockSpec((squeezed_dim, shape[1]), lambda i: (i, 0))],
+          out_specs=[pl.BlockSpec((squeezed_dim, shape[1]), lambda i: (i, 0))],
+          grid=(num_steps,),
+          max_concurrent_steps=2,
+      )(x_gmem, o_gmem)
+
+    def kernel_body(_, in_smem, o_smem):
+      o_smem[...] = in_smem[...] + 1
+
+    kernel_fn = self.pallas_call(
+        kernel,
+        in_specs=[pl.BlockSpec(memory_space=plgpu.GMEM)],
+        out_specs=pl.BlockSpec(memory_space=plgpu.GMEM),
+        out_shape=jax.ShapeDtypeStruct((16, 256), jnp.int32),
+    )
+    x = jnp.arange(16 * 256, dtype=jnp.int32).reshape(16, 256)
+    np.testing.assert_array_equal(kernel_fn(x), x + 1)
+

 class PipelineWGTest(
     PipelineTest, lowering_semantics=plgpu.LoweringSemantics.Warpgroup
@@ -5423,6 +5450,38 @@ def pipeline_body(_, x_smem, o_smem):
     )
     np.testing.assert_array_equal(y, np.stack([x + 1.0, x + 1.0]))

+  @parameterized.parameters((pl.Squeezed(),), (None,))
+  def test_emit_with_squeezed_dim(self, squeezed_dim):
+    self.skip_if_wg_semantics()
+
+    shape = (16, 256)
+    num_steps = shape[0]
+
+    def kernel(x_gmem, o_gmem):
+      plgpu.emit_pipeline_warp_specialized(
+          kernel_body,
+          in_specs=[pl.BlockSpec((squeezed_dim, shape[1]), lambda i: (i, 0))],
+          out_specs=[pl.BlockSpec((squeezed_dim, shape[1]), lambda i: (i, 0))],
+          grid=(num_steps,),
+          max_concurrent_steps=2,
+          num_compute_wgs=1,
+          memory_registers=40,
+          wg_axis="wg",
+      )(x_gmem, o_gmem)
+
+    def kernel_body(_, in_smem, o_smem):
+      o_smem[...] = in_smem[...] + 1
+
+    kernel_fn = self.kernel(
+        kernel,
+        out_shape=jax.ShapeDtypeStruct((16, 256), jnp.int32),
+        num_threads=2,
+        thread_name="wg",
+    )
+    x = jnp.arange(16 * 256, dtype=jnp.int32).reshape(16, 256)
+    np.testing.assert_array_equal(kernel_fn(x), x + 1)
+
+

 class WarpSpecializedPipelineWGTest(
     WarpSpecializedPipelineTest,
