Commit e2815d5

superbobry authored and Google-ML-Automation committed
[pallas:mosaic] pltpu.emit_pipeline now accepts block specs in HBM
This makes it possible to use it to implement pipelining in the `pallas_call` lowering on SparseCore.

PiperOrigin-RevId: 846255621
1 parent 69a6cca commit e2815d5
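
For orientation: the diff below widens the set of buffer memory spaces that block specs passed to pltpu.emit_pipeline may request from {SMEM, VMEM} to {SMEM, VMEM, HBM}, so a pipeline operand can stay resident in HBM instead of being staged through VMEM. A self-contained sketch of the pattern this enables, adapted from the test added in this commit (TPU-only; the shapes and grid are illustrative):

import functools

import jax
import jax.numpy as jnp
import numpy as np
from jax.experimental import pallas as pl
from jax.experimental.pallas import tpu as pltpu

@functools.partial(
    pl.pallas_call,
    out_shape=jax.ShapeDtypeStruct((8, 512), jnp.int32),
    in_specs=[pl.BlockSpec(memory_space=pltpu.HBM)],
    out_specs=pl.BlockSpec(memory_space=pltpu.HBM),
)
def kernel(x_hbm_ref, o_hbm_ref):
  @functools.partial(
      pltpu.emit_pipeline,
      grid=(4,),
      # Inputs are staged into VMEM one (8, 128) block per grid step...
      in_specs=pl.BlockSpec((8, 128), lambda i: (0, i)),
      # ...but the output block stays in HBM, which is what this commit enables.
      out_specs=pl.BlockSpec((8, 512), lambda i: (0, 0),
                             memory_space=pltpu.HBM),
  )
  def pipeline(x_ref, o_ref):
    i = pl.program_id(0)
    # Copy the current VMEM input block into its slice of the HBM output.
    pltpu.sync_copy(x_ref, o_ref.at[:, pl.ds(i * 128, 128)])

  pipeline(x_hbm_ref, o_hbm_ref)

x = jnp.arange(8 * 512, dtype=jnp.int32).reshape(8, 512)
np.testing.assert_allclose(kernel(x), x)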

File tree

jax/_src/pallas/mosaic/pipeline.py
tests/pallas/tpu_pallas_pipeline_test.py

2 files changed: +34 −14 lines changed


jax/_src/pallas/mosaic/pipeline.py

Lines changed: 9 additions & 14 deletions
@@ -40,6 +40,7 @@
 
 SMEM = tpu_core.MemorySpace.SMEM
 VMEM = tpu_core.MemorySpace.VMEM
+HBM = tpu_core.MemorySpace.HBM
 ANY = pallas_core.MemorySpace.ANY
 REF = pallas_core.MemoryRef
 GridDimensionSemantics = tpu_core.GridDimensionSemantics

@@ -582,13 +583,13 @@ def create(
       accum_ref = VMEM.from_type(ty.update(shape=block_shape))
     else:
       accum_ref = None
-    if source_memory_space == VMEM:
-      # We don't need to do any double-buffering in the case that our pipeline
-      # reference is already in VMEM, we just need allocate the accumulation
-      # buffer and we will refer to the original reference slices directly.
-      if spec.memory_space not in (VMEM, None):
-        raise ValueError(
-            f"Cannot hold a non-buffered ref in {spec.memory_space=}")
+    buffer_memory_space = (
+        VMEM if spec.memory_space is None else spec.memory_space)
+    if buffer_memory_space not in (SMEM, VMEM, HBM):
+      raise ValueError(
+          f"Unsupported buffer memory space: {buffer_memory_space}"
+      )
+    if source_memory_space is buffer_memory_space:
       return cls(
           _spec=spec,
           _buffer_type=buffer_type,

@@ -609,12 +610,6 @@ def create(
           swap=None,
       )
     else:
-      buffer_memory_space = (
-          VMEM if spec.memory_space is None else spec.memory_space)
-      if buffer_memory_space not in (SMEM, VMEM):
-        raise ValueError(
-            f"Unsupported buffer memory space: {buffer_memory_space}"
-        )
       if use_lookahead and grid_rank is None:
         raise ValueError(
             "grid_rank must be specified when use_lookahead is True."

@@ -1335,7 +1330,7 @@ def out_of_fetch(self, buffered_ref):
     # Currently this is based on the iteration, but if we want to support
     # lookahead this will depend on whether the lookahead reached the end.
     if not buffered_ref.is_buffered:
-      return False
+      return jnp.bool(False)
     return self.step >= (self.num_steps - buffered_ref.buffer_count + 1)
 
   def has_changed(self, buffered_ref):
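
Two functional changes are folded into this file. The buffer memory space is now resolved and validated up front, and the no-double-buffering fast path triggers whenever the source ref already lives in the requested buffer space, not only when both are VMEM. Separately, out_of_fetch returns jnp.bool(False) rather than Python False, which keeps its return type a JAX boolean on every path (the other branch returns a traced comparison). A minimal sketch of the resolution rule, using a stand-in enum in place of tpu_core.MemorySpace:

from enum import Enum

class MemorySpace(Enum):
  # Stand-in for tpu_core.MemorySpace; the real code uses that enum.
  SMEM = "smem"
  VMEM = "vmem"
  HBM = "hbm"

def resolve_buffer_memory_space(spec_memory_space, source_memory_space):
  """Mirrors, in simplified form, the new logic in pipeline.py's create()."""
  # A block spec with no explicit memory space defaults to VMEM.
  buffer_memory_space = (
      MemorySpace.VMEM if spec_memory_space is None else spec_memory_space)
  if buffer_memory_space not in (
      MemorySpace.SMEM, MemorySpace.VMEM, MemorySpace.HBM):
    raise ValueError(f"Unsupported buffer memory space: {buffer_memory_space}")
  # If the source already lives in the requested space, no staging buffer
  # (and hence no double-buffering) is needed; the pipeline can address
  # slices of the original ref directly.
  needs_staging = source_memory_space is not buffer_memory_space
  return buffer_memory_space, needs_staging

# Example: an HBM source with an HBM block spec needs no staging buffer.
assert resolve_buffer_memory_space(MemorySpace.HBM, MemorySpace.HBM) == (
    MemorySpace.HBM, False)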

tests/pallas/tpu_pallas_pipeline_test.py

Lines changed: 25 additions & 0 deletions
@@ -148,6 +148,31 @@ def body(o_ref):
     )()
     np.testing.assert_allclose(out, jnp.full_like(out, 42))
 
+  def test_hbm_output(self):
+    @functools.partial(
+        pl.pallas_call,
+        out_shape=jax.ShapeDtypeStruct((8, 512), jnp.int32),
+        in_specs=[pl.BlockSpec(memory_space=pltpu.HBM)],
+        out_specs=pl.BlockSpec(memory_space=pltpu.HBM),
+    )
+    def kernel(x_hbm_ref, o_hbm_ref):
+      @functools.partial(
+          pltpu.emit_pipeline,
+          grid=(4,),
+          in_specs=pl.BlockSpec((8, 128), lambda i: (0, i)),
+          out_specs=pl.BlockSpec(
+              (8, 512), lambda i: (0, 0), memory_space=pltpu.HBM
+          ),
+      )
+      def pipeline(x_ref, o_ref):
+        i = pl.program_id(0)
+        pltpu.sync_copy(x_ref, o_ref.at[:, pl.ds(i * 128, 128)])
+
+      pipeline(x_hbm_ref, o_hbm_ref)
+
+    x = jnp.arange(8 * 512).reshape(8, 512)
+    np.testing.assert_allclose(kernel(x), x)
+
   @parameterized.product(
       no_pipelining=[False, True],
   )
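
The test doubles as a usage reference: the inner pipeline's out_specs names a full-size (8, 512) block with a constant index map and memory_space=pltpu.HBM, so the output is never staged into VMEM; each of the four grid steps copies one 128-column VMEM input block directly into its slice of the HBM output via pltpu.sync_copy.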
