
Commit 7a007ea

bchetioui authored and Google-ML-Automation committed
[Pallas/Mosaic GPU] Add support for s8 WGMMA with lhs in registers.

The low-level support only allows `swizzle=64` or larger for the time being.

PiperOrigin-RevId: 826554680
1 parent 14d89b5 commit 7a007ea

File tree

3 files changed: +38 −0 lines changed

jax/_src/pallas/mosaic_gpu/core.py
jax/experimental/mosaic/gpu/__init__.py
tests/pallas/mosaic_gpu_test.py
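Before the raw diffs, a minimal kernel-side sketch of what the commit enables, mirroring the test added below. This is an illustration, not part of the commit; the (64, 192) accumulator shape, the dtypes, and the API calls are taken from the new test.

import jax.numpy as jnp
from jax.experimental import pallas as pl
from jax.experimental.pallas import mosaic_gpu as plgpu

def kernel(a_ref, b_ref, o_ref):
  def scope(acc_ref):
    # New in this commit: load the s8 lhs from SMEM into registers using
    # the 8-bit WGMMA register layout.
    a_regs = plgpu.load(a_ref, (), layout=plgpu.Layout.WGMMA_8BIT)
    # The rhs stays in SMEM; here it is stored as (n, k), so the ref is
    # transposed before the WGMMA.
    plgpu.wgmma(acc_ref, a_regs, plgpu.transpose_ref(b_ref, (1, 0)))
    return acc_ref[...]
  # s8 x s8 products are accumulated into an int32 accumulator.
  o_ref[...] = pl.run_scoped(scope, plgpu.ACC((64, 192), jnp.int32))

Per the commit message, the operands' memory transforms must use swizzle=64 or larger; the test below requests this via self.default_transforms(swizzle=64, dtype=input_dtype).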

jax/_src/pallas/mosaic_gpu/core.py
Lines changed: 4 additions & 0 deletions

@@ -1437,6 +1437,7 @@ def to_mgpu(self) -> mgpu.FragmentedLayout:
 class Layout(SomeLayout, enum.Enum):
   #: [m, n] matrix, where m % 64 == 0 == n % 8.
   WGMMA = enum.auto()
+  WGMMA_8BIT = enum.auto()
   WGMMA_UPCAST_2X = enum.auto()
   WGMMA_UPCAST_4X = enum.auto()
   WGMMA_TRANSPOSED = enum.auto()
@@ -1472,6 +1473,9 @@ def check_no_args():
       case Layout.WGMMA:
         check_no_args()
         return mgpu.WGMMA_LAYOUT
+      case Layout.WGMMA_8BIT:
+        check_no_args()
+        return mgpu.WGMMA_LAYOUT_8BIT
       case Layout.WGMMA_UPCAST_2X:
         check_no_args()
         return mgpu.WGMMA_LAYOUT_UPCAST_2X
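Net effect of the two hunks: the new Pallas-level enum member resolves to the low-level Mosaic GPU layout and takes no layout parameters (that is what check_no_args() enforces). A minimal sketch of the mapping, assuming Layout is reachable through the plgpu namespace as in the test below:

import jax.experimental.mosaic.gpu as mgpu
from jax.experimental.pallas import mosaic_gpu as plgpu

# The new match case forwards WGMMA_8BIT to the low-level layout constant
# that the next file re-exports.
assert plgpu.Layout.WGMMA_8BIT.to_mgpu() == mgpu.WGMMA_LAYOUT_8BIT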

jax/experimental/mosaic/gpu/__init__.py
Lines changed: 1 addition & 0 deletions

@@ -63,6 +63,7 @@
     TCGEN05_COL_LAYOUT as TCGEN05_COL_LAYOUT,
     TiledLayout as TiledLayout,
     WGMMA_LAYOUT as WGMMA_LAYOUT,
+    WGMMA_LAYOUT_8BIT as WGMMA_LAYOUT_8BIT,
     WGMMA_ROW_LAYOUT as WGMMA_ROW_LAYOUT,
     WGMMA_COL_LAYOUT as WGMMA_COL_LAYOUT,
     WGMMA_TRANSPOSED_LAYOUT as WGMMA_TRANSPOSED_LAYOUT,
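This re-export makes the new layout importable from the package root. A minimal sketch, assuming a JAX build that contains this commit:

# The one-line hunk above is what makes this import resolve.
from jax.experimental.mosaic.gpu import WGMMA_LAYOUT_8BIT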

tests/pallas/mosaic_gpu_test.py
Lines changed: 33 additions & 0 deletions

@@ -2997,6 +2997,39 @@ def scope(acc_ref):
     )(a, b)
     np.testing.assert_allclose(res, a @ b, rtol=1e-3)

+  def test_wgmma_registers_integer(self):
+    # TODO(bchetioui): plumb in is_signed into WGMMA lowering and allow an
+    # integer accumulator type to be created.
+    self.skip_if_wg_semantics()
+    input_dtype = jnp.int8
+    out_dtype = jnp.int32
+    def kernel(a_ref, b_ref, o_ref):
+      def scope(acc_ref):
+        a_regs = plgpu.load(a_ref, (), layout=plgpu.Layout.WGMMA_8BIT)
+        plgpu.wgmma(acc_ref, a_regs, plgpu.transpose_ref(b_ref, (1, 0)))
+        return acc_ref[...]
+      o_ref[...] = pl.run_scoped(scope, plgpu.ACC((64, 192), out_dtype))
+
+    key1, key2 = jax.random.split(jax.random.key(42), 2)
+    m = 64
+    k = 128
+    n = 192
+    a = jax.random.randint(key1, shape=(m, k), minval=-128, maxval=127, dtype=input_dtype)
+    b = jax.random.randint(key2, shape=(n, k), minval=-128, maxval=127, dtype=input_dtype)
+
+    transforms = self.default_transforms(swizzle=64, dtype=input_dtype)
+    res = self.pallas_call(
+        kernel,
+        in_specs=[
+            plgpu.BlockSpec(transforms=transforms),
+            plgpu.BlockSpec(transforms=transforms),
+        ],
+        out_shape=jax.ShapeDtypeStruct((64, 192), out_dtype),
+    )(a, b)
+    np.testing.assert_array_equal(
+        res, a.astype(out_dtype) @ b.T.astype(out_dtype)
+    )
+
   def test_wgmma_registers_init(self):
     def kernel(a_ref, b_ref, i_ref, o_ref):
       def scope(acc_ref):
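A note on the reference check: b is generated as an (n, k) array and transposed inside the kernel via plgpu.transpose_ref, so the host-side reference multiplies by b.T, widening both operands to int32 first. Because integer WGMMA accumulates exactly, the test can use np.testing.assert_array_equal rather than the rtol-based comparison of the floating-point test above.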
