
TMA_DESCRIPTOR_BYTES = 128
TMA_DESCRIPTOR_ALIGNMENT = 64
-TMAReductionOp = Literal["add", "min", "max", "inc", "dec", "and", "or", "xor"]
+TMAReductionOp = Literal[
+    "add",
+    "min",
+    "max",
+    "inc",
+    "dec",
+    "and",
+    "or",
+    "xor",
+    "umin",
+    "umax",
+    "smin",
+    "smax",
+]
+
+def _reduction_op_to_ptx(reduction_op: TMAReductionOp) -> str:
+  # Convert [s|u]min and [s|u]max to min/max; all other op names are unchanged.
+  return reduction_op[-3:]

c = utils.c  # This is too common to fully qualify.

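Side note (illustrative, not part of the diff): `_reduction_op_to_ptx` relies on the `[-3:]` slice because only the signed/unsigned min/max variants carry a one-letter prefix; every other op name is at most three characters long, so it passes through unchanged. A minimal sketch of the expected mapping:

    assert _reduction_op_to_ptx("smin") == "min"
    assert _reduction_op_to_ptx("umax") == "max"
    assert _reduction_op_to_ptx("add") == "add"
    assert _reduction_op_to_ptx("xor") == "xor"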
@@ -426,6 +443,81 @@ def _find_kernel_argument_for_gmem_ref(
  return gmem_ref


+def _is_tma_reduction_op_supported(
+    reduction_op: TMAReductionOp | None, dtype: ir.Type,
+) -> bool:
+  """Returns whether the given TMA reduction op supports the given dtype.
+
+  This function essentially implements the table at:
+  https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor
+  with the following differences:
+  - For `add` reductions, we also support int64, treating it as uint64.
+  - For `and`, `or`, and `xor` reductions, we support signed integer types.
+  - For `inc` and `dec` reductions, we support both signed and unsigned i32,
+    treating both as unsigned.
+  """
+  i32 = ir.IntegerType.get_signless(32)
+  i64 = ir.IntegerType.get_signless(64)
+  f16 = ir.F16Type.get()
+  f32 = ir.F32Type.get()
+  bf16 = ir.BF16Type.get()
+
+  match reduction_op:
+    case None:
+      return True
+    case "add":
+      return dtype in (f16, f32, bf16, i32, i64)
+    case "max" | "min":
+      return dtype in (f16, bf16)
+    case "umax" | "umin" | "smax" | "smin":
+      return dtype in (i32, i64)
+    case "inc" | "dec":
+      return dtype == i32
+    case "and" | "or" | "xor":
+      return dtype in (i32, i64)
+
+
+def _tma_dma_type(
+    element_type: ir.Type,
+    reduction_op: TMAReductionOp | None,
+) -> int:
+  """Returns the TMA DMA type for the given element type and reduction op."""
+  if ir.IntegerType.isinstance(element_type):
+    bitwidth = utils.bitwidth_impl(element_type)
+    if bitwidth == 2:
+      tma_dtype = 8
+    elif bitwidth == 4:
+      tma_dtype = 0
+    elif bitwidth == 8:
+      tma_dtype = 1
+    elif bitwidth == 16:
+      tma_dtype = 2
+    elif bitwidth == 32:
+      tma_dtype = 9 if reduction_op in ("smin", "smax") else 3
+    elif bitwidth == 64:
+      tma_dtype = 10 if reduction_op in ("smin", "smax") else 4
+    else:
+      raise ValueError(f"Unsupported integer bitwidth: {bitwidth}")
+  elif ir.F16Type.isinstance(element_type):
+    tma_dtype = 5
+  elif ir.F32Type.isinstance(element_type):
+    tma_dtype = 6
+  elif ir.BF16Type.isinstance(element_type):
+    tma_dtype = 7
+  # We treat narrow floats as integers.
+  elif ir.Float8E5M2Type.isinstance(element_type):
+    tma_dtype = 1
+  elif ir.Float8E4M3FNType.isinstance(element_type):
+    tma_dtype = 1
+  elif ir.Float8E8M0FNUType.isinstance(element_type):
+    tma_dtype = 1
+  elif ir.Float4E2M1FNType.isinstance(element_type):
+    tma_dtype = 0
+  else:
+    raise ValueError(f"unsupported TMA dtype {element_type}")
+  return tma_dtype
+
+
class AsyncCopyImplementation(enum.Enum):
  TMA = enum.auto()
  CP_ASYNC = enum.auto()
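For reference, a minimal usage sketch of the new predicate (illustrative only; it assumes the MLIR Python bindings are importable as `ir`, e.g. `from jaxlib.mlir import ir`, and it needs an active MLIR context because the checks compare live MLIR types):

    from jaxlib.mlir import ir

    with ir.Context():
      i32 = ir.IntegerType.get_signless(32)
      assert _is_tma_reduction_op_supported("add", ir.F32Type.get())
      assert _is_tma_reduction_op_supported("smax", i32)  # signed 32-bit max
      assert not _is_tma_reduction_op_supported("min", i32)  # ints need "smin"/"umin"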
@@ -438,7 +530,7 @@ class LaunchContext:
  cluster_size: tuple[int, int, int]
  profiler: OnDeviceProfiler | None = None
  tma_descriptors: dict[
-      tuple[ir.Value, tuple[int, ...], int | None, tuple[MemRefTransform, ...], Any],
+      tuple[ir.Value, tuple[int, ...], int | None, tuple[MemRefTransform, ...], Any, int],
      ir.Value,
  ] = dataclasses.field(default_factory=dict, init=False)
  is_device_collective: bool = False
@@ -512,10 +604,11 @@ def _get_tma_desc(
      reduction_op: TMAReductionOp | None,
  ):
    gmem_ref = _find_kernel_argument_for_gmem_ref(gmem_ref)
+    tma_dtype = _tma_dma_type(ir.MemRefType(gmem_ref.type).element_type, reduction_op)
    # Using ir.Values in cache keys is a little sketchy, but I think it should
    # be fine. Having it in the key will keep it alive, and if comparison and
    # hashing is by identity then it should work out.
-    tma_desc_key = (gmem_ref, transformed_slice_shape, swizzle, gmem_transform, gmem_peer_id)
+    tma_desc_key = (gmem_ref, transformed_slice_shape, swizzle, gmem_transform, gmem_peer_id, tma_dtype)
    if (tma_desc := self.tma_descriptors.get(tma_desc_key, None)) is None:
      i32 = ir.IntegerType.get_signless(32)
      i64 = ir.IntegerType.get_signless(64)
@@ -580,43 +673,6 @@ def init_tma_desc(host_ptr):
        )
        # TODO(apaszke): Better verification (e.g. slice is non-zero)
        # TODO(apaszke): We always know strides statically.
-        if isinstance(ref_ty.element_type, ir.IntegerType):
-          if reduction_op is not None:
-            raise ValueError(
-                f"TMA with reduction_op={reduction_op} is not supported with Integers"
-            )
-          bitwidth = utils.bitwidth_impl(ref_ty.element_type)
-          if bitwidth == 2:
-            tma_dtype = 8
-          elif bitwidth == 4:
-            tma_dtype = 0
-          elif bitwidth == 8:
-            tma_dtype = 1
-          elif bitwidth == 16:
-            tma_dtype = 2
-          elif bitwidth == 32:
-            tma_dtype = 3
-          elif bitwidth == 64:
-            tma_dtype = 4
-          else:
-            raise ValueError(f"Unsupported integer bitwidth: {bitwidth}")
-        elif ir.F16Type.isinstance(ref_ty.element_type):
-          tma_dtype = 5
-        elif ir.F32Type.isinstance(ref_ty.element_type):
-          tma_dtype = 6
-        elif ir.BF16Type.isinstance(ref_ty.element_type):
-          tma_dtype = 7
-        # We treat narrow floats as integers
-        elif ir.Float8E5M2Type.isinstance(ref_ty.element_type):
-          tma_dtype = 1
-        elif ir.Float8E4M3FNType.isinstance(ref_ty.element_type):
-          tma_dtype = 1
-        elif ir.Float8E8M0FNUType.isinstance(ref_ty.element_type):
-          tma_dtype = 1
-        elif ir.Float4E2M1FNType.isinstance(ref_ty.element_type):
-          tma_dtype = 0
-        else:
-          raise ValueError(f"unsupported TMA dtype {ref_ty.element_type}")
        dtype_or_bitwidth = c(tma_dtype, i64)
        args = [
            host_ptr,
@@ -953,16 +1009,10 @@ def async_copy(
    if reduction_op is not None:
      if implementation != AsyncCopyImplementation.TMA:
        raise ValueError("Only the TMA implementation supports reductions")
-      if not any(
-          t.isinstance(element_type)
-          for t in (ir.F32Type, ir.BF16Type, ir.F16Type)
-      ):
-        raise ValueError(
-            "TMA with reduction is only supported with f32, f16 and bf16"
-        )
-      if reduction_op != "add":
+      if not _is_tma_reduction_op_supported(reduction_op, element_type):
        raise ValueError(
-            "TMA with reduction is only supported with add operation"
+            f"Reduction op {reduction_op} not supported by the TMA"
+            f" implementation for element type {element_type}"
        )

    if src_ref_ty.memory_space is None and utils.is_smem_ref(dst_ref_ty):
@@ -1329,7 +1379,7 @@ def async_copy(
          llvm.inline_asm(
              ir.Type.parse("!llvm.void"),
              [predicate, smem_ptr, tma_desc, *rev_dyn_base_indices],
-              f"@$0 cp.reduce.async.bulk.tensor.{rank}d.global.shared::cta.{reduction_op}.tile.bulk_group [$2,{{{idx_operands}}}], [$1];",
+              f"@$0 cp.reduce.async.bulk.tensor.{rank}d.global.shared::cta.{_reduction_op_to_ptx(reduction_op)}.tile.bulk_group [$2,{{{idx_operands}}}], [$1];",
              "b,r,l" + ",r" * rank,
              has_side_effects=True,
          )
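As a worked illustration of the PTX this now emits (assuming `rank == 2`, `reduction_op == "umax"`, and that `idx_operands` expands to `"$3,$4"` for the two dynamic indices), the instruction string becomes roughly:

    @$0 cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [$2,{$3,$4}], [$1];

The signed/unsigned distinction is no longer spelled in the op suffix; it travels through the TMA descriptor's data type, which `_tma_dma_type` above now picks based on the reduction op.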