Commit 3757451
[Distributed] one_shot_allreduce_bias_rmsnorm example (#1266)
1 parent a0265ef commit 3757451
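
For orientation, the fused operation implemented by the new example computes, per row of the (N, D) input (this matches reference_one_shot_allreduce_bias_rmsnorm in the diff below; R is the world size, b the bias, w the weight, and the mean is taken over the feature dimension D):

    s = \sum_{r=0}^{R-1} x^{(r)} + b,
    \qquad
    y = \frac{s}{\sqrt{\operatorname{mean}\left(s^{2}\right) + \varepsilon}} \odot w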

File tree

7 files changed: +629 -2 lines changed


examples/__init__.py

Whitespace-only changes.

examples/distributed/__init__.py

Whitespace-only changes.
examples/distributed/one_shot_allreduce_bias_rmsnorm.py

Lines changed: 184 additions & 0 deletions
@@ -0,0 +1,184 @@
"""
One-Shot All-Reduce + Bias + RMS Norm Fusion Example
=====================================================
This example demonstrates how to implement a fused one-shot all-reduce with bias
addition and RMS normalization using Helion and PyTorch's distributed capabilities.
It includes a Helion kernel demonstrating how to use the symm_mem_sync Triton kernel
for cross-device synchronization and torch.ops.symm_mem.get_remote_tensors for
accessing symmetric memory tensors on peer devices.
"""

from __future__ import annotations

import os

import torch
import torch.distributed as dist
import torch.distributed._symmetric_memory as symm_mem

from examples.distributed.utils import symm_mem_sync

import helion
from helion._testing import DEVICE
from helion._testing import run_example
import helion.language as hl


@helion.jit(
    config=helion.Config(
        block_sizes=[8],
        num_warps=8,
    ),
    static_shapes=True,
)
def one_shot_allreduce_bias_rmsnorm_kernel(
    x: torch.Tensor,
    symm_mem_buffer: torch.Tensor,
    bias: torch.Tensor,
    weight: torch.Tensor,
    signal_pad_ptrs: torch.Tensor,
    EPS: hl.constexpr,
    RANK: hl.constexpr,
    WORLD_SIZE: hl.constexpr,
    GROUP_NAME: hl.constexpr,
) -> torch.Tensor:
    """
    Fused one-shot all-reduce + bias addition + RMS normalization.
    """
    N, D = x.size()
    output = torch.empty_like(x)

    # Get remote buffers from all ranks (views into each rank's symm_mem_buffer)
    buffer_tuple = torch.ops.symm_mem.get_remote_tensors(symm_mem_buffer, GROUP_NAME)

    for tile_n in hl.tile(N):
        # Step 1: Copy input x to our symmetric memory buffer
        symm_mem_buffer[tile_n, :] = x[tile_n, :]

        # Step 2: Sync with hasPreviousMemAccess=True, hasSubsequentMemAccess=True
        # - release fence: ensures our write to symm_mem_buffer is visible to other ranks
        # - acquire fence: ensures we see other ranks' writes to their buffers
        hl.triton_kernel(
            symm_mem_sync,
            args=(signal_pad_ptrs, tile_n.id, RANK, WORLD_SIZE, True, True),
            output_like=None,
        )

        # Step 3: All-reduce + bias: acc = bias + sum(buffer from all ranks)
        # Initialize acc with the right shape by broadcasting bias
        acc = symm_mem_buffer[tile_n, :].to(torch.float32) * 0.0 + bias[None, :].to(
            torch.float32
        )
        for remote_buffer in buffer_tuple:
            acc = acc + remote_buffer[tile_n, :].to(torch.float32)

        # Step 4: RMS Norm: y = acc * rsqrt(mean(acc^2) + eps) * weight
        variance = torch.mean(acc * acc, dim=-1, keepdim=True)
        rstd = torch.rsqrt(variance + EPS)  # type: ignore[unsupported-operation]
        normalized = acc * rstd
        output[tile_n, :] = (normalized * weight[None, :].to(torch.float32)).to(x.dtype)

        # Step 5: Final sync (release only)
        hl.triton_kernel(
            symm_mem_sync,
            args=(signal_pad_ptrs, tile_n.id, RANK, WORLD_SIZE, True, False),
            output_like=None,
        )

    return output


def helion_one_shot_allreduce_bias_rmsnorm(
    x: torch.Tensor,  # Regular input tensor
    bias: torch.Tensor,
    weight: torch.Tensor,
    eps: float = 1e-5,
) -> torch.Tensor:
    """
    Wrapper that sets up symmetric memory and calls the Helion kernel.
    """
    group = dist.group.WORLD
    if group is None:
        raise RuntimeError("Distributed group is not initialized")

    N, D = x.shape

    symm_mem_buffer = symm_mem.empty(N, D, dtype=x.dtype, device=x.device)
    symm_mem_hdl = symm_mem.rendezvous(symm_mem_buffer, group.group_name)

    return one_shot_allreduce_bias_rmsnorm_kernel(
        x,
        symm_mem_buffer,
        bias,
        weight,
        symm_mem_hdl.signal_pad_ptrs_dev,
        EPS=eps,
        RANK=symm_mem_hdl.rank,
        WORLD_SIZE=symm_mem_hdl.world_size,
        GROUP_NAME=group.group_name,
    )


def reference_one_shot_allreduce_bias_rmsnorm(
    x: torch.Tensor,
    bias: torch.Tensor,
    weight: torch.Tensor,
    eps: float = 1e-5,
) -> torch.Tensor:
    x_reduced = x.clone()
    dist.all_reduce(x_reduced)
    x_with_bias = x_reduced + bias

    # RMS Norm
    variance = x_with_bias.to(torch.float32).pow(2).mean(-1, keepdim=True)
    rstd = torch.rsqrt(variance + eps)
    normalized = x_with_bias.to(torch.float32) * rstd
    return (normalized * weight.to(torch.float32)).to(x.dtype)


def test(N: int, D: int, device: torch.device, dtype: torch.dtype) -> None:
    """Test the Helion implementation against the reference."""
    rank = dist.get_rank()

    torch.manual_seed(42 + rank)
    x = torch.randn(N, D, dtype=dtype, device=device)

    torch.manual_seed(42)
    bias = torch.randn(D, dtype=dtype, device=device)
    weight = torch.randn(D, dtype=dtype, device=device)

    run_example(
        helion_one_shot_allreduce_bias_rmsnorm,
        reference_one_shot_allreduce_bias_rmsnorm,
        (x, bias, weight),
        rtol=1e-4,
        atol=1e-4,
    )


def main() -> None:
    symm_mem.set_backend("NVSHMEM")
    rank = int(os.environ["LOCAL_RANK"])
    torch.manual_seed(42 + rank)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    dist.init_process_group("nccl")
    symm_mem.enable_symm_mem_for_group(
        dist.group.WORLD.group_name  # type: ignore[missing-attribute]
    )

    test(N=128, D=4096, device=device, dtype=torch.float32)

    dist.destroy_process_group()


if __name__ == "__main__":
    """
    Run with:
    python -m torch.distributed.run --standalone \
        --nproc-per-node 4 \
        --rdzv-backend c10d --rdzv-endpoint localhost:0 \
        examples/distributed/one_shot_allreduce_bias_rmsnorm.py
    """
    assert DEVICE.type == "cuda", "Requires CUDA device"
    main()
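
For a quick local sanity check of the math, independent of any NCCL/NVSHMEM setup, the sketch below reproduces reference_one_shot_allreduce_bias_rmsnorm on a single process by stacking simulated per-rank inputs and replacing the all-reduce with an explicit sum. The helper name _local_reference and the toy sizes are illustrative, not part of this commit:

import torch

def _local_reference(xs, bias, weight, eps=1e-5):
    # xs: (world_size, N, D) simulated per-rank inputs; sum(dim=0) stands in for dist.all_reduce
    s = xs.sum(dim=0) + bias
    variance = s.to(torch.float32).pow(2).mean(-1, keepdim=True)
    normalized = s.to(torch.float32) * torch.rsqrt(variance + eps)
    return (normalized * weight.to(torch.float32)).to(xs.dtype)

xs = torch.randn(4, 8, 16)    # 4 simulated ranks, N=8, D=16
bias = torch.randn(16)
weight = torch.randn(16)
print(_local_reference(xs, bias, weight).shape)  # torch.Size([8, 16])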

examples/distributed/utils.py

Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,160 @@
from __future__ import annotations

import triton
import triton.language as tl


@triton.jit
def _get_tid():  # noqa: ANN202
    return tl.inline_asm_elementwise(
        """
        mov.u32 $0, %tid.x;
        mov.u32 $1, %tid.y;
        mov.u32 $2, %tid.z;
        """,
        "=r,=r,=r",
        [],
        dtype=(tl.uint32, tl.uint32, tl.uint32),
        is_pure=True,
        pack=1,
    )


@triton.jit
def _get_ntid():  # noqa: ANN202
    return tl.inline_asm_elementwise(
        """
        mov.u32 $0, %ntid.x;
        mov.u32 $1, %ntid.y;
        mov.u32 $2, %ntid.z;
        """,
        "=r,=r,=r",
        [],
        dtype=(tl.uint32, tl.uint32, tl.uint32),
        is_pure=True,
        pack=1,
    )


@triton.jit
def _get_flat_tid():  # noqa: ANN202
    tid_x, tid_y, tid_z = _get_tid()
    ntid_x, ntid_y, _ = _get_ntid()
    return tid_z * ntid_y * ntid_x + tid_y * ntid_x + tid_x


@triton.jit
def _get_flat_bid():  # noqa: ANN202
    return (
        tl.program_id(2) * tl.num_programs(1) * tl.num_programs(0)
        + tl.program_id(1) * tl.num_programs(0)
        + tl.program_id(0)
    )


@triton.jit
def _send_signal(addrs, sem: tl.constexpr) -> None:  # noqa: ANN001
    tl.inline_asm_elementwise(
        f"""
        {{
            .reg .u32 %tmp32_<1>;
            .reg .pred %p<1>;

            send_signal:
                atom.global.{sem}.sys.cas.b32 %tmp32_0, [$1], 0, 1;
                setp.eq.u32 %p0, %tmp32_0, 0;
                @!%p0 bra send_signal;
        }}
        """,
        "=r, l",
        [addrs],
        dtype=addrs.dtype,
        is_pure=False,
        pack=1,
    )


@triton.jit
def _wait_signal(addrs, sem: tl.constexpr) -> None:  # noqa: ANN001
    tl.inline_asm_elementwise(
        f"""
        {{
            .reg .u32 %tmp32_<1>;
            .reg .pred %p<1>;

            wait_signal:
                atom.global.sys.{sem}.cas.b32 %tmp32_0, [$1], 1, 0;
                setp.eq.u32 %p0, %tmp32_0, 1;
                @!%p0 bra wait_signal;
        }}
        """,
        "=r, l",
        [addrs],
        dtype=tl.int32,
        is_pure=False,
        pack=1,
    )


@triton.jit
def symm_mem_sync(
    signal_pad_ptrs,  # noqa: ANN001
    block_id,  # noqa: ANN001
    rank: tl.constexpr,
    world_size: tl.constexpr,
    hasPreviousMemAccess: tl.constexpr = False,  # pyrefly: ignore[bad-function-definition]
    hasSubsequentMemAccess: tl.constexpr = False,  # pyrefly: ignore[bad-function-definition]
) -> None:
    """
    Synchronizes blocks with matching block_id across participating devices.

    Note: the function itself is not a system-level barrier/fence. It is a
    building block for expressing different synchronization patterns.

    Pattern 0: Ensures that all writes to symm_mem buffers from previous
    kernels across all devices are visible to the current kernel:

        symm_mem_sync(..., hasPreviousMemAccess=False, hasSubsequentMemAccess=True)

    Pattern 1: Ensures that all writes to symm_mem buffers from the current
    block are visible to all remote blocks with matching blockIdx:

        symm_mem_sync(..., hasPreviousMemAccess=True, hasSubsequentMemAccess=True)

    Pattern 2: Ensures that symm_mem buffers read by the current kernel are safe
    for writing by subsequent kernels across all devices:

        symm_mem_sync(..., hasPreviousMemAccess=True, hasSubsequentMemAccess=False)

    CUDA graph friendliness:

        This barrier operates through atomic operations on a zero-filled signal
        pad, which resets to a zero-filled state after each successful
        synchronization. This design eliminates the need for incrementing a
        flag from the host.
    """
    if block_id is None:
        block_id = _get_flat_bid()
    flat_tid = _get_flat_tid()

    remote_ranks = tl.arange(0, world_size)
    signal_pad_ptrs = signal_pad_ptrs.to(tl.pointer_type(tl.uint64))
    remote_signal_pad_addrs = tl.load(signal_pad_ptrs + remote_ranks).to(
        tl.pointer_type(tl.uint32)
    )
    send_addrs = remote_signal_pad_addrs + block_id * world_size + rank

    local_signal_pad_addr = tl.load(signal_pad_ptrs + rank).to(
        tl.pointer_type(tl.uint32)
    )
    wait_addrs = local_signal_pad_addr + block_id * world_size + remote_ranks

    if hasPreviousMemAccess:
        tl.debug_barrier()

    if flat_tid < world_size:
        _send_signal(send_addrs, "release" if hasPreviousMemAccess else "relaxed")
        _wait_signal(wait_addrs, "acquire" if hasSubsequentMemAccess else "relaxed")

    if hasSubsequentMemAccess:
        tl.debug_barrier()
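
To make the signal-pad addressing in symm_mem_sync concrete, here is a small worked example in plain Python that mirrors the send_addrs/wait_addrs arithmetic above; the concrete numbers are illustrative only:

# Each rank signals slot (block_id * world_size + rank) on every remote pad
# (CAS 0 -> 1 in _send_signal) and waits on its local pad at slots
# (block_id * world_size + r) for every peer r (CAS 1 -> 0 in _wait_signal,
# which also restores the pad to zero; this is the "CUDA graph friendliness"
# noted in the docstring above).
world_size, rank, block_id = 4, 1, 2
send_slot = block_id * world_size + rank                              # 9
wait_slots = [block_id * world_size + r for r in range(world_size)]   # [8, 9, 10, 11]
print(send_slot, wait_slots)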

scripts/lint_examples_main.py

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,8 @@ def main() -> int:
     for filename in sys.argv[1:]:
         if not filename.startswith("examples/") or not filename.endswith(".py"):
             continue
+        if Path(filename).name in ["__init__.py", "utils.py"]:
+            continue
         if not has_main_function(filename):
             print(f"{filename} is missing a main() function.")
             failed = True
