aws-neuron · ymwangg · Jun 2, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026
diff --git a/examples/models/qwen3/dump_nki_ir.py b/examples/models/qwen3/dump_nki_ir.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""Dump the lowered nki_ir for each Qwen3 building-block pattern, for inspection.
+
+Traces each kernel, lowers tensor_ir -> nki_ir, and writes per pattern into
+nki_ir_dumps/<name>.nki: the tensor_ir graph, an nki opcode histogram, and the
+full nki_ir text dump. Lets us examine exactly what instructions each pattern
+expands into (and where the HBM round-trips / scaffolding are).
+
+This mirrors qwen3_embedding/dump_nki_ir.py but uses the Qwen3-30B-A3B MoE
+building blocks and per-rank (TP=4) shapes. torch.distributed is mocked to a
+TP=4 world (single process) so the head splits and intermediate sizes match a
+real shard, exactly like test_oom_repro.py.
+
+    uv run python dump_nki_ir.py
+"""
+
+import os
+from collections import Counter
+from unittest import mock
+
+import numpy as np
+import torch.distributed as dist
+
+# The kernels divide head counts / hidden by world_size, so the dump must see
+# the same TP=4 world the real shards are built for (see test_oom_repro.py).
+WORLD_SIZE = 4  # patched in before the kernel imports below; never stopped
+mock.patch.object(dist, "is_initialized", lambda: True).start()
+mock.patch.object(dist, "get_world_size", lambda *a, **k: WORLD_SIZE).start()
+mock.patch.object(dist, "get_rank", lambda *a, **k: 0).start()
+
+from kernels.rmsnorm import rmsnorm_kernel  # noqa: E402
+from kernels.softmax import softmax_kernel  # noqa: E402
+from kernels.feedforward import feedforward_kernel, silu_kernel_  # noqa: E402
+from kernels.rope import apply_rotary_emb_kernel  # noqa: E402
+
+from nkipy.core.trace import NKIPyKernel  # noqa: E402
+from nkigen_lite.tensor_ir.passes import lower_to_nki  # noqa: E402
+
+OUT_DIR = os.path.join(os.path.dirname(__file__), "nki_ir_dumps")
+DT = np.dtype("float32")  # f32 so both backends share one numerics path
+NORM_EPS = 1e-6  # eps value handed to rmsnorm_kernel by main()
+
+# Qwen3-30B-A3B dims (global; kernels divide head counts / intermediate by TP).
+HIDDEN = 2048  # last dim of HID and first dim of the projection weights
+HEAD_DIM = 128  # last dim of QSHAPE / KSHAPE
+NUM_HEADS = 32          # -> 8 local q heads at TP=4
+NUM_KV_HEADS = 4        # -> 1 local kv head at TP=4
+INTER = 768 // WORLD_SIZE  # per-rank expert intermediate = 192
+SEQ = 128  # sequence length used by every activation shape below
+
+N_LOCAL_HEADS = NUM_HEADS // WORLD_SIZE        # 8
+N_LOCAL_KV_HEADS = max(1, NUM_KV_HEADS // WORLD_SIZE)  # 1
+
+# Shapes for standalone building blocks.
+HID = (1, SEQ, HIDDEN)                         # hidden states for norm / ffn
+SCORE = (1, N_LOCAL_HEADS, SEQ, SEQ)           # attention scores for softmax
+QSHAPE = (1, SEQ, N_LOCAL_HEADS, HEAD_DIM)     # query post-reshape (BSHD)
+KSHAPE = (1, SEQ, N_LOCAL_KV_HEADS, HEAD_DIM)  # key post-reshape (BSHD)
+
+
+def matmul_gup(x, w):  # bare np.matmul; main() traces it as "matmul_qkv"
+    return np.matmul(x, w)
+
+
+def dump(name, fn, arrays):
+    """Trace `fn`, lower it to nki_ir and write OUT_DIR/<name>.nki.
+
+    `arrays` maps kernel argument names to the values passed to specialize().
+    The file holds the pre-lowering tensor_ir dump, an nki opcode histogram
+    and the full nki_ir dump.
+    """
+    kernel = NKIPyKernel.trace(fn, backend="nkigen-lite")
+    tensor_graph = kernel.specialize(**arrays)._graph
+    # lower_to_nki mutates the tensor graph in place (canonicalize/decompose),
+    # so take the "tensor_ir" text now instead of re-tracing a second kernel.
+    tensor_dump = tensor_graph.dump()
+    nki = lower_to_nki(tensor_graph)
+    hist = Counter(op.opcode for op in nki.ops)
+
+    rule = "=" * 70
+    inputs = ", ".join(f"{k}={getattr(v, 'shape', v)}" for k, v in arrays.items())
+    lines = [f"# Pattern: {name}", f"# inputs: {inputs}",
+             f"# nki_ir total ops: {len(nki.ops)}", "",
+             rule, "## tensor_ir (after trace)", rule, tensor_dump, "",
+             rule, "## nki_ir opcode histogram", rule,
+             *(f"  {op:24s} {n}" for op, n in hist.most_common()),
+             "", rule, "## nki_ir (full)", rule, nki.dump()]
+    path = os.path.join(OUT_DIR, f"{name}.nki")
+    with open(path, "w", encoding="utf-8") as f:
+        f.write("\n".join(lines) + "\n")
+    print(f"  wrote {path}  ({len(nki.ops)} nki ops)")
+
+
+def main():
+    """Trace and dump every Qwen3 building-block pattern into OUT_DIR."""
+    rng = np.random.default_rng(0)
+    os.makedirs(OUT_DIR, exist_ok=True)
+    qkv_out = (N_LOCAL_HEADS + 2 * N_LOCAL_KV_HEADS) * HEAD_DIM
+    half = HEAD_DIM // 2
+
+    def rand(shape):
+        # Standard-normal sample of `shape`, cast to the dump dtype DT.
+        return rng.standard_normal(shape).astype(DT)
+
+    def patterns():
+        # Lazy on purpose: each pattern's inputs are drawn from rng right
+        # before it is dumped, one pattern at a time.
+        yield ("rmsnorm", rmsnorm_kernel,
+               {"x": rand(HID), "weight": rand(HIDDEN), "eps": NORM_EPS})
+        yield "softmax", softmax_kernel, {"x": rand(SCORE)}
+        yield "silu", silu_kernel_, {"x": rand((1, SEQ, INTER))}
+        # Single-expert feed-forward (one expert's gate_up / down shard).
+        yield ("feedforward", feedforward_kernel,
+               {"x": rand(HID),
+                "gate_up_weight": rand((HIDDEN, 2 * INTER)),
+                "down_weight": rand((INTER, HIDDEN))})
+        # QKV projection matmul (hidden -> per-rank qkv width).
+        yield ("matmul_qkv", matmul_gup,
+               {"x": rand(HID), "w": rand((HIDDEN, qkv_out))})
+        # RoPE applied to q/k (post-reshape BSHD).
+        yield ("rope", apply_rotary_emb_kernel,
+               {"xq": rand(QSHAPE), "xk": rand(KSHAPE),
+                "freqs_cos": rand((SEQ, half)),
+                "freqs_sin": rand((SEQ, half))})
+
+    for pattern_name, kernel_fn, inputs in patterns():
+        dump(pattern_name, kernel_fn, inputs)
+    print(f"\nDumps in {OUT_DIR}/")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/models/qwen3/kernels/attention.py b/examples/models/qwen3/kernels/attention.py
@@ -35,6 +35,9 @@ def attention_kernel(
     cache_v,
     start_pos: Optional[nt.tensor],
     o_weight,
+    freqs_cos_cache=None,
+    freqs_sin_cache=None,
+    causal_mask_cache=None,
 ):
     """
     Unified attention kernel for Qwen3.
@@ -81,19 +84,33 @@ def attention_kernel(
 
     # RoPE
     max_seq_len = cache_k.shape[1]
-    freqs_cos, freqs_sin = compute_cos_sin_cache(
-        head_dim, max_seq_len, base=1000000, dtype=nl.bfloat16
-    )
     if is_prefill:
+        # Prefill: the needed rows are a static prefix, so slice the comptime
+        # cache directly (no full-cache promotion needed).
+        freqs_cos, freqs_sin = compute_cos_sin_cache(
+            head_dim, max_seq_len, base=1000000, dtype=nl.bfloat16
+        )
         freqs_cos = freqs_cos[0:seq_len]
         freqs_sin = freqs_sin[0:seq_len]
     else:
-        # Promote comptime numpy arrays to runtime tensors so they can be
-        # indexed with the runtime tensor start_pos.
-        freqs_cos = tensor_apis.constant(freqs_cos)
-        freqs_sin = tensor_apis.constant(freqs_sin)
-        freqs_cos = freqs_cos[start_pos]
-        freqs_sin = freqs_sin[start_pos]
+        # Decode: the row is selected by the runtime tensor start_pos, so the
+        # whole cache must be a runtime tensor.  The caches are passed in as
+        # kernel inputs (freqs_cos_cache/freqs_sin_cache) rather than promoted
+        # from a comptime numpy array: a full (max_seq_len, head_dim) constant
+        # would lower to one run-length fill per row on the nkigen-lite backend
+        # and blow its constant limit.  Falling back to the comptime constant
+        # keeps the HLO path and any caller that doesn't pass caches working.
+        if freqs_cos_cache is not None and freqs_sin_cache is not None:
+            freqs_cos = freqs_cos_cache[start_pos]
+            freqs_sin = freqs_sin_cache[start_pos]
+        else:
+            freqs_cos, freqs_sin = compute_cos_sin_cache(
+                head_dim, max_seq_len, base=1000000, dtype=nl.bfloat16
+            )
+            freqs_cos = tensor_apis.constant(freqs_cos)
+            freqs_sin = tensor_apis.constant(freqs_sin)
+            freqs_cos = freqs_cos[start_pos]
+            freqs_sin = freqs_sin[start_pos]
     xq, xk = apply_rotary_emb_kernel(xq, xk, freqs_cos, freqs_sin)
 
     # KV cache update
@@ -121,14 +138,33 @@ def attention_kernel(
 
     # Comptime: causal mask is a numpy array computed from constants at compile
     # time. Promote to runtime tensor so it can participate in ops with runtime tensors.
-    causal_mask = np.triu(np.ones((k_seq_len, k_seq_len)) * -100000, k=1).astype(
-        scores.dtype
-    )
-    causal_mask = tensor_apis.constant(causal_mask)
     # Apply causal mask
     if is_prefill:
-        scores = scores + np.expand_dims(causal_mask[:seq_len, :k_seq_len], axis=[0, 1])
+        # Build the mask at the sliced (seq_len, k_seq_len) shape directly rather
+        # than materializing the full (k_seq_len, k_seq_len) and slicing after the
+        # constant promotion: triu of the slice equals the slice of triu, and the
+        # full mask (e.g. 4096x4096) would otherwise be promoted as a huge
+        # non-uniform constant (one run-length fill per row, all live at once).
+        causal_mask = np.triu(
+            np.ones((seq_len, k_seq_len)) * -100000, k=1
+        ).astype(scores.dtype)
+        causal_mask = tensor_apis.constant(causal_mask)
+        scores = scores + np.expand_dims(causal_mask, axis=[0, 1])
     else:
+        # Decode: the mask row is selected by the runtime start_pos, so the
+        # whole (k_seq_len, k_seq_len) mask must be a runtime tensor.  Like the
+        # RoPE caches, it's passed in as a kernel input (causal_mask_cache)
+        # rather than promoted from a comptime numpy array, which would lower to
+        # one run-length fill per row and overflow the nkigen-lite constant
+        # limit.  Falling back to the comptime constant keeps the HLO path and
+        # callers that don't pass the cache working.
+        if causal_mask_cache is not None:
+            causal_mask = causal_mask_cache
+        else:
+            causal_mask = np.triu(
+                np.ones((k_seq_len, k_seq_len)) * -100000, k=1
+            ).astype(scores.dtype)
+            causal_mask = tensor_apis.constant(causal_mask)
         scores = scores + np.expand_dims(
             causal_mask[start_pos, :k_seq_len], axis=[0, 1]
         )

diff --git a/examples/models/qwen3/kernels/sampling.py b/examples/models/qwen3/kernels/sampling.py
@@ -30,7 +30,7 @@ def stream_shuffle_broadcast(src, dst):
         )
 
 
-@nki.jit(platform_target="trn2")
+@nki.jit
 def nki_rmsnorm_kernel(input_tensor, weight, eps):
     """
     RMSNorm NKI kernel - based on AWS official tutorial pattern.

diff --git a/examples/models/qwen3/kernels/transformer_layer.py b/examples/models/qwen3/kernels/transformer_layer.py
@@ -30,12 +30,19 @@ def transformer_layer(
     gate_up_weight,
     down_weight,
     configs: Config,
+    freqs_cos_cache=None,
+    freqs_sin_cache=None,
+    causal_mask_cache=None,
 ):
     """
     Single transformer layer for both context encoding (prefill) and token generation (decode).
 
     When start_pos is None: prefill mode (process full context)
     When start_pos is provided: decode mode (process single token)
+
+    In decode mode, freqs_cos_cache/freqs_sin_cache (RoPE cos/sin tables) and
+    causal_mask_cache are runtime kernel inputs indexed by start_pos, avoiding
+    comptime constant promotions that overflow the nkigen-lite constant limit.
     """
     # Apply input RMSNorm
     norm_x = rmsnorm_kernel(x, input_weight, configs.norm_eps)
@@ -54,6 +61,9 @@ def transformer_layer(
         cache_v,
         start_pos=start_pos,
         o_weight=o_weight,
+        freqs_cos_cache=freqs_cos_cache,
+        freqs_sin_cache=freqs_sin_cache,
+        causal_mask_cache=causal_mask_cache,
     )
 
     # Residual connection after attention

diff --git a/examples/models/qwen3/nki_ir_dumps/.gitignore b/examples/models/qwen3/nki_ir_dumps/.gitignore
@@ -0,0 +1 @@
+*.nki
diff --git a/examples/models/qwen3/nki_ir_dumps/README.md b/examples/models/qwen3/nki_ir_dumps/README.md
@@ -0,0 +1,67 @@
+# nki_ir pattern dumps
+
+Generated by `../dump_nki_ir.py`. Each `<pattern>.nki` contains the tensor_ir
+(after trace), an nki opcode histogram, and the full lowered nki_ir for one
+Qwen3-30B-A3B building-block kernel (f32, batch=1 seq=128, per-rank TP=4
+shapes: hidden=2048, 8 local q heads / 1 local kv head, expert inter=192).
+
+Regenerate: `QWEN3_BACKEND=nkigen-lite uv run python ../dump_nki_ir.py`
+
+## Summary (nki ops on nkigen-lite)
+
+| pattern | nki ops | real compute ops | dominant scaffolding | note |
+|---------|--------:|-----------------:|----------------------|------|
+| silu | 14 | 1 act | 4 dma / 4 alloc / 3 dealloc | fuses well |
+| rmsnorm | 112 | ~9 | 23 dma / 34 alloc / 22 dealloc / 21 view | chain broken into segments |
+| feedforward | 283 | 24 mm + act | 58 dma / 84 alloc / 18 transpose | 2 matmuls + swiglu |
+| matmul_qkv | 314 | 48 mm | 69 dma / 88 alloc / 16 transpose | per-N-tile A reload+transpose |
+| softmax | 552 | ~64 | 144 dma / 160 alloc / 152 dealloc | reduce collapsed; chain not fused |
+| **rope** | **1,754** | **54 mul/add** | **758 dma / 456 alloc** | was 8,351 — see fix below |
+
+## What the dumps show — two compounding wastes
+
+### 1. Per-op HBM round-trips (same as qwen3_embedding)
+
+Across every pattern the real compute ops are a small minority; most nki ops are
+`alloc` + `dma_copy` + `dealloc` + `view` scaffolding around each op. The
+direct-lower pipeline puts an **HBM boundary between segments**, so a chain like
+rmsnorm's `mul(x,x) -> reduce -> add -> sqrt -> broadcast -> div -> mul` becomes
+several segments, each doing: load inputs from HBM -> 1 compute -> store result
+to HBM. The intermediate never stays on-chip even though the next op consumes it
+immediately.
+
+Concrete in `rmsnorm.nki`: 9 compute ops, but 23 DMAs / 34 allocs / 22 deallocs.
+`softmax.nki`: ~64 compute ops vs 144 DMAs / 160 allocs — the max/sub/exp/sum/div
+chain is split across segments instead of staying resident.
+
+### 2. RoPE's elementwise-segment blowup — FIXED (8,351 -> 1,754)
+
+`rope.nki` *was* the outlier: ~26 tensor_ir ops expanded to **8,351 nki ops**
+(2,628 DMAs, 1,056 elementwise mul/add), one elementwise segment alone emitting
+4,864 ops.
+
+Root cause was in the *segmenter*, not the broadcast lowering. RoPE has a q path
+on `(1,128,8,64)` tiles and a k path on `(1,128,1,64)` tiles. The layout solver
+gives both the same `(p_dims, f_dims)` classification, so `_segment_ops` merged
+them into one elementwise group. That group's ops collapse to two *different*
+partition extents (q: prod(1,128,8)=1024, k: prod(1,128,1)=128), so the fast
+"collapse leading dims onto the partition" path bailed and the generic fallback
+put the size-1 head axis on the partition and **unrolled the seq=128 axis one
+element at a time** — a 128x blowup on the q-path ops.
+
+The fix (`nkigen-lite/.../basic/direct_lower.py::_segment_ops`) breaks an
+elementwise group whenever an op would introduce a second distinct non-1
+collapsed P (or F) extent, so every group stays cleanly collapsible. RoPE drops
+to **1,754 ops** (54 elementwise, exact-match numerics) and the q-path segments
+now tile as single `(128, 512)` ops. Other patterns are unchanged. This scales
+with prefill sequence length, so the win grows at longer contexts.
+
+## Implication
+
+Remaining headroom (now the same as qwen3_embedding — section 1 above, the per-op HBM round-trips):
+
+- **Cross-op SBUF residency / fusion.** Keep a segment's result on-chip when the
+  next op is its consumer instead of store-to-HBM + reload. This removes the
+  load+store+alloc+dealloc per boundary (the ~80-90% of ops that aren't compute)
+  in rmsnorm / softmax / feedforward / the remaining rope DMAs. HLO's BIR does
+  exactly this. See `../../../../nkigen-lite/PERFORMANCE.md`.
-Original file line number
+Diff line change
@@ Expand Up / @@ -30,7 +30,7 @@ def stream_shuffle_broadcast(src, dst): @@
             )
-    @nki.jit(platform_target="trn2")
+    @nki.jit
     def nki_rmsnorm_kernel(input_tensor, weight, eps):
         """
         RMSNorm NKI kernel - based on AWS official tutorial pattern.
@@ Expand Down @@