pytorch · Itssshikhar · Oct 30, 2025 · Oct 31, 2025 · Nov 1, 2025 · Nov 5, 2025
diff --git a/helion/runtime/kernel.py b/helion/runtime/kernel.py
@@ -392,6 +392,13 @@ def configs(self) -> list[Config]:
 
     def format_kernel_decorator(self, config: Config, settings: Settings) -> str:
         """Return the @helion.kernel decorator snippet capturing configs and settings that influence Triton code generation."""
+        # Emit shape_bucketing only when non-default so the common snippet stays short.
+        policy = getattr(settings, "shape_bucketing", "min2")
+        if policy != "min2":
+            return (
+                f"@helion.kernel(config={config!r}, "
+                f"static_shapes={settings.static_shapes}, shape_bucketing={policy!r})"
+            )
         return f"@helion.kernel(config={config.__repr__()}, static_shapes={settings.static_shapes})"
 
     def to_triton_code(
@@ -817,11 +824,15 @@ def _tensor_key(fn: Kernel, obj: torch.Tensor) -> Hashable:
             (*obj.size(),),
             (*obj.stride(),),
         )
+    # Non-static path: bucket sizes for specialization. Default is 0/1/>=2 (as 2).
+    vals = tuple([min(s, 2) for s in obj.size()])
+    if getattr(fn.settings, "shape_bucketing", "min2") == "zero_nonzero":
+        # Keep zero distinct; unify 1 with >=2 to reduce variant churn
+        vals = tuple(0 if v == 0 else 2 for v in vals)
     return (
         obj.dtype,
         obj.device.type,
-        # 0, 1, or >=2 specialization
-        tuple([min(s, 2) for s in obj.size()]),
+        vals,
     )
 
 

diff --git a/helion/runtime/settings.py b/helion/runtime/settings.py
@@ -232,6 +232,21 @@ def _get_autotune_random_seed() -> int:
     return int(time.time() * 1000) % 2**32
 
 
+def _get_shape_bucketing() -> Literal["min2", "zero_nonzero"]:
+    """Resolve the shape-bucketing policy from ``HELION_SHAPE_BUCKETING``.
+
+    Returns:
+        ``"zero_nonzero"`` if the helper yields that value, otherwise ``"min2"``.
+    """
+    # Each accepted spelling maps to itself; no aliases are defined.
+    choices = {name: name for name in ("min2", "zero_nonzero")}
+    # "min2" is the helper's second argument -- presumably its fallback; confirm
+    # against _env_get_literal, which is defined outside this hunk.
+    policy = _env_get_literal("HELION_SHAPE_BUCKETING", "min2", mapping=choices)
+    # Return string constants so the result matches the declared Literal type.
+    return "zero_nonzero" if policy == "zero_nonzero" else "min2"
+
+
 def _get_ref_mode() -> RefMode:
     interpret = _env_get_bool("HELION_INTERPRET", False)
     return RefMode.EAGER if interpret else RefMode.OFF
@@ -347,6 +362,12 @@ class _Settings:
             _env_get_bool, "HELION_DEBUG_DTYPE_ASSERTS", False
         )
     )
+    # Controls non-static shape specialization bucketing. When "min2" (default),
+    # we bucket dynamic sizes per-dimension into 0, 1, or >=2 (represented as 2).
+    # When "zero_nonzero", we keep 0 distinct and unify 1 with >=2 to reduce churn.
+    shape_bucketing: Literal["min2", "zero_nonzero"] = dataclasses.field(
+        default_factory=_get_shape_bucketing
+    )
     ref_mode: RefMode = dataclasses.field(default_factory=_get_ref_mode)
     autotuner_fn: AutotunerFunction = default_autotuner_fn
     autotune_baseline_fn: Callable[..., object] | None = None
@@ -401,6 +422,12 @@ class Settings(_Settings):
         ),
         "allow_warp_specialize": "If True, allow warp specialization for tl.range calls on CUDA devices.",
         "debug_dtype_asserts": "If True, emit tl.static_assert checks for dtype after each device node.",
+        "shape_bucketing": (
+            "Dynamic-shape specialization policy when static_shapes=False. "
+            "'min2' buckets each dimension into 0,1,>=2 (current behavior). "
+            "'zero_nonzero' keeps 0 distinct and unifies 1 with >=2 to reduce variants. "
+            "Override with HELION_SHAPE_BUCKETING=min2|zero_nonzero."
+        ),
         "ref_mode": "Reference mode for kernel execution. Can be RefMode.OFF or RefMode.EAGER.",
         "autotuner_fn": (
             "Function to create an autotuner. "

diff --git a/test/test_shape_bucketing.py b/test/test_shape_bucketing.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+import unittest
+
+import torch
+
+from helion.runtime.kernel import kernel
+from helion.runtime.settings import Settings
+
+
+def _dummy(x: torch.Tensor) -> torch.Tensor:
+    return x  # identity body; the tests below only compute specialization keys
+
+
+class TestShapeBucketing(unittest.TestCase):
+    """Check how dynamic sizes are bucketed into specialization keys."""
+
+    @staticmethod
+    def _keys(settings: Settings, *rows: int) -> list[object]:
+        """Return the specialization key of a (n, 3) tensor for each n in rows."""
+        k = kernel(_dummy, settings=settings)
+        return [k.specialization_key([torch.empty(n, 3)]) for n in rows]
+
+    def test_min2_bucketing_default(self) -> None:
+        """Default policy; assumes HELION_SHAPE_BUCKETING is unset or "min2"."""
+        key_0, key_1, key_2, key_7 = self._keys(
+            Settings(static_shapes=False), 0, 1, 2, 7
+        )
+
+        # min2: 0, 1, and >=2 (as 2) are three separate buckets
+        self.assertNotEqual(key_0, key_1)
+        self.assertNotEqual(key_0, key_2)
+        self.assertNotEqual(key_1, key_2)
+        self.assertEqual(key_2, key_7)
+
+    def test_zero_nonzero_bucketing(self) -> None:
+        """Explicit "zero_nonzero" policy: only emptiness of a dimension matters."""
+        key_0, key_1, key_2, key_7 = self._keys(
+            Settings(static_shapes=False, shape_bucketing="zero_nonzero"),
+            0,
+            1,
+            2,
+            7,
+        )
+
+        # zero_nonzero: keep 0 distinct; unify 1 with >=2
+        self.assertNotEqual(key_0, key_1)
+        self.assertNotEqual(key_0, key_2)
+        self.assertEqual(key_1, key_2)
+        self.assertEqual(key_2, key_7)
+
+
+if __name__ == "__main__":
+    unittest.main()