@@ -575,6 +575,8 @@ def compute_shape(
         input_size = collections.deque(tensor.size())
         output_size = []
         env = CompileEnvironment.current()
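+        # Tensor indexers may broadcast together (NumPy-style advanced
+        # indexing); that decision drives how output dims are built below.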
+        tensor_indexers = [k for k in index if isinstance(k, torch.Tensor)]
+        should_broadcast = env.should_broadcast_tensor_indexers(tensor_indexers)
         k_index = 0
         for k in index:
             if k is None:
@@ -617,11 +619,14 @@ def compute_shape(
                 else:
                     output_size.append(1)
                 k_index += 1
-            elif isinstance(k, torch.Tensor) and (
-                k.ndim == 1 or (len(index) == 1 and tensor.ndim == 1)
-            ):
+            elif isinstance(k, torch.Tensor):
                 input_size.popleft()
-                output_size.extend(k.size())
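+                # In the broadcast case all tensor indexers collapse into one
+                # broadcast shape, emitted once at the first tensor's position.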
+                if not should_broadcast:
+                    output_size.extend(env.tensor_indexer_dims(k))
+                elif k is tensor_indexers[0]:
+                    output_size.extend(
+                        env.tensor_indexer_broadcast_shape(tensor_indexers)
+                    )
                 k_index += 1
             else:
                 raise exc.InvalidIndexingType(k)
@@ -667,13 +672,87 @@ def create(
         output_size = SubscriptIndexing.compute_shape(fake_value, index, state)
         env = CompileEnvironment.current()
         dtype = env.triton_index_type()
+        tensor_indexers = [k for k in index if isinstance(k, torch.Tensor)]
+        should_broadcast = env.should_broadcast_tensor_indexers(tensor_indexers)
+        broadcast_dims = 0
+        if should_broadcast:
+            broadcast_dims = len(env.tensor_indexer_broadcast_shape(tensor_indexers))
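+        # "Cartesian" pattern: each tensor indexer supplies exactly one
+        # non-trivial dim, so indexer i maps 1:1 onto output dim i.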
+        is_cartesian = (
+            broadcast_dims >= 2
+            and len(tensor_indexers) == broadcast_dims
+            and all(
+                t.ndim == 1
+                or sum(1 for d in t.size() if env.size_hint(d) != 1) <= 1
+                for t in tensor_indexers
+            )
+        )
         if dtype == "tl.int32" and SubscriptIndexing._needs_int64(fake_value):
             raise exc.IndexOffsetOutOfRangeForInt32(env.index_dtype)

         def _is_size_one(size: int | torch.SymInt) -> bool:
             return env.known_equal(size, 1)

         k_index = 0
+
+        def handle_broadcast_tensor(
+            position: int,
+            index_elem: torch.Tensor,
+            index_var: str,
+            cur_output_idx: int,
+        ) -> tuple[str, dict[str, None]]:
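+            # Build the index expression and any mask entries for one tensor
+            # indexer in a broadcast group; only the group's first tensor
+            # contributes block masks, and the caller advances output_idx
+            # once per group.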
+            assert broadcast_dims > 0
+            tensor_idx = next(
+                i for i, t in enumerate(tensor_indexers) if t is index_elem
+            )
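+            # For tensors after the first, cur_output_idx has already been
+            # advanced past the broadcast dims, so rewind to the group's start.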
+            first_tensor_out_idx = (
+                cur_output_idx if tensor_idx == 0 else cur_output_idx - broadcast_dims
+            )
+            non_trivial_output_positions: list[int] = []
+            if is_cartesian:
+                pos = first_tensor_out_idx + tensor_idx
+                single_output_dim = True
+            else:
+                # Find the position(s) where this tensor contributes non-trivial dims
+                offset = max(0, broadcast_dims - index_elem.ndim)
+                non_trivial_output_positions = [
+                    first_tensor_out_idx + offset + i
+                    for i in range(index_elem.ndim)
+                    if env.size_hint(index_elem.size(i)) != 1
+                ]
+                pos = non_trivial_output_positions[0]
+                single_output_dim = len(non_trivial_output_positions) <= 1
+
+            new_masks: dict[str, None] = {}
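+            # A 1-D indexer is expanded into the output rank; higher-rank
+            # indexers already carry their own broadcastable shape.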
+            if single_output_dim:
+                expand = (
+                    tile_strategy.expand_str(output_size, pos)
+                    if index_elem.ndim == 1
+                    else ""
+                )
+                idx_val = f"({index_var}){expand}"
+            else:
+                # Multi-dim tensor with multiple non-trivial dims
+                idx_val = f"({index_var})"
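+            # Emit block masks once per group (only for the first tensor) so
+            # the same mask is not duplicated for every indexer.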
+            if tensor_idx == 0:
+                for p in non_trivial_output_positions:
+                    if (
+                        p < len(output_size)
+                        and (bid := env.get_block_id(output_size[p]))
+                        and (mv := state.codegen.mask_var(bid))
+                        and not _is_size_one(fake_value.size(len(index_values)))
+                    ):
+                        new_masks.setdefault(
+                            f"({mv}){tile_strategy.expand_str(output_size, p)}"
+                        )
+            # Mask out elements past the original length of a padded hl.arange
+            if (
+                orig_len := _get_padded_iota_original_length(state, position)
+            ) is not None:
+                new_masks.setdefault(
+                    f"(({index_var} < {orig_len}){tile_strategy.expand_str(output_size, first_tensor_out_idx + tensor_idx)})"
+                )
+            return idx_val, new_masks
+
         for n, k in enumerate(index):
             if k is None:
                 output_idx += 1
@@ -752,40 +831,42 @@ def _is_size_one(size: int | torch.SymInt) -> bool:
                 index_values.append(f"tl.zeros([1], {dtype}){expand}")
                 output_idx += 1
                 k_index += 1
-            elif isinstance(k, torch.Tensor) and k.ndim == 1:
-                expand = tile_strategy.expand_str(output_size, output_idx)
+            elif isinstance(k, torch.Tensor):
                 ast_index = state.ast_args[1]
                 assert isinstance(ast_index, (list, tuple))
-                assert len(ast_index) == len(index)
                 index_var = state.codegen.lift(ast_index[n], prefix="index").id
+
+                # Use broadcast handling for multiple tensors, or a single tensor with ndim > 1
+                if should_broadcast:
+                    idx_val, new_masks = handle_broadcast_tensor(
+                        n, k, index_var, output_idx
+                    )
+                    index_values.append(idx_val)
+                    mask_values.update(new_masks)
+                    if k is tensor_indexers[0]:
+                        output_idx += broadcast_dims
+                    k_index += 1
+                    continue
+
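+                # Non-broadcast path: a lone 1-D tensor indexer keeps its own
+                # dims; expand only when the output rank exceeds the tensor's.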
+                expand = (
+                    tile_strategy.expand_str(output_size, output_idx)
+                    if k.ndim < len(output_size)
+                    else ""
+                )
                 index_values.append(f"({index_var}){expand}")
-                if (block_idx := env.get_block_id(output_size[output_idx])) is not None:
-                    if mask := state.codegen.mask_var(block_idx):
-                        mask_values.setdefault(f"({mask}){expand}")
-                # Check if this index comes from a padded hl.arange and generate mask
-                if (
-                    original_length := _get_padded_iota_original_length(state, n)
-                ) is not None:
-                    mask_values.setdefault(f"({index_var} < {original_length}){expand}")
-                output_idx += 1
-                k_index += 1
-            elif (
-                isinstance(k, torch.Tensor) and len(index) == 1 and fake_value.ndim == 1
-            ):
-                # TODO(jansel): combine this case with the above
-                ast_index = state.ast_args[1]
-                assert isinstance(ast_index, (list, tuple))
-                assert len(ast_index) == 1
-                index_var = state.codegen.lift(ast_index[0], prefix="index").id
-                index_values.append(index_var)
-                output_idx += k.ndim
-                for n, s in enumerate(output_size):
-                    if (block_idx := env.get_block_id(s)) is not None and (
-                        mask := state.codegen.mask_var(block_idx)
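+                # Emit a block mask only when the output dim at this position
+                # is tiled and the indexed dim is not statically size 1.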
+                mask_block_id = (
+                    env.get_block_id(output_size[output_idx])
+                    if output_idx < len(output_size)
+                    else None
+                )
+                if mask_block_id is not None:
+                    mask_var = state.codegen.mask_var(mask_block_id)
+                    if mask_var and not _is_size_one(
+                        fake_value.size(len(index_values) - 1)
                     ):
-                        mask_values.setdefault(
-                            f"({mask}){tile_strategy.expand_str(output_size, n)}"
-                        )
+                        mask_values.setdefault(f"({mask_var}){expand}")
+
+                output_idx += k.ndim
                 k_index += 1
             else:
                 raise exc.InvalidIndexingType(type(k))
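
A minimal sketch of the indexing-shape rule the change above implements, in
plain PyTorch (assuming Helion's tensor_indexer_broadcast_shape mirrors
NumPy-style advanced-indexing semantics):

    import torch

    a = torch.arange(12).reshape(3, 4)

    # Elementwise: two 1-D indexers broadcast into a single output dim.
    i = torch.tensor([0, 2])
    j = torch.tensor([1, 3])
    assert a[i, j].shape == (2,)  # picks a[0, 1] and a[2, 3]

    # "Cartesian" pattern: each indexer has exactly one non-trivial dim,
    # so each owns its own output dim (the is_cartesian case above).
    rows = torch.tensor([[0], [2]])  # shape (2, 1)
    cols = torch.tensor([[1, 3]])    # shape (1, 2)
    assert a[rows, cols].shape == (2, 2)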