@@ -50,6 +50,199 @@ namespace mlir::tpu {

 namespace {

+void optimizeLoadReshape(int hardware_generation,
+                         std::array<int64_t, 2> target_shape,
+                         Operation& raw_op) {
+  // Below, we try to look for reshapes that flatten multiple dims into the
+  // lane dimension. If the source of the reshape originates from a load of a
+  // ref with a 128 minor dimension (effectively untiled), we can replace the
+  // load/reshape sequence with an efficient strided load. In essence, the
+  // strided load creates vregs with a narrow slice along the target minor
+  // dimension, but with the 2nd minor dim after the reshape already in
+  // sublanes. The results of the strided loads are concatenated to form the
+  // final vector result.
+  //
+  // A little extra care needs to be applied to packed types, which we handle
+  // by briefly extending to 32-bit and repacking them after concatenation.
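+  //
+  // For illustration only (the shapes here are hypothetical): with f32 data,
+  //   %v = vector.load %ref[...] : memref<4x8x128xf32>, vector<4x8x128xf32>
+  //   %r = vector.shape_cast %v : vector<4x8x128xf32> to vector<4x1024xf32>
+  // can instead be assembled from 8 strided loads of vector<4x128xi32> taken
+  // from an i32 view of the ref flattened to (32, 128), each with a sublane
+  // stride of 8, concatenated along the minor dimension.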
+  TypedValue<VectorType> src;
+  VectorType tgt_ty;
+  if (auto op = dyn_cast<tpu::ReshapeOp>(&raw_op)) {
+    src = op.getSource();
+    tgt_ty = op.getResult().getType();
+  } else if (auto op = dyn_cast<vector::ShapeCastOp>(&raw_op)) {
+    src = op.getSource();
+    tgt_ty = op.getResult().getType();
+  } else {
+    return;
+  }
+  VectorType src_ty = src.getType();
+  if (src_ty.getRank() < 2 || tgt_ty.getRank() < 1) {
+    return;
+  }
+  const int bitwidth = src_ty.getElementTypeBitWidth();
+  const int packing = 32 / bitwidth;
+  if (hardware_generation < 4 && packing > 1) {
+    return;
+  }
+
+  auto load_op = dyn_cast_if_present<vector::LoadOp>(src.getDefiningOp());
+  // This rewrite might not be profitable if the load has other users.
+  if (!load_op || !load_op.getBase().hasOneUse()) {
+    return;
+  }
+
+  TypedValue<MemRefType> ref = load_op.getBase();
+  MemRefType ref_ty = getMemRefType(ref);
+  // The reshape below might be invalid if the memref is not contiguous, but it
+  // is an overly conservative check (we don't need all dims to be contiguous).
+  if (!isContiguousMemref(ref)) {
+    return;
+  }
+
+  const int64_t lane = target_shape[1];
+  auto src_shape = src_ty.getShape();
+  auto tgt_shape = tgt_ty.getShape();
+  // Only handle the cases where the minor dim starts out as the number of
+  // lanes and we fold at least the second minor dim into it, in a way that
+  // changes its shape.
+  if (src_shape.back() != lane ||
+      tgt_shape.back() % (packing * lane) != 0 ||
+      tgt_shape.back() == src_shape.back() ||
+      tgt_shape.back() < llvm::product_of(src_shape.take_back(2))) {
+    return;
+  }
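+  // For example (hypothetical shapes, f32 so packing == 1): folding
+  // (4, 8, 128) into (4, 1024) passes these checks, while (4, 8, 128) into
+  // (32, 128) (minor dim unchanged) or into (4, 2, 512) (only part of the
+  // second minor dim folded) bails out here.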
+
+  // We don't handle memrefs with padding.
+  auto tiled_layout = dyn_cast<tpu::TiledLayoutAttr>(ref_ty.getLayout());
+  if (!tiled_layout || tiled_layout.getTiles().empty()) {
+    return;
+  }
+  ArrayRef<int64_t> front_tile = tiled_layout.getTiles().front().dimensions();
+  ArrayRef<int64_t> ref_tiled_shape =
+      ref_ty.getShape().take_back(front_tile.size());
+  for (int i = 0; i < front_tile.size(); ++i) {
+    if (ref_tiled_shape[i] % front_tile[i]) {
+      return;
+    }
+  }
+
+  // NOTE: We could generalize this to allow flattening only part of a
+  // dimension.
+  int folded_dims = 0;
+  {
+    int suffix_size = 1;
+    auto sizes_it = src_shape.rbegin();
+    while (suffix_size < tgt_shape.back()) {
+      suffix_size *= *(sizes_it++);
+    }
+    // Make sure that the minor dim is folded only from entire major dims, not
+    // from a part of some minor dim.
+    if (suffix_size != tgt_shape.back()) {
+      return;
+    }
+    folded_dims = sizes_it - src_shape.rbegin();
+  }
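+  // E.g. (hypothetical): for a (2, 4, 8, 128) source, a target minor dim of
+  // 1024 grows the suffix as 1 -> 128 -> 1024, so folded_dims == 2; a target
+  // minor dim of 4096 would give folded_dims == 3, while 512 would fail the
+  // suffix check above because it splits the second minor dim.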
+  DCHECK_GE(folded_dims, 2);  // Should fold at least 2nd minor into minor.
+
+  // We don't handle slicing in the folded dims at the moment.
+  if (ref_ty.getShape().take_back(folded_dims) !=
+      src_ty.getShape().take_back(folded_dims)) {
+    return;
+  }
+
+  Location loc = raw_op.getLoc();
+  ImplicitLocOpBuilder b(loc, &raw_op);
+
+  // Flatten the untiled dims into second minor and bitcast to i32.
+  // NOTE: Source vector shape might be different from ref shape when slicing.
+  SmallVector<int64_t> mem_shape(ref_ty.getShape().drop_back(folded_dims));
+  if (mem_shape.empty()) {
+    mem_shape.push_back(1);
+  }
+  mem_shape.back() *= tgt_shape.back() / lane;
+  mem_shape.push_back(lane);
+  Value reshaped_ref = b.create<tpu::MemRefReshapeOp>(
+      MemRefType::get(mem_shape, ref_ty.getElementType()), ref);
+  *(mem_shape.end() - 2) /= packing;
+  Value i32_view = b.create<tpu::MemRefBitcastOp>(
+      MemRefType::get(mem_shape, b.getI32Type()), reshaped_ref);
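+  // E.g. (hypothetical): a (4, 8, 128) bf16 ref reshaped to (4, 1024) is
+  // first viewed as a (32, 128) bf16 memref and then bitcast to a (16, 128)
+  // i32 memref (packing == 2 folds pairs of rows into one i32 row).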
+
+  // Define the shape of the small i32 chunk we will load in each iteration.
+  // TODO(b/458291444): The loads we emit here might use suboptimal shapes and
+  // we could do better by folding some dims (as much as slicing allows).
+  SmallVector<int64_t> chunk_shape(src_shape.drop_back(folded_dims));
+  if (chunk_shape.empty()) {
+    chunk_shape.push_back(1);
+  }
+  chunk_shape.push_back(lane);
+  VectorType chunk_ty = VectorType::get(chunk_shape, b.getI32Type());
+
+  SmallVector<int32_t> strides(mem_shape.size(), 1);
+  const int64_t sublane_prod = tgt_shape.back() / lane;
+  const int64_t stride = sublane_prod / packing;
+  *(strides.end() - 2) = stride;
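+  // E.g. (hypothetical): for a target minor dim of 1024 and 128 lanes,
+  // sublane_prod == 8; with bf16 (packing == 2) the stride is 4, so we emit
+  // 4 strided i32 loads and recover 8 logical chunks after unpacking below.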
+
+  // Reuse indices from the original load for the prefix.
+  auto indices = load_op.getIndices();
+  SmallVector<Value> idxs(indices.drop_back(folded_dims));
+  if (idxs.empty()) {
+    idxs.push_back(IdxConst(0, b, loc));
+  }
+  Value split_base_idx =
+      b.create<arith::MulIOp>(idxs.back(), IdxConst(stride, b, loc));
+  idxs.push_back(IdxConst(0, b, loc));
+
+  SmallVector<Value> unpacked_chunks;
+  unpacked_chunks.reserve(stride * packing);
+  for (int i = 0; i < stride; ++i) {
+    *(idxs.end() - 2) =
+        b.create<arith::AddIOp>(split_base_idx, IdxConst(i, b, loc));
+    Value chunk =
+        b.create<tpu::StridedLoadOp>(chunk_ty, i32_view, idxs, strides);
+    // Unpack elements from i32 if necessary.
+    for (int p = 0; p < packing; ++p) {
+      unpacked_chunks.push_back(b.create<arith::ShRUIOp>(
+          chunk.getType(), chunk, I32Const(p * bitwidth, chunk_shape, b, loc)));
+    }
+  }
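+  // For packed types (hypothetically bf16, packing == 2), each i32 element of
+  // a chunk holds two 16-bit values; shifting right by 0 and by 16 moves each
+  // of them into the low bits, and the arith.trunci below keeps only those
+  // low bits, so every i32 chunk yields `packing` logical chunks.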
+
+  Value unpacked_flat;
+  if (unpacked_chunks.size() == 1) {
+    unpacked_flat = unpacked_chunks.front();
+  } else {
+    SmallVector<int64_t> concat_shape(src_shape.drop_back(folded_dims));
+    if (concat_shape.empty()) {
+      concat_shape.push_back(1);
+    }
+    concat_shape.push_back(tgt_shape.back());
+    unpacked_flat = b.create<tpu::ConcatenateOp>(
+        VectorType::get(concat_shape, b.getI32Type()), unpacked_chunks,
+        concat_shape.size() - 1);
+  }
+
+  Value result = unpacked_flat;
+  if (packing > 1) {  // Pack back, if needed.
+    result = b.create<arith::TruncIOp>(
+        VectorType::get(cast<VectorType>(result.getType()).getShape(),
+                        b.getIntegerType(bitwidth)),
+        result);
+  }
+  // Bitcast to the target element type, if needed.
+  if (cast<VectorType>(result.getType()).getElementType() !=
+      tgt_ty.getElementType()) {
+    result = b.create<arith::BitcastOp>(
+        VectorType::get(cast<VectorType>(result.getType()).getShape(),
+                        tgt_ty.getElementType()),
+        result);
+  }
+  // Apply the reshape to major dims, if needed.
+  if (cast<VectorType>(result.getType()).getShape() != tgt_ty.getShape()) {
+    result = b.create<tpu::ReshapeOp>(tgt_ty, result);
+  }
+  DCHECK_EQ(result.getType(), tgt_ty);
+
+  raw_op.replaceAllUsesWith(ValueRange{result});
+  raw_op.erase();
+}
+
 void optimizeStore(int hardware_generation, std::array<int64_t, 2> target_shape,
                     Operation& raw_op) {
   // Fuses a vector.shape_cast (that expands dimensions) into a subsequent
@@ -417,6 +610,8 @@ struct PreCanonicalizationOptimizationPass
         }
       } else if (isa<vector::StoreOp, tpu::VectorStoreOp>(op)) {
         optimizeStore(hardware_generation_, target_shape_, *op);
+      } else if (isa<vector::ShapeCastOp, tpu::ReshapeOp>(op)) {
+        optimizeLoadReshape(hardware_generation_, target_shape_, *op);
       }
     });
   }