From fc1f3f397e2a21c64b0d241a5a954da81b1efd1a Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 27 Oct 2025 17:08:44 +0100 Subject: [PATCH 01/32] Revert "[libcxx] Define `_LIBCPP_HAS_C8RTOMB_MBRTOC8` to true if compiling with clang" (#165268) Reverts llvm/llvm-project#152724 The PR was merged with broken pre-commit CI. --- libcxx/include/__config | 4 +-- .../depr.c.headers/uchar_h.compile.pass.cpp | 5 ++++ .../uchar_h_char8_t.compile.pass.cpp | 25 ---------------- .../strings/c.strings/cuchar.compile.pass.cpp | 5 ++++ .../c.strings/cuchar_char8_t.compile.pass.cpp | 25 ---------------- .../c.strings/no_c8rtomb_mbrtoc8.verify.cpp | 30 +++++++++++++++++++ libcxx/utils/libcxx/test/features.py | 17 ----------- 7 files changed, 41 insertions(+), 70 deletions(-) delete mode 100644 libcxx/test/std/depr/depr.c.headers/uchar_h_char8_t.compile.pass.cpp delete mode 100644 libcxx/test/std/strings/c.strings/cuchar_char8_t.compile.pass.cpp create mode 100644 libcxx/test/std/strings/c.strings/no_c8rtomb_mbrtoc8.verify.cpp diff --git a/libcxx/include/__config b/libcxx/include/__config index 5971a3c5407b9..b4c081dcdff1b 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1021,9 +1021,7 @@ typedef __char32_t char32_t; // the latter depends on internal GNU libc details that are not appropriate // to depend on here, so any declarations present when __cpp_char8_t is not // defined are ignored. 
-# if defined(__clang__) -# define _LIBCPP_HAS_C8RTOMB_MBRTOC8 1 -# elif defined(_LIBCPP_GLIBC_PREREQ) +# if defined(_LIBCPP_GLIBC_PREREQ) # if _LIBCPP_GLIBC_PREREQ(2, 36) && defined(__cpp_char8_t) # define _LIBCPP_HAS_C8RTOMB_MBRTOC8 1 # else diff --git a/libcxx/test/std/depr/depr.c.headers/uchar_h.compile.pass.cpp b/libcxx/test/std/depr/depr.c.headers/uchar_h.compile.pass.cpp index c448ba83f4b38..a1560c8ee5853 100644 --- a/libcxx/test/std/depr/depr.c.headers/uchar_h.compile.pass.cpp +++ b/libcxx/test/std/depr/depr.c.headers/uchar_h.compile.pass.cpp @@ -23,6 +23,11 @@ // __STDC_UTF_16__ may or may not be defined by the C standard library // __STDC_UTF_32__ may or may not be defined by the C standard library +#if !defined(TEST_HAS_NO_C8RTOMB_MBRTOC8) +ASSERT_SAME_TYPE(size_t, decltype(mbrtoc8((char8_t*)0, (const char*)0, (size_t)0, (mbstate_t*)0))); +ASSERT_SAME_TYPE(size_t, decltype(c8rtomb((char*)0, (char8_t)0, (mbstate_t*)0))); +#endif + ASSERT_SAME_TYPE(size_t, decltype(mbrtoc16((char16_t*)0, (const char*)0, (size_t)0, (mbstate_t*)0))); ASSERT_SAME_TYPE(size_t, decltype(c16rtomb((char*)0, (char16_t)0, (mbstate_t*)0))); diff --git a/libcxx/test/std/depr/depr.c.headers/uchar_h_char8_t.compile.pass.cpp b/libcxx/test/std/depr/depr.c.headers/uchar_h_char8_t.compile.pass.cpp deleted file mode 100644 index 34b512f9c5959..0000000000000 --- a/libcxx/test/std/depr/depr.c.headers/uchar_h_char8_t.compile.pass.cpp +++ /dev/null @@ -1,25 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03 - -// The following platforms do not provide mbrtoc8 and c8rtomb so the tests fail -// XFAIL: target={{.+}}-aix{{.*}} -// XFAIL: android -// XFAIL: darwin -// XFAIL: freebsd -// XFAIL: windows -// XFAIL: glibc-no-char8_t-support -// XFAIL: LIBCXX-PICOLIBC-FIXME - -// - -#include - -ASSERT_SAME_TYPE(size_t, decltype(mbrtoc8((char8_t*)0, (const char*)0, (size_t)0, (mbstate_t*)0))); -ASSERT_SAME_TYPE(size_t, decltype(c8rtomb((char*)0, (char8_t)0, (mbstate_t*)0))); diff --git a/libcxx/test/std/strings/c.strings/cuchar.compile.pass.cpp b/libcxx/test/std/strings/c.strings/cuchar.compile.pass.cpp index 96b394a9934f8..2076384deb2b2 100644 --- a/libcxx/test/std/strings/c.strings/cuchar.compile.pass.cpp +++ b/libcxx/test/std/strings/c.strings/cuchar.compile.pass.cpp @@ -23,6 +23,11 @@ // __STDC_UTF_16__ may or may not be defined by the C standard library // __STDC_UTF_32__ may or may not be defined by the C standard library +#if !defined(TEST_HAS_NO_C8RTOMB_MBRTOC8) +ASSERT_SAME_TYPE(std::size_t, decltype(std::mbrtoc8((char8_t*)0, (const char*)0, (size_t)0, (mbstate_t*)0))); +ASSERT_SAME_TYPE(std::size_t, decltype(std::c8rtomb((char*)0, (char8_t)0, (mbstate_t*)0))); +#endif + ASSERT_SAME_TYPE(std::size_t, decltype(std::mbrtoc16((char16_t*)0, (const char*)0, (size_t)0, (mbstate_t*)0))); ASSERT_SAME_TYPE(std::size_t, decltype(std::c16rtomb((char*)0, (char16_t)0, (mbstate_t*)0))); diff --git a/libcxx/test/std/strings/c.strings/cuchar_char8_t.compile.pass.cpp b/libcxx/test/std/strings/c.strings/cuchar_char8_t.compile.pass.cpp deleted file mode 100644 index 019265b534c5c..0000000000000 --- a/libcxx/test/std/strings/c.strings/cuchar_char8_t.compile.pass.cpp +++ /dev/null @@ -1,25 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the 
Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03 - -// The following platforms do not provide mbrtoc8 and c8rtomb so the tests fail -// XFAIL: target={{.+}}-aix{{.*}} -// XFAIL: android -// XFAIL: darwin -// XFAIL: freebsd -// XFAIL: windows -// XFAIL: glibc-no-char8_t-support -// XFAIL: LIBCXX-PICOLIBC-FIXME - -// - -#include - -ASSERT_SAME_TYPE(std::size_t, decltype(std::mbrtoc8((char8_t*)0, (const char*)0, (size_t)0, (mbstate_t*)0))); -ASSERT_SAME_TYPE(std::size_t, decltype(std::c8rtomb((char*)0, (char8_t)0, (mbstate_t*)0))); diff --git a/libcxx/test/std/strings/c.strings/no_c8rtomb_mbrtoc8.verify.cpp b/libcxx/test/std/strings/c.strings/no_c8rtomb_mbrtoc8.verify.cpp new file mode 100644 index 0000000000000..1d4a225668d80 --- /dev/null +++ b/libcxx/test/std/strings/c.strings/no_c8rtomb_mbrtoc8.verify.cpp @@ -0,0 +1,30 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03 + +#include + +#include "test_macros.h" + +// When C++ char8_t support is not enabled, definitions of these functions that +// match the C2X declarations may still be present in the global namespace with +// a char8_t typedef substituted for the C++ char8_t type. If so, these are not +// the declarations we are looking for, so don't test for them. 
+#if !defined(TEST_HAS_NO_CHAR8_T) +using U = decltype(::c8rtomb); +using V = decltype(::mbrtoc8); +# if !_LIBCPP_HAS_C8RTOMB_MBRTOC8 +// expected-error@-3 {{no member named 'c8rtomb' in the global namespace}} +// expected-error@-3 {{no member named 'mbrtoc8' in the global namespace}} +# else +// expected-no-diagnostics +# endif +#else +// expected-no-diagnostics +#endif diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py index 1668e4ab01d75..7d6e78de343c5 100644 --- a/libcxx/utils/libcxx/test/features.py +++ b/libcxx/utils/libcxx/test/features.py @@ -293,23 +293,6 @@ def _mingwSupportsModules(cfg): """, ), ), - # Check for Glibc < 2.36, where there was no support for char8_t functions - Feature( - name="glibc-no-char8_t-support", - when=lambda cfg: "__GLIBC__" in compilerMacros(cfg) - and not sourceBuilds( - cfg, - """ - #include - #include - int main(void) { - char8_t c; - mbstate_t s = {0}; - return mbrtoc8(&c, "", 0, &s); - } - """, - ), - ), Feature( name="has-unix-headers", when=lambda cfg: sourceBuilds( From 07372fcf6c687208b72df96e763de494fc32ffc0 Mon Sep 17 00:00:00 2001 From: Nishant Patel Date: Mon, 27 Oct 2025 09:09:33 -0700 Subject: [PATCH 02/32] [MLIR][XeGPU] Remove leading unit dims from vector ops before unrolling (#165030) This PR uses the upstream populateCastAwayVectorLeadingOneDimPatterns to remove leading unit dims from vector ops and then do the unrolling/blocking --- .../XeGPU/Transforms/XeGPUBlocking.cpp | 7 +++ mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 59 +++++++++++++++---- 2 files changed, 53 insertions(+), 13 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 2c37140ad9c76..ec5feb8bc8c4a 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -344,6 +344,13 @@ void XeGPUBlockingPass::runOnOperation() { 
xegpu::doSCFStructuralTypeConversionWithTensorType(op, converter); + // Remove leading unit dimensions from vector ops and then + // do the unrolling. + { + RewritePatternSet patterns(ctx); + vector::populateCastAwayVectorLeadingOneDimPatterns(patterns); + (void)applyPatternsGreedily(op, std::move(patterns)); + } xegpu::UnrollOptions options; options.setFilterConstraint( [&](Operation *op) -> LogicalResult { return success(needsUnroll(op)); }); diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index 7e742af754fbe..d61908b422194 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -715,7 +715,8 @@ gpu.module @test_kernel { gpu.module @test_kernel { // CHECK-LABEL: load_store_nd_with_offsets // CHECK-SAME: [[arg0:%.+]]: memref<1024x1024xf32>, [[arg1:%.+]]: memref<1024x1024xf32>, [[arg2:%.+]]: memref<1024x1024xf32> - // CHECK-DAG: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<1x32xf32> + // CHECK-DAG: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<32xf32> + // CHECK-DAG: [[cst_0:%.+]] = arith.constant dense<0.000000e+00> : vector<1x32xf32> // CHECK-DAG: [[c16:%.+]] = arith.constant 16 : index // CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index // CHECK: [[tdesc_a:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x16xf32> @@ -723,20 +724,27 @@ gpu.module @test_kernel { // CHECK: [[tdesc_c:%.+]] = xegpu.create_nd_tdesc [[arg2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x16xf32> // CHECK: [[ld_a0:%.+]] = xegpu.load_nd [[tdesc_a]][[[c0]], [[c0]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32> // CHECK: [[ld_a1:%.+]] = xegpu.load_nd [[tdesc_a]][[[c0]], [[c16]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32> + // CHECK: [[ins_a0:%.+]] = vector.insert_strided_slice [[ld_a0]], [[cst_0]] {offsets = [0, 0], strides = [1, 1]} : vector<1x16xf32> into vector<1x32xf32> + // CHECK: [[ins_a1:%.+]] = 
vector.insert_strided_slice [[ld_a1]], [[ins_a0]] {offsets = [0, 16], strides = [1, 1]} : vector<1x16xf32> into vector<1x32xf32> // CHECK: [[ld_b0:%.+]] = xegpu.load_nd [[tdesc_b]][[[c0]], [[c0]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32> // CHECK: [[ld_b1:%.+]] = xegpu.load_nd [[tdesc_b]][[[c0]], [[c16]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32> - // CHECK: [[cast_a0:%.+]] = vector.shape_cast [[ld_a0]] : vector<1x16xf32> to vector<16xf32> - // CHECK: [[cast_b0:%.+]] = vector.shape_cast [[ld_b0]] : vector<1x16xf32> to vector<16xf32> - // CHECK: [[add0:%.+]] = arith.addf [[cast_a0]], [[cast_b0]] : vector<16xf32> - // CHECK: [[ins0:%.+]] = vector.insert_strided_slice [[add0]], [[cst]] {offsets = [0, 0], strides = [1]} : vector<16xf32> into vector<1x32xf32> - // CHECK: [[cast_a1:%.+]] = vector.shape_cast [[ld_a1]] : vector<1x16xf32> to vector<16xf32> - // CHECK: [[cast_b1:%.+]] = vector.shape_cast [[ld_b1]] : vector<1x16xf32> to vector<16xf32> - // CHECK: [[add1:%.+]] = arith.addf [[cast_a1]], [[cast_b1]] : vector<16xf32> - // CHECK: [[ins1:%.+]] = vector.insert_strided_slice [[add1]], [[ins0]] {offsets = [0, 16], strides = [1]} : vector<16xf32> into vector<1x32xf32> - // CHECK: [[ext0:%.+]] = vector.extract_strided_slice [[ins1]] {offsets = [0, 0], sizes = [1, 16], strides = [1, 1]} : vector<1x32xf32> to vector<1x16xf32> - // CHECK: [[ext1:%.+]] = vector.extract_strided_slice [[ins1]] {offsets = [0, 16], sizes = [1, 16], strides = [1, 1]} : vector<1x32xf32> to vector<1x16xf32> - // CHECK: xegpu.store_nd [[ext0]], [[tdesc_c]][[[c0]], [[c0]]] : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32> - // CHECK: xegpu.store_nd [[ext1]], [[tdesc_c]][[[c0]], [[c16]]] : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32> + // CHECK: [[ins_b0:%.+]] = vector.insert_strided_slice [[ld_b0]], [[cst_0]] {offsets = [0, 0], strides = [1, 1]} : vector<1x16xf32> into vector<1x32xf32> + // CHECK: [[ins_b1:%.+]] = vector.insert_strided_slice [[ld_b1]], [[ins_b0]] {offsets = [0, 
16], strides = [1, 1]} : vector<1x16xf32> into vector<1x32xf32> + // CHECK: [[ext_a:%.+]] = vector.extract [[ins_a1]][0] : vector<32xf32> from vector<1x32xf32> + // CHECK: [[ext_b:%.+]] = vector.extract [[ins_b1]][0] : vector<32xf32> from vector<1x32xf32> + // CHECK: [[slice_a0:%.+]] = vector.extract_strided_slice [[ext_a]] {offsets = [0], sizes = [16], strides = [1]} : vector<32xf32> to vector<16xf32> + // CHECK: [[slice_b0:%.+]] = vector.extract_strided_slice [[ext_b]] {offsets = [0], sizes = [16], strides = [1]} : vector<32xf32> to vector<16xf32> + // CHECK: [[add0:%.+]] = arith.addf [[slice_a0]], [[slice_b0]] : vector<16xf32> + // CHECK: [[ins_add0:%.+]] = vector.insert_strided_slice [[add0]], [[cst]] {offsets = [0], strides = [1]} : vector<16xf32> into vector<32xf32> + // CHECK: [[slice_a1:%.+]] = vector.extract_strided_slice [[ext_a]] {offsets = [16], sizes = [16], strides = [1]} : vector<32xf32> to vector<16xf32> + // CHECK: [[slice_b1:%.+]] = vector.extract_strided_slice [[ext_b]] {offsets = [16], sizes = [16], strides = [1]} : vector<32xf32> to vector<16xf32> + // CHECK: [[add1:%.+]] = arith.addf [[slice_a1]], [[slice_b1]] : vector<16xf32> + // CHECK: [[ins_add1:%.+]] = vector.insert_strided_slice [[add1]], [[ins_add0]] {offsets = [16], strides = [1]} : vector<16xf32> into vector<32xf32> + // CHECK: [[broadcast:%.+]] = vector.broadcast [[ins_add1]] : vector<32xf32> to vector<1x32xf32> + // CHECK: [[ext_result0:%.+]] = vector.extract_strided_slice [[broadcast]] {offsets = [0, 0], sizes = [1, 16], strides = [1, 1]} : vector<1x32xf32> to vector<1x16xf32> + // CHECK: [[ext_result1:%.+]] = vector.extract_strided_slice [[broadcast]] {offsets = [0, 16], sizes = [1, 16], strides = [1, 1]} : vector<1x32xf32> to vector<1x16xf32> + // CHECK: xegpu.store_nd [[ext_result0]], [[tdesc_c]][[[c0]], [[c0]]] : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32> + // CHECK: xegpu.store_nd [[ext_result1]], [[tdesc_c]][[[c0]], [[c16]]] : vector<1x16xf32>, 
!xegpu.tensor_desc<1x16xf32> gpu.func @load_store_nd_with_offsets(%A: memref<1024x1024xf32>, %B: memref<1024x1024xf32>, %C: memref<1024x1024xf32>) { %c0 = arith.constant 0 : index @@ -752,3 +760,28 @@ gpu.module @test_kernel { gpu.return } } + +// ----- +#inst_data = #xegpu.layout +gpu.module @test_kernel { + // CHECK-LABEL: load_add_store_leading_unit_dims + // CHECK-SAME: [[arg0:%.+]]: ui64, [[arg1:%.+]]: ui64, [[arg2:%.+]]: ui64 + // CHECK: [[mask:%.+]] = arith.constant dense : vector<32xi1> + // CHECK: [[offsets:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<32xindex> + // CHECK: [[a:%.+]] = xegpu.load [[arg0]][[[offsets]]], [[mask]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint}> : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32> + // CHECK: [[b:%.+]] = xegpu.load [[arg1]][[[offsets]]], [[mask]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint}> : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32> + // CHECK: [[add:%.+]] = arith.addf [[a]], [[b]] : vector<32xf32> + // CHECK: xegpu.store [[add]], [[arg2]][[[offsets]]], [[mask]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint}> : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1> + gpu.func @load_add_store_leading_unit_dims(%A: ui64, %B: ui64, %C: ui64) { + %cst = arith.constant {layout_result_0 = #inst_data} dense<[ + [[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]] + ]> : vector<1x1x32xindex> + %mask = arith.constant {layout_result_0 = #inst_data} dense : vector<1x1x32xi1> + %a = xegpu.load %A[%cst], %mask {chunk_size = 1, layout_result_0 = #inst_data, l1_hint = #xegpu.cache_hint} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> + %b = xegpu.load %B[%cst], %mask {chunk_size = 1, layout_result_0 = #inst_data, l1_hint = 
#xegpu.cache_hint} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> + %addf = arith.addf %a, %b {layout_result_0 = #inst_data} : vector<1x1x32xf32> + xegpu.store %addf, %C[%cst], %mask {chunk_size = 1, layout_operand_0 = #inst_data, layout_operand_2 = #inst_data, layout_operand_3 = #inst_data, l1_hint = #xegpu.cache_hint} : vector<1x1x32xf32>, ui64, vector<1x1x32xindex>, vector<1x1x32xi1> + gpu.return + } +} From c431ee7ded5fe26bb43a2eb013321c9bd340de2d Mon Sep 17 00:00:00 2001 From: Nishant Patel Date: Mon, 27 Oct 2025 09:10:46 -0700 Subject: [PATCH 03/32] [MLIR][XeGPU] Fix isEvenlyDistributable API in xegpu (#164907) --- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 7 +++++-- mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 7 +++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 24e909548fe0b..f9aa28d5203db 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -113,9 +113,12 @@ bool XeGPUDialect::isEvenlyDistributable(llvm::ArrayRef shape, if (layout.size() != shape.size()) return std::nullopt; auto ratio = computeShapeRatio(shape, layout); - if (!ratio.has_value()) + if (ratio.has_value()) { + newShape = ratio.value(); + } else if (!rr || !computeShapeRatio(layout, shape).has_value()) { return std::nullopt; - newShape = ratio.value(); + } + // Round-robin case: continue with original newShape } if (data.size()) { diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 742d11f8052ec..52acde4dffc2e 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -527,4 +527,11 @@ gpu.module @test_distribution { %cst_1 = arith.constant {layout_result_0 = #xegpu.layout} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : 
vector<1x16xindex> gpu.return } + + // CHECK-LABEL: scalar_broadcast + gpu.func @scalar_broadcast(%arg0: index) { + // CHECK: vector.broadcast {{.*}} : index to vector<1x1x1xindex> + %broadcast = vector.broadcast %arg0 {layout_result_0 = #xegpu.layout} : index to vector<4x1x1xindex> + gpu.return + } } From 430d0edb521c33e6bf6e38cd1b7a49b173ef18e7 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Mon, 27 Oct 2025 09:27:19 -0700 Subject: [PATCH 04/32] [FlowSensitive] [StatusOr] [8/N] Support value ctor and assignment Reviewers: jvoung, Xazax-hun Reviewed By: jvoung Pull Request: https://github.com/llvm/llvm-project/pull/163894 --- .../Models/UncheckedStatusOrAccessModel.cpp | 50 +++++ ...ncheckedStatusOrAccessModelTestFixture.cpp | 172 ++++++++++++++++++ 2 files changed, 222 insertions(+) diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp index 90551c22e0734..c6a680d8cf252 100644 --- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp +++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp @@ -177,6 +177,31 @@ static auto isPointerComparisonOperatorCall(std::string operator_name) { pointee(anyOf(statusOrType(), statusType()))))))); } +// The nullPointerConstant in the two matchers below is to support +// absl::StatusOr X = nullptr. +// nullptr does not match the bound type. +// TODO: be less restrictive around convertible types in general. 
+static auto isStatusOrValueAssignmentCall() { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return cxxOperatorCallExpr( + hasOverloadedOperatorName("="), + callee(cxxMethodDecl(ofClass(statusOrClass()))), + hasArgument(1, anyOf(hasType(hasUnqualifiedDesugaredType( + type(equalsBoundNode("T")))), + nullPointerConstant()))); +} + +static auto isStatusOrValueConstructor() { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return cxxConstructExpr( + hasType(statusOrType()), + hasArgument(0, + anyOf(hasType(hasCanonicalType(type(equalsBoundNode("T")))), + nullPointerConstant(), + hasType(namedDecl(hasAnyName("absl::in_place_t", + "std::in_place_t")))))); +} + static auto buildDiagnoseMatchSwitch(const UncheckedStatusOrAccessModelOptions &Options) { return CFGMatchSwitchBuildergetNumArgs() > 1); + + auto *StatusOrLoc = State.Env.get(*Expr->getArg(0)); + if (StatusOrLoc == nullptr) + return; + + auto &OkVal = initializeStatusOr(*StatusOrLoc, State.Env); + State.Env.assume(OkVal.formula()); +} + +static void transferValueConstructor(const CXXConstructExpr *Expr, + const MatchFinder::MatchResult &, + LatticeTransferState &State) { + auto &OkVal = + initializeStatusOr(State.Env.getResultObjectLocation(*Expr), State.Env); + State.Env.assume(OkVal.formula()); +} + CFGMatchSwitch buildTransferMatchSwitch(ASTContext &Ctx, CFGMatchSwitchBuilder Builder) { @@ -573,6 +619,10 @@ buildTransferMatchSwitch(ASTContext &Ctx, .CaseOfCFGStmt(isNotOkStatusCall(), transferNotOkStatusCall) .CaseOfCFGStmt(isStatusOrMemberCallWithName("emplace"), transferEmplaceCall) + .CaseOfCFGStmt(isStatusOrValueAssignmentCall(), + transferValueAssignmentCall) + .CaseOfCFGStmt(isStatusOrValueConstructor(), + transferValueConstructor) .Build(); } diff --git a/clang/unittests/Analysis/FlowSensitive/UncheckedStatusOrAccessModelTestFixture.cpp b/clang/unittests/Analysis/FlowSensitive/UncheckedStatusOrAccessModelTestFixture.cpp index 425beb939a42a..452062587ce72 100644 
--- a/clang/unittests/Analysis/FlowSensitive/UncheckedStatusOrAccessModelTestFixture.cpp +++ b/clang/unittests/Analysis/FlowSensitive/UncheckedStatusOrAccessModelTestFixture.cpp @@ -2975,6 +2975,178 @@ TEST_P(UncheckedStatusOrAccessModelTest, Emplace) { )cc"); } +TEST_P(UncheckedStatusOrAccessModelTest, ValueConstruction) { + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + STATUSOR_BOOL result = false; + result.value(); + } + )cc"); + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + STATUSOR_INT result = 21; + result.value(); + } + )cc"); + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + STATUSOR_INT result = Make(); + result.value(); // [[unsafe]] + } + )cc"); + ExpectDiagnosticsFor( + R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + STATUSOR_BOOL result = false; + if (result.ok()) + result.value(); + else + result.value(); + } + )cc"); + + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + STATUSOR_BOOL result(false); + result.value(); + } + )cc"); + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + STATUSOR_INT result(21); + result.value(); + } + )cc"); + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + STATUSOR_INT result(Make()); + result.value(); // [[unsafe]] + } + )cc"); + ExpectDiagnosticsFor( + R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + STATUSOR_BOOL result(false); + if (result.ok()) + result.value(); + else + result.value(); + } + )cc"); +} + +TEST_P(UncheckedStatusOrAccessModelTest, ValueAssignment) { + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + STATUSOR_BOOL result; + result = false; + result.value(); + } + )cc"); + 
ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + STATUSOR_INT result; + result = 21; + result.value(); + } + )cc"); + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + STATUSOR_INT result; + result = Make(); + result.value(); // [[unsafe]] + } + )cc"); + ExpectDiagnosticsFor( + R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + STATUSOR_BOOL result; + result = false; + if (result.ok()) + result.value(); + else + result.value(); + } + )cc"); +} + +TEST_P(UncheckedStatusOrAccessModelTest, NestedStatusOr) { + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + absl::StatusOr result; + result = Make(); + result.value(); + } + )cc"); + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + absl::StatusOr result = Make(); + result.value(); + } + )cc"); +} + +TEST_P(UncheckedStatusOrAccessModelTest, PtrConstruct) { + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + STATUSOR_VOIDPTR sor = nullptr; + *sor; + } + )cc"); + + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + STATUSOR_VOIDPTR sor(nullptr); + *sor; + } + )cc"); +} + +TEST_P(UncheckedStatusOrAccessModelTest, InPlaceConstruct) { + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + STATUSOR_VOIDPTR absl_sor(absl::in_place, {nullptr}); + *absl_sor; + STATUSOR_VOIDPTR std_sor(std::in_place, {nullptr}); + *std_sor; + } + )cc"); +} + } // namespace std::string From 4ed494e7282c3b36a35b8e0930fd2e14b7038167 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Mon, 27 Oct 2025 09:29:24 -0700 Subject: [PATCH 05/32] [FlowSensitive] [StatusOr] [9/N] Make sure all StatusOr are initialized This is important if the first use of a StatusOr (or Status) is in a conditional 
statement, we need a stable value for `ok` from outside of the conditional statement to make sure we don't use a different variable in every branch. Reviewers: jvoung, Xazax-hun Reviewed By: jvoung Pull Request: https://github.com/llvm/llvm-project/pull/163898 --- .../Models/UncheckedStatusOrAccessModel.cpp | 39 +++++++++++++++++++ ...ncheckedStatusOrAccessModelTestFixture.cpp | 39 +++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp index c6a680d8cf252..b42bfa3821c2e 100644 --- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp +++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp @@ -202,6 +202,16 @@ static auto isStatusOrValueConstructor() { "std::in_place_t")))))); } +static auto isStatusOrConstructor() { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return cxxConstructExpr(hasType(statusOrType())); +} + +static auto isStatusConstructor() { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return cxxConstructExpr(hasType(statusType())); +} + static auto buildDiagnoseMatchSwitch(const UncheckedStatusOrAccessModelOptions &Options) { return CFGMatchSwitchBuilder buildTransferMatchSwitch(ASTContext &Ctx, CFGMatchSwitchBuilder Builder) { @@ -623,6 +652,16 @@ buildTransferMatchSwitch(ASTContext &Ctx, transferValueAssignmentCall) .CaseOfCFGStmt(isStatusOrValueConstructor(), transferValueConstructor) + // N.B. These need to come after all other CXXConstructExpr. + // These are there to make sure that every Status and StatusOr object + // have their ok boolean initialized when constructed. If we were to + // lazily initialize them when we first access them, we can produce + // false positives if that first access is in a control flow statement. + // You can comment out these two constructors and see tests fail. 
+ .CaseOfCFGStmt(isStatusOrConstructor(), + transferStatusOrConstructor) + .CaseOfCFGStmt(isStatusConstructor(), + transferStatusConstructor) .Build(); } diff --git a/clang/unittests/Analysis/FlowSensitive/UncheckedStatusOrAccessModelTestFixture.cpp b/clang/unittests/Analysis/FlowSensitive/UncheckedStatusOrAccessModelTestFixture.cpp index 452062587ce72..5635ff4e01d36 100644 --- a/clang/unittests/Analysis/FlowSensitive/UncheckedStatusOrAccessModelTestFixture.cpp +++ b/clang/unittests/Analysis/FlowSensitive/UncheckedStatusOrAccessModelTestFixture.cpp @@ -3147,6 +3147,45 @@ TEST_P(UncheckedStatusOrAccessModelTest, InPlaceConstruct) { )cc"); } +TEST_P(UncheckedStatusOrAccessModelTest, ConstructStatusOrFromReference) { + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + void target() { + const auto sor1 = Make(); + const auto sor2 = Make(); + if (!sor1.ok() && !sor2.ok()) return; + if (sor1.ok() && !sor2.ok()) { + } else if (!sor1.ok() && sor2.ok()) { + } else { + sor1.value(); + sor2.value(); + } + } + )cc"); +} + +TEST_P(UncheckedStatusOrAccessModelTest, ConstructStatusFromReference) { + ExpectDiagnosticsFor(R"cc( +#include "unchecked_statusor_access_test_defs.h" + + void target() { + const auto sor1 = Make(); + const auto sor2 = Make(); + const auto s1 = Make(); + const auto s2 = Make(); + + if (!s1.ok() && !s2.ok()) return; + if (s1.ok() && !s2.ok()) { + } else if (!s1.ok() && s2.ok()) { + } else { + if (s1 != sor1.status() || s2 != sor2.status()) return; + sor1.value(); + sor2.value(); + } + } + )cc"); +} + } // namespace std::string From 9a0aa922ed3e0accc2d2fbfffa619e249a7c84ac Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Mon, 27 Oct 2025 09:48:03 -0700 Subject: [PATCH 06/32] [LLDB] Disable rosetta test on green dragon --- .../API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py 
b/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py index 0f40dfd09c958..4516c9b58ba37 100644 --- a/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py +++ b/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py @@ -18,6 +18,11 @@ def apple_silicon(): def rosetta_debugserver_installed(): + import platform + version = platform.mac_ver() + # Workaround for an undiagnosed problem on green dragon. + if version[0] == '15' and version[1] == '5': + return False return exists("/Library/Apple/usr/libexec/oah/debugserver") From 128214f3b2a4b470a4b45f9b2eece7a439d795d7 Mon Sep 17 00:00:00 2001 From: Tobias Stadler Date: Mon, 27 Oct 2025 09:54:38 -0700 Subject: [PATCH 07/32] [llvm-remarkutil] Introduce summary tool (#160549) This tool provides a harness for implementing different strategies that summarize many remarks (possibly from multiple translation units) into new summary remarks. The remark summaries can then be viewed using tools like `opt-viewer`. The first summary strategy is `--inline-callees`, which generates remarks that summarize the per-callee inline statistics for functions that appear in inlining remarks. This is useful for troubleshooting inlining issues/regressions on large codebases.
Pull Request: https://github.com/llvm/llvm-project/pull/160549 --- llvm/include/llvm/Remarks/Remark.h | 21 +- llvm/lib/Remarks/Remark.cpp | 14 +- .../broken-bitstream-remark-magic.test | 1 + .../broken-bitstream-remark.test | 1 + .../llvm-remarkutil/broken-yaml-remark.test | 1 + .../tools/llvm-remarkutil/empty-file.test | 9 +- .../summary/Inputs/inline.yaml | 50 ++++ .../tools/llvm-remarkutil/summary/inline.test | 54 ++++ llvm/tools/llvm-remarkutil/CMakeLists.txt | 1 + llvm/tools/llvm-remarkutil/RemarkCounter.cpp | 6 +- llvm/tools/llvm-remarkutil/RemarkSummary.cpp | 254 ++++++++++++++++++ .../tools/llvm-remarkutil/RemarkUtilHelpers.h | 51 ++++ 12 files changed, 445 insertions(+), 18 deletions(-) create mode 100644 llvm/test/tools/llvm-remarkutil/summary/Inputs/inline.yaml create mode 100644 llvm/test/tools/llvm-remarkutil/summary/inline.test create mode 100644 llvm/tools/llvm-remarkutil/RemarkSummary.cpp diff --git a/llvm/include/llvm/Remarks/Remark.h b/llvm/include/llvm/Remarks/Remark.h index 8c8ca769c7d16..663af6302d6ff 100644 --- a/llvm/include/llvm/Remarks/Remark.h +++ b/llvm/include/llvm/Remarks/Remark.h @@ -51,12 +51,21 @@ struct Argument { // If set, the debug location corresponding to the value. std::optional Loc; + Argument() = default; + Argument(StringRef Key, StringRef Val) : Key(Key), Val(Val) {} + /// Implement operator<< on Argument. LLVM_ABI void print(raw_ostream &OS) const; - /// Return the value of argument as int. - LLVM_ABI std::optional getValAsInt() const; - /// Check if the argument value can be parsed as int. - LLVM_ABI bool isValInt() const; + + /// Return the value of argument as an integer of type T. + template + std::optional getValAsInt(unsigned Radix = 10) const { + StringRef Str = Val; + T Res; + if (Str.consumeInteger(Radix, Res) || !Str.empty()) + return std::nullopt; + return Res; + } }; // Create wrappers for C Binding types (see CBindingWrapping.h). 
@@ -127,6 +136,10 @@ struct Remark { /// Return a message composed from the arguments as a string. LLVM_ABI std::string getArgsAsMsg() const; + /// Return the first argument with the specified key or nullptr if no such + /// argument was found. + LLVM_ABI Argument *getArgByKey(StringRef Key); + /// Clone this remark to explicitly ask for a copy. Remark clone() const { return *this; } diff --git a/llvm/lib/Remarks/Remark.cpp b/llvm/lib/Remarks/Remark.cpp index 0e98cad8e9045..09f24e93255e0 100644 --- a/llvm/lib/Remarks/Remark.cpp +++ b/llvm/lib/Remarks/Remark.cpp @@ -13,6 +13,7 @@ #include "llvm/Remarks/Remark.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include using namespace llvm; @@ -26,16 +27,13 @@ std::string Remark::getArgsAsMsg() const { return Str; } -/// Returns the value of a specified key parsed from StringRef. -std::optional Argument::getValAsInt() const { - APInt KeyVal; - if (Val.getAsInteger(10, KeyVal)) - return std::nullopt; - return KeyVal.getSExtValue(); +Argument *Remark::getArgByKey(StringRef Key) { + auto *It = find_if(Args, [&](auto &Arg) { return Arg.Key == Key; }); + if (It == Args.end()) + return nullptr; + return &*It; } -bool Argument::isValInt() const { return getValAsInt().has_value(); } - void RemarkLocation::print(raw_ostream &OS) const { OS << "{ " << "File: " << SourceFilePath << ", Line: " << SourceLine diff --git a/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark-magic.test b/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark-magic.test index c21dbd72a2a18..9d64201cc071e 100644 --- a/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark-magic.test +++ b/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark-magic.test @@ -3,5 +3,6 @@ RUN: not llvm-remarkutil instruction-mix %p/Inputs/broken-remark-magic.bitstream RUN: not llvm-remarkutil annotation-count --annotation-type=remark %p/Inputs/broken-remark-magic.bitstream -o - 2>&1 | FileCheck %s RUN: not 
llvm-remarkutil count %p/Inputs/broken-remark-magic.bitstream -o - 2>&1 | FileCheck %s RUN: not llvm-remarkutil filter %p/Inputs/broken-remark-magic.bitstream -o - 2>&1 | FileCheck %s +RUN: not llvm-remarkutil summary %p/Inputs/broken-remark-magic.bitstream -o - 2>&1 | FileCheck %s CHECK: error: Automatic detection of remark format failed. Unknown magic number: '1234' diff --git a/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark.test b/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark.test index 339f082d4825b..0a668131c801c 100644 --- a/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark.test +++ b/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark.test @@ -3,5 +3,6 @@ RUN: not llvm-remarkutil instruction-count --parser=bitstream %p/Inputs/broken-r RUN: not llvm-remarkutil annotation-count --parser=bitstream --annotation-type=remark %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s RUN: not llvm-remarkutil count --parser=bitstream %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s RUN: not llvm-remarkutil filter --parser=bitstream %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s +RUN: not llvm-remarkutil summary --parser=bitstream %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s CHECK: error: Unknown magic number: expecting RMRK, got --- . 
diff --git a/llvm/test/tools/llvm-remarkutil/broken-yaml-remark.test b/llvm/test/tools/llvm-remarkutil/broken-yaml-remark.test index 9da3de4034b0f..76b2d5610d8cd 100644 --- a/llvm/test/tools/llvm-remarkutil/broken-yaml-remark.test +++ b/llvm/test/tools/llvm-remarkutil/broken-yaml-remark.test @@ -4,5 +4,6 @@ RUN: not llvm-remarkutil instruction-mix --parser=yaml %p/Inputs/broken-remark - RUN: not llvm-remarkutil annotation-count --parser=yaml --annotation-type=remark %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s RUN: not llvm-remarkutil count --parser=yaml %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s RUN: not llvm-remarkutil filter --parser=yaml %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s +RUN: not llvm-remarkutil summary --parser=yaml %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s CHECK: error: Type, Pass, Name or Function missing diff --git a/llvm/test/tools/llvm-remarkutil/empty-file.test b/llvm/test/tools/llvm-remarkutil/empty-file.test index 9b2b000e9c24b..53f04f36226a5 100644 --- a/llvm/test/tools/llvm-remarkutil/empty-file.test +++ b/llvm/test/tools/llvm-remarkutil/empty-file.test @@ -4,18 +4,21 @@ RUN: not llvm-remarkutil instruction-mix --parser=yaml %p/Inputs/empty-file -o - RUN: not llvm-remarkutil annotation-count --parser=yaml --annotation-type=remark %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --check-prefix=YAMLPARSER RUN: not llvm-remarkutil count --parser=yaml %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --check-prefix=YAMLPARSER RUN: not llvm-remarkutil filter --parser=yaml %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --check-prefix=YAMLPARSER +RUN: not llvm-remarkutil summary --parser=yaml %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --check-prefix=YAMLPARSER RUN: llvm-remarkutil bitstream2yaml %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=BITSTREAM2YAML RUN: llvm-remarkutil instruction-count --parser=bitstream %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=SIZEBITSTREAM 
RUN: llvm-remarkutil instruction-mix --parser=bitstream %p/Inputs/empty-file --report_style=csv -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=MIXBITSTREAM RUN: llvm-remarkutil annotation-count --parser=bitstream --annotation-type=remark %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=ANNOTATIONBITSTREAM RUN: llvm-remarkutil count --parser=bitstream %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=COUNTBITSTREAM -RUN: llvm-remarkutil filter --parser=bitstream %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=FILTERBITSTREAM +RUN: llvm-remarkutil filter --parser=bitstream %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=EMPTYBITSTREAM +RUN: llvm-remarkutil summary --parser=bitstream %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=EMPTYBITSTREAM ; Parser format auto-detection should treat empty files as bitstream files RUN: llvm-remarkutil instruction-count %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=SIZEBITSTREAM RUN: llvm-remarkutil instruction-mix %p/Inputs/empty-file --report_style=csv -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=MIXBITSTREAM RUN: llvm-remarkutil annotation-count --annotation-type=remark %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=ANNOTATIONBITSTREAM RUN: llvm-remarkutil count %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=COUNTBITSTREAM -RUN: llvm-remarkutil filter %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=FILTERBITSTREAM +RUN: llvm-remarkutil filter %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=EMPTYBITSTREAM +RUN: llvm-remarkutil summary %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=EMPTYBITSTREAM ; YAMLPARSER: error: document root is not of mapping type. 
@@ -34,4 +37,4 @@ RUN: llvm-remarkutil filter %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allo ; MIXBITSTREAM-LABEL: Instruction,Count ; MIXBITSTREAM-EMPTY: -; FILTERBITSTREAM-NOT: {{.}} +; EMPTYBITSTREAM-NOT: {{.}} diff --git a/llvm/test/tools/llvm-remarkutil/summary/Inputs/inline.yaml b/llvm/test/tools/llvm-remarkutil/summary/Inputs/inline.yaml new file mode 100644 index 0000000000000..efb8cd6ecf5a9 --- /dev/null +++ b/llvm/test/tools/llvm-remarkutil/summary/Inputs/inline.yaml @@ -0,0 +1,50 @@ +--- !Missed +Pass: inline +Name: TooCostly +DebugLoc: { File: 'foo.cpp', Line: 21, Column: 6 } +Function: fooCaller +Args: + - Callee: fooCallee + DebugLoc: { File: 'foo.cpp', Line: 10, Column: 0 } + - Caller: fooCaller + DebugLoc: { File: 'foo.cpp', Line: 20, Column: 0 } + - Cost: '125' + - Threshold: '100' +... +--- !Passed +Pass: inline +Name: Inlined +DebugLoc: { File: 'foo.cpp', Line: 21, Column: 6 } +Function: fooCaller2 +Args: + - Callee: fooCallee + DebugLoc: { File: 'foo.cpp', Line: 10, Column: 0 } + - Caller: fooCaller + DebugLoc: { File: 'foo.cpp', Line: 20, Column: 0 } + - Cost: '-15' + - Threshold: '100' + - Line: '1' + - Column: '6' +... +--- !Passed +Pass: inline +Name: AlwaysInline +DebugLoc: { File: 'bar.cpp', Line: 23, Column: 10 } +Function: barCaller +Args: + - Callee: barCallee + DebugLoc: { File: 'bar.cpp', Line: 5, Column: 0 } + - Caller: barCaller + DebugLoc: { File: 'bar.cpp', Line: 22, Column: 0 } + - Reason: always inline attribute + - Line: '23' + - Column: '10' +... +--- !Missed +Pass: inline +Name: NoDefinition +Function: bazCaller +Args: + - Callee: bazCallee + - Caller: bazCaller +... 
diff --git a/llvm/test/tools/llvm-remarkutil/summary/inline.test b/llvm/test/tools/llvm-remarkutil/summary/inline.test new file mode 100644 index 0000000000000..57473186e63e3 --- /dev/null +++ b/llvm/test/tools/llvm-remarkutil/summary/inline.test @@ -0,0 +1,54 @@ +RUN: llvm-remarkutil summary --inline-callees %p/Inputs/inline.yaml | FileCheck -strict-whitespace %s + +; CHECK: --- !Analysis +; CHECK-NEXT: Pass: inline +; CHECK-NEXT: Name: Summary +; CHECK-NEXT: DebugLoc: { File: bar.cpp, Line: 5, Column: 0 } +; CHECK-NEXT: Function: barCallee +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: 'Incoming Calls (' +; CHECK-NEXT: - String: AlwaysInline +; CHECK-NEXT: - String: ': ' +; CHECK-NEXT: - AlwaysInline: '1' +; CHECK-NEXT: - String: ')' +; CHECK-NEXT: ... +; CHECK-NEXT: --- !Analysis +; CHECK-NEXT: Pass: inline +; CHECK-NEXT: Name: Summary +; CHECK-NEXT: Function: bazCallee +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: 'Incoming Calls (' +; CHECK-NEXT: - String: NoDefinition +; CHECK-NEXT: - String: ': ' +; CHECK-NEXT: - NoDefinition: '1' +; CHECK-NEXT: - String: ')' +; CHECK-NEXT: ... 
+; CHECK-NEXT: --- !Analysis +; CHECK-NEXT: Pass: inline +; CHECK-NEXT: Name: Summary +; CHECK-NEXT: DebugLoc: { File: foo.cpp, Line: 10, Column: 0 } +; CHECK-NEXT: Function: fooCallee +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: 'Incoming Calls (' +; CHECK-NEXT: - String: Inlined +; CHECK-NEXT: - String: ': ' +; CHECK-NEXT: - Inlined: '1' +; CHECK-NEXT: - String: ', ' +; CHECK-NEXT: - String: TooCostly +; CHECK-NEXT: - String: ': ' +; CHECK-NEXT: - TooCostly: '1' +; CHECK-NEXT: - String: ')' +; CHECK-NEXT: - String: "\nLeast profitable (cost=" +; CHECK-NEXT: - LeastProfitCost: '125' +; CHECK-NEXT: DebugLoc: { File: foo.cpp, Line: 21, Column: 6 } +; CHECK-NEXT: - String: ', threshold=' +; CHECK-NEXT: - LeastProfitThreshold: '100' +; CHECK-NEXT: - String: ')' +; CHECK-NEXT: - String: "\nMost profitable (cost=" +; CHECK-NEXT: - MostProfitCost: '-15' +; CHECK-NEXT: DebugLoc: { File: foo.cpp, Line: 21, Column: 6 } +; CHECK-NEXT: - String: ', threshold=' +; CHECK-NEXT: - MostProfitThreshold: '100' +; CHECK-NEXT: - String: ')' +; CHECK-NEXT: ... +; CHECK-NOT: {{.}} diff --git a/llvm/tools/llvm-remarkutil/CMakeLists.txt b/llvm/tools/llvm-remarkutil/CMakeLists.txt index c6e9334d87c04..3f0a4360266e1 100644 --- a/llvm/tools/llvm-remarkutil/CMakeLists.txt +++ b/llvm/tools/llvm-remarkutil/CMakeLists.txt @@ -11,6 +11,7 @@ add_llvm_tool(llvm-remarkutil RemarkFilter.cpp RemarkInstructionMix.cpp RemarkSizeDiff.cpp + RemarkSummary.cpp RemarkUtil.cpp RemarkUtilHelpers.cpp RemarkUtilRegistry.cpp diff --git a/llvm/tools/llvm-remarkutil/RemarkCounter.cpp b/llvm/tools/llvm-remarkutil/RemarkCounter.cpp index 2e842c8c2d72e..4e429b75e3c2d 100644 --- a/llvm/tools/llvm-remarkutil/RemarkCounter.cpp +++ b/llvm/tools/llvm-remarkutil/RemarkCounter.cpp @@ -70,11 +70,11 @@ static cl::opt GroupByOpt( /// integer value or 0 if it is has no integer value. 
static unsigned getValForKey(StringRef Key, const Remark &Remark) { auto *RemarkArg = find_if(Remark.Args, [&Key](const Argument &Arg) { - return Arg.Key == Key && Arg.isValInt(); + return Arg.Key == Key && Arg.getValAsInt(); }); if (RemarkArg == Remark.Args.end()) return 0; - return *RemarkArg->getValAsInt(); + return *RemarkArg->getValAsInt(); } Error ArgumentCounter::getAllMatchingArgumentsInRemark( @@ -91,7 +91,7 @@ Error ArgumentCounter::getAllMatchingArgumentsInRemark( continue; for (auto &Key : Arguments) { for (Argument Arg : Remark.Args) - if (Key.match(Arg.Key) && Arg.isValInt()) + if (Key.match(Arg.Key) && Arg.getValAsInt()) ArgumentSetIdxMap.insert({Arg.Key, ArgumentSetIdxMap.size()}); } } diff --git a/llvm/tools/llvm-remarkutil/RemarkSummary.cpp b/llvm/tools/llvm-remarkutil/RemarkSummary.cpp new file mode 100644 index 0000000000000..124bd51720d17 --- /dev/null +++ b/llvm/tools/llvm-remarkutil/RemarkSummary.cpp @@ -0,0 +1,254 @@ +//===- RemarkSummary.cpp --------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Specialized tool to summarize remarks +// +//===----------------------------------------------------------------------===// + +#include "RemarkUtilHelpers.h" +#include "RemarkUtilRegistry.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/Regex.h" +#include "llvm/Support/WithColor.h" +#include + +using namespace llvm; +using namespace remarks; +using namespace llvm::remarkutil; + +namespace summary { + +static cl::SubCommand + SummarySub("summary", "Summarize remarks using different strategies."); + +INPUT_FORMAT_COMMAND_LINE_OPTIONS(SummarySub) +OUTPUT_FORMAT_COMMAND_LINE_OPTIONS(SummarySub) +INPUT_OUTPUT_COMMAND_LINE_OPTIONS(SummarySub) + +static cl::OptionCategory SummaryStrategyCat("Strategy options"); + +enum class KeepMode { None, Used, All }; + +static cl::opt KeepInputOpt( + "keep", cl::desc("Keep input remarks in output"), cl::init(KeepMode::None), + cl::values(clEnumValN(KeepMode::None, "none", + "Don't keep input remarks (default)"), + clEnumValN(KeepMode::Used, "used", + "Keep only remarks used for summary"), + clEnumValN(KeepMode::All, "all", "Keep all input remarks")), + cl::sub(SummarySub)); + +static cl::opt + IgnoreMalformedOpt("ignore-malformed", + cl::desc("Ignore remarks that fail to process"), + cl::init(false), cl::Hidden, cl::sub(SummarySub)); + +// Use one cl::opt per Strategy, because future strategies might need to take +// per-strategy parameters. +static cl::opt EnableInlineSummaryOpt( + "inline-callees", cl::desc("Summarize per-callee inling statistics"), + cl::cat(SummaryStrategyCat), cl::init(false), cl::sub(SummarySub)); + +/// An interface to implement different strategies for creating remark +/// summaries. 
Override this class to develop new strategies. +class SummaryStrategy { +public: + virtual ~SummaryStrategy() = default; + + /// Strategy should return true if it wants to process the remark \p R. + virtual bool filter(Remark &R) = 0; + + /// Hook to process the remark \p R (i.e. collect the necessary data for + /// producing summary remarks). This will only be called with remarks + /// accepted by filter(). Can return an error if \p R is malformed or + /// unexpected. + virtual Error process(Remark &R) = 0; + + /// Hook to emit new remarks based on the collected data. + virtual void emit(RemarkSerializer &Serializer) = 0; +}; + +/// Check if any summary strategy options are explicitly enabled. +static bool isAnyStrategyRequested() { + StringMap Opts = cl::getRegisteredOptions(SummarySub); + for (auto &[_, Opt] : Opts) { + if (!is_contained(Opt->Categories, &SummaryStrategyCat)) + continue; + if (!Opt->getNumOccurrences()) + continue; + return true; + } + return false; +} + +class InlineCalleeSummary : public SummaryStrategy { + struct CallsiteCost { + int Cost = 0; + int Threshold = 0; + std::optional Loc; + + int getProfit() const { return Threshold - Cost; } + + friend bool operator==(const CallsiteCost &A, const CallsiteCost &B) { + return A.Cost == B.Cost && A.Threshold == B.Threshold && A.Loc == B.Loc; + } + + friend bool operator!=(const CallsiteCost &A, const CallsiteCost &B) { + return !(A == B); + } + }; + + struct CalleeSummary { + SmallDenseMap Stats; + std::optional Loc; + std::optional LeastProfit; + std::optional MostProfit; + + void updateCost(CallsiteCost NewCost) { + if (!LeastProfit || NewCost.getProfit() < LeastProfit->getProfit()) + LeastProfit = NewCost; + if (!MostProfit || NewCost.getProfit() > MostProfit->getProfit()) + MostProfit = NewCost; + } + }; + + DenseMap Callees; + + Error malformed() { return createStringError("Malformed inline remark."); } + + bool filter(Remark &R) override { + return R.PassName == "inline" && R.RemarkName != 
"Summary"; + } + + Error process(Remark &R) override { + auto *CalleeArg = R.getArgByKey("Callee"); + if (!CalleeArg) + return Error::success(); + auto &Callee = Callees[CalleeArg->Val]; + ++Callee.Stats[R.RemarkName]; + if (!Callee.Loc) + Callee.Loc = CalleeArg->Loc; + + Argument *CostArg = R.getArgByKey("Cost"); + Argument *ThresholdArg = R.getArgByKey("Threshold"); + if (!CostArg || !ThresholdArg) + return Error::success(); + auto CostVal = CostArg->getValAsInt(); + auto ThresholdVal = ThresholdArg->getValAsInt(); + if (!CostVal || !ThresholdVal) + return malformed(); + Callee.updateCost({*CostVal, *ThresholdVal, R.Loc}); + return Error::success(); + } + + void emit(RemarkSerializer &Serializer) override { + SmallVector SortedKeys(Callees.keys()); + llvm::sort(SortedKeys); + for (StringRef K : SortedKeys) { + auto &V = Callees[K]; + RemarkBuilder RB(Type::Analysis, "inline", "Summary", K); + if (V.Stats.empty()) + continue; + RB.R.Loc = V.Loc; + RB << "Incoming Calls ("; + SmallVector StatKeys(V.Stats.keys()); + llvm::sort(StatKeys); + bool First = true; + for (StringRef StatK : StatKeys) { + if (!First) + RB << ", "; + RB << StatK << ": " << NV(StatK, V.Stats[StatK]); + First = false; + } + RB << ")"; + if (V.LeastProfit && V.MostProfit != V.LeastProfit) { + RB << "\nLeast profitable (cost=" + << NV("LeastProfitCost", V.LeastProfit->Cost, V.LeastProfit->Loc) + << ", threshold=" + << NV("LeastProfitThreshold", V.LeastProfit->Threshold) << ")"; + } + if (V.MostProfit) { + RB << "\nMost profitable (cost=" + << NV("MostProfitCost", V.MostProfit->Cost, V.MostProfit->Loc) + << ", threshold=" + << NV("MostProfitThreshold", V.MostProfit->Threshold) << ")"; + } + Serializer.emit(RB.R); + } + } +}; + +static Error trySummary() { + auto MaybeBuf = getInputMemoryBuffer(InputFileName); + if (!MaybeBuf) + return MaybeBuf.takeError(); + auto MaybeParser = createRemarkParser(InputFormat, (*MaybeBuf)->getBuffer()); + if (!MaybeParser) + return MaybeParser.takeError(); + auto 
&Parser = **MaybeParser; + + Format SerializerFormat = + getSerializerFormat(OutputFileName, OutputFormat, Parser.ParserFormat); + + auto MaybeOF = getOutputFileForRemarks(OutputFileName, SerializerFormat); + if (!MaybeOF) + return MaybeOF.takeError(); + auto OF = std::move(*MaybeOF); + + auto MaybeSerializer = createRemarkSerializer(SerializerFormat, OF->os()); + if (!MaybeSerializer) + return MaybeSerializer.takeError(); + auto &Serializer = **MaybeSerializer; + + bool UseDefaultStrategies = !isAnyStrategyRequested(); + SmallVector> Strategies; + if (EnableInlineSummaryOpt || UseDefaultStrategies) + Strategies.push_back(std::make_unique()); + + auto MaybeRemark = Parser.next(); + for (; MaybeRemark; MaybeRemark = Parser.next()) { + Remark &Remark = **MaybeRemark; + bool UsedRemark = false; + for (auto &Strategy : Strategies) { + if (!Strategy->filter(Remark)) + continue; + UsedRemark = true; + if (auto E = Strategy->process(Remark)) { + if (IgnoreMalformedOpt) { + WithColor::warning() << "Ignored error: " << E << "\n"; + consumeError(std::move(E)); + continue; + } + return E; + } + } + if (KeepInputOpt == KeepMode::All || + (KeepInputOpt == KeepMode::Used && UsedRemark)) + Serializer.emit(Remark); + } + + auto E = MaybeRemark.takeError(); + if (!E.isA()) + return E; + consumeError(std::move(E)); + + for (auto &Strategy : Strategies) + Strategy->emit(Serializer); + + OF->keep(); + return Error::success(); +} + +static CommandRegistration SummaryReg(&SummarySub, trySummary); + +} // namespace summary diff --git a/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h b/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h index 73867fe35f06c..39e7b423c4dc0 100644 --- a/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h +++ b/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h @@ -9,6 +9,7 @@ // Helpers for remark utilites // //===----------------------------------------------------------------------===// +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include 
"llvm/Remarks/Remark.h" #include "llvm/Remarks/RemarkFormat.h" @@ -19,6 +20,7 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Regex.h" +#include "llvm/Support/StringSaver.h" #include "llvm/Support/ToolOutputFile.h" // Keep input + output help + names consistent across the various modes via a @@ -205,5 +207,54 @@ struct Filters { bool filterRemark(const Remark &Remark); }; +/// Helper to construct Remarks using an API similar to DiagnosticInfo. +/// Once this is more fully featured, consider implementing DiagnosticInfo using +/// RemarkBuilder. +class RemarkBuilder { + BumpPtrAllocator Alloc; + UniqueStringSaver Strs; + +public: + Remark R; + struct Argument { + std::string Key; + std::string Val; + std::optional Loc; + Argument(StringRef Key, StringRef Val, + std::optional Loc = std::nullopt) + : Key(Key), Val(Val), Loc(Loc) {} + Argument(StringRef Key, int Val, + std::optional Loc = std::nullopt) + : Key(Key), Val(itostr(Val)), Loc(Loc) {} + }; + + RemarkBuilder(Type RemarkType, StringRef PassName, StringRef RemarkName, + StringRef FunctionName) + : Strs(Alloc) { + R.RemarkType = RemarkType; + R.PassName = Strs.save(PassName); + R.RemarkName = Strs.save(RemarkName); + R.FunctionName = Strs.save(FunctionName); + } + + RemarkBuilder &operator<<(Argument &&Arg) { + auto &RArg = R.Args.emplace_back(Strs.save(Arg.Key), Strs.save(Arg.Val)); + RArg.Loc = Arg.Loc; + return *this; + } + + RemarkBuilder &operator<<(const char *Str) { + R.Args.emplace_back("String", Str); + return *this; + } + + RemarkBuilder &operator<<(StringRef Str) { + R.Args.emplace_back("String", Strs.save(Str)); + return *this; + } +}; + +using NV = RemarkBuilder::Argument; + } // namespace remarks } // namespace llvm From 585b6e2d449e767d41a813e285a8a8d38fb77ea6 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Mon, 27 Oct 2025 17:56:05 +0100 Subject: [PATCH 08/32] [flang][OpenMP] Allocate `allocatable` init temps on the stack for GPUs (#164761) 
Temps needed for the allocatable reduction/privatization init regions are now allocated on the heap all the time. However, this is a performance killer for GPUs since malloc calls are prohibitively expensive. Therefore, we should do these allocations on the stack for GPU reductions. This is similar to what we do for arrays. Additionally, I am working on getting reductions-by-ref to work on GPUs which is a bit of a challenge given the many involved steps (e.g. intra-warp and inter-warp reductions, shuffling data from remote lanes, ...). But this is a prerequisite step. --- .../Lower/Support/PrivateReductionUtils.cpp | 35 +++--- .../target-private-allocatable.f90 | 107 +++++++++++------- 2 files changed, 88 insertions(+), 54 deletions(-) diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp index d433ce367d259..c6c428860bca1 100644 --- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp +++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp @@ -376,6 +376,8 @@ class PopulateInitAndCleanupRegionsHelper { loadedMoldArg = builder.loadIfRef(loc, moldArg); return loadedMoldArg; } + + bool shouldAllocateTempOnStack() const; }; } // namespace @@ -438,8 +440,14 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar( builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front()); } - mlir::Value valAlloc = builder.createHeapTemporary(loc, innerTy, /*name=*/{}, - /*shape=*/{}, lenParams); + bool shouldAllocateOnStack = shouldAllocateTempOnStack(); + mlir::Value valAlloc = + (shouldAllocateOnStack) + ? 
builder.createTemporary(loc, innerTy, /*name=*/{}, + /*shape=*/{}, lenParams) + : builder.createHeapTemporary(loc, innerTy, /*name=*/{}, + /*shape=*/{}, lenParams); + if (scalarInitValue) builder.createStoreWithConvert(loc, scalarInitValue, valAlloc); mlir::Value box = fir::EmboxOp::create(builder, loc, valType, valAlloc, @@ -451,8 +459,9 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar( fir::StoreOp lastOp = fir::StoreOp::create(builder, loc, box, allocatedPrivVarArg); - createCleanupRegion(converter, loc, argType, cleanupRegion, sym, - isDoConcurrent); + if (!shouldAllocateOnStack) + createCleanupRegion(converter, loc, argType, cleanupRegion, sym, + isDoConcurrent); if (ifUnallocated) builder.setInsertionPointAfter(ifUnallocated); @@ -462,6 +471,14 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar( createYield(allocatedPrivVarArg); } +bool PopulateInitAndCleanupRegionsHelper::shouldAllocateTempOnStack() const { + // On the GPU, always allocate on the stack since heap allocatins are very + // expensive. + auto offloadMod = + llvm::dyn_cast(*builder.getModule()); + return offloadMod && offloadMod.getIsGPU(); +} + void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray( fir::BaseBoxType boxTy, bool needsInitialization) { bool isAllocatableOrPointer = @@ -504,15 +521,7 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray( // Allocating on the heap in case the whole reduction/privatization is nested // inside of a loop auto temp = [&]() { - bool shouldAllocateOnStack = false; - - // On the GPU, always allocate on the stack since heap allocatins are very - // expensive. 
- if (auto offloadMod = llvm::dyn_cast( - *builder.getModule())) - shouldAllocateOnStack = offloadMod.getIsGPU(); - - if (shouldAllocateOnStack) + if (shouldAllocateTempOnStack()) return createStackTempFromMold(loc, builder, source); auto [temp, needsDealloc] = createTempFromMold(loc, builder, source); diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 index 3d93fbc6e446e..272f34fc0fd1a 100644 --- a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 +++ b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 @@ -1,9 +1,22 @@ ! Tests delayed privatization for `targets ... private(..)` for allocatables. ! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization-staging \ -! RUN: -o - %s 2>&1 | FileCheck %s +! RUN: -o - %s 2>&1 | FileCheck %s --check-prefix=CPU + ! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization-staging -o - %s 2>&1 \ -! RUN: | FileCheck %s +! RUN: | FileCheck %s --check-prefix=CPU + +! RUN: %if amdgpu-registered-target %{ \ +! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir \ +! RUN: -fopenmp -fopenmp-is-target-device \ +! RUN: -mmlir --enable-delayed-privatization-staging \ +! RUN: -o - %s 2>&1 | \ +! RUN: FileCheck %s --check-prefix=GPU \ +! RUN: %} + +! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization-staging \ +! RUN: -fopenmp-is-target-device -fopenmp-is-gpu -o - %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=GPU subroutine target_allocatable implicit none @@ -14,53 +27,65 @@ subroutine target_allocatable !$omp end target end subroutine target_allocatable -! CHECK-LABEL: omp.private {type = private} -! CHECK-SAME: @[[VAR_PRIVATIZER_SYM:.*]] : -! CHECK-SAME: [[DESC_TYPE:!fir.box>]] init { -! CHECK: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]): +! CPU-LABEL: omp.private {type = private} +! 
CPU-SAME: @[[VAR_PRIVATIZER_SYM:.*]] : +! CPU-SAME: [[DESC_TYPE:!fir.box>]] init { +! CPU: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]): + +! CPU-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]] +! CPU-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : ([[DESC_TYPE]]) -> !fir.heap +! CPU-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap) -> i64 +! CPU-NEXT: %[[C0:.*]] = arith.constant 0 : i64 +! CPU-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64 -! CHECK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]] -! CHECK-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : ([[DESC_TYPE]]) -> !fir.heap -! CHECK-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap) -> i64 -! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64 -! CHECK-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64 +! CPU-NEXT: fir.if %[[ALLOC_COND]] { +! CPU-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[PRIV_ARG_BOX]] : (!fir.heap) -> [[DESC_TYPE]] +! CPU-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]] +! CPU-NEXT: } else { +! CPU-NEXT: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32 +! CPU-NEXT: %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap) -> [[DESC_TYPE]] +! CPU-NEXT: fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]] +! CPU-NEXT: } -! CHECK-NEXT: fir.if %[[ALLOC_COND]] { -! CHECK-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[PRIV_ARG_BOX]] : (!fir.heap) -> [[DESC_TYPE]] -! CHECK-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]] -! CHECK-NEXT: } else { -! CHECK-NEXT: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32 -! CHECK-NEXT: %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap) -> [[DESC_TYPE]] -! CHECK-NEXT: fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]] -! CHECK-NEXT: } +! CPU-NEXT: omp.yield(%[[PRIV_ALLOC]] : [[TYPE]]) -! 
CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : [[TYPE]]) +! CPU-NEXT: } dealloc { +! CPU-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]): -! CHECK-NEXT: } dealloc { -! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]): +! CPU-NEXT: %[[PRIV_VAL:.*]] = fir.load %[[PRIV_ARG]] +! CPU-NEXT: %[[PRIV_ADDR:.*]] = fir.box_addr %[[PRIV_VAL]] +! CPU-NEXT: %[[PRIV_ADDR_I64:.*]] = fir.convert %[[PRIV_ADDR]] +! CPU-NEXT: %[[C0:.*]] = arith.constant 0 : i64 +! CPU-NEXT: %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64 -! CHECK-NEXT: %[[PRIV_VAL:.*]] = fir.load %[[PRIV_ARG]] -! CHECK-NEXT: %[[PRIV_ADDR:.*]] = fir.box_addr %[[PRIV_VAL]] -! CHECK-NEXT: %[[PRIV_ADDR_I64:.*]] = fir.convert %[[PRIV_ADDR]] -! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64 -! CHECK-NEXT: %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64 +! CPU-NEXT: fir.if %[[PRIV_NULL_COND]] { +! CPU-NEXT: fir.freemem %[[PRIV_ADDR]] +! CPU-NEXT: } -! CHECK-NEXT: fir.if %[[PRIV_NULL_COND]] { -! CHECK-NEXT: fir.freemem %[[PRIV_ADDR]] -! CHECK-NEXT: } +! CPU-NEXT: omp.yield +! CPU-NEXT: } -! CHECK-NEXT: omp.yield -! CHECK-NEXT: } +! CPU-LABEL: func.func @_QPtarget_allocatable() { -! CHECK-LABEL: func.func @_QPtarget_allocatable() { +! CPU: %[[VAR_ALLOC:.*]] = fir.alloca [[DESC_TYPE]] +! CPU-SAME: {bindc_name = "alloc_var", {{.*}}} +! CPU: %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]] +! CPU: %[[BASE_ADDR:.*]] = fir.box_offset %[[VAR_DECL]]#0 base_addr : (!fir.ref>>) -> [[MEMBER_TYPE:.*]] +! CPU: %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}} +! CPU: %[[MAP_VAR:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], [[DESC_TYPE]]) map_clauses(to) capture(ByRef) members(%[[MEMBER]] : [0] : !fir.llvm_ptr>) -> !fir.ref>> -! CHECK: %[[VAR_ALLOC:.*]] = fir.alloca [[DESC_TYPE]] -! CHECK-SAME: {bindc_name = "alloc_var", {{.*}}} -! 
CHECK: %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]] -! CHECK: %[[BASE_ADDR:.*]] = fir.box_offset %[[VAR_DECL]]#0 base_addr : (!fir.ref>>) -> [[MEMBER_TYPE:.*]] -! CHECK: %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}} -! CHECK: %[[MAP_VAR:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], [[DESC_TYPE]]) map_clauses(to) capture(ByRef) members(%[[MEMBER]] : [0] : !fir.llvm_ptr>) -> !fir.ref>> +! CPU: omp.target map_entries(%[[MAP_VAR]] -> %arg0, %[[MEMBER]] -> %arg1 : [[TYPE]], [[MEMBER_TYPE]]) private( +! CPU-SAME: @[[VAR_PRIVATIZER_SYM]] %[[VAR_DECL]]#0 -> %{{.*}} [map_idx=0] : [[TYPE]]) { -! CHECK: omp.target map_entries(%[[MAP_VAR]] -> %arg0, %[[MEMBER]] -> %arg1 : [[TYPE]], [[MEMBER_TYPE]]) private( -! CHECK-SAME: @[[VAR_PRIVATIZER_SYM]] %[[VAR_DECL]]#0 -> %{{.*}} [map_idx=0] : [[TYPE]]) { +! GPU-LABEL: omp.private {type = private} {{.*}} init { +! GPU: fir.if %{{.*}} { +! GPU-NEXT: %[[ZERO_BOX:.*]] = fir.embox %{{.*}} +! GPU-NEXT: fir.store %[[ZERO_BOX]] to %{{.*}} +! GPU-NEXT: } else { +! GPU-NOT: fir.allocmem i32 +! GPU-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32 +! GPU-NEXT: %[[PRIV_ALLOC_BOX:.*]] = fir.embox %[[PRIV_ALLOC]] +! GPU-NEXT: fir.store %[[PRIV_ALLOC_BOX]] to %{{.*}} +! GPU-NEXT: } +! GPU-NEXT: omp.yield(%{{.*}}) From defe934dd6a1bfa582cde959758f57059de25c39 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Mon, 27 Oct 2025 09:59:45 -0700 Subject: [PATCH 09/32] [InstrProf][NFC] Use -profile-correlate flag in tests (#163299) Back in https://github.com/llvm/llvm-project/pull/69493 the `-debug-info-correlate` LLVM flag was deprecated in favor of `-profile-correlate=debug-info`. Update all tests to use this new flag. 
--- .../test/profile/Darwin/instrprof-debug-info-correlate.c | 4 ++-- .../Linux/instrprof-debug-info-correlate-debuginfod.c | 2 +- .../test/profile/Linux/instrprof-debug-info-correlate.c | 6 +++--- .../profile/Linux/instrprof-show-debug-info-correlation.c | 4 ++-- .../InstrProfiling/debug-info-correlate-coverage.ll | 2 +- .../Instrumentation/InstrProfiling/debug-info-correlate.ll | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c b/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c index f347d439e2e06..46d25a4e386dc 100644 --- a/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c +++ b/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c @@ -1,5 +1,5 @@ // Value profiling is currently not supported in lightweight mode. -// RUN: %clang_pgogen -o %t -g -mllvm --debug-info-correlate -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp +// RUN: %clang_pgogen -o %t -g -mllvm --profile-correlate=debug-info -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp // RUN: env LLVM_PROFILE_FILE=%t.proflite %run %t // RUN: llvm-profdata merge -o %t.profdata --debug-info=%t.dSYM %t.proflite @@ -9,7 +9,7 @@ // RUN: diff <(llvm-profdata show --all-functions --counts %t.normal.profdata) <(llvm-profdata show --all-functions --counts %t.profdata) -// RUN: %clang_pgogen -o %t.cov -g -mllvm --debug-info-correlate -mllvm -pgo-function-entry-coverage -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp +// RUN: %clang_pgogen -o %t.cov -g -mllvm --profile-correlate=debug-info -mllvm -pgo-function-entry-coverage -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp // 
RUN: env LLVM_PROFILE_FILE=%t.cov.proflite %run %t.cov // RUN: llvm-profdata merge -o %t.cov.profdata --debug-info=%t.cov.dSYM %t.cov.proflite diff --git a/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate-debuginfod.c b/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate-debuginfod.c index 788cb31e5116c..903ead31e0f60 100644 --- a/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate-debuginfod.c +++ b/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate-debuginfod.c @@ -7,7 +7,7 @@ // RUN: llvm-profdata merge -o %t.default.profdata %t.profraw // Build with profile debuginfo correlation. -// RUN: %clang_pgogen -o %t.correlate.exe -Wl,--build-id=0x12345678 -g -gdwarf-4 -mllvm --debug-info-correlate -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp +// RUN: %clang_pgogen -o %t.correlate.exe -Wl,--build-id=0x12345678 -g -gdwarf-4 -mllvm --profile-correlate=debug-info -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp // RUN: env LLVM_PROFILE_FILE=%t.debug-info-correlate.proflite %run %t.correlate.exe // Test llvm-profdata merge profile correlation with --debuginfod option. 
diff --git a/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c b/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c index 426426d9a05a2..194f980df9681 100644 --- a/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c +++ b/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c @@ -3,7 +3,7 @@ // RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t.normal // RUN: llvm-profdata merge -o %t.normal.profdata %t.profraw -// RUN: %clang_pgogen -o %t.d4 -g -gdwarf-4 -mllvm --debug-info-correlate -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp +// RUN: %clang_pgogen -o %t.d4 -g -gdwarf-4 -mllvm --profile-correlate=debug-info -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp // RUN: env LLVM_PROFILE_FILE=%t.d4.proflite %run %t.d4 // RUN: llvm-profdata merge -o %t.d4.profdata --debug-info=%t.d4 %t.d4.proflite @@ -11,7 +11,7 @@ // RUN: llvm-profdata show --all-functions --counts %t.d4.profdata > %t.d4.dump // RUN: diff %t.normal.dump %t.d4.dump -// RUN: %clang_pgogen -o %t -g -mllvm --debug-info-correlate -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp +// RUN: %clang_pgogen -o %t -g -mllvm --profile-correlate=debug-info -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp // RUN: env LLVM_PROFILE_FILE=%t.proflite %run %t // RUN: llvm-profdata merge -o %t.profdata --debug-info=%t %t.proflite @@ -19,7 +19,7 @@ // RUN: llvm-profdata show --all-functions --counts %t.profdata > %t.prof.dump // RUN: diff %t.normal2.dump %t.prof.dump -// RUN: %clang_pgogen -o %t.cov -g -mllvm --debug-info-correlate -mllvm -pgo-function-entry-coverage -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp 
%S/../Inputs/instrprof-debug-info-correlate-foo.cpp +// RUN: %clang_pgogen -o %t.cov -g -mllvm --profile-correlate=debug-info -mllvm -pgo-function-entry-coverage -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp // RUN: env LLVM_PROFILE_FILE=%t.cov.proflite %run %t.cov // RUN: llvm-profdata merge -o %t.cov.profdata --debug-info=%t.cov %t.cov.proflite diff --git a/compiler-rt/test/profile/Linux/instrprof-show-debug-info-correlation.c b/compiler-rt/test/profile/Linux/instrprof-show-debug-info-correlation.c index 245dc79891042..93bf40f98d3ab 100644 --- a/compiler-rt/test/profile/Linux/instrprof-show-debug-info-correlation.c +++ b/compiler-rt/test/profile/Linux/instrprof-show-debug-info-correlation.c @@ -1,8 +1,8 @@ -// RUN: %clang_pgogen -o %t -g -mllvm --debug-info-correlate -mllvm --disable-vp=true %s +// RUN: %clang_pgogen -o %t -g -mllvm --profile-correlate=debug-info -mllvm --disable-vp=true %s // RUN: llvm-profdata show --debug-info=%t --detailed-summary --show-prof-sym-list | FileCheck %s // RUN: llvm-profdata show --debug-info=%t --show-format=yaml | FileCheck %s --match-full-lines --check-prefix YAML -// RUN: %clang_pgogen -o %t.no.dbg -mllvm --debug-info-correlate -mllvm --disable-vp=true %s +// RUN: %clang_pgogen -o %t.no.dbg -mllvm --profile-correlate=debug-info -mllvm --disable-vp=true %s // RUN: not llvm-profdata show --debug-info=%t.no.dbg 2>&1 | FileCheck %s --check-prefix NO-DBG // NO-DBG: unable to correlate profile: could not find any profile data metadata in correlated file diff --git a/llvm/test/Instrumentation/InstrProfiling/debug-info-correlate-coverage.ll b/llvm/test/Instrumentation/InstrProfiling/debug-info-correlate-coverage.ll index 192bac6e503a0..dd64615338170 100644 --- a/llvm/test/Instrumentation/InstrProfiling/debug-info-correlate-coverage.ll +++ b/llvm/test/Instrumentation/InstrProfiling/debug-info-correlate-coverage.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s 
-passes=instrprof -debug-info-correlate -S | opt -O2 -S | FileCheck %s +; RUN: opt < %s -passes=instrprof -profile-correlate=debug-info -S | opt -O2 -S | FileCheck %s @__profn_foo = private constant [3 x i8] c"foo" ; CHECK: @__profc_foo diff --git a/llvm/test/Instrumentation/InstrProfiling/debug-info-correlate.ll b/llvm/test/Instrumentation/InstrProfiling/debug-info-correlate.ll index fd868ead5b78d..84eaab33701a4 100644 --- a/llvm/test/Instrumentation/InstrProfiling/debug-info-correlate.ll +++ b/llvm/test/Instrumentation/InstrProfiling/debug-info-correlate.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -passes=instrprof -debug-info-correlate -S > %t.ll +; RUN: opt < %s -passes=instrprof -profile-correlate=debug-info -S > %t.ll ; RUN: FileCheck < %t.ll --implicit-check-not "{{__llvm_prf_data|__llvm_prf_names}}" %s ; RUN: %llc_dwarf -O0 -filetype=obj < %t.ll | llvm-dwarfdump - | FileCheck --implicit-check-not "{{DW_TAG|NULL}}" %s --check-prefix CHECK-DWARF From 242c716c68f2ea8e7976649b532e7008978af406 Mon Sep 17 00:00:00 2001 From: anoopkg6 Date: Mon, 27 Oct 2025 12:22:01 -0500 Subject: [PATCH 10/32] Fix Linux kernel build failure for SytemZ. (#165274) Linux kernel build fails for SystemZ as output of INLINEASM was GR32Bit general-purpose register instead of SystemZ::CC. 
--------- Co-authored-by: anoopkg6 Co-authored-by: Ulrich Weigand --- .../Target/SystemZ/SystemZISelLowering.cpp | 6 +-- .../SystemZ/inline-asm-flag-output-01.ll | 37 +++++++++++++++++++ 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index de28faf4908e9..3da720f54e6ab 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1714,7 +1714,7 @@ SystemZTargetLowering::getRegForInlineAsmConstraint( } if (Constraint[1] == '@') { if (StringRef("{@cc}").compare(Constraint) == 0) - return std::make_pair(0u, &SystemZ::GR32BitRegClass); + return std::make_pair(SystemZ::CC, &SystemZ::CCRRegClass); } } return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); @@ -1766,10 +1766,6 @@ SDValue SystemZTargetLowering::LowerAsmOutputForConstraint( OpInfo.ConstraintVT.getSizeInBits() < 8) report_fatal_error("Glue output operand is of invalid type"); - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.addLiveIn(SystemZ::CC); - if (Glue.getNode()) { Glue = DAG.getCopyFromReg(Chain, DL, SystemZ::CC, MVT::i32, Glue); Chain = Glue.getValue(1); diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll index 6b8746e05704c..a86420ef6ffa6 100644 --- a/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll +++ b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll @@ -736,3 +736,40 @@ exit: ret void } +; Test INLINEASM defines CC. 
+@wait_fence = global i32 0, align 4 +@bit_cc = global i32 0, align 4 +define void @test_inlineasm_define_cc() { +; CHECK-LABEL: test_inlineasm_define_cc: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, wait_fence@GOT +; CHECK-NEXT: chsi 0(%r1), 0 +; CHECK-NEXT: ber %r14 +; CHECK-NEXT: .LBB29_1: # %while.body.lr.ph +; CHECK-NEXT: lgrl %r1, bit_cc@GOT +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: ipm %r0 +; CHECK-NEXT: srl %r0, 28 +; CHECK-NEXT: st %r0, 0(%r1) +; CHECK-NEXT: .LBB29_2: # %while.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: j .LBB29_2 +entry: + %0 = load i32, ptr @wait_fence, align 4 + %tobool.not = icmp eq i32 %0, 0 + br i1 %tobool.not, label %while.end, label %while.body.lr.ph + +while.body.lr.ph: + %1 = tail call i32 asm "", "={@cc}"() + %2 = icmp ult i32 %1, 4 + tail call void @llvm.assume(i1 %2) + store i32 %1, ptr @bit_cc, align 4 + br label %while.body + +while.body: + br label %while.body + +while.end: + ret void +} From e903494967ad9b9a68c62f984cf53db24b4532e9 Mon Sep 17 00:00:00 2001 From: Daniel Sanders Date: Mon, 27 Oct 2025 10:25:58 -0700 Subject: [PATCH 11/32] [lldb] Fix TestVTableValue.py test_overwrite_vtable test (#164910) Some machines have read-only vtables but this test expects to overwrite them. 
Use -no_data_const to ensure the vtable is writable --- lldb/test/API/functionalities/vtable/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lldb/test/API/functionalities/vtable/Makefile b/lldb/test/API/functionalities/vtable/Makefile index 99998b20bcb05..cbd7d472fb768 100644 --- a/lldb/test/API/functionalities/vtable/Makefile +++ b/lldb/test/API/functionalities/vtable/Makefile @@ -1,3 +1,9 @@ CXX_SOURCES := main.cpp +ifeq "$(OS)" "Darwin" + # Make vtables writable for test_overwrite_vtable test + # The -no_data_const flag prevents vtables from being placed in __DATA_CONST + LD_EXTRAS := -Wl,-no_data_const +endif + include Makefile.rules From a868e7ed597ad1e4b83dc8591edb842792aafe43 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Mon, 27 Oct 2025 10:30:53 -0700 Subject: [PATCH 12/32] Revert "[LLDB] Disable rosetta test on green dragon" This reverts commit 9a0aa922ed3e0accc2d2fbfffa619e249a7c84ac. --- .../API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py b/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py index 4516c9b58ba37..0f40dfd09c958 100644 --- a/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py +++ b/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py @@ -18,11 +18,6 @@ def apple_silicon(): def rosetta_debugserver_installed(): - import platform - version = platform.mac_ver() - # Workaround for an undiagnosed problem on green dragon. 
- if version[0] == '15' and version[1] == '5': - return False return exists("/Library/Apple/usr/libexec/oah/debugserver") From 43f119baa61469c1b193c695ca22008585a0732d Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Mon, 27 Oct 2025 10:31:01 -0700 Subject: [PATCH 13/32] [LLDB] Disable rosetta test on green dragon --- .../API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py b/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py index 0f40dfd09c958..3b414ddb78b91 100644 --- a/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py +++ b/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py @@ -18,6 +18,11 @@ def apple_silicon(): def rosetta_debugserver_installed(): + import platform + version = platform.mac_ver() + # Workaround for an undiagnosed problem on green dragon. + if version[0] and version[0][0] == '15' and version[0][1] == '5': + return False return exists("/Library/Apple/usr/libexec/oah/debugserver") From 8f1c72dcd3fc4bc871bbcd06bfae76510f3714b0 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Mon, 27 Oct 2025 10:43:56 -0700 Subject: [PATCH 14/32] [lit] Support more ulimit options These are the other options used in compiler-rt that we also need to support. 
Reviewers: arichardson, petrhosek, ilovepi Reviewed By: ilovepi, arichardson Pull Request: https://github.com/llvm/llvm-project/pull/165122 --- llvm/utils/lit/lit/TestRunner.py | 4 ++++ llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py | 4 ++++ llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py | 2 ++ llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt | 2 ++ llvm/utils/lit/tests/shtest-ulimit.py | 4 ++++ 5 files changed, 16 insertions(+) diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index f88314547bb3f..a48df097403c7 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -612,6 +612,10 @@ def executeBuiltinUlimit(cmd, shenv): shenv.ulimit["RLIMIT_AS"] = new_limit * 1024 elif cmd.args[1] == "-n": shenv.ulimit["RLIMIT_NOFILE"] = new_limit + elif cmd.args[1] == "-s": + shenv.ulimit["RLIMIT_STACK"] = new_limit * 1024 + elif cmd.args[1] == "-f": + shenv.ulimit["RLIMIT_FSIZE"] = new_limit else: raise InternalShellError( cmd, "'ulimit' does not support option: %s" % cmd.args[1] diff --git a/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py b/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py index 33d2d59ff0dbe..a9dc2595497e7 100644 --- a/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py +++ b/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py @@ -17,6 +17,10 @@ def main(argv): resource.setrlimit(resource.RLIMIT_AS, limit) elif limit_str == "RLIMIT_NOFILE": resource.setrlimit(resource.RLIMIT_NOFILE, limit) + elif limit_str == "RLIMIT_STACK": + resource.setrlimit(resource.RLIMIT_STACK, limit) + elif limit_str == "RLIMIT_FSIZE": + resource.setrlimit(resource.RLIMIT_FSIZE, limit) process_output = subprocess.run(command_args) sys.exit(process_output.returncode) diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py b/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py index 632f954fa8fde..c732c0429e661 100644 --- 
a/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py +++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py @@ -2,3 +2,5 @@ print("RLIMIT_AS=" + str(resource.getrlimit(resource.RLIMIT_AS)[0])) print("RLIMIT_NOFILE=" + str(resource.getrlimit(resource.RLIMIT_NOFILE)[0])) +print("RLIMIT_STACK=" + str(resource.getrlimit(resource.RLIMIT_STACK)[0])) +print("RLIMIT_FSIZE=" + str(resource.getrlimit(resource.RLIMIT_FSIZE)[0])) diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt index 4edf1c303a092..d38dc44fa033d 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt @@ -1,4 +1,6 @@ # RUN: ulimit -n 50 +# RUN: ulimit -s 256 +# RUN: ulimit -f 5 # RUN: %{python} %S/print_limits.py # Fail the test so that we can assert on the output. # RUN: not echo return diff --git a/llvm/utils/lit/tests/shtest-ulimit.py b/llvm/utils/lit/tests/shtest-ulimit.py index 09cd475b737c1..ba3de8b1bfced 100644 --- a/llvm/utils/lit/tests/shtest-ulimit.py +++ b/llvm/utils/lit/tests/shtest-ulimit.py @@ -19,7 +19,11 @@ # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_okay.txt ({{[^)]*}}) # CHECK: ulimit -n 50 +# CHECK: ulimit -s 256 +# CHECK: ulimit -f 5 # CHECK: RLIMIT_NOFILE=50 +# CHECK: RLIMIT_STACK=262144 +# CHECK: RLIMIT_FSIZE=5 # CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_reset.txt ({{[^)]*}}) # CHECK: RLIMIT_NOFILE=[[BASE_NOFILE_LIMIT]] From bce7f7cc2245a44b1f9a9463846f8480968b67fa Mon Sep 17 00:00:00 2001 From: Gheorghe-Teodor Bercea Date: Mon, 27 Oct 2025 13:44:44 -0400 Subject: [PATCH 15/32] [AMDGPU] Precommit test for sinking vector ops PR 162580 (#165050) Pre-commit test for PR: https://github.com/llvm/llvm-project/pull/162580 --- llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll | 236 +++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll diff --git 
a/llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll b/llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll new file mode 100644 index 0000000000000..670e2c5b2c9e0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll @@ -0,0 +1,236 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes='require,function(codegenprepare)' -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=OPT %s + +; testing insert case +define amdgpu_kernel void @runningSum(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %inputElement1, i32 %inputIter) { +; OPT-LABEL: define amdgpu_kernel void @runningSum( +; OPT-SAME: ptr addrspace(1) [[OUT0:%.*]], ptr addrspace(1) [[OUT1:%.*]], i32 [[INPUTELEMENT1:%.*]], i32 [[INPUTITER:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[PREHEADER:.*]]: +; OPT-NEXT: [[VECELEMENT1:%.*]] = insertelement <2 x i32> poison, i32 [[INPUTELEMENT1]], i64 0 +; OPT-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[VECELEMENT1]], <2 x i32> poison, <2 x i32> zeroinitializer +; OPT-NEXT: br label %[[LOOPBODY:.*]] +; OPT: [[LOOPBODY]]: +; OPT-NEXT: [[PREVIOUSSUM:%.*]] = phi <2 x i32> [ [[TMP1]], %[[PREHEADER]] ], [ [[RUNNINGSUM:%.*]], %[[LOOPBODY]] ] +; OPT-NEXT: [[ITERCOUNT:%.*]] = phi i32 [ [[INPUTITER]], %[[PREHEADER]] ], [ [[ITERSLEFT:%.*]], %[[LOOPBODY]] ] +; OPT-NEXT: [[RUNNINGSUM]] = add <2 x i32> [[TMP1]], [[PREVIOUSSUM]] +; OPT-NEXT: [[ITERSLEFT]] = sub i32 [[ITERCOUNT]], 1 +; OPT-NEXT: [[COND:%.*]] = icmp eq i32 [[ITERSLEFT]], 0 +; OPT-NEXT: br i1 [[COND]], label %[[LOOPEXIT:.*]], label %[[LOOPBODY]] +; OPT: [[LOOPEXIT]]: +; OPT-NEXT: [[SUMELEMENT0:%.*]] = extractelement <2 x i32> [[RUNNINGSUM]], i64 0 +; OPT-NEXT: [[SUMELEMENT1:%.*]] = extractelement <2 x i32> [[RUNNINGSUM]], i64 1 +; OPT-NEXT: store i32 [[SUMELEMENT0]], ptr addrspace(1) [[OUT0]], align 4 +; OPT-NEXT: store i32 [[SUMELEMENT1]], ptr addrspace(1) [[OUT1]], align 4 +; OPT-NEXT: ret void +; +preheader: + %vecElement1 = 
insertelement <2 x i32> poison, i32 %inputElement1, i64 0 + %broadcast1 = shufflevector <2 x i32> %vecElement1, <2 x i32> poison, <2 x i32> zeroinitializer + br label %loopBody + +loopBody: + %previousSum = phi <2 x i32> [ %broadcast1, %preheader ], [ %runningSum, %loopBody ] + %iterCount = phi i32 [ %inputIter, %preheader ], [ %itersLeft, %loopBody ] + %runningSum = add <2 x i32> %broadcast1, %previousSum + %itersLeft = sub i32 %iterCount, 1 + %cond = icmp eq i32 %itersLeft, 0 + br i1 %cond, label %loopExit, label %loopBody + +loopExit: + %sumElement0 = extractelement <2 x i32> %runningSum, i64 0 + %sumElement1 = extractelement <2 x i32> %runningSum, i64 1 + store i32 %sumElement0, ptr addrspace(1) %out0 + store i32 %sumElement1, ptr addrspace(1) %out1 + ret void +} + +; testing extract case with single use - with divergent control flow +; The vector has SINGLE use (extractelement), both sink into if.then +define amdgpu_kernel void @test_sink_extract_single_use_operands(ptr addrspace(1) %out0, <2 x i32> %inputVec, i32 %tid, i32 %cond) { +; OPT-LABEL: define amdgpu_kernel void @test_sink_extract_single_use_operands( +; OPT-SAME: ptr addrspace(1) [[OUT0:%.*]], <2 x i32> [[INPUTVEC:%.*]], i32 [[TID:%.*]], i32 [[COND:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[RUNNINGSUM:%.*]] = add <2 x i32> [[INPUTVEC]], splat (i32 1) +; OPT-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[RUNNINGSUM]], i64 0 +; OPT-NEXT: [[CMP:%.*]] = icmp slt i32 [[TID]], [[COND]] +; OPT-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; OPT: [[IF_THEN]]: +; OPT-NEXT: [[RESULT:%.*]] = add i32 [[TMP0]], 100 +; OPT-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT0]], align 4 +; OPT-NEXT: br label %[[IF_END]] +; OPT: [[IF_END]]: +; OPT-NEXT: ret void +; +entry: + %runningSum = add <2 x i32> %inputVec, + %sumElement0 = extractelement <2 x i32> %runningSum, i64 0 + %cmp = icmp slt i32 %tid, %cond + br i1 %cmp, label %if.then, label %if.end + +if.then: + %result = add 
i32 %sumElement0, 100 + store i32 %result, ptr addrspace(1) %out0 + br label %if.end + +if.end: + ret void +} + +; testing extract case - extracting two elements with divergent control flow +; The vector has TWO uses (two extractelements), all sink into if.then +define amdgpu_kernel void @test_sink_extract_operands(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %input_vec, i32 %tid, i32 %cond) { +; OPT-LABEL: define amdgpu_kernel void @test_sink_extract_operands( +; OPT-SAME: ptr addrspace(1) [[OUT0:%.*]], ptr addrspace(1) [[OUT1:%.*]], <4 x i32> [[INPUT_VEC:%.*]], i32 [[TID:%.*]], i32 [[COND:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[VEC_FULL:%.*]] = add <4 x i32> [[INPUT_VEC]], +; OPT-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[VEC_FULL]], i64 0 +; OPT-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_FULL]], i64 1 +; OPT-NEXT: [[CMP:%.*]] = icmp slt i32 [[TID]], [[COND]] +; OPT-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; OPT: [[IF_THEN]]: +; OPT-NEXT: [[RESULT0:%.*]] = add i32 [[TMP0]], 100 +; OPT-NEXT: [[RESULT1:%.*]] = add i32 [[TMP1]], 200 +; OPT-NEXT: store i32 [[RESULT0]], ptr addrspace(1) [[OUT0]], align 4 +; OPT-NEXT: store i32 [[RESULT1]], ptr addrspace(1) [[OUT1]], align 4 +; OPT-NEXT: br label %[[IF_END]] +; OPT: [[IF_END]]: +; OPT-NEXT: ret void +; +entry: + %vec_full = add <4 x i32> %input_vec, + %extract0 = extractelement <4 x i32> %vec_full, i64 0 + %extract1 = extractelement <4 x i32> %vec_full, i64 1 + %cmp = icmp slt i32 %tid, %cond + br i1 %cmp, label %if.then, label %if.end + +if.then: + %result0 = add i32 %extract0, 100 + %result1 = add i32 %extract1, 200 + store i32 %result0, ptr addrspace(1) %out0 + store i32 %result1, ptr addrspace(1) %out1 + br label %if.end + +if.end: + ret void +} + +; testing shuffle case with divergent control flow - shuffles sink into if.then +define amdgpu_kernel void @test_shuffle_insert_subvector(ptr addrspace(1) %ptr, <4 x i16> %vec1, <4 x i16> %vec2, 
i32 %tid, i32 %cond) { +; OPT-LABEL: define amdgpu_kernel void @test_shuffle_insert_subvector( +; OPT-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x i16> [[VEC1:%.*]], <4 x i16> [[VEC2:%.*]], i32 [[TID:%.*]], i32 [[COND:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[VEC1]], <4 x i16> [[VEC2]], <4 x i32> +; OPT-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i16> [[VEC1]], <4 x i16> [[VEC2]], <4 x i32> +; OPT-NEXT: [[SHUFFLE3:%.*]] = shufflevector <4 x i16> [[VEC1]], <4 x i16> poison, <4 x i32> +; OPT-NEXT: [[SHUFFLE4:%.*]] = shufflevector <4 x i16> [[VEC2]], <4 x i16> poison, <4 x i32> +; OPT-NEXT: [[SHUFFLE5:%.*]] = shufflevector <4 x i16> [[SHUFFLE]], <4 x i16> [[SHUFFLE2]], <4 x i32> +; OPT-NEXT: [[CMP:%.*]] = icmp slt i32 [[TID]], [[COND]] +; OPT-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; OPT: [[IF_THEN]]: +; OPT-NEXT: [[RESULT_VEC:%.*]] = add <4 x i16> [[SHUFFLE5]], +; OPT-NEXT: [[OTHER_RESULT:%.*]] = mul <4 x i16> [[SHUFFLE3]], splat (i16 2) +; OPT-NEXT: [[MORE_RESULT:%.*]] = sub <4 x i16> [[SHUFFLE4]], splat (i16 5) +; OPT-NEXT: store <4 x i16> [[RESULT_VEC]], ptr addrspace(1) [[PTR]], align 8 +; OPT-NEXT: store <4 x i16> [[OTHER_RESULT]], ptr addrspace(1) [[PTR]], align 8 +; OPT-NEXT: store <4 x i16> [[MORE_RESULT]], ptr addrspace(1) [[PTR]], align 8 +; OPT-NEXT: br label %[[IF_END]] +; OPT: [[IF_END]]: +; OPT-NEXT: ret void +; +entry: + %shuffle = shufflevector <4 x i16> %vec1, <4 x i16> %vec2, <4 x i32> + %shuffle2 = shufflevector <4 x i16> %vec1, <4 x i16> %vec2, <4 x i32> + %shuffle3 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <4 x i32> + %shuffle4 = shufflevector <4 x i16> %vec2, <4 x i16> poison, <4 x i32> + %shuffle5 = shufflevector <4 x i16> %shuffle, <4 x i16> %shuffle2, <4 x i32> + %cmp = icmp slt i32 %tid, %cond + br i1 %cmp, label %if.then, label %if.end + +if.then: + %result_vec = add <4 x i16> %shuffle5, + %other_result = mul <4 x i16> %shuffle3, + %more_result = 
sub <4 x i16> %shuffle4, + store <4 x i16> %result_vec, ptr addrspace(1) %ptr + store <4 x i16> %other_result, ptr addrspace(1) %ptr + store <4 x i16> %more_result, ptr addrspace(1) %ptr + br label %if.end + +if.end: + ret void +} + +; testing shuffle extract subvector with divergent control flow - shuffles sink into if.then +define amdgpu_kernel void @test_shuffle_extract_subvector(ptr addrspace(1) %ptr, <4 x i16> %input_vec, i32 %tid, i32 %cond) { +; OPT-LABEL: define amdgpu_kernel void @test_shuffle_extract_subvector( +; OPT-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x i16> [[INPUT_VEC:%.*]], i32 [[TID:%.*]], i32 [[COND:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[INPUT_VEC]], <4 x i16> poison, <2 x i32> +; OPT-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i16> [[INPUT_VEC]], <4 x i16> poison, <2 x i32> +; OPT-NEXT: [[SHUFFLE3:%.*]] = shufflevector <4 x i16> [[INPUT_VEC]], <4 x i16> poison, <4 x i32> +; OPT-NEXT: [[CMP:%.*]] = icmp slt i32 [[TID]], [[COND]] +; OPT-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; OPT: [[IF_THEN]]: +; OPT-NEXT: [[RESULT_VEC:%.*]] = add <2 x i16> [[SHUFFLE]], +; OPT-NEXT: [[RESULT_VEC2:%.*]] = mul <2 x i16> [[SHUFFLE2]], splat (i16 3) +; OPT-NEXT: [[RESULT_VEC3:%.*]] = sub <4 x i16> [[SHUFFLE3]], splat (i16 10) +; OPT-NEXT: store <2 x i16> [[RESULT_VEC]], ptr addrspace(1) [[PTR]], align 4 +; OPT-NEXT: store <2 x i16> [[RESULT_VEC2]], ptr addrspace(1) [[PTR]], align 4 +; OPT-NEXT: store <4 x i16> [[RESULT_VEC3]], ptr addrspace(1) [[PTR]], align 8 +; OPT-NEXT: br label %[[IF_END]] +; OPT: [[IF_END]]: +; OPT-NEXT: ret void +; +entry: + %shuffle = shufflevector <4 x i16> %input_vec, <4 x i16> poison, <2 x i32> + %shuffle2 = shufflevector <4 x i16> %input_vec, <4 x i16> poison, <2 x i32> + %shuffle3 = shufflevector <4 x i16> %input_vec, <4 x i16> poison, <4 x i32> + %cmp = icmp slt i32 %tid, %cond + br i1 %cmp, label %if.then, label %if.end + +if.then: + %result_vec 
= add <2 x i16> %shuffle, + %result_vec2 = mul <2 x i16> %shuffle2, + %result_vec3 = sub <4 x i16> %shuffle3, + store <2 x i16> %result_vec, ptr addrspace(1) %ptr + store <2 x i16> %result_vec2, ptr addrspace(1) %ptr + store <4 x i16> %result_vec3, ptr addrspace(1) %ptr + br label %if.end + +if.end: + ret void +} + +; testing shuffle sink with widening operations and divergent control flow +define amdgpu_kernel void @test_shuffle_sink_operands(ptr addrspace(1) %ptr, <2 x i16> %input_vec, <2 x i16> %input_vec2, i32 %tid, i32 %cond) { +; OPT-LABEL: define amdgpu_kernel void @test_shuffle_sink_operands( +; OPT-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x i16> [[INPUT_VEC:%.*]], <2 x i16> [[INPUT_VEC2:%.*]], i32 [[TID:%.*]], i32 [[COND:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[INPUT_VEC]], <2 x i16> poison, <4 x i32> +; OPT-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x i16> [[INPUT_VEC2]], <2 x i16> poison, <4 x i32> +; OPT-NEXT: [[CMP:%.*]] = icmp slt i32 [[TID]], [[COND]] +; OPT-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; OPT: [[IF_THEN]]: +; OPT-NEXT: [[RESULT_VEC:%.*]] = add <4 x i16> [[SHUFFLE]], +; OPT-NEXT: [[RESULT_VEC2:%.*]] = mul <4 x i16> [[SHUFFLE2]], splat (i16 5) +; OPT-NEXT: store <4 x i16> [[RESULT_VEC]], ptr addrspace(1) [[PTR]], align 8 +; OPT-NEXT: store <4 x i16> [[RESULT_VEC2]], ptr addrspace(1) [[PTR]], align 8 +; OPT-NEXT: br label %[[IF_END]] +; OPT: [[IF_END]]: +; OPT-NEXT: ret void +; +entry: + %shuffle = shufflevector <2 x i16> %input_vec, <2 x i16> poison, <4 x i32> + %shuffle2 = shufflevector <2 x i16> %input_vec2, <2 x i16> poison, <4 x i32> + %cmp = icmp slt i32 %tid, %cond + br i1 %cmp, label %if.then, label %if.end + +if.then: + %result_vec = add <4 x i16> %shuffle, + %result_vec2 = mul <4 x i16> %shuffle2, + store <4 x i16> %result_vec, ptr addrspace(1) %ptr + store <4 x i16> %result_vec2, ptr addrspace(1) %ptr + br label %if.end + +if.end: + ret void +} 
From 30c3a91f94f8e99c494e46f6026d6b1bd943c355 Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Mon, 27 Oct 2025 13:46:45 -0400 Subject: [PATCH 16/32] [PowerPC] Add Implementation and test for new eTCE instructions (#164002) Add implementation and encoding tests for: - tlbiep - tlbieio - tlbsyncio - ptesyncio --- llvm/lib/Target/PowerPC/PPCInstrFormats.td | 20 +++++++++---- llvm/lib/Target/PowerPC/PPCInstrFuture.td | 30 +++++++++++++++++++ .../PowerPC/ppc-encoding-ISAFuture.txt | 12 ++++++++ .../PowerPC/ppc64le-encoding-ISAFuture.txt | 12 ++++++++ llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s | 16 ++++++++++ 5 files changed, 85 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td index 98c5f09260811..1a77b00588311 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td @@ -850,24 +850,34 @@ class XForm_45 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = 0; } -class XForm_RSB5_UIMM2_2UIMM1 opcode, bits<10> xo, dag OOL, dag IOL, +class XForm_RSB5_UIMM2 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, list pattern> : I { bits<5> RS; bits<5> RB; bits<2> RIC; - bits<1> PRS; - bits<1> R; let Pattern = pattern; let Inst{6...10} = RS; + let Inst{11} = 0; let Inst{12...13} = RIC; - let Inst{14} = PRS; - let Inst{15} = R; + let Inst{14...15} = 0; let Inst{16...20} = RB; let Inst{21...30} = xo; + let Inst{31} = 0; +} + +class XForm_RSB5_UIMM2_2UIMM1 opcode, bits<10> xo, dag OOL, dag IOL, + string asmstr, list pattern> + : XForm_RSB5_UIMM2 { + + bits<1> PRS; + bits<1> R; + + let Inst{14} = PRS; + let Inst{15} = R; } class X_FRT5_XO2_XO3_XO10 opcode, bits<2> xo1, bits<3> xo2, bits<10> xo, diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index 1aefea1a1c498..b0bed71c6755f 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -11,6 
+11,18 @@ // //===----------------------------------------------------------------------===// +class XForm_RS5 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + list pattern> : I { + bits<5> RS; + + let Pattern = pattern; + + let Inst{6...10} = RS; + let Inst{11...20} = 0; + let Inst{21...30} = xo; + let Inst{31} = 0; +} + class XOForm_RTAB5_L1 opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, list pattern> : I { @@ -294,6 +306,24 @@ let Predicates = [IsISAFuture] in { defm SUBFUS : XOForm_RTAB5_L1r<31, 72, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB, u1imm:$L), "subfus", "$RT, $L, $RA, $RB", []>; + def TLBSYNCIO + : XForm_RS5<31, 564, (outs), (ins g8rc:$RS), "tlbsyncio $RS", []>; + def PTESYNCIO + : XForm_RS5<31, 596, (outs), (ins g8rc:$RS), "ptesyncio $RS", []>; + def TLBIEP : XForm_RSB5_UIMM2_2UIMM1<31, 50, (outs), + (ins gprc:$RB, gprc:$RS, u2imm:$RIC, + u1imm:$PRS, u1imm:$R), + "tlbiep $RB, $RS, $RIC, $PRS, $R", []>; + def TLBIEIO + : XForm_RSB5_UIMM2<31, 18, (outs), (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC), + "tlbieio $RB, $RS, $RIC", []>; + let Interpretation64Bit = 1, isCodeGenOnly = 1 in { + def TLBIEP8 + : XForm_RSB5_UIMM2_2UIMM1<31, 50, (outs), + (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC, + u1imm:$PRS, u1imm:$R), + "tlbiep $RB, $RS, $RIC, $PRS, $R", []>; + } } let Predicates = [HasVSX, IsISAFuture] in { diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt index cdfc8ce9e0ca5..054489ce51a60 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt @@ -7,6 +7,18 @@ # RUN: llvm-mc --disassemble %s -triple powerpc-unknown-aix-gnu \ # RUN: -mcpu=future | FileCheck %s +#CHECK: tlbiep 8, 10, 2, 1, 0 +0x7d 0x4a 0x40 0x64 + +#CHECK: tlbieio 8, 10, 2 +0x7d 0x48 0x40 0x24 + +#CHECK: tlbsyncio 15 +0x7d 0xe0 0x04 0x68 + +#CHECK: ptesyncio 15 +0x7d 0xe0 0x04 0xa8 + #CHECK: dmxxextfdmr512 2, 34, 1, 0 0xf0 
0x82 0x17 0x12 diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt index f7e314fc819e4..17d1413bacc3a 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt @@ -1,6 +1,18 @@ # RUN: llvm-mc --disassemble %s -triple powerpc64le-unknown-unknown \ # RUN: -mcpu=future | FileCheck %s +#CHECK: tlbiep 8, 10, 2, 1, 0 +0x64 0x40 0x4a 0x7d + +#CHECK: tlbieio 8, 10, 2 +0x24 0x40 0x48 0x7d + +#CHECK: tlbsyncio 15 +0x68 0x04 0xe0 0x7d + +#CHECK: ptesyncio 15 +0xa8 0x04 0xe0 0x7d + #CHECK: dmxxextfdmr512 2, 34, 1, 0 0x12 0x17 0x82 0xf0 diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s index 29fedd7c20646..e5bc1f47bf666 100644 --- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s +++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s @@ -5,6 +5,22 @@ # RUN: llvm-mc -triple powerpc-unknown-aix-gnu --show-encoding %s | \ # RUN: FileCheck -check-prefix=CHECK-BE %s +#CHECK-BE: tlbiep 8, 10, 2, 1, 0 # encoding: [0x7d,0x4a,0x40,0x64] +#CHECK-LE: tlbiep 8, 10, 2, 1, 0 # encoding: [0x64,0x40,0x4a,0x7d] + tlbiep 8, 10, 2, 1, 0 + +# CHECK-BE: tlbieio 8, 10, 2 # encoding: [0x7d,0x48,0x40,0x24] +# CHECK-LE: tlbieio 8, 10, 2 # encoding: [0x24,0x40,0x48,0x7d] + tlbieio 8, 10, 2 + +# CHECK-BE: tlbsyncio 15 # encoding: [0x7d,0xe0,0x04,0x68] +# CHECK-LE: tlbsyncio 15 # encoding: [0x68,0x04,0xe0,0x7d] + tlbsyncio 15 + +# CHECK-BE: ptesyncio 15 # encoding: [0x7d,0xe0,0x04,0xa8] +# CHECK-LE: ptesyncio 15 # encoding: [0xa8,0x04,0xe0,0x7d] + ptesyncio 15 + # CHECK-BE: dmxxextfdmr512 2, 34, 1, 0 # encoding: [0xf0,0x82,0x17,0x12] # CHECK-LE: dmxxextfdmr512 2, 34, 1, 0 # encoding: [0x12,0x17,0x82,0xf0] dmxxextfdmr512 2, 34, 1, 0 From 30f2bf75587e87d73c238619866d39c53c389849 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 27 Oct 2025 10:47:11 -0700 Subject: 
[PATCH 17/32] [AMDGPU] Use implicit operand to preserve liveness of COPY (#164911) When lowering spills / restores, we may end up partially lowering the spill via copies and the remaining portion with loads/stores. In this partial lowering case, the implicit-def operands added to the restore load clobber the preceding copies -- telling MachineCopyPropagation to delete them. By also attaching an implicit operand to the load, the COPYs have an artificial use and thus will not be deleted - this is the same strategy taken in https://github.com/llvm/llvm-project/pull/115285 I'm not sure that we need implicit-def operands on any load restore, but I guess it may make sense if it needs to be split into multiple loads and some have been optimized out as containing undef elements. These implicit / implicit-def operands continue to cause correctness issues. A previous / ongoing long term plan to remove them is being addressed via: https://discourse.llvm.org/t/llvm-codegen-rfc-add-mo-lanemask-type-and-a-new-copy-lanemask-instruction/88021 https://github.com/llvm/llvm-project/pull/151123 https://github.com/llvm/llvm-project/pull/151124 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 6 +- .../AMDGPU/pei-build-spill-partial-agpr.mir | 10 +- .../AMDGPU/spill-restore-partial-copy.mir | 324 ++++++++++++++++++ .../CodeGen/AMDGPU/spill-to-agpr-partial.mir | 12 +- ...tor-spill-restore-to-other-vector-type.mir | 12 +- 5 files changed, 346 insertions(+), 18 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/spill-restore-partial-copy.mir diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ebd2e7ecf249e..d80a6f339c8f6 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1874,9 +1874,13 @@ void SIRegisterInfo::buildSpillLoadStore( } bool IsSrcDstDef = SrcDstRegState & RegState::Define; + bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore; if (NeedSuperRegImpOperand && - 
(IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) + (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) { MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); + if (PartialReloadCopy) + MIB.addReg(ValueReg, RegState::Implicit); + } // The epilog restore of a wwm-scratch register can cause undesired // optimization during machine-cp post PrologEpilogInserter if the same diff --git a/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir b/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir index 8eddc9a5afd50..c9208bfa15c63 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir @@ -73,7 +73,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 ; FLATSCR-V2A-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1 - ; FLATSCR-V2A-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) + ; FLATSCR-V2A-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; FLATSCR-V2A-NEXT: S_ENDPGM 0 $vgpr0_vgpr1 = IMPLICIT_DEF SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) @@ -112,7 +112,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 ; FLATSCR-V2A-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed 
$vgpr0_vgpr1_vgpr2 :: (store (s64) into %stack.0, align 4, addrspace 5) ; FLATSCR-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 - ; FLATSCR-V2A-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2 :: (load (s64) from %stack.0, align 4, addrspace 5) + ; FLATSCR-V2A-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 :: (load (s64) from %stack.0, align 4, addrspace 5) ; FLATSCR-V2A-NEXT: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF SI_SPILL_V96_SAVE killed $vgpr0_vgpr1_vgpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store (s96) into %stack.0, align 4, addrspace 5) @@ -157,7 +157,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; FLATSCR-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; FLATSCR-V2A-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0, addrspace 5) + ; FLATSCR-V2A-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0, addrspace 5) ; FLATSCR-V2A-NEXT: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -203,7 +203,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; FLATSCR-V2A-NEXT: $vgpr3 = 
V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; FLATSCR-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 - ; FLATSCR-V2A-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s64) from %stack.0, align 4, addrspace 5) + ; FLATSCR-V2A-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s64) from %stack.0, align 4, addrspace 5) ; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF @@ -255,7 +255,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 - ; FLATSCR-V2A-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s32) from %stack.0, addrspace 5) + ; FLATSCR-V2A-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s32) from %stack.0, addrspace 5) ; FLATSCR-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/spill-restore-partial-copy.mir b/llvm/test/CodeGen/AMDGPU/spill-restore-partial-copy.mir new file mode 100644 index 0000000000000..bb87b6e52da89 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/spill-restore-partial-copy.mir @@ -0,0 +1,324 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx950 -run-pass prologepilog,machine-cp -o - %s | FileCheck -check-prefix=GFX950 %s + +--- | + define amdgpu_kernel void @full_copy() #0 { ret void } + + define amdgpu_kernel void @partial_copy() #0 { ret void } + + define amdgpu_kernel void @full_spill() #0 { ret void } + + attributes #0 = { "amdgpu-waves-per-eu"="8,8" } +... + +--- +name: full_copy +tracksRegLiveness: true +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 1, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 2, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 3, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 4, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 5, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 6, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + hasSpilledVGPRs: true +body: | + bb.0: + ; GFX950-LABEL: name: full_copy + ; GFX950: liveins: $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: renamable $agpr0_agpr1 = IMPLICIT_DEF + ; GFX950-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + ; GFX950-NEXT: renamable 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF + ; GFX950-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX950-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX950-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX950-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX950-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX950-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX950-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX950-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX950-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX950-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX950-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX950-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX950-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit-def $vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX950-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec, implicit $vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX950-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit 
$exec, implicit $vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX950-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec, implicit killed $vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX950-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr19, implicit $exec, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19, implicit $vgpr16_vgpr17_vgpr18_vgpr19 + ; GFX950-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr18, implicit $exec, implicit $vgpr16_vgpr17_vgpr18_vgpr19 + ; GFX950-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr17, implicit $exec, implicit $vgpr16_vgpr17_vgpr18_vgpr19 + ; GFX950-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr16, implicit $exec, implicit killed $vgpr16_vgpr17_vgpr18_vgpr19 + ; GFX950-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr23, implicit $exec, implicit-def $vgpr20_vgpr21_vgpr22_vgpr23, implicit $vgpr20_vgpr21_vgpr22_vgpr23 + ; GFX950-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr22, implicit $exec, implicit $vgpr20_vgpr21_vgpr22_vgpr23 + ; GFX950-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr21, implicit $exec, implicit $vgpr20_vgpr21_vgpr22_vgpr23 + ; GFX950-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr20, implicit $exec, implicit killed $vgpr20_vgpr21_vgpr22_vgpr23 + ; GFX950-NEXT: $vgpr0 = IMPLICIT_DEF + ; GFX950-NEXT: $agpr5 = COPY $agpr6, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr7, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr8, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr9, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 0, 0, implicit $exec + ; GFX950-NEXT: $agpr5 = COPY $agpr10, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr11, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr12, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr13, implicit $agpr2_agpr3_agpr4_agpr5 + ; 
GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 1024, 0, implicit $exec + ; GFX950-NEXT: $agpr5 = COPY $agpr14, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr15, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr16, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr17, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 2048, 0, implicit $exec + ; GFX950-NEXT: $agpr5 = COPY $agpr18, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr19, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr20, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr21, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 3072, 0, implicit $exec + ; GFX950-NEXT: $agpr5 = COPY $agpr22, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr23, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr24, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr25, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 4096, 0, implicit $exec + ; GFX950-NEXT: $agpr5 = COPY $agpr26, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr27, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr28, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr29, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 5120, 0, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + renamable $agpr0_agpr1 = IMPLICIT_DEF + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF 
+ renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF + SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) + SI_SPILL_AV128_SAVE killed $vgpr4_vgpr5_vgpr6_vgpr7, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) + SI_SPILL_AV128_SAVE killed $vgpr8_vgpr9_vgpr10_vgpr11, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5) + SI_SPILL_AV128_SAVE killed $vgpr12_vgpr13_vgpr14_vgpr15, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5) + SI_SPILL_AV128_SAVE killed $vgpr16_vgpr17_vgpr18_vgpr19, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5) + SI_SPILL_AV128_SAVE killed $vgpr20_vgpr21_vgpr22_vgpr23, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5) + $vgpr0 = IMPLICIT_DEF + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 0, 0, implicit $exec + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 1024, 0, implicit $exec + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 2048, 0, implicit $exec + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed 
renamable $agpr2_agpr3_agpr4_agpr5, 3072, 0, implicit $exec + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 4096, 0, implicit $exec + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 5120, 0, implicit $exec + S_ENDPGM 0 +... + + +# We need to add implicit operand as well as implicit-def operand to the scratch_load, otherwise, MachineCopyPropagation will think the preceding copies are dead, and will delete them. + +--- +name: partial_copy +tracksRegLiveness: true +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 1, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 2, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 3, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 4, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 5, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 6, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + hasSpilledVGPRs: true +body: | + bb.0: + ; GFX950-LABEL: name: partial_copy + ; GFX950: liveins: $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: renamable $agpr0_agpr1 = IMPLICIT_DEF + ; GFX950-NEXT: renamable $agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF + ; GFX950-NEXT: renamable 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + ; GFX950-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF + ; GFX950-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX950-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX950-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX950-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX950-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX950-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX950-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX950-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX950-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX950-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX950-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX950-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX950-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit-def $vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX950-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 
killed $vgpr14, implicit $exec, implicit $vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX950-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec, implicit $vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX950-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec, implicit killed $vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX950-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr19, implicit $exec, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19, implicit $vgpr16_vgpr17_vgpr18_vgpr19 + ; GFX950-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr18, implicit $exec, implicit $vgpr16_vgpr17_vgpr18_vgpr19 + ; GFX950-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr17, implicit $exec, implicit $vgpr16_vgpr17_vgpr18_vgpr19 + ; GFX950-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr16, implicit $exec, implicit killed $vgpr16_vgpr17_vgpr18_vgpr19 + ; GFX950-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr23, implicit $exec, implicit-def $vgpr20_vgpr21_vgpr22_vgpr23, implicit $vgpr20_vgpr21_vgpr22_vgpr23 + ; GFX950-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr22, implicit $exec, implicit $vgpr20_vgpr21_vgpr22_vgpr23 + ; GFX950-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr20_vgpr21, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr20_vgpr21_vgpr22_vgpr23 :: (store (s64) into %stack.5, align 4, addrspace 5) + ; GFX950-NEXT: $vgpr0 = IMPLICIT_DEF + ; GFX950-NEXT: $agpr5 = COPY $agpr6, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr7, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr8, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr9, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 0, 0, implicit $exec + ; GFX950-NEXT: $agpr5 = COPY $agpr10, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr11, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr12, 
implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr13, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 1024, 0, implicit $exec + ; GFX950-NEXT: $agpr5 = COPY $agpr14, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr15, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr16, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr17, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 2048, 0, implicit $exec + ; GFX950-NEXT: $agpr5 = COPY $agpr18, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr19, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr20, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr21, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 3072, 0, implicit $exec + ; GFX950-NEXT: $agpr5 = COPY $agpr22, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr23, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr24, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr25, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 4096, 0, implicit $exec + ; GFX950-NEXT: $agpr5 = COPY $agpr26, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr27, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2_agpr3 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr2_agpr3_agpr4_agpr5 :: (load (s64) from %stack.5, align 4, addrspace 5) + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 5120, 0, implicit $exec 
+ ; GFX950-NEXT: S_ENDPGM 0 + renamable $agpr0_agpr1 = IMPLICIT_DEF + renamable $agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF + SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) + SI_SPILL_AV128_SAVE killed $vgpr4_vgpr5_vgpr6_vgpr7, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) + SI_SPILL_AV128_SAVE killed $vgpr8_vgpr9_vgpr10_vgpr11, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5) + SI_SPILL_AV128_SAVE killed $vgpr12_vgpr13_vgpr14_vgpr15, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5) + SI_SPILL_AV128_SAVE killed $vgpr16_vgpr17_vgpr18_vgpr19, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5) + SI_SPILL_AV128_SAVE killed $vgpr20_vgpr21_vgpr22_vgpr23, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5) + $vgpr0 = IMPLICIT_DEF + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 0, 0, implicit $exec + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 1024, 0, implicit $exec + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed 
renamable $agpr2_agpr3_agpr4_agpr5, 2048, 0, implicit $exec + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 3072, 0, implicit $exec + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 4096, 0, implicit $exec + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 5120, 0, implicit $exec + S_ENDPGM 0 +... + +# Since there are no preceding copies, we do not need to add an implicit operand, as the implicit-def operand does not clobber. + +--- +name: full_spill +tracksRegLiveness: true +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 1, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 2, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 3, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 4, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 5, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } + - { id: 6, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4 } +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + hasSpilledVGPRs: true +body: | + bb.0: + ; GFX950-LABEL: name: full_spill + ; GFX950: liveins: $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: renamable $agpr0_agpr1 = IMPLICIT_DEF + ; GFX950-NEXT: 
renamable $agpr26_agpr27 = IMPLICIT_DEF + ; GFX950-NEXT: renamable $agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF + ; GFX950-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + ; GFX950-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF + ; GFX950-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX950-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX950-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX950-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX950-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX950-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX950-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX950-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX950-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX950-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX950-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX950-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11 + ; GFX950-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit 
$exec, implicit-def $vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX950-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec, implicit $vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX950-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec, implicit $vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX950-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec, implicit killed $vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX950-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr19, implicit $exec, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19, implicit $vgpr16_vgpr17_vgpr18_vgpr19 + ; GFX950-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr18, implicit $exec, implicit $vgpr16_vgpr17_vgpr18_vgpr19 + ; GFX950-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr17, implicit $exec, implicit $vgpr16_vgpr17_vgpr18_vgpr19 + ; GFX950-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr16, implicit $exec, implicit killed $vgpr16_vgpr17_vgpr18_vgpr19 + ; GFX950-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr20_vgpr21_vgpr22_vgpr23, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %stack.5, align 4, addrspace 5) + ; GFX950-NEXT: $vgpr0 = IMPLICIT_DEF + ; GFX950-NEXT: $agpr5 = COPY $agpr6, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr7, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr8, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr9, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 0, 0, implicit $exec + ; GFX950-NEXT: $agpr5 = COPY $agpr10, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr11, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr12, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr13, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed 
renamable $agpr2_agpr3_agpr4_agpr5, 1024, 0, implicit $exec + ; GFX950-NEXT: $agpr5 = COPY $agpr14, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr15, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr16, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr17, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 2048, 0, implicit $exec + ; GFX950-NEXT: $agpr5 = COPY $agpr18, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr19, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr20, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr21, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 3072, 0, implicit $exec + ; GFX950-NEXT: $agpr5 = COPY $agpr22, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr4 = COPY $agpr23, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr3 = COPY $agpr24, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: $agpr2 = COPY $agpr25, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 4096, 0, implicit $exec + ; GFX950-NEXT: $agpr2_agpr3_agpr4_agpr5 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.5, align 4, addrspace 5) + ; GFX950-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 5120, 0, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + renamable $agpr0_agpr1 = IMPLICIT_DEF + renamable $agpr26_agpr27 = IMPLICIT_DEF + renamable $agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF + renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + renamable 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF + SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) + SI_SPILL_AV128_SAVE killed $vgpr4_vgpr5_vgpr6_vgpr7, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) + SI_SPILL_AV128_SAVE killed $vgpr8_vgpr9_vgpr10_vgpr11, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5) + SI_SPILL_AV128_SAVE killed $vgpr12_vgpr13_vgpr14_vgpr15, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5) + SI_SPILL_AV128_SAVE killed $vgpr16_vgpr17_vgpr18_vgpr19, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5) + SI_SPILL_AV128_SAVE killed $vgpr20_vgpr21_vgpr22_vgpr23, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5) + $vgpr0 = IMPLICIT_DEF + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 0, 0, implicit $exec + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 1024, 0, implicit $exec + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 2048, 0, implicit $exec + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable 
$agpr2_agpr3_agpr4_agpr5, 3072, 0, implicit $exec + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 4096, 0, implicit $exec + renamable $agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5) + DS_WRITE_B128_gfx9 renamable $vgpr0, killed renamable $agpr2_agpr3_agpr4_agpr5, 5120, 0, implicit $exec + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/spill-to-agpr-partial.mir b/llvm/test/CodeGen/AMDGPU/spill-to-agpr-partial.mir index 52593e01eafde..beeb9b2df8b01 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-to-agpr-partial.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-to-agpr-partial.mir @@ -19,7 +19,7 @@ body: | ; GCN-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; GCN-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s96) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr31, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 - ; GCN-NEXT: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s96) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s96) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, 
implicit $agpr24_agpr25_agpr26_agpr27, implicit $agpr28_agpr29, implicit $agpr30 SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) @@ -46,7 +46,7 @@ body: | ; GCN-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s64) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr30, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; GCN-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr31, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GCN-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s64) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25_agpr26_agpr27, implicit $agpr28_agpr29 SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) @@ -75,7 +75,7 @@ body: | ; GCN-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr29, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; GCN-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr30, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 
; GCN-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr31, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25_agpr26_agpr27, implicit $agpr28 SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) @@ -129,7 +129,7 @@ body: | ; GCN-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 ; GCN-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $agpr0_agpr1_agpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s96) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr55, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 - ; GCN-NEXT: $agpr0_agpr1_agpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load (s96) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: $agpr0_agpr1_agpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 :: (load (s96) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: S_ENDPGM 0, 
implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51, implicit $vgpr52_vgpr53, implicit $vgpr54 SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) @@ -156,7 +156,7 @@ body: | ; GCN-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $agpr0_agpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s64) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr54, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 ; GCN-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr55, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN-NEXT: $agpr0_agpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: $agpr0_agpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 :: (load (s64) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit 
$vgpr48_vgpr49_vgpr50_vgpr51, implicit $vgpr52_vgpr53 SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) @@ -185,7 +185,7 @@ body: | ; GCN-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr53, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 ; GCN-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr54, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; GCN-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr55, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN-NEXT: $agpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $agpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51, implicit $vgpr52 SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/vector-spill-restore-to-other-vector-type.mir b/llvm/test/CodeGen/AMDGPU/vector-spill-restore-to-other-vector-type.mir index 2fac3d29cb0dc..69cf924548ed8 100644 --- 
a/llvm/test/CodeGen/AMDGPU/vector-spill-restore-to-other-vector-type.mir +++ b/llvm/test/CodeGen/AMDGPU/vector-spill-restore-to-other-vector-type.mir @@ -22,7 +22,7 @@ body: | ; GCN-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 ; GCN-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $agpr0_agpr1_agpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s96) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: $vgpr51 = COPY $vgpr55, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 - ; GCN-NEXT: $vgpr48_vgpr49_vgpr50 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s96) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: $vgpr48_vgpr49_vgpr50 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51, implicit $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s96) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr54, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) $vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) @@ -49,7 +49,7 @@ body: | ; GCN-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $agpr0_agpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s64) into %stack.0, 
align 4, addrspace 5) ; GCN-NEXT: $vgpr51 = COPY $vgpr54, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 ; GCN-NEXT: $vgpr50 = COPY $vgpr55, implicit $vgpr48_vgpr49_vgpr50_vgpr51 - ; GCN-NEXT: $vgpr48_vgpr49 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: $vgpr48_vgpr49 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51, implicit $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s64) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) $vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) @@ -78,7 +78,7 @@ body: | ; GCN-NEXT: $vgpr51 = COPY $vgpr53, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 ; GCN-NEXT: $vgpr50 = COPY $vgpr54, implicit $vgpr48_vgpr49_vgpr50_vgpr51 ; GCN-NEXT: $vgpr49 = COPY $vgpr55, implicit $vgpr48_vgpr49_vgpr50_vgpr51 - ; GCN-NEXT: $vgpr48 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr48 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51, implicit $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: S_ENDPGM 0, 
implicit $vgpr52, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) $vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) @@ -132,7 +132,7 @@ body: | ; GCN-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; GCN-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s96) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: $agpr29 = COPY $agpr30, implicit-def $agpr26_agpr27_agpr28_agpr29 - ; GCN-NEXT: $agpr26_agpr27_agpr28 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29 :: (load (s96) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: $agpr26_agpr27_agpr28 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29, implicit $agpr26_agpr27_agpr28_agpr29 :: (load (s96) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: S_ENDPGM 0, implicit $agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25 SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) $agpr26_agpr27_agpr28_agpr29 = 
SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) @@ -159,7 +159,7 @@ body: | ; GCN-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s64) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: $agpr29 = COPY $agpr30, implicit-def $agpr26_agpr27_agpr28_agpr29 ; GCN-NEXT: $agpr28 = COPY $agpr31, implicit $agpr26_agpr27_agpr28_agpr29 - ; GCN-NEXT: $agpr26_agpr27 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29 :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: $agpr26_agpr27 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29, implicit $agpr26_agpr27_agpr28_agpr29 :: (load (s64) from %stack.0, align 4, addrspace 5) ; GCN-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25 SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) $agpr26_agpr27_agpr28_agpr29 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) @@ -188,7 +188,7 @@ body: | ; GCN-NEXT: $agpr29 = COPY $agpr25, implicit-def $agpr26_agpr27_agpr28_agpr29 ; GCN-NEXT: $agpr28 = COPY $agpr30, implicit $agpr26_agpr27_agpr28_agpr29 ; GCN-NEXT: $agpr27 = COPY $agpr31, implicit $agpr26_agpr27_agpr28_agpr29 - ; GCN-NEXT: $agpr26 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29 :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $agpr26 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def 
$agpr26_agpr27_agpr28_agpr29, implicit $agpr26_agpr27_agpr28_agpr29 :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24 SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) $agpr26_agpr27_agpr28_agpr29 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) From c1f652883b92703a7b30751a2188cb501dc2af98 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Mon, 27 Oct 2025 10:52:27 -0700 Subject: [PATCH 18/32] [llvm][clang] Explicitly pass the VFS to sanitizer passes (#165267) This PR passes the VFS to LLVM's sanitizer passes from Clang, so that the configuration files can be loaded in the same way all other compiler inputs are. --- clang/lib/CodeGen/BackendUtil.cpp | 8 ++++--- .../Instrumentation/SanitizerBinaryMetadata.h | 6 +++++ .../Instrumentation/SanitizerCoverage.h | 22 +++++++------------ .../SanitizerBinaryMetadata.cpp | 11 ++++++---- .../Instrumentation/SanitizerCoverage.cpp | 12 ++++++++++ 5 files changed, 38 insertions(+), 21 deletions(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 468c930acacbd..aefc262dca17f 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -713,14 +713,16 @@ static void addSanitizers(const Triple &TargetTriple, ThinOrFullLTOPhase) { if (CodeGenOpts.hasSanitizeCoverage()) { auto SancovOpts = getSancovOptsFromCGOpts(CodeGenOpts); - MPM.addPass(SanitizerCoveragePass( - SancovOpts, CodeGenOpts.SanitizeCoverageAllowlistFiles, - CodeGenOpts.SanitizeCoverageIgnorelistFiles)); + MPM.addPass( + SanitizerCoveragePass(SancovOpts, PB.getVirtualFileSystemPtr(), + CodeGenOpts.SanitizeCoverageAllowlistFiles, + 
CodeGenOpts.SanitizeCoverageIgnorelistFiles)); } if (CodeGenOpts.hasSanitizeBinaryMetadata()) { MPM.addPass(SanitizerBinaryMetadataPass( getSanitizerBinaryMetadataOptions(CodeGenOpts), + PB.getVirtualFileSystemPtr(), CodeGenOpts.SanitizeMetadataIgnorelistFiles)); } diff --git a/llvm/include/llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h b/llvm/include/llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h index 054016622a577..63c5990a41741 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h +++ b/llvm/include/llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h @@ -13,6 +13,7 @@ #define LLVM_TRANSFORMS_INSTRUMENTATION_SANITIZERBINARYMETADATA_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" @@ -20,6 +21,9 @@ #include "llvm/Transforms/Utils/Instrumentation.h" namespace llvm { +namespace vfs { +class FileSystem; +} // namespace vfs struct SanitizerBinaryMetadataOptions { bool Covered = false; @@ -53,12 +57,14 @@ class SanitizerBinaryMetadataPass public: LLVM_ABI explicit SanitizerBinaryMetadataPass( SanitizerBinaryMetadataOptions Opts = {}, + IntrusiveRefCntPtr VFS = nullptr, ArrayRef IgnorelistFiles = {}); LLVM_ABI PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); static bool isRequired() { return true; } private: const SanitizerBinaryMetadataOptions Options; + IntrusiveRefCntPtr VFS; const ArrayRef IgnorelistFiles; }; diff --git a/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h b/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h index f14f5b90a5cc9..a8a09fb95c4bd 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h +++ b/llvm/include/llvm/Transforms/Instrumentation/SanitizerCoverage.h @@ -15,14 +15,17 @@ #ifndef LLVM_TRANSFORMS_INSTRUMENTATION_SANITIZERCOVERAGE_H #define LLVM_TRANSFORMS_INSTRUMENTATION_SANITIZERCOVERAGE_H +#include 
"llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/SpecialCaseList.h" -#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Transforms/Utils/Instrumentation.h" namespace llvm { class Module; +namespace vfs { +class FileSystem; +} // namespace vfs /// This is the ModuleSanitizerCoverage pass used in the new pass manager. The /// pass instruments functions for coverage, adds initialization calls to the @@ -32,24 +35,15 @@ class SanitizerCoveragePass : public PassInfoMixin { public: explicit SanitizerCoveragePass( SanitizerCoverageOptions Options = SanitizerCoverageOptions(), - const std::vector &AllowlistFiles = - std::vector(), - const std::vector &BlocklistFiles = - std::vector()) - : Options(Options) { - if (AllowlistFiles.size() > 0) - Allowlist = SpecialCaseList::createOrDie(AllowlistFiles, - *vfs::getRealFileSystem()); - if (BlocklistFiles.size() > 0) - Blocklist = SpecialCaseList::createOrDie(BlocklistFiles, - *vfs::getRealFileSystem()); - } + IntrusiveRefCntPtr VFS = nullptr, + const std::vector &AllowlistFiles = {}, + const std::vector &BlocklistFiles = {}); LLVM_ABI PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); static bool isRequired() { return true; } private: SanitizerCoverageOptions Options; - + IntrusiveRefCntPtr VFS; std::unique_ptr Allowlist; std::unique_ptr Blocklist; }; diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerBinaryMetadata.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerBinaryMetadata.cpp index 4801ac75f8572..210b1266de23c 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerBinaryMetadata.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerBinaryMetadata.cpp @@ -481,15 +481,18 @@ StringRef SanitizerBinaryMetadata::getSectionEnd(StringRef SectionSuffix) { } // namespace SanitizerBinaryMetadataPass::SanitizerBinaryMetadataPass( - SanitizerBinaryMetadataOptions Opts, ArrayRef IgnorelistFiles) - : Options(std::move(Opts)), 
IgnorelistFiles(std::move(IgnorelistFiles)) {} + SanitizerBinaryMetadataOptions Opts, + IntrusiveRefCntPtr VFS, + ArrayRef IgnorelistFiles) + : Options(std::move(Opts)), + VFS(VFS ? std::move(VFS) : vfs::getRealFileSystem()), + IgnorelistFiles(std::move(IgnorelistFiles)) {} PreservedAnalyses SanitizerBinaryMetadataPass::run(Module &M, AnalysisManager &AM) { std::unique_ptr Ignorelist; if (!IgnorelistFiles.empty()) { - Ignorelist = SpecialCaseList::createOrDie(IgnorelistFiles, - *vfs::getRealFileSystem()); + Ignorelist = SpecialCaseList::createOrDie(IgnorelistFiles, *VFS); if (Ignorelist->inSection("metadata", "src", M.getSourceFileName())) return PreservedAnalyses::all(); } diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index b74a0708b67ae..09abf6a33648c 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -318,6 +318,18 @@ class ModuleSanitizerCoverage { }; } // namespace +SanitizerCoveragePass::SanitizerCoveragePass( + SanitizerCoverageOptions Options, IntrusiveRefCntPtr VFS, + const std::vector &AllowlistFiles, + const std::vector &BlocklistFiles) + : Options(std::move(Options)), + VFS(VFS ? std::move(VFS) : vfs::getRealFileSystem()) { + if (AllowlistFiles.size() > 0) + Allowlist = SpecialCaseList::createOrDie(AllowlistFiles, *this->VFS); + if (BlocklistFiles.size() > 0) + Blocklist = SpecialCaseList::createOrDie(BlocklistFiles, *this->VFS); +} + PreservedAnalyses SanitizerCoveragePass::run(Module &M, ModuleAnalysisManager &MAM) { auto &FAM = MAM.getResult(M).getManager(); From 263377a17570e1cbe6eeae9ffa5ce02f240839ef Mon Sep 17 00:00:00 2001 From: Tarun Prabhu Date: Mon, 27 Oct 2025 11:58:04 -0600 Subject: [PATCH 19/32] [flang][Driver] Warn on -fbuiltin and -fno-builtin The options -fbuiltin and -fno-builtin are not valid for Fortran. 
However, they are accepted by gfortran which emits a warning message but continues to compile the code. Both -fbuiltin and -fno-builtin have been enabled for flang. Specifying either will result in a warning message being shown but no other effects. Compilation will proceed normally after these warnings are shown. This brings flang's behavior in line with gfortran for these options. Fixes #164766 --- clang/include/clang/Basic/DiagnosticDriverKinds.td | 3 +++ clang/include/clang/Driver/Options.td | 4 ++-- clang/lib/Driver/ToolChains/Flang.cpp | 7 +++++++ flang/test/Driver/flang-f-opts.f90 | 13 +++++++++++++ 4 files changed, 25 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 0581bf353d936..83980e3ac35b7 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -133,6 +133,9 @@ def warn_drv_unsupported_option_for_offload_arch_req_feature : Warning< def warn_drv_unsupported_option_for_target : Warning< "ignoring '%0' option as it is not currently supported for target '%1'">, InGroup; +def warn_drv_invalid_argument_for_flang : Warning< + "'%0' is not valid for Fortran">, + InGroup; def warn_drv_unsupported_option_for_flang : Warning< "the argument '%0' is not supported for option '%1'. 
Mapping to '%1%2'">, InGroup; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index ef1c8758705f4..bca8b26bc3d30 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1955,7 +1955,7 @@ defm borland_extensions : BoolFOption<"borland-extensions", "Accept non-standard constructs supported by the Borland compiler">, NegFlag>; def fbuiltin : Flag<["-"], "fbuiltin">, Group, - Visibility<[ClangOption, CLOption, DXCOption]>; + Visibility<[ClangOption, CLOption, DXCOption, FlangOption, FC1Option]>; def fbuiltin_module_map : Flag <["-"], "fbuiltin-module-map">, Group, Flags<[]>, HelpText<"Load the clang builtins module map file.">; defm caret_diagnostics : BoolFOption<"caret-diagnostics", @@ -3563,7 +3563,7 @@ def fno_assume_sane_operator_new : Flag<["-"], "fno-assume-sane-operator-new">, Visibility<[ClangOption, CC1Option]>, MarshallingInfoNegativeFlag>; def fno_builtin : Flag<["-"], "fno-builtin">, Group, - Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, + Visibility<[ClangOption, CC1Option, CLOption, DXCOption, FlangOption, FC1Option]>, HelpText<"Disable implicit builtin knowledge of functions">; def fno_builtin_ : Joined<["-"], "fno-builtin-">, Group, Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index a56fa41c49d34..88bce181d40d2 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -945,6 +945,13 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA, assert(false && "Unexpected action class for Flang tool."); } + // We support some options that are invalid for Fortran and have no effect. + // These are solely for compatibility with other compilers. Emit a warning if + // any such options are provided, then proceed normally. 
+ for (options::ID Opt : {options::OPT_fbuiltin, options::OPT_fno_builtin}) + if (const Arg *A = Args.getLastArg(Opt)) + D.Diag(diag::warn_drv_invalid_argument_for_flang) << A->getSpelling(); + const InputInfo &Input = Inputs[0]; types::ID InputType = Input.getType(); diff --git a/flang/test/Driver/flang-f-opts.f90 b/flang/test/Driver/flang-f-opts.f90 index b972b9b7b2a59..77bb4d7aa8a91 100644 --- a/flang/test/Driver/flang-f-opts.f90 +++ b/flang/test/Driver/flang-f-opts.f90 @@ -13,3 +13,16 @@ ! CHECK-PROFILE-GENERATE-LLVM: "-fprofile-generate" ! RUN: %flang -### -S -fprofile-use=%S %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-USE-DIR %s ! CHECK-PROFILE-USE-DIR: "-fprofile-use={{.*}}" + +! RUN: %flang -### -fbuiltin %s 2>&1 \ +! RUN: | FileCheck %s -check-prefix=WARN-BUILTIN +! WARN-BUILTIN: warning: '-fbuiltin' is not valid for Fortran + +! RUN: %flang -### -fno-builtin %s 2>&1 \ +! RUN: | FileCheck %s -check-prefix=WARN-NO-BUILTIN +! WARN-NO-BUILTIN: warning: '-fno-builtin' is not valid for Fortran + +! RUN: %flang -### -fbuiltin -fno-builtin %s 2>&1 \ +! RUN: | FileCheck %s -check-prefix=WARN-BUILTIN-MULTIPLE +! WARN-BUILTIN-MULTIPLE: warning: '-fbuiltin' is not valid for Fortran +! WARN-BUILTIN-MULTIPLE: warning: '-fno-builtin' is not valid for Fortran From 88558d52c71081d5c6c372f87fb454a89747c5dd Mon Sep 17 00:00:00 2001 From: jimingham Date: Mon, 27 Oct 2025 11:38:10 -0700 Subject: [PATCH 20/32] Avoid stalls when MainLoop::Interrupt fails to wake up the MainLoop (#164905) Turns out there's a bug in the current lldb sources that if you fork, set the stdio file handles to close on exec and then exec lldb with some commands and the `--batch` flag, lldb will stall on exit. The first cause of the bug is that the Python session handler - and probably other places in lldb - think 0, 1, and 2 HAVE TO BE the stdio file handles, and open and close and dup them as needed. NB: I am NOT trying to fix that bug. 
I'm not convinced running the lldb driver headless is worth a lot of effort; it's just as easy to redirect them to /dev/null, which does work. But I would like to keep lldb from stalling on the way out when this happens. The reason we stall is that we have a MainLoop waiting for signals, and we try to Interrupt it, but because stdio was closed, the interrupt pipe for the MainLoop gets the file descriptor 0, which gets closed by the Python session handler if you run some script command. So the Interrupt fails. We were running the Write to the interrupt pipe wrapped in `llvm::cantFail`, but in a no-asserts build that just drops the error on the floor. So then lldb went on to call std::thread::join on the still-active MainLoop, and that stalls. I made Interrupt (and AddCallback & AddPendingCallback) return a bool for "interrupt success" instead. In all the places where code was requesting termination, I added checks for that failure, and skip the std::thread::join call on the MainLoop thread, since that is almost certainly going to stall at this point. I didn't add any special failure handling for the Windows MainLoop, as I don't know if/when the WSASetEvent call can fail, so there Interrupt just returns the result of WSASetEvent. I also didn't turn the test off for Windows. According to the Python docs, all the APIs I used should work on Windows... If that turns out not to be true, I'll make the test Darwin/Unix only.
--- lldb/include/lldb/Host/MainLoopBase.h | 22 +++++--- lldb/include/lldb/Host/posix/MainLoopPosix.h | 2 +- .../lldb/Host/windows/MainLoopWindows.h | 2 +- lldb/source/Host/common/MainLoopBase.cpp | 6 ++- lldb/source/Host/posix/MainLoopPosix.cpp | 7 +-- lldb/source/Host/windows/MainLoopWindows.cpp | 4 +- .../Protocol/MCP/ProtocolServerMCP.cpp | 7 +-- .../stdio_closed/TestDriverWithClosedSTDIO.py | 51 +++++++++++++++++++ lldb/tools/driver/Driver.cpp | 7 +-- lldb/tools/lldb-dap/DAP.cpp | 8 +-- lldb/unittests/DAP/TestBase.cpp | 3 +- lldb/unittests/Host/JSONTransportTest.cpp | 19 ++++--- lldb/unittests/Host/MainLoopTest.cpp | 37 +++++++++----- .../Protocol/ProtocolMCPServerTest.cpp | 3 +- 14 files changed, 133 insertions(+), 45 deletions(-) create mode 100644 lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py diff --git a/lldb/include/lldb/Host/MainLoopBase.h b/lldb/include/lldb/Host/MainLoopBase.h index be9a2676e7443..9529f2c214784 100644 --- a/lldb/include/lldb/Host/MainLoopBase.h +++ b/lldb/include/lldb/Host/MainLoopBase.h @@ -57,18 +57,23 @@ class MainLoopBase { // Add a pending callback that will be executed once after all the pending // events are processed. The callback will be executed even if termination // was requested. - void AddPendingCallback(const Callback &callback) { - AddCallback(callback, std::chrono::steady_clock::time_point()); + // Returns false if an interrupt was needed to get the loop to act on the new + // callback, but the interrupt failed, true otherwise. Mostly used when the + // pending callback is a RequestTermination, since if the interrupt fails for + // that callback, waiting for the MainLoop thread to terminate could stall. + bool AddPendingCallback(const Callback &callback) { + return AddCallback(callback, std::chrono::steady_clock::time_point()); } // Add a callback that will be executed after a certain amount of time has - // passed. 
- void AddCallback(const Callback &callback, std::chrono::nanoseconds delay) { - AddCallback(callback, std::chrono::steady_clock::now() + delay); + // passed. See AddPendingCallback comment for the return value. + bool AddCallback(const Callback &callback, std::chrono::nanoseconds delay) { + return AddCallback(callback, std::chrono::steady_clock::now() + delay); } // Add a callback that will be executed after a given point in time. - void AddCallback(const Callback &callback, TimePoint point); + // See AddPendingCallback comment for the return value. + bool AddCallback(const Callback &callback, TimePoint point); // Waits for registered events and invoke the proper callbacks. Returns when // all callbacks deregister themselves or when someone requests termination. @@ -85,8 +90,9 @@ class MainLoopBase { virtual void UnregisterReadObject(IOObject::WaitableHandle handle) = 0; - // Interrupt the loop that is currently waiting for events. - virtual void Interrupt() = 0; + /// Interrupt the loop that is currently waiting for events. Return true if + /// the interrupt succeeded, false if it failed. 
+ virtual bool Interrupt() = 0; void ProcessCallbacks(); diff --git a/lldb/include/lldb/Host/posix/MainLoopPosix.h b/lldb/include/lldb/Host/posix/MainLoopPosix.h index e9ac798b948df..92cdbe9d87ec3 100644 --- a/lldb/include/lldb/Host/posix/MainLoopPosix.h +++ b/lldb/include/lldb/Host/posix/MainLoopPosix.h @@ -54,7 +54,7 @@ class MainLoopPosix : public MainLoopBase { void UnregisterReadObject(IOObject::WaitableHandle handle) override; void UnregisterSignal(int signo, std::list::iterator callback_it); - void Interrupt() override; + bool Interrupt() override; private: void ProcessReadObject(IOObject::WaitableHandle handle); diff --git a/lldb/include/lldb/Host/windows/MainLoopWindows.h b/lldb/include/lldb/Host/windows/MainLoopWindows.h index 705e7e78ba48a..65b44aa1582c3 100644 --- a/lldb/include/lldb/Host/windows/MainLoopWindows.h +++ b/lldb/include/lldb/Host/windows/MainLoopWindows.h @@ -50,7 +50,7 @@ class MainLoopWindows : public MainLoopBase { protected: void UnregisterReadObject(IOObject::WaitableHandle handle) override; - void Interrupt() override; + bool Interrupt() override; private: llvm::Expected Poll(); diff --git a/lldb/source/Host/common/MainLoopBase.cpp b/lldb/source/Host/common/MainLoopBase.cpp index 64a57e65849e9..232b9bc0aa354 100644 --- a/lldb/source/Host/common/MainLoopBase.cpp +++ b/lldb/source/Host/common/MainLoopBase.cpp @@ -12,8 +12,9 @@ using namespace lldb; using namespace lldb_private; -void MainLoopBase::AddCallback(const Callback &callback, TimePoint point) { +bool MainLoopBase::AddCallback(const Callback &callback, TimePoint point) { bool interrupt_needed; + bool interrupt_succeeded = true; { std::lock_guard lock{m_callback_mutex}; // We need to interrupt the main thread if this callback is scheduled to @@ -22,7 +23,8 @@ void MainLoopBase::AddCallback(const Callback &callback, TimePoint point) { m_callbacks.emplace(point, callback); } if (interrupt_needed) - Interrupt(); + interrupt_succeeded = Interrupt(); + return interrupt_succeeded; } 
void MainLoopBase::ProcessCallbacks() { diff --git a/lldb/source/Host/posix/MainLoopPosix.cpp b/lldb/source/Host/posix/MainLoopPosix.cpp index 19a7128fbe407..c6fe7814bd22e 100644 --- a/lldb/source/Host/posix/MainLoopPosix.cpp +++ b/lldb/source/Host/posix/MainLoopPosix.cpp @@ -387,10 +387,11 @@ void MainLoopPosix::ProcessSignal(int signo) { } } -void MainLoopPosix::Interrupt() { +bool MainLoopPosix::Interrupt() { if (m_interrupting.exchange(true)) - return; + return true; char c = '.'; - cantFail(m_interrupt_pipe.Write(&c, 1)); + llvm::Expected result = m_interrupt_pipe.Write(&c, 1); + return result && *result != 0; } diff --git a/lldb/source/Host/windows/MainLoopWindows.cpp b/lldb/source/Host/windows/MainLoopWindows.cpp index 9b7df10258bcd..5e5888aee2181 100644 --- a/lldb/source/Host/windows/MainLoopWindows.cpp +++ b/lldb/source/Host/windows/MainLoopWindows.cpp @@ -272,4 +272,6 @@ Status MainLoopWindows::Run() { return Status(); } -void MainLoopWindows::Interrupt() { WSASetEvent(m_interrupt_event); } +bool MainLoopWindows::Interrupt() { + return WSASetEvent(m_interrupt_event); +} diff --git a/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp index 390cf3eeb16a5..77a3ba6574cde 100644 --- a/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp +++ b/lldb/source/Plugins/Protocol/MCP/ProtocolServerMCP.cpp @@ -133,11 +133,12 @@ llvm::Error ProtocolServerMCP::Stop() { } // Stop the main loop. - m_loop.AddPendingCallback( + bool addition_succeeded = m_loop.AddPendingCallback( [](lldb_private::MainLoopBase &loop) { loop.RequestTermination(); }); - // Wait for the main loop to exit. - if (m_loop_thread.joinable()) + // Wait for the main loop to exit, but not if we didn't succeed in inserting + // our pending callback or we'll wait forever. 
+ if (addition_succeeded && m_loop_thread.joinable()) m_loop_thread.join(); m_accept_handles.clear(); diff --git a/lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py b/lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py new file mode 100644 index 0000000000000..cff97b822db81 --- /dev/null +++ b/lldb/test/API/driver/stdio_closed/TestDriverWithClosedSTDIO.py @@ -0,0 +1,51 @@ +""" +Test that if you exec lldb with the stdio file handles +closed, it is able to exit without hanging. +""" + + +import lldb +import os +import sys +import socket +import fcntl + +import lldbsuite.test.lldbutil as lldbutil +from lldbsuite.test.lldbtest import * + + +class TestDriverWithClosedSTDIO(TestBase): + # If your test case doesn't stress debug info, then + # set this to true. That way it won't be run once for + # each debug info format. + NO_DEBUG_INFO_TESTCASE = True + + def test_run_lldb_and_wait(self): + """This test forks, closes the stdio channels and exec's lldb. + Then it waits for it to exit and asserts it did that successfully""" + pid = os.fork() + if pid == 0: + fcntl.fcntl(sys.stdin, fcntl.F_SETFD, fcntl.FD_CLOEXEC) + fcntl.fcntl(sys.stdout, fcntl.F_SETFD, fcntl.FD_CLOEXEC) + fcntl.fcntl(sys.stderr, fcntl.F_SETFD, fcntl.FD_CLOEXEC) + lldb = lldbtest_config.lldbExec + print(f"About to run: {lldb}") + os.execlp( + lldb, + lldb, + "-x", + "-o", + "script print(lldb.debugger.GetNumTargets())", + "--batch", + ) + else: + if pid == -1: + print("Couldn't fork a process.") + return + ret_pid, status = os.waitpid(pid, 0) + # We're really just checking that lldb doesn't stall. + # At the time this test was written, if you close stdin + # in an asserts build, lldb aborts. So handle both + # of those cases. The failure will just be that the + # waitpid doesn't return, and the test times out. 
+ self.assertFalse(os.WIFSTOPPED(status), "We either exited or crashed.") diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index ba0041111045b..733331f4ddac0 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -902,9 +902,10 @@ int main(int argc, char const *argv[]) { } #if !defined(_WIN32) - signal_loop.AddPendingCallback( - [](MainLoopBase &loop) { loop.RequestTermination(); }); - signal_thread.join(); + // Try to interrupt the signal thread. If that succeeds, wait for it to exit. + if (signal_loop.AddPendingCallback( + [](MainLoopBase &loop) { loop.RequestTermination(); })) + signal_thread.join(); #endif return exit_code; diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index 3c4f2253d1ad5..f009a902f79e7 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -1104,9 +1104,11 @@ llvm::Error DAP::Loop() { "unhandled packet"); } - m_loop.AddPendingCallback( - [](MainLoopBase &loop) { loop.RequestTermination(); }); - thread.join(); + // Don't wait to join the mainloop thread if our callback wasn't added + // successfully, or we'll wait forever. 
+ if (m_loop.AddPendingCallback( + [](MainLoopBase &loop) { loop.RequestTermination(); })) + thread.join(); if (m_error_occurred) return llvm::createStringError(llvm::inconvertibleErrorCode(), diff --git a/lldb/unittests/DAP/TestBase.cpp b/lldb/unittests/DAP/TestBase.cpp index 83a303554ad6b..8cb459964f7d8 100644 --- a/lldb/unittests/DAP/TestBase.cpp +++ b/lldb/unittests/DAP/TestBase.cpp @@ -55,8 +55,9 @@ void TransportBase::SetUp() { } void TransportBase::Run() { - loop.AddPendingCallback( + bool addition_succeeded = loop.AddPendingCallback( [](lldb_private::MainLoopBase &loop) { loop.RequestTermination(); }); + EXPECT_TRUE(addition_succeeded); EXPECT_THAT_ERROR(loop.Run().takeError(), llvm::Succeeded()); } diff --git a/lldb/unittests/Host/JSONTransportTest.cpp b/lldb/unittests/Host/JSONTransportTest.cpp index 54f1372ca0fff..e90ab8e85a105 100644 --- a/lldb/unittests/Host/JSONTransportTest.cpp +++ b/lldb/unittests/Host/JSONTransportTest.cpp @@ -269,12 +269,13 @@ template class JSONTransportTest : public PipePairTest { loop.RequestTermination(); }); } - loop.AddCallback( + bool addition_succeeded = loop.AddCallback( [](MainLoopBase &loop) { loop.RequestTermination(); FAIL() << "timeout"; }, timeout); + EXPECT_TRUE(addition_succeeded); auto handle = transport->RegisterMessageHandler(loop, message_handler); if (!handle) return handle.takeError(); @@ -367,7 +368,9 @@ class TransportBinderTest : public testing::Test { } void Run() { - loop.AddPendingCallback([](auto &loop) { loop.RequestTermination(); }); + bool addition_succeeded = + loop.AddPendingCallback([](auto &loop) { loop.RequestTermination(); }); + EXPECT_TRUE(addition_succeeded); EXPECT_THAT_ERROR(loop.Run().takeError(), Succeeded()); } }; @@ -435,8 +438,9 @@ TEST_F(HTTPDelimitedJSONTransportTest, ReadPartialMessage) { EXPECT_CALL(message_handler, Received(Request{5, "foo", std::nullopt})); ASSERT_THAT_EXPECTED(input.Write(part1.data(), part1.size()), Succeeded()); - loop.AddPendingCallback( + bool 
addition_succeeded = loop.AddPendingCallback( [](MainLoopBase &loop) { loop.RequestTermination(); }); + EXPECT_TRUE(addition_succeeded); ASSERT_THAT_ERROR(Run(/*close_stdin=*/false), Succeeded()); ASSERT_THAT_EXPECTED(input.Write(part2.data(), part2.size()), Succeeded()); input.CloseWriteFileDescriptor(); @@ -454,15 +458,17 @@ TEST_F(HTTPDelimitedJSONTransportTest, ReadWithZeroByteWrites) { ASSERT_THAT_EXPECTED(input.Write(part1.data(), part1.size()), Succeeded()); // Run the main loop once for the initial read. - loop.AddPendingCallback( + bool addition_succeeded = loop.AddPendingCallback( [](MainLoopBase &loop) { loop.RequestTermination(); }); + EXPECT_TRUE(addition_succeeded); ASSERT_THAT_ERROR(Run(/*close_stdin=*/false), Succeeded()); // zero-byte write. ASSERT_THAT_EXPECTED(input.Write(part1.data(), 0), Succeeded()); // zero-byte write. - loop.AddPendingCallback( + addition_succeeded = loop.AddPendingCallback( [](MainLoopBase &loop) { loop.RequestTermination(); }); + EXPECT_TRUE(addition_succeeded); ASSERT_THAT_ERROR(Run(/*close_stdin=*/false), Succeeded()); // Write the remaining part of the message. 
@@ -569,8 +575,9 @@ TEST_F(JSONRPCTransportTest, ReadPartialMessage) { EXPECT_CALL(message_handler, Received(Request{42, "foo", std::nullopt})); ASSERT_THAT_EXPECTED(input.Write(part1.data(), part1.size()), Succeeded()); - loop.AddPendingCallback( + bool addition_succeeded = loop.AddPendingCallback( [](MainLoopBase &loop) { loop.RequestTermination(); }); + EXPECT_TRUE(addition_succeeded); ASSERT_THAT_ERROR(Run(/*close_input=*/false), Succeeded()); ASSERT_THAT_EXPECTED(input.Write(part2.data(), part2.size()), Succeeded()); diff --git a/lldb/unittests/Host/MainLoopTest.cpp b/lldb/unittests/Host/MainLoopTest.cpp index ae16d02101819..8a248100c936a 100644 --- a/lldb/unittests/Host/MainLoopTest.cpp +++ b/lldb/unittests/Host/MainLoopTest.cpp @@ -179,9 +179,13 @@ TEST_F(MainLoopTest, PipeDelayBetweenRegisterAndRun) { ASSERT_THAT_EXPECTED(pipe.Write(&X, len), llvm::HasValue(1)); }; // Add a write that triggers a read events. - loop.AddCallback(cb, std::chrono::milliseconds(500)); - loop.AddCallback([](MainLoopBase &loop) { loop.RequestTermination(); }, - std::chrono::milliseconds(1000)); + bool addition_succeeded = + loop.AddCallback(cb, std::chrono::milliseconds(500)); + ASSERT_TRUE(addition_succeeded); + addition_succeeded = + loop.AddCallback([](MainLoopBase &loop) { loop.RequestTermination(); }, + std::chrono::milliseconds(1000)); + ASSERT_TRUE(addition_succeeded); ASSERT_TRUE(error.Success()); ASSERT_TRUE(handle); @@ -310,8 +314,10 @@ TEST_F(MainLoopTest, NoSpuriousSocketReads) { error); ASSERT_THAT_ERROR(error.ToError(), llvm::Succeeded()); // Terminate the loop after one second. - loop.AddCallback([](MainLoopBase &loop) { loop.RequestTermination(); }, - std::chrono::seconds(1)); + bool addition_succeeded = + loop.AddCallback([](MainLoopBase &loop) { loop.RequestTermination(); }, + std::chrono::seconds(1)); + ASSERT_TRUE(addition_succeeded); ASSERT_THAT_ERROR(loop.Run().ToError(), llvm::Succeeded()); // Make sure the callback was called only once. 
@@ -388,10 +394,11 @@ TEST_F(MainLoopTest, PendingCallbackTrigger) { MainLoop loop; std::promise add_callback2; bool callback1_called = false; - loop.AddPendingCallback([&](MainLoopBase &loop) { + bool addition_succeeded = loop.AddPendingCallback([&](MainLoopBase &loop) { callback1_called = true; add_callback2.set_value(); }); + EXPECT_TRUE(addition_succeeded); Status error; ASSERT_THAT_ERROR(error.ToError(), llvm::Succeeded()); bool callback2_called = false; @@ -416,9 +423,11 @@ TEST_F(MainLoopTest, ManyPendingCallbacks) { // caused a deadlock when the pipe filled up (either because the main loop was // not running, because it was slow, or because it was busy/blocked doing // something else). - for (int i = 0; i < 65536; ++i) - loop.AddPendingCallback( + for (int i = 0; i < 65536; ++i) { + bool addition_succeeded = loop.AddPendingCallback( [&](MainLoopBase &loop) { loop.RequestTermination(); }); + EXPECT_TRUE(addition_succeeded); + } ASSERT_TRUE(loop.Run().Success()); } @@ -444,8 +453,10 @@ TEST_F(MainLoopTest, TimedCallbacksRunInOrder) { add_cb(2); add_cb(4); add_cb(1); - loop.AddCallback([](MainLoopBase &loop) { loop.RequestTermination(); }, - start + 5 * epsilon); + bool addition_succeeded = + loop.AddCallback([](MainLoopBase &loop) { loop.RequestTermination(); }, + start + 5 * epsilon); + EXPECT_TRUE(addition_succeeded); ASSERT_THAT_ERROR(loop.Run().takeError(), llvm::Succeeded()); EXPECT_GE(std::chrono::steady_clock::now() - start, 5 * epsilon); ASSERT_THAT(order, testing::ElementsAre(1, 2, 3, 4)); @@ -455,22 +466,24 @@ TEST_F(MainLoopTest, TimedCallbackShortensSleep) { MainLoop loop; auto start = std::chrono::steady_clock::now(); bool long_callback_called = false; - loop.AddCallback( + bool addition_succeeded = loop.AddCallback( [&](MainLoopBase &loop) { long_callback_called = true; loop.RequestTermination(); }, std::chrono::seconds(30)); + EXPECT_TRUE(addition_succeeded); std::future async_run = std::async(std::launch::async, &MainLoop::Run, 
std::ref(loop)); std::this_thread::sleep_for(std::chrono::milliseconds(100)); bool short_callback_called = false; - loop.AddCallback( + addition_succeeded = loop.AddCallback( [&](MainLoopBase &loop) { short_callback_called = true; loop.RequestTermination(); }, std::chrono::seconds(1)); + EXPECT_TRUE(addition_succeeded); ASSERT_THAT_ERROR(async_run.get().takeError(), llvm::Succeeded()); EXPECT_LT(std::chrono::steady_clock::now() - start, std::chrono::seconds(10)); EXPECT_TRUE(short_callback_called); diff --git a/lldb/unittests/Protocol/ProtocolMCPServerTest.cpp b/lldb/unittests/Protocol/ProtocolMCPServerTest.cpp index 45464db958e04..97f32e2fbb1bf 100644 --- a/lldb/unittests/Protocol/ProtocolMCPServerTest.cpp +++ b/lldb/unittests/Protocol/ProtocolMCPServerTest.cpp @@ -150,8 +150,9 @@ class ProtocolServerMCPTest : public testing::Test { /// Runs the MainLoop a single time, executing any pending callbacks. void Run() { - loop.AddPendingCallback( + bool addition_succeeded = loop.AddPendingCallback( [](MainLoopBase &loop) { loop.RequestTermination(); }); + EXPECT_TRUE(addition_succeeded); EXPECT_THAT_ERROR(loop.Run().takeError(), Succeeded()); } From f8a0599d761e4283b3877f0bf4043e01722dc448 Mon Sep 17 00:00:00 2001 From: sstwcw Date: Mon, 27 Oct 2025 18:41:09 +0000 Subject: [PATCH 21/32] [clang-format] Align comments following continued aligned lines (#164687) new ```C++ auto aaaaaaaaaaaaaaaaaaaaa = {}; // auto b = [] { // return; // }; auto aaaaaaaaaaaaaaaaaaaaa = {}; // auto b = [] { // return aaaaaaaaaaaaaaaaaaaaa; // }; ``` old ```C++ auto aaaaaaaaaaaaaaaaaaaaa = {}; // auto b = [] { // return; // }; auto aaaaaaaaaaaaaaaaaaaaa = {}; // auto b = [] { // return aaaaaaaaaaaaaaaaaaaaa; // }; ``` Aligning a line to another line involves keeping track of the tokens' positions. Previously the shift was incorrectly added to some tokens that did not move. Then the comments would end up in the wrong places. 
--- clang/lib/Format/WhitespaceManager.cpp | 25 +++++++++++++++++-------- clang/unittests/Format/FormatTest.cpp | 19 +++++++++++++++++++ 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index 65fc65e79fdc3..f24b8ab14bdce 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -288,6 +288,9 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, ArrayRef Matches, SmallVector &Changes) { int Shift = 0; + // Set when the shift is applied anywhere in the line. Cleared when the line + // ends. + bool LineShifted = false; // ScopeStack keeps track of the current scope depth. It contains the levels // of at most 2 scopes. The first one is the one that the matched token is @@ -339,8 +342,11 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, Changes[i - 1].Tok->is(tok::string_literal); bool SkipMatchCheck = InsideNestedScope || ContinuedStringLiteral; - if (CurrentChange.NewlinesBefore > 0 && !SkipMatchCheck) - Shift = 0; + if (CurrentChange.NewlinesBefore > 0) { + LineShifted = false; + if (!SkipMatchCheck) + Shift = 0; + } // If this is the first matching token to be aligned, remember by how many // spaces it has to be shifted, so the rest of the changes on the line are @@ -349,7 +355,6 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, Shift = Column - (RightJustify ? CurrentChange.TokenLength : 0) - CurrentChange.StartOfTokenColumn; ScopeStack = {CurrentChange.indentAndNestingLevel()}; - CurrentChange.Spaces += Shift; } if (Shift == 0) @@ -358,8 +363,10 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, // This is for lines that are split across multiple lines, as mentioned in // the ScopeStack comment. The stack size being 1 means that the token is // not in a scope that should not move. 
- if (ScopeStack.size() == 1u && CurrentChange.NewlinesBefore > 0 && - (ContinuedStringLiteral || InsideNestedScope)) { + if ((!Matches.empty() && Matches[0] == i) || + (ScopeStack.size() == 1u && CurrentChange.NewlinesBefore > 0 && + (ContinuedStringLiteral || InsideNestedScope))) { + LineShifted = true; CurrentChange.Spaces += Shift; } @@ -369,9 +376,11 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, static_cast(Changes[i].Tok->SpacesRequiredBefore) || CurrentChange.Tok->is(tok::eof)); - CurrentChange.StartOfTokenColumn += Shift; - if (i + 1 != Changes.size()) - Changes[i + 1].PreviousEndOfTokenColumn += Shift; + if (LineShifted) { + CurrentChange.StartOfTokenColumn += Shift; + if (i + 1 != Changes.size()) + Changes[i + 1].PreviousEndOfTokenColumn += Shift; + } // If PointerAlignment is PAS_Right, keep *s or &s next to the token, // except if the token is equal, then a space is needed. diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index ce68f91bef02a..d45babe1b82ad 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -19615,6 +19615,25 @@ TEST_F(FormatTest, AlignConsecutiveAssignments) { "};", Alignment); + // Aligning lines should not mess up the comments. However, feel free to + // change the test if it turns out that comments inside the closure should not + // be aligned with those outside it. + verifyFormat("auto aaaaaaaaaaaaaaaaaaaaa = {}; //\n" + "auto b = [] { //\n" + " return; //\n" + "};", + Alignment); + verifyFormat("auto aaaaaaaaaaaaaaaaaaaaa = {}; //\n" + "auto b = [] { //\n" + " return aaaaaaaaaaaaaaaaaaaaa; //\n" + "};", + Alignment); + verifyFormat("auto aaaaaaaaaaaaaaa = {}; //\n" + "auto b = [] { //\n" + " return aaaaaaaaaaaaaaaaaaaaa; //\n" + "};", + Alignment); + verifyFormat("auto b = f(aaaaaaaaaaaaaaaaaaaaaaaaa,\n" " ccc ? 
aaaaa : bbbbb,\n" " dddddddddddddddddddddddddd);", From a6788b52468fb1bf661ce76f95ad92d0050bd35e Mon Sep 17 00:00:00 2001 From: Hanumanth Date: Mon, 27 Oct 2025 14:43:18 -0400 Subject: [PATCH 22/32] [mlir][tensor] Fix runtime verification for `tensor.extract_slice` when size dimension value is 0 (#164878) Previously, the runtime verification pass would insert assertion statements with conditions that always evaluate to false for semantically valid `tensor.extract_slice` operations where one of the dimensions had a size of 0. The `tensor.extract_slice` runtime verification logic was unconditionally generating checks for the position of the last element (`offset + (size - 1) * stride`). When `size` is 0, this causes the assertion condition to always be false, leading to runtime failures even though the operation is semantically valid. This patch fixes the issue by making the `lastPos` check conditional. The offset is always verified, but the endpoint check is only performed when `size > 0` to avoid generating spurious assert statements. This issue was discovered through LiteRT model, where a dynamic shape calculation resulted in a zero-sized dimension being passed to `tensor.extract_slice`. The following is a simplified IR snippet from the model. After running the runtime verification pass, an assertion that always fails is generated because the SSA value `%3` becomes 0. 
```mlir func.func @simple_repro_from_liteRT_model(%arg0: tensor<10x4x1xf32>) -> tensor { %cst = arith.constant dense<0> : tensor<1xi32> %cst_0 = arith.constant dense<-1> : tensor<2xi32> %c-1 = arith.constant -1 : index %c0 = arith.constant 0 : index %c10 = arith.constant 10 : index %c1 = arith.constant 1 : index %c4 = arith.constant 4 : index %c2 = arith.constant 2 : index %0 = tensor.empty() : tensor<3xi32> %inserted_slice = tensor.insert_slice %cst into %0[0] [1] [1] : tensor<1xi32> into tensor<3xi32> %inserted_slice_1 = tensor.insert_slice %cst_0 into %inserted_slice[1] [2] [1] : tensor<2xi32> into tensor<3xi32> %extracted = tensor.extract %inserted_slice_1[%c0] : tensor<3xi32> %1 = index.casts %extracted : i32 to index %2 = arith.cmpi eq, %1, %c-1 : index %3 = arith.select %2, %c10, %1 : index %extracted_2 = tensor.extract %inserted_slice_1[%c1] : tensor<3xi32> %4 = index.casts %extracted_2 : i32 to index %5 = arith.cmpi eq, %4, %c-1 : index %6 = arith.select %5, %c4, %4 : index %extracted_3 = tensor.extract %inserted_slice_1[%c2] : tensor<3xi32> %7 = index.casts %extracted_3 : i32 to index %8 = arith.cmpi eq, %7, %c-1 : index %9 = arith.select %8, %c1, %7 : index %extracted_slice = tensor.extract_slice %arg0[0, 0, 0] [%3, %6, %9] [1, 1, 1] : tensor<10x4x1xf32> to tensor return %extracted_slice : tensor } ``` The issue can be reproduced more simply with the following test case, where `dim_0` is `0`. When the runtime verification pass is applied to this code with `dim_0 = 0`, it generates an assertion that will always fail at runtime. 
```mlir func.func @extract_slice_zero_size_dim(%arg0: tensor<10x4x1xf32>, %dim_0: index, %dim_1: index, %dim_2: index) { %slice = tensor.extract_slice %arg0[0, 0, 0] [%dim_0, %dim_1, %dim_2] [1, 1, 1] : tensor<10x4x1xf32> to tensor return } func.func @test_zero_size_extraction() { %input = arith.constant dense<1.0> : tensor<10x4x1xf32> // Define slice dimensions: 0x4x1 (zero-size in first dimension) %dim_0 = arith.constant 0 : index %dim_1 = arith.constant 4 : index %dim_2 = arith.constant 1 : index func.call @extract_slice_zero_size_dim(%input, %dim_0, %dim_1, %dim_2) : (tensor<10x4x1xf32>, index, index, index) -> () return } ``` P.S. We probably have a similar issue with `memref.subview`. I will check this and send a separate PR for the issue. --------- Co-authored-by: Hanumanth Hanumantharayappa --- .../Transforms/RuntimeOpVerification.cpp | 30 +++++++++++++++++-- .../extract_slice-runtime-verification.mlir | 13 ++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/Transforms/RuntimeOpVerification.cpp b/mlir/lib/Dialect/Tensor/Transforms/RuntimeOpVerification.cpp index c031118606823..753cb95b1c906 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/RuntimeOpVerification.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/RuntimeOpVerification.cpp @@ -12,6 +12,7 @@ #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Interfaces/RuntimeVerifiableOpInterface.h" @@ -158,7 +159,11 @@ struct ExtractSliceOpInterface // 0 <= offset + (size - 1) * stride < dim_size Value zero = arith::ConstantIndexOp::create(builder, loc, 0); Value one = arith::ConstantIndexOp::create(builder, loc, 1); - for (int64_t i = 0, e = sourceType.getRank(); i < e; ++i) { + + for (int64_t i : llvm::seq(0, sourceType.getRank())) { + // Reset insertion point to before the 
operation for each dimension + builder.setInsertionPoint(extractSliceOp); + Value offset = getValueOrCreateConstantIndexOp( builder, loc, extractSliceOp.getMixedOffsets()[i]); Value size = getValueOrCreateConstantIndexOp( @@ -176,6 +181,16 @@ struct ExtractSliceOpInterface std::to_string(i) + " is out-of-bounds")); + // Only verify if size > 0 + Value sizeIsNonZero = arith::CmpIOp::create( + builder, loc, arith::CmpIPredicate::sgt, size, zero); + + auto ifOp = scf::IfOp::create(builder, loc, builder.getI1Type(), + sizeIsNonZero, /*withElseRegion=*/true); + + // Populate the "then" region (for size > 0). + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + // Verify that slice does not run out-of-bounds. Value sizeMinusOne = arith::SubIOp::create(builder, loc, size, one); Value sizeMinusOneTimesStride = @@ -184,8 +199,19 @@ struct ExtractSliceOpInterface arith::AddIOp::create(builder, loc, offset, sizeMinusOneTimesStride); Value lastPosInBounds = generateInBoundsCheck(builder, loc, lastPos, zero, dimSize); + scf::YieldOp::create(builder, loc, lastPosInBounds); + + // Populate the "else" region (for size == 0). 
+ builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); + Value trueVal = + arith::ConstantOp::create(builder, loc, builder.getBoolAttr(true)); + scf::YieldOp::create(builder, loc, trueVal); + + builder.setInsertionPointAfter(ifOp); + Value finalCondition = ifOp.getResult(0); + cf::AssertOp::create( - builder, loc, lastPosInBounds, + builder, loc, finalCondition, generateErrorMessage( op, "extract_slice runs out-of-bounds along dimension " + std::to_string(i))); diff --git a/mlir/test/Integration/Dialect/Tensor/extract_slice-runtime-verification.mlir b/mlir/test/Integration/Dialect/Tensor/extract_slice-runtime-verification.mlir index 0c7c4a6cb2d6f..a77fa310a3699 100644 --- a/mlir/test/Integration/Dialect/Tensor/extract_slice-runtime-verification.mlir +++ b/mlir/test/Integration/Dialect/Tensor/extract_slice-runtime-verification.mlir @@ -34,6 +34,12 @@ func.func @extract_slice_dynamic_rank_reduce(%tensor: tensor, %offset: return } +func.func @extract_slice_zero_size_dim(%arg0: tensor<10x4x1xf32>, %dim_0: index, %dim_1: index, %dim_2: index) { + tensor.extract_slice %arg0[0, 0, 0] [%dim_0, %dim_1, %dim_2] [1, 1, 1] : tensor<10x4x1xf32> to tensor + return +} + + func.func @main() { %0 = arith.constant 0 : index %1 = arith.constant 1 : index @@ -101,6 +107,13 @@ func.func @main() { // CHECK-NOT: ERROR: Runtime op verification failed func.call @extract_slice_dynamic_rank_reduce(%alloca_4_dyn, %0, %1, %0) : (tensor, index, index, index) -> () + %cst10x4x1xf32 = arith.constant dense<1.0> : tensor<10x4x1xf32> + + // CHECK-NOT: ERROR: Runtime op verification failed + %dim_0 = arith.constant 0 : index + %dim_1 = arith.constant 4 : index + %dim_2 = arith.constant 1 : index + func.call @extract_slice_zero_size_dim(%cst10x4x1xf32, %dim_0, %dim_1, %dim_2) : (tensor<10x4x1xf32>, index, index, index) -> () return } From cbe7c49e93b630d3388dba2663b08a3c5c1bc8b6 Mon Sep 17 00:00:00 2001 From: Hanumanth Date: Mon, 27 Oct 2025 14:43:45 -0400 Subject: [PATCH 23/32] 
[mlir][memref] Fix runtime verification for memref.subview when size dimension value is 0 (#164897) Previously, the runtime verification pass would insert assertion statements with conditions that always evaluate to false for semantically valid `memref.subview` operations where one of the dimensions had a size of 0. The `memref.subview` runtime verification logic was unconditionally generating checks for the position of the last element (`offset + (size - 1) * stride`). When `size` is 0, this causes the assertion condition to always be false, leading to runtime failures even though the operation is semantically valid. This patch fixes the issue by making the `lastPos` check conditional. The offset is always verified, but the endpoint check is only performed when `size > 0` to avoid generating spurious assert statements. This issue was discovered through a LiteRT model, where a dynamic shape calculation resulted in a zero-sized dimension being passed to `memref.subview`. The following is a simplified IR snippet from the model. After running the runtime verification pass, an assertion that always fails is generated because the SSA value `%5` becomes 0. 
```mlir module { memref.global "private" constant @__constant_2xi32 : memref<2xi32> = dense<-1> {alignment = 64 : i64} memref.global "private" constant @__constant_1xi32 : memref<1xi32> = dense<0> {alignment = 64 : i64} func.func @simpleRepro(%arg0: memref<10x4x1xf32, strided<[?, ?, ?], offset: ?>>) -> memref> { %c2 = arith.constant 2 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index %c10 = arith.constant 10 : index %c0 = arith.constant 0 : index %c-1 = arith.constant -1 : index %0 = memref.get_global @__constant_1xi32 : memref<1xi32> %1 = memref.get_global @__constant_2xi32 : memref<2xi32> %alloca = memref.alloca() {alignment = 64 : i64} : memref<3xi32> %subview = memref.subview %alloca[0] [1] [1] : memref<3xi32> to memref<1xi32, strided<[1]>> memref.copy %0, %subview : memref<1xi32> to memref<1xi32, strided<[1]>> %subview_0 = memref.subview %alloca[1] [2] [1] : memref<3xi32> to memref<2xi32, strided<[1], offset: 1>> memref.copy %1, %subview_0 : memref<2xi32> to memref<2xi32, strided<[1], offset: 1>> %2 = memref.load %alloca[%c0] : memref<3xi32> %3 = index.casts %2 : i32 to index %4 = arith.cmpi eq, %3, %c-1 : index %5 = arith.select %4, %c10, %3 : index %6 = memref.load %alloca[%c1] : memref<3xi32> %7 = index.casts %6 : i32 to index %8 = arith.cmpi eq, %7, %c-1 : index %9 = arith.select %8, %c4, %7 : index %10 = memref.load %alloca[%c2] : memref<3xi32> %11 = index.casts %10 : i32 to index %12 = arith.cmpi eq, %11, %c-1 : index %13 = arith.select %12, %c1, %11 : index %subview_1 = memref.subview %arg0[0, 0, 0] [%5, %9, %13] [1, 1, 1] : memref<10x4x1xf32, strided<[?, ?, ?], offset: ?>> to memref> return %subview_1 : memref> } } ``` P.S. 
This is a similar issue to the one fixed for `tensor.extract_slice` in https://github.com/llvm/llvm-project/pull/164878 --------- Co-authored-by: Hanumanth Hanumantharayappa --- .../Transforms/RuntimeOpVerification.cpp | 29 +++++++++++++++++-- .../MemRef/subview-runtime-verification.mlir | 21 ++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp b/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp index 291da1f76ca9b..14152c5a1af0c 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp @@ -15,6 +15,7 @@ #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/MemRef/Utils/MemRefUtils.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Interfaces/RuntimeVerifiableOpInterface.h" using namespace mlir; @@ -273,7 +274,9 @@ struct SubViewOpInterface Value one = arith::ConstantIndexOp::create(builder, loc, 1); auto metadataOp = ExtractStridedMetadataOp::create(builder, loc, subView.getSource()); - for (int64_t i = 0, e = sourceType.getRank(); i < e; ++i) { + for (int64_t i : llvm::seq(0, sourceType.getRank())) { + // Reset insertion point to before the operation for each dimension + builder.setInsertionPoint(subView); Value offset = getValueOrCreateConstantIndexOp( builder, loc, subView.getMixedOffsets()[i]); Value size = getValueOrCreateConstantIndexOp(builder, loc, @@ -290,6 +293,16 @@ struct SubViewOpInterface std::to_string(i) + " is out-of-bounds")); + // Only verify if size > 0 + Value sizeIsNonZero = arith::CmpIOp::create( + builder, loc, arith::CmpIPredicate::sgt, size, zero); + + auto ifOp = scf::IfOp::create(builder, loc, builder.getI1Type(), + sizeIsNonZero, /*withElseRegion=*/true); + + // Populate the "then" region (for size > 0). 
+ builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + // Verify that slice does not run out-of-bounds. Value sizeMinusOne = arith::SubIOp::create(builder, loc, size, one); Value sizeMinusOneTimesStride = @@ -298,8 +311,20 @@ struct SubViewOpInterface arith::AddIOp::create(builder, loc, offset, sizeMinusOneTimesStride); Value lastPosInBounds = generateInBoundsCheck(builder, loc, lastPos, zero, dimSize); + + scf::YieldOp::create(builder, loc, lastPosInBounds); + + // Populate the "else" region (for size == 0). + builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); + Value trueVal = + arith::ConstantOp::create(builder, loc, builder.getBoolAttr(true)); + scf::YieldOp::create(builder, loc, trueVal); + + builder.setInsertionPointAfter(ifOp); + Value finalCondition = ifOp.getResult(0); + cf::AssertOp::create( - builder, loc, lastPosInBounds, + builder, loc, finalCondition, generateErrorMessage(op, "subview runs out-of-bounds along dimension " + std::to_string(i))); diff --git a/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir index 71e813c0a6300..84875675ac3d0 100644 --- a/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir +++ b/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir @@ -2,6 +2,7 @@ // RUN: -expand-strided-metadata \ // RUN: -lower-affine \ // RUN: -test-cf-assert \ +// RUN: -convert-scf-to-cf \ // RUN: -convert-to-llvm | \ // RUN: mlir-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_runner_utils 2>&1 | \ @@ -11,6 +12,7 @@ // RUN: -expand-strided-metadata \ // RUN: -lower-affine \ // RUN: -test-cf-assert \ +// RUN: -convert-scf-to-cf \ // RUN: -convert-to-llvm="allow-pattern-rollback=0" \ // RUN: -reconcile-unrealized-casts | \ // RUN: mlir-runner -e main -entry-point-result=void \ @@ -38,6 +40,17 @@ func.func @subview_dynamic_rank_reduce(%memref: memref, %offset: index, return } 
+func.func @subview_zero_size_dim(%memref: memref<10x4x1xf32, strided<[?, ?, ?], offset: ?>>, + %dim_0: index, + %dim_1: index, + %dim_2: index) { + %subview = memref.subview %memref[0, 0, 0] [%dim_0, %dim_1, %dim_2] [1, 1, 1] : + memref<10x4x1xf32, strided<[?, ?, ?], offset: ?>> to + memref> + return +} + + func.func @main() { %0 = arith.constant 0 : index %1 = arith.constant 1 : index @@ -105,6 +118,14 @@ func.func @main() { // CHECK-NOT: ERROR: Runtime op verification failed func.call @subview_dynamic_rank_reduce(%alloca_4_dyn, %0, %1, %0) : (memref, index, index, index) -> () + %alloca_10x4x1 = memref.alloca() : memref<10x4x1xf32> + %alloca_10x4x1_dyn_stride = memref.cast %alloca_10x4x1 : memref<10x4x1xf32> to memref<10x4x1xf32, strided<[?, ?, ?], offset: ?>> + // CHECK-NOT: ERROR: Runtime op verification failed + %dim_0 = arith.constant 0 : index + %dim_1 = arith.constant 4 : index + %dim_2 = arith.constant 1 : index + func.call @subview_zero_size_dim(%alloca_10x4x1_dyn_stride, %dim_0, %dim_1, %dim_2) + : (memref<10x4x1xf32, strided<[?, ?, ?], offset: ?>>, index, index, index) -> () return } From d8184343755ababad6479d07451f36dd695f75c1 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Mon, 27 Oct 2025 11:53:30 -0700 Subject: [PATCH 24/32] [LLDB] Add debug output to test to diagnose bot failure --- lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py b/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py index 3b414ddb78b91..3633701833220 100644 --- a/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py +++ b/lldb/test/API/macosx/posix_spawn/TestLaunchProcessPosixSpawn.py @@ -21,6 +21,7 @@ def rosetta_debugserver_installed(): import platform version = platform.mac_ver() # Workaround for an undiagnosed problem on green dragon. 
+ print(version) if version[0] and version[0][0] == '15' and version[0][1] == '5': return False return exists("/Library/Apple/usr/libexec/oah/debugserver") From cc868f6545592cded1521f84034df238c96a187c Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Mon, 27 Oct 2025 19:06:43 +0000 Subject: [PATCH 25/32] [clang][DebugInfo][test] Convert Objective-C property test to check LLVM IR (#165286) There's a couple of tests like this. This patch series renames these to something more descriptive and adjusts the tests to check IR. Currently the tests check raw assembly output (not even dwarfdump). Which most likely hid some bugs around property debug-info. --- clang/test/DebugInfo/ObjC/property-basic.m | 20 ++++++++++++++++++++ clang/test/DebugInfo/ObjC/property.m | 15 --------------- 2 files changed, 20 insertions(+), 15 deletions(-) create mode 100644 clang/test/DebugInfo/ObjC/property-basic.m delete mode 100644 clang/test/DebugInfo/ObjC/property.m diff --git a/clang/test/DebugInfo/ObjC/property-basic.m b/clang/test/DebugInfo/ObjC/property-basic.m new file mode 100644 index 0000000000000..65e1d7a6a9b1f --- /dev/null +++ b/clang/test/DebugInfo/ObjC/property-basic.m @@ -0,0 +1,20 @@ +// Checks basic debug-info generation for property. Makes sure we +// create a DIObjCProperty for the synthesized property. + +// RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s + +// CHECK: !DIObjCProperty(name: "p1" +// CHECK-SAME: attributes: 2316 +// CHECK-SAME: type: ![[P1_TYPE:[0-9]+]] +// +// CHECK: ![[P1_TYPE]] = !DIBasicType(name: "int" + +@interface I1 { +int p1; +} +@property int p1; +@end + +@implementation I1 +@synthesize p1; +@end diff --git a/clang/test/DebugInfo/ObjC/property.m b/clang/test/DebugInfo/ObjC/property.m deleted file mode 100644 index ca013b24be421..0000000000000 --- a/clang/test/DebugInfo/ObjC/property.m +++ /dev/null @@ -1,15 +0,0 @@ -// FIXME: Check IR rather than asm, then triple is not needed. 
-// RUN: %clang_cc1 -triple %itanium_abi_triple -S -debug-info-kind=limited %s -o - | FileCheck %s - -// CHECK: AT_APPLE_property_name -// CHECK: AT_APPLE_property_attribute -// CHECK: AT_APPLE_property -@interface I1 { -int p1; -} -@property int p1; -@end - -@implementation I1 -@synthesize p1; -@end From 90489adf7a9944f53e7be411bab92174d9e069d1 Mon Sep 17 00:00:00 2001 From: Mark Danial Date: Mon, 27 Oct 2025 15:08:53 -0400 Subject: [PATCH 26/32] [clang][DebugInfo] Disable objective-CXX tests on AIX and z/OS (#164765) These tests are not supported on AIX and z/OS; disable them to get clang-ppc64-aix green. --- clang/test/DebugInfo/ObjCXX/lit.local.cfg | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 clang/test/DebugInfo/ObjCXX/lit.local.cfg diff --git a/clang/test/DebugInfo/ObjCXX/lit.local.cfg b/clang/test/DebugInfo/ObjCXX/lit.local.cfg new file mode 100644 index 0000000000000..8d5c476a2f682 --- /dev/null +++ b/clang/test/DebugInfo/ObjCXX/lit.local.cfg @@ -0,0 +1,5 @@ +# objective-CXX is not supported on AIX and zOS +unsupported_platforms = [ "system-aix", "system-zos" ] + +if any(up in config.available_features for up in unsupported_platforms): + config.unsupported = True From 616f3b5aa118e95089679fd2a2b79fe769bbaf9a Mon Sep 17 00:00:00 2001 From: Shimin Cui Date: Mon, 27 Oct 2025 15:17:51 -0400 Subject: [PATCH 27/32] [DA] Fix crash when two loops have different type sizes of becount (#165021) The backedge-taken counts of two loops can have different type sizes; this fixes the resulting crash in haveSameSD (https://github.com/llvm/llvm-project/issues/165014). 
--------- Co-authored-by: Shimin Cui --- llvm/lib/Analysis/DependenceAnalysis.cpp | 11 ++- .../same-sd-for-diff-becount-type-loops.ll | 68 +++++++++++++++++++ 2 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index a572eefddd20e..84ee8c0bf3e18 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -1131,9 +1131,14 @@ bool DependenceInfo::haveSameSD(const Loop *SrcLoop, if (SE->hasLoopInvariantBackedgeTakenCount(DstLoop)) DstUP = SE->getBackedgeTakenCount(DstLoop); - if (SrcUB != nullptr && DstUP != nullptr && - SE->isKnownPredicate(ICmpInst::ICMP_EQ, SrcUB, DstUP)) - return true; + if (SrcUB != nullptr && DstUP != nullptr) { + Type *WiderType = SE->getWiderType(SrcUB->getType(), DstUP->getType()); + SrcUB = SE->getNoopOrZeroExtend(SrcUB, WiderType); + DstUP = SE->getNoopOrZeroExtend(DstUP, WiderType); + + if (SE->isKnownPredicate(ICmpInst::ICMP_EQ, SrcUB, DstUP)) + return true; + } return false; } diff --git a/llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll b/llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll new file mode 100644 index 0000000000000..66880b5a553ec --- /dev/null +++ b/llvm/test/Analysis/DependenceAnalysis/same-sd-for-diff-becount-type-loops.ll @@ -0,0 +1,68 @@ +; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 | FileCheck %s + +define void @f1() { +; CHECK-LABEL: 'f1' +; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: store i32 0, ptr null, align 4 +; CHECK-NEXT: da analyze - consistent output [S]! +; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: %2 = load i32, ptr null, align 4 +; CHECK-NEXT: da analyze - consistent flow [|<]! 
+; CHECK-NEXT: Src: %2 = load i32, ptr null, align 4 --> Dst: %2 = load i32, ptr null, align 4 +; CHECK-NEXT: da analyze - consistent input [S]! +; +entry: + br label %for.1.header + +for.1.header: ; preds = %for.2.end, %entry + br label %for.1.body + +for.1.body: ; preds = %for.1.body, %whiledo + %0 = phi i32 [ 0, %for.1.header ], [ 1, %for.1.body ] + store i32 0, ptr null, align 4 + %1 = icmp ult i32 %0, 1 + br i1 %1, label %for.1.body, label %for.1.end + +for.1.end: ; preds = %for.1.body + br label %for.2.body + +for.2.body: ; preds = %for.2.body, %for.1.end + %2 = load i32, ptr null, align 4 + br i1 false, label %for.2.body, label %exit + +exit: ; preds = %for.2.body + ret void +} + +define void @f2() { +; CHECK-LABEL: 'f2' +; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: store i32 0, ptr null, align 4 +; CHECK-NEXT: da analyze - consistent output [S]! +; CHECK-NEXT: Src: store i32 0, ptr null, align 4 --> Dst: %3 = load i32, ptr null, align 4 +; CHECK-NEXT: da analyze - flow [|<] / assuming 1 loop level(s) fused: [S|<]! +; CHECK-NEXT: Src: %3 = load i32, ptr null, align 4 --> Dst: %3 = load i32, ptr null, align 4 +; CHECK-NEXT: da analyze - consistent input [S]! 
+; +entry: + br label %for.1.header + +for.1.header: ; preds = %for.2.end, %entry + br label %for.1.body + +for.1.body: ; preds = %for.1.body, %whiledo + %0 = phi i32 [ 0, %for.1.header ], [ 1, %for.1.body ] + store i32 0, ptr null, align 4 + %1 = icmp ult i32 %0, 1 + br i1 %1, label %for.1.body, label %for.1.end + +for.1.end: ; preds = %for.1.body + br label %for.2.body + +for.2.body: ; preds = %for.2.body, %for.1.end + %2 = phi i64 [ 0, %for.1.end ], [ %4, %for.2.body ] + %3 = load i32, ptr null, align 4 + %4 = add nuw nsw i64 %2, 1 + %5 = icmp ult i64 %4, 2 + br i1 %5, label %for.2.body, label %exit + +exit: ; preds = %for.2.body + ret void +} From 267b5b8901fad878add53159e7a11050bec2e0f3 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Mon, 27 Oct 2025 19:17:23 +0000 Subject: [PATCH 28/32] [clang][DebugInfo][test] Rename Objective-C test Changes test name to something more meaningful. In preparation to refactoring the test to check LLVM IR instead of assembly. --- clang/test/DebugInfo/ObjC/{property4.m => property-auto-synth.m} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename clang/test/DebugInfo/ObjC/{property4.m => property-auto-synth.m} (100%) diff --git a/clang/test/DebugInfo/ObjC/property4.m b/clang/test/DebugInfo/ObjC/property-auto-synth.m similarity index 100% rename from clang/test/DebugInfo/ObjC/property4.m rename to clang/test/DebugInfo/ObjC/property-auto-synth.m From cd9d48777e3b1f2d46791e7d834a80f1b6a14c74 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Mon, 27 Oct 2025 12:22:13 -0700 Subject: [PATCH 29/32] [MLIR][ExecutionEngine] don't dump decls (#164478) Currently ExecutionEngine tries to dump all functions declared in the module, even those which are "external" (i.e., linked/loaded at runtime). E.g. 
```mlir func.func private @printF32(f32) func.func @supported_arg_types(%arg0: i32, %arg1: f32) { call @printF32(%arg1) : (f32) -> () return } ``` fails with ``` Could not compile printF32: Symbols not found: [ __mlir_printF32 ] Program aborted due to an unhandled Error: Symbols not found: [ __mlir_printF32 ] ``` even though `printF32` can be provided at final build time (i.e., when the object file is linked to some executable or shlib). E.g., if our own `libmlir_c_runner_utils` is linked. So just skip functions which have no bodies during dump (i.e., are declarations without definitions). --- mlir/lib/ExecutionEngine/ExecutionEngine.cpp | 2 ++ mlir/test/python/CMakeLists.txt | 2 +- mlir/test/python/execution_engine.py | 27 +++++++++++++++----- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/mlir/lib/ExecutionEngine/ExecutionEngine.cpp b/mlir/lib/ExecutionEngine/ExecutionEngine.cpp index 52162a43aeae3..2255633c746b3 100644 --- a/mlir/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/mlir/lib/ExecutionEngine/ExecutionEngine.cpp @@ -239,6 +239,8 @@ ExecutionEngine::create(Operation *m, const ExecutionEngineOptions &options, // Remember all entry-points if object dumping is enabled. 
if (options.enableObjectDump) { for (auto funcOp : m->getRegion(0).getOps()) { + if (funcOp.getBlocks().empty()) + continue; StringRef funcName = funcOp.getSymName(); engine->functionNames.push_back(funcName.str()); } diff --git a/mlir/test/python/CMakeLists.txt b/mlir/test/python/CMakeLists.txt index e1e82ef367b1e..2c123811c2998 100644 --- a/mlir/test/python/CMakeLists.txt +++ b/mlir/test/python/CMakeLists.txt @@ -11,7 +11,7 @@ add_public_tablegen_target(MLIRPythonTestIncGen) add_subdirectory(lib) -set(MLIR_PYTHON_TEST_DEPENDS MLIRPythonModules) +set(MLIR_PYTHON_TEST_DEPENDS MLIRPythonModules mlir-runner) if(NOT MLIR_STANDALONE_BUILD) list(APPEND MLIR_PYTHON_TEST_DEPENDS FileCheck count not) endif() diff --git a/mlir/test/python/execution_engine.py b/mlir/test/python/execution_engine.py index d569fcef32bfd..146e213a9229e 100644 --- a/mlir/test/python/execution_engine.py +++ b/mlir/test/python/execution_engine.py @@ -1,6 +1,7 @@ # RUN: env MLIR_RUNNER_UTILS=%mlir_runner_utils MLIR_C_RUNNER_UTILS=%mlir_c_runner_utils %PYTHON %s 2>&1 | FileCheck %s # REQUIRES: host-supports-jit import gc, sys, os, tempfile +from textwrap import dedent from mlir.ir import * from mlir.passmanager import * from mlir.execution_engine import * @@ -21,6 +22,7 @@ "MLIR_C_RUNNER_UTILS", "../../../../lib/libmlir_c_runner_utils.so" ) + # Log everything to stderr and flush so that we have a unified stream to match # errors/info emitted by MLIR to stderr. 
def log(*args): @@ -337,6 +339,7 @@ def callback(a): ctypes.pointer(ctypes.pointer(get_ranked_memref_descriptor(inp_arr))), ) + run(testUnrankedMemRefWithOffsetCallback) @@ -785,15 +788,25 @@ def testDumpToObjectFile(): try: with Context(): module = Module.parse( - """ - module { - func.func @main() attributes { llvm.emit_c_interface } { - return - } - }""" + dedent( + """ + func.func private @printF32(f32) + func.func @main(%arg0: f32) attributes { llvm.emit_c_interface } { + call @printF32(%arg0) : (f32) -> () + return + } + """ + ) ) - execution_engine = ExecutionEngine(lowerToLLVM(module), opt_level=3) + execution_engine = ExecutionEngine( + lowerToLLVM(module), + opt_level=3, + # Loading MLIR_C_RUNNER_UTILS is necessary even though we don't actually run the code (i.e., call printF32) + # because RTDyldObjectLinkingLayer::emit will try to resolve symbols before dumping + # (see the jitLinkForORC call at the bottom there). + shared_libs=[MLIR_C_RUNNER_UTILS], + ) # CHECK: Object file exists: True print(f"Object file exists: {os.path.exists(object_path)}") From 9abae17b25f937376e5036b080b473f948232968 Mon Sep 17 00:00:00 2001 From: Tomer Shafir Date: Mon, 27 Oct 2025 21:22:23 +0200 Subject: [PATCH 30/32] [UpdateTestChecks][llc] Support `arm64-apple-darwin` (#165092) Adds `arm64-apple-darwin` support to `asm.py` matching and removes now invalidated `target-triple-mismatch` test (I dont have another triple supported by llc but not the autogenerator that make this test useful). 
--- .../Inputs/target-triple-mismatch.ll | 7 ------- .../target-triple-mismatch.test | 11 ----------- llvm/utils/UpdateTestChecks/asm.py | 1 + 3 files changed, 1 insertion(+), 18 deletions(-) delete mode 100644 llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/target-triple-mismatch.ll delete mode 100644 llvm/test/tools/UpdateTestChecks/update_llc_test_checks/target-triple-mismatch.test diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/target-triple-mismatch.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/target-triple-mismatch.ll deleted file mode 100644 index 3da27cbacd172..0000000000000 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/target-triple-mismatch.ll +++ /dev/null @@ -1,7 +0,0 @@ -; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s - -define i64 @foo(i64 %a) { -entry: - %b = add i64 %a, 1 - ret i64 %b -} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/target-triple-mismatch.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/target-triple-mismatch.test deleted file mode 100644 index 3bbf14d469d4b..0000000000000 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/target-triple-mismatch.test +++ /dev/null @@ -1,11 +0,0 @@ -# REQUIRES: aarch64-registered-target -## Check that arm64-apple-darwin target triple is wrongly captured as arm64 (non-Apple) - -# RUN: cp -f %S/Inputs/target-triple-mismatch.ll %t.ll -# RUN: %update_llc_test_checks %t.ll 2>&1 | FileCheck %s --check-prefix=LOG -# RUN: FileCheck --input-file=%t.ll %s --check-prefix=AUTOGEN - -# LOG: WARNING: Couldn't match any function. Possibly the wrong target triple has been provided - -# AUTOGEN: ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -# AUTOGEN-NEXT: ; CHECK: {{.*}} diff --git a/llvm/utils/UpdateTestChecks/asm.py b/llvm/utils/UpdateTestChecks/asm.py index 457b23f4a5f59..469e27facedb0 100644 --- a/llvm/utils/UpdateTestChecks/asm.py +++ b/llvm/utils/UpdateTestChecks/asm.py @@ -570,6 +570,7 @@ def get_run_handler(triple): "arm64": (scrub_asm_arm_eabi, ASM_FUNCTION_AARCH64_RE), "arm64e": (scrub_asm_arm_eabi, ASM_FUNCTION_AARCH64_DARWIN_RE), "arm64ec": (scrub_asm_arm_eabi, ASM_FUNCTION_AARCH64_RE), + "arm64-apple-darwin": (scrub_asm_arm_eabi, ASM_FUNCTION_AARCH64_DARWIN_RE), "arm64-apple-ios": (scrub_asm_arm_eabi, ASM_FUNCTION_AARCH64_DARWIN_RE), "arm64-apple-macosx": (scrub_asm_arm_eabi, ASM_FUNCTION_AARCH64_DARWIN_RE), "armv7-apple-ios": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_IOS_RE), From dce825248bb0286d97f199ae4d628923e926083c Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 27 Oct 2025 12:30:28 -0700 Subject: [PATCH 31/32] [RadixTree] Use std::optional for Node::Value (#165299) Don't rely on comparison to singular iterator, it's UB. Fixes bot crashes after https://github.com/llvm/llvm-project/pull/164524. --- llvm/include/llvm/ADT/RadixTree.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/ADT/RadixTree.h b/llvm/include/llvm/ADT/RadixTree.h index 9e2ab9753d50c..87e2a3ebecc06 100644 --- a/llvm/include/llvm/ADT/RadixTree.h +++ b/llvm/include/llvm/ADT/RadixTree.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -92,7 +93,7 @@ template class RadixTree { /// If this node does not have a value (i.e., it's an internal node that /// only serves as a path to other values), this iterator will be equal /// to default constructed `ContainerType::iterator()`. - typename ContainerType::iterator Value; + std::optional Value; /// The first character of the Key. Used for fast child lookup. 
KeyValueType KeyFront; @@ -215,7 +216,7 @@ template class RadixTree { KeyConstIteratorType{}}; void findNextValid() { - while (Curr && Curr->Value == typename ContainerType::iterator()) + while (Curr && !Curr->Value.has_value()) advance(); } @@ -249,7 +250,7 @@ template class RadixTree { public: IteratorImpl() = default; - MappedType &operator*() const { return *Curr->Value; } + MappedType &operator*() const { return **Curr->Value; } IteratorImpl &operator++() { advance(); @@ -315,12 +316,12 @@ template class RadixTree { const value_type &NewValue = KeyValuePairs.emplace_front( std::move(Key), T(std::forward(Args)...)); Node &Node = findOrCreate(NewValue.first); - bool HasValue = Node.Value != typename ContainerType::iterator(); + bool HasValue = Node.Value.has_value(); if (!HasValue) Node.Value = KeyValuePairs.begin(); else KeyValuePairs.pop_front(); - return {Node.Value, !HasValue}; + return {*Node.Value, !HasValue}; } /// From b7ba98c2c22f6d3bf450a624964615d43846aac4 Mon Sep 17 00:00:00 2001 From: Ron Lieberman Date: Mon, 27 Oct 2025 17:26:09 -0500 Subject: [PATCH 32/32] Regen llvm/test/CodeGen/AMDGPU/spill-restore-partial-copy.mir --- .../AMDGPU/spill-restore-partial-copy.mir | 182 ++++++++++++++++++ 1 file changed, 182 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/spill-restore-partial-copy.mir b/llvm/test/CodeGen/AMDGPU/spill-restore-partial-copy.mir index bb87b6e52da89..cd9a4d07b870d 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-restore-partial-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-restore-partial-copy.mir @@ -30,6 +30,66 @@ body: | ; GFX950-LABEL: name: full_copy ; GFX950: liveins: $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29 ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02 + ; GFX950-NEXT: 
frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr0 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr1 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr2 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr3 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr4 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr5 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr6 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr7 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr8 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr9 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr10 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr11 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr12 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr13 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr14 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr15 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr16 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr17 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr18 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr19 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr20 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr21 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr22 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr23 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr24 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr25 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr26 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr27 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr0 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr1 + ; GFX950-NEXT: frame-setup 
CFI_INSTRUCTION undefined $agpr2 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr3 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr4 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr5 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr6 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr7 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr8 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr9 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr10 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr11 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr12 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr13 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr14 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr15 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr16 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr17 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr18 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr19 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr20 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr21 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr22 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr23 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr24 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr25 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr26 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr27 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr28 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr29 ; GFX950-NEXT: renamable $agpr0_agpr1 = IMPLICIT_DEF ; GFX950-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF ; GFX950-NEXT: renamable 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF @@ -136,6 +196,66 @@ body: | ; GFX950-LABEL: name: partial_copy ; GFX950: liveins: $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27 ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr0 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr1 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr2 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr3 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr4 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr5 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr6 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr7 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr8 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr9 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr10 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr11 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr12 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr13 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr14 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr15 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr16 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr17 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr18 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr19 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr20 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr21 + ; 
GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr22 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr23 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr24 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr25 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr26 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr27 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr0 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr1 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr2 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr3 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr4 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr5 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr6 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr7 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr8 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr9 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr10 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr11 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr12 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr13 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr14 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr15 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr16 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr17 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr18 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr19 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr20 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr21 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr22 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr23 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr24 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined 
$agpr25 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr28 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr29 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr30 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr31 ; GFX950-NEXT: renamable $agpr0_agpr1 = IMPLICIT_DEF ; GFX950-NEXT: renamable $agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF ; GFX950-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF @@ -241,6 +361,68 @@ body: | ; GFX950-LABEL: name: full_spill ; GFX950: liveins: $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25 ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x90, 0x40, 0x94, 0x04, 0x36, 0x24, 0x36, 0xe9, 0x02 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION llvm_register_pair $pc_reg, $sgpr30, 32, $sgpr31, 32 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr0 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr1 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr2 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr3 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr4 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr5 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr6 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr7 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr8 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr9 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr10 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr11 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr12 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr13 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr14 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION 
undefined $vgpr15 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr16 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr17 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr18 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr19 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr20 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr21 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr22 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr23 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr24 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr25 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr26 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $vgpr27 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr0 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr1 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr2 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr3 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr4 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr5 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr6 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr7 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr8 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr9 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr10 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr11 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr12 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr13 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr14 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr15 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr16 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr17 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr18 + ; GFX950-NEXT: frame-setup 
CFI_INSTRUCTION undefined $agpr19 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr20 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr21 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr22 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr23 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr24 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr25 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr26 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr27 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr28 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr29 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr30 + ; GFX950-NEXT: frame-setup CFI_INSTRUCTION undefined $agpr31 ; GFX950-NEXT: renamable $agpr0_agpr1 = IMPLICIT_DEF ; GFX950-NEXT: renamable $agpr26_agpr27 = IMPLICIT_DEF ; GFX950-NEXT: renamable $agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF