ROCm · ronlieb · Nov 6, 2025 · Nov 6, 2025 · Nov 6, 2025 · Nov 6, 2025
diff --git a/compiler-rt/test/profile/instrprof-tmpdir.c b/compiler-rt/test/profile/instrprof-tmpdir.c
@@ -1,3 +1,8 @@
+// AIX does not support env -u.
+// TODO(boomanaiden154): Reenable AIX support once we use the internal shell by
+// default.
+// UNSUPPORTED: system-aix
+
 // RUN: rm -rf %t
 // RUN: mkdir -p %t
 // RUN: cd %t
@@ -12,8 +17,7 @@
 // RUN: llvm-profdata show ./raw2.profraw | FileCheck %s -check-prefix TMPDIR
 //
 // Check that we fall back to the default path if TMPDIR is missing.
-// RUN: %if system-aix %{ unset TMPDIR %}
-// RUN: env %if !system-aix %{ -u TMPDIR %} LLVM_PROFILE_FILE="%%t/raw3.profraw" %run %t/binary 2>&1 | FileCheck %s -check-prefix MISSING
+// RUN: env -u TMPDIR LLVM_PROFILE_FILE="%%t/raw3.profraw" %run %t/binary 2>&1 | FileCheck %s -check-prefix MISSING
 // RUN: llvm-profdata show ./default.profraw | FileCheck %s -check-prefix TMPDIR
 
 // TMPDIR: Maximum function count: 1

diff --git a/libcxx/utils/ci/buildkite-pipeline.yml b/libcxx/utils/ci/buildkite-pipeline.yml
@@ -103,7 +103,6 @@ steps:
       queue: libcxx-builders
       os: aix
     <<: *common
-    skip: "https://github.com/llvm/llvm-project/issues/162516"
 
   - label: AIX (64-bit)
     command: libcxx/utils/ci/run-buildbot aix
@@ -115,7 +114,6 @@ steps:
       queue: libcxx-builders
       os: aix
     <<: *common
-    skip: "https://github.com/llvm/llvm-project/issues/162516"
 
 - group: ':freebsd: FreeBSD'
   steps:

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1128,40 +1128,11 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
   if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx))
     return false;
 
-  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
   if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
     appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
     return true;
   }
 
-  // TODO: Verify the following code handles subregisters correctly.
-  // TODO: Handle extract of global reference
-  if (UseOp.getSubReg())
-    return false;
-
-  if (!OpToFold.isReg())
-    return false;
-
-  Register UseReg = OpToFold.getReg();
-  if (!UseReg.isVirtual())
-    return false;
-
-  // Maybe it is just a COPY of an immediate itself.
-
-  // FIXME: Remove this handling. There is already special case folding of
-  // immediate into copy in foldOperand. This is looking for the def of the
-  // value the folding started from in the first place.
-  MachineInstr *Def = MRI->getVRegDef(UseReg);
-  if (Def && TII->isFoldableCopy(*Def)) {
-    MachineOperand &DefOp = Def->getOperand(1);
-    if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
-      FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC,
-                              OpToFold.DefSubReg);
-      appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm);
-      return true;
-    }
-  }
-
   return false;
 }
 

diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
@@ -393,6 +394,17 @@ class ConstantTerminatorFoldingImpl {
       DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
       ++NumLoopExitsDeleted;
     }
+    // We don't really need to add branch weights to DummySwitch, because all
+    // but one of its branches are just a temporary artifact - see the comment
+    // on top of this function. But it's easy to estimate the weights, and
+    // doing so helps maintain a compiler-wide property: branch weights don't
+    // "just get dropped" accidentally (i.e. profcheck).
+    if (DummySwitch->getParent()->getParent()->hasProfileData()) {
+      SmallVector<uint32_t> DummyBranchWeights(1 + DummySwitch->getNumCases());
+      // Index 0 is the default case: 100% probability; the rest are dead.
+      DummyBranchWeights[0] = 1;
+      setBranchWeights(*DummySwitch, DummyBranchWeights, /*IsExpected=*/false);
+    }
 
     assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
     if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {

diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
@@ -222,3 +222,34 @@ body:             |
     $vgpr0 = COPY %3
     S_ENDPGM 0, implicit $vgpr0
 ...
+
+# Make sure the immediate materialized by the v_mov_b16 isn't
+# incorrectly folded into the bfi as 0.
+
+# FIXME: %4:vgpr_32 = COPY %3 is a direct copy from v16 to v32 and
+# should probably fail the verifier
+---
+name:            mov_v16_copy_v32_fold_b32_regression
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: mov_v16_copy_v32_fold_b32_regression
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 15360, 0, implicit $exec
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B16_t16_e64_]]
+    ; CHECK-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 32767, [[COPY2]], [[COPY1]], implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_BFI_B32_e64_]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:vgpr_32 = COPY $vgpr1
+    %1:vgpr_32 = COPY $vgpr0
+    %3:vgpr_16 = V_MOV_B16_t16_e64 0, 15360, 0, implicit $exec
+    %4:vgpr_32 = COPY %3
+    %5:vgpr_32 = V_BFI_B32_e64 32767, %4, %1, implicit $exec
+    $vgpr0 = COPY %5
+    SI_RETURN implicit $vgpr0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll b/llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s
+
+; Make sure that the 16-bit constant 0x3c00 isn't folded as 0 into
+; v_bfi_b32.
+define i32 @mov16_bfi_fold_regression(half %arg, i32 %arg1) {
+; CHECK-LABEL: bfi_fold_regression:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b16_e32 v2.l, 0x3c00
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; CHECK-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v0.l, vcc_lo
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_pack_b32_f16 v0, v0.l, 0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %cmp = icmp eq i32 %arg1, 0
+  %call = call half @llvm.copysign.f16(half 0xH3C00, half %arg)
+  %select = select i1 %cmp, half 0xH3C00, half %call
+  %insertelement = insertelement <2 x half> zeroinitializer, half %select, i64 0
+  %bitcast = bitcast <2 x half> %insertelement to i32
+  ret i32 %bitcast
+}
+
+declare half @llvm.copysign.f16(half, half) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }