Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,19 @@ void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst,
.addReg(VgprSrc);
});
}

bool AMDGPU::isBRC(LLT Ty) {
if (Ty.isPointer())
return true;

unsigned Size = Ty.getSizeInBits();
if (Size % 32 != 0)
return false;

// 32, 2x32, 3x32 ... 12x32, 16x32, 32x32
unsigned NumB32s = Size / 32;
if ((NumB32s >= 1 && NumB32s <= 12) || NumB32s == 16 || NumB32s == 32)
return true;

return false;
}
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
const RegisterBankInfo &RBI);
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
const RegisterBankInfo &RBI);

// "Reg Class" low level types. LLTs that fit into some register class on both
// sgpr and vgpr. LLTs with sizes 32, 2x32, 3x32 ... 12x32, 16x32, 32x32.
bool isBRC(LLT Ty);
}
}

Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -442,8 +442,8 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {

// Opcodes that support pretty much all combinations of reg banks and LLTs
// (except S1). There is no point in writing rules for them.
if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) {
if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_MERGE_VALUES ||
Opc == AMDGPU::G_BITCAST) {
RBLHelper.applyMappingTrivial(*MI);
continue;
}
Expand Down
60 changes: 60 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -910,6 +910,55 @@ bool RegBankLegalizeHelper::lower(MachineInstr &MI,
return lowerUnpackAExt(MI);
case WidenMMOToS32:
return widenMMOToS32(cast<GAnyLoad>(MI));
case VerifyAllSgpr: {
for (unsigned i = 0; i < MI.getNumOperands(); ++i)
assert(MRI.getRegBankOrNull(MI.getOperand(i).getReg()) == SgprRB);
return true;
}
case ApplyAllVgpr: {
B.setInstr(MI);
unsigned NumDefs = MI.getNumDefs();

for (unsigned i = 0; i < NumDefs; ++i)
assert(MRI.getRegBankOrNull(MI.getOperand(i).getReg()) == VgprRB);

for (unsigned i = NumDefs; i < MI.getNumOperands(); ++i) {
Register Reg = MI.getOperand(i).getReg();
if (MRI.getRegBank(Reg) != VgprRB) {
auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
MI.getOperand(i).setReg(Copy.getReg(0));
}
}

return true;
}
case UnmergeToShiftTrunc: {
GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
LLT Ty = MRI.getType(Unmerge->getSourceReg());
if (Ty.getSizeInBits() % 32 != 0) {
reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
"AMDGPU RegBankLegalize: unmerge not multiple of 32",
MI);
return false;
}

B.setInstr(MI);
if (Ty.getSizeInBits() > 32) {
auto Unmerge32 = B.buildUnmerge(SgprRB_S32, Unmerge->getSourceReg());
for (unsigned i = 0; i < Unmerge32->getNumDefs(); ++i) {
auto [Dst0S32, Dst1S32] = unpackAExt(Unmerge32->getOperand(i).getReg());
B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
}
} else {
auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
}

MI.eraseFromParent();
return true;
}
}

if (!WaterfallSgprs.empty()) {
Expand Down Expand Up @@ -1035,6 +1084,11 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
Ty == LLT::fixed_vector(8, 64))
return Ty;
return LLT();
case SgprBRC:
case VgprBRC:
if (isBRC(Ty))
return Ty;
return LLT();
default:
return LLT();
}
Expand Down Expand Up @@ -1069,6 +1123,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprB128:
case SgprB256:
case SgprB512:
case SgprBRC:
case UniInVcc:
case UniInVgprS16:
case UniInVgprS32:
Expand Down Expand Up @@ -1108,6 +1163,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case VgprB128:
case VgprB256:
case VgprB512:
case VgprBRC:
case Vgpr32SExt:
case Vgpr32ZExt:
return VgprRB;
Expand Down Expand Up @@ -1167,6 +1223,7 @@ bool RegBankLegalizeHelper::applyMappingDst(
case SgprB128:
case SgprB256:
case SgprB512:
case SgprBRC:
case SgprPtr32:
case SgprPtr64:
case SgprPtr128:
Expand All @@ -1176,6 +1233,7 @@ bool RegBankLegalizeHelper::applyMappingDst(
case VgprB128:
case VgprB256:
case VgprB512:
case VgprBRC:
case VgprPtr32:
case VgprPtr64:
case VgprPtr128: {
Expand Down Expand Up @@ -1307,6 +1365,7 @@ bool RegBankLegalizeHelper::applyMappingSrc(
case SgprB128:
case SgprB256:
case SgprB512:
case SgprBRC:
case SgprPtr32:
case SgprPtr64:
case SgprPtr128: {
Expand Down Expand Up @@ -1341,6 +1400,7 @@ bool RegBankLegalizeHelper::applyMappingSrc(
case VgprB128:
case VgprB256:
case VgprB512:
case VgprBRC:
case VgprPtr32:
case VgprPtr64:
case VgprPtr128: {
Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//

#include "AMDGPURegBankLegalizeRules.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
Expand Down Expand Up @@ -132,6 +133,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniform(Reg);
case UniB512:
return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniform(Reg);
case UniBRC:
return isBRC(MRI.getType(Reg)) && MUI.isUniform(Reg);
case DivS1:
return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg);
case DivS16:
Expand Down Expand Up @@ -172,6 +175,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergent(Reg);
case DivB512:
return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergent(Reg);
case DivBRC:
return isBRC(MRI.getType(Reg)) && MUI.isDivergent(Reg);
case _:
return true;
default:
Expand Down Expand Up @@ -560,6 +565,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniS1, _}, {{Sgpr32Trunc}, {None}, UniCstExt}});
addRulesForGOpcs({G_FREEZE}).Any({{DivS1}, {{Vcc}, {Vcc}}});

addRulesForGOpcs({G_UNMERGE_VALUES})
.Any({{UniS16}, {{}, {}, UnmergeToShiftTrunc}})
.Any({{DivS16}, {{}, {}, ApplyAllVgpr}})
.Any({{UniBRC}, {{}, {}, VerifyAllSgpr}})
.Any({{DivBRC}, {{}, {}, ApplyAllVgpr}});
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need tests for DivS16 and DivBRC

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are many in existing files, just moved this to AMDGPURegBankLegalizeRules.cpp, the sgpr s16 is changed. Also sgpr s16 appears in some tests and it was fine to just apply sgpr to all operands since it was combined away. I added test for the case when sgpr s16 g_unmerge was not combined away and actually meant to be inst selected but unlike any of the existing tests, tests in unmerge-sgpr-s16.ll are the cases that crash the compiler.


addRulesForGOpcs({G_ICMP})
.Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
.Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
Expand Down
9 changes: 8 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,13 +113,15 @@ enum UniformityLLTOpPredicateID {
UniB128,
UniB256,
UniB512,
UniBRC,

DivB32,
DivB64,
DivB96,
DivB128,
DivB256,
DivB512,
DivBRC
};

// How to apply register bank on register operand.
Expand Down Expand Up @@ -156,6 +158,7 @@ enum RegBankLLTMappingApplyID {
SgprB128,
SgprB256,
SgprB512,
SgprBRC,

// vgpr scalars, pointers, vectors and B-types
Vgpr16,
Expand All @@ -178,6 +181,7 @@ enum RegBankLLTMappingApplyID {
VgprB128,
VgprB256,
VgprB512,
VgprBRC,
VgprV4S32,

// Dst only modifiers: read-any-lane and truncs
Expand Down Expand Up @@ -233,7 +237,10 @@ enum LoweringMethodID {
SplitLoad,
WidenLoad,
WidenMMOToS32,
UnpackAExt
UnpackAExt,
VerifyAllSgpr,
ApplyAllVgpr,
UnmergeToShiftTrunc
};

enum FastRulesTypes {
Expand Down
40 changes: 13 additions & 27 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fpext.ll
Original file line number Diff line number Diff line change
Expand Up @@ -137,34 +137,20 @@ define amdgpu_ps void @fpext_f16_to_f64_div(half %a, ptr addrspace(1) %ptr) {
}

define amdgpu_ps <2 x float> @fpext_v2f16_to_v2f32_uniform(<2 x half> inreg %a) {
; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f32_uniform:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s0, 16
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, s1
; GFX11-FAKE16-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f32_uniform:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, s0
Copy link
Collaborator Author

@petar-avramovic petar-avramovic Dec 12, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was a bug.
There were _lo and _hi subreg copies, but they got lost at some point since input was a copy from physical sgpr. After that subreg copies looked like 32 bit sgpr copy (there were two but looked the same, so the result of the first was copied to result of second v_mov_b32_e32 v1, v0).
Maybe true 16 is missing assert somewhere?
Also for context, s16 unmerge now only appears on targets with true 16, see isRegisterClassType in AMDGPULegalizerInfo.cpp

; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX12-FAKE16-LABEL: fpext_v2f16_to_v2f32_uniform:
; GFX12-FAKE16: ; %bb.0:
; GFX12-FAKE16-NEXT: s_cvt_f32_f16 s1, s0
; GFX12-FAKE16-NEXT: s_cvt_hi_f32_f16 s0, s0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
; GFX12-FAKE16-NEXT: ; return to shader part epilog
; GFX11-LABEL: fpext_v2f16_to_v2f32_uniform:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshr_b32 s1, s0, 16
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s1
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-TRUE16-LABEL: fpext_v2f16_to_v2f32_uniform:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_cvt_f32_f16 s0, s0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s0
; GFX12-TRUE16-NEXT: ; return to shader part epilog
; GFX12-LABEL: fpext_v2f16_to_v2f32_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_cvt_f32_f16 s1, s0
; GFX12-NEXT: s_cvt_hi_f32_f16 s0, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX12-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
; GFX12-NEXT: ; return to shader part epilog
%result = fpext <2 x half> %a to <2 x float>
ret <2 x float> %result
}
Expand Down
36 changes: 36 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/unmerge-sgpr-s16.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX11 %s
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For testing purpose of the crash, try -global-isel or
if you wan to try out -global-isel -new-reg-bank-select replace
.Any({{UniS16}, {{}, {}, UnmergeToShiftTrunc}}) with
.Any({{UniS16}, {{}, {}, VerifyAllSgpr}})
need a true16 target because of legalizer


define amdgpu_ps void @unmerge_sgprS16_from_V2S16(ptr addrspace(1) inreg %ptr, ptr addrspace(1) inreg %out) {
; GFX11-LABEL: unmerge_sgprS16_from_V2S16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_endpgm
%load = load <2 x i16>, ptr addrspace(1) %ptr
%shuffle = shufflevector <2 x i16> %load, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
store <2 x i16> %shuffle, ptr addrspace(1) %out
ret void
}

define amdgpu_ps void @unmerge_sgprS16_from_V4S16(ptr addrspace(1) inreg %ptr, ptr addrspace(1) inreg %out) {
; GFX11-LABEL: unmerge_sgprS16_from_V4S16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_lh_b32_b16 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_endpgm
%load = load <4 x i16>, ptr addrspace(1) %ptr
%shuffle = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 3>
store <2 x i16> %shuffle, ptr addrspace(1) %out
ret void
}