-
Notifications
You must be signed in to change notification settings - Fork 15.1k
AMDGPU: Really use AV classes by default for vector classes #166483
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/arsenm/amdgpu/use-av-classes-default-vector-class
Are you sure you want to change the base?
Conversation
|
Warning This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
|
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: AMDGPU: Really use AV classes by default for vector classes. Update getRegClassFor to use AV classes in place of VGPRs for gfx90a-gfx950. Patch is 1.61 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166483.diff 37 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 98fe923147ccc..7a9a3daa4033a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -18768,8 +18768,11 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
: &AMDGPU::SReg_32RegClass;
if (!TRI->isSGPRClass(RC) && !isDivergent)
return TRI->getEquivalentSGPRClass(RC);
- if (TRI->isSGPRClass(RC) && isDivergent)
+ if (TRI->isSGPRClass(RC) && isDivergent) {
+ if (Subtarget->hasGFX90AInsts())
+ return TRI->getEquivalentAVClass(RC);
return TRI->getEquivalentVGPRClass(RC);
+ }
return RC;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 3f52e8229ac08..bbae1c976ae1d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3638,6 +3638,14 @@ SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
return ARC;
}
+const TargetRegisterClass *
+SIRegisterInfo::getEquivalentAVClass(const TargetRegisterClass *SRC) const {
+ unsigned Size = getRegSizeInBits(*SRC);
+ const TargetRegisterClass *ARC = getVectorSuperClassForBitWidth(Size);
+ assert(ARC && "Invalid register class size");
+ return ARC;
+}
+
const TargetRegisterClass *
SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
unsigned Size = getRegSizeInBits(*VRC);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 6e119e5e7c194..e2fe991340494 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -289,6 +289,10 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
const TargetRegisterClass *
getEquivalentAGPRClass(const TargetRegisterClass *SRC) const;
+ /// \returns An AGPR+VGPR super reg class with the same width as \p SRC
+ const TargetRegisterClass *
+ getEquivalentAVClass(const TargetRegisterClass *SRC) const;
+
/// \returns A SGPR reg class with the same width as \p SRC
const TargetRegisterClass *
getEquivalentSGPRClass(const TargetRegisterClass *VRC) const;
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
index ae83766cd6a4a..196958b74442f 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -644,10 +644,10 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def a[0:1]
+; GFX90A-NEXT: ; def a[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -659,7 +659,7 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr2_agpr3
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -672,8 +672,8 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword a2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword a3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB11_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
@@ -758,7 +758,7 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB12_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB12_4
@@ -768,8 +768,8 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword a0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword a1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB12_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
@@ -926,12 +926,12 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -939,23 +939,23 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-NEXT: s_cbranch_execz .LBB14_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB14_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB14_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB14_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
@@ -1016,12 +1016,12 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -1029,23 +1029,23 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 {
; GFX90A-NEXT: s_cbranch_execz .LBB15_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB15_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB15_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB15_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
@@ -1220,7 +1220,7 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB17_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB17_4
@@ -1230,8 +1230,8 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword a0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword a1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB17_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
@@ -1294,12 +1294,12 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -1307,23 +1307,23 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 {
; GFX90A-NEXT: s_cbranch_execz .LBB18_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB18_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB18_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
@@ -1384,12 +1384,10 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_3
@@ -1406,14 +1404,14 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB19_2
; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword a1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword a0, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1559,12 +1557,12 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1589,12 +1587,12 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB21_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -1611,24 +1609,24 @@ define void @flat_atomic_xor_expansion_i32_ret_a_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_a_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v2, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1641,23 +1639,23 @@ define void @flat_atomic_xor_expansion_i32_ret_a_v(ptr %ptr) #0 {
; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_a_v:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1]
+; GFX950-NEXT: flat_load_dword v3, v[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3
+; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB22_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1694,12 +1692,12 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1723,12 +1721,12 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB23_1
; GFX950-NEXT: ...
[truncated]
|
87ec603 to
1ed3130
Compare
3247d62 to
22f336c
Compare
Update getRegClassFor to use AV classes in place of VGPRs for gfx90a-gfx950. There are a handful of regressions. Most of them enable unprofitable rematerialization, which reduces the register count by 1 but adds an unnecessary instruction.
1ed3130 to
dd5267f
Compare
22f336c to
02f6ebf
Compare

AMDGPU: Really use AV classes by default for vector classes
Update getRegClassFor to use AV classes in place of VGPRs for
gfx90a-gfx950. There are a handful of regressions. Most of them
enable unprofitable rematerialization, which reduces the register
count by 1 but adds an unnecessary instruction.