Skip to content

Commit 9c1ccff

Browse files
committed
AMDGPU: Relax shouldCoalesce to allow more register tuple widening
Allow widening up to 128-bit registers, or whenever the new register class is no larger than one of the existing register classes. The previous restriction was artificially limiting. In particular, it was doing the wrong thing with sequences involving copies between VGPRs and AV registers. Nearly all test changes are improvements. The coalescer does not just widen registers out of nowhere. If it's trying to "widen" a register, it's generally packing a register into an existing register tuple, or in a situation where the constraints imply the wider class anyway. 067a110 addressed the allocation failure concern by rejecting coalescing if there are no available registers. The original change in a4e63ea didn't include a realistic testcase to judge if this is harmful for pressure. I would expect any issues from this to be garden-variety subreg handling issues. We could use more dynamic state information here if it really is an issue. I get the best results by removing this override completely. This is a smaller step for patch splitting purposes.
1 parent b7423af commit 9c1ccff

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+8259
-9418
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3741,18 +3741,11 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
37413741
unsigned DstSubReg,
37423742
const TargetRegisterClass *NewRC,
37433743
LiveIntervals &LIS) const {
3744-
unsigned SrcSize = getRegSizeInBits(*SrcRC);
3745-
unsigned DstSize = getRegSizeInBits(*DstRC);
3744+
// TODO: This should be more aggressive, but be more cautious with very wide
3745+
// tuples.
37463746
unsigned NewSize = getRegSizeInBits(*NewRC);
3747-
3748-
// Do not increase size of registers beyond dword, we would need to allocate
3749-
// adjacent registers and constraint regalloc more than needed.
3750-
3751-
// Always allow dword coalescing.
3752-
if (SrcSize <= 32 || DstSize <= 32)
3753-
return true;
3754-
3755-
return NewSize <= DstSize || NewSize <= SrcSize;
3747+
return NewSize <= 128 || NewSize <= getRegSizeInBits(*SrcRC) ||
3748+
NewSize <= getRegSizeInBits(*DstRC);
37563749
}
37573750

37583751
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,

llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll

Lines changed: 86 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -8,37 +8,34 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
88
; GFX10-LABEL: v_mul_i64_no_zext:
99
; GFX10: ; %bb.0:
1010
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
11-
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0
11+
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
1212
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1313
; GFX10-NEXT: s_clause 0x1
14-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1]
15-
; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3]
14+
; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
15+
; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3]
1616
; GFX10-NEXT: s_waitcnt vmcnt(0)
17-
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0
18-
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
19-
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
20-
; GFX10-NEXT: v_mov_b32_e32 v5, v0
21-
; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3]
17+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v2, v4, 0
18+
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v2, v5, v[1:2]
19+
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v3, v4, v[1:2]
20+
; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
2221
; GFX10-NEXT: s_endpgm
2322
;
2423
; GFX11-LABEL: v_mul_i64_no_zext:
2524
; GFX11: ; %bb.0:
2625
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
2726
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2827
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
29-
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0
28+
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0
3029
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3130
; GFX11-NEXT: s_clause 0x1
32-
; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1]
33-
; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3]
31+
; GFX11-NEXT: global_load_b64 v[2:3], v8, s[0:1]
32+
; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3]
3433
; GFX11-NEXT: s_waitcnt vmcnt(0)
35-
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0
34+
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0
3635
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
37-
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6]
38-
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7]
39-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
40-
; GFX11-NEXT: v_mov_b32_e32 v5, v7
41-
; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3]
36+
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v2, v5, v[1:2]
37+
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v4, v[6:7]
38+
; GFX11-NEXT: global_store_b64 v8, v[0:1], s[2:3]
4239
; GFX11-NEXT: s_endpgm
4340
%tid = call i32 @llvm.amdgcn.workitem.id.x()
4441
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -58,18 +55,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
5855
; GFX10-NEXT: s_clause 0x1
5956
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
6057
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
61-
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
62-
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0
58+
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0
59+
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
6360
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
64-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
65-
; GFX10-NEXT: global_load_dword v4, v3, s[6:7]
61+
; GFX10-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3]
62+
; GFX10-NEXT: global_load_dword v4, v0, s[6:7]
6663
; GFX10-NEXT: s_waitcnt vmcnt(0)
67-
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v4, 0
68-
; GFX10-NEXT: v_mov_b32_e32 v0, v3
69-
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v4, v[0:1]
70-
; GFX10-NEXT: v_mov_b32_e32 v3, v0
71-
; GFX10-NEXT: v_mov_b32_e32 v0, 0
72-
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
64+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v4, 0
65+
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v3, v4, v[1:2]
66+
; GFX10-NEXT: v_mov_b32_e32 v2, 0
67+
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
7368
; GFX10-NEXT: s_endpgm
7469
;
7570
; GFX11-LABEL: v_mul_i64_zext_src1:
@@ -80,17 +75,17 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
8075
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
8176
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
8277
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
83-
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
78+
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
8479
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
85-
; GFX11-NEXT: global_load_b64 v[0:1], v1, s[2:3]
86-
; GFX11-NEXT: global_load_b32 v5, v2, s[4:5]
80+
; GFX11-NEXT: global_load_b64 v[2:3], v1, s[2:3]
81+
; GFX11-NEXT: global_load_b32 v5, v0, s[4:5]
8782
; GFX11-NEXT: s_waitcnt vmcnt(0)
88-
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0
83+
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v5, 0
8984
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
90-
; GFX11-NEXT: v_mov_b32_e32 v0, v3
91-
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1]
92-
; GFX11-NEXT: v_mov_b32_e32 v0, 0
93-
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
85+
; GFX11-NEXT: v_mov_b32_e32 v4, v1
86+
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v5, v[4:5]
87+
; GFX11-NEXT: v_mov_b32_e32 v2, 0
88+
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
9489
; GFX11-NEXT: s_endpgm
9590
%tid = call i32 @llvm.amdgcn.workitem.id.x()
9691
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -110,18 +105,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
110105
; GFX10-NEXT: s_clause 0x1
111106
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
112107
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
113-
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
114-
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
108+
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0
109+
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
115110
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
116-
; GFX10-NEXT: global_load_dword v4, v2, s[2:3]
117-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7]
111+
; GFX10-NEXT: global_load_dword v4, v1, s[2:3]
112+
; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7]
118113
; GFX10-NEXT: s_waitcnt vmcnt(0)
119-
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0
120-
; GFX10-NEXT: v_mov_b32_e32 v0, v3
121-
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
122-
; GFX10-NEXT: v_mov_b32_e32 v3, v0
123-
; GFX10-NEXT: v_mov_b32_e32 v0, 0
124-
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
114+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v2, 0
115+
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v3, v[1:2]
116+
; GFX10-NEXT: v_mov_b32_e32 v2, 0
117+
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
125118
; GFX10-NEXT: s_endpgm
126119
;
127120
; GFX11-LABEL: v_mul_i64_zext_src0:
@@ -135,14 +128,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
135128
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
136129
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
137130
; GFX11-NEXT: global_load_b32 v5, v1, s[2:3]
138-
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
131+
; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5]
139132
; GFX11-NEXT: s_waitcnt vmcnt(0)
140-
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
133+
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0
141134
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
142-
; GFX11-NEXT: v_mov_b32_e32 v0, v3
143-
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
144-
; GFX11-NEXT: v_mov_b32_e32 v0, 0
145-
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
135+
; GFX11-NEXT: v_mov_b32_e32 v4, v1
136+
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
137+
; GFX11-NEXT: v_mov_b32_e32 v2, 0
138+
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
146139
; GFX11-NEXT: s_endpgm
147140
%tid = call i32 @llvm.amdgcn.workitem.id.x()
148141
%gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
@@ -209,18 +202,16 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
209202
; GFX10-NEXT: s_clause 0x1
210203
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
211204
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
212-
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
205+
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
213206
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
214207
; GFX10-NEXT: s_clause 0x1
215-
; GFX10-NEXT: global_load_dword v4, v2, s[2:3]
216-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
208+
; GFX10-NEXT: global_load_dword v4, v0, s[2:3]
209+
; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7]
217210
; GFX10-NEXT: s_waitcnt vmcnt(0)
218-
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0
219-
; GFX10-NEXT: v_mov_b32_e32 v0, v3
220-
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
221-
; GFX10-NEXT: v_mov_b32_e32 v3, v0
222-
; GFX10-NEXT: v_mov_b32_e32 v0, 0
223-
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
211+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v2, 0
212+
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v3, v[1:2]
213+
; GFX10-NEXT: v_mov_b32_e32 v2, 0
214+
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
224215
; GFX10-NEXT: s_endpgm
225216
;
226217
; GFX11-LABEL: v_mul_i64_masked_src0_hi:
@@ -234,14 +225,14 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
234225
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
235226
; GFX11-NEXT: s_clause 0x1
236227
; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
237-
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
228+
; GFX11-NEXT: global_load_b64 v[2:3], v0, s[4:5]
238229
; GFX11-NEXT: s_waitcnt vmcnt(0)
239-
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
230+
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, 0
240231
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
241-
; GFX11-NEXT: v_mov_b32_e32 v0, v3
242-
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
243-
; GFX11-NEXT: v_mov_b32_e32 v0, 0
244-
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
232+
; GFX11-NEXT: v_mov_b32_e32 v4, v1
233+
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
234+
; GFX11-NEXT: v_mov_b32_e32 v2, 0
235+
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
245236
; GFX11-NEXT: s_endpgm
246237
%tid = call i32 @llvm.amdgcn.workitem.id.x()
247238
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -389,22 +380,20 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
389380
; GFX10-NEXT: s_clause 0x1
390381
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
391382
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
392-
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
383+
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
393384
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
394385
; GFX10-NEXT: s_clause 0x1
395-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
396-
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
386+
; GFX10-NEXT: global_load_dwordx2 v[1:2], v0, s[2:3]
387+
; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[6:7]
397388
; GFX10-NEXT: s_waitcnt vmcnt(1)
398-
; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0
389+
; GFX10-NEXT: v_and_b32_e32 v5, 0xfff00000, v1
399390
; GFX10-NEXT: s_waitcnt vmcnt(0)
400-
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v6, v2, 0
401-
; GFX10-NEXT: v_mov_b32_e32 v0, v5
402-
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v6, v3, v[0:1]
403-
; GFX10-NEXT: v_and_b32_e32 v0, 0xf00f, v1
404-
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v0, v2, v[5:6]
405-
; GFX10-NEXT: v_mov_b32_e32 v5, v0
406-
; GFX10-NEXT: v_mov_b32_e32 v0, 0
407-
; GFX10-NEXT: global_store_dwordx2 v0, v[4:5], s[0:1]
391+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v5, v3, 0
392+
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v5, v4, v[1:2]
393+
; GFX10-NEXT: v_and_b32_e32 v1, 0xf00f, v2
394+
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v1, v3, v[4:5]
395+
; GFX10-NEXT: v_mov_b32_e32 v2, 0
396+
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
408397
; GFX10-NEXT: s_endpgm
409398
;
410399
; GFX11-LABEL: v_mul_i64_partially_masked_src0:
@@ -414,24 +403,22 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
414403
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
415404
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
416405
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
417-
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
406+
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
418407
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
419408
; GFX11-NEXT: s_clause 0x1
420-
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
421-
; GFX11-NEXT: global_load_b64 v[2:3], v2, s[4:5]
409+
; GFX11-NEXT: global_load_b64 v[1:2], v0, s[2:3]
410+
; GFX11-NEXT: global_load_b64 v[3:4], v0, s[4:5]
422411
; GFX11-NEXT: s_waitcnt vmcnt(1)
423-
; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0
412+
; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v1
424413
; GFX11-NEXT: s_waitcnt vmcnt(0)
425414
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
426-
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, 0
427-
; GFX11-NEXT: v_mov_b32_e32 v0, v5
428-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
429-
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1]
430-
; GFX11-NEXT: v_and_b32_e32 v3, 0xf00f, v1
431-
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6]
415+
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v3, 0
416+
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v4, v[1:2]
417+
; GFX11-NEXT: v_and_b32_e32 v4, 0xf00f, v2
432418
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
433-
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0
434-
; GFX11-NEXT: global_store_b64 v0, v[4:5], s[0:1]
419+
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v4, v3, v[5:6]
420+
; GFX11-NEXT: v_mov_b32_e32 v2, 0
421+
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
435422
; GFX11-NEXT: s_endpgm
436423
%tid = call i32 @llvm.amdgcn.workitem.id.x()
437424
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -536,28 +523,28 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
536523
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
537524
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
538525
; GFX11-NEXT: s_clause 0x1
539-
; GFX11-NEXT: global_load_b64 v[2:3], v0, s[2:3]
540-
; GFX11-NEXT: global_load_b64 v[4:5], v0, s[4:5]
526+
; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3]
527+
; GFX11-NEXT: global_load_b64 v[5:6], v0, s[4:5]
541528
; GFX11-NEXT: s_mov_b32 s2, exec_lo
542529
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
543530
; GFX11-NEXT: s_waitcnt vmcnt(1)
544-
; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3]
531+
; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[3:4]
545532
; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2
546533
; GFX11-NEXT: s_cbranch_execz .LBB10_2
547534
; GFX11-NEXT: ; %bb.1: ; %else
548535
; GFX11-NEXT: s_waitcnt vmcnt(0)
549-
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0
536+
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v5, 0
550537
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
551-
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2]
552-
; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
553-
; GFX11-NEXT: v_mov_b32_e32 v1, v3
554-
; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
538+
; GFX11-NEXT: v_mov_b32_e32 v4, v1
539+
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v6, v[4:5]
540+
; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4
541+
; GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6
555542
; GFX11-NEXT: .LBB10_2: ; %Flow
556543
; GFX11-NEXT: s_and_not1_saveexec_b32 s2, s2
557544
; GFX11-NEXT: s_cbranch_execz .LBB10_4
558545
; GFX11-NEXT: ; %bb.3: ; %if
559546
; GFX11-NEXT: s_waitcnt vmcnt(0)
560-
; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5
547+
; GFX11-NEXT: v_mul_lo_u32 v1, v3, v6
561548
; GFX11-NEXT: v_mov_b32_e32 v0, 0
562549
; GFX11-NEXT: .LBB10_4: ; %endif
563550
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2

0 commit comments

Comments
 (0)