Skip to content

Commit 8aa82ef

Browse files
authored
[AMDGPU][SIInsertWaitcnts] Wait on all LDS DMA operations when no aliasing store is found (#170660)
Previously, we would miss inserting a wait when the ds_read had AA info but did not alias any tracked LDS DMA op, for example when the LDS DMA op it actually aliases with was not tracked because the number of LDS DMA ops exceeded the tracking limit.
1 parent 7a59ab0 commit 8aa82ef

File tree

2 files changed

+19
-5
lines changed

2 files changed

+19
-5
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1082,13 +1082,17 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
10821082
}
10831083
}
10841084
}
1085-
if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1)
1085+
if (Slot)
10861086
break;
1087+
// The slot may not be valid because it can be >= NUM_LDS_VGPRS which
1088+
// means the scoreboard cannot track it. We still want to preserve the
1089+
// MI in order to check alias information, though.
10871090
LDSDMAStores.push_back(&Inst);
10881091
Slot = LDSDMAStores.size();
10891092
break;
10901093
}
1091-
setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore);
1094+
if (Slot < NUM_LDS_VGPRS)
1095+
setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore);
10921096
if (Slot)
10931097
setRegScore(FIRST_LDS_VGPR, T, CurrScore);
10941098
}
@@ -2006,15 +2010,23 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
20062010
if (Ptr && Memop->getAAInfo()) {
20072011
const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
20082012
for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2009-
if (MI.mayAlias(AA, *LDSDMAStores[I], true))
2013+
if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2014+
if ((I + 1) >= NUM_LDS_VGPRS) {
2015+
// We didn't have enough slots to track this LDS DMA store; it
2016+
// has been tracked using the common RegNo (FIRST_LDS_VGPR).
2017+
ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
2018+
break;
2019+
}
2020+
20102021
ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
2022+
}
20112023
}
20122024
} else {
20132025
ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
20142026
}
2015-
if (Memop->isStore()) {
2027+
2028+
if (Memop->isStore())
20162029
ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
2017-
}
20182030
}
20192031

20202032
// Loop over use and def operands.

llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32
223223
; GFX9-NEXT: s_waitcnt vmcnt(2)
224224
; GFX9-NEXT: ds_read_b32 v7, v9 offset:1792
225225
; GFX9-NEXT: ; wave barrier
226+
; GFX9-NEXT: s_waitcnt vmcnt(0)
226227
; GFX9-NEXT: ds_read_b32 v8, v9 offset:2048
227228
; GFX9-NEXT: ; wave barrier
228229
; GFX9-NEXT: ds_read_b32 v9, v9 offset:2304
@@ -288,6 +289,7 @@ define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32
288289
; GFX10-NEXT: s_waitcnt vmcnt(2)
289290
; GFX10-NEXT: ds_read_b32 v7, v9 offset:1792
290291
; GFX10-NEXT: ; wave barrier
292+
; GFX10-NEXT: s_waitcnt vmcnt(0)
291293
; GFX10-NEXT: ds_read_b32 v8, v9 offset:2048
292294
; GFX10-NEXT: ; wave barrier
293295
; GFX10-NEXT: ds_read_b32 v9, v9 offset:2304

0 commit comments

Comments
 (0)