Skip to content

Commit 6088a57

Browse files
committed
[VPlan] Re-enable narrowing masked interleave groups & mask stores
When isConsecutiveInterleaveGroup was first added in c73ad7b, there was no restriction on interleave groups that were masked. 8d29d09 then moved the transform earlier, but this supposedly caused some crashes related to masking. aca53f4 then restricted the transform to only non-masked interleave groups to fix said crashes, but after other issues were reported 8d29d09 was ultimately reverted in bfc322d. It looks like the mask restriction added in aca53f4 was left behind though, so this patch does two things: 1. It removes the restriction under the assumption that it was only a temporary workaround for 8d29d09 which is no longer in tree 2. Separately it fixes an issue where we didn't apply the mask on the new widened store (but we were applying it on the widened load) I gave this a quick test on armv9-a and rva23u64 on llvm-test-suite/SPEC CPU 2017 and didn't notice any crashes.
1 parent 5e4f177 commit 6088a57

File tree

2 files changed

+7
-79
lines changed

2 files changed

+7
-79
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4168,7 +4168,7 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
41684168
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
41694169
unsigned VF, VPTypeAnalysis &TypeInfo,
41704170
unsigned VectorRegWidth) {
4171-
if (!InterleaveR || InterleaveR->getMask())
4171+
if (!InterleaveR)
41724172
return false;
41734173

41744174
Type *GroupElementTy = nullptr;
@@ -4364,7 +4364,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
43644364
auto *SI =
43654365
cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
43664366
auto *S = new VPWidenStoreRecipe(
4367-
*SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4367+
*SI, StoreGroup->getAddr(), Res, StoreGroup->getMask(),
4368+
/*Consecutive=*/true,
43684369
/*Reverse=*/false, {}, StoreGroup->getDebugLoc());
43694370
S->insertBefore(StoreGroup);
43704371
StoreGroup->eraseFromParent();

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll

Lines changed: 4 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -105,71 +105,6 @@ exit:
105105
}
106106

107107
define void @test_masked_interleave_group(i32 %N, ptr %mask, ptr %src, ptr %dst) {
108-
; IC1-LABEL: define void @test_masked_interleave_group(
109-
; IC1-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
110-
; IC1-NEXT: [[ENTRY:.*:]]
111-
; IC1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
112-
; IC1-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
113-
; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
114-
; IC1-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
115-
; IC1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8)
116-
; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]]
117-
; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
118-
; IC1: [[VECTOR_MEMCHECK]]:
119-
; IC1-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64
120-
; IC1-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
121-
; IC1-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 16
122-
; IC1-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
123-
; IC1-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1
124-
; IC1-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[TMP7]]
125-
; IC1-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]]
126-
; IC1-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
127-
; IC1-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[MASK]], [[SCEVGEP]]
128-
; IC1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
129-
; IC1-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
130-
; IC1-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
131-
; IC1-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
132-
; IC1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
133-
; IC1-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
134-
; IC1: [[VECTOR_PH]]:
135-
; IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
136-
; IC1-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
137-
; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]]
138-
; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
139-
; IC1-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32
140-
; IC1-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16
141-
; IC1-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
142-
; IC1-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16
143-
; IC1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
144-
; IC1-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]]
145-
; IC1-NEXT: br label %[[VECTOR_BODY:.*]]
146-
; IC1: [[VECTOR_BODY]]:
147-
; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
148-
; IC1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
149-
; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
150-
; IC1-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16
151-
; IC1-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]]
152-
; IC1-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]]
153-
; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]]
154-
; IC1-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD]], zeroinitializer
155-
; IC1-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
156-
; IC1-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], <vscale x 16 x i1> [[INTERLEAVED_MASK]], <vscale x 16 x float> poison), !alias.scope [[META9:![0-9]+]]
157-
; IC1-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> [[WIDE_MASKED_VEC]])
158-
; IC1-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
159-
; IC1-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
160-
; IC1-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 2
161-
; IC1-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 3
162-
; IC1-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP18]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x float> [[TMP20]])
163-
; IC1-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
164-
; IC1-NEXT: call void @llvm.masked.store.nxv16f32.p0(<vscale x 16 x float> [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], <vscale x 16 x i1> [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
165-
; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
166-
; IC1-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
167-
; IC1-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
168-
; IC1: [[MIDDLE_BLOCK]]:
169-
; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
170-
; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
171-
; IC1: [[SCALAR_PH]]:
172-
;
173108
; CHECK-LABEL: define void @test_masked_interleave_group(
174109
; CHECK-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
175110
; CHECK-NEXT: [[ENTRY:.*:]]
@@ -217,19 +152,11 @@ define void @test_masked_interleave_group(i32 %N, ptr %mask, ptr %src, ptr %dst)
217152
; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]]
218153
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]]
219154
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD]], zeroinitializer
220-
; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
221-
; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], <vscale x 16 x i1> [[INTERLEAVED_MASK]], <vscale x 16 x float> poison), !alias.scope [[META9:![0-9]+]]
222-
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> [[WIDE_MASKED_VEC]])
223-
; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
224-
; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
225-
; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 2
226-
; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 3
227-
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP18]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x float> [[TMP20]])
228-
; CHECK-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
229-
; CHECK-NEXT: call void @llvm.masked.store.nxv16f32.p0(<vscale x 16 x float> [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], <vscale x 16 x i1> [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
230-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
155+
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[NEXT_GEP7]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x float> poison)
156+
; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[WIDE_MASKED_LOAD]], ptr align 4 [[NEXT_GEP]], <vscale x 4 x i1> [[TMP16]])
157+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
231158
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
232-
; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
159+
; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
233160
; CHECK: [[MIDDLE_BLOCK]]:
234161
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
235162
; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]

0 commit comments

Comments
 (0)