Skip to content

Commit 5415817

Browse files
committed
AArch64: move memset optimization to AArch64 lowering
Signed-off-by: Osama Abdelkader <[email protected]>
1 parent c9b595d commit 5415817

File tree

10 files changed: 287 additions and 125 deletions

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 6 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -8543,20 +8543,6 @@ static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
85438543
if (!IntVT.isInteger())
85448544
IntVT = EVT::getIntegerVT(*DAG.getContext(), IntVT.getSizeInBits());
85458545

8546-
// For repeated-byte patterns, generate a vector splat instead of MUL to
8547-
// enable efficient lowering to DUP on targets like AArch64.
8548-
// Only do this on AArch64 targets to avoid breaking other architectures.
8549-
const TargetMachine &TM = DAG.getTarget();
8550-
if (NumBits > 8 && VT.isInteger() && !VT.isVector() &&
8551-
(NumBits == 32 || NumBits == 64) &&
8552-
TM.getTargetTriple().getArch() == Triple::aarch64) {
8553-
// Generate a vector of bytes: v4i8 for i32, v8i8 for i64
8554-
EVT ByteVecTy = EVT::getVectorVT(*DAG.getContext(), MVT::i8, NumBits / 8);
8555-
SDValue VecSplat = DAG.getSplatBuildVector(ByteVecTy, dl, Value);
8556-
// Bitcast back to the target integer type
8557-
return DAG.getNode(ISD::BITCAST, dl, IntVT, VecSplat);
8558-
}
8559-
85608546
Value = DAG.getNode(ISD::ZERO_EXTEND, dl, IntVT, Value);
85618547
if (NumBits > 8) {
85628548
// Use a multiplication with 0x010101... to extend the input to the
@@ -9089,6 +9075,12 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
90899075
for (unsigned i = 0; i < NumMemOps; i++) {
90909076
EVT VT = MemOps[i];
90919077
unsigned VTSize = VT.getSizeInBits() / 8;
9078+
// Skip stores when Size is already 0. This can happen when an oversized
9079+
// store was added to MemOps but the actual memset size was already
9080+
// covered by previous stores (e.g., when using extraction from a larger
9081+
// vector splat).
9082+
if (Size == 0)
9083+
continue;
90929084
if (VTSize > Size) {
90939085
// Issuing an unaligned load / store pair that overlaps with the previous
90949086
// pair. Adjust the offset accordingly.

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 43 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -18344,7 +18344,8 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
1834418344

1834518345
// For non-zero memset, use NEON even for smaller sizes as dup + scalar store
1834618346
// is efficient
18347-
if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset)
18347+
if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset &&
18348+
AlignmentIsAcceptable(MVT::v16i8, Align(1)))
1834818349
return MVT::v16i8;
1834918350
if (CanUseFP && !IsSmallZeroMemset &&
1835018351
AlignmentIsAcceptable(MVT::f128, Align(16)))
@@ -18356,6 +18357,39 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
1835618357
return MVT::Other;
1835718358
}
1835818359

18360+
bool AArch64TargetLowering::findOptimalMemOpLowering(
18361+
LLVMContext &Context, std::vector<EVT> &MemOps, unsigned Limit,
18362+
const MemOp &Op, unsigned DstAS, unsigned SrcAS,
18363+
const AttributeList &FuncAttributes) const {
18364+
// For non-zero memset with v16i8, don't downgrade if we can extract
18365+
// the needed size efficiently using
18366+
// shallExtractConstSplatVectorElementToStore
18367+
EVT VT = getOptimalMemOpType(Context, Op, FuncAttributes);
18368+
if (VT == MVT::v16i8 && Op.isMemset() && !Op.isZeroMemset() &&
18369+
Op.size() < 16) {
18370+
// Check if we can extract the needed size
18371+
unsigned Index;
18372+
Type *VectorTy = VT.getTypeForEVT(Context);
18373+
if (shallExtractConstSplatVectorElementToStore(VectorTy, Op.size() * 8,
18374+
Index)) {
18375+
// To generate the vector splat (DUP), we need v16i8 to be the LargestVT.
18376+
// getMemsetStores requires oversized stores to be last with at least 2
18377+
// operations. We add the target size first (extracts from v16i8), then
18378+
// v16i8 last (satisfies assertion, and is LargestVT for splat
18379+
// generation). After the first store, Size becomes 0, so the oversized
18380+
// store is skipped by the early continue in getMemsetStores, avoiding
18381+
// redundant stores.
18382+
EVT TargetVT = (Op.size() >= 8) ? MVT::i64 : MVT::i32;
18383+
MemOps.push_back(TargetVT); // First: extract from v16i8
18384+
MemOps.push_back(VT); // Last: v16i8 (LargestVT, oversized)
18385+
return true;
18386+
}
18387+
}
18388+
// Otherwise, use the default implementation
18389+
return TargetLowering::findOptimalMemOpLowering(Context, MemOps, Limit, Op,
18390+
DstAS, SrcAS, FuncAttributes);
18391+
}
18392+
1835918393
LLT AArch64TargetLowering::getOptimalMemOpLLT(
1836018394
const MemOp &Op, const AttributeList &FuncAttributes) const {
1836118395
bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
@@ -18377,7 +18411,8 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
1837718411

1837818412
// For non-zero memset, use NEON for all sizes where it's beneficial.
1837918413
// NEON dup + scalar store works for any alignment and is efficient.
18380-
if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset)
18414+
if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset &&
18415+
AlignmentIsAcceptable(MVT::v16i8, Align(1)))
1838118416
return LLT::fixed_vector(2, 64);
1838218417
if (CanUseFP && !IsSmallZeroMemset &&
1838318418
AlignmentIsAcceptable(MVT::f128, Align(16)))
@@ -29715,16 +29750,13 @@ bool AArch64TargetLowering::shallExtractConstSplatVectorElementToStore(
2971529750
// This is useful for memset where we generate a v16i8 splat and need to store
2971629751
// a smaller scalar (e.g., i32 for a 4-byte memset).
2971729752
if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(VectorTy)) {
29718-
// Only handle v16i8 splat (128 bits total, 16 elements of 8 bits each)
29719-
if (VTy->getNumElements() == 16 && VTy->getElementType()->isIntegerTy(8)) {
29753+
// Handle v16i8 splat (128 bits total, 16 elements of 8 bits each) and
29754+
// v8i8 splat (64 bits total, 8 elements of 8 bits each)
29755+
if ((VTy->getNumElements() == 16 || VTy->getNumElements() == 8) &&
29756+
VTy->getElementType()->isIntegerTy(8)) {
2972029757
// Check if we're extracting a 32-bit or 64-bit element
29721-
if (ElemSizeInBits == 32) {
29722-
// Extract element 0 of the 128-bit vector as a 32-bit scalar
29723-
Index = 0;
29724-
return true;
29725-
}
29726-
if (ElemSizeInBits == 64) {
29727-
// Extract elements 0-7 as a 64-bit scalar
29758+
if (ElemSizeInBits == 32 || ElemSizeInBits == 64) {
29759+
// Extract lane 0 of the vector (viewed as 32- or 64-bit lanes) as a scalar
2972829760
Index = 0;
2972929761
return true;
2973029762
}

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -261,6 +261,12 @@ class AArch64TargetLowering : public TargetLowering {
261261
LLT getOptimalMemOpLLT(const MemOp &Op,
262262
const AttributeList &FuncAttributes) const override;
263263

264+
bool
265+
findOptimalMemOpLowering(LLVMContext &Context, std::vector<EVT> &MemOps,
266+
unsigned Limit, const MemOp &Op, unsigned DstAS,
267+
unsigned SrcAS,
268+
const AttributeList &FuncAttributes) const override;
269+
264270
/// Return true if the addressing mode represented by AM is legal for this
265271
/// target, for a load/store of the specified type.
266272
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,

llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir

Lines changed: 8 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -98,10 +98,8 @@ body: |
9898
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s8)
9999
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673
100100
; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
101-
; CHECK-NEXT: G_STORE [[MUL]](s64), [[COPY]](p0) :: (store (s64) into %ir.dst, align 1)
102-
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
103-
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
104-
; CHECK-NEXT: G_STORE [[MUL]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.dst + 8, align 1)
101+
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MUL]](s64), [[MUL]](s64)
102+
; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[COPY]](p0) :: (store (<2 x s64>) into %ir.dst, align 1)
105103
; CHECK-NEXT: RET_ReallyLR
106104
%0:_(p0) = COPY $x0
107105
%1:_(s32) = COPY $w1
@@ -158,10 +156,8 @@ body: |
158156
; CHECK-NEXT: {{ $}}
159157
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
160158
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4629771061636907072
161-
; CHECK-NEXT: G_STORE [[C]](s64), [[COPY]](p0) :: (store (s64) into %ir.dst, align 1)
162-
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
163-
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
164-
; CHECK-NEXT: G_STORE [[C]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.dst + 8, align 1)
159+
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64)
160+
; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[COPY]](p0) :: (store (<2 x s64>) into %ir.dst, align 1)
165161
; CHECK-NEXT: RET_ReallyLR
166162
%0:_(p0) = COPY $x0
167163
%1:_(s8) = G_CONSTANT i8 64
@@ -220,10 +216,8 @@ body: |
220216
; CHECK-NEXT: {{ $}}
221217
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
222218
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4629771061636907072
223-
; CHECK-NEXT: G_STORE [[C]](s64), [[COPY]](p0) :: (store (s64) into %ir.dst, align 1)
224-
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
225-
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
226-
; CHECK-NEXT: G_STORE [[C]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.dst + 8, align 1)
219+
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64)
220+
; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[COPY]](p0) :: (store (<2 x s64>) into %ir.dst, align 1)
227221
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 16448
228222
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
229223
; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C3]](s64)
@@ -252,10 +246,8 @@ body: |
252246
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s8)
253247
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673
254248
; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
255-
; CHECK-NEXT: G_STORE [[MUL]](s64), [[COPY]](p0) :: (store (s64) into %ir.dst, align 1)
256-
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
257-
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
258-
; CHECK-NEXT: G_STORE [[MUL]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.dst + 8, align 1)
249+
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MUL]](s64), [[MUL]](s64)
250+
; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[COPY]](p0) :: (store (<2 x s64>) into %ir.dst, align 1)
259251
; CHECK-NEXT: RET_ReallyLR
260252
%0:_(p0) = COPY $x0
261253
%1:_(s32) = COPY $w1

llvm/test/CodeGen/AArch64/aarch64-mops.ll

Lines changed: 38 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -391,45 +391,39 @@ entry:
391391
define void @memset_10(ptr %dst, i32 %value) {
392392
; GISel-WITHOUT-MOPS-O0-LABEL: memset_10:
393393
; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
394-
; GISel-WITHOUT-MOPS-O0-NEXT: // implicit-def: $x8
395-
; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, w1
396-
; GISel-WITHOUT-MOPS-O0-NEXT: and x8, x8, #0xff
397-
; GISel-WITHOUT-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101
398-
; GISel-WITHOUT-MOPS-O0-NEXT: mul x8, x8, x9
399-
; GISel-WITHOUT-MOPS-O0-NEXT: str x8, [x0]
400-
; GISel-WITHOUT-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8
394+
; GISel-WITHOUT-MOPS-O0-NEXT: and w8, w1, #0xff
395+
; GISel-WITHOUT-MOPS-O0-NEXT: mov w9, #16843009 // =0x1010101
396+
; GISel-WITHOUT-MOPS-O0-NEXT: mul w8, w8, w9
397+
; GISel-WITHOUT-MOPS-O0-NEXT: str w8, [x0]
398+
; GISel-WITHOUT-MOPS-O0-NEXT: str w8, [x0, #4]
401399
; GISel-WITHOUT-MOPS-O0-NEXT: strh w8, [x0, #8]
402400
; GISel-WITHOUT-MOPS-O0-NEXT: ret
403401
;
404402
; GISel-WITHOUT-MOPS-O3-LABEL: memset_10:
405403
; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
406-
; GISel-WITHOUT-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1
407-
; GISel-WITHOUT-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101
408-
; GISel-WITHOUT-MOPS-O3-NEXT: and x9, x1, #0xff
409-
; GISel-WITHOUT-MOPS-O3-NEXT: mul x8, x9, x8
410-
; GISel-WITHOUT-MOPS-O3-NEXT: str x8, [x0]
404+
; GISel-WITHOUT-MOPS-O3-NEXT: mov w8, #16843009 // =0x1010101
405+
; GISel-WITHOUT-MOPS-O3-NEXT: and w9, w1, #0xff
406+
; GISel-WITHOUT-MOPS-O3-NEXT: mul w8, w9, w8
407+
; GISel-WITHOUT-MOPS-O3-NEXT: stp w8, w8, [x0]
411408
; GISel-WITHOUT-MOPS-O3-NEXT: strh w8, [x0, #8]
412409
; GISel-WITHOUT-MOPS-O3-NEXT: ret
413410
;
414411
; GISel-MOPS-O0-LABEL: memset_10:
415412
; GISel-MOPS-O0: // %bb.0: // %entry
416-
; GISel-MOPS-O0-NEXT: // implicit-def: $x8
417-
; GISel-MOPS-O0-NEXT: mov w8, w1
418-
; GISel-MOPS-O0-NEXT: and x8, x8, #0xff
419-
; GISel-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101
420-
; GISel-MOPS-O0-NEXT: mul x8, x8, x9
421-
; GISel-MOPS-O0-NEXT: str x8, [x0]
422-
; GISel-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8
413+
; GISel-MOPS-O0-NEXT: and w8, w1, #0xff
414+
; GISel-MOPS-O0-NEXT: mov w9, #16843009 // =0x1010101
415+
; GISel-MOPS-O0-NEXT: mul w8, w8, w9
416+
; GISel-MOPS-O0-NEXT: str w8, [x0]
417+
; GISel-MOPS-O0-NEXT: str w8, [x0, #4]
423418
; GISel-MOPS-O0-NEXT: strh w8, [x0, #8]
424419
; GISel-MOPS-O0-NEXT: ret
425420
;
426421
; GISel-MOPS-O3-LABEL: memset_10:
427422
; GISel-MOPS-O3: // %bb.0: // %entry
428-
; GISel-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1
429-
; GISel-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101
430-
; GISel-MOPS-O3-NEXT: and x9, x1, #0xff
431-
; GISel-MOPS-O3-NEXT: mul x8, x9, x8
432-
; GISel-MOPS-O3-NEXT: str x8, [x0]
423+
; GISel-MOPS-O3-NEXT: mov w8, #16843009 // =0x1010101
424+
; GISel-MOPS-O3-NEXT: and w9, w1, #0xff
425+
; GISel-MOPS-O3-NEXT: mul w8, w9, w8
426+
; GISel-MOPS-O3-NEXT: stp w8, w8, [x0]
433427
; GISel-MOPS-O3-NEXT: strh w8, [x0, #8]
434428
; GISel-MOPS-O3-NEXT: ret
435429
;
@@ -461,45 +455,41 @@ entry:
461455
define void @memset_10_volatile(ptr %dst, i32 %value) {
462456
; GISel-WITHOUT-MOPS-O0-LABEL: memset_10_volatile:
463457
; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
464-
; GISel-WITHOUT-MOPS-O0-NEXT: // implicit-def: $x8
465-
; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, w1
466-
; GISel-WITHOUT-MOPS-O0-NEXT: and x8, x8, #0xff
467-
; GISel-WITHOUT-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101
468-
; GISel-WITHOUT-MOPS-O0-NEXT: mul x8, x8, x9
469-
; GISel-WITHOUT-MOPS-O0-NEXT: str x8, [x0]
470-
; GISel-WITHOUT-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8
458+
; GISel-WITHOUT-MOPS-O0-NEXT: and w8, w1, #0xff
459+
; GISel-WITHOUT-MOPS-O0-NEXT: mov w9, #16843009 // =0x1010101
460+
; GISel-WITHOUT-MOPS-O0-NEXT: mul w8, w8, w9
461+
; GISel-WITHOUT-MOPS-O0-NEXT: str w8, [x0]
462+
; GISel-WITHOUT-MOPS-O0-NEXT: str w8, [x0, #4]
471463
; GISel-WITHOUT-MOPS-O0-NEXT: strh w8, [x0, #8]
472464
; GISel-WITHOUT-MOPS-O0-NEXT: ret
473465
;
474466
; GISel-WITHOUT-MOPS-O3-LABEL: memset_10_volatile:
475467
; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
476-
; GISel-WITHOUT-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1
477-
; GISel-WITHOUT-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101
478-
; GISel-WITHOUT-MOPS-O3-NEXT: and x9, x1, #0xff
479-
; GISel-WITHOUT-MOPS-O3-NEXT: mul x8, x9, x8
480-
; GISel-WITHOUT-MOPS-O3-NEXT: str x8, [x0]
468+
; GISel-WITHOUT-MOPS-O3-NEXT: mov w8, #16843009 // =0x1010101
469+
; GISel-WITHOUT-MOPS-O3-NEXT: and w9, w1, #0xff
470+
; GISel-WITHOUT-MOPS-O3-NEXT: mul w8, w9, w8
471+
; GISel-WITHOUT-MOPS-O3-NEXT: str w8, [x0]
472+
; GISel-WITHOUT-MOPS-O3-NEXT: str w8, [x0, #4]
481473
; GISel-WITHOUT-MOPS-O3-NEXT: strh w8, [x0, #8]
482474
; GISel-WITHOUT-MOPS-O3-NEXT: ret
483475
;
484476
; GISel-MOPS-O0-LABEL: memset_10_volatile:
485477
; GISel-MOPS-O0: // %bb.0: // %entry
486-
; GISel-MOPS-O0-NEXT: // implicit-def: $x8
487-
; GISel-MOPS-O0-NEXT: mov w8, w1
488-
; GISel-MOPS-O0-NEXT: and x8, x8, #0xff
489-
; GISel-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101
490-
; GISel-MOPS-O0-NEXT: mul x8, x8, x9
491-
; GISel-MOPS-O0-NEXT: str x8, [x0]
492-
; GISel-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8
478+
; GISel-MOPS-O0-NEXT: and w8, w1, #0xff
479+
; GISel-MOPS-O0-NEXT: mov w9, #16843009 // =0x1010101
480+
; GISel-MOPS-O0-NEXT: mul w8, w8, w9
481+
; GISel-MOPS-O0-NEXT: str w8, [x0]
482+
; GISel-MOPS-O0-NEXT: str w8, [x0, #4]
493483
; GISel-MOPS-O0-NEXT: strh w8, [x0, #8]
494484
; GISel-MOPS-O0-NEXT: ret
495485
;
496486
; GISel-MOPS-O3-LABEL: memset_10_volatile:
497487
; GISel-MOPS-O3: // %bb.0: // %entry
498-
; GISel-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1
499-
; GISel-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101
500-
; GISel-MOPS-O3-NEXT: and x9, x1, #0xff
501-
; GISel-MOPS-O3-NEXT: mul x8, x9, x8
502-
; GISel-MOPS-O3-NEXT: str x8, [x0]
488+
; GISel-MOPS-O3-NEXT: mov w8, #16843009 // =0x1010101
489+
; GISel-MOPS-O3-NEXT: and w9, w1, #0xff
490+
; GISel-MOPS-O3-NEXT: mul w8, w9, w8
491+
; GISel-MOPS-O3-NEXT: str w8, [x0]
492+
; GISel-MOPS-O3-NEXT: str w8, [x0, #4]
503493
; GISel-MOPS-O3-NEXT: strh w8, [x0, #8]
504494
; GISel-MOPS-O3-NEXT: ret
505495
;

llvm/test/CodeGen/AArch64/arm64-memset-inline.ll

Lines changed: 13 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -331,7 +331,7 @@ define void @memset_8_stack() {
331331
; CHECK-NEXT: .cfi_def_cfa_offset 16
332332
; CHECK-NEXT: .cfi_offset w30, -16
333333
; CHECK-NEXT: mov x8, #-6148914691236517206
334-
; CHECK-NEXT: stp x30, x8, [sp, #-16]! // 8-byte Folded Spill
334+
; CHECK-NEXT: stp x30, x8, [sp, #-16]!
335335
; CHECK-NEXT: add x0, sp, #8
336336
; CHECK-NEXT: bl something
337337
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -367,12 +367,12 @@ define void @memset_16_stack() {
367367
; CHECK-LABEL: memset_16_stack:
368368
; CHECK: // %bb.0:
369369
; CHECK-NEXT: sub sp, sp, #32
370+
; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
370371
; CHECK-NEXT: .cfi_def_cfa_offset 32
371372
; CHECK-NEXT: .cfi_offset w30, -16
372-
; CHECK-NEXT: mov x8, #-6148914691236517206
373+
; CHECK-NEXT: movi v0.16b, #170
373374
; CHECK-NEXT: mov x0, sp
374-
; CHECK-NEXT: stp x8, x30, [sp, #8] // 8-byte Folded Spill
375-
; CHECK-NEXT: str x8, [sp]
375+
; CHECK-NEXT: str q0, [sp]
376376
; CHECK-NEXT: bl something
377377
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
378378
; CHECK-NEXT: add sp, sp, #32
@@ -390,10 +390,10 @@ define void @memset_20_stack() {
390390
; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
391391
; CHECK-NEXT: .cfi_def_cfa_offset 48
392392
; CHECK-NEXT: .cfi_offset w30, -16
393-
; CHECK-NEXT: mov x8, #-6148914691236517206
394-
; CHECK-NEXT: add x0, sp, #8
395-
; CHECK-NEXT: stp x8, x8, [sp, #8]
396-
; CHECK-NEXT: str w8, [sp, #24]
393+
; CHECK-NEXT: movi v0.16b, #170
394+
; CHECK-NEXT: mov x0, sp
395+
; CHECK-NEXT: str q0, [sp]
396+
; CHECK-NEXT: str s0, [sp, #16]
397397
; CHECK-NEXT: bl something
398398
; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
399399
; CHECK-NEXT: add sp, sp, #48
@@ -411,11 +411,10 @@ define void @memset_26_stack() {
411411
; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
412412
; CHECK-NEXT: .cfi_def_cfa_offset 48
413413
; CHECK-NEXT: .cfi_offset w30, -16
414-
; CHECK-NEXT: mov x8, #-6148914691236517206
414+
; CHECK-NEXT: movi v0.16b, #170
415415
; CHECK-NEXT: mov x0, sp
416-
; CHECK-NEXT: stp x8, x8, [sp, #8]
417-
; CHECK-NEXT: str x8, [sp]
418-
; CHECK-NEXT: strh w8, [sp, #24]
416+
; CHECK-NEXT: stur q0, [sp, #10]
417+
; CHECK-NEXT: str q0, [sp]
419418
; CHECK-NEXT: bl something
420419
; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
421420
; CHECK-NEXT: add sp, sp, #48
@@ -454,10 +453,9 @@ define void @memset_40_stack() {
454453
; CHECK-NEXT: .cfi_def_cfa_offset 64
455454
; CHECK-NEXT: .cfi_offset w30, -16
456455
; CHECK-NEXT: movi v0.16b, #170
457-
; CHECK-NEXT: mov x8, #-6148914691236517206
458456
; CHECK-NEXT: mov x0, sp
459-
; CHECK-NEXT: str x8, [sp, #32]
460457
; CHECK-NEXT: stp q0, q0, [sp]
458+
; CHECK-NEXT: str d0, [sp, #32]
461459
; CHECK-NEXT: bl something
462460
; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
463461
; CHECK-NEXT: add sp, sp, #64
@@ -497,11 +495,10 @@ define void @memset_72_stack() {
497495
; CHECK-NEXT: .cfi_def_cfa_offset 96
498496
; CHECK-NEXT: .cfi_offset w30, -16
499497
; CHECK-NEXT: movi v0.16b, #170
500-
; CHECK-NEXT: mov x8, #-6148914691236517206
501498
; CHECK-NEXT: mov x0, sp
502-
; CHECK-NEXT: str x8, [sp, #64]
503499
; CHECK-NEXT: stp q0, q0, [sp]
504500
; CHECK-NEXT: stp q0, q0, [sp, #32]
501+
; CHECK-NEXT: str d0, [sp, #64]
505502
; CHECK-NEXT: bl something
506503
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
507504
; CHECK-NEXT: add sp, sp, #96

0 commit comments

Comments
 (0)