-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[X86] AVX512 optimised CTLZ/CTTZ implementations for i256/i512 scalars #164671
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
9f2ce78 to
5e2c7e7
Compare
Make use of AVX512 VPLZCNT/VPOPCNT to perform the big integer bit counts per vector element and then use VPCOMPRESS to extract the first non-zero element result
5e2c7e7 to
6ee5b3e
Compare
|
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) Changes: Make use of AVX512 VPLZCNT/VPOPCNT to perform the big integer bit counts per vector element and then use VPCOMPRESS to extract the first non-zero element result. Patch is 21.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/164671.diff 2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fa3dce256046f..484caa9191ab5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2654,6 +2654,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::AVGCEILU,
ISD::AVGFLOORS,
ISD::AVGFLOORU,
+ ISD::CTLZ,
+ ISD::CTTZ,
+ ISD::CTLZ_ZERO_UNDEF,
+ ISD::CTTZ_ZERO_UNDEF,
ISD::BITREVERSE,
ISD::ADD,
ISD::FADD,
@@ -55162,6 +55166,61 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
return combineFneg(N, DAG, DCI, Subtarget);
}
+// Fold i256/i512 CTLZ/CTTZ patterns to make use of AVX512
+// vXi64 CTLZ/CTTZ and VECTOR_COMPRESS.
+// Compute the CTLZ/CTTZ of each element, add the element's bit offset, compress
+// the result to remove all zero elements (passthru is set to scalar bitwidth if
+// all elements are zero) and extract the lowest compressed element.
+static SDValue combineCTZ(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ unsigned Opc = N->getOpcode();
+ unsigned SizeInBits = VT.getSizeInBits();
+ assert((Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF || Opc == ISD::CTTZ ||
+ Opc == ISD::CTTZ_ZERO_UNDEF) &&
+ "Unsupported bit count");
+
+ if (VT.isScalarInteger() && Subtarget.hasCDI() &&
+ ((SizeInBits == 512 && Subtarget.useAVX512Regs()) ||
+ (SizeInBits == 256 && Subtarget.hasVLX() &&
+ X86::mayFoldLoad(N0, Subtarget)))) {
+ MVT VecVT = MVT::getVectorVT(MVT::i64, SizeInBits / 64);
+ MVT BoolVT = VecVT.changeVectorElementType(MVT::i1);
+ SDValue Vec = DAG.getBitcast(VecVT, N0);
+ SDLoc DL(N);
+
+ SmallVector<int, 8> RevMask;
+ SmallVector<SDValue, 8> Offsets;
+ for (unsigned I = 0, E = VecVT.getVectorNumElements(); I != E; ++I) {
+ RevMask.push_back((int)((E - 1) - I));
+ Offsets.push_back(DAG.getConstant(I * 64, DL, MVT::i64));
+ }
+
+ // CTLZ - reverse the elements as we want the top non-zero element.
+ if (Opc == ISD::CTLZ)
+ Vec = DAG.getVectorShuffle(VecVT, DL, Vec, Vec, RevMask);
+
+ SDValue PassThrough = DAG.getUNDEF(VecVT);
+ if (Opc == ISD::CTLZ || Opc == ISD::CTTZ)
+ PassThrough = DAG.getConstant(SizeInBits, DL, VecVT);
+
+ SDValue IsNonZero = DAG.getSetCC(DL, BoolVT, Vec,
+ DAG.getConstant(0, DL, VecVT), ISD::SETNE);
+ SDValue Cnt = DAG.getNode(Opc, DL, VecVT, Vec);
+ Cnt = DAG.getNode(ISD::ADD, DL, VecVT, Cnt,
+ DAG.getBuildVector(VecVT, DL, Offsets));
+ Cnt = DAG.getNode(ISD::VECTOR_COMPRESS, DL, VecVT, Cnt, IsNonZero,
+ PassThrough);
+ Cnt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cnt,
+ DAG.getVectorIdxConstant(0, DL));
+ return DAG.getZExtOrTrunc(Cnt, DL, VT);
+ }
+
+ return SDValue();
+}
+
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -60885,6 +60944,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
+ case ISD::CTLZ:
+ case ISD::CTTZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTTZ_ZERO_UNDEF:return combineCTZ(N, DAG, DCI, Subtarget);
case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
case ISD::AVGCEILS:
case ISD::AVGCEILU:
diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
index 13149d78b16fb..c425a297ca0f6 100644
--- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512POPCNT
;
; CTPOP
@@ -712,23 +712,15 @@ define i32 @load_ctlz_i256(ptr %p0) nounwind {
;
; AVX512-LABEL: load_ctlz_i256:
; AVX512: # %bb.0:
-; AVX512-NEXT: movq 8(%rdi), %rcx
-; AVX512-NEXT: movq 16(%rdi), %rdx
-; AVX512-NEXT: movq 24(%rdi), %rsi
-; AVX512-NEXT: lzcntq %rsi, %rax
-; AVX512-NEXT: lzcntq %rdx, %r8
-; AVX512-NEXT: addl $64, %r8d
-; AVX512-NEXT: testq %rsi, %rsi
-; AVX512-NEXT: cmovnel %eax, %r8d
-; AVX512-NEXT: lzcntq %rcx, %r9
-; AVX512-NEXT: lzcntq (%rdi), %rax
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %rcx, %rcx
-; AVX512-NEXT: cmovnel %r9d, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %rsi, %rdx
-; AVX512-NEXT: cmovnel %r8d, %eax
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0]
+; AVX512-NEXT: vplzcntq %ymm0, %ymm1
+; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512-NEXT: vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%a0 = load i256, ptr %p0
%cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 0)
@@ -845,47 +837,28 @@ define i32 @test_ctlz_i512(i512 %a0) nounwind {
;
; AVX512-LABEL: test_ctlz_i512:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: lzcntq %r11, %rax
-; AVX512-NEXT: lzcntq %r10, %r14
-; AVX512-NEXT: addl $64, %r14d
-; AVX512-NEXT: testq %r11, %r11
-; AVX512-NEXT: cmovnel %eax, %r14d
-; AVX512-NEXT: lzcntq %r9, %rax
-; AVX512-NEXT: lzcntq %r8, %rbx
-; AVX512-NEXT: addl $64, %ebx
-; AVX512-NEXT: testq %r9, %r9
-; AVX512-NEXT: cmovnel %eax, %ebx
-; AVX512-NEXT: subl $-128, %ebx
-; AVX512-NEXT: movq %r10, %rax
-; AVX512-NEXT: orq %r11, %rax
-; AVX512-NEXT: cmovnel %r14d, %ebx
-; AVX512-NEXT: lzcntq %rcx, %rax
-; AVX512-NEXT: lzcntq %rdx, %r14
-; AVX512-NEXT: addl $64, %r14d
-; AVX512-NEXT: testq %rcx, %rcx
-; AVX512-NEXT: cmovnel %eax, %r14d
-; AVX512-NEXT: lzcntq %rsi, %r15
-; AVX512-NEXT: lzcntq %rdi, %rax
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %rsi, %rsi
-; AVX512-NEXT: cmovnel %r15d, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %rcx, %rdx
-; AVX512-NEXT: cmovnel %r14d, %eax
-; AVX512-NEXT: addl $256, %eax # imm = 0x100
-; AVX512-NEXT: orq %r11, %r9
-; AVX512-NEXT: orq %r10, %r8
-; AVX512-NEXT: orq %r9, %r8
-; AVX512-NEXT: cmovnel %ebx, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
+; AVX512-NEXT: vmovq %rdi, %xmm0
+; AVX512-NEXT: vmovq %rsi, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vmovq %rdx, %xmm1
+; AVX512-NEXT: vmovq %rcx, %xmm2
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vmovq %r8, %xmm1
+; AVX512-NEXT: vmovq %r9, %xmm2
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0)
%res = trunc i512 %cnt to i32
@@ -1010,50 +983,16 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind {
;
; AVX512-LABEL: load_ctlz_i512:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq 8(%rdi), %r11
-; AVX512-NEXT: movq 16(%rdi), %r9
-; AVX512-NEXT: movq 24(%rdi), %r10
-; AVX512-NEXT: movq 32(%rdi), %rcx
-; AVX512-NEXT: movq 40(%rdi), %rdx
-; AVX512-NEXT: movq 48(%rdi), %rsi
-; AVX512-NEXT: movq 56(%rdi), %r8
-; AVX512-NEXT: lzcntq %r8, %rax
-; AVX512-NEXT: lzcntq %rsi, %r14
-; AVX512-NEXT: addl $64, %r14d
-; AVX512-NEXT: testq %r8, %r8
-; AVX512-NEXT: cmovnel %eax, %r14d
-; AVX512-NEXT: lzcntq %rdx, %rax
-; AVX512-NEXT: lzcntq %rcx, %rbx
-; AVX512-NEXT: addl $64, %ebx
-; AVX512-NEXT: testq %rdx, %rdx
-; AVX512-NEXT: cmovnel %eax, %ebx
-; AVX512-NEXT: subl $-128, %ebx
-; AVX512-NEXT: movq %rsi, %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: cmovnel %r14d, %ebx
-; AVX512-NEXT: lzcntq %r10, %rax
-; AVX512-NEXT: lzcntq %r9, %r14
-; AVX512-NEXT: addl $64, %r14d
-; AVX512-NEXT: testq %r10, %r10
-; AVX512-NEXT: cmovnel %eax, %r14d
-; AVX512-NEXT: lzcntq (%rdi), %rax
-; AVX512-NEXT: lzcntq %r11, %rdi
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %r11, %r11
-; AVX512-NEXT: cmovnel %edi, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %r10, %r9
-; AVX512-NEXT: cmovnel %r14d, %eax
-; AVX512-NEXT: addl $256, %eax # imm = 0x100
-; AVX512-NEXT: orq %r8, %rdx
-; AVX512-NEXT: orq %rsi, %rcx
-; AVX512-NEXT: orq %rdx, %rcx
-; AVX512-NEXT: cmovnel %ebx, %eax
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r14
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%a0 = load i512, ptr %p0
%cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0)
@@ -1992,26 +1931,38 @@ define i32 @load_cttz_i256(ptr %p0) nounwind {
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: retq
;
-; AVX512-LABEL: load_cttz_i256:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movq 16(%rdi), %rcx
-; AVX512-NEXT: movq (%rdi), %rdx
-; AVX512-NEXT: movq 8(%rdi), %rsi
-; AVX512-NEXT: tzcntq %rdx, %rax
-; AVX512-NEXT: tzcntq %rsi, %r8
-; AVX512-NEXT: addl $64, %r8d
-; AVX512-NEXT: testq %rdx, %rdx
-; AVX512-NEXT: cmovnel %eax, %r8d
-; AVX512-NEXT: tzcntq %rcx, %r9
-; AVX512-NEXT: tzcntq 24(%rdi), %rax
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %rcx, %rcx
-; AVX512-NEXT: cmovnel %r9d, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %rsi, %rdx
-; AVX512-NEXT: cmovnel %r8d, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: retq
+; AVX512F-LABEL: load_cttz_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vpandn %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vplzcntq %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256]
+; AVX512F-NEXT: vpsubq %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512F-NEXT: vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_cttz_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
%a0 = load i256, ptr %p0
%cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0)
%res = trunc i256 %cnt to i32
@@ -2109,47 +2060,58 @@ define i32 @test_cttz_i512(i512 %a0) nounwind {
; AVX2-NEXT: popq %r14
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_cttz_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: tzcntq %rdi, %rax
-; AVX512-NEXT: tzcntq %rsi, %rbx
-; AVX512-NEXT: addl $64, %ebx
-; AVX512-NEXT: testq %rdi, %rdi
-; AVX512-NEXT: cmovnel %eax, %ebx
-; AVX512-NEXT: tzcntq %rdx, %rax
-; AVX512-NEXT: tzcntq %rcx, %r10
-; AVX512-NEXT: addl $64, %r10d
-; AVX512-NEXT: testq %rdx, %rdx
-; AVX512-NEXT: cmovnel %eax, %r10d
-; AVX512-NEXT: subl $-128, %r10d
-; AVX512-NEXT: movq %rdi, %rax
-; AVX512-NEXT: orq %rsi, %rax
-; AVX512-NEXT: cmovnel %ebx, %r10d
-; AVX512-NEXT: tzcntq %r8, %rax
-; AVX512-NEXT: tzcntq %r9, %rbx
-; AVX512-NEXT: addl $64, %ebx
-; AVX512-NEXT: testq %r8, %r8
-; AVX512-NEXT: cmovnel %eax, %ebx
-; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: tzcntq %r11, %r14
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %r11, %r11
-; AVX512-NEXT: cmovnel %r14d, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %r9, %r8
-; AVX512-NEXT: cmovnel %ebx, %eax
-; AVX512-NEXT: addl $256, %eax # imm = 0x100
-; AVX512-NEXT: orq %rcx, %rsi
-; AVX512-NEXT: orq %rdx, %rdi
-; AVX512-NEXT: orq %rsi, %rdi
-; AVX512-NEXT: cmovnel %r10d, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cttz_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovq %rsi, %xmm1
+; AVX512F-NEXT: vmovq %rdi, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vmovq %r9, %xmm1
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512POPCNT-LABEL: test_cttz_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0
+; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1
+; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512POPCNT-NEXT: vmovq %r9, %xmm1
+; AVX512POPCNT-NEXT: vmovq %r8, %xmm2
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
%cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0)
%res = trunc i512 %cnt to i32
ret i32 %res
@@ -2263,53 +2225,38 @@ define i32 @load_cttz_i512(ptr %p0) nounwind {
; AVX2-NEXT: popq %r15
; AVX2-NEXT: retq
;
-; AVX512-LABEL: load_cttz_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq 48(%rdi), %r11
-; AVX512-NEXT: movq 40(%rdi), %r9
-; AVX512-NEXT: movq 32(%rdi), %r10
-; AVX512-NEXT: movq 24(%rdi), %r8
-; AVX512-NEXT: movq 16(%rdi), %rdx
-; AVX512-NEXT: movq (%rdi), %rcx
-; AVX512-NEXT: movq 8(%rdi), %rsi
-; AVX512-NEXT: tzcntq %rcx, %rax
-; AVX512-NEXT: tzcntq %rsi, %r14
-; AVX512-NEXT: addl $64, %r14d
-; AVX512-NEXT: testq %rcx, %rcx
-; AVX512-NEXT: cmovnel %eax, %r14d
-; AVX512-NEXT: tzcntq %rdx, %rax
-; AVX512-NEXT: tzcntq %r8, %rbx
-; AVX512-NEXT: addl $64, %ebx
-; AVX512-NEXT: testq %rdx, %rdx
-; AVX512-NEXT: cmovnel %eax, %ebx
-; AVX512-NEXT: subl $-128, %ebx
-; AVX512-NEXT: movq %rcx, %rax
-; AVX512-NEXT: orq %rsi, %rax
-; AVX512-NEXT: cmovnel %r14d, %ebx
-; AVX512-NEXT: tzcntq %r10, %rax
-; AVX512-NEXT: tzcntq %r9, %r14
-; AVX512-NEXT: addl $64, %r14d
-; AVX512-NEXT: testq %r10, %r10
-; AVX512-NEXT: cmovnel %eax, %r14d
-; AVX512-NEXT: tzcntq 56(%rdi), %rax
-; AVX512-NEXT: tzcntq %r11, %rdi
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %r11, %r11
-; AVX512-NEXT: cmovnel %edi, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %r9, %r10
-; AVX512-NEXT: cmovnel %r14d, %eax
-; AVX512-NEXT: addl $256, %eax # imm = 0x100
-; AVX512-NEXT: orq %r8, %rsi
-; AVX512-NEXT: orq %rdx, %rcx
-; AVX512-NEXT: orq %rsi, %rcx
-; AVX512-NEXT: cmovnel %ebx, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: retq
+; AVX512F-LABEL: load_cttz_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}...
[truncated]
|
|
|
||
| SDValue PassThrough = DAG.getUNDEF(VecVT); | ||
| if (Opc == ISD::CTLZ || Opc == ISD::CTTZ) | ||
| PassThrough = DAG.getConstant(SizeInBits, DL, VecVT); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should it be 64 instead of SizeInBits?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No - I've added the offsets to each element at this point, the pass through is for the compress and will only appear in elt[0] if the entire vector is zero - in which case it should return the full scalar integer width (256/512).
| Offsets.push_back(DAG.getConstant(I * 64, DL, MVT::i64)); | ||
| } | ||
|
|
||
| // CTLZ - reverse the elements as we want the top non-zero element. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't understand it. Isn't the MSB in the top element already?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For CTLZ we need to isolate the first non-zero element from the end of the vector - but to then use the compress trick we need it to be reversed to be in the first element instead.
| SDValue Cnt = DAG.getNode(Opc, DL, VecVT, Vec); | ||
| Cnt = DAG.getNode(ISD::ADD, DL, VecVT, Cnt, | ||
| DAG.getBuildVector(VecVT, DL, Offsets)); | ||
| Cnt = DAG.getNode(ISD::VECTOR_COMPRESS, DL, VecVT, Cnt, IsNonZero, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are we missing Cnt for IsNonZero?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice catch - yes, we mustn't use the ZERO_UNDEF variants on the vector op.
phoebewang
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
llvm#164671) Make use of AVX512 VPLZCNT/VPOPCNT to perform the big integer bit counts per vector element and then use VPCOMPRESS to extract the first non-zero element result. There's more we can do here (widen/split other vector widths etc.) - but this is a good starting point.
Make use of AVX512 VPLZCNT/VPOPCNT to perform the big integer bit counts per vector element and then use VPCOMPRESS to extract the first non-zero element result.
There's more we can do here (widen/split other vector widths etc.) - but this is a good starting point.