diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 1b2e3f41479db..98652a624baa5 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -269,6 +269,40 @@ static mlir::Value emitX86FunnelShift(CIRGenBuilderTy &builder,
                                       mlir::ValueRange{op0, op1, amt});
 }
 
+static mlir::Value emitX86Muldq(CIRGenBuilderTy &builder, mlir::Location loc,
+                                bool isSigned,
+                                SmallVectorImpl<mlir::Value> &ops,
+                                unsigned opTypePrimitiveSizeInBits) {
+  mlir::Type ty = cir::VectorType::get(builder.getSInt64Ty(),
+                                       opTypePrimitiveSizeInBits / 64);
+  mlir::Value lhs = builder.createBitcast(loc, ops[0], ty);
+  mlir::Value rhs = builder.createBitcast(loc, ops[1], ty);
+  if (isSigned) {
+    cir::ConstantOp shiftAmt =
+        builder.getConstant(loc, cir::IntAttr::get(builder.getSInt64Ty(), 32));
+    cir::VecSplatOp shiftSplatVecOp =
+        cir::VecSplatOp::create(builder, loc, ty, shiftAmt.getResult());
+    mlir::Value shiftSplatValue = shiftSplatVecOp.getResult();
+    // In CIR, right-shift operations are automatically lowered to either an
+    // arithmetic or a logical shift depending on the operand type. The shifts
+    // here propagate the sign bit of the 32-bit input into the upper bits of
+    // each vector lane.
+    lhs = builder.createShift(loc, lhs, shiftSplatValue, true);
+    lhs = builder.createShift(loc, lhs, shiftSplatValue, false);
+    rhs = builder.createShift(loc, rhs, shiftSplatValue, true);
+    rhs = builder.createShift(loc, rhs, shiftSplatValue, false);
+  } else {
+    cir::ConstantOp maskScalar = builder.getConstant(
+        loc, cir::IntAttr::get(builder.getSInt64Ty(), 0xffffffff));
+    cir::VecSplatOp mask =
+        cir::VecSplatOp::create(builder, loc, ty, maskScalar.getResult());
+    // Clear the upper 32 bits of each lane.
+    lhs = builder.createAnd(loc, lhs, mask);
+    rhs = builder.createAnd(loc, rhs, mask);
+  }
+  return builder.createMul(loc, lhs, rhs);
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -1212,12 +1246,26 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_sqrtph512:
   case X86::BI__builtin_ia32_sqrtps512:
   case X86::BI__builtin_ia32_sqrtpd512:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_pmuludq128:
   case X86::BI__builtin_ia32_pmuludq256:
-  case X86::BI__builtin_ia32_pmuludq512:
+  case X86::BI__builtin_ia32_pmuludq512: {
+    unsigned opTypePrimitiveSizeInBits =
+        cgm.getDataLayout().getTypeSizeInBits(ops[0].getType());
+    return emitX86Muldq(builder, getLoc(expr->getExprLoc()),
+                        /*isSigned=*/false, ops, opTypePrimitiveSizeInBits);
+  }
   case X86::BI__builtin_ia32_pmuldq128:
   case X86::BI__builtin_ia32_pmuldq256:
-  case X86::BI__builtin_ia32_pmuldq512:
+  case X86::BI__builtin_ia32_pmuldq512: {
+    unsigned opTypePrimitiveSizeInBits =
+        cgm.getDataLayout().getTypeSizeInBits(ops[0].getType());
+    return emitX86Muldq(builder, getLoc(expr->getExprLoc()),
+                        /*isSigned=*/true, ops, opTypePrimitiveSizeInBits);
+  }
   case X86::BI__builtin_ia32_pternlogd512_mask:
   case X86::BI__builtin_ia32_pternlogq512_mask:
   case X86::BI__builtin_ia32_pternlogd128_mask:
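Note (illustrative sketch, not part of the patch): per 64-bit lane, the
sequence emitted by emitX86Muldq is equivalent to the scalar C model below.
The helper names are invented for illustration, and the signed case assumes
the usual two's-complement behavior of the cast and the arithmetic right
shift.

  #include <stdint.h>

  // One lane of pmuldq: shl 32 followed by arithmetic shr 32 sign-extends
  // the low (even-indexed) 32-bit element and discards the odd-indexed
  // element that shares the lane.
  static int64_t muldq_lane(uint64_t a, uint64_t b) {
    int64_t lo_a = (int64_t)(a << 32) >> 32;
    int64_t lo_b = (int64_t)(b << 32) >> 32;
    return lo_a * lo_b;
  }

  // One lane of pmuludq: masking with 0xffffffff zero-extends instead.
  static uint64_t muludq_lane(uint64_t a, uint64_t b) {
    return (a & 0xffffffffu) * (b & 0xffffffffu);
  }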
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
index b7497c2053b2d..f27d6e2862f83 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
@@ -51,3 +51,55 @@ __m256i test_mm256_shufflehi_epi16(__m256i a) {
   // OGCG: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
   return _mm256_shufflehi_epi16(a, 107);
 }
+
+__m256i test_mm256_mul_epu32(__m256i a, __m256i b) {
+  // CIR-LABEL: _mm256_mul_epu32
+  // CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
+  // CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
+  // CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i
+  // CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<4 x !s64i>
+  // CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]])
+  // CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]])
+  // CIR: [[MUL:%.*]] = cir.binop(mul, [[AND_A]], [[AND_B]])
+
+  // LLVM-LABEL: _mm256_mul_epu32
+  // LLVM: and <4 x i64> %{{.*}}, splat (i64 4294967295)
+  // LLVM: and <4 x i64> %{{.*}}, splat (i64 4294967295)
+  // LLVM: mul <4 x i64> %{{.*}}, %{{.*}}
+
+  // OGCG-LABEL: _mm256_mul_epu32
+  // OGCG: and <4 x i64> %{{.*}}, splat (i64 4294967295)
+  // OGCG: and <4 x i64> %{{.*}}, splat (i64 4294967295)
+  // OGCG: mul <4 x i64> %{{.*}}, %{{.*}}
+
+  return _mm256_mul_epu32(a, b);
+}
+
+__m256i test_mm256_mul_epi32(__m256i a, __m256i b) {
+  // CIR-LABEL: _mm256_mul_epi32
+  // CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
+  // CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
+  // CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i
+  // CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<4 x !s64i>
+  // CIR: [[SHL_A:%.*]] = cir.shift(left, [[A64]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
+  // CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
+  // CIR: [[SHL_B:%.*]] = cir.shift(left, [[B64]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
+  // CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
+  // CIR: [[MUL:%.*]] = cir.binop(mul, [[ASHR_A]], [[ASHR_B]])
+
+  // LLVM-LABEL: _mm256_mul_epi32
+  // LLVM: shl <4 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: ashr <4 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: shl <4 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: ashr <4 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: mul <4 x i64> %{{.*}}, %{{.*}}
+
+  // OGCG-LABEL: _mm256_mul_epi32
+  // OGCG: shl <4 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: ashr <4 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: shl <4 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: ashr <4 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: mul <4 x i64> %{{.*}}, %{{.*}}
+
+  return _mm256_mul_epi32(a, b);
+}
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
index cdcdad42b2845..b0ed2397624d7 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
@@ -695,3 +695,55 @@ void test_mm512_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, __m512i __i
   // OGCG: @llvm.x86.avx512.mask.scatter.qpi.512
   return _mm512_mask_i64scatter_epi32(__addr, __mask, __index, __v1, 2);
 }
+
+__m512i test_mm512_mul_epi32(__m512i __A, __m512i __B) {
+  // CIR-LABEL: _mm512_mul_epi32
+  // CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i>
+  // CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i>
+  // CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i
+  // CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<8 x !s64i>
+  // CIR: [[SHL_A:%.*]] = cir.shift(left, [[A64]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
+  // CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
+  // CIR: [[SHL_B:%.*]] = cir.shift(left, [[B64]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
+  // CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
+  // CIR: [[MUL:%.*]] = cir.binop(mul, [[ASHR_A]], [[ASHR_B]])
+
+  // LLVM-LABEL: _mm512_mul_epi32
+  // LLVM: shl <8 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: ashr <8 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: shl <8 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: ashr <8 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: mul <8 x i64> %{{.*}}, %{{.*}}
+
+  // OGCG-LABEL: _mm512_mul_epi32
+  // OGCG: shl <8 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: ashr <8 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: shl <8 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: ashr <8 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: mul <8 x i64> %{{.*}}, %{{.*}}
+
+  return _mm512_mul_epi32(__A, __B);
+}
+
+__m512i test_mm512_mul_epu32(__m512i __A, __m512i __B) {
+  // CIR-LABEL: _mm512_mul_epu32
+  // CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<8 x !s64i>
+  // CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<8 x !s64i>
+  // CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i
+  // CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<8 x !s64i>
+  // CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]])
+  // CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]])
+  // CIR: [[MUL:%.*]] = cir.binop(mul, [[AND_A]], [[AND_B]])
+
+  // LLVM-LABEL: _mm512_mul_epu32
+  // LLVM: and <8 x i64> %{{.*}}, splat (i64 4294967295)
+  // LLVM: and <8 x i64> %{{.*}}, splat (i64 4294967295)
+  // LLVM: mul <8 x i64> %{{.*}}, %{{.*}}
+
+  // OGCG-LABEL: _mm512_mul_epu32
+  // OGCG: and <8 x i64> %{{.*}}, splat (i64 4294967295)
+  // OGCG: and <8 x i64> %{{.*}}, splat (i64 4294967295)
+  // OGCG: mul <8 x i64> %{{.*}}, %{{.*}}
+
+  return _mm512_mul_epu32(__A, __B);
+}
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c
index 4bb17e9d20bc6..18cf553a3827b 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c
@@ -159,3 +159,26 @@ __m128i test_mm_shuffle_epi32(__m128i A) {
   // OGCG: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   return _mm_shuffle_epi32(A, 0x4E);
 }
+
+__m128i test_mm_mul_epu32(__m128i A, __m128i B) {
+  // CIR-LABEL: _mm_mul_epu32
+  // CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
+  // CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
+  // CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i
+  // CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<2 x !s64i>
+  // CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]])
+  // CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]])
+  // CIR: [[MUL:%.*]] = cir.binop(mul, [[AND_A]], [[AND_B]])
+
+  // LLVM-LABEL: _mm_mul_epu32
+  // LLVM: and <2 x i64> %{{.*}}, splat (i64 4294967295)
+  // LLVM: and <2 x i64> %{{.*}}, splat (i64 4294967295)
+  // LLVM: mul <2 x i64> %{{.*}}, %{{.*}}
+
+  // OGCG-LABEL: _mm_mul_epu32
+  // OGCG: and <2 x i64> %{{.*}}, splat (i64 4294967295)
+  // OGCG: and <2 x i64> %{{.*}}, splat (i64 4294967295)
+  // OGCG: mul <2 x i64> %{{.*}}, %{{.*}}
+
+  return _mm_mul_epu32(A, B);
+}
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/sse41-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/sse41-builtins.c
new file mode 100644
index 0000000000000..c53d435842b27
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/sse41-builtins.c
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+
+#include <immintrin.h>
+
+__m128i test_mm_mul_epi32(__m128i x, __m128i y) {
+  // CIR-LABEL: _mm_mul_epi32
+  // CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
+  // CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
+  // CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i
+  // CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<2 x !s64i>
+  // CIR: [[SHL_A:%.*]] = cir.shift(left, [[A64]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
+  // CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
+  // CIR: [[SHL_B:%.*]] = cir.shift(left, [[B64]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
+  // CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
+  // CIR: [[MUL:%.*]] = cir.binop(mul, [[ASHR_A]], [[ASHR_B]])
+
+  // LLVM-LABEL: _mm_mul_epi32
+  // LLVM: shl <2 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: ashr <2 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: shl <2 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: ashr <2 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: mul <2 x i64> %{{.*}}, %{{.*}}
+
+  // OGCG-LABEL: _mm_mul_epi32
+  // OGCG: shl <2 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: ashr <2 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: shl <2 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: ashr <2 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: mul <2 x i64> %{{.*}}, %{{.*}}
+
+  return _mm_mul_epi32(x, y);
+}
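Note (illustrative, not part of this change): a minimal standalone demo of the
semantics the tests above lock down. Only the even-indexed 32-bit elements
feed the multiply, and epi32 vs. epu32 differ exactly in how the low halves
are extended. Assumes an x86-64 host; the file name is hypothetical, build
with e.g. `cc -msse4.1 demo.c`.

  #include <stdio.h>
  #include <immintrin.h>

  int main(void) {
    // _mm_set_epi32 takes elements high-to-low; the odd-indexed elements
    // (the 99s) are ignored by both intrinsics.
    __m128i a = _mm_set_epi32(99, 7, 99, -1);
    __m128i b = _mm_set_epi32(99, 3, 99, 2);

    __m128i s = _mm_mul_epi32(a, b); // lanes: {(int64_t)-1 * 2, 7 * 3}
    __m128i u = _mm_mul_epu32(a, b); // lanes: {0xffffffff * 2, 7 * 3}

    // Prints "signed -2 21" and "unsigned 8589934590 21".
    printf("signed %lld %lld\n", (long long)_mm_extract_epi64(s, 0),
           (long long)_mm_extract_epi64(s, 1));
    printf("unsigned %llu %llu\n",
           (unsigned long long)_mm_extract_epi64(u, 0),
           (unsigned long long)_mm_extract_epi64(u, 1));
    return 0;
  }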