llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Thibault Monnier (Thibault-Monnier) <details> <summary>Changes</summary> This PR is part of #<!-- -->167752. It upstreams the codegen and tests for the shuffle builtins implemented in the incubator, including: - `vinsert` + `insert` - `pblend` + `blend` - `vpermilp` - `pshuf` + `shufp` - `palignr` It does NOT upstream the `perm`, `vperm2`, `vpshuf`, `shuf_i` / `shuf_f` and `align` builtins, which are not yet implemented in the incubator. This _is_ a large commit, but most of it is tests. The `pshufd` / `vpermilp` builtins seem to have no test coverage in the incubator — what should I do? --- Patch is 72.93 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169178.diff 15 Files Affected: - (modified) clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp (+173-10) - (modified) clang/lib/CIR/CodeGen/CIRGenFunction.h (+23-1) - (modified) clang/test/CIR/CodeGen/X86/avx-builtins.c (+81) - (added) clang/test/CIR/CodeGen/X86/avx-shuffle-builtins.c (+90) - (added) clang/test/CIR/CodeGen/X86/avx2-builtins.c (+145) - (added) clang/test/CIR/CodeGen/X86/avx512bw-builtins.c (+53) - (added) clang/test/CIR/CodeGen/X86/avx512dq-builtins.c (+55) - (modified) clang/test/CIR/CodeGen/X86/avx512f-builtins.c (+69) - (added) clang/test/CIR/CodeGen/X86/avx512vl-builtins.c (+24) - (added) clang/test/CIR/CodeGen/X86/avx512vldq-builtins.c (+31) - (added) clang/test/CIR/CodeGen/X86/palignr.c (+32) - (modified) clang/test/CIR/CodeGen/X86/sse-builtins.c (+12) - (modified) clang/test/CIR/CodeGen/X86/sse2-builtins.c (+41-2) - (added) clang/test/CIR/CodeGen/X86/sse3-builtins.c (+21) - (added) clang/test/CIR/CodeGen/X86/sse41-builtins.c (+55) ``````````diff diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp index 978fee7dbec9d..1cf38778b629f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -68,6 +68,35 
@@ static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder, return bitCast; } +static cir::VecShuffleOp emitPshufW(CIRGenFunction &cgf, + CIRGenBuilderTy &builder, + llvm::SmallVector<mlir::Value> &ops, + const CallExpr *expr, const bool isLow) { + uint32_t imm = cgf.getZExtIntValueFromConstOp(ops[1]); + + auto vecTy = cast<cir::VectorType>(ops[0].getType()); + unsigned numElts = vecTy.getSize(); + + unsigned firstHalfStart = isLow ? 0 : 4; + unsigned secondHalfStart = 4 - firstHalfStart; + + // Splat the 8-bits of immediate 4 times to help the loop wrap around. + imm = (imm & 0xff) * 0x01010101; + + int64_t indices[32]; + for (unsigned l = 0; l != numElts; l += 8) { + for (unsigned i = firstHalfStart; i != firstHalfStart + 4; ++i) { + indices[l + i] = l + (imm & 3) + firstHalfStart; + imm /= 4; + } + for (unsigned i = secondHalfStart; i != secondHalfStart + 4; ++i) + indices[l + i] = l + i; + } + + return builder.createVecShuffle(cgf.getLoc(expr->getExprLoc()), ops[0], + ArrayRef(indices, numElts)); +} + mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) { if (builtinID == Builtin::BI__builtin_cpu_is) { @@ -163,9 +192,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_vec_ext_v4di: { unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize(); - uint64_t index = - ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue(); - + uint64_t index = getZExtIntValueFromConstOp(ops[1]); index &= numElts - 1; cir::ConstantOp indexVal = @@ -497,6 +524,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_extracti64x2_256_mask: case X86::BI__builtin_ia32_extractf64x2_512_mask: case X86::BI__builtin_ia32_extracti64x2_512_mask: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_vinsertf128_pd256: 
case X86::BI__builtin_ia32_vinsertf128_ps256: case X86::BI__builtin_ia32_vinsertf128_si256: @@ -512,9 +543,39 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_insertf64x2_256: case X86::BI__builtin_ia32_inserti64x2_256: case X86::BI__builtin_ia32_insertf64x2_512: - case X86::BI__builtin_ia32_inserti64x2_512: + case X86::BI__builtin_ia32_inserti64x2_512: { + unsigned dstNumElts = cast<cir::VectorType>(ops[0].getType()).getSize(); + unsigned srcNumElts = cast<cir::VectorType>(ops[1].getType()).getSize(); + unsigned subVectors = dstNumElts / srcNumElts; + assert(llvm::isPowerOf2_32(subVectors) && "Expected power of 2 subvectors"); + + uint64_t index = getZExtIntValueFromConstOp(ops[2]); + index &= subVectors - 1; // Remove any extra bits. + index *= srcNumElts; + + int64_t indices[16]; + for (unsigned i = 0; i != dstNumElts; ++i) + indices[i] = (i >= srcNumElts) ? srcNumElts + (i % srcNumElts) : i; + + mlir::Value op1 = builder.createVecShuffle( + getLoc(expr->getExprLoc()), ops[1], ArrayRef(indices, dstNumElts)); + + for (unsigned i = 0; i != dstNumElts; ++i) { + if (i >= index && i < (index + srcNumElts)) + indices[i] = (i - index) + dstNumElts; + else + indices[i] = i; + } + + return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], op1, + ArrayRef(indices, dstNumElts)); + } case X86::BI__builtin_ia32_pmovqd512_mask: case X86::BI__builtin_ia32_pmovwb512_mask: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_pblendw128: case X86::BI__builtin_ia32_blendpd: case X86::BI__builtin_ia32_blendps: @@ -522,13 +583,29 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_blendps256: case X86::BI__builtin_ia32_pblendw256: case X86::BI__builtin_ia32_pblendd128: - case X86::BI__builtin_ia32_pblendd256: + case X86::BI__builtin_ia32_pblendd256: { + 
uint32_t imm = getZExtIntValueFromConstOp(ops[2]); + unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize(); + + int64_t indices[16]; + // If there are more than 8 elements, the immediate is used twice so make + // sure we handle that. + for (unsigned i = 0; i != numElts; ++i) + indices[i] = ((imm >> (i % 8)) & 0x1) ? numElts + i : i; + + return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1], + ArrayRef(indices, numElts)); + } case X86::BI__builtin_ia32_pshuflw: case X86::BI__builtin_ia32_pshuflw256: - case X86::BI__builtin_ia32_pshuflw512: + case X86::BI__builtin_ia32_pshuflw512: { + return emitPshufW(*this, builder, ops, expr, true); + } case X86::BI__builtin_ia32_pshufhw: case X86::BI__builtin_ia32_pshufhw256: - case X86::BI__builtin_ia32_pshufhw512: + case X86::BI__builtin_ia32_pshufhw512: { + return emitPshufW(*this, builder, ops, expr, false); + } case X86::BI__builtin_ia32_pshufd: case X86::BI__builtin_ia32_pshufd256: case X86::BI__builtin_ia32_pshufd512: @@ -537,20 +614,106 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_vpermilpd256: case X86::BI__builtin_ia32_vpermilps256: case X86::BI__builtin_ia32_vpermilpd512: - case X86::BI__builtin_ia32_vpermilps512: + case X86::BI__builtin_ia32_vpermilps512: { + // TODO: Add tests for this branch. + uint32_t imm = getSExtIntValueFromConstOp(ops[1]); + + auto vecTy = cast<cir::VectorType>(ops[0].getType()); + unsigned numElts = vecTy.getSize(); + auto eltTy = vecTy.getElementType(); + + unsigned eltBitWidth = getTypeSizeInBits(eltTy).getFixedValue(); + unsigned numLaneElts = 128 / eltBitWidth; + + // Splat the 8-bits of immediate 4 times to help the loop wrap around. 
+ imm = (imm & 0xff) * 0x01010101; + + llvm::SmallVector<int64_t, 16> indices; + for (unsigned l = 0; l != numElts; l += numLaneElts) { + for (unsigned i = 0; i != numLaneElts; ++i) { + indices.push_back((imm % numLaneElts) + l); + imm /= numLaneElts; + } + } + + return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], + indices); + } case X86::BI__builtin_ia32_shufpd: case X86::BI__builtin_ia32_shufpd256: case X86::BI__builtin_ia32_shufpd512: case X86::BI__builtin_ia32_shufps: case X86::BI__builtin_ia32_shufps256: - case X86::BI__builtin_ia32_shufps512: + case X86::BI__builtin_ia32_shufps512: { + uint32_t imm = getZExtIntValueFromConstOp(ops[2]); + + auto vecTy = cast<cir::VectorType>(ops[0].getType()); + unsigned numElts = vecTy.getSize(); + unsigned numLanes = cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128; + unsigned numLaneElts = numElts / numLanes; + + // Splat the 8-bits of immediate 4 times to help the loop wrap around. + imm = (imm & 0xff) * 0x01010101; + + int64_t indices[16]; + for (unsigned l = 0; l != numElts; l += numLaneElts) { + for (unsigned i = 0; i != numLaneElts; ++i) { + uint32_t idx = imm % numLaneElts; + imm /= numLaneElts; + if (i >= (numLaneElts / 2)) + idx += numElts; + indices[l + i] = l + idx; + } + } + + return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1], + ArrayRef(indices, numElts)); + } case X86::BI__builtin_ia32_permdi256: case X86::BI__builtin_ia32_permdf256: case X86::BI__builtin_ia32_permdi512: case X86::BI__builtin_ia32_permdf512: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_palignr128: case X86::BI__builtin_ia32_palignr256: - case X86::BI__builtin_ia32_palignr512: + case X86::BI__builtin_ia32_palignr512: { + uint32_t shiftVal = getZExtIntValueFromConstOp(ops[2]) & 0xff; + + unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize(); + 
assert(numElts % 16 == 0); + + // If palignr is shifting the pair of vectors more than the size of two + // lanes, emit zero. + if (shiftVal >= 32) + return builder.getNullValue(convertType(expr->getType()), + getLoc(expr->getExprLoc())); + + // If palignr is shifting the pair of input vectors more than one lane, + // but less than two lanes, convert to shifting in zeroes. + if (shiftVal > 16) { + shiftVal -= 16; + ops[1] = ops[0]; + ops[0] = + builder.getNullValue(ops[0].getType(), getLoc(expr->getExprLoc())); + } + + int64_t indices[64]; + // 256-bit palignr operates on 128-bit lanes so we need to handle that + for (unsigned l = 0; l != numElts; l += 16) { + for (unsigned i = 0; i != 16; ++i) { + uint32_t idx = shiftVal + i; + if (idx >= 16) + idx += numElts - 16; // End of lane, switch operand. + indices[l + i] = l + idx; + } + } + + return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[1], ops[0], + ArrayRef(indices, numElts)); + } case X86::BI__builtin_ia32_alignd128: case X86::BI__builtin_ia32_alignd256: case X86::BI__builtin_ia32_alignd512: diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index b426f3389ff1b..53920fbce7bde 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1349,6 +1349,28 @@ class CIRGenFunction : public CIRGenTypeCache { cir::IntType resType, mlir::Value emittedE, bool isDynamic); + /// Get integer from a mlir::Value that is an int constant or a constant op. + static int64_t getSExtIntValueFromConstOp(mlir::Value val) { + auto constOp = val.getDefiningOp<cir::ConstantOp>(); + assert(constOp && "getIntValueFromConstOp call with non ConstantOp"); + return constOp.getIntValue().getSExtValue(); + } + + /// Get zero-extended integer from a mlir::Value that is an int constant or a + /// constant op. 
+ static int64_t getZExtIntValueFromConstOp(mlir::Value val) { + auto constOp = val.getDefiningOp<cir::ConstantOp>(); + assert(constOp && + "getZeroExtendedIntValueFromConstOp call with non ConstantOp"); + return constOp.getIntValue().getZExtValue(); + } + + /// Get size of type in bits using SizedTypeInterface + llvm::TypeSize getTypeSizeInBits(mlir::Type ty) const { + assert(cir::isSized(Ty) && "Type must implement SizedTypeInterface"); + return cgm.getDataLayout().getTypeSizeInBits(ty); + } + mlir::Value evaluateOrEmitBuiltinObjectSize(const clang::Expr *e, unsigned type, cir::IntType resType, @@ -1804,7 +1826,7 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::LogicalResult emitWhileStmt(const clang::WhileStmt &s); - mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *e); + mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr); /// Given an assignment `*lhs = rhs`, emit a test that checks if \p rhs is /// nonnull, if 1\p LHS is marked _Nonnull. 
diff --git a/clang/test/CIR/CodeGen/X86/avx-builtins.c b/clang/test/CIR/CodeGen/X86/avx-builtins.c index 82fa4358dc400..66c4e166971d2 100644 --- a/clang/test/CIR/CodeGen/X86/avx-builtins.c +++ b/clang/test/CIR/CodeGen/X86/avx-builtins.c @@ -73,4 +73,85 @@ __m256i test_mm256_undefined_si256(void) { // OGCG-LABEL: test_mm256_undefined_si256 // OGCG: ret <4 x i64> zeroinitializer return _mm256_undefined_si256(); +} + +__m256d test_mm256_blend_pd(__m256d A, __m256d B) { + // CIR-LABEL: test_mm256_blend_pd + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) [#cir.int<4> : !s32i, #cir.int<1> : !s32i, #cir.int<6> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.double> + + // LLVM-LABEL: test_mm256_blend_pd + // LLVM: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + + // OGCG-LABEL: test_mm256_blend_pd + // OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + return _mm256_blend_pd(A, B, 0x05); +} + +__m256 test_mm256_blend_ps(__m256 A, __m256 B) { + // CIR-LABEL: test_mm256_blend_ps + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) [#cir.int<8> : !s32i, #cir.int<1> : !s32i, #cir.int<10> : !s32i, #cir.int<3> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.float> + + // LLVM-LABEL: test_mm256_blend_ps + // LLVM: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7> + + // OGCG-LABEL: test_mm256_blend_ps + // OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7> + return _mm256_blend_ps(A, B, 0x35); +} + +__m256d test_mm256_insertf128_pd(__m256d A, __m128d B) { + // CIR-LABEL: test_mm256_insertf128_pd + // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.double> + // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) [#cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.double> + + // LLVM-LABEL: test_mm256_insertf128_pd + // LLVM: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // LLVM: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3> + return _mm256_insertf128_pd(A, B, 0); +} + +__m256 test_mm256_insertf128_ps(__m256 A, __m128 B) { + // CIR-LABEL: test_mm256_insertf128_ps + // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<8 x !cir.float> + // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i] : !cir.vector<8 x !cir.float> + + // LLVM-LABEL: test_mm256_insertf128_ps + // LLVM: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // LLVM: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + return _mm256_insertf128_ps(A, B, 1); +} + +__m256i test_mm256_insertf128_si256(__m256i A, __m128i B) { + // CIR-LABEL: test_mm256_insertf128_si256 + // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<8 x !s32i> + // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s32i>) [#cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, 
#cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] + + // LLVM-LABEL: test_mm256_insertf128_si256 + // LLVM: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // LLVM: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7> + return _mm256_insertf128_si256(A, B, 0); +} + +__m256d test_mm256_shuffle_pd(__m256d A, __m256d B) { + // CIR-LABEL: test_mm256_shuffle_pd + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) [#cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<2> : !s32i, #cir.int<6> : !s32i] : !cir.vector<4 x !cir.double> + + // CHECK-LABEL: test_mm256_shuffle_pd + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + + // OGCG-LABEL: test_mm256_shuffle_pd + // OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + return _mm256_shuffle_pd(A, B, 0); +} + +__m256 test_mm256_shuffle_ps(__m256 A, __m256 B) { + // CIR-LABEL: test_mm256_shuffle_ps + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<8> : !s32i, #cir.int<8> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<12> : !s32i, #cir.int<12> : !s32i] : !cir.vector<8 x !cir.float> + + // CHECK-LABEL: test_mm256_shuffle_ps + // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12> + + // OGCG-LABEL: test_mm256_shuffle_ps + // OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12> + return _mm256_shuffle_ps(A, B, 0); } \ No newline at end of file diff --git a/clang/test/CIR/CodeGen/X86/avx-shuffle-builtins.c b/clang/test/CIR/CodeGen/X86/avx-shuffle-builtins.c new file 
mode 100644 index 0000000000000..6384dcd0973fa --- /dev/null +++ b/clang/test/CIR/CodeGen/X86/avx-shuffle-builtins.c @@ -0,0 +1,90 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: x86-registered-target +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx -disable-O0-optnone -fclangir -emit-cir -o %t.cir | opt -S -passes=mem2reg +// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s + +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx -disable-O0-optnone -fclangir -emit-llvm -o %t.ll | opt -S -passes=mem2reg +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s + +#include <immintrin.h> + +// CIR-LABEL: @test_mm256_insertf128_pd_0( +// CIR: [[A:%.*]] = cir.load align(32) %0 : !cir.ptr<!cir.vector<4 x !cir.double>>, !cir.vector<4 x !cir.double> +// CIR: [[B:%.*]] = cir.load align(16) %1 : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double> +// CIR: %{{.*}} = cir.vec.shuffle([[B]], %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.double> +// CIR-NEXT: %{{.*}} = cir.vec.shuffle([[A]], %{{.*}} : !s32i, #cir.int<5> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.double> +// CIR: cir.return %{{.*}} : !cir.vector<4 x !cir.double> + +// LLVM-LABEL: @test_mm256_insertf128_pd_0 +// LLVM: [[A:%.*]] = load <4 x double>, ptr %{{.*}}, align 32 +// LLVM: [[B:%.*]] = load <2 x double>, ptr %{{.*}}, align 16 +// LLVM-NEXT: [[WIDEN:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// LLVM-NEXT: [[INSERT:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[WIDEN]], <4 x i32> <i32 4, i32 5, i32 2, i32 3> +// LLVM: ret <4 x double> +__m256d test_mm256_insertf128_pd_0(__m256d a, __m128d b) { + return _mm256_insertf128_pd(a, b, 0); +} + 
+// CIR-LABEL: @test_mm256_insertf128_ps_0( +// CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/169178 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
