Author: Omar Hossam
Date: 2025-12-02T17:19:41-08:00

New Revision: 271e99daf0ff860d0ab50c688ba5e0480de78847

URL: https://github.com/llvm/llvm-project/commit/271e99daf0ff860d0ab50c688ba5e0480de78847
DIFF: https://github.com/llvm/llvm-project/commit/271e99daf0ff860d0ab50c688ba5e0480de78847.diff

LOG: [CIR] Support x86 builtin rotate (#169566)

This PR implements CodeGen for the x86 rotate builtins in upstream CIR.

Issue: https://github.com/llvm/llvm-project/issues/167765

Added: 
    clang/test/CIR/CodeGenBuiltins/X86/xop-builtins.c

Modified: 
    clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
    clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c

Removed: 


################################################################################

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index b2e498a0fea64..2d503ddb08a7d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -11,10 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "CIRGenBuilder.h"
 #include "CIRGenFunction.h"
 #include "CIRGenModule.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/ValueRange.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
 #include "clang/CIR/MissingFeatures.h"
 
 using namespace clang;
@@ -231,6 +235,40 @@ static mlir::Value emitVecInsert(CIRGenBuilderTy &builder, mlir::Location loc,
   return cir::VecInsertOp::create(builder, loc, vec, value, indexVal);
 }
 
+static mlir::Value emitX86FunnelShift(CIRGenBuilderTy &builder,
+                                      mlir::Location location, mlir::Value &op0,
+                                      mlir::Value &op1, mlir::Value &amt,
+                                      bool isRight) {
+  mlir::Type op0Ty = op0.getType();
+
+  // The amount may be a scalar immediate, in which case we create a splat
+  // vector. Funnel-shift amounts are treated modulo the bit width, and the
+  // types are all powers of two, so only the lowest log2 bits matter anyway.
+  if (amt.getType() != op0Ty) {
+    auto vecTy = mlir::cast<cir::VectorType>(op0Ty);
+    uint64_t numElems = vecTy.getSize();
+
+    auto amtTy = mlir::cast<cir::IntType>(amt.getType());
+    auto vecElemTy = mlir::cast<cir::IntType>(vecTy.getElementType());
+
+    // If signed, cast to the same width but unsigned first to ensure
+    // zero-extension when casting to a wider unsigned `vecElemTy`.
+    if (amtTy.isSigned()) {
+      cir::IntType unsignedAmtTy = builder.getUIntNTy(amtTy.getWidth());
+      amt = builder.createIntCast(amt, unsignedAmtTy);
+    }
+    cir::IntType unsignedVecElemType = builder.getUIntNTy(vecElemTy.getWidth());
+    amt = builder.createIntCast(amt, unsignedVecElemType);
+    amt = cir::VecSplatOp::create(
+        builder, location, cir::VectorType::get(unsignedVecElemType, numElems),
+        amt);
+  }
+
+  const StringRef intrinsicName = isRight ? "fshr" : "fshl";
+  return emitIntrinsicCallOp(builder, location, intrinsicName, op0Ty,
+                             mlir::ValueRange{op0, op1, amt});
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -926,12 +964,16 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_prolq128:
   case X86::BI__builtin_ia32_prolq256:
   case X86::BI__builtin_ia32_prolq512:
+    return emitX86FunnelShift(builder, getLoc(expr->getExprLoc()), ops[0],
+                              ops[0], ops[1], false);
   case X86::BI__builtin_ia32_prord128:
   case X86::BI__builtin_ia32_prord256:
   case X86::BI__builtin_ia32_prord512:
   case X86::BI__builtin_ia32_prorq128:
   case X86::BI__builtin_ia32_prorq256:
   case X86::BI__builtin_ia32_prorq512:
+    return emitX86FunnelShift(builder, getLoc(expr->getExprLoc()), ops[0],
+                              ops[0], ops[1], true);
   case X86::BI__builtin_ia32_selectb_128:
   case X86::BI__builtin_ia32_selectb_256:
   case X86::BI__builtin_ia32_selectb_512:
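The lowering rests on the standard rotate/funnel-shift identity: a rotate is a
funnel shift that feeds the same value into both inputs, with the amount taken
modulo the element width (hence ops[0] being passed twice above). A minimal
scalar C sketch of that identity, for illustration only (rotl32 is a
hypothetical helper, not part of the patch):

  #include <stdint.h>

  // Computes the same value as @llvm.fshl.i32(x, x, n): the high 32 bits of
  // the 64-bit quantity (x:x) shifted left by n % 32, i.e. a rotate left.
  // Masking the right-shift amount keeps it well defined when n % 32 == 0.
  static inline uint32_t rotl32(uint32_t x, uint32_t n) {
    n &= 31; // funnel-shift amounts are modulo the bit width
    return (x << n) | (x >> ((32 - n) & 31));
  }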
"fshr" : "fshl"; + return emitIntrinsicCallOp(builder, location, intrinsicName, op0Ty, + mlir::ValueRange{op0, op1, amt}); +} + mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) { if (builtinID == Builtin::BI__builtin_cpu_is) { @@ -926,12 +964,16 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_prolq128: case X86::BI__builtin_ia32_prolq256: case X86::BI__builtin_ia32_prolq512: + return emitX86FunnelShift(builder, getLoc(expr->getExprLoc()), ops[0], + ops[0], ops[1], false); case X86::BI__builtin_ia32_prord128: case X86::BI__builtin_ia32_prord256: case X86::BI__builtin_ia32_prord512: case X86::BI__builtin_ia32_prorq128: case X86::BI__builtin_ia32_prorq256: case X86::BI__builtin_ia32_prorq512: + return emitX86FunnelShift(builder, getLoc(expr->getExprLoc()), ops[0], + ops[0], ops[1], true); case X86::BI__builtin_ia32_selectb_128: case X86::BI__builtin_ia32_selectb_256: case X86::BI__builtin_ia32_selectb_512: diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c index 7b9579ec4a238..a0e50ff9b4017 100644 --- a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c +++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c @@ -494,3 +494,36 @@ __m512i test_mm512_mask_i32gather_epi64(__m512i __v1_old, __mmask8 __mask, __m25 // OGCG: call <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512 return _mm512_mask_i32gather_epi64(__v1_old, __mask, __index, __addr, 2); } + +__m512i test_mm512_ror_epi32(__m512i __A) { + // CIR-LABEL: test_mm512_ror_epi32 + // CIR: cir.cast integral %{{.*}} : !s32i -> !u32i + // CIR: cir.vec.splat %{{.*}} : !u32i, !cir.vector<16 x !u32i> + // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}: (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i> + + // LLVM-LABEL: test_mm512_ror_epi32 + // LLVM: %[[CASTED_VAR:.*]] = bitcast <8 x i64> %{{.*}} to <16 x i32> + // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %[[CASTED_VAR]], <16 x i32> %[[CASTED_VAR]], <16 x i32> splat (i32 5)) + + // OGCG-LABEL: test_mm512_ror_epi32 + // OGCG: %[[CASTED_VAR:.*]] = bitcast <8 x i64> %{{.*}} to <16 x i32> + // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %[[CASTED_VAR]], <16 x i32> %[[CASTED_VAR]], <16 x i32> splat (i32 5)) + return _mm512_ror_epi32(__A, 5); +} + +__m512i test_mm512_ror_epi64(__m512i __A) { + // CIR-LABEL: test_mm512_ror_epi64 + // CIR: cir.cast integral %{{.*}} : !s32i -> !u32i + // CIR: cir.cast integral %{{.*}} : !u32i -> !u64i + // CIR: cir.vec.splat %{{.*}} : !u64i, !cir.vector<8 x !u64i> + // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}: (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i> + + // LLVM-LABEL: test_mm512_ror_epi64 + // LLVM: %[[VAR:.*]] = load <8 x i64>, ptr %{{.*}}, align 64 + // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %[[VAR]], <8 x i64> %[[VAR]], <8 x i64> splat (i64 5)) + + // OGCG-LABEL: test_mm512_ror_epi64 + // OGCG: %[[VAR:.*]] = load <8 x i64>, ptr %{{.*}}, align 64 + // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %[[VAR]], <8 x i64> %[[VAR]], <8 x i64> splat (i64 5)) + return _mm512_ror_epi64(__A, 5); +} diff --git a/clang/test/CIR/CodeGenBuiltins/X86/xop-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/xop-builtins.c new file mode 100644 index 0000000000000..0aaba7b46327d --- /dev/null +++ b/clang/test/CIR/CodeGenBuiltins/X86/xop-builtins.c @@ -0,0 +1,92 @@ +// RUN: %clang_cc1 -x c 
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/xop-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/xop-builtins.c
new file mode 100644
index 0000000000000..0aaba7b46327d
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/xop-builtins.c
@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+
+#include <x86intrin.h>
+
+// This test mimics clang/test/CodeGen/X86/xop-builtins.c, which CIR should
+// eventually be able to support fully.
+
+__m128i test_mm_roti_epi8(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi8
+  // CIR: cir.vec.splat %{{.*}} : !{{[us]}}8i, !cir.vector<16 x !{{[us]}}8i>
+  // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}} : (!cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>) -> !cir.vector<16 x !{{[su]}}8i>
+
+  // LLVM-LABEL: test_mm_roti_epi8
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <16 x i8>
+  // LLVM: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
+
+  // OGCG-LABEL: test_mm_roti_epi8
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <16 x i8>
+  // OGCG: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
+  return _mm_roti_epi8(a, 1);
+}
+
+__m128i test_mm_roti_epi16(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi16
+  // CIR: cir.cast integral %{{.*}} : !u8i -> !u16i
+  // CIR: cir.vec.splat %{{.*}} : !{{[us]}}16i, !cir.vector<8 x !u16i>
+  // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}} : (!cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !u16i>) -> !cir.vector<8 x !{{[su]}}16i>
+
+  // LLVM-LABEL: test_mm_roti_epi16
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <8 x i16>
+  // LLVM: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+
+  // OGCG-LABEL: test_mm_roti_epi16
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <8 x i16>
+  // OGCG: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+  return _mm_roti_epi16(a, 50);
+}
+
+__m128i test_mm_roti_epi32(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi32
+  // CIR: cir.cast integral %{{.*}} : !u8i -> !u32i
+  // CIR: cir.vec.splat %{{.*}} : !{{[us]}}32i, !cir.vector<4 x !u32i>
+  // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}} : (!cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !u32i>) -> !cir.vector<4 x !{{[su]}}32i>
+
+  // LLVM-LABEL: test_mm_roti_epi32
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <4 x i32>
+  // LLVM: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %[[CASTED_VAR]], <4 x i32> %[[CASTED_VAR]], <4 x i32> splat (i32 226))
+
+  // OGCG-LABEL: test_mm_roti_epi32
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> %{{.*}} to <4 x i32>
+  // OGCG: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %[[CASTED_VAR]], <4 x i32> %[[CASTED_VAR]], <4 x i32> splat (i32 226))
+  return _mm_roti_epi32(a, -30);
+}
+
+__m128i test_mm_roti_epi64(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi64
+  // CIR: cir.cast integral %{{.*}} : !u8i -> !u64i
+  // CIR: cir.vec.splat %{{.*}} : !u64i, !cir.vector<2 x !u64i>
+  // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}} : (!cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !u64i>) -> !cir.vector<2 x !s64i>
+
+  // LLVM-LABEL: test_mm_roti_epi64
+  // LLVM: %[[VAR:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+  // LLVM: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
+
+  // OGCG-LABEL: test_mm_roti_epi64
+  // OGCG: %[[VAR:.*]] = load <2 x i64>, ptr %{{.*}}, align 16
+  // OGCG: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x i64> %[[VAR]], <2 x i64> splat (i64 100))
+  return _mm_roti_epi64(a, 100);
+}
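One detail worth spelling out: test_mm_roti_epi32 rotates by -30 yet the
checks expect splat (i32 226). The rotate amount arrives as an 8-bit value
(the !u8i in the CIR checks), so -30 is reinterpreted as 226 and then
zero-extended, and 226 and -30 fall in the same residue class modulo the
32-bit element width, leaving the rotate result unchanged. A tiny standalone
C check of that arithmetic (illustrative only, not part of the test suite):

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    uint8_t imm = (uint8_t)-30; // 226: the bit pattern of -30 in 8 bits
    uint32_t amt = imm;         // zero-extends to 226
    assert(amt % 32 == 2);               // 226 mod 32
    assert(((-30 % 32) + 32) % 32 == 2); // -30 mod 32: the same rotate
    return 0;
  }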
