simon_tatham created this revision. simon_tatham added reviewers: ostannard, MarkMurrayARM, dmgreen. Herald added subscribers: llvm-commits, cfe-commits, hiraditya, kristof.beyls. Herald added projects: clang, LLVM.
This commit adds the `vpselq` intrinsics, which take an MVE predicate word and select lanes from two vectors; the `vctp` intrinsics, which create a tail predicate word suitable for processing the first `m` elements of a vector (e.g. in the last iteration of a loop); and `vpnot`, which simply complements a predicate word and is just syntactic sugar for the `~` operator. Most sizes of `vctp` are lowered to the existing Arm IR intrinsics that MVE shares with NEON. We already had isel patterns for converting those into MVE VCTP, although I've added extra ones to generate the predicated version. But I've made a special MVE version of the 64-bit VCTP IR intrinsic, because the NEON version of that is defined to return a `v2i1`, whereas in MVE that's not a legal type. So the MVE version does the usual workaround of using `v4i1` instead. I needed one small tweak in MveEmitter to allow the `unpromoted` type modifier to apply to predicates as well as integers, so that `vpnot` doesn't pointlessly convert its input integer to an `<n x i1>` before complementing it. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D70485 Files: clang/include/clang/Basic/arm_mve.td clang/include/clang/Basic/arm_mve_defs.td clang/test/CodeGen/arm-mve-intrinsics/predicates.c clang/utils/TableGen/MveEmitter.cpp llvm/include/llvm/IR/IntrinsicsARM.td llvm/lib/Target/ARM/ARMInstrMVE.td llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll
Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll @@ -0,0 +1,219 @@ +; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - | FileCheck %s + +declare <16 x i1> @llvm.arm.vctp8(i32) +declare <8 x i1> @llvm.arm.vctp16(i32) +declare <4 x i1> @llvm.arm.vctp32(i32) +declare <4 x i1> @llvm.arm.mve.vctp64(i32) + +declare i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1>) +declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>) +declare i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1>) + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) + +define arm_aapcs_vfpcc zeroext i16 @test_vctp8q(i32 %a) { +; CHECK-LABEL: test_vctp8q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.8 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = call <16 x i1> @llvm.arm.vctp8(i32 %a) + %1 = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> %0) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp8q_m(i32 %a, i16 zeroext %p) { +; CHECK-LABEL: test_vctp8q_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vctpt.8 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i1> @llvm.arm.vctp8(i32 %a) + %3 = and <16 x i1> %1, %2 + %4 = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> %3) + %5 = trunc i32 %4 to i16 + ret i16 %5 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp16q(i32 %a) { +; CHECK-LABEL: test_vctp16q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.16 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i1> @llvm.arm.vctp16(i32 %a) + %1 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %0) + %2 
= trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp16q_m(i32 %a, i16 zeroext %p) { +; CHECK-LABEL: test_vctp16q_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vctpt.16 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i1> @llvm.arm.vctp16(i32 %a) + %3 = and <8 x i1> %1, %2 + %4 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %3) + %5 = trunc i32 %4 to i16 + ret i16 %5 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp32q(i32 %a) { +; CHECK-LABEL: test_vctp32q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.32 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i1> @llvm.arm.vctp32(i32 %a) + %1 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %0) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp32q_m(i32 %a, i16 zeroext %p) { +; CHECK-LABEL: test_vctp32q_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vctpt.32 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i1> @llvm.arm.vctp32(i32 %a) + %3 = and <4 x i1> %1, %2 + %4 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %3) + %5 = trunc i32 %4 to i16 + ret i16 %5 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp64q(i32 %a) { +; CHECK-LABEL: test_vctp64q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.64 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a) + %1 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %0) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp64q_m(i32 %a, i16 zeroext %p) { +; CHECK-LABEL: test_vctp64q_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: 
vctpt.64 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a) + %3 = and <4 x i1> %1, %2 + %4 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %3) + %5 = trunc i32 %4 to i16 + ret i16 %5 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vpselq_i8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = select <16 x i1> %1, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vpselq_i16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = select <8 x i1> %1, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <8 x half> @test_vpselq_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = select <8 x i1> %1, <8 x half> %a, <8 x half> %b + ret <8 x half> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vpselq_i32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = select <4 x i1> %1, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %2 +} + +define 
arm_aapcs_vfpcc <4 x float> @test_vpselq_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = select <4 x i1> %1, <4 x float> %a, <4 x float> %b + ret <4 x float> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vpselq_i64(<2 x i64> %a, <2 x i64> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = bitcast <2 x i64> %a to <4 x i32> + %3 = bitcast <2 x i64> %b to <4 x i32> + %4 = select <4 x i1> %1, <4 x i32> %2, <4 x i32> %3 + %5 = bitcast <4 x i32> %4 to <2 x i64> + ret <2 x i64> %5 +} Index: llvm/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrMVE.td +++ llvm/lib/Target/ARM/ARMInstrMVE.td @@ -4199,20 +4199,21 @@ let validForTailPredication = 1; } -def MVE_VCTP8 : MVE_VCTP<"8", 0b00>; -def MVE_VCTP16 : MVE_VCTP<"16", 0b01>; -def MVE_VCTP32 : MVE_VCTP<"32", 0b10>; -def MVE_VCTP64 : MVE_VCTP<"64", 0b11>; +multiclass MVE_VCTP_and_patterns<MVEVectorVTInfo VTI, Intrinsic intr> { + def "": MVE_VCTP<VTI.BitsSuffix, VTI.Size>; -let Predicates = [HasMVEInt] in { - def : Pat<(int_arm_vctp8 rGPR:$Rn), - (v16i1 (MVE_VCTP8 rGPR:$Rn))>; - def : Pat<(int_arm_vctp16 rGPR:$Rn), - (v8i1 (MVE_VCTP16 rGPR:$Rn))>; - def : Pat<(int_arm_vctp32 rGPR:$Rn), - (v4i1 (MVE_VCTP32 rGPR:$Rn))>; + let Predicates = [HasMVEInt] in { + def : Pat<(intr rGPR:$Rn), (VTI.Pred (!cast<Instruction>(NAME) rGPR:$Rn))>; + def : Pat<(and (intr rGPR:$Rn), (VTI.Pred VCCR:$mask)), + (VTI.Pred (!cast<Instruction>(NAME) rGPR:$Rn, 1, VCCR:$mask))>; + } } +defm MVE_VCTP8 : 
MVE_VCTP_and_patterns<MVE_v16i8, int_arm_vctp8>; +defm MVE_VCTP16 : MVE_VCTP_and_patterns<MVE_v8i16, int_arm_vctp16>; +defm MVE_VCTP32 : MVE_VCTP_and_patterns<MVE_v4i32, int_arm_vctp32>; +defm MVE_VCTP64 : MVE_VCTP_and_patterns<MVE_v2i64, int_arm_mve_vctp64>; + // end of mve_qDest_rSrc // start of coproc mov Index: llvm/include/llvm/IR/IntrinsicsARM.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsARM.td +++ llvm/include/llvm/IR/IntrinsicsARM.td @@ -782,6 +782,9 @@ def int_arm_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_vctp64 : Intrinsic<[llvm_v2i1_ty], [llvm_i32_ty], [IntrNoMem]>; +// MVE version of vctp64, working around v2i1 not being a legal MVE type +def int_arm_mve_vctp64 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>; + // GNU eabi mcount def int_arm_gnu_eabi_mcount : Intrinsic<[], [], Index: clang/utils/TableGen/MveEmitter.cpp =================================================================== --- clang/utils/TableGen/MveEmitter.cpp +++ clang/utils/TableGen/MveEmitter.cpp @@ -1208,14 +1208,16 @@ Result::Ptr V = std::make_shared<BuiltinArgResult>(ArgNum, isa<PointerType>(ArgType)); - if (const auto *ST = dyn_cast<ScalarType>(ArgType)) { - if (Promote && ST->isInteger() && ST->sizeInBits() < 32) + if (Promote) { + if (const auto *ST = dyn_cast<ScalarType>(ArgType)) { + if (ST->isInteger() && ST->sizeInBits() < 32) + V = std::make_shared<IntCastResult>(getScalarType("u32"), V); + } else if (const auto *PT = dyn_cast<PredicateType>(ArgType)) { V = std::make_shared<IntCastResult>(getScalarType("u32"), V); - } else if (const auto *PT = dyn_cast<PredicateType>(ArgType)) { - V = std::make_shared<IntCastResult>(getScalarType("u32"), V); - V = std::make_shared<IRIntrinsicResult>("arm_mve_pred_i2v", - std::vector<const Type *>{PT}, - std::vector<Result::Ptr>{V}); + V = std::make_shared<IRIntrinsicResult>("arm_mve_pred_i2v", + std::vector<const Type *>{PT}, + 
std::vector<Result::Ptr>{V}); + } } return V; Index: clang/test/CodeGen/arm-mve-intrinsics/predicates.c =================================================================== --- /dev/null +++ clang/test/CodeGen/arm-mve-intrinsics/predicates.c @@ -0,0 +1,290 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa -early-cse | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa -early-cse | FileCheck %s + +#include <arm_mve.h> + +// CHECK-LABEL: @test_vctp16q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i1> @llvm.arm.vctp16(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +mve_pred16_t test_vctp16q(uint32_t a) +{ + return vctp16q(a); +} + +// CHECK-LABEL: @test_vctp16q_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.vctp16(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +// CHECK-NEXT: ret i16 [[TMP5]] +// +mve_pred16_t test_vctp16q_m(uint32_t a, mve_pred16_t p) +{ + return vctp16q_m(a, p); +} + +// CHECK-LABEL: @test_vctp32q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call 
i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +mve_pred16_t test_vctp32q(uint32_t a) +{ + return vctp32q(a); +} + +// CHECK-LABEL: @test_vctp32q_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +// CHECK-NEXT: ret i16 [[TMP5]] +// +mve_pred16_t test_vctp32q_m(uint32_t a, mve_pred16_t p) +{ + return vctp32q_m(a, p); +} + +// CHECK-LABEL: @test_vctp64q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +mve_pred16_t test_vctp64q(uint32_t a) +{ + return vctp64q(a); +} + +// CHECK-LABEL: @test_vctp64q_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +// CHECK-NEXT: ret i16 [[TMP5]] +// +mve_pred16_t test_vctp64q_m(uint32_t a, mve_pred16_t p) +{ + return vctp64q_m(a, p); +} + +// CHECK-LABEL: @test_vctp8q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i1> @llvm.arm.vctp8(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 
@llvm.arm.mve.pred.v2i.v16i1(<16 x i1> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +mve_pred16_t test_vctp8q(uint32_t a) +{ + return vctp8q(a); +} + +// CHECK-LABEL: @test_vctp8q_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.vctp8(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = and <16 x i1> [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +// CHECK-NEXT: ret i16 [[TMP5]] +// +mve_pred16_t test_vctp8q_m(uint32_t a, mve_pred16_t p) +{ + return vctp8q_m(a, p); +} + +// CHECK-LABEL: @test_vpnot( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = xor i16 [[A:%.*]], -1 +// CHECK-NEXT: ret i16 [[TMP0]] +// +mve_pred16_t test_vpnot(mve_pred16_t a) +{ + return vpnot(a); +} + +// CHECK-LABEL: @test_vpselq_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x half> [[A:%.*]], <8 x half> [[B:%.*]] +// CHECK-NEXT: ret <8 x half> [[TMP2]] +// +float16x8_t test_vpselq_f16(float16x8_t a, float16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_f16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]] +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vpselq_f32(float32x4_t a, float32x4_t b, 
mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_f32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]] +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vpselq_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_s16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]] +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vpselq_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_s32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP5]] +// +int64x2_t test_vpselq_s64(int64x2_t a, int64x2_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_s64(a, b, p); +#endif /* POLYMORPHIC */ 
+} + +// CHECK-LABEL: @test_vpselq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]] +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vpselq_s8(int8x16_t a, int8x16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_s8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]] +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vpselq_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_u16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]] +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vpselq_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_u32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B:%.*]] to <4 x i32> 
+// CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP5]] +// +uint64x2_t test_vpselq_u64(uint64x2_t a, uint64x2_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_u64(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]] +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +uint8x16_t test_vpselq_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_u8(a, b, p); +#endif /* POLYMORPHIC */ +} + Index: clang/include/clang/Basic/arm_mve_defs.td =================================================================== --- clang/include/clang/Basic/arm_mve_defs.td +++ clang/include/clang/Basic/arm_mve_defs.td @@ -60,6 +60,7 @@ def add: IRBuilder<"CreateAdd">; def or: IRBuilder<"CreateOr">; def and: IRBuilder<"CreateAnd">; +def xor: IRBuilder<"CreateXor">; def sub: IRBuilder<"CreateSub">; def shl: IRBuilder<"CreateShl">; def lshr: IRBuilder<"CreateLShr">; @@ -103,6 +104,7 @@ def fcmp_lt: IRBuilder<"CreateFCmpOLT">; def fcmp_le: IRBuilder<"CreateFCmpOLE">; def splat: CGHelperFn<"ARMMVEVectorSplat">; +def select: IRBuilder<"CreateSelect">; // A node that makes an Address out of a pointer-typed Value, by // providing an alignment as the second argument. 
Index: clang/include/clang/Basic/arm_mve.td =================================================================== --- clang/include/clang/Basic/arm_mve.td +++ clang/include/clang/Basic/arm_mve.td @@ -117,6 +117,33 @@ defm: compare<"le", fcmp_le>; } +def vpselq: Intrinsic<Vector, (args Vector:$t, Vector:$f, Predicate:$pred), + (select $pred, $t, $f)> { let params = T.Usual; } +def vpselq_64: Intrinsic< + Vector, (args Vector:$t, Vector:$f, PredOf<u32>:$pred), + (bitcast (select $pred, (bitcast $t, VecOf<u32>), + (bitcast $f, VecOf<u32>)), Vector)>, + NameOverride<"vpselq"> { let params = T.All64; } + +let params = [Void], pnt = PNT_None in { + + multiclass vctp<Type pred, string intname> { + def "": Intrinsic<pred, (args u32:$val), + (u16 (IRInt<"pred_v2i", [pred]> (IRIntBase<intname> $val)))>; + def _m: Intrinsic<pred, (args u32:$val, pred:$inpred), + (u16 (IRInt<"pred_v2i", [pred]> (and $inpred, + (IRIntBase<intname> $val))))>; + } + defm vctp8q: vctp<PredOf<u8>, "arm_vctp8">; + defm vctp16q: vctp<PredOf<u16>, "arm_vctp16">; + defm vctp32q: vctp<PredOf<u32>, "arm_vctp32">; + defm vctp64q: vctp<PredOf<u64>, "arm_mve_vctp64">; + + def vpnot: Intrinsic<PredOf<u8>, (args unpromoted<PredOf<u8>>:$pred), + (xor $pred, (u16 65535))>; + +} + multiclass contiguous_load<string mnemonic, PrimitiveType memtype, list<Type> same_size, list<Type> wider> { // Intrinsics named with explicit memory and element sizes that match:
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits