simon_tatham created this revision.
simon_tatham added reviewers: ostannard, MarkMurrayARM, dmgreen.
Herald added subscribers: llvm-commits, cfe-commits, hiraditya, kristof.beyls.
Herald added projects: clang, LLVM.

This commit adds the `vpselq` intrinsics which take an MVE predicate
word and select lanes from two vectors; the `vctp` intrinsics which
create a tail predicate word suitable for processing the first m
elements of a vector (e.g. in the last iteration of a loop); and
`vpnot`, which simply complements a predicate word and is just
syntactic sugar for the `~` operator.
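As a quick illustration of how these fit together (a sketch only, not part of the patch; it assumes the usual `<arm_mve.h>` declarations), the tail-predication use case looks like this at the source level:

```
#include <arm_mve.h>

/* Handle the final, partial group of `remaining` int32 elements: keep the
 * freshly computed values in the active lanes and the original values in
 * the lanes beyond the end of the data. In the last loop iteration
 * `remaining` would typically be less than 4. */
int32x4_t select_tail(int32x4_t computed, int32x4_t original,
                      uint32_t remaining)
{
    mve_pred16_t p = vctp32q(remaining); /* lanes 0..remaining-1 active */
    return vpselq_s32(computed, original, p);
}
```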

Most sizes of `vctp` are lowered to the existing `llvm.arm.vctp*` IR
intrinsics that the ARM backend already uses for MVE tail predication.
We already had isel patterns for converting those into MVE VCTP, although
I've added extra ones to generate the predicated version. But I've made a
special MVE version of the 64-bit VCTP IR intrinsic, because the existing
one is defined to return a `v2i1`, whereas in MVE that's not a legal type.
So the MVE version does the usual workaround of using `v4i1` instead.
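At the source level the `v2i1`-vs-`v4i1` detail stays invisible: `vctp64q` still returns an `mve_pred16_t`, and it composes with the 64-bit `vpselq` overload, which internally bitcasts its operands through `<4 x i32>` (see the clang test below). A hedged sketch, not taken from the patch:

```
#include <arm_mve.h>

/* 64-bit variant of the tail-selection idiom; under the hood each 64-bit
 * element ends up controlled by a pair of 32-bit predicate lanes. */
uint64x2_t select_tail_64(uint64x2_t computed, uint64x2_t original,
                          uint32_t remaining)
{
    mve_pred16_t p = vctp64q(remaining);
    return vpselq_u64(computed, original, p);
}
```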

I needed one small tweak in MveEmitter to allow the `unpromoted` type
modifier to apply to predicates as well as integers, so that `vpnot`
doesn't pointlessly convert its input integer to an `<n x i1>` before
complementing it.
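For completeness, a one-line sketch (again not from the patch) showing that `vpnot` really is just the complement of the 16-bit predicate word, as the generated IR in the clang test below confirms:

```
#include <arm_mve.h>

/* vpnot(p) is equivalent to (mve_pred16_t)~p; in IR it is a plain xor. */
mve_pred16_t invert_predicate(mve_pred16_t p)
{
    return vpnot(p);
}
```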


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D70485

Files:
  clang/include/clang/Basic/arm_mve.td
  clang/include/clang/Basic/arm_mve_defs.td
  clang/test/CodeGen/arm-mve-intrinsics/predicates.c
  clang/utils/TableGen/MveEmitter.cpp
  llvm/include/llvm/IR/IntrinsicsARM.td
  llvm/lib/Target/ARM/ARMInstrMVE.td
  llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll

Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll
@@ -0,0 +1,219 @@
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+declare <16 x i1> @llvm.arm.vctp8(i32)
+declare <8 x i1> @llvm.arm.vctp16(i32)
+declare <4 x i1> @llvm.arm.vctp32(i32)
+declare <4 x i1> @llvm.arm.mve.vctp64(i32)
+
+declare i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1>)
+declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>)
+declare i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1>)
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp8q(i32 %a) {
+; CHECK-LABEL: test_vctp8q:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vctp.8 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <16 x i1> @llvm.arm.vctp8(i32 %a)
+  %1 = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> %0)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp8q_m(i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vctp8q_m:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vctpt.8 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call <16 x i1> @llvm.arm.vctp8(i32 %a)
+  %3 = and <16 x i1> %1, %2
+  %4 = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> %3)
+  %5 = trunc i32 %4 to i16
+  ret i16 %5
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp16q(i32 %a) {
+; CHECK-LABEL: test_vctp16q:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vctp.16 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <8 x i1> @llvm.arm.vctp16(i32 %a)
+  %1 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %0)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp16q_m(i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vctp16q_m:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vctpt.16 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call <8 x i1> @llvm.arm.vctp16(i32 %a)
+  %3 = and <8 x i1> %1, %2
+  %4 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %3)
+  %5 = trunc i32 %4 to i16
+  ret i16 %5
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp32q(i32 %a) {
+; CHECK-LABEL: test_vctp32q:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vctp.32 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <4 x i1> @llvm.arm.vctp32(i32 %a)
+  %1 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %0)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp32q_m(i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vctp32q_m:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vctpt.32 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call <4 x i1> @llvm.arm.vctp32(i32 %a)
+  %3 = and <4 x i1> %1, %2
+  %4 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %3)
+  %5 = trunc i32 %4 to i16
+  ret i16 %5
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp64q(i32 %a) {
+; CHECK-LABEL: test_vctp64q:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vctp.64 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a)
+  %1 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %0)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp64q_m(i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vctp64q_m:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vctpt.64 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a)
+  %3 = and <4 x i1> %1, %2
+  %4 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %3)
+  %5 = trunc i32 %4 to i16
+  ret i16 %5
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vpselq_i8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = select <16 x i1> %1, <16 x i8> %a, <16 x i8> %b
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vpselq_i16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = select <8 x i1> %1, <8 x i16> %a, <8 x i16> %b
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vpselq_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = select <8 x i1> %1, <8 x half> %a, <8 x half> %b
+  ret <8 x half> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vpselq_i32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = select <4 x i1> %1, <4 x i32> %a, <4 x i32> %b
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vpselq_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = select <4 x i1> %1, <4 x float> %a, <4 x float> %b
+  ret <4 x float> %2
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vpselq_i64(<2 x i64> %a, <2 x i64> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_i64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = bitcast <2 x i64> %a to <4 x i32>
+  %3 = bitcast <2 x i64> %b to <4 x i32>
+  %4 = select <4 x i1> %1, <4 x i32> %2, <4 x i32> %3
+  %5 = bitcast <4 x i32> %4 to <2 x i64>
+  ret <2 x i64> %5
+}
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4199,20 +4199,21 @@
   let validForTailPredication = 1;
 }
 
-def MVE_VCTP8  : MVE_VCTP<"8",  0b00>;
-def MVE_VCTP16 : MVE_VCTP<"16", 0b01>;
-def MVE_VCTP32 : MVE_VCTP<"32", 0b10>;
-def MVE_VCTP64 : MVE_VCTP<"64", 0b11>;
+multiclass MVE_VCTP_and_patterns<MVEVectorVTInfo VTI, Intrinsic intr> {
+  def "": MVE_VCTP<VTI.BitsSuffix, VTI.Size>;
 
-let Predicates = [HasMVEInt] in {
-  def : Pat<(int_arm_vctp8 rGPR:$Rn),
-            (v16i1 (MVE_VCTP8 rGPR:$Rn))>;
-  def : Pat<(int_arm_vctp16 rGPR:$Rn),
-            (v8i1 (MVE_VCTP16 rGPR:$Rn))>;
-  def : Pat<(int_arm_vctp32 rGPR:$Rn),
-            (v4i1 (MVE_VCTP32 rGPR:$Rn))>;
+  let Predicates = [HasMVEInt] in {
+    def : Pat<(intr rGPR:$Rn), (VTI.Pred (!cast<Instruction>(NAME) rGPR:$Rn))>;
+    def : Pat<(and (intr rGPR:$Rn), (VTI.Pred VCCR:$mask)),
+              (VTI.Pred (!cast<Instruction>(NAME) rGPR:$Rn, 1, VCCR:$mask))>;
+  }
 }
 
+defm MVE_VCTP8  : MVE_VCTP_and_patterns<MVE_v16i8, int_arm_vctp8>;
+defm MVE_VCTP16 : MVE_VCTP_and_patterns<MVE_v8i16, int_arm_vctp16>;
+defm MVE_VCTP32 : MVE_VCTP_and_patterns<MVE_v4i32, int_arm_vctp32>;
+defm MVE_VCTP64 : MVE_VCTP_and_patterns<MVE_v2i64, int_arm_mve_vctp64>;
+
 // end of mve_qDest_rSrc
 
 // start of coproc mov
Index: llvm/include/llvm/IR/IntrinsicsARM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsARM.td
+++ llvm/include/llvm/IR/IntrinsicsARM.td
@@ -782,6 +782,9 @@
 def int_arm_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>;
 def int_arm_vctp64 : Intrinsic<[llvm_v2i1_ty], [llvm_i32_ty], [IntrNoMem]>;
 
+// MVE version of vctp64, working around v2i1 not being a legal MVE type
+def int_arm_mve_vctp64 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>;
+
 // GNU eabi mcount
 def int_arm_gnu_eabi_mcount : Intrinsic<[],
                                     [],
Index: clang/utils/TableGen/MveEmitter.cpp
===================================================================
--- clang/utils/TableGen/MveEmitter.cpp
+++ clang/utils/TableGen/MveEmitter.cpp
@@ -1208,14 +1208,16 @@
   Result::Ptr V =
       std::make_shared<BuiltinArgResult>(ArgNum, isa<PointerType>(ArgType));
 
-  if (const auto *ST = dyn_cast<ScalarType>(ArgType)) {
-    if (Promote && ST->isInteger() && ST->sizeInBits() < 32)
+  if (Promote) {
+    if (const auto *ST = dyn_cast<ScalarType>(ArgType)) {
+      if (ST->isInteger() && ST->sizeInBits() < 32)
+        V = std::make_shared<IntCastResult>(getScalarType("u32"), V);
+    } else if (const auto *PT = dyn_cast<PredicateType>(ArgType)) {
       V = std::make_shared<IntCastResult>(getScalarType("u32"), V);
-  } else if (const auto *PT = dyn_cast<PredicateType>(ArgType)) {
-    V = std::make_shared<IntCastResult>(getScalarType("u32"), V);
-    V = std::make_shared<IRIntrinsicResult>("arm_mve_pred_i2v",
-                                            std::vector<const Type *>{PT},
-                                            std::vector<Result::Ptr>{V});
+      V = std::make_shared<IRIntrinsicResult>("arm_mve_pred_i2v",
+                                              std::vector<const Type *>{PT},
+                                              std::vector<Result::Ptr>{V});
+    }
   }
 
   return V;
Index: clang/test/CodeGen/arm-mve-intrinsics/predicates.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/arm-mve-intrinsics/predicates.c
@@ -0,0 +1,290 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa -early-cse | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa -early-cse | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vctp16q(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i1> @llvm.arm.vctp16(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
+mve_pred16_t test_vctp16q(uint32_t a)
+{
+    return vctp16q(a);
+}
+
+// CHECK-LABEL: @test_vctp16q_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.vctp16(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT:    ret i16 [[TMP5]]
+//
+mve_pred16_t test_vctp16q_m(uint32_t a, mve_pred16_t p)
+{
+    return vctp16q_m(a, p);
+}
+
+// CHECK-LABEL: @test_vctp32q(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
+mve_pred16_t test_vctp32q(uint32_t a)
+{
+    return vctp32q(a);
+}
+
+// CHECK-LABEL: @test_vctp32q_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT:    ret i16 [[TMP5]]
+//
+mve_pred16_t test_vctp32q_m(uint32_t a, mve_pred16_t p)
+{
+    return vctp32q_m(a, p);
+}
+
+// CHECK-LABEL: @test_vctp64q(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
+mve_pred16_t test_vctp64q(uint32_t a)
+{
+    return vctp64q(a);
+}
+
+// CHECK-LABEL: @test_vctp64q_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT:    ret i16 [[TMP5]]
+//
+mve_pred16_t test_vctp64q_m(uint32_t a, mve_pred16_t p)
+{
+    return vctp64q_m(a, p);
+}
+
+// CHECK-LABEL: @test_vctp8q(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i1> @llvm.arm.vctp8(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
+mve_pred16_t test_vctp8q(uint32_t a)
+{
+    return vctp8q(a);
+}
+
+// CHECK-LABEL: @test_vctp8q_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.vctp8(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = and <16 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT:    ret i16 [[TMP5]]
+//
+mve_pred16_t test_vctp8q_m(uint32_t a, mve_pred16_t p)
+{
+    return vctp8q_m(a, p);
+}
+
+// CHECK-LABEL: @test_vpnot(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = xor i16 [[A:%.*]], -1
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
+mve_pred16_t test_vpnot(mve_pred16_t a)
+{
+    return vpnot(a);
+}
+
+// CHECK-LABEL: @test_vpselq_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x half> [[A:%.*]], <8 x half> [[B:%.*]]
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vpselq_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_f16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vpselq_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_f32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vpselq_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vpselq_s32(int32x4_t a, int32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_s32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]]
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP5]]
+//
+int64x2_t test_vpselq_s64(int64x2_t a, int64x2_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_s64(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vpselq_s8(int8x16_t a, int8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_s8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vpselq_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_u16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vpselq_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]]
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP5]]
+//
+uint64x2_t test_vpselq_u64(uint64x2_t a, uint64x2_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_u64(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vpselq_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
Index: clang/include/clang/Basic/arm_mve_defs.td
===================================================================
--- clang/include/clang/Basic/arm_mve_defs.td
+++ clang/include/clang/Basic/arm_mve_defs.td
@@ -60,6 +60,7 @@
 def add: IRBuilder<"CreateAdd">;
 def or: IRBuilder<"CreateOr">;
 def and: IRBuilder<"CreateAnd">;
+def xor: IRBuilder<"CreateXor">;
 def sub: IRBuilder<"CreateSub">;
 def shl: IRBuilder<"CreateShl">;
 def lshr: IRBuilder<"CreateLShr">;
@@ -103,6 +104,7 @@
 def fcmp_lt: IRBuilder<"CreateFCmpOLT">;
 def fcmp_le: IRBuilder<"CreateFCmpOLE">;
 def splat: CGHelperFn<"ARMMVEVectorSplat">;
+def select: IRBuilder<"CreateSelect">;
 
 // A node that makes an Address out of a pointer-typed Value, by
 // providing an alignment as the second argument.
Index: clang/include/clang/Basic/arm_mve.td
===================================================================
--- clang/include/clang/Basic/arm_mve.td
+++ clang/include/clang/Basic/arm_mve.td
@@ -117,6 +117,33 @@
   defm: compare<"le", fcmp_le>;
 }
 
+def vpselq: Intrinsic<Vector, (args Vector:$t, Vector:$f, Predicate:$pred),
+                      (select $pred, $t, $f)> { let params = T.Usual; }
+def vpselq_64: Intrinsic<
+    Vector, (args Vector:$t, Vector:$f, PredOf<u32>:$pred),
+            (bitcast (select $pred, (bitcast $t, VecOf<u32>),
+                                    (bitcast $f, VecOf<u32>)), Vector)>,
+    NameOverride<"vpselq"> { let params = T.All64; }
+
+let params = [Void], pnt = PNT_None in {
+
+  multiclass vctp<Type pred, string intname> {
+    def "": Intrinsic<pred, (args u32:$val),
+        (u16 (IRInt<"pred_v2i", [pred]> (IRIntBase<intname> $val)))>;
+    def _m: Intrinsic<pred, (args u32:$val, pred:$inpred),
+        (u16 (IRInt<"pred_v2i", [pred]> (and $inpred,
+                                         (IRIntBase<intname> $val))))>;
+  }
+  defm vctp8q:  vctp<PredOf<u8>,  "arm_vctp8">;
+  defm vctp16q: vctp<PredOf<u16>, "arm_vctp16">;
+  defm vctp32q: vctp<PredOf<u32>, "arm_vctp32">;
+  defm vctp64q: vctp<PredOf<u64>, "arm_mve_vctp64">;
+
+  def vpnot: Intrinsic<PredOf<u8>, (args unpromoted<PredOf<u8>>:$pred),
+                       (xor $pred, (u16 65535))>;
+
+}
+
 multiclass contiguous_load<string mnemonic, PrimitiveType memtype,
                            list<Type> same_size, list<Type> wider> {
   // Intrinsics named with explicit memory and element sizes that match: