This revision was automatically updated to reflect the committed changes.
Closed by commit rGd173fb5d2854: [ARM,MVE] Add intrinsics to deal with predicates. (authored by simon_tatham).

Changed prior to commit:
  https://reviews.llvm.org/D70485?vs=230627&id=231718#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D70485/new/

https://reviews.llvm.org/D70485
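
For readers skimming the diff: a minimal usage sketch of the new user-facing
intrinsics, with signatures as exercised by the tests below. The function
names here are illustrative, not part of the patch.

  #include <arm_mve.h>

  /* Blend the first n lanes of a over b: VCTP.32 makes the first
     min(n, 4) 32-bit lanes active, and vpselq selects a in the active
     lanes and b elsewhere. */
  float32x4_t blend_first_n(float32x4_t a, float32x4_t b, uint32_t n)
  {
      mve_pred16_t p = vctp32q(n);
      return vpselq_f32(a, b, p);
  }

  /* Invert a predicate; per the arm_mve.td definition below, this is a
     single 16-bit xor with 0xffff. */
  mve_pred16_t invert(mve_pred16_t p)
  {
      return vpnot(p);
  }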

Files:
  clang/include/clang/Basic/arm_mve.td
  clang/test/CodeGen/arm-mve-intrinsics/predicates.c
  clang/utils/TableGen/MveEmitter.cpp
  llvm/lib/Target/ARM/ARMInstrMVE.td
  llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll

Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll
@@ -0,0 +1,219 @@
+; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - | FileCheck %s
+
+declare <16 x i1> @llvm.arm.mve.vctp8(i32)
+declare <8 x i1> @llvm.arm.mve.vctp16(i32)
+declare <4 x i1> @llvm.arm.mve.vctp32(i32)
+declare <4 x i1> @llvm.arm.mve.vctp64(i32)
+
+declare i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1>)
+declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>)
+declare i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1>)
+
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp8q(i32 %a) {
+; CHECK-LABEL: test_vctp8q:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vctp.8 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %a)
+  %1 = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> %0)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp8q_m(i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vctp8q_m:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vctpt.8 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %a)
+  %3 = and <16 x i1> %1, %2
+  %4 = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> %3)
+  %5 = trunc i32 %4 to i16
+  ret i16 %5
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp16q(i32 %a) {
+; CHECK-LABEL: test_vctp16q:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vctp.16 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %a)
+  %1 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %0)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp16q_m(i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vctp16q_m:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vctpt.16 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %a)
+  %3 = and <8 x i1> %1, %2
+  %4 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %3)
+  %5 = trunc i32 %4 to i16
+  ret i16 %5
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp32q(i32 %a) {
+; CHECK-LABEL: test_vctp32q:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vctp.32 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %a)
+  %1 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %0)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp32q_m(i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vctp32q_m:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vctpt.32 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %a)
+  %3 = and <4 x i1> %1, %2
+  %4 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %3)
+  %5 = trunc i32 %4 to i16
+  ret i16 %5
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp64q(i32 %a) {
+; CHECK-LABEL: test_vctp64q:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vctp.64 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a)
+  %1 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %0)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vctp64q_m(i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vctp64q_m:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vctpt.64 r0
+; CHECK-NEXT:    vmrs r0, p0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a)
+  %3 = and <4 x i1> %1, %2
+  %4 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %3)
+  %5 = trunc i32 %4 to i16
+  ret i16 %5
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vpselq_i8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = select <16 x i1> %1, <16 x i8> %a, <16 x i8> %b
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vpselq_i16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = select <8 x i1> %1, <8 x i16> %a, <8 x i16> %b
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <8 x half> @test_vpselq_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = select <8 x i1> %1, <8 x half> %a, <8 x half> %b
+  ret <8 x half> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vpselq_i32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = select <4 x i1> %1, <4 x i32> %a, <4 x i32> %b
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vpselq_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = select <4 x i1> %1, <4 x float> %a, <4 x float> %b
+  ret <4 x float> %2
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vpselq_i64(<2 x i64> %a, <2 x i64> %b, i16 zeroext %p) #2 {
+; CHECK-LABEL: test_vpselq_i64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = bitcast <2 x i64> %a to <4 x i32>
+  %3 = bitcast <2 x i64> %b to <4 x i32>
+  %4 = select <4 x i1> %1, <4 x i32> %2, <4 x i32> %3
+  %5 = bitcast <4 x i32> %4 to <2 x i64>
+  ret <2 x i64> %5
+}
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4267,7 +4267,7 @@
 def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>;
 
 let hasSideEffects = 1 in
-class MVE_VCTP<string suffix, bits<2> size, list<dag> pattern=[]>
+class MVE_VCTPInst<string suffix, bits<2> size, list<dag> pattern=[]>
   : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix,
           "$Rn", vpred_n, "", pattern> {
   bits<4> Rn;
@@ -4285,20 +4285,22 @@
   let validForTailPredication = 1;
 }
 
-def MVE_VCTP8  : MVE_VCTP<"8",  0b00>;
-def MVE_VCTP16 : MVE_VCTP<"16", 0b01>;
-def MVE_VCTP32 : MVE_VCTP<"32", 0b10>;
-def MVE_VCTP64 : MVE_VCTP<"64", 0b11>;
+multiclass MVE_VCTP<MVEVectorVTInfo VTI, Intrinsic intr> {
+  def "": MVE_VCTPInst<VTI.BitsSuffix, VTI.Size>;
 
-let Predicates = [HasMVEInt] in {
-  def : Pat<(int_arm_mve_vctp8 rGPR:$Rn),
-            (v16i1 (MVE_VCTP8 rGPR:$Rn))>;
-  def : Pat<(int_arm_mve_vctp16 rGPR:$Rn),
-            (v8i1 (MVE_VCTP16 rGPR:$Rn))>;
-  def : Pat<(int_arm_mve_vctp32 rGPR:$Rn),
-            (v4i1 (MVE_VCTP32 rGPR:$Rn))>;
+  let Predicates = [HasMVEInt] in {
+    def : Pat<(intr rGPR:$Rn),
+              (VTI.Pred (!cast<Instruction>(NAME) rGPR:$Rn))>;
+    def : Pat<(and (intr rGPR:$Rn), (VTI.Pred VCCR:$mask)),
+              (VTI.Pred (!cast<Instruction>(NAME) rGPR:$Rn, 1, VCCR:$mask))>;
+  }
 }
 
+defm MVE_VCTP8  : MVE_VCTP<MVE_v16i8, int_arm_mve_vctp8>;
+defm MVE_VCTP16 : MVE_VCTP<MVE_v8i16, int_arm_mve_vctp16>;
+defm MVE_VCTP32 : MVE_VCTP<MVE_v4i32, int_arm_mve_vctp32>;
+defm MVE_VCTP64 : MVE_VCTP<MVE_v2i64, int_arm_mve_vctp64>;
+
 // end of mve_qDest_rSrc
 
 // start of coproc mov
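
For reference, the new MVE_VCTP multiclass expands, for the 8-bit case, to
the equivalent of the following (suffix "8", size 0b00 and predicate type
v16i1 are taken from the definitions replaced above; the second pattern
selects the VPT-predicated form, matching the vctpt.8 checks in the test):

  def MVE_VCTP8 : MVE_VCTPInst<"8", 0b00>;
  def : Pat<(int_arm_mve_vctp8 rGPR:$Rn),
            (v16i1 (MVE_VCTP8 rGPR:$Rn))>;
  def : Pat<(and (int_arm_mve_vctp8 rGPR:$Rn), (v16i1 VCCR:$mask)),
            (v16i1 (MVE_VCTP8 rGPR:$Rn, 1, VCCR:$mask))>;
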
Index: clang/utils/TableGen/MveEmitter.cpp
===================================================================
--- clang/utils/TableGen/MveEmitter.cpp
+++ clang/utils/TableGen/MveEmitter.cpp
@@ -1208,14 +1208,16 @@
   Result::Ptr V =
       std::make_shared<BuiltinArgResult>(ArgNum, isa<PointerType>(ArgType));
 
-  if (const auto *ST = dyn_cast<ScalarType>(ArgType)) {
-    if (Promote && ST->isInteger() && ST->sizeInBits() < 32)
+  if (Promote) {
+    if (const auto *ST = dyn_cast<ScalarType>(ArgType)) {
+      if (ST->isInteger() && ST->sizeInBits() < 32)
+        V = std::make_shared<IntCastResult>(getScalarType("u32"), V);
+    } else if (const auto *PT = dyn_cast<PredicateType>(ArgType)) {
       V = std::make_shared<IntCastResult>(getScalarType("u32"), V);
-  } else if (const auto *PT = dyn_cast<PredicateType>(ArgType)) {
-    V = std::make_shared<IntCastResult>(getScalarType("u32"), V);
-    V = std::make_shared<IRIntrinsicResult>("arm_mve_pred_i2v",
-                                            std::vector<const Type *>{PT},
-                                            std::vector<Result::Ptr>{V});
+      V = std::make_shared<IRIntrinsicResult>("arm_mve_pred_i2v",
+                                              std::vector<const Type *>{PT},
+                                              std::vector<Result::Ptr>{V});
+    }
   }
 
   return V;
Index: clang/test/CodeGen/arm-mve-intrinsics/predicates.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/arm-mve-intrinsics/predicates.c
@@ -0,0 +1,290 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa -early-cse | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa -early-cse | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vctp16q(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
+mve_pred16_t test_vctp16q(uint32_t a)
+{
+    return vctp16q(a);
+}
+
+// CHECK-LABEL: @test_vctp16q_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT:    ret i16 [[TMP5]]
+//
+mve_pred16_t test_vctp16q_m(uint32_t a, mve_pred16_t p)
+{
+    return vctp16q_m(a, p);
+}
+
+// CHECK-LABEL: @test_vctp32q(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
+mve_pred16_t test_vctp32q(uint32_t a)
+{
+    return vctp32q(a);
+}
+
+// CHECK-LABEL: @test_vctp32q_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT:    ret i16 [[TMP5]]
+//
+mve_pred16_t test_vctp32q_m(uint32_t a, mve_pred16_t p)
+{
+    return vctp32q_m(a, p);
+}
+
+// CHECK-LABEL: @test_vctp64q(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
+mve_pred16_t test_vctp64q(uint32_t a)
+{
+    return vctp64q(a);
+}
+
+// CHECK-LABEL: @test_vctp64q_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT:    ret i16 [[TMP5]]
+//
+mve_pred16_t test_vctp64q_m(uint32_t a, mve_pred16_t p)
+{
+    return vctp64q_m(a, p);
+}
+
+// CHECK-LABEL: @test_vctp8q(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
+mve_pred16_t test_vctp8q(uint32_t a)
+{
+    return vctp8q(a);
+}
+
+// CHECK-LABEL: @test_vctp8q_m(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = and <16 x i1> [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+// CHECK-NEXT:    ret i16 [[TMP5]]
+//
+mve_pred16_t test_vctp8q_m(uint32_t a, mve_pred16_t p)
+{
+    return vctp8q_m(a, p);
+}
+
+// CHECK-LABEL: @test_vpnot(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = xor i16 [[A:%.*]], -1
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
+mve_pred16_t test_vpnot(mve_pred16_t a)
+{
+    return vpnot(a);
+}
+
+// CHECK-LABEL: @test_vpselq_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x half> [[A:%.*]], <8 x half> [[B:%.*]]
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vpselq_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_f16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vpselq_f32(float32x4_t a, float32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_f32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vpselq_s16(int16x8_t a, int16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vpselq_s32(int32x4_t a, int32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_s32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]]
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP5]]
+//
+int64x2_t test_vpselq_s64(int64x2_t a, int64x2_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_s64(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vpselq_s8(int8x16_t a, int8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_s8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vpselq_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_u16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vpselq_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]]
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP5]]
+//
+uint64x2_t test_vpselq_u64(uint64x2_t a, uint64x2_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_u64(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vpselq_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vpselq_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vpselq(a, b, p);
+#else /* POLYMORPHIC */
+    return vpselq_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
Index: clang/include/clang/Basic/arm_mve.td
===================================================================
--- clang/include/clang/Basic/arm_mve.td
+++ clang/include/clang/Basic/arm_mve.td
@@ -214,6 +214,32 @@
                                (IRIntBase<"maxnum", [Vector]> $a, $b)>;
 }
 
+def vpselq: Intrinsic<Vector, (args Vector:$t, Vector:$f, Predicate:$pred),
+                      (select $pred, $t, $f)> { let params = T.Usual; }
+def vpselq_64: Intrinsic<
+    Vector, (args Vector:$t, Vector:$f, PredOf<u32>:$pred),
+            (bitcast (select $pred, (bitcast $t, VecOf<u32>),
+                                    (bitcast $f, VecOf<u32>)), Vector)>,
+    NameOverride<"vpselq"> { let params = T.All64; }
+
+let params = [Void], pnt = PNT_None in {
+
+  multiclass vctp<Type pred, string intname> {
+    def "": Intrinsic<pred, (args u32:$val),
+        (u16 (IRInt<"pred_v2i", [pred]> (IRIntBase<intname> $val)))>;
+    def _m: Intrinsic<pred, (args u32:$val, pred:$inpred),
+        (u16 (IRInt<"pred_v2i", [pred]> (and $inpred,
+                                         (IRIntBase<intname> $val))))>;
+  }
+  defm vctp8q:  vctp<PredOf<u8>,  "arm_mve_vctp8">;
+  defm vctp16q: vctp<PredOf<u16>, "arm_mve_vctp16">;
+  defm vctp32q: vctp<PredOf<u32>, "arm_mve_vctp32">;
+  defm vctp64q: vctp<PredOf<u64>, "arm_mve_vctp64">;
+
+  def vpnot: Intrinsic<PredOf<u8>, (args unpromoted<PredOf<u8>>:$pred),
+                       (xor $pred, (u16 65535))>;
+
+}
 
 multiclass contiguous_load<string mnemonic, PrimitiveType memtype,
                            list<Type> same_size, list<Type> wider> {
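
One subtlety in the records above: vpselq_64 routes through <4 x i32>
bitcasts because the 64-bit element types have no predicate type of their
own; 64-bit predication is modelled with <4 x i1>, as the vctp64
declarations in the tests show. Per the test_vpselq_s64 checks, the
generated IR is approximately (value names illustrative):

  %v   = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %p32)
  %ta  = bitcast <2 x i64> %a to <4 x i32>
  %tb  = bitcast <2 x i64> %b to <4 x i32>
  %sel = select <4 x i1> %v, <4 x i32> %ta, <4 x i32> %tb
  %res = bitcast <4 x i32> %sel to <2 x i64>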