This revision was automatically updated to reflect the committed changes.
Closed by commit rGc32af4447f79: [ARM,MVE] Add the vmovnbq,vmovntq intrinsic family. (authored by simon_tatham).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D74337/new/

https://reviews.llvm.org/D74337

Files:
  clang/include/clang/Basic/arm_mve.td
  clang/include/clang/Basic/arm_mve_defs.td
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/arm-mve-intrinsics/vmovn.c
  clang/utils/TableGen/MveEmitter.cpp
  llvm/lib/Target/ARM/ARMInstrMVE.td
  llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovn.ll

Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovn.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovn.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck --check-prefix=LE %s
+; RUN: llc -mtriple=thumbebv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck --check-prefix=BE %s
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_s16(<16 x i8> %a, <8 x i16> %b) {
+; LE-LABEL: test_vmovnbq_s16:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnb.i16 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovnbq_s16:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.16 q2, q1
+; BE-NEXT:    vrev64.8 q1, q0
+; BE-NEXT:    vmovnb.i16 q1, q2
+; BE-NEXT:    vrev64.8 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %1 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %0)
+  %2 = shufflevector <8 x i16> %b, <8 x i16> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %3 = trunc <16 x i16> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_s32(<8 x i16> %a, <4 x i32> %b) {
+; LE-LABEL: test_vmovnbq_s32:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnb.i32 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovnbq_s32:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.32 q2, q1
+; BE-NEXT:    vrev64.16 q1, q0
+; BE-NEXT:    vmovnb.i32 q1, q2
+; BE-NEXT:    vrev64.16 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %1 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %0)
+  %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %3 = trunc <8 x i32> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_u16(<16 x i8> %a, <8 x i16> %b) {
+; LE-LABEL: test_vmovnbq_u16:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnb.i16 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovnbq_u16:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.16 q2, q1
+; BE-NEXT:    vrev64.8 q1, q0
+; BE-NEXT:    vmovnb.i16 q1, q2
+; BE-NEXT:    vrev64.8 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  %1 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %0)
+  %2 = shufflevector <8 x i16> %b, <8 x i16> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %3 = trunc <16 x i16> %2 to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_u32(<8 x i16> %a, <4 x i32> %b) {
+; LE-LABEL: test_vmovnbq_u32:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnb.i32 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovnbq_u32:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.32 q2, q1
+; BE-NEXT:    vrev64.16 q1, q0
+; BE-NEXT:    vmovnb.i32 q1, q2
+; BE-NEXT:    vrev64.16 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %1 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %0)
+  %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %3 = trunc <8 x i32> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_s16(<16 x i8> %a, <8 x i16> %b) {
+; LE-LABEL: test_vmovntq_s16:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnt.i16 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovntq_s16:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.16 q2, q1
+; BE-NEXT:    vrev64.8 q1, q0
+; BE-NEXT:    vmovnt.i16 q1, q2
+; BE-NEXT:    vrev64.8 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
+  %1 = shufflevector <8 x i16> %0, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_s32(<8 x i16> %a, <4 x i32> %b) {
+; LE-LABEL: test_vmovntq_s32:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnt.i32 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovntq_s32:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.32 q2, q1
+; BE-NEXT:    vrev64.16 q1, q0
+; BE-NEXT:    vmovnt.i32 q1, q2
+; BE-NEXT:    vrev64.16 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)
+  %1 = shufflevector <4 x i32> %0, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_u16(<16 x i8> %a, <8 x i16> %b) {
+; LE-LABEL: test_vmovntq_u16:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnt.i16 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovntq_u16:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.16 q2, q1
+; BE-NEXT:    vrev64.8 q1, q0
+; BE-NEXT:    vmovnt.i16 q1, q2
+; BE-NEXT:    vrev64.8 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
+  %1 = shufflevector <8 x i16> %0, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %2 = trunc <16 x i16> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_u32(<8 x i16> %a, <4 x i32> %b) {
+; LE-LABEL: test_vmovntq_u32:
+; LE:       @ %bb.0: @ %entry
+; LE-NEXT:    vmovnt.i32 q0, q1
+; LE-NEXT:    bx lr
+;
+; BE-LABEL: test_vmovntq_u32:
+; BE:       @ %bb.0: @ %entry
+; BE-NEXT:    vrev64.32 q2, q1
+; BE-NEXT:    vrev64.16 q1, q0
+; BE-NEXT:    vmovnt.i32 q1, q2
+; BE-NEXT:    vrev64.16 q0, q1
+; BE-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)
+  %1 = shufflevector <4 x i32> %0, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  %2 = trunc <8 x i32> %1 to <8 x i16>
+  ret <8 x i16> %2
+}
+
+declare <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8>)
+declare <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16>)
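
For reference, the shuffle-and-trunc sequences in the tests above implement the
following per-lane semantics: the narrowed elements of the wide operand land in
the even (vmovnb) or odd (vmovnt) half-width lanes of the result, and the other
lanes are preserved from the first operand. Below is a minimal scalar sketch of
the s16 case under that lane convention; it is a reference model only, not part
of the patch, and the _ref names are illustrative:

  #include <stdint.h>

  /* vmovnbq_s16: truncate each i16 lane of b into the even (bottom)
     byte lanes; odd lanes keep the value of the inactive operand a.
     Matches the zip mask <0,8,1,9,...> plus trunc in the IR above. */
  void vmovnbq_s16_ref(int8_t d[16], const int8_t a[16], const int16_t b[8]) {
    for (int i = 0; i < 8; i++) {
      d[2 * i]     = (int8_t)b[i];  /* truncated half-width element */
      d[2 * i + 1] = a[2 * i + 1];  /* preserved from the inactive vector */
    }
  }

  /* vmovntq_s16: the same narrowing, but into the odd (top) lanes. */
  void vmovntq_s16_ref(int8_t d[16], const int8_t a[16], const int16_t b[8]) {
    for (int i = 0; i < 8; i++) {
      d[2 * i]     = a[2 * i];
      d[2 * i + 1] = (int8_t)b[i];
    }
  }

The 32-bit-to-16-bit variants follow the same pattern with i16 destination
lanes.
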
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4322,8 +4322,16 @@
             (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
   def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))),
             (v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
+
+  def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qm),
+                             (v8i16 (ARMvrev32 MQPR:$Qd_src)), (i32 1))),
+            (v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+  def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qm),
+                             (v16i8 (ARMvrev16 MQPR:$Qd_src)), (i32 1))),
+            (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
 }
 
+
 class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
                   list<dag> pattern=[]>
   : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
Index: clang/utils/TableGen/MveEmitter.cpp
===================================================================
--- clang/utils/TableGen/MveEmitter.cpp
+++ clang/utils/TableGen/MveEmitter.cpp
@@ -1188,6 +1188,18 @@
     } else {
       PrintFatalError("unsignedflag's argument should be a scalar type");
     }
+  } else if (Op->getName() == "bitsize") {
+    if (D->getNumArgs() != 1)
+      PrintFatalError("bitsize should have exactly one argument");
+    Record *TypeRec = cast<DefInit>(D->getArg(0))->getDef();
+    if (!TypeRec->isSubClassOf("Type"))
+      PrintFatalError("bitsize's argument should be a type");
+    if (const auto *ST = dyn_cast<ScalarType>(getType(TypeRec, Param))) {
+      return std::make_shared<IntLiteralResult>(getScalarType("u32"),
+                                                ST->sizeInBits());
+    } else {
+      PrintFatalError("bitsize's argument should be a scalar type");
+    }
   } else {
     std::vector<Result::Ptr> Args;
     for (unsigned i = 0, e = D->getNumArgs(); i < e; ++i)
Index: clang/test/CodeGen/arm-mve-intrinsics/vmovn.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/arm-mve-intrinsics/vmovn.c
@@ -0,0 +1,199 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck --check-prefix=LE %s
+// RUN: %clang_cc1 -triple thumbebv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck --check-prefix=BE %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck --check-prefix=LE %s
+// RUN: %clang_cc1 -triple thumbebv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck --check-prefix=BE %s
+
+#include <arm_mve.h>
+
+// LE-LABEL: @test_vmovnbq_s16(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// LE-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// LE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[TMP1]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// LE-NEXT:    [[TMP3:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+// LE-NEXT:    ret <16 x i8> [[TMP3]]
+//
+// BE-LABEL: @test_vmovnbq_s16(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// BE-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> [[TMP0]])
+// BE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[TMP1]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// BE-NEXT:    [[TMP3:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+// BE-NEXT:    ret <16 x i8> [[TMP3]]
+//
+int8x16_t test_vmovnbq_s16(int8x16_t a, int16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovnbq_s32(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// LE-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32>
+// LE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// LE-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+// LE-NEXT:    ret <8 x i16> [[TMP3]]
+//
+// BE-LABEL: @test_vmovnbq_s32(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// BE-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> [[TMP0]])
+// BE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// BE-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+// BE-NEXT:    ret <8 x i16> [[TMP3]]
+//
+int16x8_t test_vmovnbq_s32(int16x8_t a, int32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovnbq_u16(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// LE-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// LE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[TMP1]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// LE-NEXT:    [[TMP3:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+// LE-NEXT:    ret <16 x i8> [[TMP3]]
+//
+// BE-LABEL: @test_vmovnbq_u16(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// BE-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> [[TMP0]])
+// BE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[TMP1]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// BE-NEXT:    [[TMP3:%.*]] = trunc <16 x i16> [[TMP2]] to <16 x i8>
+// BE-NEXT:    ret <16 x i8> [[TMP3]]
+//
+uint8x16_t test_vmovnbq_u16(uint8x16_t a, uint16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_u16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovnbq_u32(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// LE-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32>
+// LE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// LE-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+// LE-NEXT:    ret <8 x i16> [[TMP3]]
+//
+// BE-LABEL: @test_vmovnbq_u32(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// BE-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> [[TMP0]])
+// BE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// BE-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+// BE-NEXT:    ret <8 x i16> [[TMP3]]
+//
+uint16x8_t test_vmovnbq_u32(uint16x8_t a, uint32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovnbq(a, b);
+#else /* POLYMORPHIC */
+    return vmovnbq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovntq_s16(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x i16>
+// LE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// LE-NEXT:    [[TMP2:%.*]] = trunc <16 x i16> [[TMP1]] to <16 x i8>
+// LE-NEXT:    ret <16 x i8> [[TMP2]]
+//
+// BE-LABEL: @test_vmovntq_s16(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> [[A:%.*]])
+// BE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// BE-NEXT:    [[TMP2:%.*]] = trunc <16 x i16> [[TMP1]] to <16 x i8>
+// BE-NEXT:    ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vmovntq_s16(int8x16_t a, int16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovntq_s32(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <4 x i32>
+// LE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// LE-NEXT:    [[TMP2:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+// LE-NEXT:    ret <8 x i16> [[TMP2]]
+//
+// BE-LABEL: @test_vmovntq_s32(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> [[A:%.*]])
+// BE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// BE-NEXT:    [[TMP2:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+// BE-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vmovntq_s32(int16x8_t a, int32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovntq_u16(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x i16>
+// LE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// LE-NEXT:    [[TMP2:%.*]] = trunc <16 x i16> [[TMP1]] to <16 x i8>
+// LE-NEXT:    ret <16 x i8> [[TMP2]]
+//
+// BE-LABEL: @test_vmovntq_u16(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> [[A:%.*]])
+// BE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// BE-NEXT:    [[TMP2:%.*]] = trunc <16 x i16> [[TMP1]] to <16 x i8>
+// BE-NEXT:    ret <16 x i8> [[TMP2]]
+//
+uint8x16_t test_vmovntq_u16(uint8x16_t a, uint16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_u16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// LE-LABEL: @test_vmovntq_u32(
+// LE-NEXT:  entry:
+// LE-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <4 x i32>
+// LE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// LE-NEXT:    [[TMP2:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+// LE-NEXT:    ret <8 x i16> [[TMP2]]
+//
+// BE-LABEL: @test_vmovntq_u32(
+// BE-NEXT:  entry:
+// BE-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> [[A:%.*]])
+// BE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+// BE-NEXT:    [[TMP2:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+// BE-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vmovntq_u32(uint16x8_t a, uint32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vmovntq(a, b);
+#else /* POLYMORPHIC */
+    return vmovntq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -7067,6 +7067,19 @@
                                      Indices);
 }
 
+static llvm::Value *VectorZip(CGBuilderTy &Builder, llvm::Value *V0,
+                              llvm::Value *V1) {
+  // Make a shufflevector that interleaves two vectors element by element.
+  assert(V0->getType() == V1->getType() && "Can't zip different vector types");
+  SmallVector<uint32_t, 16> Indices;
+  unsigned InputElements = V0->getType()->getVectorNumElements();
+  for (unsigned i = 0; i < InputElements; i++) {
+    Indices.push_back(i);
+    Indices.push_back(i + InputElements);
+  }
+  return Builder.CreateShuffleVector(V0, V1, Indices);
+}
+
 template<unsigned HighBit, unsigned OtherBits>
 static llvm::Value *ARMMVEConstantSplat(CGBuilderTy &Builder, llvm::Type *VT) {
   // MVE-specific helper function to make a vector splat of a constant such as
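
The interleaving mask that VectorZip builds can be read directly off the tests
above: for two N-lane inputs, lane i of V0 goes to result lane 2i and lane i of
V1 to result lane 2i+1, i.e. shuffle indices i and i+N. A small standalone
sketch of the mask computation, assuming N=8 as in the v8i16 cases:

  #include <stdio.h>

  /* Prints the interleaving shuffle mask for two 8-lane vectors:
     0 8 1 9 2 10 3 11 4 12 5 13 6 14 7 15 -- the same mask that
     appears in the v8i16 zip shuffles in the tests above. */
  int main(void) {
    unsigned n = 8;
    for (unsigned i = 0; i < n; i++)
      printf("%u %u ", i, i + n);
    putchar('\n');
    return 0;
  }
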
Index: clang/include/clang/Basic/arm_mve_defs.td
===================================================================
--- clang/include/clang/Basic/arm_mve_defs.td
+++ clang/include/clang/Basic/arm_mve_defs.td
@@ -131,6 +131,7 @@
 def unzip: CGHelperFn<"VectorUnzip"> {
   let special_params = [IRBuilderIntParam<1, "bool">];
 }
+def zip: CGHelperFn<"VectorZip">;
 
 // Helper for making boolean flags in IR
 def i1: IRBuilderBase {
@@ -187,6 +188,10 @@
 // and 0 for a signed (or floating) one.
 def unsignedflag;
 
+// 'bitsize' also takes a scalar type, and expands into an integer
+// constant giving its size in bits.
+def bitsize;
+
 // If you put CustomCodegen<"foo"> in an intrinsic's codegen field, it
 // indicates that the IR generation for that intrinsic is done by handwritten
 // C++ and not autogenerated at all. The effect in the MVE builtin codegen
Index: clang/include/clang/Basic/arm_mve.td
===================================================================
--- clang/include/clang/Basic/arm_mve.td
+++ clang/include/clang/Basic/arm_mve.td
@@ -427,6 +427,14 @@
     (extend (unzip $a, 1), DblVector, (unsignedflag Scalar))>;
 }
 
+let params = [s16, u16, s32, u32] in {
+  def vmovnbq: Intrinsic<HalfVector, (args HalfVector:$inactive, Vector:$a),
+    (trunc (zip $a, (vreinterpret (vrev $inactive, (bitsize Scalar)), Vector)),
+           HalfVector)>;
+  def vmovntq: Intrinsic<HalfVector, (args HalfVector:$inactive, Vector:$a),
+    (trunc (zip (vreinterpret $inactive, Vector), $a), HalfVector)>;
+}
+
 let params = T.Float in {
   def vrndq: Intrinsic<Vector, (args Vector:$a),
       (IRIntBase<"trunc", [Vector]> $a)>;
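
One subtlety in the vmovnbq definition: (bitsize Scalar) expands to 16 or 32
(via the new MveEmitter case above), so the vrev swaps adjacent half-width
lanes of $inactive within each full-width group before the vreinterpret. After
the zip and trunc, that places the odd lanes of $inactive into the odd lanes of
the result, under the little-endian lane numbering used in the generated IR
(the big-endian path differs only in using the vreinterpretq intrinsic rather
than a bitcast). A scalar sketch of the vrev step for the 32-bit case; the
function name is illustrative, not part of the patch:

  #include <stdint.h>

  /* vrev32 on 16-bit lanes: swap each pair of i16 elements within a
     32-bit group, matching the <1,0,3,2,5,4,7,6> shufflevector in the
     generated IR for vmovnbq_s32. */
  void vrev32_i16_ref(int16_t d[8], const int16_t s[8]) {
    for (int i = 0; i < 8; i += 2) {
      d[i]     = s[i + 1];
      d[i + 1] = s[i];
    }
  }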