This revision was automatically updated to reflect the committed changes.
Closed by commit rG45a9945b9ea9: [ARM,MVE] Add ACLE intrinsics for the vminv/vmaxv family. (authored by simon_tatham).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D76490/new/

https://reviews.llvm.org/D76490
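
For reference, a minimal sketch of the interface change exercised by the
updated tests below: the separate signed/unsigned reduction intrinsics are
replaced by a single intrinsic per vector type that carries the signedness
as a trailing i32 flag (0 = signed, 1 = unsigned), and predicated forms are
added alongside. (The value names %acc and %vec here are illustrative, not
taken from the patch.)

  ; before: one intrinsic per signedness
  declare i32 @llvm.arm.mve.minv.s.v8i16(i32, <8 x i16>)
  declare i32 @llvm.arm.mve.minv.u.v4i32(i32, <4 x i32>)

  ; after: one intrinsic per type, signedness as the last operand
  declare i32 @llvm.arm.mve.minv.v8i16(i32, <8 x i16>, i32)
  %r = tail call i32 @llvm.arm.mve.minv.v8i16(i32 %acc, <8 x i16> %vec, i32 0) ; signed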

Files:
  clang/include/clang/Basic/arm_mve.td
  clang/test/CodeGen/arm-mve-intrinsics/vminvq.c
  llvm/include/llvm/IR/IntrinsicsARM.td
  llvm/lib/Target/ARM/ARMISelLowering.cpp
  llvm/lib/Target/ARM/ARMInstrMVE.td
  llvm/test/CodeGen/Thumb2/mve-intrinsics/vminvq.ll

Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/vminvq.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-intrinsics/vminvq.ll
+++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vminvq.ll
@@ -1,36 +1,865 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
 
+define arm_aapcs_vfpcc signext i8 @test_vminvq_s8(i8 signext %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vminvq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vminv.s8 r0, q0
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i8 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.minv.v16i8(i32 %0, <16 x i8> %b, i32 0)
+  %2 = trunc i32 %1 to i8
+  ret i8 %2
+}
+
+define arm_aapcs_vfpcc signext i16 @test_vminvq_s16(i16 signext %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vminvq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vminv.s16 r0, q0
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.minv.v8i16(i32 %0, <8 x i16> %b, i32 0)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vminvq_s32(i32 %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vminvq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vminv.s32 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call i32 @llvm.arm.mve.minv.v4i32(i32 %a, <4 x i32> %b, i32 0)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc zeroext i8 @test_vminvq_u8(i8 zeroext %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vminvq_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vminv.u8 r0, q0
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i8 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.minv.v16i8(i32 %0, <16 x i8> %b, i32 1)
+  %2 = trunc i32 %1 to i8
+  ret i8 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vminvq_u16(i16 zeroext %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vminvq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vminv.u16 r0, q0
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.minv.v8i16(i32 %0, <8 x i16> %b, i32 1)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
 define arm_aapcs_vfpcc i32 @test_vminvq_u32(i32 %a, <4 x i32> %b) {
 ; CHECK-LABEL: test_vminvq_u32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vminv.u32 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call i32 @llvm.arm.mve.minv.u.v4i32(i32 %a, <4 x i32> %b)
+  %0 = tail call i32 @llvm.arm.mve.minv.v4i32(i32 %a, <4 x i32> %b, i32 1)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc signext i8 @test_vmaxvq_s8(i8 signext %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vmaxvq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmaxv.s8 r0, q0
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i8 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.maxv.v16i8(i32 %0, <16 x i8> %b, i32 0)
+  %2 = trunc i32 %1 to i8
+  ret i8 %2
+}
+
+define arm_aapcs_vfpcc signext i16 @test_vmaxvq_s16(i16 signext %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmaxvq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmaxv.s16 r0, q0
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.maxv.v8i16(i32 %0, <8 x i16> %b, i32 0)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmaxvq_s32(i32 %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmaxvq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmaxv.s32 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call i32 @llvm.arm.mve.maxv.v4i32(i32 %a, <4 x i32> %b, i32 0)
   ret i32 %0
 }
 
-define arm_aapcs_vfpcc i32 @test_vmaxvq_u8(i32 %a, <16 x i8> %b) {
+define arm_aapcs_vfpcc zeroext i8 @test_vmaxvq_u8(i8 zeroext %a, <16 x i8> %b) {
 ; CHECK-LABEL: test_vmaxvq_u8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmaxv.u8 r0, q0
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i8 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.maxv.v16i8(i32 %0, <16 x i8> %b, i32 1)
+  %2 = trunc i32 %1 to i8
+  ret i8 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vmaxvq_u16(i16 zeroext %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmaxvq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmaxv.u16 r0, q0
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.maxv.v8i16(i32 %0, <8 x i16> %b, i32 1)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmaxvq_u32(i32 %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmaxvq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmaxv.u32 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call i32 @llvm.arm.mve.maxv.u.v16i8(i32 %a, <16 x i8> %b)
+  %0 = tail call i32 @llvm.arm.mve.maxv.v4i32(i32 %a, <4 x i32> %b, i32 1)
   ret i32 %0
 }
 
-define arm_aapcs_vfpcc i32 @test_vminvq_s16(i32 %a, <8 x i16> %b) {
-; CHECK-LABEL: test_vminvq_s16:
+define arm_aapcs_vfpcc zeroext i8 @test_vminavq_s8(i8 zeroext %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vminavq_s8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vminv.s16 r0, q0
+; CHECK-NEXT:    vminav.s8 r0, q0
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i8 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.minav.v16i8(i32 %0, <16 x i8> %b)
+  %2 = trunc i32 %1 to i8
+  ret i8 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vminavq_s16(i16 zeroext %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vminavq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vminav.s16 r0, q0
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.minav.v8i16(i32 %0, <8 x i16> %b)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vminavq_s32(i32 %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vminavq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vminav.s32 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call i32 @llvm.arm.mve.minav.v4i32(i32 %a, <4 x i32> %b)
+  ret i32 %0
+}
+
+define arm_aapcs_vfpcc zeroext i8 @test_vmaxavq_s8(i8 zeroext %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vmaxavq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmaxav.s8 r0, q0
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i8 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.maxav.v16i8(i32 %0, <16 x i8> %b)
+  %2 = trunc i32 %1 to i8
+  ret i8 %2
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vmaxavq_s16(i16 zeroext %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vmaxavq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmaxav.s16 r0, q0
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %a to i32
+  %1 = tail call i32 @llvm.arm.mve.maxav.v8i16(i32 %0, <8 x i16> %b)
+  %2 = trunc i32 %1 to i16
+  ret i16 %2
+}
+
+define arm_aapcs_vfpcc i32 @test_vmaxavq_s32(i32 %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmaxavq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmaxav.s32 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = tail call i32 @llvm.arm.mve.minv.s.v8i16(i32 %a, <8 x i16> %b)
+  %0 = tail call i32 @llvm.arm.mve.maxav.v4i32(i32 %a, <4 x i32> %b)
   ret i32 %0
 }
 
-declare i32 @llvm.arm.mve.minv.u.v4i32(i32, <4 x i32>)
-declare i32 @llvm.arm.mve.maxv.u.v16i8(i32, <16 x i8>)
-declare i32 @llvm.arm.mve.minv.s.v8i16(i32, <8 x i16>)
+define arm_aapcs_vfpcc float @test_vminnmvq_f16(float %a.coerce, <8 x half> %b) {
+; CHECK-LABEL: test_vminnmvq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vminnmv.f16 r0, q1
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vstr.16 s0, [sp, #2]
+; CHECK-NEXT:    ldrh.w r0, [sp, #2]
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %2 = tail call half @llvm.arm.mve.minnmv.f16.v8f16(half %1, <8 x half> %b)
+  %3 = bitcast half %2 to i16
+  %tmp2.0.insert.ext = zext i16 %3 to i32
+  %4 = bitcast i32 %tmp2.0.insert.ext to float
+  ret float %4
+}
+
+define arm_aapcs_vfpcc float @test_vminnmvq_f32(float %a, <4 x float> %b) {
+; CHECK-LABEL: test_vminnmvq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vminnmv.f32 r0, q1
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call float @llvm.arm.mve.minnmv.f32.v4f32(float %a, <4 x float> %b)
+  ret float %0
+}
+
+define arm_aapcs_vfpcc float @test_vminnmavq_f16(float %a.coerce, <8 x half> %b) {
+; CHECK-LABEL: test_vminnmavq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vminnmav.f16 r0, q1
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vstr.16 s0, [sp, #2]
+; CHECK-NEXT:    ldrh.w r0, [sp, #2]
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %2 = tail call half @llvm.arm.mve.minnmav.f16.v8f16(half %1, <8 x half> %b)
+  %3 = bitcast half %2 to i16
+  %tmp2.0.insert.ext = zext i16 %3 to i32
+  %4 = bitcast i32 %tmp2.0.insert.ext to float
+  ret float %4
+}
+
+define arm_aapcs_vfpcc float @test_vminnmavq_f32(float %a, <4 x float> %b) {
+; CHECK-LABEL: test_vminnmavq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vminnmav.f32 r0, q1
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call float @llvm.arm.mve.minnmav.f32.v4f32(float %a, <4 x float> %b)
+  ret float %0
+}
+
+define arm_aapcs_vfpcc float @test_vmaxnmvq_f16(float %a.coerce, <8 x half> %b) {
+; CHECK-LABEL: test_vmaxnmvq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmaxnmv.f16 r0, q1
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vstr.16 s0, [sp, #2]
+; CHECK-NEXT:    ldrh.w r0, [sp, #2]
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %2 = tail call half @llvm.arm.mve.maxnmv.f16.v8f16(half %1, <8 x half> %b)
+  %3 = bitcast half %2 to i16
+  %tmp2.0.insert.ext = zext i16 %3 to i32
+  %4 = bitcast i32 %tmp2.0.insert.ext to float
+  ret float %4
+}
+
+define arm_aapcs_vfpcc float @test_vmaxnmvq_f32(float %a, <4 x float> %b) {
+; CHECK-LABEL: test_vmaxnmvq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmaxnmv.f32 r0, q1
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call float @llvm.arm.mve.maxnmv.f32.v4f32(float %a, <4 x float> %b)
+  ret float %0
+}
+
+define arm_aapcs_vfpcc float @test_vmaxnmavq_f16(float %a.coerce, <8 x half> %b) {
+; CHECK-LABEL: test_vmaxnmavq_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmaxnmav.f16 r0, q1
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vstr.16 s0, [sp, #2]
+; CHECK-NEXT:    ldrh.w r0, [sp, #2]
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %2 = tail call half @llvm.arm.mve.maxnmav.f16.v8f16(half %1, <8 x half> %b)
+  %3 = bitcast half %2 to i16
+  %tmp2.0.insert.ext = zext i16 %3 to i32
+  %4 = bitcast i32 %tmp2.0.insert.ext to float
+  ret float %4
+}
+
+define arm_aapcs_vfpcc float @test_vmaxnmavq_f32(float %a, <4 x float> %b) {
+; CHECK-LABEL: test_vmaxnmavq_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmaxnmav.f32 r0, q1
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call float @llvm.arm.mve.maxnmav.f32.v4f32(float %a, <4 x float> %b)
+  ret float %0
+}
+
+define arm_aapcs_vfpcc signext i8 @test_vminvq_p_s8(i8 signext %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vminvq_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vminvt.s8 r0, q0
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i8 %a to i32
+  %1 = zext i16 %p to i32
+  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = tail call i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, i32 0, <16 x i1> %2)
+  %4 = trunc i32 %3 to i8
+  ret i8 %4
+}
+
+define arm_aapcs_vfpcc signext i16 @test_vminvq_p_s16(i16 signext %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vminvq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vminvt.s16 r0, q0
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %a to i32
+  %1 = zext i16 %p to i32
+  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = tail call i32 @llvm.arm.mve.minv.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, i32 0, <8 x i1> %2)
+  %4 = trunc i32 %3 to i16
+  ret i16 %4
+}
+
+define arm_aapcs_vfpcc i32 @test_vminvq_p_s32(i32 %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vminvq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vminvt.s32 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call i32 @llvm.arm.mve.minv.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, i32 0, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc zeroext i8 @test_vminvq_p_u8(i8 zeroext %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vminvq_p_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vminvt.u8 r0, q0
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i8 %a to i32
+  %1 = zext i16 %p to i32
+  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = tail call i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, i32 1, <16 x i1> %2)
+  %4 = trunc i32 %3 to i8
+  ret i8 %4
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vminvq_p_u16(i16 zeroext %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vminvq_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vminvt.u16 r0, q0
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %a to i32
+  %1 = zext i16 %p to i32
+  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = tail call i32 @llvm.arm.mve.minv.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, i32 1, <8 x i1> %2)
+  %4 = trunc i32 %3 to i16
+  ret i16 %4
+}
+
+define arm_aapcs_vfpcc i32 @test_vminvq_p_u32(i32 %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vminvq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vminvt.u32 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call i32 @llvm.arm.mve.minv.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, i32 1, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc signext i8 @test_vmaxvq_p_s8(i8 signext %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmaxvq_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxvt.s8 r0, q0
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i8 %a to i32
+  %1 = zext i16 %p to i32
+  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = tail call i32 @llvm.arm.mve.maxv.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, i32 0, <16 x i1> %2)
+  %4 = trunc i32 %3 to i8
+  ret i8 %4
+}
+
+define arm_aapcs_vfpcc signext i16 @test_vmaxvq_p_s16(i16 signext %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmaxvq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxvt.s16 r0, q0
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %a to i32
+  %1 = zext i16 %p to i32
+  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = tail call i32 @llvm.arm.mve.maxv.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, i32 0, <8 x i1> %2)
+  %4 = trunc i32 %3 to i16
+  ret i16 %4
+}
+
+define arm_aapcs_vfpcc i32 @test_vmaxvq_p_s32(i32 %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmaxvq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxvt.s32 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call i32 @llvm.arm.mve.maxv.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, i32 0, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc zeroext i8 @test_vmaxvq_p_u8(i8 zeroext %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmaxvq_p_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxvt.u8 r0, q0
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i8 %a to i32
+  %1 = zext i16 %p to i32
+  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = tail call i32 @llvm.arm.mve.maxv.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, i32 1, <16 x i1> %2)
+  %4 = trunc i32 %3 to i8
+  ret i8 %4
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vmaxvq_p_u16(i16 zeroext %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmaxvq_p_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxvt.u16 r0, q0
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %a to i32
+  %1 = zext i16 %p to i32
+  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = tail call i32 @llvm.arm.mve.maxv.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, i32 1, <8 x i1> %2)
+  %4 = trunc i32 %3 to i16
+  ret i16 %4
+}
+
+define arm_aapcs_vfpcc i32 @test_vmaxvq_p_u32(i32 %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmaxvq_p_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxvt.u32 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call i32 @llvm.arm.mve.maxv.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, i32 1, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc zeroext i8 @test_vminavq_p_s8(i8 zeroext %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vminavq_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vminavt.s8 r0, q0
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i8 %a to i32
+  %1 = zext i16 %p to i32
+  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = tail call i32 @llvm.arm.mve.minav.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, <16 x i1> %2)
+  %4 = trunc i32 %3 to i8
+  ret i8 %4
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vminavq_p_s16(i16 zeroext %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vminavq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vminavt.s16 r0, q0
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %a to i32
+  %1 = zext i16 %p to i32
+  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = tail call i32 @llvm.arm.mve.minav.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, <8 x i1> %2)
+  %4 = trunc i32 %3 to i16
+  ret i16 %4
+}
+
+define arm_aapcs_vfpcc i32 @test_vminavq_p_s32(i32 %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vminavq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vminavt.s32 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call i32 @llvm.arm.mve.minav.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc zeroext i8 @test_vmaxavq_p_s8(i8 zeroext %a, <16 x i8> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmaxavq_p_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxavt.s8 r0, q0
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i8 %a to i32
+  %1 = zext i16 %p to i32
+  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = tail call i32 @llvm.arm.mve.maxav.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, <16 x i1> %2)
+  %4 = trunc i32 %3 to i8
+  ret i8 %4
+}
+
+define arm_aapcs_vfpcc zeroext i16 @test_vmaxavq_p_s16(i16 zeroext %a, <8 x i16> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmaxavq_p_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxavt.s16 r0, q0
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %a to i32
+  %1 = zext i16 %p to i32
+  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = tail call i32 @llvm.arm.mve.maxav.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, <8 x i1> %2)
+  %4 = trunc i32 %3 to i16
+  ret i16 %4
+}
+
+define arm_aapcs_vfpcc i32 @test_vmaxavq_p_s32(i32 %a, <4 x i32> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmaxavq_p_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxavt.s32 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call i32 @llvm.arm.mve.maxav.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, <4 x i1> %1)
+  ret i32 %2
+}
+
+define arm_aapcs_vfpcc float @test_vminnmvq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vminnmvq_p_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vminnmvt.f16 r1, q1
+; CHECK-NEXT:    vmov s0, r1
+; CHECK-NEXT:    vstr.16 s0, [sp, #2]
+; CHECK-NEXT:    ldrh.w r0, [sp, #2]
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %2 = zext i16 %p to i32
+  %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+  %4 = tail call half @llvm.arm.mve.minnmv.predicated.f16.v8f16.v8i1(half %1, <8 x half> %b, <8 x i1> %3)
+  %5 = bitcast half %4 to i16
+  %tmp2.0.insert.ext = zext i16 %5 to i32
+  %6 = bitcast i32 %tmp2.0.insert.ext to float
+  ret float %6
+}
+
+define arm_aapcs_vfpcc float @test_vminnmvq_p_f32(float %a, <4 x float> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vminnmvq_p_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vminnmvt.f32 r0, q1
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call float @llvm.arm.mve.minnmv.predicated.f32.v4f32.v4i1(float %a, <4 x float> %b, <4 x i1> %1)
+  ret float %2
+}
+
+define arm_aapcs_vfpcc float @test_vminnmavq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vminnmavq_p_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vminnmavt.f16 r1, q1
+; CHECK-NEXT:    vmov s0, r1
+; CHECK-NEXT:    vstr.16 s0, [sp, #2]
+; CHECK-NEXT:    ldrh.w r0, [sp, #2]
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %2 = zext i16 %p to i32
+  %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+  %4 = tail call half @llvm.arm.mve.minnmav.predicated.f16.v8f16.v8i1(half %1, <8 x half> %b, <8 x i1> %3)
+  %5 = bitcast half %4 to i16
+  %tmp2.0.insert.ext = zext i16 %5 to i32
+  %6 = bitcast i32 %tmp2.0.insert.ext to float
+  ret float %6
+}
+
+define arm_aapcs_vfpcc float @test_vminnmavq_p_f32(float %a, <4 x float> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vminnmavq_p_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vminnmavt.f32 r0, q1
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call float @llvm.arm.mve.minnmav.predicated.f32.v4f32.v4i1(float %a, <4 x float> %b, <4 x i1> %1)
+  ret float %2
+}
+
+define arm_aapcs_vfpcc float @test_vmaxnmvq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmaxnmvq_p_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxnmvt.f16 r1, q1
+; CHECK-NEXT:    vmov s0, r1
+; CHECK-NEXT:    vstr.16 s0, [sp, #2]
+; CHECK-NEXT:    ldrh.w r0, [sp, #2]
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %2 = zext i16 %p to i32
+  %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+  %4 = tail call half @llvm.arm.mve.maxnmv.predicated.f16.v8f16.v8i1(half %1, <8 x half> %b, <8 x i1> %3)
+  %5 = bitcast half %4 to i16
+  %tmp2.0.insert.ext = zext i16 %5 to i32
+  %6 = bitcast i32 %tmp2.0.insert.ext to float
+  ret float %6
+}
+
+define arm_aapcs_vfpcc float @test_vmaxnmvq_p_f32(float %a, <4 x float> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmaxnmvq_p_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxnmvt.f32 r0, q1
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call float @llvm.arm.mve.maxnmv.predicated.f32.v4f32.v4i1(float %a, <4 x float> %b, <4 x i1> %1)
+  ret float %2
+}
+
+define arm_aapcs_vfpcc float @test_vmaxnmavq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmaxnmavq_p_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxnmavt.f16 r1, q1
+; CHECK-NEXT:    vmov s0, r1
+; CHECK-NEXT:    vstr.16 s0, [sp, #2]
+; CHECK-NEXT:    ldrh.w r0, [sp, #2]
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %2 = zext i16 %p to i32
+  %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
+  %4 = tail call half @llvm.arm.mve.maxnmav.predicated.f16.v8f16.v8i1(half %1, <8 x half> %b, <8 x i1> %3)
+  %5 = bitcast half %4 to i16
+  %tmp2.0.insert.ext = zext i16 %5 to i32
+  %6 = bitcast i32 %tmp2.0.insert.ext to float
+  ret float %6
+}
+
+define arm_aapcs_vfpcc float @test_vmaxnmavq_p_f32(float %a, <4 x float> %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vmaxnmavq_p_f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmaxnmavt.f32 r0, q1
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call float @llvm.arm.mve.maxnmav.predicated.f32.v4f32.v4i1(float %a, <4 x float> %b, <4 x i1> %1)
+  ret float %2
+}
+
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare i32 @llvm.arm.mve.minv.v16i8(i32, <16 x i8>, i32)
+declare i32 @llvm.arm.mve.minv.v8i16(i32, <8 x i16>, i32)
+declare i32 @llvm.arm.mve.minv.v4i32(i32, <4 x i32>, i32)
+declare i32 @llvm.arm.mve.maxv.v16i8(i32, <16 x i8>, i32)
+declare i32 @llvm.arm.mve.maxv.v8i16(i32, <8 x i16>, i32)
+declare i32 @llvm.arm.mve.maxv.v4i32(i32, <4 x i32>, i32)
+declare i32 @llvm.arm.mve.minav.v16i8(i32, <16 x i8>)
+declare i32 @llvm.arm.mve.minav.v8i16(i32, <8 x i16>)
+declare i32 @llvm.arm.mve.minav.v4i32(i32, <4 x i32>)
+declare i32 @llvm.arm.mve.maxav.v16i8(i32, <16 x i8>)
+declare i32 @llvm.arm.mve.maxav.v8i16(i32, <8 x i16>)
+declare i32 @llvm.arm.mve.maxav.v4i32(i32, <4 x i32>)
+declare i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32, <16 x i8>, i32, <16 x i1>)
+declare i32 @llvm.arm.mve.minv.predicated.v8i16.v8i1(i32, <8 x i16>, i32, <8 x i1>)
+declare i32 @llvm.arm.mve.minv.predicated.v4i32.v4i1(i32, <4 x i32>, i32, <4 x i1>)
+declare i32 @llvm.arm.mve.maxv.predicated.v16i8.v16i1(i32, <16 x i8>, i32, <16 x i1>)
+declare i32 @llvm.arm.mve.maxv.predicated.v8i16.v8i1(i32, <8 x i16>, i32, <8 x i1>)
+declare i32 @llvm.arm.mve.maxv.predicated.v4i32.v4i1(i32, <4 x i32>, i32, <4 x i1>)
+declare i32 @llvm.arm.mve.minav.predicated.v16i8.v16i1(i32, <16 x i8>, <16 x i1>)
+declare i32 @llvm.arm.mve.minav.predicated.v8i16.v8i1(i32, <8 x i16>, <8 x i1>)
+declare i32 @llvm.arm.mve.minav.predicated.v4i32.v4i1(i32, <4 x i32>, <4 x i1>)
+declare i32 @llvm.arm.mve.maxav.predicated.v16i8.v16i1(i32, <16 x i8>, <16 x i1>)
+declare i32 @llvm.arm.mve.maxav.predicated.v8i16.v8i1(i32, <8 x i16>, <8 x i1>)
+declare i32 @llvm.arm.mve.maxav.predicated.v4i32.v4i1(i32, <4 x i32>, <4 x i1>)
+
+declare half @llvm.arm.mve.minnmv.f16.v8f16(half, <8 x half>)
+declare half @llvm.arm.mve.minnmav.f16.v8f16(half, <8 x half>)
+declare half @llvm.arm.mve.maxnmv.f16.v8f16(half, <8 x half>)
+declare half @llvm.arm.mve.maxnmav.f16.v8f16(half, <8 x half>)
+declare half @llvm.arm.mve.minnmv.predicated.f16.v8f16.v8i1(half, <8 x half>, <8 x i1>)
+declare half @llvm.arm.mve.minnmav.predicated.f16.v8f16.v8i1(half, <8 x half>, <8 x i1>)
+declare half @llvm.arm.mve.maxnmv.predicated.f16.v8f16.v8i1(half, <8 x half>, <8 x i1>)
+declare half @llvm.arm.mve.maxnmav.predicated.f16.v8f16.v8i1(half, <8 x half>, <8 x i1>)
+
+declare float @llvm.arm.mve.minnmv.f32.v4f32(float, <4 x float>)
+declare float @llvm.arm.mve.minnmav.f32.v4f32(float, <4 x float>)
+declare float @llvm.arm.mve.maxnmv.f32.v4f32(float, <4 x float>)
+declare float @llvm.arm.mve.maxnmav.f32.v4f32(float, <4 x float>)
+declare float @llvm.arm.mve.minnmv.predicated.f32.v4f32.v4i1(float, <4 x float>, <4 x i1>)
+declare float @llvm.arm.mve.minnmav.predicated.f32.v4f32.v4i1(float, <4 x float>, <4 x i1>)
+declare float @llvm.arm.mve.maxnmv.predicated.f32.v4f32.v4i1(float, <4 x float>, <4 x i1>)
+declare float @llvm.arm.mve.maxnmav.predicated.f32.v4f32.v4i1(float, <4 x float>, <4 x i1>)
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -742,21 +742,42 @@
   let Predicates = [HasMVEFloat];
 }
 
-multiclass MVE_VMINMAXNMV_fty<string iname, bit bit_7, list<dag> pattern=[]> {
-  def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b1, bit_7, pattern>;
-  def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b1, bit_7, pattern>;
-}
+multiclass MVE_VMINMAXNMV_p<string iname, bit notAbs, bit isMin,
+                            MVEVectorVTInfo VTI, string intrBaseName,
+                            ValueType Scalar, RegisterClass ScalarReg> {
+  def "": MVE_VMINMAXNMV<iname, VTI.Suffix, VTI.Size{0}, notAbs, isMin>;
+  defvar Inst        = !cast<Instruction>(NAME);
+  defvar unpred_intr = !cast<Intrinsic>(intrBaseName);
+  defvar pred_intr   = !cast<Intrinsic>(intrBaseName#"_predicated");
 
-defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 0b1>;
-defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 0b0>;
+  let Predicates = [HasMVEFloat] in {
+    def : Pat<(Scalar (unpred_intr (Scalar ScalarReg:$prev),
+                                   (VTI.Vec MQPR:$vec))),
+           (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS ScalarReg:$prev, rGPR),
+                                   (VTI.Vec MQPR:$vec)),
+                              ScalarReg)>;
+    def : Pat<(Scalar (pred_intr   (Scalar ScalarReg:$prev),
+                                   (VTI.Vec MQPR:$vec),
+                                   (VTI.Pred VCCR:$pred))),
+           (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS ScalarReg:$prev, rGPR),
+                                   (VTI.Vec MQPR:$vec),
+                                   ARMVCCThen, (VTI.Pred VCCR:$pred)),
+                              ScalarReg)>;
+  }
+}
 
-multiclass MVE_VMINMAXNMAV_fty<string iname, bit bit_7, list<dag> pattern=[]> {
-  def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b0, bit_7, pattern>;
-  def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b0, bit_7, pattern>;
+multiclass MVE_VMINMAXNMV_fty<string iname, bit notAbs, bit isMin,
+                              string intrBase> {
+  defm f32 : MVE_VMINMAXNMV_p<iname, notAbs, isMin, MVE_v4f32, intrBase,
+                              f32, SPR>;
+  defm f16 : MVE_VMINMAXNMV_p<iname, notAbs, isMin, MVE_v8f16, intrBase,
+                              f16, HPR>;
 }
 
-defm MVE_VMINNMAV : MVE_VMINMAXNMAV_fty<"vminnmav", 0b1>;
-defm MVE_VMAXNMAV : MVE_VMINMAXNMAV_fty<"vmaxnmav", 0b0>;
+defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv",  1, 1, "int_arm_mve_minnmv">;
+defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv",  1, 0, "int_arm_mve_maxnmv">;
+defm MVE_VMINNMAV: MVE_VMINMAXNMV_fty<"vminnmav", 0, 1, "int_arm_mve_minnmav">;
+defm MVE_VMAXNMAV: MVE_VMINMAXNMV_fty<"vmaxnmav", 0, 0, "int_arm_mve_maxnmav">;
 
 class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size,
                  bit bit_17, bit bit_7, list<dag> pattern=[]>
@@ -778,31 +799,37 @@
   let Inst{0} = 0b0;
 }
 
-multiclass MVE_VMINMAXV_p<string iname, bit bit_17, bit bit_7,
-                          MVEVectorVTInfo VTI, Intrinsic intr> {
+multiclass MVE_VMINMAXV_p<string iname, bit notAbs, bit isMin,
+                          MVEVectorVTInfo VTI, string intrBaseName> {
   def "": MVE_VMINMAXV<iname, VTI.Suffix, VTI.Unsigned, VTI.Size,
-                       bit_17, bit_7>;
-  defvar Inst = !cast<Instruction>(NAME);
+                       notAbs, isMin>;
+  defvar Inst        = !cast<Instruction>(NAME);
+  defvar unpred_intr = !cast<Intrinsic>(intrBaseName);
+  defvar pred_intr   = !cast<Intrinsic>(intrBaseName#"_predicated");
+  defvar base_args   = (? (i32 rGPR:$prev), (VTI.Vec MQPR:$vec));
+  defvar args        = !if(notAbs, !con(base_args, (? (i32 VTI.Unsigned))),
+                           base_args);
 
-  let Predicates = [HasMVEInt] in
-  def _pat : Pat<(i32 (intr (i32 rGPR:$prev), (VTI.Vec MQPR:$vec))),
-                 (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>;
+  let Predicates = [HasMVEInt] in {
+    def : Pat<(i32 !con(args, (unpred_intr))),
+              (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>;
+    def : Pat<(i32 !con(args, (pred_intr (VTI.Pred VCCR:$pred)))),
+              (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec),
+                         ARMVCCThen, (VTI.Pred VCCR:$pred)))>;
+  }
 }
 
-multiclass MVE_VMINMAXV_ty<string iname, bit bit_7,
-                           Intrinsic intr_s, Intrinsic intr_u> {
-  defm s8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16s8, intr_s>;
-  defm s16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8s16, intr_s>;
-  defm s32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4s32, intr_s>;
-  defm u8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16u8, intr_u>;
-  defm u16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8u16, intr_u>;
-  defm u32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4u32, intr_u>;
+multiclass MVE_VMINMAXV_ty<string iname, bit isMin, string intrBaseName> {
+  defm s8 : MVE_VMINMAXV_p<iname, 1, isMin, MVE_v16s8, intrBaseName>;
+  defm s16: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v8s16, intrBaseName>;
+  defm s32: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v4s32, intrBaseName>;
+  defm u8 : MVE_VMINMAXV_p<iname, 1, isMin, MVE_v16u8, intrBaseName>;
+  defm u16: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v8u16, intrBaseName>;
+  defm u32: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v4u32, intrBaseName>;
 }
 
-defm MVE_VMINV : MVE_VMINMAXV_ty<
-  "vminv", 0b1, int_arm_mve_minv_s, int_arm_mve_minv_u>;
-defm MVE_VMAXV : MVE_VMINMAXV_ty<
-  "vmaxv", 0b0, int_arm_mve_maxv_s, int_arm_mve_maxv_u>;
+defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 1, "int_arm_mve_minv">;
+defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0, "int_arm_mve_maxv">;
 
 let Predicates = [HasMVEInt] in {
   def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))),
@@ -833,14 +860,14 @@
 
 }
 
-multiclass MVE_VMINMAXAV_ty<string iname, bit bit_7, list<dag> pattern=[]> {
-  def s8  : MVE_VMINMAXV<iname, "s8",  0b0, 0b00, 0b0, bit_7>;
-  def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b0, bit_7>;
-  def s32 : MVE_VMINMAXV<iname, "s32", 0b0, 0b10, 0b0, bit_7>;
+multiclass MVE_VMINMAXAV_ty<string iname, bit isMin, string intrBaseName> {
+  defm s8 : MVE_VMINMAXV_p<iname, 0, isMin, MVE_v16s8, intrBaseName>;
+  defm s16: MVE_VMINMAXV_p<iname, 0, isMin, MVE_v8s16, intrBaseName>;
+  defm s32: MVE_VMINMAXV_p<iname, 0, isMin, MVE_v4s32, intrBaseName>;
 }
 
-defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 0b1>;
-defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0b0>;
+defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 1, "int_arm_mve_minav">;
+defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0, "int_arm_mve_maxav">;
 
 class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
                    bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0>
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -14356,6 +14356,23 @@
       return SDValue();
     break;
   }
+
+  case Intrinsic::arm_mve_minv:
+  case Intrinsic::arm_mve_maxv:
+  case Intrinsic::arm_mve_minav:
+  case Intrinsic::arm_mve_maxav:
+  case Intrinsic::arm_mve_minv_predicated:
+  case Intrinsic::arm_mve_maxv_predicated:
+  case Intrinsic::arm_mve_minav_predicated:
+  case Intrinsic::arm_mve_maxav_predicated: {
+    // These intrinsics all take an i32 scalar operand which is narrowed to the
+    // size of a single lane of the vector type they take as the other input.
+    unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
+    APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
+    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
+      return SDValue();
+    break;
+  }
   }
 
   return SDValue();
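
To make the comment in the hunk above concrete: for a v16i8 reduction the
scalar accumulator is only compared at lane width, so in IR such as the
following only bits 7:0 of the i32 scalar operand are demanded, and
SimplifyDemandedBits is free to relax or drop the extension feeding it.
(A sketch with illustrative value names, not code from the patch.)

  %acc32 = zext i8 %acc to i32
  ; BitWidth = 8, so DemandedMask = 0x000000FF for the scalar operand
  %r = tail call i32 @llvm.arm.mve.minv.v16i8(i32 %acc32, <16 x i8> %vec, i32 0)
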
Index: llvm/include/llvm/IR/IntrinsicsARM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsARM.td
+++ llvm/include/llvm/IR/IntrinsicsARM.td
@@ -798,14 +798,6 @@
 def int_arm_mve_vreinterpretq : Intrinsic<
   [llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
 
-multiclass IntrinsicSignSuffix<list<LLVMType> rets, list<LLVMType> params = [],
-                                    list<IntrinsicProperty> props = [],
-                                    string name = "",
-                                    list<SDNodeProperty> sdprops = []> {
-  def _s: Intrinsic<rets, params, props, name, sdprops>;
-  def _u: Intrinsic<rets, params, props, name, sdprops>;
-}
-
 def int_arm_mve_min_predicated: Intrinsic<[llvm_anyvector_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */,
     llvm_anyvector_ty, LLVMMatchType<0>],
@@ -891,11 +883,6 @@
    [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty],
     [IntrNoMem]>;
 
-defm int_arm_mve_minv: IntrinsicSignSuffix<[llvm_i32_ty],
-   [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
-defm int_arm_mve_maxv: IntrinsicSignSuffix<[llvm_i32_ty],
-   [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
-
 multiclass MVEPredicated<list<LLVMType> rets, list<LLVMType> params,
                          LLVMType pred = llvm_anyvector_ty,
                          list<IntrinsicProperty> props = [IntrNoMem]> {
@@ -911,6 +898,19 @@
           LLVMMatchType<0>, rets[0])], props>;
 }
 
+multiclass MVE_minmaxv {
+  defm v: MVEPredicated<[llvm_i32_ty],
+     [llvm_i32_ty, llvm_anyvector_ty, llvm_i32_ty /* unsigned */]>;
+  defm av: MVEPredicated<[llvm_i32_ty],
+     [llvm_i32_ty, llvm_anyvector_ty]>;
+  defm nmv: MVEPredicated<[llvm_anyfloat_ty],
+     [LLVMMatchType<0>, llvm_anyvector_ty]>;
+  defm nmav: MVEPredicated<[llvm_anyfloat_ty],
+     [LLVMMatchType<0>, llvm_anyvector_ty]>;
+}
+defm int_arm_mve_min: MVE_minmaxv;
+defm int_arm_mve_max: MVE_minmaxv;
+
 // Intrinsic with a predicated and a non-predicated case. The predicated case
 // has two additional parameters: inactive (the value for inactive lanes, can
 // be undef) and predicate.
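
As a cross-check, the MVE_minmaxv multiclass added above expands (via
MVEPredicated) into unpredicated/predicated intrinsic pairs of exactly the
shape declared in the .ll test earlier in this diff, for example:

  declare i32 @llvm.arm.mve.minv.v16i8(i32, <16 x i8>, i32)
  declare i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32, <16 x i8>, i32, <16 x i1>)
  declare half @llvm.arm.mve.minnmav.f16.v8f16(half, <8 x half>)
  declare half @llvm.arm.mve.minnmav.predicated.f16.v8f16.v8i1(half, <8 x half>, <8 x i1>)
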
Index: clang/test/CodeGen/arm-mve-intrinsics/vminvq.c
===================================================================
--- clang/test/CodeGen/arm-mve-intrinsics/vminvq.c
+++ clang/test/CodeGen/arm-mve-intrinsics/vminvq.c
@@ -1,97 +1,853 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
-// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s
 
 #include <arm_mve.h>
 
 // CHECK-LABEL: @test_vminvq_s8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.s.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 0)
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
 // CHECK-NEXT:    ret i8 [[TMP2]]
 //
-int8_t test_vminvq_s8(int8_t a, int8x16_t b)
-{
+int8_t test_vminvq_s8(int8_t a, int8x16_t b) {
 #ifdef POLYMORPHIC
-    return vminvq(a, b);
-#else /* POLYMORPHIC */
-    return vminvq_s8(a, b);
+  return vminvq(a, b);
+#else  /* POLYMORPHIC */
+  return vminvq_s8(a, b);
 #endif /* POLYMORPHIC */
 }
 
 // CHECK-LABEL: @test_vminvq_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.s.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 0)
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
 //
-int16_t test_vminvq_s16(int16_t a, int16x8_t b)
-{
+int16_t test_vminvq_s16(int16_t a, int16x8_t b) {
 #ifdef POLYMORPHIC
-    return vminvq(a, b);
-#else /* POLYMORPHIC */
-    return vminvq_s16(a, b);
+  return vminvq(a, b);
+#else  /* POLYMORPHIC */
+  return vminvq_s16(a, b);
 #endif /* POLYMORPHIC */
 }
 
 // CHECK-LABEL: @test_vminvq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.minv.s.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.minv.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
 // CHECK-NEXT:    ret i32 [[TMP0]]
 //
-int32_t test_vminvq_s32(int32_t a, int32x4_t b)
-{
+int32_t test_vminvq_s32(int32_t a, int32x4_t b) {
 #ifdef POLYMORPHIC
-    return vminvq(a, b);
-#else /* POLYMORPHIC */
-    return vminvq_s32(a, b);
+  return vminvq(a, b);
+#else  /* POLYMORPHIC */
+  return vminvq_s32(a, b);
 #endif /* POLYMORPHIC */
 }
 
 // CHECK-LABEL: @test_vminvq_u8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.u.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 1)
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
 // CHECK-NEXT:    ret i8 [[TMP2]]
 //
-uint8_t test_vminvq_u8(uint8_t a, uint8x16_t b)
-{
+uint8_t test_vminvq_u8(uint8_t a, uint8x16_t b) {
 #ifdef POLYMORPHIC
-    return vminvq(a, b);
-#else /* POLYMORPHIC */
-    return vminvq_u8(a, b);
+  return vminvq(a, b);
+#else  /* POLYMORPHIC */
+  return vminvq_u8(a, b);
 #endif /* POLYMORPHIC */
 }
 
 // CHECK-LABEL: @test_vminvq_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
-// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.u.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 1)
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
 //
-uint16_t test_vminvq_u16(uint16_t a, uint16x8_t b)
-{
+uint16_t test_vminvq_u16(uint16_t a, uint16x8_t b) {
 #ifdef POLYMORPHIC
-    return vminvq(a, b);
-#else /* POLYMORPHIC */
-    return vminvq_u16(a, b);
+  return vminvq(a, b);
+#else  /* POLYMORPHIC */
+  return vminvq_u16(a, b);
 #endif /* POLYMORPHIC */
 }
 
 // CHECK-LABEL: @test_vminvq_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.minv.u.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.minv.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
 // CHECK-NEXT:    ret i32 [[TMP0]]
 //
-uint32_t test_vminvq_u32(uint32_t a, uint32x4_t b)
-{
+uint32_t test_vminvq_u32(uint32_t a, uint32x4_t b) {
 #ifdef POLYMORPHIC
-    return vminvq(a, b);
-#else /* POLYMORPHIC */
-    return vminvq_u32(a, b);
+  return vminvq(a, b);
+#else  /* POLYMORPHIC */
+  return vminvq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxvq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxv.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+// CHECK-NEXT:    ret i8 [[TMP2]]
+//
+int8_t test_vmaxvq_s8(int8_t a, int8x16_t b) {
+#ifdef POLYMORPHIC
+  return vmaxvq(a, b);
+#else  /* POLYMORPHIC */
+  return vmaxvq_s8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxvq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxv.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
+int16_t test_vmaxvq_s16(int16_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+  return vmaxvq(a, b);
+#else  /* POLYMORPHIC */
+  return vmaxvq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxvq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.maxv.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int32_t test_vmaxvq_s32(int32_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vmaxvq(a, b);
+#else  /* POLYMORPHIC */
+  return vmaxvq_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxvq_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxv.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 1)
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+// CHECK-NEXT:    ret i8 [[TMP2]]
+//
+uint8_t test_vmaxvq_u8(uint8_t a, uint8x16_t b) {
+#ifdef POLYMORPHIC
+  return vmaxvq(a, b);
+#else  /* POLYMORPHIC */
+  return vmaxvq_u8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxvq_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxv.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 1)
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
+uint16_t test_vmaxvq_u16(uint16_t a, uint16x8_t b) {
+#ifdef POLYMORPHIC
+  return vmaxvq(a, b);
+#else  /* POLYMORPHIC */
+  return vmaxvq_u16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxvq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.maxv.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 1)
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vmaxvq_u32(uint32_t a, uint32x4_t b) {
+#ifdef POLYMORPHIC
+  return vmaxvq(a, b);
+#else  /* POLYMORPHIC */
+  return vmaxvq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminavq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minav.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+// CHECK-NEXT:    ret i8 [[TMP2]]
+//
+uint8_t test_vminavq_s8(uint8_t a, int8x16_t b) {
+#ifdef POLYMORPHIC
+  return vminavq(a, b);
+#else  /* POLYMORPHIC */
+  return vminavq_s8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminavq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.minav.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
+uint16_t test_vminavq_s16(uint16_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+  return vminavq(a, b);
+#else  /* POLYMORPHIC */
+  return vminavq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminavq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.minav.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vminavq_s32(uint32_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vminavq(a, b);
+#else  /* POLYMORPHIC */
+  return vminavq_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxavq_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxav.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+// CHECK-NEXT:    ret i8 [[TMP2]]
+//
+uint8_t test_vmaxavq_s8(uint8_t a, int8x16_t b) {
+#ifdef POLYMORPHIC
+  return vmaxavq(a, b);
+#else  /* POLYMORPHIC */
+  return vmaxavq_s8(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxavq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxav.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]])
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
+uint16_t test_vmaxavq_s16(uint16_t a, int16x8_t b) {
+#ifdef POLYMORPHIC
+  return vmaxavq(a, b);
+#else  /* POLYMORPHIC */
+  return vmaxavq_s16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxavq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.arm.mve.maxav.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vmaxavq_s32(uint32_t a, int32x4_t b) {
+#ifdef POLYMORPHIC
+  return vmaxavq(a, b);
+#else  /* POLYMORPHIC */
+  return vmaxavq_s32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminnmvq_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
+// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT:    [[TMP2:%.*]] = call half @llvm.arm.mve.minnmv.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast float undef to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
+// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
+// CHECK-NEXT:    ret float [[TMP5]]
+//
+float16_t test_vminnmvq_f16(float16_t a, float16x8_t b) {
+#ifdef POLYMORPHIC
+  return vminnmvq(a, b);
+#else  /* POLYMORPHIC */
+  return vminnmvq_f16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminnmvq_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.arm.mve.minnmv.f32.v4f32(float [[A:%.*]], <4 x float> [[B:%.*]])
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float32_t test_vminnmvq_f32(float32_t a, float32x4_t b) {
+#ifdef POLYMORPHIC
+  return vminnmvq(a, b);
+#else  /* POLYMORPHIC */
+  return vminnmvq_f32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminnmavq_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
+// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT:    [[TMP2:%.*]] = call half @llvm.arm.mve.minnmav.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast float undef to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
+// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
+// CHECK-NEXT:    ret float [[TMP5]]
+//
+float16_t test_vminnmavq_f16(float16_t a, float16x8_t b) {
+#ifdef POLYMORPHIC
+  return vminnmavq(a, b);
+#else  /* POLYMORPHIC */
+  return vminnmavq_f16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminnmavq_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.arm.mve.minnmav.f32.v4f32(float [[A:%.*]], <4 x float> [[B:%.*]])
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float32_t test_vminnmavq_f32(float32_t a, float32x4_t b) {
+#ifdef POLYMORPHIC
+  return vminnmavq(a, b);
+#else  /* POLYMORPHIC */
+  return vminnmavq_f32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxnmvq_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
+// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT:    [[TMP2:%.*]] = call half @llvm.arm.mve.maxnmv.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast float undef to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
+// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
+// CHECK-NEXT:    ret float [[TMP5]]
+//
+float16_t test_vmaxnmvq_f16(float16_t a, float16x8_t b) {
+#ifdef POLYMORPHIC
+  return vmaxnmvq(a, b);
+#else  /* POLYMORPHIC */
+  return vmaxnmvq_f16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxnmvq_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.arm.mve.maxnmv.f32.v4f32(float [[A:%.*]], <4 x float> [[B:%.*]])
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float32_t test_vmaxnmvq_f32(float32_t a, float32x4_t b) {
+#ifdef POLYMORPHIC
+  return vmaxnmvq(a, b);
+#else  /* POLYMORPHIC */
+  return vmaxnmvq_f32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxnmavq_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
+// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT:    [[TMP2:%.*]] = call half @llvm.arm.mve.maxnmav.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[TMP2]] to i16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast float undef to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536
+// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
+// CHECK-NEXT:    ret float [[TMP5]]
+//
+float16_t test_vmaxnmavq_f16(float16_t a, float16x8_t b) {
+#ifdef POLYMORPHIC
+  return vmaxnmavq(a, b);
+#else  /* POLYMORPHIC */
+  return vmaxnmavq_f16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxnmavq_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.arm.mve.maxnmav.f32.v4f32(float [[A:%.*]], <4 x float> [[B:%.*]])
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float32_t test_vmaxnmavq_f32(float32_t a, float32x4_t b) {
+#ifdef POLYMORPHIC
+  return vmaxnmavq(a, b);
+#else  /* POLYMORPHIC */
+  return vmaxnmavq_f32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminvq_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+// CHECK-NEXT:    ret i8 [[TMP4]]
+//
+int8_t test_vminvq_p_s8(int8_t a, int8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vminvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vminvq_p_s8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminvq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
+// CHECK-NEXT:    ret i16 [[TMP4]]
+//
+int16_t test_vminvq_p_s16(int16_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vminvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vminvq_p_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminvq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vminvq_p_s32(int32_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vminvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vminvq_p_s32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminvq_p_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+// CHECK-NEXT:    ret i8 [[TMP4]]
+//
+uint8_t test_vminvq_p_u8(uint8_t a, uint8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vminvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vminvq_p_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminvq_p_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
+// CHECK-NEXT:    ret i16 [[TMP4]]
+//
+uint16_t test_vminvq_p_u16(uint16_t a, uint16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vminvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vminvq_p_u16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminvq_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vminvq_p_u32(uint32_t a, uint32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vminvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vminvq_p_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxvq_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+// CHECK-NEXT:    ret i8 [[TMP4]]
+//
+int8_t test_vmaxvq_p_s8(int8_t a, int8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmaxvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vmaxvq_p_s8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxvq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
+// CHECK-NEXT:    ret i16 [[TMP4]]
+//
+int16_t test_vmaxvq_p_s16(int16_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmaxvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vmaxvq_p_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxvq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+int32_t test_vmaxvq_p_s32(int32_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmaxvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vmaxvq_p_s32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxvq_p_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+// CHECK-NEXT:    ret i8 [[TMP4]]
+//
+uint8_t test_vmaxvq_p_u8(uint8_t a, uint8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmaxvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vmaxvq_p_u8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxvq_p_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
+// CHECK-NEXT:    ret i16 [[TMP4]]
+//
+uint16_t test_vmaxvq_p_u16(uint16_t a, uint16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmaxvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vmaxvq_p_u16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxvq_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vmaxvq_p_u32(uint32_t a, uint32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmaxvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vmaxvq_p_u32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminavq_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.minav.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+// CHECK-NEXT:    ret i8 [[TMP4]]
+//
+uint8_t test_vminavq_p_s8(uint8_t a, int8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vminavq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vminavq_p_s8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminavq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.minav.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
+// CHECK-NEXT:    ret i16 [[TMP4]]
+//
+uint16_t test_vminavq_p_s16(uint16_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vminavq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vminavq_p_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminavq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.minav.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vminavq_p_s32(uint32_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vminavq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vminavq_p_s32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxavq_p_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxav.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+// CHECK-NEXT:    ret i8 [[TMP4]]
+//
+uint8_t test_vmaxavq_p_s8(uint8_t a, int8x16_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmaxavq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vmaxavq_p_s8(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxavq_p_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxav.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
+// CHECK-NEXT:    ret i16 [[TMP4]]
+//
+uint16_t test_vmaxavq_p_s16(uint16_t a, int16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmaxavq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vmaxavq_p_s16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxavq_p_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.mve.maxav.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
+uint32_t test_vmaxavq_p_s32(uint32_t a, int32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmaxavq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vmaxavq_p_s32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminnmvq_p_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
+// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = call half @llvm.arm.mve.minnmv.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast float undef to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
+// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
+// CHECK-NEXT:    ret float [[TMP7]]
+//
+float16_t test_vminnmvq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vminnmvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vminnmvq_p_f16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminnmvq_p_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.arm.mve.minnmv.predicated.f32.v4f32.v4i1(float [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret float [[TMP2]]
+//
+float32_t test_vminnmvq_p_f32(float32_t a, float32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vminnmvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vminnmvq_p_f32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminnmavq_p_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
+// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = call half @llvm.arm.mve.minnmav.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast float undef to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
+// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
+// CHECK-NEXT:    ret float [[TMP7]]
+//
+float16_t test_vminnmavq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vminnmavq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vminnmavq_p_f16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vminnmavq_p_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.arm.mve.minnmav.predicated.f32.v4f32.v4i1(float [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret float [[TMP2]]
+//
+float32_t test_vminnmavq_p_f32(float32_t a, float32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vminnmavq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vminnmavq_p_f32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxnmvq_p_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
+// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = call half @llvm.arm.mve.maxnmv.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast float undef to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
+// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
+// CHECK-NEXT:    ret float [[TMP7]]
+//
+float16_t test_vmaxnmvq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmaxnmvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vmaxnmvq_p_f16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxnmvq_p_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.arm.mve.maxnmv.predicated.f32.v4f32.v4i1(float [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret float [[TMP2]]
+//
+float32_t test_vmaxnmvq_p_f32(float32_t a, float32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmaxnmvq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vmaxnmvq_p_f32(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxnmavq_p_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32
+// CHECK-NEXT:    [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = call half @llvm.arm.mve.maxnmav.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast half [[TMP4]] to i16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast float undef to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32
+// CHECK-NEXT:    [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536
+// CHECK-NEXT:    [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]]
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float
+// CHECK-NEXT:    ret float [[TMP7]]
+//
+float16_t test_vmaxnmavq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmaxnmavq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vmaxnmavq_p_f16(a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxnmavq_p_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.arm.mve.maxnmav.predicated.f32.v4f32.v4i1(float [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret float [[TMP2]]
+//
+float32_t test_vmaxnmavq_p_f32(float32_t a, float32x4_t b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vmaxnmavq_p(a, b, p);
+#else  /* POLYMORPHIC */
+  return vmaxnmavq_p_f32(a, b, p);
 #endif /* POLYMORPHIC */
 }
Index: clang/include/clang/Basic/arm_mve.td
===================================================================
--- clang/include/clang/Basic/arm_mve.td
+++ clang/include/clang/Basic/arm_mve.td
@@ -536,11 +536,42 @@
                                     (IRInt<"vmaxnma_predicated", [Vector,Predicate]> $a, $b, $pred)>;
 }
 
+multiclass Reduction<Type Accumulator, string basename, list<Type> basetypes,
+                     bit needSign = 0,
+                     dag postCG = (seq (id $ret)),
+                     dag accArg = (args Accumulator:$prev),
+                     dag preCG = (seq)> {
+  defvar intArgsBase   = (? $prev, $vec);
+  defvar intArgsUnpred = !con(intArgsBase,
+                              !if(needSign, (? (unsignedflag Scalar)), (?)));
+  defvar intArgsPred   = !con(intArgsUnpred, (? $pred));
+  defvar intUnpred     = !setop(intArgsUnpred, IRInt<basename, basetypes>);
+  defvar intPred       = !setop(intArgsPred, IRInt<
+            basename#"_predicated", !listconcat(basetypes, [Predicate])>);
+
+  def "": Intrinsic<
+    Accumulator, !con(accArg, (args Vector:$vec)),
+    !con(preCG, (seq intUnpred:$ret), postCG)>;
+  def _p: Intrinsic<
+    Accumulator, !con(accArg, (args Vector:$vec, Predicate:$pred)),
+    !con(preCG, (seq intPred:$ret), postCG)>;
+}
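+
+// Editorial note, not in the committed patch: each `defm ... : Reduction<...>`
+// below expands to an unpredicated intrinsic (def "") plus a predicated
+// `_p` variant sharing the same argument list and codegen sequence; when
+// needSign is set, an unsignedflag operand is appended so a single IR
+// intrinsic covers both the signed and unsigned source-level variants.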
+
 let params = T.Int in {
-def vminvq: Intrinsic<Scalar, (args Scalar:$prev, Vector:$vec),
-    (Scalar (IRInt<"minv", [Vector], 1> $prev, $vec))>;
-def vmaxvq: Intrinsic<Scalar, (args Scalar:$prev, Vector:$vec),
-    (Scalar (IRInt<"maxv", [Vector], 1> $prev, $vec))>;
+defm vminvq: Reduction<Scalar, "minv", [Vector], 1, (seq (Scalar $ret))>;
+defm vmaxvq: Reduction<Scalar, "maxv", [Vector], 1, (seq (Scalar $ret))>;
+}
+
+let params = T.Signed in {
+defm vminavq: Reduction<UScalar, "minav", [Vector], 0, (seq (UScalar $ret))>;
+defm vmaxavq: Reduction<UScalar, "maxav", [Vector], 0, (seq (UScalar $ret))>;
+}
+
+let params = T.Float in {
+defm vminnmvq: Reduction<Scalar, "minnmv", [Scalar, Vector]>;
+defm vmaxnmvq: Reduction<Scalar, "maxnmv", [Scalar, Vector]>;
+defm vminnmavq: Reduction<Scalar, "minnmav", [Scalar, Vector]>;
+defm vmaxnmavq: Reduction<Scalar, "maxnmav", [Scalar, Vector]>;
 }
 
 foreach half = [ "b", "t" ] in {
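
For readers following the review, here is a minimal usage sketch of the new
intrinsic families (editorial, not part of the committed patch; the function
names and seed values are illustrative assumptions, and an MVE-enabled target
such as -march=armv8.1-m.main+mve.fp is assumed):

  #include <arm_mve.h>
  #include <stdint.h>

  /* Seeding with the type's maximum turns vminvq_p into "minimum of the
     predicated-active lanes of v" (or INT16_MAX if no lane is active). */
  int16_t min_active_lane(int16x8_t v, mve_pred16_t p) {
    return vminvq_p_s16(INT16_MAX, v, p);
  }

  /* vmaxnmavq folds the largest absolute element of b into a, using the
     NaN-suppressing maxnm semantics. */
  float32_t max_abs_so_far(float32_t a, float32x4_t b) {
    return vmaxnmavq_f32(a, b);
  }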