[Clang][AArch64] Lower NEON fcvtz{u/s} intrinsics into fpto{u/s}i.sat (PR #191365)

Valeriy Savchenko via cfe-commits Fri, 10 Apr 2026 01:59:24 -0700

https://github.com/SavchenkoValeriy updated 
https://github.com/llvm/llvm-project/pull/191365


>From a404bd88561bd900d51d21690e3145c1d8571031 Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko <[email protected]>
Date: Wed, 8 Apr 2026 18:00:27 +0100
Subject: [PATCH] [Clang][AArch64] Lower NEON fcvtz{u/s} intrinsics into
 fpto{u/s}i.sat

---
 .../include/clang/Basic/AArch64CodeGenUtils.h |  38 +-
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      |  31 +-
 .../CodeGen/AArch64/neon-fcvt-intrinsics.c    |  16 +-
 .../AArch64/neon-intrinsics-constrained.c     |  24 +-
 clang/test/CodeGen/AArch64/neon-intrinsics.c  | 332 +++++++++---------
 clang/test/CodeGen/AArch64/neon-misc.c        |  38 +-
 .../v8.2a-fp16-intrinsics-constrained.c       |  12 +-
 .../CodeGen/AArch64/v8.2a-fp16-intrinsics.c   |  12 +-
 .../CodeGen/AArch64/v8.2a-neon-intrinsics.c   |  44 +--
 9 files changed, 272 insertions(+), 275 deletions(-)

diff --git a/clang/include/clang/Basic/AArch64CodeGenUtils.h 
b/clang/include/clang/Basic/AArch64CodeGenUtils.h
index 9a97f0001cb12..c747f1bf9d825 100644
--- a/clang/include/clang/Basic/AArch64CodeGenUtils.h
+++ b/clang/include/clang/Basic/AArch64CodeGenUtils.h
@@ -173,6 +173,12 @@ const inline ARMVectorIntrinsicInfo 
AArch64SIMDIntrinsicMap [] = {
   NEONMAP1(vcvt_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
   NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
   NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
+  NEONMAP1(vcvt_s16_f16, fptosi_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvt_s32_v, fptosi_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvt_s64_v, fptosi_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvt_u16_f16, fptoui_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvt_u32_v, fptoui_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvt_u64_v, fptoui_sat, AddRetType | Add1ArgType),
   NEONMAP0(vcvtq_f16_s16),
   NEONMAP0(vcvtq_f16_u16),
   NEONMAP0(vcvtq_f32_v),
@@ -186,6 +192,12 @@ const inline ARMVectorIntrinsicInfo 
AArch64SIMDIntrinsicMap [] = {
   NEONMAP1(vcvtq_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
   NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
   NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
+  NEONMAP1(vcvtq_s16_f16, fptosi_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtq_s32_v, fptosi_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtq_s64_v, fptosi_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtq_u16_f16, fptoui_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtq_u32_v, fptoui_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtq_u64_v, fptoui_sat, AddRetType | Add1ArgType),
   NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
   NEONMAP1(vdot_s32, aarch64_neon_sdot, 0),
   NEONMAP1(vdot_u32, aarch64_neon_udot, 0),
@@ -406,10 +418,10 @@ const inline ARMVectorIntrinsicInfo 
AArch64SISDIntrinsicMap[] = {
   NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
   NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
   NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
-  NEONMAP1(vcvtd_s32_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
-  NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
-  NEONMAP1(vcvtd_u32_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
-  NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtd_s32_f64, fptosi_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtd_s64_f64, fptosi_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtd_u32_f64, fptoui_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtd_u64_f64, fptoui_sat, AddRetType | Add1ArgType),
   NEONMAP0(vcvth_bf16_f32),
   NEONMAP1(vcvtmd_s32_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
   NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
@@ -439,10 +451,10 @@ const inline ARMVectorIntrinsicInfo 
AArch64SISDIntrinsicMap[] = {
   NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
   NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
   NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
-  NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
-  NEONMAP1(vcvts_s64_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
-  NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
-  NEONMAP1(vcvts_u64_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvts_s32_f32, fptosi_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvts_s64_f32, fptosi_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvts_u32_f32, fptoui_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvts_u64_f32, fptoui_sat, AddRetType | Add1ArgType),
   NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0),
   NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
   NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
@@ -621,10 +633,12 @@ const inline ARMVectorIntrinsicInfo 
AArch64SISDIntrinsicMap[] = {
   NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
   NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
   NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
-  NEONMAP1(vcvth_s32_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
-  NEONMAP1(vcvth_s64_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
-  NEONMAP1(vcvth_u32_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
-  NEONMAP1(vcvth_u64_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_s16_f16, fptosi_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_s32_f16, fptosi_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_s64_f16, fptosi_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_u16_f16, fptoui_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_u32_f16, fptoui_sat, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_u64_f16, fptoui_sat, AddRetType | Add1ArgType),
   NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
   NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
   NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp 
b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 8ec2f5b83085c..554038e4f7107 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -1384,6 +1384,12 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
   case NEON::BI__builtin_neon_vcvtq_s16_f16:
   case NEON::BI__builtin_neon_vcvtq_u16_f16: {
     Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type));
+    // AArch64 uses saturating FP-to-int intrinsics; ARM uses plain
+    // fptoui/fptosi.
+    if (Int) {
+      llvm::Type *Tys[2] = {Ty, Ops[0]->getType()};
+      return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtz");
+    }
     return Usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt")
                 : Builder.CreateFPToSI(Ops[0], Ty, "vcvt");
   }
@@ -5428,12 +5434,10 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
   case NEON::BI__builtin_neon_vcvtmh_u16_f16:
   case NEON::BI__builtin_neon_vcvtnh_u16_f16:
   case NEON::BI__builtin_neon_vcvtph_u16_f16:
-  case NEON::BI__builtin_neon_vcvth_u16_f16:
   case NEON::BI__builtin_neon_vcvtah_s16_f16:
   case NEON::BI__builtin_neon_vcvtmh_s16_f16:
   case NEON::BI__builtin_neon_vcvtnh_s16_f16:
-  case NEON::BI__builtin_neon_vcvtph_s16_f16:
-  case NEON::BI__builtin_neon_vcvth_s16_f16: {
+  case NEON::BI__builtin_neon_vcvtph_s16_f16: {
     llvm::Type *InTy = Int16Ty;
     llvm::Type* FTy  = HalfTy;
     llvm::Type *Tys[2] = {InTy, FTy};
@@ -5447,8 +5451,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
       Int = Intrinsic::aarch64_neon_fcvtnu; break;
     case NEON::BI__builtin_neon_vcvtph_u16_f16:
       Int = Intrinsic::aarch64_neon_fcvtpu; break;
-    case NEON::BI__builtin_neon_vcvth_u16_f16:
-      Int = Intrinsic::aarch64_neon_fcvtzu; break;
     case NEON::BI__builtin_neon_vcvtah_s16_f16:
       Int = Intrinsic::aarch64_neon_fcvtas; break;
     case NEON::BI__builtin_neon_vcvtmh_s16_f16:
@@ -5457,8 +5459,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
       Int = Intrinsic::aarch64_neon_fcvtns; break;
     case NEON::BI__builtin_neon_vcvtph_s16_f16:
       Int = Intrinsic::aarch64_neon_fcvtps; break;
-    case NEON::BI__builtin_neon_vcvth_s16_f16:
-      Int = Intrinsic::aarch64_neon_fcvtzs; break;
     }
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvt");
   }
@@ -6410,23 +6410,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
 
     return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt");
   }
-  case NEON::BI__builtin_neon_vcvt_s32_v:
-  case NEON::BI__builtin_neon_vcvt_u32_v:
-  case NEON::BI__builtin_neon_vcvt_s64_v:
-  case NEON::BI__builtin_neon_vcvt_u64_v:
-  case NEON::BI__builtin_neon_vcvt_s16_f16:
-  case NEON::BI__builtin_neon_vcvt_u16_f16:
-  case NEON::BI__builtin_neon_vcvtq_s32_v:
-  case NEON::BI__builtin_neon_vcvtq_u32_v:
-  case NEON::BI__builtin_neon_vcvtq_s64_v:
-  case NEON::BI__builtin_neon_vcvtq_u64_v:
-  case NEON::BI__builtin_neon_vcvtq_s16_f16:
-  case NEON::BI__builtin_neon_vcvtq_u16_f16: {
-    Int =
-        usgn ? Intrinsic::aarch64_neon_fcvtzu : Intrinsic::aarch64_neon_fcvtzs;
-    llvm::Type *Tys[2] = {Ty, GetFloatNeonType(this, Type)};
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtz");
-  }
   case NEON::BI__builtin_neon_vcvta_s16_f16:
   case NEON::BI__builtin_neon_vcvta_u16_f16:
   case NEON::BI__builtin_neon_vcvta_s32_v:
diff --git a/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c 
b/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c
index 929df94aa60ef..f14df46b89177 100644
--- a/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c
@@ -339,7 +339,7 @@ uint32_t test_vcvtpd_u32_f64(float64_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvts_s32_f32
 // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VCVTS_S32_F32_I:%.*]] = call i32 
@llvm.aarch64.neon.fcvtzs.i32.f32(float [[A]])
+// CHECK-NEXT:    [[VCVTS_S32_F32_I:%.*]] = call i32 
@llvm.fptosi.sat.i32.f32(float [[A]])
 // CHECK-NEXT:    ret i32 [[VCVTS_S32_F32_I]]
 //
 int32_t test_vcvts_s32_f32(float32_t a) {
@@ -349,7 +349,7 @@ int32_t test_vcvts_s32_f32(float32_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtd_s64_f64
 // CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VCVTD_S64_F64_I:%.*]] = call i64 
@llvm.aarch64.neon.fcvtzs.i64.f64(double [[A]])
+// CHECK-NEXT:    [[VCVTD_S64_F64_I:%.*]] = call i64 
@llvm.fptosi.sat.i64.f64(double [[A]])
 // CHECK-NEXT:    ret i64 [[VCVTD_S64_F64_I]]
 //
 int64_t test_vcvtd_s64_f64(float64_t a) {
@@ -359,7 +359,7 @@ int64_t test_vcvtd_s64_f64(float64_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvts_s64_f32
 // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VCVTS_S64_F32_I:%.*]] = call i64 
@llvm.aarch64.neon.fcvtzs.i64.f32(float [[A]])
+// CHECK-NEXT:    [[VCVTS_S64_F32_I:%.*]] = call i64 
@llvm.fptosi.sat.i64.f32(float [[A]])
 // CHECK-NEXT:    ret i64 [[VCVTS_S64_F32_I]]
 //
 int64_t test_vcvts_s64_f32(float32_t a) {
@@ -369,7 +369,7 @@ int64_t test_vcvts_s64_f32(float32_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtd_s32_f64
 // CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VCVTD_S32_F64_I:%.*]] = call i32 
@llvm.aarch64.neon.fcvtzs.i32.f64(double [[A]])
+// CHECK-NEXT:    [[VCVTD_S32_F64_I:%.*]] = call i32 
@llvm.fptosi.sat.i32.f64(double [[A]])
 // CHECK-NEXT:    ret i32 [[VCVTD_S32_F64_I]]
 //
 int32_t test_vcvtd_s32_f64(float64_t a) {
@@ -379,7 +379,7 @@ int32_t test_vcvtd_s32_f64(float64_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvts_u32_f32
 // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VCVTS_U32_F32_I:%.*]] = call i32 
@llvm.aarch64.neon.fcvtzu.i32.f32(float [[A]])
+// CHECK-NEXT:    [[VCVTS_U32_F32_I:%.*]] = call i32 
@llvm.fptoui.sat.i32.f32(float [[A]])
 // CHECK-NEXT:    ret i32 [[VCVTS_U32_F32_I]]
 //
 uint32_t test_vcvts_u32_f32(float32_t a) {
@@ -389,7 +389,7 @@ uint32_t test_vcvts_u32_f32(float32_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtd_u64_f64
 // CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VCVTD_U64_F64_I:%.*]] = call i64 
@llvm.aarch64.neon.fcvtzu.i64.f64(double [[A]])
+// CHECK-NEXT:    [[VCVTD_U64_F64_I:%.*]] = call i64 
@llvm.fptoui.sat.i64.f64(double [[A]])
 // CHECK-NEXT:    ret i64 [[VCVTD_U64_F64_I]]
 //
 uint64_t test_vcvtd_u64_f64(float64_t a) {
@@ -399,7 +399,7 @@ uint64_t test_vcvtd_u64_f64(float64_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvts_u64_f32
 // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VCVTS_U64_F32_I:%.*]] = call i64 
@llvm.aarch64.neon.fcvtzu.i64.f32(float [[A]])
+// CHECK-NEXT:    [[VCVTS_U64_F32_I:%.*]] = call i64 
@llvm.fptoui.sat.i64.f32(float [[A]])
 // CHECK-NEXT:    ret i64 [[VCVTS_U64_F32_I]]
 //
 uint64_t test_vcvts_u64_f32(float32_t a) {
@@ -409,7 +409,7 @@ uint64_t test_vcvts_u64_f32(float32_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtd_u32_f64
 // CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VCVTD_U32_F64_I:%.*]] = call i32 
@llvm.aarch64.neon.fcvtzu.i32.f64(double [[A]])
+// CHECK-NEXT:    [[VCVTD_U32_F64_I:%.*]] = call i32 
@llvm.fptoui.sat.i32.f64(double [[A]])
 // CHECK-NEXT:    ret i32 [[VCVTD_U32_F64_I]]
 //
 uint32_t test_vcvtd_u32_f64(float64_t a) {
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c 
b/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c
index ba32cfb7f3bae..d38fae31c44d1 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c
@@ -1522,9 +1522,9 @@ float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) {
 // UNCONSTRAINED-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
 // UNCONSTRAINED-NEXT:    [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = 
insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
 // UNCONSTRAINED-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> 
[[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// UNCONSTRAINED-NEXT:    [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x 
double>
-// UNCONSTRAINED-NEXT:    [[VCVTZ1_I:%.*]] = call <1 x i64> 
@llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[VCVTZ_I]])
-// UNCONSTRAINED-NEXT:    ret <1 x i64> [[VCVTZ1_I]]
+// UNCONSTRAINED-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x 
double>
+// UNCONSTRAINED-NEXT:    [[VCVTZ_I:%.*]] = call <1 x i64> 
@llvm.fptosi.sat.v1i64.v1f64(<1 x double> [[TMP2]])
+// UNCONSTRAINED-NEXT:    ret <1 x i64> [[VCVTZ_I]]
 //
 // CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_s64_f64(
 // CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
@@ -1532,9 +1532,9 @@ float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) {
 // CONSTRAINED-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
 // CONSTRAINED-NEXT:    [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = 
insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
 // CONSTRAINED-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> 
[[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CONSTRAINED-NEXT:    [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x 
double>
-// CONSTRAINED-NEXT:    [[VCVTZ1_I:%.*]] = call <1 x i64> 
@llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[VCVTZ_I]]) #[[ATTR3]]
-// CONSTRAINED-NEXT:    ret <1 x i64> [[VCVTZ1_I]]
+// CONSTRAINED-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x 
double>
+// CONSTRAINED-NEXT:    [[VCVTZ_I:%.*]] = call <1 x i64> 
@llvm.fptosi.sat.v1i64.v1f64(<1 x double> [[TMP2]]) #[[ATTR3]]
+// CONSTRAINED-NEXT:    ret <1 x i64> [[VCVTZ_I]]
 //
 int64x1_t test_vcvt_s64_f64(float64x1_t a) {
   return vcvt_s64_f64(a);
@@ -1546,9 +1546,9 @@ int64x1_t test_vcvt_s64_f64(float64x1_t a) {
 // UNCONSTRAINED-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
 // UNCONSTRAINED-NEXT:    [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = 
insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
 // UNCONSTRAINED-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> 
[[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// UNCONSTRAINED-NEXT:    [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x 
double>
-// UNCONSTRAINED-NEXT:    [[VCVTZ1_I:%.*]] = call <1 x i64> 
@llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[VCVTZ_I]])
-// UNCONSTRAINED-NEXT:    ret <1 x i64> [[VCVTZ1_I]]
+// UNCONSTRAINED-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x 
double>
+// UNCONSTRAINED-NEXT:    [[VCVTZ_I:%.*]] = call <1 x i64> 
@llvm.fptoui.sat.v1i64.v1f64(<1 x double> [[TMP2]])
+// UNCONSTRAINED-NEXT:    ret <1 x i64> [[VCVTZ_I]]
 //
 // CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_u64_f64(
 // CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
@@ -1556,9 +1556,9 @@ int64x1_t test_vcvt_s64_f64(float64x1_t a) {
 // CONSTRAINED-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
 // CONSTRAINED-NEXT:    [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = 
insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
 // CONSTRAINED-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> 
[[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CONSTRAINED-NEXT:    [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x 
double>
-// CONSTRAINED-NEXT:    [[VCVTZ1_I:%.*]] = call <1 x i64> 
@llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[VCVTZ_I]]) #[[ATTR3]]
-// CONSTRAINED-NEXT:    ret <1 x i64> [[VCVTZ1_I]]
+// CONSTRAINED-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x 
double>
+// CONSTRAINED-NEXT:    [[VCVTZ_I:%.*]] = call <1 x i64> 
@llvm.fptoui.sat.v1i64.v1f64(<1 x double> [[TMP2]]) #[[ATTR3]]
+// CONSTRAINED-NEXT:    ret <1 x i64> [[VCVTZ_I]]
 //
 uint64x1_t test_vcvt_u64_f64(float64x1_t a) {
   return vcvt_u64_f64(a);
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c 
b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index 80bb22cc43c78..8704a7827ad1d 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -10417,8 +10417,8 @@ uint32x4_t test_vrsubhn_high_u64(uint32x2_t r, 
uint64x2_t a, uint64x2_t b) {
 // CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_s8(
 // CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) 
#[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VABD_I_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
-// CHECK-NEXT:    [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
+// CHECK-NEXT:    [[VABD_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT:    [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16>
 // CHECK-NEXT:    ret <8 x i16> [[VMOVL_I_I]]
 //
 int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
@@ -10430,10 +10430,10 @@ int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VABD2_I_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
 // CHECK-NEXT:    [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
 // CHECK-NEXT:    ret <4 x i32> [[VMOVL_I_I]]
@@ -10447,10 +10447,10 @@ int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VABD2_I_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
 // CHECK-NEXT:    [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
 // CHECK-NEXT:    ret <2 x i64> [[VMOVL_I_I]]
@@ -10462,8 +10462,8 @@ int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
 // CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_u8(
 // CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) 
#[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VABD_I_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
-// CHECK-NEXT:    [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
+// CHECK-NEXT:    [[VABD_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT:    [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16>
 // CHECK-NEXT:    ret <8 x i16> [[VMOVL_I_I]]
 //
 uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
@@ -10475,10 +10475,10 @@ uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VABD2_I_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
 // CHECK-NEXT:    [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
 // CHECK-NEXT:    ret <4 x i32> [[VMOVL_I_I]]
@@ -10492,10 +10492,10 @@ uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) 
{
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VABD2_I_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
 // CHECK-NEXT:    [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
 // CHECK-NEXT:    ret <2 x i64> [[VMOVL_I_I]]
@@ -10507,8 +10507,8 @@ uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
 // CHECK-LABEL: define dso_local <8 x i16> @test_vabal_s8(
 // CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x 
i8> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VABD_I_I_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[B]], <8 x i8> [[C]])
-// CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x 
i16>
+// CHECK-NEXT:    [[VABD_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16>
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD_I]]
 //
@@ -10521,10 +10521,10 @@ int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, 
int8x8_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VABD2_I_I_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> 
[[VABD1_I_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
 // CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I]]
@@ -10539,10 +10539,10 @@ int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, 
int16x4_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VABD2_I_I_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> 
[[VABD1_I_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
 // CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I]]
@@ -10555,8 +10555,8 @@ int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, 
int32x2_t c) {
 // CHECK-LABEL: define dso_local <8 x i16> @test_vabal_u8(
 // CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x 
i8> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VABD_I_I_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[B]], <8 x i8> [[C]])
-// CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x 
i16>
+// CHECK-NEXT:    [[VABD_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16>
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD_I]]
 //
@@ -10569,10 +10569,10 @@ uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, 
uint8x8_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VABD2_I_I_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> 
[[VABD1_I_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
 // CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I]]
@@ -10587,10 +10587,10 @@ uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, 
uint16x4_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VABD2_I_I_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> 
[[VABD1_I_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
 // CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I]]
@@ -10605,8 +10605,8 @@ uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, 
uint32x2_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x 
i8> [[A]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x 
i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
-// CHECK-NEXT:    [[VABD_I_I_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
-// CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x 
i16>
+// CHECK-NEXT:    [[VABD_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
+// CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16>
 // CHECK-NEXT:    ret <8 x i16> [[VMOVL_I_I_I]]
 //
 int16x8_t test_vabdl_high_s8(int8x16_t a, int8x16_t b) {
@@ -10620,10 +10620,10 @@ int16x8_t test_vabdl_high_s8(int8x16_t a, int8x16_t 
b) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x 
i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VABD2_I_I_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> 
[[VABD1_I_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
 // CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
 // CHECK-NEXT:    ret <4 x i32> [[VMOVL_I_I_I]]
@@ -10639,10 +10639,10 @@ int32x4_t test_vabdl_high_s16(int16x8_t a, int16x8_t 
b) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x 
i32> [[B]], <2 x i32> <i32 2, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VABD2_I_I_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> 
[[VABD1_I_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
 // CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
 // CHECK-NEXT:    ret <2 x i64> [[VMOVL_I_I_I]]
@@ -10656,8 +10656,8 @@ int64x2_t test_vabdl_high_s32(int32x4_t a, int32x4_t b) 
{
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x 
i8> [[A]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x 
i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
-// CHECK-NEXT:    [[VABD_I_I_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
-// CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x 
i16>
+// CHECK-NEXT:    [[VABD_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
+// CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16>
 // CHECK-NEXT:    ret <8 x i16> [[VMOVL_I_I_I]]
 //
 uint16x8_t test_vabdl_high_u8(uint8x16_t a, uint8x16_t b) {
@@ -10671,10 +10671,10 @@ uint16x8_t test_vabdl_high_u8(uint8x16_t a, 
uint8x16_t b) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x 
i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VABD2_I_I_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> 
[[VABD1_I_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
 // CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
 // CHECK-NEXT:    ret <4 x i32> [[VMOVL_I_I_I]]
@@ -10690,10 +10690,10 @@ uint32x4_t test_vabdl_high_u16(uint16x8_t a, 
uint16x8_t b) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x 
i32> [[B]], <2 x i32> <i32 2, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VABD2_I_I_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> 
[[VABD1_I_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
 // CHECK-NEXT:    [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
 // CHECK-NEXT:    ret <2 x i64> [[VMOVL_I_I_I]]
@@ -10707,8 +10707,8 @@ uint64x2_t test_vabdl_high_u32(uint32x4_t a, uint32x4_t 
b) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x 
i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x 
i8> [[C]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
-// CHECK-NEXT:    [[VABD_I_I_I_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
-// CHECK-NEXT:    [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 
x i16>
+// CHECK-NEXT:    [[VABD_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
+// CHECK-NEXT:    [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16>
 // CHECK-NEXT:    [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I_I]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD_I_I]]
 //
@@ -10723,10 +10723,10 @@ int16x8_t test_vabal_high_s8(int16x8_t a, int8x16_t 
b, int8x16_t c) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x 
i16> [[C]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x 
i16>
-// CHECK-NEXT:    [[VABD2_I_I_I_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x i16> 
[[VABD1_I_I_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x 
i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
 // CHECK-NEXT:    [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
 // CHECK-NEXT:    [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I_I]]
@@ -10743,10 +10743,10 @@ int32x4_t test_vabal_high_s16(int32x4_t a, int16x8_t 
b, int16x8_t c) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x 
i32> [[C]], <2 x i32> <i32 2, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x 
i32>
-// CHECK-NEXT:    [[VABD2_I_I_I_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> 
[[VABD1_I_I_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x 
i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
 // CHECK-NEXT:    [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
 // CHECK-NEXT:    [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I_I]]
@@ -10761,8 +10761,8 @@ int64x2_t test_vabal_high_s32(int64x2_t a, int32x4_t b, 
int32x4_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x 
i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x 
i8> [[C]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
-// CHECK-NEXT:    [[VABD_I_I_I_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
-// CHECK-NEXT:    [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 
x i16>
+// CHECK-NEXT:    [[VABD_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
+// CHECK-NEXT:    [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16>
 // CHECK-NEXT:    [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I_I]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD_I_I]]
 //
@@ -10777,10 +10777,10 @@ uint16x8_t test_vabal_high_u8(uint16x8_t a, 
uint8x16_t b, uint8x16_t c) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x 
i16> [[C]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x 
i16>
-// CHECK-NEXT:    [[VABD2_I_I_I_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x i16> 
[[VABD1_I_I_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x 
i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
 // CHECK-NEXT:    [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
 // CHECK-NEXT:    [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I_I]]
@@ -10797,10 +10797,10 @@ uint32x4_t test_vabal_high_u16(uint32x4_t a, 
uint16x8_t b, uint16x8_t c) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x 
i32> [[C]], <2 x i32> <i32 2, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x 
i32>
-// CHECK-NEXT:    [[VABD2_I_I_I_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> 
[[VABD1_I_I_I_I]])
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x 
i8>
+// CHECK-NEXT:    [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
 // CHECK-NEXT:    [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
 // CHECK-NEXT:    [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I_I]]
@@ -10813,8 +10813,8 @@ uint64x2_t test_vabal_high_u32(uint64x2_t a, uint32x4_t 
b, uint32x4_t c) {
 // CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_s8(
 // CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x 
i8> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VMULL_I_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.smull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
-// CHECK-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.smull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD_I]]
 //
 int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
@@ -10826,10 +10826,10 @@ int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, 
int8x8_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> 
[[VMULL1_I_I]])
-// CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
+// CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
 //
 int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
@@ -10841,10 +10841,10 @@ int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, 
int16x4_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> 
[[VMULL1_I_I]])
-// CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
+// CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
 //
 int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
@@ -10854,8 +10854,8 @@ int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, 
int32x2_t c) {
 // CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_u8(
 // CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x 
i8> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VMULL_I_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.umull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
-// CHECK-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.umull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD_I]]
 //
 uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
@@ -10867,10 +10867,10 @@ uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, 
uint8x8_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> 
[[VMULL1_I_I]])
-// CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
+// CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
 //
 uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
@@ -10882,10 +10882,10 @@ uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, 
uint16x4_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> 
[[VMULL1_I_I]])
-// CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
+// CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
 //
 uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
@@ -10897,8 +10897,8 @@ uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, 
uint32x2_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x 
i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x 
i8> [[C]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
-// CHECK-NEXT:    [[VMULL_I_I_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
-// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
+// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD_I_I]]
 //
 int16x8_t test_vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
@@ -10912,10 +10912,10 @@ int16x8_t test_vmlal_high_s8(int16x8_t a, int8x16_t 
b, int8x16_t c) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x 
i16> [[C]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> 
[[VMULL1_I_I_I]])
-// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
+// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD_I_I]]
 //
 int32x4_t test_vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
@@ -10929,10 +10929,10 @@ int32x4_t test_vmlal_high_s16(int32x4_t a, int16x8_t 
b, int16x8_t c) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x 
i32> [[C]], <2 x i32> <i32 2, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> 
[[VMULL1_I_I_I]])
-// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
+// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD_I_I]]
 //
 int64x2_t test_vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
@@ -10944,8 +10944,8 @@ int64x2_t test_vmlal_high_s32(int64x2_t a, int32x4_t b, 
int32x4_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x 
i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x 
i8> [[C]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
-// CHECK-NEXT:    [[VMULL_I_I_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
-// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
+// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD_I_I]]
 //
 uint16x8_t test_vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
@@ -10959,10 +10959,10 @@ uint16x8_t test_vmlal_high_u8(uint16x8_t a, 
uint8x16_t b, uint8x16_t c) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x 
i16> [[C]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> 
[[VMULL1_I_I_I]])
-// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
+// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD_I_I]]
 //
 uint32x4_t test_vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
@@ -10976,10 +10976,10 @@ uint32x4_t test_vmlal_high_u16(uint32x4_t a, 
uint16x8_t b, uint16x8_t c) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x 
i32> [[C]], <2 x i32> <i32 2, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> 
[[VMULL1_I_I_I]])
-// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
+// CHECK-NEXT:    [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD_I_I]]
 //
 uint64x2_t test_vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
@@ -10989,8 +10989,8 @@ uint64x2_t test_vmlal_high_u32(uint64x2_t a, uint32x4_t 
b, uint32x4_t c) {
 // CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_s8(
 // CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x 
i8> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VMULL_I_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.smull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
-// CHECK-NEXT:    [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.smull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT:    [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I]]
 // CHECK-NEXT:    ret <8 x i16> [[SUB_I]]
 //
 int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
@@ -11002,10 +11002,10 @@ int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, 
int8x8_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> 
[[VMULL1_I_I]])
-// CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
+// CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
 //
 int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
@@ -11017,10 +11017,10 @@ int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, 
int16x4_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> 
[[VMULL1_I_I]])
-// CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
+// CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
 //
 int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
@@ -11030,8 +11030,8 @@ int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, 
int32x2_t c) {
 // CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_u8(
 // CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x 
i8> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VMULL_I_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.umull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
-// CHECK-NEXT:    [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.umull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT:    [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I]]
 // CHECK-NEXT:    ret <8 x i16> [[SUB_I]]
 //
 uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
@@ -11043,10 +11043,10 @@ uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, 
uint8x8_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> 
[[VMULL1_I_I]])
-// CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
+// CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
 //
 uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
@@ -11058,10 +11058,10 @@ uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, 
uint16x4_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> 
[[VMULL1_I_I]])
-// CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
+// CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
 //
 uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
@@ -11073,8 +11073,8 @@ uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, 
uint32x2_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x 
i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x 
i8> [[C]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
-// CHECK-NEXT:    [[VMULL_I_I_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
-// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
+// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I]]
 // CHECK-NEXT:    ret <8 x i16> [[SUB_I_I]]
 //
 int16x8_t test_vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
@@ -11088,10 +11088,10 @@ int16x8_t test_vmlsl_high_s8(int16x8_t a, int8x16_t 
b, int8x16_t c) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x 
i16> [[C]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> 
[[VMULL1_I_I_I]])
-// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
+// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB_I_I]]
 //
 int32x4_t test_vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
@@ -11105,10 +11105,10 @@ int32x4_t test_vmlsl_high_s16(int32x4_t a, int16x8_t 
b, int16x8_t c) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x 
i32> [[C]], <2 x i32> <i32 2, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> 
[[VMULL1_I_I_I]])
-// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
+// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB_I_I]]
 //
 int64x2_t test_vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
@@ -11120,8 +11120,8 @@ int64x2_t test_vmlsl_high_s32(int64x2_t a, int32x4_t b, 
int32x4_t c) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x 
i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x 
i8> [[C]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15>
-// CHECK-NEXT:    [[VMULL_I_I_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
-// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> 
[[SHUFFLE_I_I]])
+// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I]]
 // CHECK-NEXT:    ret <8 x i16> [[SUB_I_I]]
 //
 uint16x8_t test_vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
@@ -11135,10 +11135,10 @@ uint16x8_t test_vmlsl_high_u8(uint16x8_t a, 
uint8x16_t b, uint8x16_t c) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x 
i16> [[C]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> 
[[VMULL1_I_I_I]])
-// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
+// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB_I_I]]
 //
 uint32x4_t test_vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
@@ -11152,10 +11152,10 @@ uint32x4_t test_vmlsl_high_u16(uint32x4_t a, 
uint16x8_t b, uint16x8_t c) {
 // CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x 
i32> [[C]], <2 x i32> <i32 2, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> 
[[VMULL1_I_I_I]])
-// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT:    [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
+// CHECK-NEXT:    [[SUB_I_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB_I_I]]
 //
 uint64x2_t test_vmlsl_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
@@ -22513,9 +22513,9 @@ float64x1_t test_vneg_f64(float64x1_t a) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
 // CHECK-NEXT:    [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x 
i64> undef, i64 [[TMP0]], i32 0
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> 
[[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <1 x i64> 
@llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[VCVTZ_I]])
-// CHECK-NEXT:    ret <1 x i64> [[VCVTZ1_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT:    [[VCVTZ_I:%.*]] = call <1 x i64> 
@llvm.fptosi.sat.v1i64.v1f64(<1 x double> [[TMP2]])
+// CHECK-NEXT:    ret <1 x i64> [[VCVTZ_I]]
 //
 int64x1_t test_vcvt_s64_f64(float64x1_t a) {
   return vcvt_s64_f64(a);
@@ -22527,9 +22527,9 @@ int64x1_t test_vcvt_s64_f64(float64x1_t a) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
 // CHECK-NEXT:    [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x 
i64> undef, i64 [[TMP0]], i32 0
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> 
[[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <1 x i64> 
@llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[VCVTZ_I]])
-// CHECK-NEXT:    ret <1 x i64> [[VCVTZ1_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT:    [[VCVTZ_I:%.*]] = call <1 x i64> 
@llvm.fptoui.sat.v1i64.v1f64(<1 x double> [[TMP2]])
+// CHECK-NEXT:    ret <1 x i64> [[VCVTZ_I]]
 //
 uint64x1_t test_vcvt_u64_f64(float64x1_t a) {
   return vcvt_u64_f64(a);
diff --git a/clang/test/CodeGen/AArch64/neon-misc.c 
b/clang/test/CodeGen/AArch64/neon-misc.c
index ac2c83aa03ccf..9f6b49f993fb5 100644
--- a/clang/test/CodeGen/AArch64/neon-misc.c
+++ b/clang/test/CodeGen/AArch64/neon-misc.c
@@ -8,7 +8,7 @@
 #include <arm_neon.h>
 
 // CHECK-LABEL: define dso_local <8 x i8> @test_vcgez_s8(
-// CHECK-SAME: <8 x i8> noundef [[A:%.*]])  #[[ATTR0:[0-9]+]] {
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = icmp sge <8 x i8> [[A]], zeroinitializer
 // CHECK-NEXT:    [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
@@ -3152,9 +3152,9 @@ float64x2_t test_vrndiq_f64(float64x2_t a) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.fcvtzs.v2i32.v2f32(<2 x float> [[VCVTZ_I]])
-// CHECK-NEXT:    ret <2 x i32> [[VCVTZ1_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT:    [[VCVTZ_I:%.*]] = call <2 x i32> 
@llvm.fptosi.sat.v2i32.v2f32(<2 x float> [[TMP2]])
+// CHECK-NEXT:    ret <2 x i32> [[VCVTZ_I]]
 //
 int32x2_t test_vcvt_s32_f32(float32x2_t a) {
   return vcvt_s32_f32(a);
@@ -3165,9 +3165,9 @@ int32x2_t test_vcvt_s32_f32(float32x2_t a) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> [[VCVTZ_I]])
-// CHECK-NEXT:    ret <4 x i32> [[VCVTZ1_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT:    [[VCVTZ_I:%.*]] = call <4 x i32> 
@llvm.fptosi.sat.v4i32.v4f32(<4 x float> [[TMP2]])
+// CHECK-NEXT:    ret <4 x i32> [[VCVTZ_I]]
 //
 int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
   return vcvtq_s32_f32(a);
@@ -3178,9 +3178,9 @@ int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double> [[VCVTZ_I]])
-// CHECK-NEXT:    ret <2 x i64> [[VCVTZ1_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT:    [[VCVTZ_I:%.*]] = call <2 x i64> 
@llvm.fptosi.sat.v2i64.v2f64(<2 x double> [[TMP2]])
+// CHECK-NEXT:    ret <2 x i64> [[VCVTZ_I]]
 //
 int64x2_t test_vcvtq_s64_f64(float64x2_t a) {
   return vcvtq_s64_f64(a);
@@ -3191,9 +3191,9 @@ int64x2_t test_vcvtq_s64_f64(float64x2_t a) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.fcvtzu.v2i32.v2f32(<2 x float> [[VCVTZ_I]])
-// CHECK-NEXT:    ret <2 x i32> [[VCVTZ1_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT:    [[VCVTZ_I:%.*]] = call <2 x i32> 
@llvm.fptoui.sat.v2i32.v2f32(<2 x float> [[TMP2]])
+// CHECK-NEXT:    ret <2 x i32> [[VCVTZ_I]]
 //
 uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
   return vcvt_u32_f32(a);
@@ -3204,9 +3204,9 @@ uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> [[VCVTZ_I]])
-// CHECK-NEXT:    ret <4 x i32> [[VCVTZ1_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT:    [[VCVTZ_I:%.*]] = call <4 x i32> 
@llvm.fptoui.sat.v4i32.v4f32(<4 x float> [[TMP2]])
+// CHECK-NEXT:    ret <4 x i32> [[VCVTZ_I]]
 //
 uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
   return vcvtq_u32_f32(a);
@@ -3217,9 +3217,9 @@ uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double> [[VCVTZ_I]])
-// CHECK-NEXT:    ret <2 x i64> [[VCVTZ1_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT:    [[VCVTZ_I:%.*]] = call <2 x i64> 
@llvm.fptoui.sat.v2i64.v2f64(<2 x double> [[TMP2]])
+// CHECK-NEXT:    ret <2 x i64> [[VCVTZ_I]]
 //
 uint64x2_t test_vcvtq_u64_f64(float64x2_t a) {
   return vcvtq_u64_f64(a);
diff --git a/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics-constrained.c 
b/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics-constrained.c
index 4c19d75df96e2..df7c4b1ed4d02 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics-constrained.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics-constrained.c
@@ -105,42 +105,42 @@ float16_t test_vcvth_f16_u64 (uint64_t a) {
 }
 
 // COMMON-LABEL: test_vcvth_s16_f16
-// COMMONIR:       [[VCVT:%.*]] = call i16 
@llvm.aarch64.neon.fcvtzs.i16.f16(half %a)
+// COMMONIR:       [[VCVT:%.*]] = call i16 @llvm.fptosi.sat.i16.f16(half %a)
 // COMMONIR:       ret i16 [[VCVT]]
 int16_t test_vcvth_s16_f16 (float16_t a) {
   return vcvth_s16_f16(a);
 }
 
 // COMMON-LABEL: test_vcvth_s32_f16
-// COMMONIR:       [[VCVT:%.*]] = call i32 
@llvm.aarch64.neon.fcvtzs.i32.f16(half %a)
+// COMMONIR:       [[VCVT:%.*]] = call i32 @llvm.fptosi.sat.i32.f16(half %a)
 // COMMONIR:       ret i32 [[VCVT]]
 int32_t test_vcvth_s32_f16 (float16_t a) {
   return vcvth_s32_f16(a);
 }
 
 // COMMON-LABEL: test_vcvth_s64_f16
-// COMMONIR:       [[VCVT:%.*]] = call i64 
@llvm.aarch64.neon.fcvtzs.i64.f16(half %a)
+// COMMONIR:       [[VCVT:%.*]] = call i64 @llvm.fptosi.sat.i64.f16(half %a)
 // COMMONIR:       ret i64 [[VCVT]]
 int64_t test_vcvth_s64_f16 (float16_t a) {
   return vcvth_s64_f16(a);
 }
 
 // COMMON-LABEL: test_vcvth_u16_f16
-// COMMONIR:       [[VCVT:%.*]] = call i16 
@llvm.aarch64.neon.fcvtzu.i16.f16(half %a)
+// COMMONIR:       [[VCVT:%.*]] = call i16 @llvm.fptoui.sat.i16.f16(half %a)
 // COMMONIR:       ret i16 [[VCVT]]
 uint16_t test_vcvth_u16_f16 (float16_t a) {
   return vcvth_u16_f16(a);
 }
 
 // COMMON-LABEL: test_vcvth_u32_f16
-// COMMONIR:       [[VCVT:%.*]] = call i32 
@llvm.aarch64.neon.fcvtzu.i32.f16(half %a)
+// COMMONIR:       [[VCVT:%.*]] = call i32 @llvm.fptoui.sat.i32.f16(half %a)
 // COMMONIR:       ret i32 [[VCVT]]
 uint32_t test_vcvth_u32_f16 (float16_t a) {
   return vcvth_u32_f16(a);
 }
 
 // COMMON-LABEL: test_vcvth_u64_f16
-// COMMONIR:       [[VCVT:%.*]] = call i64 
@llvm.aarch64.neon.fcvtzu.i64.f16(half %a)
+// COMMONIR:       [[VCVT:%.*]] = call i64 @llvm.fptoui.sat.i64.f16(half %a)
 // COMMONIR:       ret i64 [[VCVT]]
 uint64_t test_vcvth_u64_f16 (float16_t a) {
   return vcvth_u64_f16(a);
diff --git a/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c 
b/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c
index c80d9e9d7f759..f4b7899b8360c 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c
@@ -82,42 +82,42 @@ float16_t test_vcvth_f16_u64 (uint64_t a) {
 }
 
 // CHECK-LABEL: test_vcvth_s16_f16
-// CHECK:  [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half %a)
+// CHECK:  [[VCVT:%.*]] = call i16 @llvm.fptosi.sat.i16.f16(half %a)
 // CHECK:  ret i16 [[VCVT]]
 int16_t test_vcvth_s16_f16 (float16_t a) {
   return vcvth_s16_f16(a);
 }
 
 // CHECK-LABEL: test_vcvth_s32_f16
-// CHECK:  [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f16(half %a)
+// CHECK:  [[VCVT:%.*]] = call i32 @llvm.fptosi.sat.i32.f16(half %a)
 // CHECK:  ret i32 [[VCVT]]
 int32_t test_vcvth_s32_f16 (float16_t a) {
   return vcvth_s32_f16(a);
 }
 
 // CHECK-LABEL: test_vcvth_s64_f16
-// CHECK:  [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f16(half %a)
+// CHECK:  [[VCVT:%.*]] = call i64 @llvm.fptosi.sat.i64.f16(half %a)
 // CHECK:  ret i64 [[VCVT]]
 int64_t test_vcvth_s64_f16 (float16_t a) {
   return vcvth_s64_f16(a);
 }
 
 // CHECK-LABEL: test_vcvth_u16_f16
-// CHECK:  [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half %a)
+// CHECK:  [[VCVT:%.*]] = call i16 @llvm.fptoui.sat.i16.f16(half %a)
 // CHECK:  ret i16 [[VCVT]]
 uint16_t test_vcvth_u16_f16 (float16_t a) {
   return vcvth_u16_f16(a);
 }
 
 // CHECK-LABEL: test_vcvth_u32_f16
-// CHECK:  [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f16(half %a)
+// CHECK:  [[VCVT:%.*]] = call i32 @llvm.fptoui.sat.i32.f16(half %a)
 // CHECK:  ret i32 [[VCVT]]
 uint32_t test_vcvth_u32_f16 (float16_t a) {
   return vcvth_u32_f16(a);
 }
 
 // CHECK-LABEL: test_vcvth_u64_f16
-// CHECK:  [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f16(half %a)
+// CHECK:  [[VCVT:%.*]] = call i64 @llvm.fptoui.sat.i64.f16(half %a)
 // CHECK:  ret i64 [[VCVT]]
 uint64_t test_vcvth_u64_f16 (float16_t a) {
   return vcvth_u64_f16(a);
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c 
b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
index 9c408e8c702fd..b8380bd8ed6d4 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
@@ -227,9 +227,9 @@ float16x8_t test_vcvtq_f16_u16 (uint16x8_t a) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> [[VCVTZ_I]])
-// CHECK-NEXT:    ret <4 x i16> [[VCVTZ1_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT:    [[VCVTZ_I:%.*]] = call <4 x i16> 
@llvm.fptosi.sat.v4i16.v4f16(<4 x half> [[TMP2]])
+// CHECK-NEXT:    ret <4 x i16> [[VCVTZ_I]]
 //
 int16x4_t test_vcvt_s16_f16 (float16x4_t a) {
   return vcvt_s16_f16(a);
@@ -240,9 +240,9 @@ int16x4_t test_vcvt_s16_f16 (float16x4_t a) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.fcvtzs.v8i16.v8f16(<8 x half> [[VCVTZ_I]])
-// CHECK-NEXT:    ret <8 x i16> [[VCVTZ1_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    [[VCVTZ_I:%.*]] = call <8 x i16> 
@llvm.fptosi.sat.v8i16.v8f16(<8 x half> [[TMP2]])
+// CHECK-NEXT:    ret <8 x i16> [[VCVTZ_I]]
 //
 int16x8_t test_vcvtq_s16_f16 (float16x8_t a) {
   return vcvtq_s16_f16(a);
@@ -253,9 +253,9 @@ int16x8_t test_vcvtq_s16_f16 (float16x8_t a) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> [[VCVTZ_I]])
-// CHECK-NEXT:    ret <4 x i16> [[VCVTZ1_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT:    [[VCVTZ_I:%.*]] = call <4 x i16> 
@llvm.fptoui.sat.v4i16.v4f16(<4 x half> [[TMP2]])
+// CHECK-NEXT:    ret <4 x i16> [[VCVTZ_I]]
 //
 uint16x4_t test_vcvt_u16_f16 (float16x4_t a) {
   return vcvt_u16_f16(a);
@@ -266,9 +266,9 @@ uint16x4_t test_vcvt_u16_f16 (float16x4_t a) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> [[VCVTZ_I]])
-// CHECK-NEXT:    ret <8 x i16> [[VCVTZ1_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    [[VCVTZ_I:%.*]] = call <8 x i16> 
@llvm.fptoui.sat.v8i16.v8f16(<8 x half> [[TMP2]])
+// CHECK-NEXT:    ret <8 x i16> [[VCVTZ_I]]
 //
 uint16x8_t test_vcvtq_u16_f16 (float16x8_t a) {
   return vcvtq_u16_f16(a);
@@ -1973,10 +1973,10 @@ float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t 
b, float16_t c) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CONV:%.*]] = fpext half [[B]] to float
 // CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[CONV]]
-// CHECK-NEXT:    [[TMP0:%.*]] = fptrunc float [[FNEG]] to half
+// CHECK-NEXT:    [[CONV1:%.*]] = fptrunc float [[FNEG]] to half
 // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3
-// CHECK-NEXT:    [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half 
[[EXTRACT]], half [[A]])
-// CHECK-NEXT:    ret half [[TMP1]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call half @llvm.fma.f16(half [[CONV1]], half 
[[EXTRACT]], half [[A]])
+// CHECK-NEXT:    ret half [[TMP0]]
 //
 float16_t test_vfmsh_lane_f16(float16_t a, float16_t b, float16x4_t c) {
   return vfmsh_lane_f16(a, b, c, 3);
@@ -1987,10 +1987,10 @@ float16_t test_vfmsh_lane_f16(float16_t a, float16_t b, 
float16x4_t c) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CONV:%.*]] = fpext half [[B]] to float
 // CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[CONV]]
-// CHECK-NEXT:    [[TMP0:%.*]] = fptrunc float [[FNEG]] to half
+// CHECK-NEXT:    [[CONV1:%.*]] = fptrunc float [[FNEG]] to half
 // CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7
-// CHECK-NEXT:    [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half 
[[EXTRACT]], half [[A]])
-// CHECK-NEXT:    ret half [[TMP1]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call half @llvm.fma.f16(half [[CONV1]], half 
[[EXTRACT]], half [[A]])
+// CHECK-NEXT:    ret half [[TMP0]]
 //
 float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
   return vfmsh_laneq_f16(a, b, c, 7);
@@ -2094,8 +2094,8 @@ float16x8_t test_vmulq_n_f16(float16x8_t a, float16_t b) {
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[VGET_LANE]] to half
 // CHECK-NEXT:    [[CONV3:%.*]] = fpext half [[TMP1]] to float
 // CHECK-NEXT:    [[MUL:%.*]] = fmul float [[CONV]], [[CONV3]]
-// CHECK-NEXT:    [[TMP2:%.*]] = fptrunc float [[MUL]] to half
-// CHECK-NEXT:    ret half [[TMP2]]
+// CHECK-NEXT:    [[CONV4:%.*]] = fptrunc float [[MUL]] to half
+// CHECK-NEXT:    ret half [[CONV4]]
 //
 float16_t test_vmulh_lane_f16(float16_t a, float16x4_t b) {
   return vmulh_lane_f16(a, b, 3);
@@ -2110,8 +2110,8 @@ float16_t test_vmulh_lane_f16(float16_t a, float16x4_t b) 
{
 // CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[VGETQ_LANE]] to half
 // CHECK-NEXT:    [[CONV3:%.*]] = fpext half [[TMP1]] to float
 // CHECK-NEXT:    [[MUL:%.*]] = fmul float [[CONV]], [[CONV3]]
-// CHECK-NEXT:    [[TMP2:%.*]] = fptrunc float [[MUL]] to half
-// CHECK-NEXT:    ret half [[TMP2]]
+// CHECK-NEXT:    [[CONV4:%.*]] = fptrunc float [[MUL]] to half
+// CHECK-NEXT:    ret half [[CONV4]]
 //
 float16_t test_vmulh_laneq_f16(float16_t a, float16x8_t b) {
   return vmulh_laneq_f16(a, b, 7);

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [Clang][AArch64] Lower NEON fcvtz{u/s} intrinsics into fpto{u/s}i.sat (PR #191365)

Reply via email to