https://github.com/SavchenkoValeriy updated https://github.com/llvm/llvm-project/pull/191365
>From a404bd88561bd900d51d21690e3145c1d8571031 Mon Sep 17 00:00:00 2001 From: Valeriy Savchenko <[email protected]> Date: Wed, 8 Apr 2026 18:00:27 +0100 Subject: [PATCH] [Clang][AArch64] Lower NEON fcvtz{u/s} intrinsics into fpto{u/s}i.sat --- .../include/clang/Basic/AArch64CodeGenUtils.h | 38 +- clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 31 +- .../CodeGen/AArch64/neon-fcvt-intrinsics.c | 16 +- .../AArch64/neon-intrinsics-constrained.c | 24 +- clang/test/CodeGen/AArch64/neon-intrinsics.c | 332 +++++++++--------- clang/test/CodeGen/AArch64/neon-misc.c | 38 +- .../v8.2a-fp16-intrinsics-constrained.c | 12 +- .../CodeGen/AArch64/v8.2a-fp16-intrinsics.c | 12 +- .../CodeGen/AArch64/v8.2a-neon-intrinsics.c | 44 +-- 9 files changed, 272 insertions(+), 275 deletions(-) diff --git a/clang/include/clang/Basic/AArch64CodeGenUtils.h b/clang/include/clang/Basic/AArch64CodeGenUtils.h index 9a97f0001cb12..c747f1bf9d825 100644 --- a/clang/include/clang/Basic/AArch64CodeGenUtils.h +++ b/clang/include/clang/Basic/AArch64CodeGenUtils.h @@ -173,6 +173,12 @@ const inline ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap [] = { NEONMAP1(vcvt_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0), + NEONMAP1(vcvt_s16_f16, fptosi_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvt_s32_v, fptosi_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvt_s64_v, fptosi_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvt_u16_f16, fptoui_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvt_u32_v, fptoui_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvt_u64_v, fptoui_sat, AddRetType | Add1ArgType), NEONMAP0(vcvtq_f16_s16), NEONMAP0(vcvtq_f16_u16), NEONMAP0(vcvtq_f32_v), @@ -186,6 +192,12 @@ const inline ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap [] = { NEONMAP1(vcvtq_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0), + NEONMAP1(vcvtq_s16_f16, fptosi_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvtq_s32_v, fptosi_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvtq_s64_v, fptosi_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvtq_u16_f16, fptoui_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvtq_u32_v, fptoui_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvtq_u64_v, fptoui_sat, AddRetType | Add1ArgType), NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType), NEONMAP1(vdot_s32, aarch64_neon_sdot, 0), NEONMAP1(vdot_u32, aarch64_neon_udot, 0), @@ -406,10 +418,10 @@ const inline ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = { NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), - NEONMAP1(vcvtd_s32_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), - NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), - NEONMAP1(vcvtd_u32_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), - NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), + NEONMAP1(vcvtd_s32_f64, fptosi_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvtd_s64_f64, fptosi_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvtd_u32_f64, fptoui_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvtd_u64_f64, fptoui_sat, AddRetType | Add1ArgType), NEONMAP0(vcvth_bf16_f32), NEONMAP1(vcvtmd_s32_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType), NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType), @@ -439,10 +451,10 @@ const inline ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = { NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), - NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), - NEONMAP1(vcvts_s64_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), - NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), - NEONMAP1(vcvts_u64_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), + NEONMAP1(vcvts_s32_f32, fptosi_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvts_s64_f32, fptosi_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvts_u32_f32, fptoui_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvts_u64_f32, fptoui_sat, AddRetType | Add1ArgType), NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0), NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), @@ -621,10 +633,12 @@ const inline ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = { NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), - NEONMAP1(vcvth_s32_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), - NEONMAP1(vcvth_s64_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), - NEONMAP1(vcvth_u32_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), - NEONMAP1(vcvth_u64_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), + NEONMAP1(vcvth_s16_f16, fptosi_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvth_s32_f16, fptosi_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvth_s64_f16, fptosi_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvth_u16_f16, fptoui_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvth_u32_f16, fptoui_sat, AddRetType | Add1ArgType), + NEONMAP1(vcvth_u64_f16, fptoui_sat, AddRetType | Add1ArgType), NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType), NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType), NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index 8ec2f5b83085c..554038e4f7107 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -1384,6 +1384,12 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vcvtq_s16_f16: case NEON::BI__builtin_neon_vcvtq_u16_f16: { Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type)); + // AArch64 uses saturating FP-to-int intrinsics; ARM uses plain + // fptoui/fptosi. + if (Int) { + llvm::Type *Tys[2] = {Ty, Ops[0]->getType()}; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtz"); + } return Usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt") : Builder.CreateFPToSI(Ops[0], Ty, "vcvt"); } @@ -5428,12 +5434,10 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vcvtmh_u16_f16: case NEON::BI__builtin_neon_vcvtnh_u16_f16: case NEON::BI__builtin_neon_vcvtph_u16_f16: - case NEON::BI__builtin_neon_vcvth_u16_f16: case NEON::BI__builtin_neon_vcvtah_s16_f16: case NEON::BI__builtin_neon_vcvtmh_s16_f16: case NEON::BI__builtin_neon_vcvtnh_s16_f16: - case NEON::BI__builtin_neon_vcvtph_s16_f16: - case NEON::BI__builtin_neon_vcvth_s16_f16: { + case NEON::BI__builtin_neon_vcvtph_s16_f16: { llvm::Type *InTy = Int16Ty; llvm::Type* FTy = HalfTy; llvm::Type *Tys[2] = {InTy, FTy}; @@ -5447,8 +5451,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Int = Intrinsic::aarch64_neon_fcvtnu; break; case NEON::BI__builtin_neon_vcvtph_u16_f16: Int = Intrinsic::aarch64_neon_fcvtpu; break; - case NEON::BI__builtin_neon_vcvth_u16_f16: - Int = Intrinsic::aarch64_neon_fcvtzu; break; case NEON::BI__builtin_neon_vcvtah_s16_f16: Int = Intrinsic::aarch64_neon_fcvtas; break; case NEON::BI__builtin_neon_vcvtmh_s16_f16: @@ -5457,8 +5459,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Int = Intrinsic::aarch64_neon_fcvtns; break; case NEON::BI__builtin_neon_vcvtph_s16_f16: Int = Intrinsic::aarch64_neon_fcvtps; break; - case NEON::BI__builtin_neon_vcvth_s16_f16: - Int = Intrinsic::aarch64_neon_fcvtzs; break; } return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvt"); } @@ -6410,23 +6410,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt"); } - case NEON::BI__builtin_neon_vcvt_s32_v: - case NEON::BI__builtin_neon_vcvt_u32_v: - case NEON::BI__builtin_neon_vcvt_s64_v: - case NEON::BI__builtin_neon_vcvt_u64_v: - case NEON::BI__builtin_neon_vcvt_s16_f16: - case NEON::BI__builtin_neon_vcvt_u16_f16: - case NEON::BI__builtin_neon_vcvtq_s32_v: - case NEON::BI__builtin_neon_vcvtq_u32_v: - case NEON::BI__builtin_neon_vcvtq_s64_v: - case NEON::BI__builtin_neon_vcvtq_u64_v: - case NEON::BI__builtin_neon_vcvtq_s16_f16: - case NEON::BI__builtin_neon_vcvtq_u16_f16: { - Int = - usgn ? Intrinsic::aarch64_neon_fcvtzu : Intrinsic::aarch64_neon_fcvtzs; - llvm::Type *Tys[2] = {Ty, GetFloatNeonType(this, Type)}; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtz"); - } case NEON::BI__builtin_neon_vcvta_s16_f16: case NEON::BI__builtin_neon_vcvta_u16_f16: case NEON::BI__builtin_neon_vcvta_s32_v: diff --git a/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c b/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c index 929df94aa60ef..f14df46b89177 100644 --- a/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c +++ b/clang/test/CodeGen/AArch64/neon-fcvt-intrinsics.c @@ -339,7 +339,7 @@ uint32_t test_vcvtpd_u32_f64(float64_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvts_s32_f32 // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VCVTS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float [[A]]) +// CHECK-NEXT: [[VCVTS_S32_F32_I:%.*]] = call i32 @llvm.fptosi.sat.i32.f32(float [[A]]) // CHECK-NEXT: ret i32 [[VCVTS_S32_F32_I]] // int32_t test_vcvts_s32_f32(float32_t a) { @@ -349,7 +349,7 @@ int32_t test_vcvts_s32_f32(float32_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvtd_s64_f64 // CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VCVTD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double [[A]]) +// CHECK-NEXT: [[VCVTD_S64_F64_I:%.*]] = call i64 @llvm.fptosi.sat.i64.f64(double [[A]]) // CHECK-NEXT: ret i64 [[VCVTD_S64_F64_I]] // int64_t test_vcvtd_s64_f64(float64_t a) { @@ -359,7 +359,7 @@ int64_t test_vcvtd_s64_f64(float64_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvts_s64_f32 // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VCVTS_S64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float [[A]]) +// CHECK-NEXT: [[VCVTS_S64_F32_I:%.*]] = call i64 @llvm.fptosi.sat.i64.f32(float [[A]]) // CHECK-NEXT: ret i64 [[VCVTS_S64_F32_I]] // int64_t test_vcvts_s64_f32(float32_t a) { @@ -369,7 +369,7 @@ int64_t test_vcvts_s64_f32(float32_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvtd_s32_f64 // CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VCVTD_S32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double [[A]]) +// CHECK-NEXT: [[VCVTD_S32_F64_I:%.*]] = call i32 @llvm.fptosi.sat.i32.f64(double [[A]]) // CHECK-NEXT: ret i32 [[VCVTD_S32_F64_I]] // int32_t test_vcvtd_s32_f64(float64_t a) { @@ -379,7 +379,7 @@ int32_t test_vcvtd_s32_f64(float64_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvts_u32_f32 // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VCVTS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float [[A]]) +// CHECK-NEXT: [[VCVTS_U32_F32_I:%.*]] = call i32 @llvm.fptoui.sat.i32.f32(float [[A]]) // CHECK-NEXT: ret i32 [[VCVTS_U32_F32_I]] // uint32_t test_vcvts_u32_f32(float32_t a) { @@ -389,7 +389,7 @@ uint32_t test_vcvts_u32_f32(float32_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvtd_u64_f64 // CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VCVTD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double [[A]]) +// CHECK-NEXT: [[VCVTD_U64_F64_I:%.*]] = call i64 @llvm.fptoui.sat.i64.f64(double [[A]]) // CHECK-NEXT: ret i64 [[VCVTD_U64_F64_I]] // uint64_t test_vcvtd_u64_f64(float64_t a) { @@ -399,7 +399,7 @@ uint64_t test_vcvtd_u64_f64(float64_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvts_u64_f32 // CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VCVTS_U64_F32_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float [[A]]) +// CHECK-NEXT: [[VCVTS_U64_F32_I:%.*]] = call i64 @llvm.fptoui.sat.i64.f32(float [[A]]) // CHECK-NEXT: ret i64 [[VCVTS_U64_F32_I]] // uint64_t test_vcvts_u64_f32(float32_t a) { @@ -409,7 +409,7 @@ uint64_t test_vcvts_u64_f32(float32_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvtd_u32_f64 // CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VCVTD_U32_F64_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double [[A]]) +// CHECK-NEXT: [[VCVTD_U32_F64_I:%.*]] = call i32 @llvm.fptoui.sat.i32.f64(double [[A]]) // CHECK-NEXT: ret i32 [[VCVTD_U32_F64_I]] // uint32_t test_vcvtd_u32_f64(float64_t a) { diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c b/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c index ba32cfb7f3bae..d38fae31c44d1 100644 --- a/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c +++ b/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c @@ -1522,9 +1522,9 @@ float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) { // UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 // UNCONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 // UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> -// UNCONSTRAINED-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// UNCONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[VCVTZ_I]]) -// UNCONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]] +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// UNCONSTRAINED-NEXT: [[VCVTZ_I:%.*]] = call <1 x i64> @llvm.fptosi.sat.v1i64.v1f64(<1 x double> [[TMP2]]) +// UNCONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ_I]] // // CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_s64_f64( // CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { @@ -1532,9 +1532,9 @@ float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) { // CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 // CONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 // CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> -// CONSTRAINED-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// CONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[VCVTZ_I]]) #[[ATTR3]] -// CONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]] +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CONSTRAINED-NEXT: [[VCVTZ_I:%.*]] = call <1 x i64> @llvm.fptosi.sat.v1i64.v1f64(<1 x double> [[TMP2]]) #[[ATTR3]] +// CONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ_I]] // int64x1_t test_vcvt_s64_f64(float64x1_t a) { return vcvt_s64_f64(a); @@ -1546,9 +1546,9 @@ int64x1_t test_vcvt_s64_f64(float64x1_t a) { // UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 // UNCONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 // UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> -// UNCONSTRAINED-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// UNCONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[VCVTZ_I]]) -// UNCONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]] +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// UNCONSTRAINED-NEXT: [[VCVTZ_I:%.*]] = call <1 x i64> @llvm.fptoui.sat.v1i64.v1f64(<1 x double> [[TMP2]]) +// UNCONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ_I]] // // CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_u64_f64( // CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { @@ -1556,9 +1556,9 @@ int64x1_t test_vcvt_s64_f64(float64x1_t a) { // CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 // CONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 // CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> -// CONSTRAINED-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// CONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[VCVTZ_I]]) #[[ATTR3]] -// CONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]] +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CONSTRAINED-NEXT: [[VCVTZ_I:%.*]] = call <1 x i64> @llvm.fptoui.sat.v1i64.v1f64(<1 x double> [[TMP2]]) #[[ATTR3]] +// CONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ_I]] // uint64x1_t test_vcvt_u64_f64(float64x1_t a) { return vcvt_u64_f64(a); diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c index 80bb22cc43c78..8704a7827ad1d 100644 --- a/clang/test/CodeGen/AArch64/neon-intrinsics.c +++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c @@ -10417,8 +10417,8 @@ uint32x4_t test_vrsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { // CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_s8( // CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) -// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16> +// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16> // CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I]] // int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) { @@ -10430,10 +10430,10 @@ int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> // CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I]] @@ -10447,10 +10447,10 @@ int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> // CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> // CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I]] @@ -10462,8 +10462,8 @@ int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) { // CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_u8( // CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) -// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16> +// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16> // CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I]] // uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) { @@ -10475,10 +10475,10 @@ uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> // CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I]] @@ -10492,10 +10492,10 @@ uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> // CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> // CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I]] @@ -10507,8 +10507,8 @@ uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) { // CHECK-LABEL: define dso_local <8 x i16> @test_vabal_s8( // CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[B]], <8 x i8> [[C]]) -// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16> // CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I]] // CHECK-NEXT: ret <8 x i16> [[ADD_I]] // @@ -10521,10 +10521,10 @@ int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> // CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I]] @@ -10539,10 +10539,10 @@ int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> // CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I]] @@ -10555,8 +10555,8 @@ int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { // CHECK-LABEL: define dso_local <8 x i16> @test_vabal_u8( // CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[B]], <8 x i8> [[C]]) -// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16> // CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I]] // CHECK-NEXT: ret <8 x i16> [[ADD_I]] // @@ -10569,10 +10569,10 @@ uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> // CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I]] @@ -10587,10 +10587,10 @@ uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> // CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I]] @@ -10605,8 +10605,8 @@ uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> -// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) -// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16> // CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I_I]] // int16x8_t test_vabdl_high_s8(int8x16_t a, int8x16_t b) { @@ -10620,10 +10620,10 @@ int16x8_t test_vabdl_high_s8(int8x16_t a, int8x16_t b) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> // CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I_I]] @@ -10639,10 +10639,10 @@ int32x4_t test_vabdl_high_s16(int16x8_t a, int16x8_t b) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> // CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> // CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I_I]] @@ -10656,8 +10656,8 @@ int64x2_t test_vabdl_high_s32(int32x4_t a, int32x4_t b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> -// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) -// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16> // CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I_I]] // uint16x8_t test_vabdl_high_u8(uint8x16_t a, uint8x16_t b) { @@ -10671,10 +10671,10 @@ uint16x8_t test_vabdl_high_u8(uint8x16_t a, uint8x16_t b) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> // CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I_I]] @@ -10690,10 +10690,10 @@ uint32x4_t test_vabdl_high_u16(uint16x8_t a, uint16x8_t b) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> // CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> // CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I_I]] @@ -10707,8 +10707,8 @@ uint64x2_t test_vabdl_high_u32(uint32x4_t a, uint32x4_t b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> [[C]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> -// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) -// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16> +// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16> // CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I_I]] // CHECK-NEXT: ret <8 x i16> [[ADD_I_I]] // @@ -10723,10 +10723,10 @@ int16x8_t test_vabal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> [[C]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x i16> [[VABD1_I_I_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> // CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> // CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I_I]] @@ -10743,10 +10743,10 @@ int32x4_t test_vabal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> [[C]], <2 x i32> <i32 2, i32 3> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> [[VABD1_I_I_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> // CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> // CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I_I]] @@ -10761,8 +10761,8 @@ int64x2_t test_vabal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> [[C]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> -// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) -// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16> +// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I]] to <8 x i16> // CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I_I]] // CHECK-NEXT: ret <8 x i16> [[ADD_I_I]] // @@ -10777,10 +10777,10 @@ uint16x8_t test_vabal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> [[C]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x i16> [[VABD1_I_I_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> // CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> // CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I_I]] @@ -10797,10 +10797,10 @@ uint32x4_t test_vabal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> [[C]], <2 x i32> <i32 2, i32 3> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> [[VABD1_I_I_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> // CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> // CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I_I]] @@ -10813,8 +10813,8 @@ uint64x2_t test_vabal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { // CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_s8( // CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) -// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I]] // CHECK-NEXT: ret <8 x i16> [[ADD_I]] // int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { @@ -10826,10 +10826,10 @@ int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) -// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD_I]] // int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { @@ -10841,10 +10841,10 @@ int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) -// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD_I]] // int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { @@ -10854,8 +10854,8 @@ int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { // CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_u8( // CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) -// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I]] // CHECK-NEXT: ret <8 x i16> [[ADD_I]] // uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { @@ -10867,10 +10867,10 @@ uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) -// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD_I]] // uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { @@ -10882,10 +10882,10 @@ uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) -// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD_I]] // uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { @@ -10897,8 +10897,8 @@ uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> [[C]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> -// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) -// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I]] // CHECK-NEXT: ret <8 x i16> [[ADD_I_I]] // int16x8_t test_vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { @@ -10912,10 +10912,10 @@ int16x8_t test_vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> [[C]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) -// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD_I_I]] // int32x4_t test_vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { @@ -10929,10 +10929,10 @@ int32x4_t test_vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> [[C]], <2 x i32> <i32 2, i32 3> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) -// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD_I_I]] // int64x2_t test_vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { @@ -10944,8 +10944,8 @@ int64x2_t test_vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> [[C]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> -// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) -// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I]] // CHECK-NEXT: ret <8 x i16> [[ADD_I_I]] // uint16x8_t test_vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { @@ -10959,10 +10959,10 @@ uint16x8_t test_vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> [[C]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) -// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD_I_I]] // uint32x4_t test_vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { @@ -10976,10 +10976,10 @@ uint32x4_t test_vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> [[C]], <2 x i32> <i32 2, i32 3> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) -// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD_I_I]] // uint64x2_t test_vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { @@ -10989,8 +10989,8 @@ uint64x2_t test_vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { // CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_s8( // CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) -// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I]] // CHECK-NEXT: ret <8 x i16> [[SUB_I]] // int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) { @@ -11002,10 +11002,10 @@ int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) -// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB_I]] // int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { @@ -11017,10 +11017,10 @@ int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) -// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB_I]] // int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { @@ -11030,8 +11030,8 @@ int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { // CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_u8( // CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) -// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I]] // CHECK-NEXT: ret <8 x i16> [[SUB_I]] // uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { @@ -11043,10 +11043,10 @@ uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) -// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB_I]] // uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { @@ -11058,10 +11058,10 @@ uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) -// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB_I]] // uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { @@ -11073,8 +11073,8 @@ uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> [[C]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> -// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) -// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I]] // CHECK-NEXT: ret <8 x i16> [[SUB_I_I]] // int16x8_t test_vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { @@ -11088,10 +11088,10 @@ int16x8_t test_vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> [[C]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) -// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB_I_I]] // int32x4_t test_vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { @@ -11105,10 +11105,10 @@ int32x4_t test_vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> [[C]], <2 x i32> <i32 2, i32 3> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) -// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB_I_I]] // int64x2_t test_vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { @@ -11120,8 +11120,8 @@ int64x2_t test_vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> [[C]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> -// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) -// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I]] // CHECK-NEXT: ret <8 x i16> [[SUB_I_I]] // uint16x8_t test_vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { @@ -11135,10 +11135,10 @@ uint16x8_t test_vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> [[C]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) -// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB_I_I]] // uint32x4_t test_vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { @@ -11152,10 +11152,10 @@ uint32x4_t test_vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> [[C]], <2 x i32> <i32 2, i32 3> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) -// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB_I_I]] // uint64x2_t test_vmlsl_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { @@ -22513,9 +22513,9 @@ float64x1_t test_vneg_f64(float64x1_t a) { // CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 // CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> -// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[VCVTZ_I]]) -// CHECK-NEXT: ret <1 x i64> [[VCVTZ1_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = call <1 x i64> @llvm.fptosi.sat.v1i64.v1f64(<1 x double> [[TMP2]]) +// CHECK-NEXT: ret <1 x i64> [[VCVTZ_I]] // int64x1_t test_vcvt_s64_f64(float64x1_t a) { return vcvt_s64_f64(a); @@ -22527,9 +22527,9 @@ int64x1_t test_vcvt_s64_f64(float64x1_t a) { // CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 // CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> -// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[VCVTZ_I]]) -// CHECK-NEXT: ret <1 x i64> [[VCVTZ1_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = call <1 x i64> @llvm.fptoui.sat.v1i64.v1f64(<1 x double> [[TMP2]]) +// CHECK-NEXT: ret <1 x i64> [[VCVTZ_I]] // uint64x1_t test_vcvt_u64_f64(float64x1_t a) { return vcvt_u64_f64(a); diff --git a/clang/test/CodeGen/AArch64/neon-misc.c b/clang/test/CodeGen/AArch64/neon-misc.c index ac2c83aa03ccf..9f6b49f993fb5 100644 --- a/clang/test/CodeGen/AArch64/neon-misc.c +++ b/clang/test/CodeGen/AArch64/neon-misc.c @@ -8,7 +8,7 @@ #include <arm_neon.h> // CHECK-LABEL: define dso_local <8 x i8> @test_vcgez_s8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = icmp sge <8 x i8> [[A]], zeroinitializer // CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> @@ -3152,9 +3152,9 @@ float64x2_t test_vrndiq_f64(float64x2_t a) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzs.v2i32.v2f32(<2 x float> [[VCVTZ_I]]) -// CHECK-NEXT: ret <2 x i32> [[VCVTZ1_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> [[TMP2]]) +// CHECK-NEXT: ret <2 x i32> [[VCVTZ_I]] // int32x2_t test_vcvt_s32_f32(float32x2_t a) { return vcvt_s32_f32(a); @@ -3165,9 +3165,9 @@ int32x2_t test_vcvt_s32_f32(float32x2_t a) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> [[VCVTZ_I]]) -// CHECK-NEXT: ret <4 x i32> [[VCVTZ1_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> [[TMP2]]) +// CHECK-NEXT: ret <4 x i32> [[VCVTZ_I]] // int32x4_t test_vcvtq_s32_f32(float32x4_t a) { return vcvtq_s32_f32(a); @@ -3178,9 +3178,9 @@ int32x4_t test_vcvtq_s32_f32(float32x4_t a) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double> [[VCVTZ_I]]) -// CHECK-NEXT: ret <2 x i64> [[VCVTZ1_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> [[TMP2]]) +// CHECK-NEXT: ret <2 x i64> [[VCVTZ_I]] // int64x2_t test_vcvtq_s64_f64(float64x2_t a) { return vcvtq_s64_f64(a); @@ -3191,9 +3191,9 @@ int64x2_t test_vcvtq_s64_f64(float64x2_t a) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzu.v2i32.v2f32(<2 x float> [[VCVTZ_I]]) -// CHECK-NEXT: ret <2 x i32> [[VCVTZ1_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> [[TMP2]]) +// CHECK-NEXT: ret <2 x i32> [[VCVTZ_I]] // uint32x2_t test_vcvt_u32_f32(float32x2_t a) { return vcvt_u32_f32(a); @@ -3204,9 +3204,9 @@ uint32x2_t test_vcvt_u32_f32(float32x2_t a) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> [[VCVTZ_I]]) -// CHECK-NEXT: ret <4 x i32> [[VCVTZ1_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> [[TMP2]]) +// CHECK-NEXT: ret <4 x i32> [[VCVTZ_I]] // uint32x4_t test_vcvtq_u32_f32(float32x4_t a) { return vcvtq_u32_f32(a); @@ -3217,9 +3217,9 @@ uint32x4_t test_vcvtq_u32_f32(float32x4_t a) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double> [[VCVTZ_I]]) -// CHECK-NEXT: ret <2 x i64> [[VCVTZ1_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> [[TMP2]]) +// CHECK-NEXT: ret <2 x i64> [[VCVTZ_I]] // uint64x2_t test_vcvtq_u64_f64(float64x2_t a) { return vcvtq_u64_f64(a); diff --git a/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics-constrained.c b/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics-constrained.c index 4c19d75df96e2..df7c4b1ed4d02 100644 --- a/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics-constrained.c +++ b/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics-constrained.c @@ -105,42 +105,42 @@ float16_t test_vcvth_f16_u64 (uint64_t a) { } // COMMON-LABEL: test_vcvth_s16_f16 -// COMMONIR: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half %a) +// COMMONIR: [[VCVT:%.*]] = call i16 @llvm.fptosi.sat.i16.f16(half %a) // COMMONIR: ret i16 [[VCVT]] int16_t test_vcvth_s16_f16 (float16_t a) { return vcvth_s16_f16(a); } // COMMON-LABEL: test_vcvth_s32_f16 -// COMMONIR: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f16(half %a) +// COMMONIR: [[VCVT:%.*]] = call i32 @llvm.fptosi.sat.i32.f16(half %a) // COMMONIR: ret i32 [[VCVT]] int32_t test_vcvth_s32_f16 (float16_t a) { return vcvth_s32_f16(a); } // COMMON-LABEL: test_vcvth_s64_f16 -// COMMONIR: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f16(half %a) +// COMMONIR: [[VCVT:%.*]] = call i64 @llvm.fptosi.sat.i64.f16(half %a) // COMMONIR: ret i64 [[VCVT]] int64_t test_vcvth_s64_f16 (float16_t a) { return vcvth_s64_f16(a); } // COMMON-LABEL: test_vcvth_u16_f16 -// COMMONIR: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half %a) +// COMMONIR: [[VCVT:%.*]] = call i16 @llvm.fptoui.sat.i16.f16(half %a) // COMMONIR: ret i16 [[VCVT]] uint16_t test_vcvth_u16_f16 (float16_t a) { return vcvth_u16_f16(a); } // COMMON-LABEL: test_vcvth_u32_f16 -// COMMONIR: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f16(half %a) +// COMMONIR: [[VCVT:%.*]] = call i32 @llvm.fptoui.sat.i32.f16(half %a) // COMMONIR: ret i32 [[VCVT]] uint32_t test_vcvth_u32_f16 (float16_t a) { return vcvth_u32_f16(a); } // COMMON-LABEL: test_vcvth_u64_f16 -// COMMONIR: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f16(half %a) +// COMMONIR: [[VCVT:%.*]] = call i64 @llvm.fptoui.sat.i64.f16(half %a) // COMMONIR: ret i64 [[VCVT]] uint64_t test_vcvth_u64_f16 (float16_t a) { return vcvth_u64_f16(a); diff --git a/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c b/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c index c80d9e9d7f759..f4b7899b8360c 100644 --- a/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c +++ b/clang/test/CodeGen/AArch64/v8.2a-fp16-intrinsics.c @@ -82,42 +82,42 @@ float16_t test_vcvth_f16_u64 (uint64_t a) { } // CHECK-LABEL: test_vcvth_s16_f16 -// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half %a) +// CHECK: [[VCVT:%.*]] = call i16 @llvm.fptosi.sat.i16.f16(half %a) // CHECK: ret i16 [[VCVT]] int16_t test_vcvth_s16_f16 (float16_t a) { return vcvth_s16_f16(a); } // CHECK-LABEL: test_vcvth_s32_f16 -// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f16(half %a) +// CHECK: [[VCVT:%.*]] = call i32 @llvm.fptosi.sat.i32.f16(half %a) // CHECK: ret i32 [[VCVT]] int32_t test_vcvth_s32_f16 (float16_t a) { return vcvth_s32_f16(a); } // CHECK-LABEL: test_vcvth_s64_f16 -// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f16(half %a) +// CHECK: [[VCVT:%.*]] = call i64 @llvm.fptosi.sat.i64.f16(half %a) // CHECK: ret i64 [[VCVT]] int64_t test_vcvth_s64_f16 (float16_t a) { return vcvth_s64_f16(a); } // CHECK-LABEL: test_vcvth_u16_f16 -// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half %a) +// CHECK: [[VCVT:%.*]] = call i16 @llvm.fptoui.sat.i16.f16(half %a) // CHECK: ret i16 [[VCVT]] uint16_t test_vcvth_u16_f16 (float16_t a) { return vcvth_u16_f16(a); } // CHECK-LABEL: test_vcvth_u32_f16 -// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f16(half %a) +// CHECK: [[VCVT:%.*]] = call i32 @llvm.fptoui.sat.i32.f16(half %a) // CHECK: ret i32 [[VCVT]] uint32_t test_vcvth_u32_f16 (float16_t a) { return vcvth_u32_f16(a); } // CHECK-LABEL: test_vcvth_u64_f16 -// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f16(half %a) +// CHECK: [[VCVT:%.*]] = call i64 @llvm.fptoui.sat.i64.f16(half %a) // CHECK: ret i64 [[VCVT]] uint64_t test_vcvth_u64_f16 (float16_t a) { return vcvth_u64_f16(a); diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c index 9c408e8c702fd..b8380bd8ed6d4 100644 --- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c +++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c @@ -227,9 +227,9 @@ float16x8_t test_vcvtq_f16_u16 (uint16x8_t a) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> [[VCVTZ_I]]) -// CHECK-NEXT: ret <4 x i16> [[VCVTZ1_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> [[TMP2]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTZ_I]] // int16x4_t test_vcvt_s16_f16 (float16x4_t a) { return vcvt_s16_f16(a); @@ -240,9 +240,9 @@ int16x4_t test_vcvt_s16_f16 (float16x4_t a) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzs.v8i16.v8f16(<8 x half> [[VCVTZ_I]]) -// CHECK-NEXT: ret <8 x i16> [[VCVTZ1_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> [[TMP2]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTZ_I]] // int16x8_t test_vcvtq_s16_f16 (float16x8_t a) { return vcvtq_s16_f16(a); @@ -253,9 +253,9 @@ int16x8_t test_vcvtq_s16_f16 (float16x8_t a) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> [[VCVTZ_I]]) -// CHECK-NEXT: ret <4 x i16> [[VCVTZ1_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> [[TMP2]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTZ_I]] // uint16x4_t test_vcvt_u16_f16 (float16x4_t a) { return vcvt_u16_f16(a); @@ -266,9 +266,9 @@ uint16x4_t test_vcvt_u16_f16 (float16x4_t a) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> [[VCVTZ_I]]) -// CHECK-NEXT: ret <8 x i16> [[VCVTZ1_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> [[TMP2]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTZ_I]] // uint16x8_t test_vcvtq_u16_f16 (float16x8_t a) { return vcvtq_u16_f16(a); @@ -1973,10 +1973,10 @@ float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) { // CHECK-NEXT: entry: // CHECK-NEXT: [[CONV:%.*]] = fpext half [[B]] to float // CHECK-NEXT: [[FNEG:%.*]] = fneg float [[CONV]] -// CHECK-NEXT: [[TMP0:%.*]] = fptrunc float [[FNEG]] to half +// CHECK-NEXT: [[CONV1:%.*]] = fptrunc float [[FNEG]] to half // CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3 -// CHECK-NEXT: [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]]) -// CHECK-NEXT: ret half [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = call half @llvm.fma.f16(half [[CONV1]], half [[EXTRACT]], half [[A]]) +// CHECK-NEXT: ret half [[TMP0]] // float16_t test_vfmsh_lane_f16(float16_t a, float16_t b, float16x4_t c) { return vfmsh_lane_f16(a, b, c, 3); @@ -1987,10 +1987,10 @@ float16_t test_vfmsh_lane_f16(float16_t a, float16_t b, float16x4_t c) { // CHECK-NEXT: entry: // CHECK-NEXT: [[CONV:%.*]] = fpext half [[B]] to float // CHECK-NEXT: [[FNEG:%.*]] = fneg float [[CONV]] -// CHECK-NEXT: [[TMP0:%.*]] = fptrunc float [[FNEG]] to half +// CHECK-NEXT: [[CONV1:%.*]] = fptrunc float [[FNEG]] to half // CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7 -// CHECK-NEXT: [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]]) -// CHECK-NEXT: ret half [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = call half @llvm.fma.f16(half [[CONV1]], half [[EXTRACT]], half [[A]]) +// CHECK-NEXT: ret half [[TMP0]] // float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) { return vfmsh_laneq_f16(a, b, c, 7); @@ -2094,8 +2094,8 @@ float16x8_t test_vmulq_n_f16(float16x8_t a, float16_t b) { // CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGET_LANE]] to half // CHECK-NEXT: [[CONV3:%.*]] = fpext half [[TMP1]] to float // CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[CONV3]] -// CHECK-NEXT: [[TMP2:%.*]] = fptrunc float [[MUL]] to half -// CHECK-NEXT: ret half [[TMP2]] +// CHECK-NEXT: [[CONV4:%.*]] = fptrunc float [[MUL]] to half +// CHECK-NEXT: ret half [[CONV4]] // float16_t test_vmulh_lane_f16(float16_t a, float16x4_t b) { return vmulh_lane_f16(a, b, 3); @@ -2110,8 +2110,8 @@ float16_t test_vmulh_lane_f16(float16_t a, float16x4_t b) { // CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGETQ_LANE]] to half // CHECK-NEXT: [[CONV3:%.*]] = fpext half [[TMP1]] to float // CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[CONV3]] -// CHECK-NEXT: [[TMP2:%.*]] = fptrunc float [[MUL]] to half -// CHECK-NEXT: ret half [[TMP2]] +// CHECK-NEXT: [[CONV4:%.*]] = fptrunc float [[MUL]] to half +// CHECK-NEXT: ret half [[CONV4]] // float16_t test_vmulh_laneq_f16(float16_t a, float16x8_t b) { return vmulh_laneq_f16(a, b, 7); _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
