Author: David Green Date: 2026-06-16T13:53:31+01:00 New Revision: af27c81cc4bbc8094b0ed82633bc8520898bbb51
URL: https://github.com/llvm/llvm-project/commit/af27c81cc4bbc8094b0ed82633bc8520898bbb51 DIFF: https://github.com/llvm/llvm-project/commit/af27c81cc4bbc8094b0ed82633bc8520898bbb51.diff LOG: [AArch64] Remove aarch64_neon_vcvtfp2hf and aarch64_neon_vcvthf2fp (#203903) This removes aarch64_neon_vcvtfp2hf and aarch64_neon_vcvthf2fp intrinsics, relying on fp16 fpext and fptrunc instructions directly. Arm is left using its version of the intrinsics, as the types in the backend are more difficult without fullfp16. Added: Modified: clang/include/clang/Basic/AArch64CodeGenUtils.h clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp clang/lib/CodeGen/TargetBuiltins/ARM.cpp clang/test/CodeGen/AArch64/neon-misc-constrained.c clang/test/CodeGen/AArch64/neon-misc.c clang/test/CodeGen/arm_neon_intrinsics.c llvm/include/llvm/IR/IntrinsicsAArch64.td llvm/lib/IR/AutoUpgrade.cpp llvm/lib/Target/AArch64/AArch64InstrInfo.td llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp llvm/test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt_f32_su32.ll Removed: ################################################################################ diff --git a/clang/include/clang/Basic/AArch64CodeGenUtils.h b/clang/include/clang/Basic/AArch64CodeGenUtils.h index f64a41df63cf8..3b9145920bd5f 100644 --- a/clang/include/clang/Basic/AArch64CodeGenUtils.h +++ b/clang/include/clang/Basic/AArch64CodeGenUtils.h @@ -158,10 +158,8 @@ const inline ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap [] = { NEONMAP1(vcmlaq_rot90_f64, aarch64_neon_vcmla_rot90, Add1ArgType), NEONMAP1(vcnt_v, ctpop, Add1ArgType), NEONMAP1(vcntq_v, ctpop, Add1ArgType), - NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0), NEONMAP0(vcvt_f16_s16), NEONMAP0(vcvt_f16_u16), - NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0), NEONMAP0(vcvt_f32_v), NEONMAP1(vcvt_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0), NEONMAP1(vcvt_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0), diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index 84b9bb1007763..82f2e3954bb47 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -2692,6 +2692,8 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr, case NEON::BI__builtin_neon_vcvt_bf16_f32: case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: + case NEON::BI__builtin_neon_vcvt_f16_f32: + case NEON::BI__builtin_neon_vcvt_f32_f16: case clang::AArch64::BI_InterlockedAdd: case clang::AArch64::BI_InterlockedAdd_acq: case clang::AArch64::BI_InterlockedAdd_rel: diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index 4c668dabd53dc..6d6f87a9439df 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -595,10 +595,8 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = { NEONMAP1(vclzq_v, ctlz, Add1ArgType), NEONMAP1(vcnt_v, ctpop, Add1ArgType), NEONMAP1(vcntq_v, ctpop, Add1ArgType), - NEONMAP1(vcvt_f16_f32, arm_neon_vcvtfp2hf, 0), NEONMAP0(vcvt_f16_s16), NEONMAP0(vcvt_f16_u16), - NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0), NEONMAP0(vcvt_f32_v), NEONMAP1(vcvt_n_f16_s16, arm_neon_vcvtfxs2fp, 0), NEONMAP1(vcvt_n_f16_u16, arm_neon_vcvtfxu2fp, 0), @@ -2662,10 +2660,15 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops, "vsha1h"); - case NEON::BI__builtin_neon_vcvth_bf16_f32: { + case NEON::BI__builtin_neon_vcvth_bf16_f32: return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vcvtbfp2bf), Ops, "vcvtbfp2bf"); - } + case NEON::BI__builtin_neon_vcvt_f16_f32: + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vcvtfp2hf), Ops, + "vcvtfp2hf"); + case NEON::BI__builtin_neon_vcvt_f32_f16: + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vcvthf2fp), Ops, + "vcvthf2fp"); // The ARM _MoveToCoprocessor builtins put the input register value as // the first argument, but the LLVM intrinsic expects it as the third one. @@ -6050,6 +6053,16 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[1], V4F32), V4BF16); return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask); } + case NEON::BI__builtin_neon_vcvt_f16_f32: { + llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4); + llvm::Type *V4F16 = FixedVectorType::get(Builder.getHalfTy(), 4); + return Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4F16); + } + case NEON::BI__builtin_neon_vcvt_f32_f16: { + llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4); + llvm::Type *V4F16 = FixedVectorType::get(Builder.getHalfTy(), 4); + return Builder.CreateFPExt(Builder.CreateBitCast(Ops[0], V4F16), V4F32); + } case clang::AArch64::BI_InterlockedAdd: case clang::AArch64::BI_InterlockedAdd_acq: diff --git a/clang/test/CodeGen/AArch64/neon-misc-constrained.c b/clang/test/CodeGen/AArch64/neon-misc-constrained.c index 06ecfd91252a1..49208892e3035 100644 --- a/clang/test/CodeGen/AArch64/neon-misc-constrained.c +++ b/clang/test/CodeGen/AArch64/neon-misc-constrained.c @@ -103,3 +103,95 @@ float32x4_t test_vsqrtq_f32(float32x4_t a) { float64x2_t test_vsqrtq_f64(float64x2_t a) { return vsqrtq_f64(a); } + +// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vcvt_f16_f32( +// UNCONSTRAINED-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x half> +// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP3]] +// +// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vcvt_f16_f32( +// CONSTRAINED-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> [[TMP2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <4 x half> [[TMP3]] +// +float16x4_t test_vcvt_f16_f32(float32x4_t a) { + return vcvt_f16_f32(a); +} + +// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vcvt_high_f16_f32( +// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x half> +// UNCONSTRAINED-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// UNCONSTRAINED-NEXT: ret <8 x half> [[SHUFFLE_I]] +// +// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vcvt_high_f16_f32( +// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> [[TMP2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CONSTRAINED-NEXT: ret <8 x half> [[SHUFFLE_I]] +// +float16x8_t test_vcvt_high_f16_f32(float16x4_t a, float32x4_t b) { + return vcvt_high_f16_f32(a, b); +} + +// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vcvt_f32_f16( +// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = fpext <4 x half> [[TMP2]] to <4 x float> +// UNCONSTRAINED-NEXT: ret <4 x float> [[TMP3]] +// +// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vcvt_f32_f16( +// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> [[TMP2]], metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <4 x float> [[TMP3]] +// +float32x4_t test_vcvt_f32_f16(float16x4_t a) { + return vcvt_f32_f16(a); +} + +// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vcvt_high_f32_f16( +// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[SHUFFLE_I]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = fpext <4 x half> [[TMP2]] to <4 x float> +// UNCONSTRAINED-NEXT: ret <4 x float> [[TMP3]] +// +// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vcvt_high_f32_f16( +// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[SHUFFLE_I]] to <4 x i16> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> [[TMP2]], metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <4 x float> [[TMP3]] +// +float32x4_t test_vcvt_high_f32_f16(float16x8_t a) { + return vcvt_high_f32_f16(a); +} diff --git a/clang/test/CodeGen/AArch64/neon-misc.c b/clang/test/CodeGen/AArch64/neon-misc.c index 964b0059662d2..720a1c22933d1 100644 --- a/clang/test/CodeGen/AArch64/neon-misc.c +++ b/clang/test/CodeGen/AArch64/neon-misc.c @@ -2606,11 +2606,8 @@ uint64x2_t test_vshll_high_n_u32(uint32x4_t a) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-NEXT: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) -// CHECK-NEXT: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x i16> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x half> // CHECK-NEXT: ret <4 x half> [[TMP3]] // float16x4_t test_vcvt_f16_f32(float32x4_t a) { @@ -2622,11 +2619,8 @@ float16x4_t test_vcvt_f16_f32(float32x4_t a) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[VCVT_F16_F32_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-NEXT: [[VCVT_F16_F321_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I_I]]) -// CHECK-NEXT: [[VCVT_F16_F322_I_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I_I]] to <4 x i16> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x half> // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> // CHECK-NEXT: ret <8 x half> [[SHUFFLE_I]] // @@ -2693,11 +2687,8 @@ float32x4_t test_vcvtx_high_f32_f64(float32x2_t a, float64x2_t b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-NEXT: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) -// CHECK-NEXT: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x i32> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x half> [[TMP2]] to <4 x float> // CHECK-NEXT: ret <4 x float> [[TMP3]] // float32x4_t test_vcvt_f32_f16(float16x4_t a) { @@ -2710,11 +2701,8 @@ float32x4_t test_vcvt_f32_f16(float16x4_t a) { // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[SHUFFLE_I]] to <4 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-NEXT: [[VCVT_F32_F16_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VCVT_F32_F161_I_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I_I]]) -// CHECK-NEXT: [[VCVT_F32_F162_I_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I_I]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I_I]] to <4 x i32> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x half> [[TMP2]] to <4 x float> // CHECK-NEXT: ret <4 x float> [[TMP3]] // float32x4_t test_vcvt_high_f32_f16(float16x8_t a) { diff --git a/clang/test/CodeGen/arm_neon_intrinsics.c b/clang/test/CodeGen/arm_neon_intrinsics.c index eb9fe126ff2a0..a471ccf52e173 100644 --- a/clang/test/CodeGen/arm_neon_intrinsics.c +++ b/clang/test/CodeGen/arm_neon_intrinsics.c @@ -3330,12 +3330,10 @@ int64x1_t test_vcreate_s64(uint64_t a) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-NEXT: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) -// CHECK-NEXT: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x i16> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half> -// CHECK-NEXT: ret <4 x half> [[TMP3]] +// CHECK-NEXT: [[VCVTFP2HF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTFP2HF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> [[VCVTFP2HF_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2HF1_I]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP2]] // float16x4_t test_vcvt_f16_f32(float32x4_t a) { return vcvt_f16_f32(a); @@ -3394,12 +3392,9 @@ float32x4_t test_vcvtq_f32_u32(uint32x4_t a) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-NEXT: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) -// CHECK-NEXT: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x i32> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> -// CHECK-NEXT: ret <4 x float> [[TMP3]] +// CHECK-NEXT: [[VCVTHF2FP_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VCVTHF2FP1_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVTHF2FP_I]]) +// CHECK-NEXT: ret <4 x float> [[VCVTHF2FP1_I]] // float32x4_t test_vcvt_f32_f16(float16x4_t a) { return vcvt_f32_f16(a); diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index b1a2b348bc04b..2923595486712 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -465,12 +465,6 @@ let TargetPrefix = "aarch64" in { def int_aarch64_neon_ursqrte : AdvSIMD_1VectorArg_Intrinsic; def int_aarch64_neon_frsqrte : AdvSIMD_1FloatArg_Intrinsic; - // Vector Conversions Between Half-Precision and Single-Precision. - def int_aarch64_neon_vcvtfp2hf - : DefaultAttrsIntrinsic<[llvm_v4i16_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_aarch64_neon_vcvthf2fp - : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4i16_ty], [IntrNoMem]>; - // Vector Conversions Between Floating-point and Fixed-point. def int_aarch64_neon_vcvtfp2fxs : AdvSIMD_CvtFPToFx_Intrinsic; def int_aarch64_neon_vcvtfp2fxu : AdvSIMD_CvtFPToFx_Intrinsic; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index f27713df7f146..814e985ebf7be 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1000,6 +1000,12 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, return true; } + // vcvtfp2hf and vcvthf2fp -> fpext and fptrunc + if (Name == "vcvtfp2hf" || Name == "vcvthf2fp") { + NewFn = nullptr; + return true; + } + return false; // No other 'aarch64.neon.*'. } if (Name.consume_front("sve.")) { @@ -4670,6 +4676,19 @@ static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI, CI->getName()); } + if (Name == "neon.vcvtfp2hf") + return Builder.CreateBitCast( + Builder.CreateFPTrunc( + CI->getOperand(0), + FixedVectorType::get(Type::getHalfTy(F->getContext()), 4)), + FixedVectorType::get(Type::getInt16Ty(F->getContext()), 4)); + if (Name == "neon.vcvthf2fp") + return Builder.CreateFPExt( + Builder.CreateBitCast( + CI->getOperand(0), + FixedVectorType::get(Type::getHalfTy(F->getContext()), 4)), + FixedVectorType::get(Type::getFloatTy(F->getContext()), 4)); + llvm_unreachable("Unhandled Intrinsic!"); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 3f3eec2a34f5a..96c77c2f75196 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -5828,11 +5828,6 @@ defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_aarch64_neon_fcvtas>; defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_aarch64_neon_fcvtau>; defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">; -def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))), - (FCVTLv4i16 V64:$Rn)>; -def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn), - (i64 4)))), - (FCVTLv8i16 V128:$Rn)>; def : Pat<(v2f64 (any_fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; def : Pat<(v2f64 (any_fpextend (v2f32 (extract_high_v4f32 (v4f32 V128:$Rn))))), @@ -5846,11 +5841,6 @@ defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>; defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu>; defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">; -def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))), - (FCVTNv4i16 V128:$Rn)>; -def : Pat<(concat_vectors V64:$Rd, - (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), - (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; def : Pat<(v2f32 (any_fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; def : Pat<(v4f16 (any_fpround (v4f32 V128:$Rn))), diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index c45ec68f3cd07..383f503a2e87f 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -7186,9 +7186,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::aarch64_neon_fcvtzu: // Floating-point convert to lower precision narrow, rounding to odd case Intrinsic::aarch64_neon_fcvtxn: - // Vector Conversions Between Half-Precision and Single-Precision - case Intrinsic::aarch64_neon_vcvthf2fp: - case Intrinsic::aarch64_neon_vcvtfp2hf: handleGenericVectorConvertIntrinsic(I, /*FixedPoint=*/false); break; diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll index 310dc711fdc26..286390a8d80a0 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll @@ -33,6 +33,8 @@ define <4 x float> @scvtq(<4 x i32> %a) nounwind readnone ssp { ret <4 x float> %vcvt.i } +; Test that vcvtfp2hf and vcvthf2fp are AutoUpgraded to fp16 fptrunc + fpext. + define <4 x float> @cvtf16(<4 x i16> %a) nounwind readnone ssp { ; CHECK-LABEL: cvtf16: ; CHECK: fcvtl v0.4s, v0.4h @@ -50,8 +52,6 @@ define <4 x float> @cvtf16_high(<8 x i16> %a) nounwind readnone ssp { ret <4 x float> %vcvt1.i } - - define <4 x i16> @cvtf16f32(<4 x float> %a) nounwind readnone ssp { ; CHECK-LABEL: cvtf16f32: ; CHECK: fcvtn v0.4h, v0.4s @@ -68,6 +68,3 @@ define <8 x i16> @cvtf16f32_high(<4 x i16> %low, <4 x float> %high_big) { %res = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ret <8 x i16> %res } - -declare <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16>) nounwind readnone -declare <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float>) nounwind readnone diff --git a/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll b/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll index c7e22b2f4301d..cef94f820d317 100644 --- a/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll +++ b/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll @@ -10,9 +10,8 @@ define <8 x half> @test1(<4 x float> noundef %a) { ; CHECK-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NEXT: ret entry: - %vcvt_f16_f321.i = tail call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %a) - %0 = bitcast <4 x i16> %vcvt_f16_f321.i to <4 x half> - %shuffle.i = shufflevector <4 x half> %0, <4 x half> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %b = fptrunc <4 x float> %a to <4 x half> + %shuffle.i = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ret <8 x half> %shuffle.i } diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt_f32_su32.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt_f32_su32.ll index 802c3aa4784ee..cab8139c70042 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt_f32_su32.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt_f32_su32.ll @@ -75,9 +75,9 @@ define <4 x float> @cvtf16(<4 x i16> %a) nounwind readnone ssp sanitize_memory { ; CHECK-SAME: <4 x i16> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i16> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> -; CHECK-NEXT: [[VCVT1_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[A]]) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <4 x half> +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[VCVT1_I:%.*]] = fpext <4 x half> [[TMP2]] to <4 x float> ; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[VCVT1_I]] ; @@ -92,9 +92,9 @@ define <4 x float> @cvtf16_high(<8 x i16> %a) nounwind readnone ssp sanitize_mem ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; CHECK-NEXT: [[IN:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i16> [[_MSPROP]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> -; CHECK-NEXT: [[VCVT1_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[IN]]) #[[ATTR3]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[IN]] to <4 x half> +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[_MSPROP]] to <4 x i32> +; CHECK-NEXT: [[VCVT1_I:%.*]] = fpext <4 x half> [[TMP2]] to <4 x float> ; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[VCVT1_I]] ; @@ -110,9 +110,9 @@ define <4 x i16> @cvtf16f32(<4 x float> %a) nounwind readnone ssp sanitize_memo ; CHECK-SAME: <4 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> -; CHECK-NEXT: [[VCVT1_I:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[A]]) #[[ATTR3]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = fptrunc <4 x float> [[A]] to <4 x half> +; CHECK-NEXT: [[VCVT1_I:%.*]] = bitcast <4 x half> [[TMP4]] to <4 x i16> ; CHECK-NEXT: store <4 x i16> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[VCVT1_I]] ; @@ -126,9 +126,9 @@ define <8 x i16> @cvtf16f32_high(<4 x i16> %low, <4 x float> %high_big) sanitiz ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[HIGH:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[HIGH_BIG]]) +; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = fptrunc <4 x float> [[HIGH_BIG]] to <4 x half> +; CHECK-NEXT: [[HIGH:%.*]] = bitcast <4 x half> [[TMP5]] to <4 x i16> ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x i16> [[LOW]], <4 x i16> [[HIGH]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
