[PATCH] D49941: [ARM] Add ARMv8.2-A FP16 scalar intrinsic
az created this revision. az added a reviewer: SjoerdMeijer. Herald added a reviewer: javed.absar. Herald added subscribers: cfe-commits, chrib, kristof.beyls. This patch adds the fp16 scalar intrinsic for ARM as described in the ARM ACLE document. Only the frontend work is done here and some work is still needed in the backend codegen. Repository: rC Clang https://reviews.llvm.org/D49941 Files: clang/include/clang/Basic/arm_fp16.td clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/arm-v8.2a-fp16-intrinsics.c llvm/include/llvm/IR/IntrinsicsARM.td Index: llvm/include/llvm/IR/IntrinsicsARM.td === --- llvm/include/llvm/IR/IntrinsicsARM.td +++ llvm/include/llvm/IR/IntrinsicsARM.td @@ -355,6 +355,9 @@ class Neon_2Arg_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; +class Float_2Arg_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; class Neon_2Arg_Narrow_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMExtendedType<0>, LLVMExtendedType<0>], [IntrNoMem]>; @@ -377,8 +380,8 @@ : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; class Neon_CvtFPToFx_Intrinsic : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>; -class Neon_CvtFPtoInt_1Arg_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; +class CvtFPtoInt_1Arg_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>; class Neon_Compare_Intrinsic : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>], @@ -431,12 +434,12 @@ // Vector Maximum. def int_arm_neon_vmaxs : Neon_2Arg_Intrinsic; def int_arm_neon_vmaxu : Neon_2Arg_Intrinsic; - def int_arm_neon_vmaxnm : Neon_2Arg_Intrinsic; + def int_arm_neon_vmaxnm : Float_2Arg_Intrinsic; // Vector Minimum. 
def int_arm_neon_vmins : Neon_2Arg_Intrinsic; def int_arm_neon_vminu : Neon_2Arg_Intrinsic; - def int_arm_neon_vminnm : Neon_2Arg_Intrinsic; + def int_arm_neon_vminnm : Float_2Arg_Intrinsic; // Vector Reciprocal Step. def int_arm_neon_vrecps : Neon_2Arg_Intrinsic; @@ -552,15 +555,15 @@ // Vector Reciprocal Square Root Estimate. def int_arm_neon_vrsqrte : Neon_1Arg_Intrinsic; -// Vector Conversions Between Floating-point and Integer -def int_arm_neon_vcvtau : Neon_CvtFPtoInt_1Arg_Intrinsic; -def int_arm_neon_vcvtas : Neon_CvtFPtoInt_1Arg_Intrinsic; -def int_arm_neon_vcvtnu : Neon_CvtFPtoInt_1Arg_Intrinsic; -def int_arm_neon_vcvtns : Neon_CvtFPtoInt_1Arg_Intrinsic; -def int_arm_neon_vcvtpu : Neon_CvtFPtoInt_1Arg_Intrinsic; -def int_arm_neon_vcvtps : Neon_CvtFPtoInt_1Arg_Intrinsic; -def int_arm_neon_vcvtmu : Neon_CvtFPtoInt_1Arg_Intrinsic; -def int_arm_neon_vcvtms : Neon_CvtFPtoInt_1Arg_Intrinsic; +// Conversions Between Floating-point and Integer +def int_arm_neon_vcvtau : CvtFPtoInt_1Arg_Intrinsic; +def int_arm_neon_vcvtas : CvtFPtoInt_1Arg_Intrinsic; +def int_arm_neon_vcvtnu : CvtFPtoInt_1Arg_Intrinsic; +def int_arm_neon_vcvtns : CvtFPtoInt_1Arg_Intrinsic; +def int_arm_neon_vcvtpu : CvtFPtoInt_1Arg_Intrinsic; +def int_arm_neon_vcvtps : CvtFPtoInt_1Arg_Intrinsic; +def int_arm_neon_vcvtmu : CvtFPtoInt_1Arg_Intrinsic; +def int_arm_neon_vcvtms : CvtFPtoInt_1Arg_Intrinsic; // Vector Conversions Between Floating-point and Fixed-point. 
def int_arm_neon_vcvtfp2fxs : Neon_CvtFPToFx_Intrinsic; Index: clang/test/CodeGen/arm-v8.2a-fp16-intrinsics.c === --- /dev/null +++ clang/test/CodeGen/arm-v8.2a-fp16-intrinsics.c @@ -0,0 +1,225 @@ +// RUN: %clang_cc1 -triple armv8.2a-linux-gnu -target-abi apcs-gnu -target-feature +neon -target-feature +fullfp16 \ +// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg \ +// RUN: | FileCheck %s + +// REQUIRES: arm-registered-target + +#include + +// CHECK-LABEL: test_vabsh_f16 +// CHECK: [[ABS:%.*]] = call half @llvm.fabs.f16(half %{{.*}}) +float16_t test_vabsh_f16(float16_t a) { + return vabsh_f16(a); +} + +// CHECK-LABEL: test_vcvth_f16_s32 +// CHECK: [[VCVT:%.*]] = sitofp i32 %a to half +float16_t test_vcvth_f16_s32 (int32_t a) { + return vcvth_f16_s32(a); +} + +// CHECK-LABEL: test_vcvth_f16_u32 +// CHECK: [[VCVT:%.*]] = uitofp i32 %a to half +float16_t test_vcvth_f16_u32 (uint32_t a) { + return vcvth_f16_u32(a); +} + +// CHECK-LABEL: test_vcvth_s32_f16 +// CHECK: [[VCVT:%.*]] = fptosi half %{{.*}} to i32 +// CHECK: ret i32 [[VCVT]] +int32_t test_vcvth_s32_f16 (float16_t a) { + return vcvth_s32_f16(a); +} + +// CHECK-LABEL: test_vcvth_u32_f16 +// CHECK: [[VCVT:%.*]] = fptoui half %{{.*}} to i32 +// CHECK: ret i32 [[VCVT]] +uint32_t test_vcvth_u32_f16 (float16_t a) { + return vcvth_u32_f16(a); +} + +// CHECK-LABEL: test_vcvtah_s32_f16 +// CHECK: [[VCVT:%.*]] = call i32 @llvm.
[PATCH] D44591: [AArch64] Add vmulxh_lane FP16 vector intrinsic
az updated this revision to Diff 139015. az added a comment. add LLVM codegen tests as suggested in the reviews. https://reviews.llvm.org/D44591 Files: clang/include/clang/Basic/arm_neon.td clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll Index: llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll === --- llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll +++ llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.2a,+fullfp16 | FileCheck %s +declare half @llvm.aarch64.neon.fmulx.f16(half, half) declare <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half>, <4 x half>) declare <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half>, <8 x half>) declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>) @@ -236,6 +237,25 @@ ret half %1 } +define dso_local half @t_vmulx_f16(half %a, half %b) { +; CHECK-LABEL: t_vmulx_f16: +; CHECK: fmulx h0, h0, h1 +; CHECK-NEXT:ret +entry: + %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %b) + ret half %fmulx.i +} + +define dso_local half @t_vmulxh_lane_f16(half %a, <4 x half> %b, i32 %lane) { +; CHECK-LABEL: t_vmulxh_lane_f16: +; CHECK: fmulx h0, h0, v1.h[3] +; CHECK-NEXT:ret +entry: + %extract = extractelement <4 x half> %b, i32 3 + %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract) + ret half %fmulx.i +} + define dso_local <4 x half> @t_vmulx_lane_f16(<4 x half> %a, <4 x half> %b, i32 %lane) { ; CHECK-LABEL: t_vmulx_lane_f16: ; CHECK: fmulx v0.4h, v0.4h, v1.h[0] @@ -276,6 +296,16 @@ ret <8 x half> %vmulx2.i } +define dso_local half @t_vmulxh_laneq_f16(half %a, <8 x half> %b, i32 %lane) { +; CHECK-LABEL: t_vmulxh_laneq_f16: +; CHECK: fmulx h0, h0, v1.h[7] +; CHECK-NEXT:ret +entry: + %extract = extractelement <8 x half> %b, i32 7 + %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract) + ret half %fmulx.i +} + define 
dso_local <4 x half> @t_vmulx_n_f16(<4 x half> %a, half %c) { ; CHECK-LABEL: t_vmulx_n_f16: ; CHECK: dup v1.4h, v1.h[0] Index: clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c === --- clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c +++ clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c @@ -1223,27 +1223,25 @@ return vmulxq_n_f16(a, b); } -/* TODO: Not implemented yet (needs scalar intrinsic from arm_fp16.h) -// CCHECK-LABEL: test_vmulxh_lane_f16 -// CCHECK: [[CONV0:%.*]] = fpext half %a to float -// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float -// CCHECK: [[MUL:%.*]] = fmul float [[CONV0:%.*]], [[CONV0:%.*]] -// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half -// CCHECK: ret half [[CONV3:%.*]] +// CHECK-LABEL: test_vmulxh_lane_f16 +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[EXTR:%.*]] = extractelement <4 x half> [[TMP1]], i32 3 +// CHECK: [[MULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half [[EXTR]] +// CHECK: ret half [[MULX]] float16_t test_vmulxh_lane_f16(float16_t a, float16x4_t b) { return vmulxh_lane_f16(a, b, 3); } -// CCHECK-LABEL: test_vmulxh_laneq_f16 -// CCHECK: [[CONV0:%.*]] = fpext half %a to float -// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float -// CCHECK: [[MUL:%.*]] = fmul float [[CONV0:%.*]], [[CONV0:%.*]] -// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half -// CCHECK: ret half [[CONV3:%.*]] +// CHECK-LABEL: test_vmulxh_laneq_f16 +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK: [[EXTR:%.*]] = extractelement <8 x half> [[TMP1]], i32 7 +// CHECK: [[MULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half [[EXTR]]) +// CHECK: ret half [[MULX]] float16_t test_vmulxh_laneq_f16(float16_t a, float16x8_t b) { return vmulxh_laneq_f16(a, b, 7); } -*/ // CHECK-LABEL: test_vmaxv_f16 // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a 
to <8 x i8> Index: clang/lib/CodeGen/CGBuiltin.cpp === --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -7238,6 +7238,16 @@ Int = Intrinsic::aarch64_neon_fmulx; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx"); } + case NEON::BI__builtin_neon_vmulxh_lane_f16: + case NEON::BI__builtin_neon_vmulxh_laneq_f16: { +// vmulx_lane should be mapped to Neon scalar mulx after +// extracting the scalar element +Ops.push_back(EmitScalarExpr(E->getArg(2))); +Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract"); +Ops.pop_back(); +Int = Intrinsic::aarch64_neon_fmulx; +return EmitNeonCall(CGM.getIntrins
[PATCH] D44222: [AArch64] Add vmulxh_lane FP16 intrinsics
az added a comment. I was not able to update this particular review with the new code, so I created a new one in https://reviews.llvm.org/D44591. I managed to reuse the mulx scalar intrinsic work: not exactly calling the fp16 scalar intrinsic itself, which is not available here, but the same frontend codegen work with an extract instruction before that. https://reviews.llvm.org/D44222 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D44591: [AArch64] Add vmulxh_lane FP16 vector intrinsic
az created this revision. az added a reviewer: SjoerdMeijer. Herald added subscribers: kristof.beyls, javed.absar, rengolin. Add the two missing vmulxh_lane vector intrinsics that were originally left out. https://reviews.llvm.org/D44591 Files: clang/include/clang/Basic/arm_neon.td clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c Index: clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c === --- clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c +++ clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c @@ -1223,27 +1223,25 @@ return vmulxq_n_f16(a, b); } -/* TODO: Not implemented yet (needs scalar intrinsic from arm_fp16.h) -// CCHECK-LABEL: test_vmulxh_lane_f16 -// CCHECK: [[CONV0:%.*]] = fpext half %a to float -// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float -// CCHECK: [[MUL:%.*]] = fmul float [[CONV0:%.*]], [[CONV0:%.*]] -// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half -// CCHECK: ret half [[CONV3:%.*]] +// CHECK-LABEL: test_vmulxh_lane_f16 +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %b to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[EXTR:%.*]] = extractelement <4 x half> [[TMP1]], i32 3 +// CHECK: [[MULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half [[EXTR]] +// CHECK: ret half [[MULX]] float16_t test_vmulxh_lane_f16(float16_t a, float16x4_t b) { return vmulxh_lane_f16(a, b, 3); } -// CCHECK-LABEL: test_vmulxh_laneq_f16 -// CCHECK: [[CONV0:%.*]] = fpext half %a to float -// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float -// CCHECK: [[MUL:%.*]] = fmul float [[CONV0:%.*]], [[CONV0:%.*]] -// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half -// CCHECK: ret half [[CONV3:%.*]] +// CHECK-LABEL: test_vmulxh_laneq_f16 +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %b to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK: [[EXTR:%.*]] = extractelement <8 x half> [[TMP1]], i32 7 +// CHECK: [[MULX:%.*]] = call half 
@llvm.aarch64.neon.fmulx.f16(half %a, half [[EXTR]]) +// CHECK: ret half [[MULX]] float16_t test_vmulxh_laneq_f16(float16_t a, float16x8_t b) { return vmulxh_laneq_f16(a, b, 7); } -*/ // CHECK-LABEL: test_vmaxv_f16 // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> Index: clang/lib/CodeGen/CGBuiltin.cpp === --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -7248,6 +7248,16 @@ Int = Intrinsic::aarch64_neon_fmulx; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx"); } + case NEON::BI__builtin_neon_vmulxh_lane_f16: + case NEON::BI__builtin_neon_vmulxh_laneq_f16: { +// vmulx_lane should be mapped to Neon scalar mulx after +// extracting the scalar element +Ops.push_back(EmitScalarExpr(E->getArg(2))); +Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract"); +Ops.pop_back(); +Int = Intrinsic::aarch64_neon_fmulx; +return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmulx"); + } case NEON::BI__builtin_neon_vmul_lane_v: case NEON::BI__builtin_neon_vmul_laneq_v: { // v1f64 vmul_lane should be mapped to Neon scalar mul lane Index: clang/include/clang/Basic/arm_neon.td === --- clang/include/clang/Basic/arm_neon.td +++ clang/include/clang/Basic/arm_neon.td @@ -1547,11 +1547,9 @@ def VMULX_LANEH : IOpInst<"vmulx_lane", "ddgi", "hQh", OP_MULX_LN>; def VMULX_LANEQH : IOpInst<"vmulx_laneq", "ddji", "hQh", OP_MULX_LN>; def VMULX_NH : IOpInst<"vmulx_n", "dds", "hQh", OP_MULX_N>; - // TODO: Scalar floating point multiply extended (scalar, by element) - // Below ones are commented out because they need vmulx_f16(float16_t, float16_t) - // which will be implemented later with fp16 scalar intrinsic (arm_fp16.h) - //def SCALAR_FMULX_LANEH : IOpInst<"vmulx_lane", "ssdi", "Sh", OP_SCALAR_MUL_LN>; - //def SCALAR_FMULX_LANEQH : IOpInst<"vmulx_laneq", "ssji", "Sh", OP_SCALAR_MUL_LN>; + // Scalar floating point mulx (scalar, by element) + def SCALAR_FMULX_LANEH : IInst<"vmulx_lane", "ssdi", "Sh">; + def SCALAR_FMULX_LANEQH : 
IInst<"vmulx_laneq", "ssji", "Sh">; // ARMv8.2-A FP16 reduction vector intrinsics. def VMAXVH : SInst<"vmaxv", "sd", "hQh">; Index: clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c === --- clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c +++ clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c @@ -1223,27 +1223,25 @@ return vmulxq_n_f16(a, b); } -/* TODO: Not implemented yet (needs scalar intrinsic from arm_fp16.h) -// CCHECK-LABEL: test_vmulxh_lane_f16 -// CCHECK: [[CONV0:%.*]] = fpext half %a to float -// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float -// CCHECK: [[MUL:%.*]] = fmul float [[CONV0:%.*]], [[CONV0:%.*
[PATCH] D44222: [AArch64] Add vmulxh_lane FP16 intrinsics
az added inline comments. Comment at: include/clang/Basic/arm_neon.td:1504 + // Scalar floating point multiply extended (scalar, by element) + def SCALAR_FMULX_LANEH : IOpInst<"vmulx_lane", "ssdi", "Sh", OP_SCALAR_MUL_LN>; + def SCALAR_FMULX_LANEQH : IOpInst<"vmulx_laneq", "ssji", "Sh", OP_SCALAR_MUL_LN>; SjoerdMeijer wrote: > I found that unfortunately it's not that straightforward. This leads to wrong > code generation as it is generating a fmul instead of fmulx. I am suspecting > this instruction description should be using OP_SCALAR_MULX_LN, but also the > type decls are wrong. Need to dig a bit further here. Sorry for the confusion as the commented code was never intended to be used and it is a copy of the code for the intrinsic vmulh_lane(). It was done that way in order to point out that vmulh_lane() and vmulxh_lane() intrinsics should be implemented in a similar way. The only useful thing in the commented code is the explanation that we need the scalar intrinsic vmulxh_f16() which was implemented in the scalar intrinsic patch later on. If we look at how vmulh_lane (a, b, lane) is implemented: x = extract (b, lane); res = a * x; return res; Similarly, I thought at the time that vmulxh_lane (a, b, lane) can be implemented: x = extract (b, lane); res = vmulxh_f16 (a, x); // no llvm native mulx instruction, so we use the fp16 scalar intrinsic. return res; I am not sure now that we can easily use a scalar intrinsic while generating the arm_neon.h file. In case we cannot do that, I am thinking that the frontend should generate a new builtin for intrinsic vmulxh_lane() that the backend recognizes and generates the right code for it which is fmulx h0, h0, v1.h[lane]. If you made or will be making progress on this, then that is great. Otherwise, I can look at a frontend solution for it. https://reviews.llvm.org/D44222 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D43650: [ARM] Add ARMv8.2-A FP16 vector intrinsics
az created this revision. az added a reviewer: SjoerdMeijer. Herald added subscribers: kristof.beyls, javed.absar. This patch adds the fp16 neon vector intrinsic for ARM as described in the ARM ACLE document. While this patch may seem large at first but it is essentially a modification/addition on top of some old work by doing this: - Port AArch64 patch https://reviews.llvm.org/D32511 to ARM. - Enable the frontend fp16 data type for ARM (which is a revert of patch https://reviews.llvm.org/D41360). https://reviews.llvm.org/D43650 Files: clang/include/clang/Basic/arm_neon.td clang/lib/Basic/Targets/ARM.cpp clang/lib/Basic/Targets/ARM.h clang/lib/CodeGen/CGBuiltin.cpp clang/lib/CodeGen/CodeGenFunction.h clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c clang/test/CodeGen/arm_neon_intrinsics.c Index: clang/test/CodeGen/arm_neon_intrinsics.c === --- clang/test/CodeGen/arm_neon_intrinsics.c +++ clang/test/CodeGen/arm_neon_intrinsics.c @@ -3896,9 +3896,8 @@ // CHECK-LABEL: @test_vld1q_f16( // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* -// CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2) -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VLD1]] to <8 x half> -// CHECK: ret <8 x half> [[TMP1]] +// CHECK: [[VLD1:%.*]] = call <8 x half> @llvm.arm.neon.vld1.v8f16.p0i8(i8* [[TMP0]], i32 2) +// CHECK: ret <8 x half> [[VLD1]] float16x8_t test_vld1q_f16(float16_t const * a) { return vld1q_f16(a); } @@ -3990,9 +3989,8 @@ // CHECK-LABEL: @test_vld1_f16( // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* -// CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2) -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VLD1]] to <4 x half> -// CHECK: ret <4 x half> [[TMP1]] +// CHECK: [[VLD1:%.*]] = call <4 x half> @llvm.arm.neon.vld1.v4f16.p0i8(i8* [[TMP0]], i32 2) +// CHECK: ret <4 x half> [[VLD1]] float16x4_t test_vld1_f16(float16_t const * a) { return vld1_f16(a); } @@ -4106,12 +4104,11 @@ // CHECK-LABEL: @test_vld1q_dup_f16( 
// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* -// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half> -// CHECK: ret <8 x half> [[TMP4]] +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half* +// CHECK: [[TMP2:%.*]] = load half, half* [[TMP1]], align 2 +// CHECK: [[TMP3:%.*]] = insertelement <8 x half> undef, half [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer +// CHECK: ret <8 x half> [[LANE]] float16x8_t test_vld1q_dup_f16(float16_t const * a) { return vld1q_dup_f16(a); } @@ -4233,12 +4230,11 @@ // CHECK-LABEL: @test_vld1_dup_f16( // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* -// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2 -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half> -// CHECK: ret <4 x half> [[TMP4]] +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half* +// CHECK: [[TMP2:%.*]] = load half, half* [[TMP1]], align 2 +// CHECK: [[TMP3:%.*]] = insertelement <4 x half> undef, half [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: ret <4 x half> [[LANE]] float16x4_t test_vld1_dup_f16(float16_t const * a) { return vld1_dup_f16(a); } @@ -4365,12 +4361,11 @@ // CHECK-LABEL: @test_vld1q_lane_f16( // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast 
<16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* -// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half> -// CHECK: ret <8 x half> [[TMP5]] +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half* +// CHECK: [[TMP4:%.*]] = load half, half* [[TMP3]], align 2 +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7 +// CHECK: ret <8 x half> [[VLD1_LANE]] float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) { return vld1q_lane_f16(a, b, 7); } @@ -4498,12 +44
[PATCH] D42993: [AArch64] Fixes for ARMv8.2-A FP16 scalar intrinsic
az closed this revision. az added a comment. Committed as r324940 and r324912. https://reviews.llvm.org/D42993 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D42993: [AArch64] Fixes for ARMv8.2-A FP16 scalar intrinsic
az updated this revision to Diff 133505. az added a comment. > Question about the failures: I am now wondering if this means we were and > still are missing tests? Given that this work is fixing https://reviews.llvm.org/D41792 which is mainly about adding frontend intrinsic support, then there is a test for each intrinsic and I am updating the tests for the intrinsics with code generation changes (they only test the frontend). As we previously discussed it, there is some work to be done in instruction selection in order to generate good IR for some intrinisics and the ones with problems need new tests. In this patch, I am fixing few simple ones that fall into that category (see AArch64InstrInfo.td) and I agree that I should have separated this patch into clang and llvm patches instead of just one. I am adding tests for those in this revision. https://reviews.llvm.org/D42993 Files: clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c llvm/lib/Target/AArch64/AArch64InstrInfo.td llvm/test/CodeGen/AArch64/f16-instructions.ll Index: llvm/test/CodeGen/AArch64/f16-instructions.ll === --- llvm/test/CodeGen/AArch64/f16-instructions.ll +++ llvm/test/CodeGen/AArch64/f16-instructions.ll @@ -736,6 +736,9 @@ declare half @llvm.nearbyint.f16(half %a) #0 declare half @llvm.round.f16(half %a) #0 declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0 +declare half @llvm.aarch64.neon.frecpe.f16(half %a) #0 +declare half @llvm.aarch64.neon.frecpx.f16(half %a) #0 +declare half @llvm.aarch64.neon.frsqrte.f16(half %a) #0 ; CHECK-CVT-LABEL: test_sqrt: ; CHECK-CVT-NEXT: fcvt s0, h0 @@ -1124,4 +1127,31 @@ ret half %r } +; CHECK-FP16-LABEL: test_vrecpeh_f16: +; CHECK-FP16-NEXT: frecpe h0, h0 +; CHECK-FP16-NEXT: ret + +define half @test_vrecpeh_f16(half %a) #0 { + %r = call half @llvm.aarch64.neon.frecpe.f16(half %a) + ret half %r +} + +; CHECK-FP16-LABEL: test_vrecpxh_f16: +; CHECK-FP16-NEXT: frecpx h0, h0 +; CHECK-FP16-NEXT: ret + +define half 
@test_vrecpxh_f16(half %a) #0 { + %r = call half @llvm.aarch64.neon.frecpx.f16(half %a) + ret half %r +} + +; CHECK-FP16-LABEL: test_vrsqrteh_f16: +; CHECK-FP16-NEXT: frsqrte h0, h0 +; CHECK-FP16-NEXT: ret + +define half @test_vrsqrteh_f16(half %a) #0 { + %r = call half @llvm.aarch64.neon.frsqrte.f16(half %a) + ret half %r +} + attributes #0 = { nounwind } Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td === --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -3529,6 +3529,8 @@ def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))), (FCVTPUv1i64 FPR64:$Rn)>; +def : Pat<(f16 (int_aarch64_neon_frecpe (f16 FPR16:$Rn))), + (FRECPEv1f16 FPR16:$Rn)>; def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))), (FRECPEv1i32 FPR32:$Rn)>; def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))), @@ -3560,11 +3562,15 @@ def : Pat<(v2f64 (AArch64frecps (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))), (FRECPSv2f64 FPR128:$Rn, FPR128:$Rm)>; +def : Pat<(f16 (int_aarch64_neon_frecpx (f16 FPR16:$Rn))), + (FRECPXv1f16 FPR16:$Rn)>; def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))), (FRECPXv1i32 FPR32:$Rn)>; def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))), (FRECPXv1i64 FPR64:$Rn)>; +def : Pat<(f16 (int_aarch64_neon_frsqrte (f16 FPR16:$Rn))), + (FRSQRTEv1f16 FPR16:$Rn)>; def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))), (FRSQRTEv1i32 FPR32:$Rn)>; def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))), Index: clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c === --- clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c +++ clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c @@ -8,7 +8,7 @@ #include // CHECK-LABEL: test_vabsh_f16 -// CHECK: [[ABS:%.*]] = call half @llvm.aarch64.neon.abs.f16(half %a) +// CHECK: [[ABS:%.*]] = call half @llvm.fabs.f16(half %a) // CHECK: ret half [[ABS]] float16_t test_vabsh_f16(float16_t a) { return vabsh_f16(a); @@ -139,8 +139,9 @@ } // CHECK-LABEL: test_vcvtah_s16_f16 -// CHECK: 
[[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtas.i16.f16(half %a) -// CHECK: ret i16 [[VCVT]] +// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f16(half %a) +// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16 +// CHECK: ret i16 [[RET]] int16_t test_vcvtah_s16_f16 (float16_t a) { return vcvtah_s16_f16(a); } @@ -160,8 +161,9 @@ } // CHECK-LABEL: test_vcvtah_u16_f16 -// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtau.i16.f16(half %a) -// CHECK: ret i16 [[VCVT]] +// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f16(half %a) +// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16 +// CHECK: ret i16 [[RET
[PATCH] D42993: [AArch64] Fixes for ARMv8.2-A FP16 scalar intrinsic
az created this revision. az added a reviewer: SjoerdMeijer. Herald added subscribers: hiraditya, kristof.beyls, javed.absar, rengolin, aemerson. A couple of fixes on top of https://reviews.llvm.org/D41792: - Fixes for freceprical, and fsqrt instructions in the backend. - The intrinsics that generate builtin calls with i16 data types fails in instruction selection. In a preparation for future fixes in the backend for these, the code generated by the frontend is modified. For example, a builtin that returns i16 is implemented by returning i32 that is truncated to i16. This is done similar to other non-fp16 intrinsics that produces i16 data types. - Fix frontend code generated for abs. https://reviews.llvm.org/D42993 Files: clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c llvm/lib/Target/AArch64/AArch64InstrInfo.td Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td === --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -3529,6 +3529,8 @@ def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))), (FCVTPUv1i64 FPR64:$Rn)>; +def : Pat<(f16 (int_aarch64_neon_frecpe (f16 FPR16:$Rn))), + (FRECPEv1f16 FPR16:$Rn)>; def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))), (FRECPEv1i32 FPR32:$Rn)>; def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))), @@ -3560,11 +3562,15 @@ def : Pat<(v2f64 (AArch64frecps (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))), (FRECPSv2f64 FPR128:$Rn, FPR128:$Rm)>; +def : Pat<(f16 (int_aarch64_neon_frecpx (f16 FPR16:$Rn))), + (FRECPXv1f16 FPR16:$Rn)>; def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))), (FRECPXv1i32 FPR32:$Rn)>; def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))), (FRECPXv1i64 FPR64:$Rn)>; +def : Pat<(f16 (int_aarch64_neon_frsqrte (f16 FPR16:$Rn))), + (FRSQRTEv1f16 FPR16:$Rn)>; def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))), (FRSQRTEv1i32 FPR32:$Rn)>; def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))), Index: 
clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c === --- clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c +++ clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c @@ -8,7 +8,7 @@ #include // CHECK-LABEL: test_vabsh_f16 -// CHECK: [[ABS:%.*]] = call half @llvm.aarch64.neon.abs.f16(half %a) +// CHECK: [[ABS:%.*]] = call half @llvm.fabs.f16(half %a) // CHECK: ret half [[ABS]] float16_t test_vabsh_f16(float16_t a) { return vabsh_f16(a); @@ -139,8 +139,9 @@ } // CHECK-LABEL: test_vcvtah_s16_f16 -// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtas.i16.f16(half %a) -// CHECK: ret i16 [[VCVT]] +// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f16(half %a) +// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16 +// CHECK: ret i16 [[RET]] int16_t test_vcvtah_s16_f16 (float16_t a) { return vcvtah_s16_f16(a); } @@ -160,8 +161,9 @@ } // CHECK-LABEL: test_vcvtah_u16_f16 -// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtau.i16.f16(half %a) -// CHECK: ret i16 [[VCVT]] +// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f16(half %a) +// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16 +// CHECK: ret i16 [[RET]] uint16_t test_vcvtah_u16_f16 (float16_t a) { return vcvtah_u16_f16(a); } @@ -181,8 +183,9 @@ } // CHECK-LABEL: test_vcvtmh_s16_f16 -// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtms.i16.f16(half %a) -// CHECK: ret i16 [[VCVT]] +// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f16(half %a) +// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16 +// CHECK: ret i16 [[RET]] int16_t test_vcvtmh_s16_f16 (float16_t a) { return vcvtmh_s16_f16(a); } @@ -202,8 +205,9 @@ } // CHECK-LABEL: test_vcvtmh_u16_f16 -// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtmu.i16.f16(half %a) -// CHECK: ret i16 [[VCVT]] +// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f16(half %a) +// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16 +// CHECK: ret i16 [[RET]] uint16_t test_vcvtmh_u16_f16 (float16_t a) { return 
vcvtmh_u16_f16(a); } @@ -223,8 +227,9 @@ } // CHECK-LABEL: test_vcvtnh_s16_f16 -// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtns.i16.f16(half %a) -// CHECK: ret i16 [[VCVT]] +// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f16(half %a) +// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16 +// CHECK: ret i16 [[RET]] int16_t test_vcvtnh_s16_f16 (float16_t a) { return vcvtnh_s16_f16(a); } @@ -244,8 +249,9 @@ } // CHECK-LABEL: test_vcvtnh_u16_f16 -// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtnu.i16.f16(half %a) -// CHECK: ret i16 [[VCVT]] +// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f16(half %a) +// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i
[PATCH] D41792: [AArch64] Add ARMv8.2-A FP16 scalar intrinsics
az marked 6 inline comments as done. az added inline comments. Comment at: clang/include/clang/Basic/arm_fp16.td:58 +class IInst : Inst {} + +// ARMv8.2-A FP16 intrinsics. SjoerdMeijer wrote: > az wrote: > > SjoerdMeijer wrote: > > > There's a little bit of duplication here: the definitions above are the > > > same as in arm_neon.td. Would it be easy to share this, with e.g. an > > > include? > > The duplication is small compared to the overall infrastructure/data > > structure needed to automatically generate the intrinsics. There are 3 ways > > to do this: 1) copy only the needed data structure in arm_fp16.td (this is > > what was done in original review) 2) put all data structure in a newly > > created file and include it in arm_neon.td and arm_fp16.td (done here). 3) > > put only the duplication in a new file and include it. I did not go for > > this one given that we create a new file for the only purpose of avoiding a > > small duplication but I am fine of going with 3 too. Note that some of the > > duplicated structure in the original arm_fp16.td was a stripped down > > version of the copied one. > Given that the duplication is tiny, I don't have strong opinions to be > honest. Would be nice to share these definitions if that's easy to do, > otherwise we can perfectly live with this I think. So, let's keep the current version for now which is: all generic stuff goes into the file called arm_neon_incl.td. All specific code that creates and generates the intrinsic goes into specific files arm_neon.td and arm_fp16.td which include the generic file. This will work well when we create new .td file for future features if any. 
Comment at: llvm/include/llvm/IR/IntrinsicsAArch64.td:250 def int_aarch64_neon_umax : AdvSIMD_2VectorArg_Intrinsic; - def int_aarch64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmax : AdvSIMD_2FloatArg_Intrinsic; def int_aarch64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic; SjoerdMeijer wrote: > There's a scalar and vector variant of FMAX and thus I am wondering if we > don't need two definitions here: one using AdvSIMD_2FloatArg_Intrinsic and > the other AdvSIMD_2VectorArg_Intrinsic? Maybe we can do that but there are many instances where vector and scalar share the same intrinsic name ( note that the type such as f16 or v4f32 is appended to that name). I have not checked carefully if what you propose already exists or not but it does seem less common from first look. https://reviews.llvm.org/D41792 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D41792: [AArch64] Add ARMv8.2-A FP16 scalar intrinsics
az updated this revision to Diff 129513. az marked 3 inline comments as done. https://reviews.llvm.org/D41792 Files: clang/include/clang/Basic/BuiltinsNEON.def clang/include/clang/Basic/CMakeLists.txt clang/include/clang/Basic/arm_fp16.td clang/include/clang/Basic/arm_neon.td clang/include/clang/Basic/arm_neon_incl.td clang/lib/Basic/Targets/AArch64.cpp clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Headers/CMakeLists.txt clang/lib/Headers/module.modulemap clang/lib/Sema/SemaChecking.cpp clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c clang/utils/TableGen/NeonEmitter.cpp clang/utils/TableGen/TableGen.cpp clang/utils/TableGen/TableGenBackends.h llvm/include/llvm/IR/IntrinsicsAArch64.td Index: llvm/include/llvm/IR/IntrinsicsAArch64.td === --- llvm/include/llvm/IR/IntrinsicsAArch64.td +++ llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -146,6 +146,9 @@ class AdvSIMD_CvtFPToFx_Intrinsic : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>; + + class AdvSIMD_1Arg_Intrinsic +: Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem]>; } // Arithmetic ops @@ -244,7 +247,7 @@ // Vector Max def int_aarch64_neon_smax : AdvSIMD_2VectorArg_Intrinsic; def int_aarch64_neon_umax : AdvSIMD_2VectorArg_Intrinsic; - def int_aarch64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmax : AdvSIMD_2FloatArg_Intrinsic; def int_aarch64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic; // Vector Max Across Lanes @@ -256,7 +259,7 @@ // Vector Min def int_aarch64_neon_smin : AdvSIMD_2VectorArg_Intrinsic; def int_aarch64_neon_umin : AdvSIMD_2VectorArg_Intrinsic; - def int_aarch64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmin : AdvSIMD_2FloatArg_Intrinsic; def int_aarch64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic; // Vector Min/Max Number @@ -354,7 +357,7 @@ def int_aarch64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic; // Vector Absolute Value - def int_aarch64_neon_abs : AdvSIMD_1IntArg_Intrinsic; + def int_aarch64_neon_abs : 
AdvSIMD_1Arg_Intrinsic; // Vector Saturating Absolute Value def int_aarch64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic; Index: clang/utils/TableGen/TableGenBackends.h === --- clang/utils/TableGen/TableGenBackends.h +++ clang/utils/TableGen/TableGenBackends.h @@ -65,6 +65,7 @@ void EmitClangCommentCommandList(RecordKeeper &Records, raw_ostream &OS); void EmitNeon(RecordKeeper &Records, raw_ostream &OS); +void EmitFP16(RecordKeeper &Records, raw_ostream &OS); void EmitNeonSema(RecordKeeper &Records, raw_ostream &OS); void EmitNeonTest(RecordKeeper &Records, raw_ostream &OS); void EmitNeon2(RecordKeeper &Records, raw_ostream &OS); Index: clang/utils/TableGen/TableGen.cpp === --- clang/utils/TableGen/TableGen.cpp +++ clang/utils/TableGen/TableGen.cpp @@ -52,6 +52,7 @@ GenClangCommentCommandInfo, GenClangCommentCommandList, GenArmNeon, + GenArmFP16, GenArmNeonSema, GenArmNeonTest, GenAttrDocs, @@ -139,6 +140,7 @@ "Generate list of commands that are used in " "documentation comments"), clEnumValN(GenArmNeon, "gen-arm-neon", "Generate arm_neon.h for clang"), +clEnumValN(GenArmFP16, "gen-arm-fp16", "Generate arm_fp16.h for clang"), clEnumValN(GenArmNeonSema, "gen-arm-neon-sema", "Generate ARM NEON sema support for clang"), clEnumValN(GenArmNeonTest, "gen-arm-neon-test", @@ -250,6 +252,9 @@ case GenArmNeon: EmitNeon(Records, OS); break; + case GenArmFP16: +EmitFP16(Records, OS); +break; case GenArmNeonSema: EmitNeonSema(Records, OS); break; Index: clang/utils/TableGen/NeonEmitter.cpp === --- clang/utils/TableGen/NeonEmitter.cpp +++ clang/utils/TableGen/NeonEmitter.cpp @@ -552,7 +552,11 @@ // run - Emit arm_neon.h.inc void run(raw_ostream &o); + // runFP16 - Emit arm_fp16.h.inc + void runFP16(raw_ostream &o); + // runHeader - Emit all the __builtin prototypes used in arm_neon.h + // and arm_fp16.h void runHeader(raw_ostream &o); // runTests - Emit tests for all the Neon intrinsics. 
@@ -852,6 +856,35 @@ NumVectors = 0; Float = true; break; + case 'Y': +Bitwidth = ElementBitwidth = 16; +NumVectors = 0; +Float = true; +break; + case 'I': +Bitwidth = ElementBitwidth = 32; +NumVectors = 0; +Float = false; +Signed = true; +break; + case 'L': +Bitwidth = ElementBitwidth = 64; +NumVectors = 0; +Float = false; +Signed = true; +break; + case 'U': +Bitwidth = ElementBitwidth = 32; +NumVectors = 0; +Float = false; +Signed = false; +break; +
[PATCH] D41792: [AArch64] Add ARMv8.2-A FP16 scalar intrinsics
az marked 8 inline comments as done. az added inline comments. Comment at: clang/include/clang/Basic/arm_fp16.td:58 +class IInst : Inst {} + +// ARMv8.2-A FP16 intrinsics. SjoerdMeijer wrote: > There's a little bit of duplication here: the definitions above are the same > as in arm_neon.td. Would it be easy to share this, with e.g. an include? The duplication is small compared to the overall infrastructure/data structure needed to automatically generate the intrinsics. There are 3 ways to do this: 1) copy only the needed data structure in arm_fp16.td (this is what was done in original review) 2) put all data structure in a newly created file and include it in arm_neon.td and arm_fp16.td (done here). 3) put only the duplication in a new file and include it. I did not go for this one given that we create a new file for the only purpose of avoiding a small duplication but I am fine with going with 3 too. Note that some of the duplicated structure in the original arm_fp16.td was a stripped down version of the copied one. Comment at: clang/lib/CodeGen/CGBuiltin.cpp:4102 NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType), + // FP16 scalar intrinsics go here. + NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType), SjoerdMeijer wrote: > Looks like a few intrinsic descriptions are missing here. For example, the > first 2-operand intrinsic vaddh_f16 is missing, but there are also more. Is > this intentional, or might they have slipped through the cracks (or am I > missing something)? I agree that this is confusing. For the intrinsics listed in this table, code generation happens in a generic way based on the info in the table. The ones not listed in this table are addressed in a more specific way below in the function called EmitAArch64BuiltinExpr. While I do not like how few things were implemented in generating the intrinsics, I am in general following the approach taken for arm_neon instead of introducing a new approach. 
Comment at: llvm/include/llvm/IR/IntrinsicsAArch64.td:149 [IntrNoMem]>; + + class AdvSIMD_1Arg_Intrinsic SjoerdMeijer wrote: > This and the other changes in this file are changes to LLVM. Do we need these > changes for this patch? It doesn't look like it. Some tests in aarch64-v8.2a-fp16-intrinsics.c will fail for me without these changes. In clang/lib/CodeGen/BackendUtil.cpp, there is code that includes llvm files and header files. It fails there if I do not fix IntrinsicAArch64.td. If you know of a better way to test differently without the need for llvm, then let me know. For example, if I remove the option flag -S from testing (i.e. from aarch64-v8.2a-fp16-intrinsics.c), then there is no need for llvm but I won't be able to compare results. https://reviews.llvm.org/D41792 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D41792: [AArch64] Add ARMv8.2-A FP16 scalar intrinsics
az updated this revision to Diff 129360. https://reviews.llvm.org/D41792 Files: clang/include/clang/Basic/BuiltinsNEON.def clang/include/clang/Basic/CMakeLists.txt clang/include/clang/Basic/arm_fp16.td clang/include/clang/Basic/arm_neon.td clang/include/clang/Basic/arm_neon_incl.td clang/lib/Basic/Targets/AArch64.cpp clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Headers/CMakeLists.txt clang/lib/Headers/module.modulemap clang/lib/Sema/SemaChecking.cpp clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c clang/utils/TableGen/NeonEmitter.cpp clang/utils/TableGen/TableGen.cpp clang/utils/TableGen/TableGenBackends.h llvm/include/llvm/IR/IntrinsicsAArch64.td Index: llvm/include/llvm/IR/IntrinsicsAArch64.td === --- llvm/include/llvm/IR/IntrinsicsAArch64.td +++ llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -146,6 +146,9 @@ class AdvSIMD_CvtFPToFx_Intrinsic : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>; + + class AdvSIMD_1Arg_Intrinsic +: Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem]>; } // Arithmetic ops @@ -244,7 +247,7 @@ // Vector Max def int_aarch64_neon_smax : AdvSIMD_2VectorArg_Intrinsic; def int_aarch64_neon_umax : AdvSIMD_2VectorArg_Intrinsic; - def int_aarch64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmax : AdvSIMD_2FloatArg_Intrinsic; def int_aarch64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic; // Vector Max Across Lanes @@ -256,7 +259,7 @@ // Vector Min def int_aarch64_neon_smin : AdvSIMD_2VectorArg_Intrinsic; def int_aarch64_neon_umin : AdvSIMD_2VectorArg_Intrinsic; - def int_aarch64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmin : AdvSIMD_2FloatArg_Intrinsic; def int_aarch64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic; // Vector Min/Max Number @@ -354,7 +357,7 @@ def int_aarch64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic; // Vector Absolute Value - def int_aarch64_neon_abs : AdvSIMD_1IntArg_Intrinsic; + def int_aarch64_neon_abs : AdvSIMD_1Arg_Intrinsic; // Vector Saturating 
Absolute Value def int_aarch64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic; Index: clang/utils/TableGen/TableGenBackends.h === --- clang/utils/TableGen/TableGenBackends.h +++ clang/utils/TableGen/TableGenBackends.h @@ -65,6 +65,7 @@ void EmitClangCommentCommandList(RecordKeeper &Records, raw_ostream &OS); void EmitNeon(RecordKeeper &Records, raw_ostream &OS); +void EmitFP16(RecordKeeper &Records, raw_ostream &OS); void EmitNeonSema(RecordKeeper &Records, raw_ostream &OS); void EmitNeonTest(RecordKeeper &Records, raw_ostream &OS); void EmitNeon2(RecordKeeper &Records, raw_ostream &OS); Index: clang/utils/TableGen/TableGen.cpp === --- clang/utils/TableGen/TableGen.cpp +++ clang/utils/TableGen/TableGen.cpp @@ -52,6 +52,7 @@ GenClangCommentCommandInfo, GenClangCommentCommandList, GenArmNeon, + GenArmFP16, GenArmNeonSema, GenArmNeonTest, GenAttrDocs, @@ -139,6 +140,7 @@ "Generate list of commands that are used in " "documentation comments"), clEnumValN(GenArmNeon, "gen-arm-neon", "Generate arm_neon.h for clang"), +clEnumValN(GenArmFP16, "gen-arm-fp16", "Generate arm_fp16.h for clang"), clEnumValN(GenArmNeonSema, "gen-arm-neon-sema", "Generate ARM NEON sema support for clang"), clEnumValN(GenArmNeonTest, "gen-arm-neon-test", @@ -250,6 +252,9 @@ case GenArmNeon: EmitNeon(Records, OS); break; + case GenArmFP16: +EmitFP16(Records, OS); +break; case GenArmNeonSema: EmitNeonSema(Records, OS); break; Index: clang/utils/TableGen/NeonEmitter.cpp === --- clang/utils/TableGen/NeonEmitter.cpp +++ clang/utils/TableGen/NeonEmitter.cpp @@ -552,7 +552,11 @@ // run - Emit arm_neon.h.inc void run(raw_ostream &o); + // runFP16 - Emit arm_fp16.h.inc + void runFP16(raw_ostream &o); + // runHeader - Emit all the __builtin prototypes used in arm_neon.h + // and arm_fp16.h void runHeader(raw_ostream &o); // runTests - Emit tests for all the Neon intrinsics. 
@@ -852,6 +856,35 @@ NumVectors = 0; Float = true; break; + case 'Y': +Bitwidth = ElementBitwidth = 16; +NumVectors = 0; +Float = true; +break; + case 'I': +Bitwidth = ElementBitwidth = 32; +NumVectors = 0; +Float = false; +Signed = true; +break; + case 'L': +Bitwidth = ElementBitwidth = 64; +NumVectors = 0; +Float = false; +Signed = true; +break; + case 'U': +Bitwidth = ElementBitwidth = 32; +NumVectors = 0; +Float = false; +Signed = false; +break; + case 'O': +Bitwidth = ElementBi
[PATCH] D41792: [AArch64] Add ARMv8.2-A FP16 scalar intrinsics
az created this revision. az added a reviewer: SjoerdMeijer. Herald added subscribers: kristof.beyls, javed.absar, mgorny, rengolin, aemerson. ARMv8.2-A introduces half-precision floating point data processing. This patch adds the fp16 scalar intrinsics for this architecture as described in the ARM ACLE document. Only the front-end intrinsic work is done here. Some backend work related to instruction selection still needs to be done. This work is a continuation of https://reviews.llvm.org/D32511 which addressed ARMv8.2-A vector intrinsics. https://reviews.llvm.org/D41792 Files: clang/include/clang/Basic/BuiltinsNEON.def clang/include/clang/Basic/CMakeLists.txt clang/include/clang/Basic/arm_fp16.td clang/lib/Basic/Targets/AArch64.cpp clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Headers/CMakeLists.txt clang/lib/Headers/module.modulemap clang/lib/Sema/SemaChecking.cpp clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c clang/utils/TableGen/NeonEmitter.cpp clang/utils/TableGen/TableGen.cpp clang/utils/TableGen/TableGenBackends.h llvm/include/llvm/IR/IntrinsicsAArch64.td Index: llvm/include/llvm/IR/IntrinsicsAArch64.td === --- llvm/include/llvm/IR/IntrinsicsAArch64.td +++ llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -146,6 +146,9 @@ class AdvSIMD_CvtFPToFx_Intrinsic : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>; + + class AdvSIMD_1Arg_Intrinsic +: Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem]>; } // Arithmetic ops @@ -244,7 +247,7 @@ // Vector Max def int_aarch64_neon_smax : AdvSIMD_2VectorArg_Intrinsic; def int_aarch64_neon_umax : AdvSIMD_2VectorArg_Intrinsic; - def int_aarch64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmax : AdvSIMD_2FloatArg_Intrinsic; def int_aarch64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic; // Vector Max Across Lanes @@ -256,7 +259,7 @@ // Vector Min def int_aarch64_neon_smin : AdvSIMD_2VectorArg_Intrinsic; def int_aarch64_neon_umin : AdvSIMD_2VectorArg_Intrinsic; - def 
int_aarch64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmin : AdvSIMD_2FloatArg_Intrinsic; def int_aarch64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic; // Vector Min/Max Number @@ -354,7 +357,8 @@ def int_aarch64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic; // Vector Absolute Value - def int_aarch64_neon_abs : AdvSIMD_1IntArg_Intrinsic; + //def int_aarch64_neon_abs : AdvSIMD_1IntArg_Intrinsic; + def int_aarch64_neon_abs : AdvSIMD_1Arg_Intrinsic; // Vector Saturating Absolute Value def int_aarch64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic; Index: clang/utils/TableGen/TableGenBackends.h === --- clang/utils/TableGen/TableGenBackends.h +++ clang/utils/TableGen/TableGenBackends.h @@ -65,6 +65,7 @@ void EmitClangCommentCommandList(RecordKeeper &Records, raw_ostream &OS); void EmitNeon(RecordKeeper &Records, raw_ostream &OS); +void EmitFP16(RecordKeeper &Records, raw_ostream &OS); void EmitNeonSema(RecordKeeper &Records, raw_ostream &OS); void EmitNeonTest(RecordKeeper &Records, raw_ostream &OS); void EmitNeon2(RecordKeeper &Records, raw_ostream &OS); Index: clang/utils/TableGen/TableGen.cpp === --- clang/utils/TableGen/TableGen.cpp +++ clang/utils/TableGen/TableGen.cpp @@ -52,6 +52,7 @@ GenClangCommentCommandInfo, GenClangCommentCommandList, GenArmNeon, + GenArmFP16, GenArmNeonSema, GenArmNeonTest, GenAttrDocs, @@ -139,6 +140,7 @@ "Generate list of commands that are used in " "documentation comments"), clEnumValN(GenArmNeon, "gen-arm-neon", "Generate arm_neon.h for clang"), +clEnumValN(GenArmFP16, "gen-arm-fp16", "Generate arm_fp16.h for clang"), clEnumValN(GenArmNeonSema, "gen-arm-neon-sema", "Generate ARM NEON sema support for clang"), clEnumValN(GenArmNeonTest, "gen-arm-neon-test", @@ -250,6 +252,9 @@ case GenArmNeon: EmitNeon(Records, OS); break; + case GenArmFP16: +EmitFP16(Records, OS); +break; case GenArmNeonSema: EmitNeonSema(Records, OS); break; Index: clang/utils/TableGen/NeonEmitter.cpp === --- clang/utils/TableGen/NeonEmitter.cpp +++ 
clang/utils/TableGen/NeonEmitter.cpp @@ -552,7 +552,11 @@ // run - Emit arm_neon.h.inc void run(raw_ostream &o); + // runFP16 - Emit arm_fp16.h.inc + void runFP16(raw_ostream &o); + // runHeader - Emit all the __builtin prototypes used in arm_neon.h + // and arm_fp16.h void runHeader(raw_ostream &o); // runTests - Emit tests for all the Neon intrinsics. @@ -852,6 +856,35 @@ NumVectors = 0; Float = true; break; + case 'Y': +Bitwidth