[PATCH] D49941: [ARM] Add ARMv8.2-A FP16 scalar intrinsic

2018-07-27 Thread Abderrazek Zaafrani via Phabricator via cfe-commits
az created this revision.
az added a reviewer: SjoerdMeijer.
Herald added a reviewer: javed.absar.
Herald added subscribers: cfe-commits, chrib, kristof.beyls.

This patch adds the fp16 scalar intrinsics for ARM as described in the ARM ACLE 
document. Only the frontend work is done here; some work is still needed in 
the backend codegen.


Repository:
  rC Clang

https://reviews.llvm.org/D49941

Files:
  clang/include/clang/Basic/arm_fp16.td
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/arm-v8.2a-fp16-intrinsics.c
  llvm/include/llvm/IR/IntrinsicsARM.td

Index: llvm/include/llvm/IR/IntrinsicsARM.td
===
--- llvm/include/llvm/IR/IntrinsicsARM.td
+++ llvm/include/llvm/IR/IntrinsicsARM.td
@@ -355,6 +355,9 @@
 class Neon_2Arg_Intrinsic
   : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
   [IntrNoMem]>;
+class Float_2Arg_Intrinsic
+  : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
+  [IntrNoMem]>;
 class Neon_2Arg_Narrow_Intrinsic
   : Intrinsic<[llvm_anyvector_ty], [LLVMExtendedType<0>, LLVMExtendedType<0>],
   [IntrNoMem]>;
@@ -377,8 +380,8 @@
   : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
 class Neon_CvtFPToFx_Intrinsic
   : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>;
-class Neon_CvtFPtoInt_1Arg_Intrinsic
-  : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+class CvtFPtoInt_1Arg_Intrinsic
+  : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>;
 
 class Neon_Compare_Intrinsic
   : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>],
@@ -431,12 +434,12 @@
   // Vector Maximum.
   def int_arm_neon_vmaxs : Neon_2Arg_Intrinsic;
   def int_arm_neon_vmaxu : Neon_2Arg_Intrinsic;
-  def int_arm_neon_vmaxnm : Neon_2Arg_Intrinsic;
+  def int_arm_neon_vmaxnm : Float_2Arg_Intrinsic;
 
   // Vector Minimum.
   def int_arm_neon_vmins : Neon_2Arg_Intrinsic;
   def int_arm_neon_vminu : Neon_2Arg_Intrinsic;
-  def int_arm_neon_vminnm : Neon_2Arg_Intrinsic;
+  def int_arm_neon_vminnm : Float_2Arg_Intrinsic;
 
   // Vector Reciprocal Step.
   def int_arm_neon_vrecps : Neon_2Arg_Intrinsic;
@@ -552,15 +555,15 @@
 // Vector Reciprocal Square Root Estimate.
 def int_arm_neon_vrsqrte : Neon_1Arg_Intrinsic;
 
-// Vector Conversions Between Floating-point and Integer
-def int_arm_neon_vcvtau : Neon_CvtFPtoInt_1Arg_Intrinsic;
-def int_arm_neon_vcvtas : Neon_CvtFPtoInt_1Arg_Intrinsic;
-def int_arm_neon_vcvtnu : Neon_CvtFPtoInt_1Arg_Intrinsic;
-def int_arm_neon_vcvtns : Neon_CvtFPtoInt_1Arg_Intrinsic;
-def int_arm_neon_vcvtpu : Neon_CvtFPtoInt_1Arg_Intrinsic;
-def int_arm_neon_vcvtps : Neon_CvtFPtoInt_1Arg_Intrinsic;
-def int_arm_neon_vcvtmu : Neon_CvtFPtoInt_1Arg_Intrinsic;
-def int_arm_neon_vcvtms : Neon_CvtFPtoInt_1Arg_Intrinsic;
+// Conversions Between Floating-point and Integer
+def int_arm_neon_vcvtau : CvtFPtoInt_1Arg_Intrinsic;
+def int_arm_neon_vcvtas : CvtFPtoInt_1Arg_Intrinsic;
+def int_arm_neon_vcvtnu : CvtFPtoInt_1Arg_Intrinsic;
+def int_arm_neon_vcvtns : CvtFPtoInt_1Arg_Intrinsic;
+def int_arm_neon_vcvtpu : CvtFPtoInt_1Arg_Intrinsic;
+def int_arm_neon_vcvtps : CvtFPtoInt_1Arg_Intrinsic;
+def int_arm_neon_vcvtmu : CvtFPtoInt_1Arg_Intrinsic;
+def int_arm_neon_vcvtms : CvtFPtoInt_1Arg_Intrinsic;
 
 // Vector Conversions Between Floating-point and Fixed-point.
 def int_arm_neon_vcvtfp2fxs : Neon_CvtFPToFx_Intrinsic;
Index: clang/test/CodeGen/arm-v8.2a-fp16-intrinsics.c
===
--- /dev/null
+++ clang/test/CodeGen/arm-v8.2a-fp16-intrinsics.c
@@ -0,0 +1,225 @@
+// RUN: %clang_cc1 -triple armv8.2a-linux-gnu -target-abi apcs-gnu -target-feature +neon -target-feature +fullfp16 \
+// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg \
+// RUN: | FileCheck %s
+
+// REQUIRES: arm-registered-target
+
+#include <arm_fp16.h>
+
+// CHECK-LABEL: test_vabsh_f16
+// CHECK:  [[ABS:%.*]] =  call half @llvm.fabs.f16(half %{{.*}})
+float16_t test_vabsh_f16(float16_t a) {
+  return vabsh_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_s32
+// CHECK:  [[VCVT:%.*]] = sitofp i32 %a to half
+float16_t test_vcvth_f16_s32 (int32_t a) {
+  return vcvth_f16_s32(a);
+}
+
+// CHECK-LABEL: test_vcvth_f16_u32
+// CHECK:  [[VCVT:%.*]] = uitofp i32 %a to half
+float16_t test_vcvth_f16_u32 (uint32_t a) {
+  return vcvth_f16_u32(a);
+}
+
+// CHECK-LABEL: test_vcvth_s32_f16
+// CHECK:  [[VCVT:%.*]] = fptosi half %{{.*}} to i32
+// CHECK: ret i32 [[VCVT]]
+int32_t test_vcvth_s32_f16 (float16_t a) {
+  return vcvth_s32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvth_u32_f16
+// CHECK:  [[VCVT:%.*]] = fptoui half %{{.*}} to i32
+// CHECK: ret i32 [[VCVT]]
+uint32_t test_vcvth_u32_f16 (float16_t a) {
+  return vcvth_u32_f16(a);
+}
+
+// CHECK-LABEL: test_vcvtah_s32_f16
+// CHECK: [[VCVT:%.*]] = call i32 @llvm.

[PATCH] D44591: [AArch64] Add vmulxh_lane FP16 vector intrinsic

2018-03-19 Thread Abderrazek Zaafrani via Phabricator via cfe-commits
az updated this revision to Diff 139015.
az added a comment.

add LLVM codegen tests as suggested in the reviews.


https://reviews.llvm.org/D44591

Files:
  clang/include/clang/Basic/arm_neon.td
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
  llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll

Index: llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
===
--- llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.2a,+fullfp16  | FileCheck %s
 
+declare half @llvm.aarch64.neon.fmulx.f16(half, half)
 declare <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half>, <4 x half>)
 declare <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half>, <8 x half>)
 declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
@@ -236,6 +237,25 @@
   ret half %1
 }
 
+define dso_local half @t_vmulx_f16(half %a, half %b) {
+; CHECK-LABEL: t_vmulx_f16:
+; CHECK: fmulx h0, h0, h1
+; CHECK-NEXT:ret
+entry:
+  %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %b)
+  ret half %fmulx.i
+}
+
+define dso_local half @t_vmulxh_lane_f16(half %a, <4 x half> %b, i32 %lane) {
+; CHECK-LABEL: t_vmulxh_lane_f16:
+; CHECK: fmulx h0, h0, v1.h[3]
+; CHECK-NEXT:ret
+entry:
+  %extract = extractelement <4 x half> %b, i32 3
+  %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
+  ret half %fmulx.i
+}
+
 define dso_local <4 x half> @t_vmulx_lane_f16(<4 x half> %a, <4 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulx_lane_f16:
 ; CHECK: fmulx v0.4h, v0.4h, v1.h[0]
@@ -276,6 +296,16 @@
   ret <8 x half> %vmulx2.i
 }
 
+define dso_local half @t_vmulxh_laneq_f16(half %a, <8 x half> %b, i32 %lane) {
+; CHECK-LABEL: t_vmulxh_laneq_f16:
+; CHECK: fmulx h0, h0, v1.h[7]
+; CHECK-NEXT:ret
+entry:
+  %extract = extractelement <8 x half> %b, i32 7
+  %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
+  ret half %fmulx.i
+}
+
 define dso_local <4 x half> @t_vmulx_n_f16(<4 x half> %a, half %c) {
 ; CHECK-LABEL: t_vmulx_n_f16:
 ; CHECK: dup v1.4h, v1.h[0]
Index: clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
===
--- clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -1223,27 +1223,25 @@
   return vmulxq_n_f16(a, b);
 }
 
-/* TODO: Not implemented yet (needs scalar intrinsic from arm_fp16.h)
-// CCHECK-LABEL: test_vmulxh_lane_f16
-// CCHECK: [[CONV0:%.*]] = fpext half %a to float
-// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CCHECK: [[MUL:%.*]]   = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CCHECK: ret half [[CONV3:%.*]]
+// CHECK-LABEL: test_vmulxh_lane_f16
+// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK: [[EXTR:%.*]] = extractelement <4 x half> [[TMP1]], i32 3
+// CHECK: [[MULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half [[EXTR]]
+// CHECK: ret half [[MULX]]
 float16_t test_vmulxh_lane_f16(float16_t a, float16x4_t b) {
   return vmulxh_lane_f16(a, b, 3);
 }
 
-// CCHECK-LABEL: test_vmulxh_laneq_f16
-// CCHECK: [[CONV0:%.*]] = fpext half %a to float
-// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CCHECK: [[MUL:%.*]]   = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CCHECK: ret half [[CONV3:%.*]]
+// CHECK-LABEL: test_vmulxh_laneq_f16
+// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK: [[EXTR:%.*]] = extractelement <8 x half> [[TMP1]], i32 7
+// CHECK: [[MULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half [[EXTR]])
+// CHECK: ret half [[MULX]]
 float16_t test_vmulxh_laneq_f16(float16_t a, float16x8_t b) {
   return vmulxh_laneq_f16(a, b, 7);
 }
-*/
 
 // CHECK-LABEL: test_vmaxv_f16
 // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
Index: clang/lib/CodeGen/CGBuiltin.cpp
===
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -7238,6 +7238,16 @@
 Int = Intrinsic::aarch64_neon_fmulx;
 return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
   }
+  case NEON::BI__builtin_neon_vmulxh_lane_f16:
+  case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
+// vmulx_lane should be mapped to Neon scalar mulx after
+// extracting the scalar element
+Ops.push_back(EmitScalarExpr(E->getArg(2)));
+Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
+Ops.pop_back();
+Int = Intrinsic::aarch64_neon_fmulx;
+return EmitNeonCall(CGM.getIntrins

[PATCH] D44222: [AArch64] Add vmulxh_lane FP16 intrinsics

2018-03-16 Thread Abderrazek Zaafrani via Phabricator via cfe-commits
az added a comment.

I was not able to update this particular review with the new code, so I created a 
new one at https://reviews.llvm.org/D44591

I managed to reuse the mulx scalar intrinsic work — not by calling the fp16 
scalar intrinsic itself, which is not available here, but by doing the same frontend 
codegen work with an extract instruction before it.


https://reviews.llvm.org/D44222



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D44591: [AArch64] Add vmulxh_lane FP16 vector intrinsic

2018-03-16 Thread Abderrazek Zaafrani via Phabricator via cfe-commits
az created this revision.
az added a reviewer: SjoerdMeijer.
Herald added subscribers: kristof.beyls, javed.absar, rengolin.

Add the two missing vmulxh_lane vector intrinsics that were originally left out.


https://reviews.llvm.org/D44591

Files:
  clang/include/clang/Basic/arm_neon.td
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c


Index: clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
===
--- clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -1223,27 +1223,25 @@
   return vmulxq_n_f16(a, b);
 }
 
-/* TODO: Not implemented yet (needs scalar intrinsic from arm_fp16.h)
-// CCHECK-LABEL: test_vmulxh_lane_f16
-// CCHECK: [[CONV0:%.*]] = fpext half %a to float
-// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CCHECK: [[MUL:%.*]]   = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CCHECK: ret half [[CONV3:%.*]]
+// CHECK-LABEL: test_vmulxh_lane_f16
+// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK: [[EXTR:%.*]] = extractelement <4 x half> [[TMP1]], i32 3
+// CHECK: [[MULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half 
[[EXTR]]
+// CHECK: ret half [[MULX]]
 float16_t test_vmulxh_lane_f16(float16_t a, float16x4_t b) {
   return vmulxh_lane_f16(a, b, 3);
 }
 
-// CCHECK-LABEL: test_vmulxh_laneq_f16
-// CCHECK: [[CONV0:%.*]] = fpext half %a to float
-// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CCHECK: [[MUL:%.*]]   = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CCHECK: ret half [[CONV3:%.*]]
+// CHECK-LABEL: test_vmulxh_laneq_f16
+// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK: [[EXTR:%.*]] = extractelement <8 x half> [[TMP1]], i32 7
+// CHECK: [[MULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half 
[[EXTR]])
+// CHECK: ret half [[MULX]]
 float16_t test_vmulxh_laneq_f16(float16_t a, float16x8_t b) {
   return vmulxh_laneq_f16(a, b, 7);
 }
-*/
 
 // CHECK-LABEL: test_vmaxv_f16
 // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
Index: clang/lib/CodeGen/CGBuiltin.cpp
===
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -7248,6 +7248,16 @@
 Int = Intrinsic::aarch64_neon_fmulx;
 return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
   }
+  case NEON::BI__builtin_neon_vmulxh_lane_f16:
+  case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
+// vmulx_lane should be mapped to Neon scalar mulx after
+// extracting the scalar element
+Ops.push_back(EmitScalarExpr(E->getArg(2)));
+Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
+Ops.pop_back();
+Int = Intrinsic::aarch64_neon_fmulx;
+return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmulx");
+  }
   case NEON::BI__builtin_neon_vmul_lane_v:
   case NEON::BI__builtin_neon_vmul_laneq_v: {
 // v1f64 vmul_lane should be mapped to Neon scalar mul lane
Index: clang/include/clang/Basic/arm_neon.td
===
--- clang/include/clang/Basic/arm_neon.td
+++ clang/include/clang/Basic/arm_neon.td
@@ -1547,11 +1547,9 @@
   def VMULX_LANEH   : IOpInst<"vmulx_lane", "ddgi", "hQh", OP_MULX_LN>;
   def VMULX_LANEQH  : IOpInst<"vmulx_laneq", "ddji", "hQh", OP_MULX_LN>;
   def VMULX_NH  : IOpInst<"vmulx_n", "dds", "hQh", OP_MULX_N>;
-  // TODO: Scalar floating point multiply extended (scalar, by element)
-  // Below ones are commented out because they need vmulx_f16(float16_t, 
float16_t)
-  // which will be implemented later with fp16 scalar intrinsic (arm_fp16.h)
-  //def SCALAR_FMULX_LANEH : IOpInst<"vmulx_lane", "ssdi", "Sh", 
OP_SCALAR_MUL_LN>;
-  //def SCALAR_FMULX_LANEQH : IOpInst<"vmulx_laneq", "ssji", "Sh", 
OP_SCALAR_MUL_LN>;
+  // Scalar floating point  mulx (scalar, by element)
+  def SCALAR_FMULX_LANEH : IInst<"vmulx_lane", "ssdi", "Sh">;
+  def SCALAR_FMULX_LANEQH : IInst<"vmulx_laneq", "ssji", "Sh">;
 
   // ARMv8.2-A FP16 reduction vector intrinsics.
   def VMAXVH   : SInst<"vmaxv", "sd", "hQh">;


Index: clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
===
--- clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -1223,27 +1223,25 @@
   return vmulxq_n_f16(a, b);
 }
 
-/* TODO: Not implemented yet (needs scalar intrinsic from arm_fp16.h)
-// CCHECK-LABEL: test_vmulxh_lane_f16
-// CCHECK: [[CONV0:%.*]] = fpext half %a to float
-// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CCHECK: [[MUL:%.*]]   = fmul float [[CONV0:%.*]], [[CONV0:%.*

[PATCH] D44222: [AArch64] Add vmulxh_lane FP16 intrinsics

2018-03-08 Thread Abderrazek Zaafrani via Phabricator via cfe-commits
az added inline comments.



Comment at: include/clang/Basic/arm_neon.td:1504
+  // Scalar floating point multiply extended (scalar, by element)
+  def SCALAR_FMULX_LANEH : IOpInst<"vmulx_lane", "ssdi", "Sh", 
OP_SCALAR_MUL_LN>;
+  def SCALAR_FMULX_LANEQH : IOpInst<"vmulx_laneq", "ssji", "Sh", 
OP_SCALAR_MUL_LN>;

SjoerdMeijer wrote:
> I found that unfortunately it's not that straightforward. This leads to wrong 
> code generation as it is generating a fmul instead of fmulx. I am suspecting 
> this instruction description should be using OP_SCALAR_MULX_LN, but also the 
> type decls are wrong. Need to dig a bit further here.
Sorry for confusion as the commented code was never intended to be used and it 
is a copy of the code for the intrinsic vmulh_lane(). It was done that way in 
order to point out that vmulh_lane() and vmulxh_lane() intrinsics should be 
implemented in a similar way. The only useful thing in the commented code is 
the explanation that we need the scalar intrinsic vmulxh_f16() which was 
implemented in the scalar intrinsic patch later on.
 
If we look at how vmulh_lane (a, b, lane) is implemented:
x = extract (b, lane);
res = a * x;
return res; 

Similarly, I thought at the time that vmulxh_lane (a, b, lane) can be 
implemented: 
x = extract (b, lane);
res = vmulxh_f16 (a, x);  // no llvm native mulx instruction, so we use the 
fp16 scalar intrinsic.
return res; 

I am not sure now that we can easily use the scalar intrinsic while generating the 
arm_neon.h file. In case we can not do that, I am thinking that the frontend 
should generate a new builtin for the intrinsic vmulxh_lane() that the backend 
recognizes and generates the right code for, which is fmulx h0, h0, 
v1.h[lane]. If you have made or will be making progress on this, then that is great. 
Otherwise, I can look at a frontend solution for it.


https://reviews.llvm.org/D44222



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43650: [ARM] Add ARMv8.2-A FP16 vector intrinsics

2018-02-22 Thread Abderrazek Zaafrani via Phabricator via cfe-commits
az created this revision.
az added a reviewer: SjoerdMeijer.
Herald added subscribers: kristof.beyls, javed.absar.

This patch adds the fp16 neon vector intrinsic for ARM as described in the ARM 
ACLE document.

While this patch may seem large at first, it is essentially a 
modification/addition on top of some old work, done by the following:

- Port AArch64 patch https://reviews.llvm.org/D32511 to ARM.
- Enable the frontend fp16 data type for ARM (which is a revert of patch  
https://reviews.llvm.org/D41360).


https://reviews.llvm.org/D43650

Files:
  clang/include/clang/Basic/arm_neon.td
  clang/lib/Basic/Targets/ARM.cpp
  clang/lib/Basic/Targets/ARM.h
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenFunction.h
  clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
  clang/test/CodeGen/arm_neon_intrinsics.c

Index: clang/test/CodeGen/arm_neon_intrinsics.c
===
--- clang/test/CodeGen/arm_neon_intrinsics.c
+++ clang/test/CodeGen/arm_neon_intrinsics.c
@@ -3896,9 +3896,8 @@
 
 // CHECK-LABEL: @test_vld1q_f16(
 // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VLD1]] to <8 x half>
-// CHECK:   ret <8 x half> [[TMP1]]
+// CHECK:   [[VLD1:%.*]] = call <8 x half> @llvm.arm.neon.vld1.v8f16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <8 x half> [[VLD1]]
 float16x8_t test_vld1q_f16(float16_t const * a) {
   return vld1q_f16(a);
 }
@@ -3990,9 +3989,8 @@
 
 // CHECK-LABEL: @test_vld1_f16(
 // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VLD1]] to <4 x half>
-// CHECK:   ret <4 x half> [[TMP1]]
+// CHECK:   [[VLD1:%.*]] = call <4 x half> @llvm.arm.neon.vld1.v4f16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <4 x half> [[VLD1]]
 float16x4_t test_vld1_f16(float16_t const * a) {
   return vld1_f16(a);
 }
@@ -4106,12 +4104,11 @@
 
 // CHECK-LABEL: @test_vld1q_dup_f16(
 // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
-// CHECK:   ret <8 x half> [[TMP4]]
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half*
+// CHECK:   [[TMP2:%.*]] = load half, half* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x half> undef, half [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x half> [[LANE]]
 float16x8_t test_vld1q_dup_f16(float16_t const * a) {
   return vld1q_dup_f16(a);
 }
@@ -4233,12 +4230,11 @@
 
 // CHECK-LABEL: @test_vld1_dup_f16(
 // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
-// CHECK:   ret <4 x half> [[TMP4]]
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half*
+// CHECK:   [[TMP2:%.*]] = load half, half* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x half> undef, half [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x half> [[LANE]]
 float16x4_t test_vld1_dup_f16(float16_t const * a) {
   return vld1_dup_f16(a);
 }
@@ -4365,12 +4361,11 @@
 // CHECK-LABEL: @test_vld1q_lane_f16(
 // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half>
-// CHECK:   ret <8 x half> [[TMP5]]
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half*
+// CHECK:   [[TMP4:%.*]] = load half, half* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
+// CHECK:   ret <8 x half> [[VLD1_LANE]]
 float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) {
   return vld1q_lane_f16(a, b, 7);
 }
@@ -4498,12 +44

[PATCH] D42993: [AArch64] Fixes for ARMv8.2-A FP16 scalar intrinsic

2018-02-12 Thread Abderrazek Zaafrani via Phabricator via cfe-commits
az closed this revision.
az added a comment.

Committed as r324940 and r324912


https://reviews.llvm.org/D42993



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D42993: [AArch64] Fixes for ARMv8.2-A FP16 scalar intrinsic

2018-02-08 Thread Abderrazek Zaafrani via Phabricator via cfe-commits
az updated this revision to Diff 133505.
az added a comment.

> Question about the failures: I am now wondering if this means we were and 
> still  are missing tests?

Given that this work is fixing https://reviews.llvm.org/D41792 which is mainly 
about adding frontend intrinsic support, then there is a test for each 
intrinsic and I am updating the tests for the intrinsics with code generation 
changes (they only test the frontend). As we previously discussed it, there is 
some work to be done in instruction selection in order to generate good IR for 
some intrinisics and the ones with problems need new tests. In this patch, I am 
fixing few simple ones that fall into that category (see AArch64InstrInfo.td) 
and I agree that I should have separated this patch into clang and llvm patches 
instead of just one. I am adding tests for those in this revision.


https://reviews.llvm.org/D42993

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
  llvm/lib/Target/AArch64/AArch64InstrInfo.td
  llvm/test/CodeGen/AArch64/f16-instructions.ll

Index: llvm/test/CodeGen/AArch64/f16-instructions.ll
===
--- llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -736,6 +736,9 @@
 declare half @llvm.nearbyint.f16(half %a) #0
 declare half @llvm.round.f16(half %a) #0
 declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0
+declare half @llvm.aarch64.neon.frecpe.f16(half %a) #0
+declare half @llvm.aarch64.neon.frecpx.f16(half %a) #0
+declare half @llvm.aarch64.neon.frsqrte.f16(half %a) #0
 
 ; CHECK-CVT-LABEL: test_sqrt:
 ; CHECK-CVT-NEXT: fcvt s0, h0
@@ -1124,4 +1127,31 @@
   ret half %r
 }
 
+; CHECK-FP16-LABEL: test_vrecpeh_f16:
+; CHECK-FP16-NEXT: frecpe h0, h0
+; CHECK-FP16-NEXT: ret
+
+define half @test_vrecpeh_f16(half %a) #0 {
+  %r = call half @llvm.aarch64.neon.frecpe.f16(half %a)
+  ret half %r
+}
+
+; CHECK-FP16-LABEL: test_vrecpxh_f16:
+; CHECK-FP16-NEXT: frecpx h0, h0
+; CHECK-FP16-NEXT: ret
+
+define half @test_vrecpxh_f16(half %a) #0 {
+  %r = call half @llvm.aarch64.neon.frecpx.f16(half %a)
+  ret half %r
+}
+
+; CHECK-FP16-LABEL: test_vrsqrteh_f16:
+; CHECK-FP16-NEXT: frsqrte h0, h0
+; CHECK-FP16-NEXT: ret
+
+define half @test_vrsqrteh_f16(half %a) #0 {
+  %r = call half @llvm.aarch64.neon.frsqrte.f16(half %a)
+  ret half %r
+}
+
 attributes #0 = { nounwind }
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3529,6 +3529,8 @@
 def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))),
   (FCVTPUv1i64 FPR64:$Rn)>;
 
+def : Pat<(f16 (int_aarch64_neon_frecpe (f16 FPR16:$Rn))),
+  (FRECPEv1f16 FPR16:$Rn)>;
 def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))),
   (FRECPEv1i32 FPR32:$Rn)>;
 def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
@@ -3560,11 +3562,15 @@
 def : Pat<(v2f64 (AArch64frecps (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
   (FRECPSv2f64 FPR128:$Rn, FPR128:$Rm)>;
 
+def : Pat<(f16 (int_aarch64_neon_frecpx (f16 FPR16:$Rn))),
+  (FRECPXv1f16 FPR16:$Rn)>;
 def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
   (FRECPXv1i32 FPR32:$Rn)>;
 def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
   (FRECPXv1i64 FPR64:$Rn)>;
 
+def : Pat<(f16 (int_aarch64_neon_frsqrte (f16 FPR16:$Rn))),
+  (FRSQRTEv1f16 FPR16:$Rn)>;
 def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))),
   (FRSQRTEv1i32 FPR32:$Rn)>;
 def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
Index: clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
===
--- clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
+++ clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
@@ -8,7 +8,7 @@
 #include <arm_fp16.h>
 
 // CHECK-LABEL: test_vabsh_f16
-// CHECK:  [[ABS:%.*]] =  call half @llvm.aarch64.neon.abs.f16(half %a)
+// CHECK:  [[ABS:%.*]] =  call half @llvm.fabs.f16(half %a)
 // CHECK:  ret half [[ABS]]
 float16_t test_vabsh_f16(float16_t a) {
   return vabsh_f16(a);
@@ -139,8 +139,9 @@
 }
 
 // CHECK-LABEL: test_vcvtah_s16_f16
-// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtas.i16.f16(half %a)
-// CHECK: ret i16 [[VCVT]]
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16
+// CHECK: ret i16 [[RET]]
 int16_t test_vcvtah_s16_f16 (float16_t a) {
   return vcvtah_s16_f16(a);
 }
@@ -160,8 +161,9 @@
 }
 
 // CHECK-LABEL: test_vcvtah_u16_f16
-// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtau.i16.f16(half %a)
-// CHECK: ret i16 [[VCVT]]
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16
+// CHECK: ret i16 [[RET

[PATCH] D42993: [AArch64] Fixes for ARMv8.2-A FP16 scalar intrinsic

2018-02-06 Thread Abderrazek Zaafrani via Phabricator via cfe-commits
az created this revision.
az added a reviewer: SjoerdMeijer.
Herald added subscribers: hiraditya, kristof.beyls, javed.absar, rengolin, 
aemerson.

A couple of fixes on top of https://reviews.llvm.org/D41792:

- Fixes for the reciprocal (frecpe, frecpx, frsqrte) and fsqrt instructions in the backend.
- The intrinsics that generate builtin calls with i16 data types fails in 
instruction selection. In a preparation for future fixes in the backend for 
these, the code generated by the frontend is modified. For example, a builtin 
that returns i16 is implemented by returning i32 that is truncated to i16. This 
is done similar to other non-fp16 intrinsics that produces i16 data types.
- Fix frontend code generated for abs.


https://reviews.llvm.org/D42993

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
  llvm/lib/Target/AArch64/AArch64InstrInfo.td

Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3529,6 +3529,8 @@
 def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))),
   (FCVTPUv1i64 FPR64:$Rn)>;
 
+def : Pat<(f16 (int_aarch64_neon_frecpe (f16 FPR16:$Rn))),
+  (FRECPEv1f16 FPR16:$Rn)>;
 def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))),
   (FRECPEv1i32 FPR32:$Rn)>;
 def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
@@ -3560,11 +3562,15 @@
 def : Pat<(v2f64 (AArch64frecps (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
   (FRECPSv2f64 FPR128:$Rn, FPR128:$Rm)>;
 
+def : Pat<(f16 (int_aarch64_neon_frecpx (f16 FPR16:$Rn))),
+  (FRECPXv1f16 FPR16:$Rn)>;
 def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
   (FRECPXv1i32 FPR32:$Rn)>;
 def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
   (FRECPXv1i64 FPR64:$Rn)>;
 
+def : Pat<(f16 (int_aarch64_neon_frsqrte (f16 FPR16:$Rn))),
+  (FRSQRTEv1f16 FPR16:$Rn)>;
 def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))),
   (FRSQRTEv1i32 FPR32:$Rn)>;
 def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
Index: clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
===
--- clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
+++ clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
@@ -8,7 +8,7 @@
 #include <arm_fp16.h>
 
 // CHECK-LABEL: test_vabsh_f16
-// CHECK:  [[ABS:%.*]] =  call half @llvm.aarch64.neon.abs.f16(half %a)
+// CHECK:  [[ABS:%.*]] =  call half @llvm.fabs.f16(half %a)
 // CHECK:  ret half [[ABS]]
 float16_t test_vabsh_f16(float16_t a) {
   return vabsh_f16(a);
@@ -139,8 +139,9 @@
 }
 
 // CHECK-LABEL: test_vcvtah_s16_f16
-// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtas.i16.f16(half %a)
-// CHECK: ret i16 [[VCVT]]
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16
+// CHECK: ret i16 [[RET]]
 int16_t test_vcvtah_s16_f16 (float16_t a) {
   return vcvtah_s16_f16(a);
 }
@@ -160,8 +161,9 @@
 }
 
 // CHECK-LABEL: test_vcvtah_u16_f16
-// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtau.i16.f16(half %a)
-// CHECK: ret i16 [[VCVT]]
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16
+// CHECK: ret i16 [[RET]]
 uint16_t test_vcvtah_u16_f16 (float16_t a) {
   return vcvtah_u16_f16(a);
 }
@@ -181,8 +183,9 @@
 }
 
 // CHECK-LABEL: test_vcvtmh_s16_f16
-// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtms.i16.f16(half %a)
-// CHECK: ret i16 [[VCVT]]
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16
+// CHECK: ret i16 [[RET]]
 int16_t test_vcvtmh_s16_f16 (float16_t a) {
   return vcvtmh_s16_f16(a);
 }
@@ -202,8 +205,9 @@
 }
 
 // CHECK-LABEL: test_vcvtmh_u16_f16
-// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtmu.i16.f16(half %a)
-// CHECK: ret i16 [[VCVT]]
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16
+// CHECK: ret i16 [[RET]]
 uint16_t test_vcvtmh_u16_f16 (float16_t a) {
   return vcvtmh_u16_f16(a);
 }
@@ -223,8 +227,9 @@
 }
 
 // CHECK-LABEL: test_vcvtnh_s16_f16
-// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtns.i16.f16(half %a)
-// CHECK: ret i16 [[VCVT]]
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i16
+// CHECK: ret i16 [[RET]]
 int16_t test_vcvtnh_s16_f16 (float16_t a) {
   return vcvtnh_s16_f16(a);
 }
@@ -244,8 +249,9 @@
 }
 
 // CHECK-LABEL: test_vcvtnh_u16_f16
-// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtnu.i16.f16(half %a)
-// CHECK: ret i16 [[VCVT]]
+// CHECK: [[FCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f16(half %a)
+// CHECK: [[RET:%.*]] = trunc i32 [[FCVT]] to i

[PATCH] D41792: [AArch64] Add ARMv8.2-A FP16 scalar intrinsics

2018-01-11 Thread Abderrazek Zaafrani via Phabricator via cfe-commits
az marked 6 inline comments as done.
az added inline comments.



Comment at: clang/include/clang/Basic/arm_fp16.td:58
+class IInst : Inst {}
+
+// ARMv8.2-A FP16 intrinsics.

SjoerdMeijer wrote:
> az wrote:
> > SjoerdMeijer wrote:
> > > There's a little bit of duplication here: the definitions above are the 
> > > same as in arm_neon.td. Would it be easy to share this, with e.g. an 
> > > include?
> > The duplication is small compared to the overall infrastructure/data 
> > structure needed to automatically generate the intrinsics. There are 3 ways 
> > to do this: 1) copy only the needed data structure in arm_fp16.td (this is 
> > what was done in original review) 2) put all data structure in a newly 
> > created file and include it in arm_neon.td and arm_fp16.td (done here). 3) 
> > put only the duplication in a new file and include it. I did not go for 
> > this one given that we create a new file for the only purpose of avoiding a 
> > small duplication but I am fine of going with 3 too. Note that some of the 
> > duplicated structure in the original arm_fp16.td was a stripped down 
> > version of the copied one.
> Given that the duplication is tiny, I don't have strong opinions to be 
> honest. Would be nice to share these definitions if that's easy to do, 
> otherwise we can perfectly live with this I think.
So, let's keep the current version for now which is: all generic stuff goes 
into the file called arm_neon_incl.td. All specific code that creates and 
generates the intrinsic goes into specific files arm_neon.td and arm_fp16.td 
which include the generic file. This will work well when we create new .td files 
for future features, if any.  



Comment at: llvm/include/llvm/IR/IntrinsicsAArch64.td:250
   def int_aarch64_neon_umax : AdvSIMD_2VectorArg_Intrinsic;
-  def int_aarch64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic;
+  def int_aarch64_neon_fmax : AdvSIMD_2FloatArg_Intrinsic;
   def int_aarch64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic;

SjoerdMeijer wrote:
> There's a scalar and vector variant of FMAX and thus I am wondering if we 
> don't need two definitions here: one using AdvSIMD_2FloatArg_Intrinsic and 
> the other AdvSIMD_2VectorArg_Intrinsic?
Maybe we can do that, but there are many instances where vector and scalar share 
the same intrinsic name (note that the type, such as f16 or v4f32, is appended 
to that name). I have not checked carefully whether what you propose already 
exists, but it seems less common at first look.   


https://reviews.llvm.org/D41792



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D41792: [AArch64] Add ARMv8.2-A FP16 scalar intrinsics

2018-01-11 Thread Abderrazek Zaafrani via Phabricator via cfe-commits
az updated this revision to Diff 129513.
az marked 3 inline comments as done.

https://reviews.llvm.org/D41792

Files:
  clang/include/clang/Basic/BuiltinsNEON.def
  clang/include/clang/Basic/CMakeLists.txt
  clang/include/clang/Basic/arm_fp16.td
  clang/include/clang/Basic/arm_neon.td
  clang/include/clang/Basic/arm_neon_incl.td
  clang/lib/Basic/Targets/AArch64.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Headers/CMakeLists.txt
  clang/lib/Headers/module.modulemap
  clang/lib/Sema/SemaChecking.cpp
  clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
  clang/utils/TableGen/NeonEmitter.cpp
  clang/utils/TableGen/TableGen.cpp
  clang/utils/TableGen/TableGenBackends.h
  llvm/include/llvm/IR/IntrinsicsAArch64.td

Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -146,6 +146,9 @@
   class AdvSIMD_CvtFPToFx_Intrinsic
 : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty],
 [IntrNoMem]>;
+
+  class AdvSIMD_1Arg_Intrinsic
+: Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem]>;
 }
 
 // Arithmetic ops
@@ -244,7 +247,7 @@
   // Vector Max
   def int_aarch64_neon_smax : AdvSIMD_2VectorArg_Intrinsic;
   def int_aarch64_neon_umax : AdvSIMD_2VectorArg_Intrinsic;
-  def int_aarch64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic;
+  def int_aarch64_neon_fmax : AdvSIMD_2FloatArg_Intrinsic;
   def int_aarch64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic;
 
   // Vector Max Across Lanes
@@ -256,7 +259,7 @@
   // Vector Min
   def int_aarch64_neon_smin : AdvSIMD_2VectorArg_Intrinsic;
   def int_aarch64_neon_umin : AdvSIMD_2VectorArg_Intrinsic;
-  def int_aarch64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic;
+  def int_aarch64_neon_fmin : AdvSIMD_2FloatArg_Intrinsic;
   def int_aarch64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic;
 
   // Vector Min/Max Number
@@ -354,7 +357,7 @@
   def int_aarch64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic;
 
   // Vector Absolute Value
-  def int_aarch64_neon_abs : AdvSIMD_1IntArg_Intrinsic;
+  def int_aarch64_neon_abs : AdvSIMD_1Arg_Intrinsic;
 
   // Vector Saturating Absolute Value
   def int_aarch64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic;
Index: clang/utils/TableGen/TableGenBackends.h
===
--- clang/utils/TableGen/TableGenBackends.h
+++ clang/utils/TableGen/TableGenBackends.h
@@ -65,6 +65,7 @@
 void EmitClangCommentCommandList(RecordKeeper &Records, raw_ostream &OS);
 
 void EmitNeon(RecordKeeper &Records, raw_ostream &OS);
+void EmitFP16(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeonSema(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeonTest(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeon2(RecordKeeper &Records, raw_ostream &OS);
Index: clang/utils/TableGen/TableGen.cpp
===
--- clang/utils/TableGen/TableGen.cpp
+++ clang/utils/TableGen/TableGen.cpp
@@ -52,6 +52,7 @@
   GenClangCommentCommandInfo,
   GenClangCommentCommandList,
   GenArmNeon,
+  GenArmFP16,
   GenArmNeonSema,
   GenArmNeonTest,
   GenAttrDocs,
@@ -139,6 +140,7 @@
"Generate list of commands that are used in "
"documentation comments"),
 clEnumValN(GenArmNeon, "gen-arm-neon", "Generate arm_neon.h for clang"),
+clEnumValN(GenArmFP16, "gen-arm-fp16", "Generate arm_fp16.h for clang"),
 clEnumValN(GenArmNeonSema, "gen-arm-neon-sema",
"Generate ARM NEON sema support for clang"),
 clEnumValN(GenArmNeonTest, "gen-arm-neon-test",
@@ -250,6 +252,9 @@
   case GenArmNeon:
 EmitNeon(Records, OS);
 break;
+  case GenArmFP16:
+EmitFP16(Records, OS);
+break;
   case GenArmNeonSema:
 EmitNeonSema(Records, OS);
 break;
Index: clang/utils/TableGen/NeonEmitter.cpp
===
--- clang/utils/TableGen/NeonEmitter.cpp
+++ clang/utils/TableGen/NeonEmitter.cpp
@@ -552,7 +552,11 @@
   // run - Emit arm_neon.h.inc
   void run(raw_ostream &o);
 
+  // runFP16 - Emit arm_fp16.h.inc
+  void runFP16(raw_ostream &o);
+
   // runHeader - Emit all the __builtin prototypes used in arm_neon.h
+	// and arm_fp16.h
   void runHeader(raw_ostream &o);
 
   // runTests - Emit tests for all the Neon intrinsics.
@@ -852,6 +856,35 @@
 NumVectors = 0;
 Float = true;
 break;
+  case 'Y':
+Bitwidth = ElementBitwidth = 16;
+NumVectors = 0;
+Float = true;
+break;
+  case 'I':
+Bitwidth = ElementBitwidth = 32;
+NumVectors = 0;
+Float = false;
+Signed = true;
+break;
+  case 'L':
+Bitwidth = ElementBitwidth = 64;
+NumVectors = 0;
+Float = false;
+Signed = true;
+break;
+  case 'U':
+Bitwidth = ElementBitwidth = 32;
+NumVectors = 0;
+Float = false;
+Signed = false;
+break;
+

[PATCH] D41792: [AArch64] Add ARMv8.2-A FP16 scalar intrinsics

2018-01-10 Thread Abderrazek Zaafrani via Phabricator via cfe-commits
az marked 8 inline comments as done.
az added inline comments.



Comment at: clang/include/clang/Basic/arm_fp16.td:58
+class IInst : Inst {}
+
+// ARMv8.2-A FP16 intrinsics.

SjoerdMeijer wrote:
> There's a little bit of duplication here: the definitions above are the same 
> as in arm_neon.td. Would it be easy to share this, with e.g. an include?
The duplication is small compared to the overall infrastructure/data structure 
needed to automatically generate the intrinsics. There are 3 ways to do this: 
1) copy only the needed data structure in arm_fp16.td (this is what was done in 
original review) 2) put all data structure in a newly created file and include 
it in arm_neon.td and arm_fp16.td (done here). 3) put only the duplication in a 
new file and include it. I did not go for this one given that we create a new 
file for the only purpose of avoiding a small duplication but I am fine of 
going with 3 too. Note that some of the duplicated structure in the original 
arm_fp16.td was a stripped down version of the copied one.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:4102
   NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType),
+  // FP16 scalar intrinisics go here.
+  NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType),

SjoerdMeijer wrote:
> Looks like  a few intrinsic descriptions are missing here. For example, the 
> first 2-operand intrinsic vaddh_f16 is missing, but there are also more. Is 
> this intentional, or might they have slipped through the cracks (or am I 
> missing something)?
I agree that this is confusing. For the intrinsics listed in this table, code 
generation happens in a generic way based on the info in the table. The ones 
not listed in this table are addressed in a more specific way below in a the 
function called EmitAArch64BuiltinExpr. While I do not like how few things were 
implemented in generating the intrinsics, I am in general following the 
approach taken for arm_neon instead of introducing a new approach. 



Comment at: llvm/include/llvm/IR/IntrinsicsAArch64.td:149
 [IntrNoMem]>;
+
+  class AdvSIMD_1Arg_Intrinsic

SjoerdMeijer wrote:
> This and the other changes in this file are changes to LLVM. Do we need these 
> changes for this patch? It doesn't look like it.
Some tests in aarch64-v8.2a-fp16-intrinsics.c will fail for me without these 
changes. In clang/lib/CodeGen/BackendUtil.cpp, there is code there that 
includes llvm files and header files. It fails there if I do not fix 
IntrinsicsAArch64.td. If you know of a better way to test differently without 
the need for llvm, then let me know. For example, if I remove the option flag 
-S from testing (i.e from aarch64-v8.2a-fp16-intrinsics.c), then there is no 
need to llvm but I won't be able to compare results.  


https://reviews.llvm.org/D41792



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D41792: [AArch64] Add ARMv8.2-A FP16 scalar intrinsics

2018-01-10 Thread Abderrazek Zaafrani via Phabricator via cfe-commits
az updated this revision to Diff 129360.

https://reviews.llvm.org/D41792

Files:
  clang/include/clang/Basic/BuiltinsNEON.def
  clang/include/clang/Basic/CMakeLists.txt
  clang/include/clang/Basic/arm_fp16.td
  clang/include/clang/Basic/arm_neon.td
  clang/include/clang/Basic/arm_neon_incl.td
  clang/lib/Basic/Targets/AArch64.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Headers/CMakeLists.txt
  clang/lib/Headers/module.modulemap
  clang/lib/Sema/SemaChecking.cpp
  clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
  clang/utils/TableGen/NeonEmitter.cpp
  clang/utils/TableGen/TableGen.cpp
  clang/utils/TableGen/TableGenBackends.h
  llvm/include/llvm/IR/IntrinsicsAArch64.td

Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -146,6 +146,9 @@
   class AdvSIMD_CvtFPToFx_Intrinsic
 : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty],
 [IntrNoMem]>;
+
+  class AdvSIMD_1Arg_Intrinsic
+: Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem]>;
 }
 
 // Arithmetic ops
@@ -244,7 +247,7 @@
   // Vector Max
   def int_aarch64_neon_smax : AdvSIMD_2VectorArg_Intrinsic;
   def int_aarch64_neon_umax : AdvSIMD_2VectorArg_Intrinsic;
-  def int_aarch64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic;
+  def int_aarch64_neon_fmax : AdvSIMD_2FloatArg_Intrinsic;
   def int_aarch64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic;
 
   // Vector Max Across Lanes
@@ -256,7 +259,7 @@
   // Vector Min
   def int_aarch64_neon_smin : AdvSIMD_2VectorArg_Intrinsic;
   def int_aarch64_neon_umin : AdvSIMD_2VectorArg_Intrinsic;
-  def int_aarch64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic;
+  def int_aarch64_neon_fmin : AdvSIMD_2FloatArg_Intrinsic;
   def int_aarch64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic;
 
   // Vector Min/Max Number
@@ -354,7 +357,7 @@
   def int_aarch64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic;
 
   // Vector Absolute Value
-  def int_aarch64_neon_abs : AdvSIMD_1IntArg_Intrinsic;
+  def int_aarch64_neon_abs : AdvSIMD_1Arg_Intrinsic;
 
   // Vector Saturating Absolute Value
   def int_aarch64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic;
Index: clang/utils/TableGen/TableGenBackends.h
===
--- clang/utils/TableGen/TableGenBackends.h
+++ clang/utils/TableGen/TableGenBackends.h
@@ -65,6 +65,7 @@
 void EmitClangCommentCommandList(RecordKeeper &Records, raw_ostream &OS);
 
 void EmitNeon(RecordKeeper &Records, raw_ostream &OS);
+void EmitFP16(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeonSema(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeonTest(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeon2(RecordKeeper &Records, raw_ostream &OS);
Index: clang/utils/TableGen/TableGen.cpp
===
--- clang/utils/TableGen/TableGen.cpp
+++ clang/utils/TableGen/TableGen.cpp
@@ -52,6 +52,7 @@
   GenClangCommentCommandInfo,
   GenClangCommentCommandList,
   GenArmNeon,
+  GenArmFP16,
   GenArmNeonSema,
   GenArmNeonTest,
   GenAttrDocs,
@@ -139,6 +140,7 @@
"Generate list of commands that are used in "
"documentation comments"),
 clEnumValN(GenArmNeon, "gen-arm-neon", "Generate arm_neon.h for clang"),
+clEnumValN(GenArmFP16, "gen-arm-fp16", "Generate arm_fp16.h for clang"),
 clEnumValN(GenArmNeonSema, "gen-arm-neon-sema",
"Generate ARM NEON sema support for clang"),
 clEnumValN(GenArmNeonTest, "gen-arm-neon-test",
@@ -250,6 +252,9 @@
   case GenArmNeon:
 EmitNeon(Records, OS);
 break;
+  case GenArmFP16:
+EmitFP16(Records, OS);
+break;
   case GenArmNeonSema:
 EmitNeonSema(Records, OS);
 break;
Index: clang/utils/TableGen/NeonEmitter.cpp
===
--- clang/utils/TableGen/NeonEmitter.cpp
+++ clang/utils/TableGen/NeonEmitter.cpp
@@ -552,7 +552,11 @@
   // run - Emit arm_neon.h.inc
   void run(raw_ostream &o);
 
+  // runFP16 - Emit arm_fp16.h.inc
+  void runFP16(raw_ostream &o);
+
   // runHeader - Emit all the __builtin prototypes used in arm_neon.h
+	// and arm_fp16.h
   void runHeader(raw_ostream &o);
 
   // runTests - Emit tests for all the Neon intrinsics.
@@ -852,6 +856,35 @@
 NumVectors = 0;
 Float = true;
 break;
+  case 'Y':
+Bitwidth = ElementBitwidth = 16;
+NumVectors = 0;
+Float = true;
+break;
+  case 'I':
+Bitwidth = ElementBitwidth = 32;
+NumVectors = 0;
+Float = false;
+Signed = true;
+break;
+  case 'L':
+Bitwidth = ElementBitwidth = 64;
+NumVectors = 0;
+Float = false;
+Signed = true;
+break;
+  case 'U':
+Bitwidth = ElementBitwidth = 32;
+NumVectors = 0;
+Float = false;
+Signed = false;
+break;
+  case 'O':
+Bitwidth = ElementBi

[PATCH] D41792: [AArch64] Add ARMv8.2-A FP16 scalar intrinsics

2018-01-05 Thread Abderrazek Zaafrani via Phabricator via cfe-commits
az created this revision.
az added a reviewer: SjoerdMeijer.
Herald added subscribers: kristof.beyls, javed.absar, mgorny, rengolin, 
aemerson.

ARMv8.2-A introduces half-precision floating point data processing. This patch 
adds the fp16 scalar intrinsics for this architecture as described in the ARM 
ACLE document. Only the front-end intrinsic work is done here. Some backend 
work related to instruction selection still needs to be done.

This work is a continuation of https://reviews.llvm.org/D32511 which addressed 
ARMv8.2-A vector intrinsics.


https://reviews.llvm.org/D41792

Files:
  clang/include/clang/Basic/BuiltinsNEON.def
  clang/include/clang/Basic/CMakeLists.txt
  clang/include/clang/Basic/arm_fp16.td
  clang/lib/Basic/Targets/AArch64.cpp
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Headers/CMakeLists.txt
  clang/lib/Headers/module.modulemap
  clang/lib/Sema/SemaChecking.cpp
  clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
  clang/utils/TableGen/NeonEmitter.cpp
  clang/utils/TableGen/TableGen.cpp
  clang/utils/TableGen/TableGenBackends.h
  llvm/include/llvm/IR/IntrinsicsAArch64.td

Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -146,6 +146,9 @@
   class AdvSIMD_CvtFPToFx_Intrinsic
 : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty],
 [IntrNoMem]>;
+
+  class AdvSIMD_1Arg_Intrinsic
+: Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem]>;
 }
 
 // Arithmetic ops
@@ -244,7 +247,7 @@
   // Vector Max
   def int_aarch64_neon_smax : AdvSIMD_2VectorArg_Intrinsic;
   def int_aarch64_neon_umax : AdvSIMD_2VectorArg_Intrinsic;
-  def int_aarch64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic;
+  def int_aarch64_neon_fmax : AdvSIMD_2FloatArg_Intrinsic;
   def int_aarch64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic;
 
   // Vector Max Across Lanes
@@ -256,7 +259,7 @@
   // Vector Min
   def int_aarch64_neon_smin : AdvSIMD_2VectorArg_Intrinsic;
   def int_aarch64_neon_umin : AdvSIMD_2VectorArg_Intrinsic;
-  def int_aarch64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic;
+  def int_aarch64_neon_fmin : AdvSIMD_2FloatArg_Intrinsic;
   def int_aarch64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic;
 
   // Vector Min/Max Number
@@ -354,7 +357,8 @@
   def int_aarch64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic;
 
   // Vector Absolute Value
-  def int_aarch64_neon_abs : AdvSIMD_1IntArg_Intrinsic;
+  //def int_aarch64_neon_abs : AdvSIMD_1IntArg_Intrinsic;
+  def int_aarch64_neon_abs : AdvSIMD_1Arg_Intrinsic;
 
   // Vector Saturating Absolute Value
   def int_aarch64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic;
Index: clang/utils/TableGen/TableGenBackends.h
===
--- clang/utils/TableGen/TableGenBackends.h
+++ clang/utils/TableGen/TableGenBackends.h
@@ -65,6 +65,7 @@
 void EmitClangCommentCommandList(RecordKeeper &Records, raw_ostream &OS);
 
 void EmitNeon(RecordKeeper &Records, raw_ostream &OS);
+void EmitFP16(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeonSema(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeonTest(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeon2(RecordKeeper &Records, raw_ostream &OS);
Index: clang/utils/TableGen/TableGen.cpp
===
--- clang/utils/TableGen/TableGen.cpp
+++ clang/utils/TableGen/TableGen.cpp
@@ -52,6 +52,7 @@
   GenClangCommentCommandInfo,
   GenClangCommentCommandList,
   GenArmNeon,
+  GenArmFP16,
   GenArmNeonSema,
   GenArmNeonTest,
   GenAttrDocs,
@@ -139,6 +140,7 @@
"Generate list of commands that are used in "
"documentation comments"),
 clEnumValN(GenArmNeon, "gen-arm-neon", "Generate arm_neon.h for clang"),
+clEnumValN(GenArmFP16, "gen-arm-fp16", "Generate arm_fp16.h for clang"),
 clEnumValN(GenArmNeonSema, "gen-arm-neon-sema",
"Generate ARM NEON sema support for clang"),
 clEnumValN(GenArmNeonTest, "gen-arm-neon-test",
@@ -250,6 +252,9 @@
   case GenArmNeon:
 EmitNeon(Records, OS);
 break;
+  case GenArmFP16:
+EmitFP16(Records, OS);
+break;
   case GenArmNeonSema:
 EmitNeonSema(Records, OS);
 break;
Index: clang/utils/TableGen/NeonEmitter.cpp
===
--- clang/utils/TableGen/NeonEmitter.cpp
+++ clang/utils/TableGen/NeonEmitter.cpp
@@ -552,7 +552,11 @@
   // run - Emit arm_neon.h.inc
   void run(raw_ostream &o);
 
+  // runFP16 - Emit arm_fp16.h.inc
+  void runFP16(raw_ostream &o);
+
   // runHeader - Emit all the __builtin prototypes used in arm_neon.h
+	// and arm_fp16.h
   void runHeader(raw_ostream &o);
 
   // runTests - Emit tests for all the Neon intrinsics.
@@ -852,6 +856,35 @@
 NumVectors = 0;
 Float = true;
 break;
+  case 'Y':
+Bitwidth