[AArch64] Handle HFAs of float16 types properly

James Greenhalgh Tue, 26 Jul 2016 06:56:04 -0700

Hi,

It looks like we've not been handling structures of 16-bit floating-point
data correctly for AArch64. For some reason we end up passing them
packed in to integer registers. That is to say, on trunk and GCC 6, for:


  struct x {
    __fp16 x[4];
  };

  __fp16
  foo1 (struct x x)
  {
    return x.x[1];
  }

We generate:

  foo1:
        sbfx    x0, x0, 16, 16
        mov     v0.h[0], w0
        ret

Which is wrong.

This patch fixes that, so now we generate:

  foo1:
        umov    w0, v1.h[0]
        sxth    x0, w0
        mov     v0.h[0], w0
        ret

Far from optimal (I'll work on that...) but at least getting the data from
the right register bank!

To do this we need to keep around a reference to the fp16 type after we
construct it. I've moved this initialisation to a new function
aarch64_init_fp16_types in aarch64-builtins.c and made the references
available through arm_neon.h.

After that, we want to remove the #if 0 wrapping HFmode support in
aarch64_gimplify_va_arg_expr in aarch64.c, and add HFmode to the
REAL_TYPE and COMPLEX_TYPE support in aapcs_vfp_sub_candidate.

Strictly speaking, we don't need the hunk regarding COMPLEX_TYPE.
We can't build complex forms of __fp16. But, were we ever to support the
_Float16 type we'd need this. Rather than leave the chance it will be
forgotten about, I've just added it here. If the maintainers would prefer,
I can change this to a TODO and put a sticky-note somewhere near my desk.

With those simple changes, we fix the argument passing. The rest of the
patch is an update to the various testcases in aapcs64.exp to fully cover
various __fp16 cases (both naked, and within an HFA).

Bootstrapped on aarch64-none-linux-gnu and tested with no issues. Also
tested on aarch64_be-none-elf. All test came back clean.

OK? As this is an ABI break, I'm not proposing for it to go back to GCC 6,
though it will apply cleanly there if the maintainers support that.

Thanks,
James

---

gcc/

2016-07-26  James Greenhalgh  <james.greenha...@arm.com>

        * config/aarch64/aarch64.h (aarch64_fp16_type_node): Declare.
        (aarch64_fp16_ptr_type_node): Likewise.
        * config/aarch64/aarch64-simd-builtins.c
        (aarch64_fp16_ptr_type_node): Define.
        (aarch64_init_fp16_types): New, refactored out of...
        (aarch64_init_builtins): ...here, update to call
        aarch64_init_fp16_types.
        * config/aarch64/aarch64.c (aarch64_gimplify_va_arg_expr): Handle
        HFmode.
        (aapcs_vfp_sub_candidate): Likewise.

gcc/testsuite/

2016-07-26  James Greenhalgh  <james.greenha...@arm.com>

        * gcc.target/aarch64/aapcs64/abitest-common.h: Define half-precision
        registers.
        * gcc.target/aarch64/aapcs64/abitest.S (dumpregs): Add assembly for
        saving the half-precision registers.
        * gcc.target/aarch64/aapcs64/func-ret-1.c: Test that an __fp16
        value is returned in h0.
        * gcc.target/aarch64/aapcs64/test_2.c: Check that __FP16 arguments
        are passed in FP/SIMD registers.
        * gcc.target/aarch64/aapcs64/test_27.c: New, test that __fp16 HFA
        passing works corrcetly.
        * gcc.target/aarch64/aapcs64/type-def.h (hfa_f16x1_t): New.
        (hfa_f16x2_t): Likewise.
        (hfa_f16x3_t): Likewise.
        * gcc.target/aarch64/aapcs64/va_arg-1.c: Check that __fp16 values
        are promoted to double and passed in a double register.
        * gcc.target/aarch64/aapcs64/va_arg-2.c: Check that __fp16 values
        are promoted to double and stacked.
        * gcc.target/aarch64/aapcs64/va_arg-4.c: Check stacking of HFA of
        __fp16 data types.
        * gcc.target/aarch64/aapcs64/va_arg-5.c: Likewise.
        * gcc.target/aarch64/aapcs64/va_arg-16.c: New, check HFAs of
        __fp16 first get passed in FP/SIMD registers, then stacked.

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index ca91d91..1de325a 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -443,13 +443,15 @@ static struct aarch64_simd_type_info aarch64_simd_types [] = {
 };
 #undef ENTRY
 
-/* This type is not SIMD-specific; it is the user-visible __fp16.  */
-static tree aarch64_fp16_type_node = NULL_TREE;
-
 static tree aarch64_simd_intOI_type_node = NULL_TREE;
 static tree aarch64_simd_intCI_type_node = NULL_TREE;
 static tree aarch64_simd_intXI_type_node = NULL_TREE;
 
+/* The user-visible __fp16 type, and a pointer to that type.  Used
+   across the back-end.  */
+tree aarch64_fp16_type_node = NULL_TREE;
+tree aarch64_fp16_ptr_type_node = NULL_TREE;
+
 static const char *
 aarch64_mangle_builtin_scalar_type (const_tree type)
 {
@@ -883,6 +885,21 @@ aarch64_init_builtin_rsqrt (void)
   }
 }
 
+/* Initialize the backend types that support the user-visible __fp16
+   type, also initialize a pointer to that type, to be used when
+   forming HFAs.  */
+
+static void
+aarch64_init_fp16_types (void)
+{
+  aarch64_fp16_type_node = make_node (REAL_TYPE);
+  TYPE_PRECISION (aarch64_fp16_type_node) = 16;
+  layout_type (aarch64_fp16_type_node);
+
+  (*lang_hooks.types.register_builtin_type) (aarch64_fp16_type_node, "__fp16");
+  aarch64_fp16_ptr_type_node = build_pointer_type (aarch64_fp16_type_node);
+}
+
 void
 aarch64_init_builtins (void)
 {
@@ -904,11 +921,7 @@ aarch64_init_builtins (void)
     = add_builtin_function ("__builtin_aarch64_set_fpsr", ftype_set_fpr,
 			    AARCH64_BUILTIN_SET_FPSR, BUILT_IN_MD, NULL, NULL_TREE);
 
-  aarch64_fp16_type_node = make_node (REAL_TYPE);
-  TYPE_PRECISION (aarch64_fp16_type_node) = 16;
-  layout_type (aarch64_fp16_type_node);
-
-  (*lang_hooks.types.register_builtin_type) (aarch64_fp16_type_node, "__fp16");
+  aarch64_init_fp16_types ();
 
   if (TARGET_SIMD)
     aarch64_init_simd_builtins ();
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index fe2683e..addcf2c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -9888,15 +9888,10 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
 	  field_t = long_double_type_node;
 	  field_ptr_t = long_double_ptr_type_node;
 	  break;
-/* The half precision and quad precision are not fully supported yet.  Enable
-   the following code after the support is complete.  Need to find the correct
-   type node for __fp16 *.  */
-#if 0
 	case HFmode:
-	  field_t = float_type_node;
-	  field_ptr_t = float_ptr_type_node;
+	  field_t = aarch64_fp16_type_node;
+	  field_ptr_t = aarch64_fp16_ptr_type_node;
 	  break;
-#endif
 	case V2SImode:
 	case V4SImode:
 	    {
@@ -10058,7 +10053,8 @@ aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
     {
     case REAL_TYPE:
       mode = TYPE_MODE (type);
-      if (mode != DFmode && mode != SFmode && mode != TFmode)
+      if (mode != DFmode && mode != SFmode
+	  && mode != TFmode && mode != HFmode)
 	return -1;
 
       if (*modep == VOIDmode)
@@ -10071,7 +10067,8 @@ aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
 
     case COMPLEX_TYPE:
       mode = TYPE_MODE (TREE_TYPE (type));
-      if (mode != DFmode && mode != SFmode && mode != TFmode)
+      if (mode != DFmode && mode != SFmode
+	  && mode != TFmode && mode != HFmode)
 	return -1;
 
       if (*modep == VOIDmode)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 1915980..9e26eb1 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -928,4 +928,9 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
 
 #define ASM_OUTPUT_POOL_EPILOGUE  aarch64_asm_output_pool_epilogue
 
+/* This type is the user-visible __fp16, and a pointer to that type.  We
+   need it in many places in the backend.  Defined in aarch64-builtins.c.  */
+extern tree aarch64_fp16_type_node;
+extern tree aarch64_fp16_ptr_type_node;
+
 #endif /* GCC_AARCH64_H */
diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest-common.h b/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest-common.h
index 4e2ef0d..138de73 100644
--- a/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest-common.h
+++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest-common.h
@@ -57,7 +57,17 @@
 #define X8     320
 #define X9     328
 
-#define STACK  336
+#define H0	336
+#define H1	338
+#define H2	340
+#define H3	342
+#define H4	344
+#define H5	346
+#define H6	348
+#define H7	350
+
+
+#define STACK  352
 
 /* The type of test.  'myfunc' in abitest.S needs to know which kind of
    test it is running to decide what to do at the runtime.  Keep the
diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest.S b/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest.S
index c2fbd83..893e68c 100644
--- a/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest.S
+++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest.S
@@ -13,7 +13,12 @@ dumpregs:
 myfunc:
       mov	x16, sp
       mov	x17, sp
-      sub	sp,  sp, 352 // 336 for registers and 16 for old sp and lr
+      sub	sp,  sp, 368 // 352 for registers and 16 for old sp and lr
+
+      sub	x17, x17, 8
+      st4	{ v4.h, v5.h, v6.h, v7.h }[0], [x17] //344
+      sub	x17, x17, 8
+      st4	{ v0.h, v1.h, v2.h, v3.h }[0], [x17] //336
 
       stp	x8, x9, [x17, #-16]! //320
 
diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/func-ret-1.c b/gcc/testsuite/gcc.target/aarch64/aapcs64/func-ret-1.c
index a21c926..29a1ca6 100644
--- a/gcc/testsuite/gcc.target/aarch64/aapcs64/func-ret-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/func-ret-1.c
@@ -44,4 +44,5 @@ FUNC_VAL_CHECK (12,         vf2_t,        vf2, D0, f32in64)
 FUNC_VAL_CHECK (13,         vi4_t,        vi4, Q0, i32in128)
 FUNC_VAL_CHECK (14,         int *,    int_ptr, X0, flat)
 FUNC_VAL_CHECK (15,         vlf1_t,    vlf1, Q0, flat)
+FUNC_VAL_CHECK (16,         __fp16,    0xabcd, H0, flat)
 #endif
diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/test_2.c b/gcc/testsuite/gcc.target/aarch64/aapcs64/test_2.c
index 94817ed..ce7c60a8 100644
--- a/gcc/testsuite/gcc.target/aarch64/aapcs64/test_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/test_2.c
@@ -12,5 +12,6 @@
   ARG(double, 4.0, D1)
   ARG(float, 2.0f, S2)
   ARG(double, 5.0, D3)
+  ARG(__fp16, 8.0f, H4)
   LAST_ARG(int, 3, W0)
 #endif
diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/test_27.c b/gcc/testsuite/gcc.target/aarch64/aapcs64/test_27.c
new file mode 100644
index 0000000..7bc79f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/test_27.c
@@ -0,0 +1,46 @@
+/* Test AAPCS64 layout
+
+   Test named homogeneous floating-point aggregates of __fp16 data,
+   which should be passed in SIMD/FP registers or via the stack.  */
+
+/* { dg-do run { target aarch64*-*-* } } */
+
+#ifndef IN_FRAMEWORK
+#define TESTFILE "test_27.c"
+
+struct x0
+{
+  __fp16 v[1];
+} f16x1;
+
+struct x1
+{
+  __fp16 v[2];
+} f16x2;
+
+struct x2
+{
+  __fp16 v[3];
+} f16x3;
+
+#define HAS_DATA_INIT_FUNC
+void init_data ()
+{
+  f16x1.v[0] = 2.0f;
+  f16x2.v[0] = 4.0f;
+  f16x2.v[1] = 8.0f;
+  f16x3.v[0] = 16.0f;
+  f16x3.v[1] = 32.0f;
+  f16x3.v[2] = 64.0f;
+}
+
+#include "abitest.h"
+#else
+ARG (struct x0, f16x1, H0)
+ARG (struct x1, f16x2, H1)
+ARG (struct x2, f16x3, H3)
+ARG (struct x1, f16x2, H6)
+ARG (struct x0, f16x1, STACK)
+ARG (int, 0xdeadbeef, W0)
+LAST_ARG (double, 456.789, STACK+8)
+#endif
diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/type-def.h b/gcc/testsuite/gcc.target/aarch64/aapcs64/type-def.h
index 3b9b349..ca1fa58 100644
--- a/gcc/testsuite/gcc.target/aarch64/aapcs64/type-def.h
+++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/type-def.h
@@ -44,6 +44,24 @@ struct hfa_fx3_t
   float c;
 };
 
+struct hfa_f16x1_t
+{
+  __fp16 a;
+};
+
+struct hfa_f16x2_t
+{
+  __fp16 a;
+  __fp16 b;
+};
+
+struct hfa_f16x3_t
+{
+  __fp16 a;
+  __fp16 b;
+  __fp16 c;
+};
+
 struct hfa_dx2_t
 {
   double a;
diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-1.c b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-1.c
index 4fb9a03..5b9e057 100644
--- a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-1.c
@@ -19,6 +19,8 @@ signed short ss = 0xcba9;
 signed int ss_promoted = 0xffffcba9;
 float fp = 65432.12345f;
 double fp_promoted = (double)65432.12345f;
+__fp16 fp16 = 2.0f;
+__fp16 fp16_promoted = (double)2.0f;
 
 #define HAS_DATA_INIT_FUNC
 void init_data ()
@@ -46,9 +48,13 @@ void init_data ()
   ANON         (    long double   , 98765432123456789.987654321L,      Q2,      12)
   ANON         (             vf2_t, vf2   ,                            D3,      13)
   ANON         (             vi4_t, vi4   ,                            Q4,      14)
+  /* 7.2: For unprototyped (i.e. pre- ANSI or K&R C) and variadic functions,
+     in addition to the normal conversions and promotions, arguments of
+     type __fp16 are converted to type double.  */
+  ANON_PROMOTED(            __fp16, fp16  ,     double, fp16_promoted, D5,      15)
 #ifndef __AAPCS64_BIG_ENDIAN__
-  LAST_ANON    (         int      , 0xeeee,                            STACK+32,15)
+  LAST_ANON    (         int      , 0xeeee,                            STACK+32,16)
 #else
-  LAST_ANON    (         int      , 0xeeee,                            STACK+36,15)
+  LAST_ANON    (         int      , 0xeeee,                            STACK+36,16)
 #endif
 #endif
diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-16.c b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-16.c
new file mode 100644
index 0000000..73f8f1c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-16.c
@@ -0,0 +1,28 @@
+/* Test AAPCS64 layout and __builtin_va_arg.
+
+   This test is focused particularly on __fp16 unnamed homogeneous
+   floating-point aggregate types which should be passed in fp/simd
+   registers until we run out of those, then the stack.  */
+
+/* { dg-do run { target aarch64*-*-* } } */
+
+#ifndef IN_FRAMEWORK
+#define AAPCS64_TEST_STDARG
+#define TESTFILE "va_arg-16.c"
+#include "type-def.h"
+
+struct hfa_f16x1_t hfa_f16x1 = {2.0f};
+struct hfa_f16x2_t hfa_f16x2 = {4.0f, 8.0f};
+struct hfa_f16x3_t hfa_f16x3 = {16.0f, 32.0f, 64.0f};
+
+#include "abitest.h"
+#else
+  ARG      (int, 1, W0, LAST_NAMED_ARG_ID)
+  DOTS
+  ANON     (struct hfa_f16x1_t, hfa_f16x1, H0     , 0)
+  ANON     (struct hfa_f16x2_t, hfa_f16x2, H1     , 1)
+  ANON     (struct hfa_f16x3_t, hfa_f16x3, H3     , 2)
+  ANON     (struct hfa_f16x2_t, hfa_f16x2, H6     , 3)
+  ANON     (struct hfa_f16x1_t, hfa_f16x1, STACK  , 4)
+  LAST_ANON(double            , 1.0      , STACK+8, 5)
+#endif
diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-2.c b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-2.c
index e972691..8f2f881 100644
--- a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-2.c
+++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-2.c
@@ -19,6 +19,8 @@ signed short ss = 0xcba9;
 signed int ss_promoted = 0xffffcba9;
 float fp = 65432.12345f;
 double fp_promoted = (double)65432.12345f;
+__fp16 fp16 = 2.0f;
+__fp16 fp16_promoted = (double)2.0f;
 
 #define HAS_DATA_INIT_FUNC
 void init_data ()
@@ -64,9 +66,10 @@ void init_data ()
   ANON         (    long double   , 98765432123456789.987654321L,      STACK+80, 20)
   ANON         (             vf2_t, vf2   ,                            STACK+96, 21)
   ANON         (             vi4_t, vi4   ,                            STACK+112,22)
+  ANON_PROMOTED(         __fp16   , fp16  ,     double, fp16_promoted, STACK+128,23)
 #ifndef __AAPCS64_BIG_ENDIAN__
-  LAST_ANON    (         int      , 0xeeee,                            STACK+128,23)
+  LAST_ANON    (         int      , 0xeeee,                            STACK+136,24)
 #else
-  LAST_ANON    (         int      , 0xeeee,                            STACK+132,23)
+  LAST_ANON    (         int      , 0xeeee,                            STACK+140,24)
 #endif
 #endif
diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-4.c b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-4.c
index fab3575..010ad8b 100644
--- a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-4.c
+++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-4.c
@@ -29,6 +29,8 @@ struct non_hfa_ffvf2_t non_hfa_ffvf2;
 struct non_hfa_fffd_t non_hfa_fffd = {33.f, 34.f, 35.f, 36.0};
 union hfa_union_t hfa_union;
 union non_hfa_union_t non_hfa_union;
+struct hfa_f16x2_t hfa_f16x2 = {2.0f, 4.0f};
+struct hfa_f16x3_t hfa_f16x3 = {2.0f, 4.0f, 8.0f};
 
 #define HAS_DATA_INIT_FUNC
 void init_data ()
@@ -89,9 +91,12 @@ void init_data ()
   PTR_ANON (struct non_hfa_ffs_t  , non_hfa_ffs  , STACK+120, 18)
   ANON     (struct non_hfa_ffs_2_t, non_hfa_ffs_2, STACK+128, 19)
   ANON     (union  non_hfa_union_t, non_hfa_union, STACK+144, 20)
+  /* HFA of __fp16 passed on stack, directed __fp16 test is va_arg-10.c.  */
+  ANON     (struct hfa_f16x2_t    , hfa_f16x2    , STACK+152, 21)
+  ANON     (struct hfa_f16x3_t    , hfa_f16x3    , STACK+160, 22)
 #ifndef __AAPCS64_BIG_ENDIAN__
-  LAST_ANON(int                   , 2            , STACK+152, 30)
+  LAST_ANON(int                   , 2            , STACK+168, 30)
 #else
-  LAST_ANON(int                   , 2            , STACK+156, 30)
+  LAST_ANON(int                   , 2            , STACK+172, 30)
 #endif
 #endif
diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-5.c b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-5.c
index 4853f92..e54f1f5 100644
--- a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-5.c
+++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-5.c
@@ -17,6 +17,8 @@ struct hfa_dx4_t hfa_dx4 = {1234.123, 2345.234, 3456.345, 4567.456};
 struct hfa_ldx3_t hfa_ldx3 = {123456.7890, 234567.8901, 345678.9012};
 struct hfa_ffs_t hfa_ffs;
 union hfa_union_t hfa_union;
+struct hfa_f16x2_t hfa_f16x2 = {2.0f, 4.0f};
+struct hfa_f16x3_t hfa_f16x3 = {2.0f, 4.0f, 8.0f};
 
 #define HAS_DATA_INIT_FUNC
 void init_data ()
@@ -43,5 +45,8 @@ void init_data ()
   ANON     (struct hfa_fx1_t  , hfa_fx1  , STACK+24, 4)
   ANON     (struct hfa_fx2_t  , hfa_fx2  , STACK+32, 5)
   ANON     (struct hfa_dx2_t  , hfa_dx2  , STACK+40, 6)
-  LAST_ANON(double            , 1.0      , STACK+56, 7)
+  /* HFA of __fp16 passed on stack, directed __fp16 test is va_arg-10.c.  */
+  ANON     (struct hfa_f16x2_t, hfa_f16x2, STACK+56, 7)
+  ANON     (struct hfa_f16x3_t, hfa_f16x3, STACK+64, 8)
+  LAST_ANON(double            , 1.0      , STACK+72, 9)
 #endif

[AArch64] Handle HFAs of float16 types properly

Reply via email to