On 26/07/16 14:55, James Greenhalgh wrote: > > Hi, > > It looks like we've not been handling structures of 16-bit floating-point > data correctly for AArch64. For some reason we end up passing them > packed in to integer registers. That is to say, on trunk and GCC 6, for: > > struct x { > __fp16 x[4]; > }; > > __fp16 > foo1 (struct x x) > { > return x.x[1]; > } > > We generate: > > foo1: > sbfx x0, x0, 16, 16 > mov v0.h[0], w0 > ret > > Which is wrong. > > This patch fixes that, so now we generate: > > foo1: > umov w0, v1.h[0] > sxth x0, w0 > mov v0.h[0], w0 > ret > > Far from optimal (I'll work on that...) but at least getting the data from > the right register bank! > > To do this we need to keep around a reference to the fp16 type after we > construct it. I've moved this initialisation to a new function > aarch64_init_fp16_types in aarch64-builtins.c and made the references > available through arm_neon.h. > > After that, we want to remove the #if 0 wrapping HFmode support in > aarch64_gimplify_va_arg_expr in aarch64.c, and add HFmode to the > REAL_TYPE and COMPLEX_TYPE support in aapcs_vfp_sub_candidate. > > Strictly speaking, we don't need the hunk regarding COMPLEX_TYPE. > We can't build complex forms of __fp16. But, were we ever to support the > _Float16 type we'd need this. Rather than leave the chance it will be > forgotten about, I've just added it here. If the maintainers would prefer, > I can change this to a TODO and put a sticky-note somewhere near my desk. > > With those simple changes, we fix the argument passing. The rest of the > patch is an update to the various testcases in aapcs64.exp to fully cover > various __fp16 cases (both naked, and within an HFA). > > Bootstrapped on aarch64-none-linux-gnu and tested with no issues. Also > tested on aarch64_be-none-elf. All test came back clean. > > OK? As this is an ABI break, I'm not proposing for it to go back to GCC 6, > though it will apply cleanly there if the maintainers support that. >
Can you please file a PR for this and use that when committing. As previously discussed, since this was new for 6.1 having a PR makes it easier if we do decide to have a back-port. OK on that basis. R. > Thanks, > James > > --- > > gcc/ > > 2016-07-26 James Greenhalgh <james.greenha...@arm.com> > > * config/aarch64/aarch64.h (aarch64_fp16_type_node): Declare. > (aarch64_fp16_ptr_type_node): Likewise. > * config/aarch64/aarch64-simd-builtins.c > (aarch64_fp16_ptr_type_node): Define. > (aarch64_init_fp16_types): New, refactored out of... > (aarch64_init_builtins): ...here, update to call > aarch64_init_fp16_types. > * config/aarch64/aarch64.c (aarch64_gimplify_va_arg_expr): Handle > HFmode. > (aapcs_vfp_sub_candidate): Likewise. > > gcc/testsuite/ > > 2016-07-26 James Greenhalgh <james.greenha...@arm.com> > > * gcc.target/aarch64/aapcs64/abitest-common.h: Define half-precision > registers. > * gcc.target/aarch64/aapcs64/abitest.S (dumpregs): Add assembly for > saving the half-precision registers. > * gcc.target/aarch64/aapcs64/func-ret-1.c: Test that an __fp16 > value is returned in h0. > * gcc.target/aarch64/aapcs64/test_2.c: Check that __FP16 arguments > are passed in FP/SIMD registers. > * gcc.target/aarch64/aapcs64/test_27.c: New, test that __fp16 HFA > passing works corrcetly. > * gcc.target/aarch64/aapcs64/type-def.h (hfa_f16x1_t): New. > (hfa_f16x2_t): Likewise. > (hfa_f16x3_t): Likewise. > * gcc.target/aarch64/aapcs64/va_arg-1.c: Check that __fp16 values > are promoted to double and passed in a double register. > * gcc.target/aarch64/aapcs64/va_arg-2.c: Check that __fp16 values > are promoted to double and stacked. > * gcc.target/aarch64/aapcs64/va_arg-4.c: Check stacking of HFA of > __fp16 data types. > * gcc.target/aarch64/aapcs64/va_arg-5.c: Likewise. > * gcc.target/aarch64/aapcs64/va_arg-16.c: New, check HFAs of > __fp16 first get passed in FP/SIMD registers, then stacked. > > > 0001-AArch64-Handle-HFAs-of-float16-types-properly.patch > > > diff --git a/gcc/config/aarch64/aarch64-builtins.c > b/gcc/config/aarch64/aarch64-builtins.c > index ca91d91..1de325a 100644 > --- a/gcc/config/aarch64/aarch64-builtins.c > +++ b/gcc/config/aarch64/aarch64-builtins.c > @@ -443,13 +443,15 @@ static struct aarch64_simd_type_info aarch64_simd_types > [] = { > }; > #undef ENTRY > > -/* This type is not SIMD-specific; it is the user-visible __fp16. */ > -static tree aarch64_fp16_type_node = NULL_TREE; > - > static tree aarch64_simd_intOI_type_node = NULL_TREE; > static tree aarch64_simd_intCI_type_node = NULL_TREE; > static tree aarch64_simd_intXI_type_node = NULL_TREE; > > +/* The user-visible __fp16 type, and a pointer to that type. Used > + across the back-end. */ > +tree aarch64_fp16_type_node = NULL_TREE; > +tree aarch64_fp16_ptr_type_node = NULL_TREE; > + > static const char * > aarch64_mangle_builtin_scalar_type (const_tree type) > { > @@ -883,6 +885,21 @@ aarch64_init_builtin_rsqrt (void) > } > } > > +/* Initialize the backend types that support the user-visible __fp16 > + type, also initialize a pointer to that type, to be used when > + forming HFAs. */ > + > +static void > +aarch64_init_fp16_types (void) > +{ > + aarch64_fp16_type_node = make_node (REAL_TYPE); > + TYPE_PRECISION (aarch64_fp16_type_node) = 16; > + layout_type (aarch64_fp16_type_node); > + > + (*lang_hooks.types.register_builtin_type) (aarch64_fp16_type_node, > "__fp16"); > + aarch64_fp16_ptr_type_node = build_pointer_type (aarch64_fp16_type_node); > +} > + > void > aarch64_init_builtins (void) > { > @@ -904,11 +921,7 @@ aarch64_init_builtins (void) > = add_builtin_function ("__builtin_aarch64_set_fpsr", ftype_set_fpr, > AARCH64_BUILTIN_SET_FPSR, BUILT_IN_MD, NULL, > NULL_TREE); > > - aarch64_fp16_type_node = make_node (REAL_TYPE); > - TYPE_PRECISION (aarch64_fp16_type_node) = 16; > - layout_type (aarch64_fp16_type_node); > - > - (*lang_hooks.types.register_builtin_type) (aarch64_fp16_type_node, > "__fp16"); > + aarch64_init_fp16_types (); > > if (TARGET_SIMD) > aarch64_init_simd_builtins (); > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index fe2683e..addcf2c 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -9888,15 +9888,10 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, > gimple_seq *pre_p, > field_t = long_double_type_node; > field_ptr_t = long_double_ptr_type_node; > break; > -/* The half precision and quad precision are not fully supported yet. Enable > - the following code after the support is complete. Need to find the > correct > - type node for __fp16 *. */ > -#if 0 > case HFmode: > - field_t = float_type_node; > - field_ptr_t = float_ptr_type_node; > + field_t = aarch64_fp16_type_node; > + field_ptr_t = aarch64_fp16_ptr_type_node; > break; > -#endif > case V2SImode: > case V4SImode: > { > @@ -10058,7 +10053,8 @@ aapcs_vfp_sub_candidate (const_tree type, > machine_mode *modep) > { > case REAL_TYPE: > mode = TYPE_MODE (type); > - if (mode != DFmode && mode != SFmode && mode != TFmode) > + if (mode != DFmode && mode != SFmode > + && mode != TFmode && mode != HFmode) > return -1; > > if (*modep == VOIDmode) > @@ -10071,7 +10067,8 @@ aapcs_vfp_sub_candidate (const_tree type, > machine_mode *modep) > > case COMPLEX_TYPE: > mode = TYPE_MODE (TREE_TYPE (type)); > - if (mode != DFmode && mode != SFmode && mode != TFmode) > + if (mode != DFmode && mode != SFmode > + && mode != TFmode && mode != HFmode) > return -1; > > if (*modep == VOIDmode) > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > index 1915980..9e26eb1 100644 > --- a/gcc/config/aarch64/aarch64.h > +++ b/gcc/config/aarch64/aarch64.h > @@ -928,4 +928,9 @@ extern const char *host_detect_local_cpu (int argc, const > char **argv); > > #define ASM_OUTPUT_POOL_EPILOGUE aarch64_asm_output_pool_epilogue > > +/* This type is the user-visible __fp16, and a pointer to that type. We > + need it in many places in the backend. Defined in aarch64-builtins.c. */ > +extern tree aarch64_fp16_type_node; > +extern tree aarch64_fp16_ptr_type_node; > + > #endif /* GCC_AARCH64_H */ > diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest-common.h > b/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest-common.h > index 4e2ef0d..138de73 100644 > --- a/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest-common.h > +++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest-common.h > @@ -57,7 +57,17 @@ > #define X8 320 > #define X9 328 > > -#define STACK 336 > +#define H0 336 > +#define H1 338 > +#define H2 340 > +#define H3 342 > +#define H4 344 > +#define H5 346 > +#define H6 348 > +#define H7 350 > + > + > +#define STACK 352 > > /* The type of test. 'myfunc' in abitest.S needs to know which kind of > test it is running to decide what to do at the runtime. Keep the > diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest.S > b/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest.S > index c2fbd83..893e68c 100644 > --- a/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest.S > +++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest.S > @@ -13,7 +13,12 @@ dumpregs: > myfunc: > mov x16, sp > mov x17, sp > - sub sp, sp, 352 // 336 for registers and 16 for old sp and lr > + sub sp, sp, 368 // 352 for registers and 16 for old sp and lr > + > + sub x17, x17, 8 > + st4 { v4.h, v5.h, v6.h, v7.h }[0], [x17] //344 > + sub x17, x17, 8 > + st4 { v0.h, v1.h, v2.h, v3.h }[0], [x17] //336 > > stp x8, x9, [x17, #-16]! //320 > > diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/func-ret-1.c > b/gcc/testsuite/gcc.target/aarch64/aapcs64/func-ret-1.c > index a21c926..29a1ca6 100644 > --- a/gcc/testsuite/gcc.target/aarch64/aapcs64/func-ret-1.c > +++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/func-ret-1.c > @@ -44,4 +44,5 @@ FUNC_VAL_CHECK (12, vf2_t, vf2, D0, f32in64) > FUNC_VAL_CHECK (13, vi4_t, vi4, Q0, i32in128) > FUNC_VAL_CHECK (14, int *, int_ptr, X0, flat) > FUNC_VAL_CHECK (15, vlf1_t, vlf1, Q0, flat) > +FUNC_VAL_CHECK (16, __fp16, 0xabcd, H0, flat) > #endif > diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/test_2.c > b/gcc/testsuite/gcc.target/aarch64/aapcs64/test_2.c > index 94817ed..ce7c60a8 100644 > --- a/gcc/testsuite/gcc.target/aarch64/aapcs64/test_2.c > +++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/test_2.c > @@ -12,5 +12,6 @@ > ARG(double, 4.0, D1) > ARG(float, 2.0f, S2) > ARG(double, 5.0, D3) > + ARG(__fp16, 8.0f, H4) > LAST_ARG(int, 3, W0) > #endif > diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/test_27.c > b/gcc/testsuite/gcc.target/aarch64/aapcs64/test_27.c > new file mode 100644 > index 0000000..7bc79f5 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/test_27.c > @@ -0,0 +1,46 @@ > +/* Test AAPCS64 layout > + > + Test named homogeneous floating-point aggregates of __fp16 data, > + which should be passed in SIMD/FP registers or via the stack. */ > + > +/* { dg-do run { target aarch64*-*-* } } */ > + > +#ifndef IN_FRAMEWORK > +#define TESTFILE "test_27.c" > + > +struct x0 > +{ > + __fp16 v[1]; > +} f16x1; > + > +struct x1 > +{ > + __fp16 v[2]; > +} f16x2; > + > +struct x2 > +{ > + __fp16 v[3]; > +} f16x3; > + > +#define HAS_DATA_INIT_FUNC > +void init_data () > +{ > + f16x1.v[0] = 2.0f; > + f16x2.v[0] = 4.0f; > + f16x2.v[1] = 8.0f; > + f16x3.v[0] = 16.0f; > + f16x3.v[1] = 32.0f; > + f16x3.v[2] = 64.0f; > +} > + > +#include "abitest.h" > +#else > +ARG (struct x0, f16x1, H0) > +ARG (struct x1, f16x2, H1) > +ARG (struct x2, f16x3, H3) > +ARG (struct x1, f16x2, H6) > +ARG (struct x0, f16x1, STACK) > +ARG (int, 0xdeadbeef, W0) > +LAST_ARG (double, 456.789, STACK+8) > +#endif > diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/type-def.h > b/gcc/testsuite/gcc.target/aarch64/aapcs64/type-def.h > index 3b9b349..ca1fa58 100644 > --- a/gcc/testsuite/gcc.target/aarch64/aapcs64/type-def.h > +++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/type-def.h > @@ -44,6 +44,24 @@ struct hfa_fx3_t > float c; > }; > > +struct hfa_f16x1_t > +{ > + __fp16 a; > +}; > + > +struct hfa_f16x2_t > +{ > + __fp16 a; > + __fp16 b; > +}; > + > +struct hfa_f16x3_t > +{ > + __fp16 a; > + __fp16 b; > + __fp16 c; > +}; > + > struct hfa_dx2_t > { > double a; > diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-1.c > b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-1.c > index 4fb9a03..5b9e057 100644 > --- a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-1.c > +++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-1.c > @@ -19,6 +19,8 @@ signed short ss = 0xcba9; > signed int ss_promoted = 0xffffcba9; > float fp = 65432.12345f; > double fp_promoted = (double)65432.12345f; > +__fp16 fp16 = 2.0f; > +__fp16 fp16_promoted = (double)2.0f; > > #define HAS_DATA_INIT_FUNC > void init_data () > @@ -46,9 +48,13 @@ void init_data () > ANON ( long double , 98765432123456789.987654321L, Q2, > 12) > ANON ( vf2_t, vf2 , D3, > 13) > ANON ( vi4_t, vi4 , Q4, > 14) > + /* 7.2: For unprototyped (i.e. pre- ANSI or K&R C) and variadic functions, > + in addition to the normal conversions and promotions, arguments of > + type __fp16 are converted to type double. */ > + ANON_PROMOTED( __fp16, fp16 , double, fp16_promoted, D5, > 15) > #ifndef __AAPCS64_BIG_ENDIAN__ > - LAST_ANON ( int , 0xeeee, > STACK+32,15) > + LAST_ANON ( int , 0xeeee, > STACK+32,16) > #else > - LAST_ANON ( int , 0xeeee, > STACK+36,15) > + LAST_ANON ( int , 0xeeee, > STACK+36,16) > #endif > #endif > diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-16.c > b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-16.c > new file mode 100644 > index 0000000..73f8f1c > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-16.c > @@ -0,0 +1,28 @@ > +/* Test AAPCS64 layout and __builtin_va_arg. > + > + This test is focused particularly on __fp16 unnamed homogeneous > + floating-point aggregate types which should be passed in fp/simd > + registers until we run out of those, then the stack. */ > + > +/* { dg-do run { target aarch64*-*-* } } */ > + > +#ifndef IN_FRAMEWORK > +#define AAPCS64_TEST_STDARG > +#define TESTFILE "va_arg-16.c" > +#include "type-def.h" > + > +struct hfa_f16x1_t hfa_f16x1 = {2.0f}; > +struct hfa_f16x2_t hfa_f16x2 = {4.0f, 8.0f}; > +struct hfa_f16x3_t hfa_f16x3 = {16.0f, 32.0f, 64.0f}; > + > +#include "abitest.h" > +#else > + ARG (int, 1, W0, LAST_NAMED_ARG_ID) > + DOTS > + ANON (struct hfa_f16x1_t, hfa_f16x1, H0 , 0) > + ANON (struct hfa_f16x2_t, hfa_f16x2, H1 , 1) > + ANON (struct hfa_f16x3_t, hfa_f16x3, H3 , 2) > + ANON (struct hfa_f16x2_t, hfa_f16x2, H6 , 3) > + ANON (struct hfa_f16x1_t, hfa_f16x1, STACK , 4) > + LAST_ANON(double , 1.0 , STACK+8, 5) > +#endif > diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-2.c > b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-2.c > index e972691..8f2f881 100644 > --- a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-2.c > +++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-2.c > @@ -19,6 +19,8 @@ signed short ss = 0xcba9; > signed int ss_promoted = 0xffffcba9; > float fp = 65432.12345f; > double fp_promoted = (double)65432.12345f; > +__fp16 fp16 = 2.0f; > +__fp16 fp16_promoted = (double)2.0f; > > #define HAS_DATA_INIT_FUNC > void init_data () > @@ -64,9 +66,10 @@ void init_data () > ANON ( long double , 98765432123456789.987654321L, > STACK+80, 20) > ANON ( vf2_t, vf2 , > STACK+96, 21) > ANON ( vi4_t, vi4 , > STACK+112,22) > + ANON_PROMOTED( __fp16 , fp16 , double, fp16_promoted, > STACK+128,23) > #ifndef __AAPCS64_BIG_ENDIAN__ > - LAST_ANON ( int , 0xeeee, > STACK+128,23) > + LAST_ANON ( int , 0xeeee, > STACK+136,24) > #else > - LAST_ANON ( int , 0xeeee, > STACK+132,23) > + LAST_ANON ( int , 0xeeee, > STACK+140,24) > #endif > #endif > diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-4.c > b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-4.c > index fab3575..010ad8b 100644 > --- a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-4.c > +++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-4.c > @@ -29,6 +29,8 @@ struct non_hfa_ffvf2_t non_hfa_ffvf2; > struct non_hfa_fffd_t non_hfa_fffd = {33.f, 34.f, 35.f, 36.0}; > union hfa_union_t hfa_union; > union non_hfa_union_t non_hfa_union; > +struct hfa_f16x2_t hfa_f16x2 = {2.0f, 4.0f}; > +struct hfa_f16x3_t hfa_f16x3 = {2.0f, 4.0f, 8.0f}; > > #define HAS_DATA_INIT_FUNC > void init_data () > @@ -89,9 +91,12 @@ void init_data () > PTR_ANON (struct non_hfa_ffs_t , non_hfa_ffs , STACK+120, 18) > ANON (struct non_hfa_ffs_2_t, non_hfa_ffs_2, STACK+128, 19) > ANON (union non_hfa_union_t, non_hfa_union, STACK+144, 20) > + /* HFA of __fp16 passed on stack, directed __fp16 test is va_arg-10.c. */ > + ANON (struct hfa_f16x2_t , hfa_f16x2 , STACK+152, 21) > + ANON (struct hfa_f16x3_t , hfa_f16x3 , STACK+160, 22) > #ifndef __AAPCS64_BIG_ENDIAN__ > - LAST_ANON(int , 2 , STACK+152, 30) > + LAST_ANON(int , 2 , STACK+168, 30) > #else > - LAST_ANON(int , 2 , STACK+156, 30) > + LAST_ANON(int , 2 , STACK+172, 30) > #endif > #endif > diff --git a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-5.c > b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-5.c > index 4853f92..e54f1f5 100644 > --- a/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-5.c > +++ b/gcc/testsuite/gcc.target/aarch64/aapcs64/va_arg-5.c > @@ -17,6 +17,8 @@ struct hfa_dx4_t hfa_dx4 = {1234.123, 2345.234, 3456.345, > 4567.456}; > struct hfa_ldx3_t hfa_ldx3 = {123456.7890, 234567.8901, 345678.9012}; > struct hfa_ffs_t hfa_ffs; > union hfa_union_t hfa_union; > +struct hfa_f16x2_t hfa_f16x2 = {2.0f, 4.0f}; > +struct hfa_f16x3_t hfa_f16x3 = {2.0f, 4.0f, 8.0f}; > > #define HAS_DATA_INIT_FUNC > void init_data () > @@ -43,5 +45,8 @@ void init_data () > ANON (struct hfa_fx1_t , hfa_fx1 , STACK+24, 4) > ANON (struct hfa_fx2_t , hfa_fx2 , STACK+32, 5) > ANON (struct hfa_dx2_t , hfa_dx2 , STACK+40, 6) > - LAST_ANON(double , 1.0 , STACK+56, 7) > + /* HFA of __fp16 passed on stack, directed __fp16 test is va_arg-10.c. */ > + ANON (struct hfa_f16x2_t, hfa_f16x2, STACK+56, 7) > + ANON (struct hfa_f16x3_t, hfa_f16x3, STACK+64, 8) > + LAST_ANON(double , 1.0 , STACK+72, 9) > #endif >