On 27/06/17 07:13, Michael Collison wrote: > Richard, > > I reworked the patch using an assert as you suggested. Bootstrapped and > retested. Okay for trunk? >
Yes, fine thanks. R. > > -----Original Message----- > From: Richard Earnshaw (lists) [mailto:richard.earns...@arm.com] > Sent: Friday, June 23, 2017 2:09 AM > To: Michael Collison <michael.colli...@arm.com>; GCC Patches > <gcc-patches@gcc.gnu.org> > Cc: nd <n...@arm.com> > Subject: Re: [Neon intrinsics] Literal vector construction through vcombine > is poor > > On 23/06/17 00:10, Michael Collison wrote: >> Richard, >> >> I reworked the patch and retested on big endian as well as little. The >> original code was performing two swaps in the big endian case which works >> out to no swaps at all. >> >> I also updated the ChangeLog per your comments. Okay for trunk? >> >> 2017-06-19 Michael Collison <michael.colli...@arm.com> >> >> * config/aarch64/aarch64-simd.md (aarch64_combine<mode>): Directly >> call aarch64_split_simd_combine. >> * (aarch64_combine_internal<mode>): Delete pattern. >> * config/aarch64/aarch64.c (aarch64_split_simd_combine): >> Allow register and subreg operands. >> >> -----Original Message----- >> From: Richard Earnshaw (lists) [mailto:richard.earns...@arm.com] >> Sent: Monday, June 19, 2017 6:37 AM >> To: Michael Collison <michael.colli...@arm.com>; GCC Patches >> <gcc-patches@gcc.gnu.org> >> Cc: nd <n...@arm.com> >> Subject: Re: [Neon intrinsics] Literal vector construction through >> vcombine is poor >> >> On 16/06/17 22:08, Michael Collison wrote: >>> This patch improves code generation for literal vector construction by >>> expanding and exposing the pattern to rtl optimization earlier. The current >>> implementation delays splitting the pattern until after reload which >>> results in poor code generation for the following code: >>> >>> >>> #include "arm_neon.h" >>> >>> int16x8_t >>> foo () >>> { >>> return vcombine_s16 (vdup_n_s16 (0), vdup_n_s16 (8)); } >>> >>> Trunk generates: >>> >>> foo: >>> movi v1.2s, 0 >>> movi v0.4h, 0x8 >>> dup d2, v1.d[0] >>> ins v2.d[1], v0.d[0] >>> orr v0.16b, v2.16b, v2.16b >>> ret >>> >>> With the patch we now generate: >>> >>> foo: >>> movi v1.4h, 0x8 >>> movi v0.4s, 0 >>> ins v0.d[1], v1.d[0] >>> ret >>> >>> Bootstrapped and tested on aarch64-linux-gnu. Okay for trunk. >>> >>> 2017-06-15 Michael Collison <michael.colli...@arm.com> >>> >>> * config/aarch64/aarch64-simd.md(aarch64_combine_internal<mode>): >>> Convert from define_insn_and_split into define_expand >>> * config/aarch64/aarch64.c(aarch64_split_simd_combine): >>> Allow register and subreg operands. >>> >> >> Your changelog entry is confusing. You've deleted the >> aarch64_combine_internal<mode> pattern entirely, having merged some of its >> functionality directly into its caller (aarch64_combine<mode>). >> >> So I think it should read: >> >> * config/aarch64/aarch64-simd.md (aarch64_combine<mode>): Directly call >> aarch64_split_simd_combine. >> (aarch64_combine_internal<mode>): Delete pattern. >> * ... >> >> Note also there should be a space between the file name and the open bracket >> for the first function name. >> >> Why don't you need the big-endian code path any more? >> >> R. >> >>> >>> pr7057.patch >>> >>> >>> diff --git a/gcc/config/aarch64/aarch64-simd.md >>> b/gcc/config/aarch64/aarch64-simd.md >>> index c462164..4a253a9 100644 >>> --- a/gcc/config/aarch64/aarch64-simd.md >>> +++ b/gcc/config/aarch64/aarch64-simd.md >>> @@ -2807,27 +2807,11 @@ >>> op1 = operands[1]; >>> op2 = operands[2]; >>> } >>> - emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, >>> op2)); >>> - DONE; >>> -} >>> -) >>> >>> -(define_insn_and_split "aarch64_combine_internal<mode>" >>> - [(set (match_operand:<VDBL> 0 "register_operand" "=&w") >>> - (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w") >>> - (match_operand:VDC 2 "register_operand" "w")))] >>> - "TARGET_SIMD" >>> - "#" >>> - "&& reload_completed" >>> - [(const_int 0)] >>> -{ >>> - if (BYTES_BIG_ENDIAN) >>> - aarch64_split_simd_combine (operands[0], operands[2], operands[1]); >>> - else >>> - aarch64_split_simd_combine (operands[0], operands[1], operands[2]); >>> + aarch64_split_simd_combine (operands[0], op1, op2); >>> + >>> DONE; >>> } >>> -[(set_attr "type" "multiple")] >>> ) >>> >>> (define_expand "aarch64_simd_combine<mode>" >>> diff --git a/gcc/config/aarch64/aarch64.c >>> b/gcc/config/aarch64/aarch64.c index 2e385c4..46bd78b 100644 >>> --- a/gcc/config/aarch64/aarch64.c >>> +++ b/gcc/config/aarch64/aarch64.c >>> @@ -1650,7 +1650,8 @@ aarch64_split_simd_combine (rtx dst, rtx src1, >>> rtx src2) >>> >>> gcc_assert (VECTOR_MODE_P (dst_mode)); >>> >>> - if (REG_P (dst) && REG_P (src1) && REG_P (src2)) >>> + if (register_operand (dst, dst_mode) && register_operand (src1, src_mode) >>> + && register_operand (src2, src_mode)) >>> { >>> rtx (*gen) (rtx, rtx, rtx); >>> >>> >> >> >> pr7057v4.patch >> >> >> diff --git a/gcc/config/aarch64/aarch64-simd.md >> b/gcc/config/aarch64/aarch64-simd.md >> index c462164..3043f81 100644 >> --- a/gcc/config/aarch64/aarch64-simd.md >> +++ b/gcc/config/aarch64/aarch64-simd.md >> @@ -2796,38 +2796,10 @@ >> (match_operand:VDC 2 "register_operand")] >> "TARGET_SIMD" >> { >> - rtx op1, op2; >> - if (BYTES_BIG_ENDIAN) >> - { >> - op1 = operands[2]; >> - op2 = operands[1]; >> - } >> - else >> - { >> - op1 = operands[1]; >> - op2 = operands[2]; >> - } >> - emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, >> op2)); >> - DONE; >> -} >> -) >> + aarch64_split_simd_combine (operands[0], operands[1], operands[2]); >> >> -(define_insn_and_split "aarch64_combine_internal<mode>" >> - [(set (match_operand:<VDBL> 0 "register_operand" "=&w") >> - (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w") >> - (match_operand:VDC 2 "register_operand" "w")))] >> - "TARGET_SIMD" >> - "#" >> - "&& reload_completed" >> - [(const_int 0)] >> -{ >> - if (BYTES_BIG_ENDIAN) >> - aarch64_split_simd_combine (operands[0], operands[2], operands[1]); >> - else >> - aarch64_split_simd_combine (operands[0], operands[1], operands[2]); >> DONE; >> } >> -[(set_attr "type" "multiple")] >> ) >> >> (define_expand "aarch64_simd_combine<mode>" >> diff --git a/gcc/config/aarch64/aarch64.c >> b/gcc/config/aarch64/aarch64.c index 2e385c4..46bd78b 100644 >> --- a/gcc/config/aarch64/aarch64.c >> +++ b/gcc/config/aarch64/aarch64.c >> @@ -1650,7 +1650,8 @@ aarch64_split_simd_combine (rtx dst, rtx src1, >> rtx src2) >> >> gcc_assert (VECTOR_MODE_P (dst_mode)); >> >> - if (REG_P (dst) && REG_P (src1) && REG_P (src2)) >> + if (register_operand (dst, dst_mode) && register_operand (src1, src_mode) >> + && register_operand (src2, src_mode)) >> { >> rtx (*gen) (rtx, rtx, rtx); >> >> > > As far as I can see aarch64_split_simd_combine is only called from this one > expand pattern and the predicates on the pattern enforce all the operands > being registers. Furthermore, aarch64_split_simd_combine does nothing if > they aren't all registers, which would obviously result in wrong code > generation. > > So convert > >> + if (register_operand (dst, dst_mode) && register_operand (src1, > src_mode) >> + && register_operand (src2, src_mode)) > > into an assertion and make the following code unconditional. > > OK with that change. > > R. > > > pr7057v5.patch > > > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index c462164..3043f81 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -2796,38 +2796,10 @@ > (match_operand:VDC 2 "register_operand")] > "TARGET_SIMD" > { > - rtx op1, op2; > - if (BYTES_BIG_ENDIAN) > - { > - op1 = operands[2]; > - op2 = operands[1]; > - } > - else > - { > - op1 = operands[1]; > - op2 = operands[2]; > - } > - emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, op2)); > - DONE; > -} > -) > + aarch64_split_simd_combine (operands[0], operands[1], operands[2]); > > -(define_insn_and_split "aarch64_combine_internal<mode>" > - [(set (match_operand:<VDBL> 0 "register_operand" "=&w") > - (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w") > - (match_operand:VDC 2 "register_operand" "w")))] > - "TARGET_SIMD" > - "#" > - "&& reload_completed" > - [(const_int 0)] > -{ > - if (BYTES_BIG_ENDIAN) > - aarch64_split_simd_combine (operands[0], operands[2], operands[1]); > - else > - aarch64_split_simd_combine (operands[0], operands[1], operands[2]); > DONE; > } > -[(set_attr "type" "multiple")] > ) > > (define_expand "aarch64_simd_combine<mode>" > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index 2e385c4..12ae238 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -1649,41 +1649,41 @@ aarch64_split_simd_combine (rtx dst, rtx src1, rtx > src2) > machine_mode dst_mode = GET_MODE (dst); > > gcc_assert (VECTOR_MODE_P (dst_mode)); > + gcc_assert (register_operand (dst, dst_mode) > + && register_operand (src1, src_mode) > + && register_operand (src2, src_mode)); > > - if (REG_P (dst) && REG_P (src1) && REG_P (src2)) > - { > - rtx (*gen) (rtx, rtx, rtx); > - > - switch (src_mode) > - { > - case V8QImode: > - gen = gen_aarch64_simd_combinev8qi; > - break; > - case V4HImode: > - gen = gen_aarch64_simd_combinev4hi; > - break; > - case V2SImode: > - gen = gen_aarch64_simd_combinev2si; > - break; > - case V4HFmode: > - gen = gen_aarch64_simd_combinev4hf; > - break; > - case V2SFmode: > - gen = gen_aarch64_simd_combinev2sf; > - break; > - case DImode: > - gen = gen_aarch64_simd_combinedi; > - break; > - case DFmode: > - gen = gen_aarch64_simd_combinedf; > - break; > - default: > - gcc_unreachable (); > - } > + rtx (*gen) (rtx, rtx, rtx); > > - emit_insn (gen (dst, src1, src2)); > - return; > + switch (src_mode) > + { > + case V8QImode: > + gen = gen_aarch64_simd_combinev8qi; > + break; > + case V4HImode: > + gen = gen_aarch64_simd_combinev4hi; > + break; > + case V2SImode: > + gen = gen_aarch64_simd_combinev2si; > + break; > + case V4HFmode: > + gen = gen_aarch64_simd_combinev4hf; > + break; > + case V2SFmode: > + gen = gen_aarch64_simd_combinev2sf; > + break; > + case DImode: > + gen = gen_aarch64_simd_combinedi; > + break; > + case DFmode: > + gen = gen_aarch64_simd_combinedf; > + break; > + default: > + gcc_unreachable (); > } > + > + emit_insn (gen (dst, src1, src2)); > + return; > } > > /* Split a complex SIMD move. */ >