Re: [Neon intrinsics] Literal vector construction through vcombine is poor

Richard Earnshaw (lists) Tue, 27 Jun 2017 06:50:36 -0700

On 27/06/17 07:13, Michael Collison wrote:
> Richard,
> 
> I reworked the patch using an assert as you suggested. Bootstrapped and 
> retested. Okay for trunk?
>


Yes, fine thanks.

R.

> 
> -----Original Message-----
> From: Richard Earnshaw (lists) [mailto:richard.earns...@arm.com] 
> Sent: Friday, June 23, 2017 2:09 AM
> To: Michael Collison <michael.colli...@arm.com>; GCC Patches 
> <gcc-patches@gcc.gnu.org>
> Cc: nd <n...@arm.com>
> Subject: Re: [Neon intrinsics] Literal vector construction through vcombine 
> is poor
> 
> On 23/06/17 00:10, Michael Collison wrote:
>> Richard,
>>
>> I reworked the patch and retested on big endian as well as little. The 
>> original code was performing two swaps in the big endian case which works 
>> out to no swaps at all.
>>
>> I also updated the ChangeLog per your comments. Okay for trunk?
>>
>> 2017-06-19  Michael Collison  <michael.colli...@arm.com>
>>
>>      * config/aarch64/aarch64-simd.md (aarch64_combine<mode>): Directly
>>      call aarch64_split_simd_combine.
>>      * (aarch64_combine_internal<mode>): Delete pattern.
>>      * config/aarch64/aarch64.c (aarch64_split_simd_combine):
>>      Allow register and subreg operands.
>>
>> -----Original Message-----
>> From: Richard Earnshaw (lists) [mailto:richard.earns...@arm.com]
>> Sent: Monday, June 19, 2017 6:37 AM
>> To: Michael Collison <michael.colli...@arm.com>; GCC Patches 
>> <gcc-patches@gcc.gnu.org>
>> Cc: nd <n...@arm.com>
>> Subject: Re: [Neon intrinsics] Literal vector construction through 
>> vcombine is poor
>>
>> On 16/06/17 22:08, Michael Collison wrote:
>>> This patch improves code generation for literal vector construction by 
>>> expanding and exposing the pattern to rtl optimization earlier. The current 
>>> implementation delays splitting the pattern until after reload which 
>>> results in poor code generation for the following code:
>>>
>>>
>>> #include "arm_neon.h"
>>>
>>> int16x8_t
>>> foo ()
>>> {
>>>   return vcombine_s16 (vdup_n_s16 (0), vdup_n_s16 (8)); }
>>>
>>> Trunk generates:
>>>
>>> foo:
>>>     movi    v1.2s, 0
>>>     movi    v0.4h, 0x8
>>>     dup     d2, v1.d[0]
>>>     ins     v2.d[1], v0.d[0]
>>>     orr     v0.16b, v2.16b, v2.16b
>>>     ret
>>>
>>> With the patch we now generate:
>>>
>>> foo:
>>>     movi    v1.4h, 0x8
>>>     movi    v0.4s, 0
>>>     ins     v0.d[1], v1.d[0]
>>>     ret
>>>
>>> Bootstrapped and tested on aarch64-linux-gnu. Okay for trunk.
>>>
>>> 2017-06-15  Michael Collison  <michael.colli...@arm.com>
>>>
>>>     * config/aarch64/aarch64-simd.md(aarch64_combine_internal<mode>):
>>>     Convert from define_insn_and_split into define_expand
>>>     * config/aarch64/aarch64.c(aarch64_split_simd_combine):
>>>     Allow register and subreg operands.
>>>
>>
>> Your changelog entry is confusing.  You've deleted the 
>> aarch64_combine_internal<mode> pattern entirely, having merged some of its 
>> functionality directly into its caller (aarch64_combine<mode>).
>>
>> So I think it should read:
>>
>> * config/aarch64/aarch64-simd.md (aarch64_combine<mode>): Directly call 
>> aarch64_split_simd_combine.
>> (aarch64_combine_internal<mode>): Delete pattern.
>> * ...
>>
>> Note also there should be a space between the file name and the open bracket 
>> for the first function name.
>>
>> Why don't you need the big-endian code path any more?
>>
>> R.
>>
>>>
>>> pr7057.patch
>>>
>>>
>>> diff --git a/gcc/config/aarch64/aarch64-simd.md
>>> b/gcc/config/aarch64/aarch64-simd.md
>>> index c462164..4a253a9 100644
>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>> @@ -2807,27 +2807,11 @@
>>>        op1 = operands[1];
>>>        op2 = operands[2];
>>>      }
>>> -  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, 
>>> op2));
>>> -  DONE;
>>> -}
>>> -)
>>>  
>>> -(define_insn_and_split "aarch64_combine_internal<mode>"
>>> -  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
>>> -        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
>>> -                      (match_operand:VDC 2 "register_operand" "w")))]
>>> -  "TARGET_SIMD"
>>> -  "#"
>>> -  "&& reload_completed"
>>> -  [(const_int 0)]
>>> -{
>>> -  if (BYTES_BIG_ENDIAN)
>>> -    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
>>> -  else
>>> -    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>>> +  aarch64_split_simd_combine (operands[0], op1, op2);
>>> +
>>>    DONE;
>>>  }
>>> -[(set_attr "type" "multiple")]
>>>  )
>>>  
>>>  (define_expand "aarch64_simd_combine<mode>"
>>> diff --git a/gcc/config/aarch64/aarch64.c 
>>> b/gcc/config/aarch64/aarch64.c index 2e385c4..46bd78b 100644
>>> --- a/gcc/config/aarch64/aarch64.c
>>> +++ b/gcc/config/aarch64/aarch64.c
>>> @@ -1650,7 +1650,8 @@ aarch64_split_simd_combine (rtx dst, rtx src1, 
>>> rtx src2)
>>>  
>>>    gcc_assert (VECTOR_MODE_P (dst_mode));
>>>  
>>> -  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
>>> +  if (register_operand (dst, dst_mode) && register_operand (src1, src_mode)
>>> +      && register_operand (src2, src_mode))
>>>      {
>>>        rtx (*gen) (rtx, rtx, rtx);
>>>  
>>>
>>
>>
>> pr7057v4.patch
>>
>>
>> diff --git a/gcc/config/aarch64/aarch64-simd.md 
>> b/gcc/config/aarch64/aarch64-simd.md
>> index c462164..3043f81 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -2796,38 +2796,10 @@
>>     (match_operand:VDC 2 "register_operand")]
>>    "TARGET_SIMD"
>>  {
>> -  rtx op1, op2;
>> -  if (BYTES_BIG_ENDIAN)
>> -    {
>> -      op1 = operands[2];
>> -      op2 = operands[1];
>> -    }
>> -  else
>> -    {
>> -      op1 = operands[1];
>> -      op2 = operands[2];
>> -    }
>> -  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, 
>> op2));
>> -  DONE;
>> -}
>> -)
>> +  aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>>  
>> -(define_insn_and_split "aarch64_combine_internal<mode>"
>> -  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
>> -        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
>> -                       (match_operand:VDC 2 "register_operand" "w")))]
>> -  "TARGET_SIMD"
>> -  "#"
>> -  "&& reload_completed"
>> -  [(const_int 0)]
>> -{
>> -  if (BYTES_BIG_ENDIAN)
>> -    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
>> -  else
>> -    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>>    DONE;
>>  }
>> -[(set_attr "type" "multiple")]
>>  )
>>  
>>  (define_expand "aarch64_simd_combine<mode>"
>> diff --git a/gcc/config/aarch64/aarch64.c 
>> b/gcc/config/aarch64/aarch64.c index 2e385c4..46bd78b 100644
>> --- a/gcc/config/aarch64/aarch64.c
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -1650,7 +1650,8 @@ aarch64_split_simd_combine (rtx dst, rtx src1, 
>> rtx src2)
>>  
>>    gcc_assert (VECTOR_MODE_P (dst_mode));
>>  
>> -  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
>> +  if (register_operand (dst, dst_mode) && register_operand (src1, src_mode)
>> +      && register_operand (src2, src_mode))
>>      {
>>        rtx (*gen) (rtx, rtx, rtx);
>>  
>>
> 
> As far as I can see aarch64_split_simd_combine is only called from this one 
> expand pattern and the predicates on the pattern enforce all the operands 
> being registers.  Furthermore, aarch64_split_simd_combine does nothing if 
> they aren't all registers, which would obviously result in wrong code 
> generation.
> 
> So convert
> 
>> +  if (register_operand (dst, dst_mode) && register_operand (src1,
> src_mode)
>> +      && register_operand (src2, src_mode))
> 
> into an assertion and make the following code unconditional.
> 
> OK with that change.
> 
> R.
> 
> 
> pr7057v5.patch
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index c462164..3043f81 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -2796,38 +2796,10 @@
>     (match_operand:VDC 2 "register_operand")]
>    "TARGET_SIMD"
>  {
> -  rtx op1, op2;
> -  if (BYTES_BIG_ENDIAN)
> -    {
> -      op1 = operands[2];
> -      op2 = operands[1];
> -    }
> -  else
> -    {
> -      op1 = operands[1];
> -      op2 = operands[2];
> -    }
> -  emit_insn (gen_aarch64_combine_internal<mode> (operands[0], op1, op2));
> -  DONE;
> -}
> -)
> +  aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>  
> -(define_insn_and_split "aarch64_combine_internal<mode>"
> -  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
> -        (vec_concat:<VDBL> (match_operand:VDC 1 "register_operand" "w")
> -                        (match_operand:VDC 2 "register_operand" "w")))]
> -  "TARGET_SIMD"
> -  "#"
> -  "&& reload_completed"
> -  [(const_int 0)]
> -{
> -  if (BYTES_BIG_ENDIAN)
> -    aarch64_split_simd_combine (operands[0], operands[2], operands[1]);
> -  else
> -    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
>    DONE;
>  }
> -[(set_attr "type" "multiple")]
>  )
>  
>  (define_expand "aarch64_simd_combine<mode>"
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 2e385c4..12ae238 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -1649,41 +1649,41 @@ aarch64_split_simd_combine (rtx dst, rtx src1, rtx 
> src2)
>    machine_mode dst_mode = GET_MODE (dst);
>  
>    gcc_assert (VECTOR_MODE_P (dst_mode));
> +  gcc_assert (register_operand (dst, dst_mode)
> +           && register_operand (src1, src_mode)
> +           && register_operand (src2, src_mode));
>  
> -  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
> -    {
> -      rtx (*gen) (rtx, rtx, rtx);
> -
> -      switch (src_mode)
> -     {
> -     case V8QImode:
> -       gen = gen_aarch64_simd_combinev8qi;
> -       break;
> -     case V4HImode:
> -       gen = gen_aarch64_simd_combinev4hi;
> -       break;
> -     case V2SImode:
> -       gen = gen_aarch64_simd_combinev2si;
> -       break;
> -     case V4HFmode:
> -       gen = gen_aarch64_simd_combinev4hf;
> -       break;
> -     case V2SFmode:
> -       gen = gen_aarch64_simd_combinev2sf;
> -       break;
> -     case DImode:
> -       gen = gen_aarch64_simd_combinedi;
> -       break;
> -     case DFmode:
> -       gen = gen_aarch64_simd_combinedf;
> -       break;
> -     default:
> -       gcc_unreachable ();
> -     }
> +  rtx (*gen) (rtx, rtx, rtx);
>  
> -      emit_insn (gen (dst, src1, src2));
> -      return;
> +  switch (src_mode)
> +    {
> +    case V8QImode:
> +      gen = gen_aarch64_simd_combinev8qi;
> +      break;
> +    case V4HImode:
> +      gen = gen_aarch64_simd_combinev4hi;
> +      break;
> +    case V2SImode:
> +      gen = gen_aarch64_simd_combinev2si;
> +      break;
> +    case V4HFmode:
> +      gen = gen_aarch64_simd_combinev4hf;
> +      break;
> +    case V2SFmode:
> +      gen = gen_aarch64_simd_combinev2sf;
> +      break;
> +    case DImode:
> +      gen = gen_aarch64_simd_combinedi;
> +      break;
> +    case DFmode:
> +      gen = gen_aarch64_simd_combinedf;
> +      break;
> +    default:
> +      gcc_unreachable ();
>      }
> +
> +  emit_insn (gen (dst, src1, src2));
> +  return;
>  }
>  
>  /* Split a complex SIMD move.  */
>

Re: [Neon intrinsics] Literal vector construction through vcombine is poor

Reply via email to