Remi Machet <rmac...@nvidia.com> writes:
> Add an optimization to aarch64 SIMD converting mvn+shrn into mvni+subhn,
> which allows for better optimization when the code is inside a loop by
> using a constant.
>
> Bootstrapped and regtested on aarch64-linux-gnu.
>
> Signed-off-by: Remi Machet <rmac...@nvidia.com>
>
> gcc/ChangeLog:
>
>      * config/aarch64/aarch64-simd.md (*shrn_to_subhn_<mode>): Add pattern
>      converting mvn+shrn into mvni+subhn.
>
> gcc/testsuite/ChangeLog:
>
>      * gcc.target/aarch64/simd/shrn2subhn.c: New test.
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 6e30dc48934..f49e6fe6a26 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -5028,6 +5028,34 @@
>     DONE;
>   })
>
> +;; Convert what would be an mvn+shrn into an mvni+subhn because the use of
> +;; a constant load rather than NOT instructions allows for better loop
> +;; optimization.  On some implementations the use of subhn also results in
> +;; better throughput.
> +(define_insn_and_split "*shrn_to_subhn_<mode>"
> +  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=&w")
> +    (truncate:<VNARROWQ>
> +      (lshiftrt:VQN
> +        (not:VQN (match_operand:VQN 1 "register_operand" "w"))
> +        (match_operand:VQN 2 "aarch64_simd_shift_imm_vec_exact_top"))))]

Very minor, sorry, but it would be good to format this so that the
(truncate ...) lines up with the (match_operand...):

  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=&w")
        (truncate:<VNARROWQ>
          (lshiftrt:VQN
            (not:VQN (match_operand:VQN 1 "register_operand" "w"))
            (match_operand:VQN 2 "aarch64_simd_shift_imm_vec_exact_top"))))]

> +  "TARGET_SIMD"
> +  "#"
> +  "&& true"
> +  [(const_int 0)]
> +{
> +  rtx tmp;
> +  if (can_create_pseudo_p ())
> +    tmp = gen_reg_rtx (<MODE>mode);
> +  else
> +    tmp = gen_rtx_REG (<MODE>mode, REGNO (operands[0]));
> +  emit_insn (gen_move_insn (tmp,
> +              aarch64_simd_gen_const_vector_dup (<MODE>mode, -1)));

This can be simplified to:

  emit_insn (gen_move_insn (tmp, CONSTM1_RTX (<MODE>mode)));
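
FWIW, the all-ones constant is also what makes the equivalence easy to see:
for unsigned 16-bit lanes ~x == 0xffff - x, so taking the top half of ~x
(the shrn) gives the same result as subhn of an all-ones vector and x.
A throwaway scalar check, purely illustrative and only covering the 16-bit
case (not part of the patch):

  #include <assert.h>
  #include <stdint.h>

  int
  main (void)
  {
    for (uint32_t a = 0; a <= 0xffff; a++)
      {
        /* shrn-style result: the high byte of ~a.  */
        uint8_t via_not = (uint8_t) ((uint16_t) ~a >> 8);
        /* subhn-style result: the high byte of 0xffff - a.  */
        uint8_t via_sub = (uint8_t) ((uint16_t) (0xffff - a) >> 8);
        assert (via_not == via_sub);
      }
    return 0;
  }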

> +  emit_insn (gen_aarch64_subhn<mode>_insn (operands[0], tmp,
> +                        operands[1], operands[2]));

Sorry for the formatting nit, but: "operands[1]" should line up with
"operands[0]".

> +  DONE;
> +})
> +
> +
>   ;; pmul.
>
>   (define_insn "aarch64_pmul<mode>"
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/shrn2subhn.c b/gcc/testsuite/gcc.target/aarch64/simd/shrn2subhn.c
> new file mode 100644
> index 00000000000..d03af815671
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/shrn2subhn.c
> @@ -0,0 +1,18 @@
> +/* This test case checks that replacing a not+shift by a sub -1 works. */
> +/* { dg-do compile } */
> +/* { dg-additional-options "--save-temps -O1" } */

The --save-temps isn't needed, since dg-do compile compiles to assembly
anyway.

Looks really good otherwise, thanks!  So OK with those changes from my POV,
but please leave a day or so for others to comment.

Richard

> +/* { dg-final { scan-assembler-times "\\tsubhn\\t" 2 } } */
> +
> +#include <stdint.h>
> +#include <arm_neon.h>
> +#include <stdlib.h>
> +
> +uint8x8_t neg_narrow(uint16x8_t a) {
> +  uint16x8_t b = vmvnq_u16(a);
> +  return vshrn_n_u16(b, 8);
> +}
> +
> +uint8x8_t neg_narrow_vsubhn(uint16x8_t a) {
> +  uint16x8_t ones = vdupq_n_u16(0xffff);
> +  return vsubhn_u16(ones, a);
> +}
