Add an aarch64 SIMD optimization that converts mvn+shrn into mvni+subhn.  Replacing the not instruction with a constant (loaded via mvni) allows the constant load to be hoisted when the code is inside a loop, which enables better loop optimization.
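
For illustration, a narrowing bitwise-not such as the function below
currently compiles to an mvn followed by a shrn; with this patch it
instead becomes an mvni that materializes the all-ones constant plus a
subhn, and inside a loop the constant load can be hoisted out.  The
function name and the instruction sequences shown are illustrative only
and may differ from the actual code generation:

  #include <arm_neon.h>

  uint8x8_t
  not_narrow (uint16x8_t a)
  {
    /* ~a >> 8, narrowed to 8-bit lanes.  Before, roughly:
	 mvn	v0.16b, v0.16b
	 shrn	v0.8b, v0.8h, 8
       After, roughly:
	 mvni	v31.4s, 0
	 subhn	v0.8b, v31.8h, v0.8h
       since ~a == 0xffff - a for unsigned 16-bit lanes.  */
    return vshrn_n_u16 (vmvnq_u16 (a), 8);
  }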
Bootstrapped and regtested on aarch64-linux-gnu.

Signed-off-by: Remi Machet <rmac...@nvidia.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (*shrn_to_subhn_<mode>): Add
	pattern converting mvn+shrn into mvni+subhn.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/shrn2subhn.c: New test.

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6e30dc48934..f49e6fe6a26 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5028,6 +5028,34 @@
   DONE;
 })
 
+;; Convert what would be a mvn+shrn into a mvni+subhn, because using a
+;; constant load rather than a not instruction allows for better loop
+;; optimization.  On some implementations the use of subhn also results
+;; in better throughput.
+(define_insn_and_split "*shrn_to_subhn_<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=&w")
+	(truncate:<VNARROWQ>
+	  (lshiftrt:VQN
+	    (not:VQN (match_operand:VQN 1 "register_operand" "w"))
+	    (match_operand:VQN 2 "aarch64_simd_shift_imm_vec_exact_top"))))]
+  "TARGET_SIMD"
+  "#"
+  "&& true"
+  [(const_int 0)]
+{
+  rtx tmp;
+  if (can_create_pseudo_p ())
+    tmp = gen_reg_rtx (<MODE>mode);
+  else
+    tmp = gen_rtx_REG (<MODE>mode, REGNO (operands[0]));
+  emit_insn (gen_move_insn (tmp,
+			    aarch64_simd_gen_const_vector_dup (<MODE>mode, -1)));
+  emit_insn (gen_aarch64_subhn<mode>_insn (operands[0], tmp,
+					   operands[1], operands[2]));
+  DONE;
+})
+
+
 ;; pmul.
 
 (define_insn "aarch64_pmul<mode>"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/shrn2subhn.c b/gcc/testsuite/gcc.target/aarch64/simd/shrn2subhn.c
new file mode 100644
index 00000000000..d03af815671
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/shrn2subhn.c
@@ -0,0 +1,18 @@
+/* This test case checks that replacing a not+shift with a sub from -1 works.  */
+/* { dg-do compile } */
+/* { dg-additional-options "--save-temps -O1" } */
+/* { dg-final { scan-assembler-times "\\tsubhn\\t" 2 } } */
+
+#include <stdint.h>
+#include <arm_neon.h>
+#include <stdlib.h>
+
+uint8x8_t neg_narrow(uint16x8_t a) {
+  uint16x8_t b = vmvnq_u16(a);
+  return vshrn_n_u16(b, 8);
+}
+
+uint8x8_t neg_narrow_vsubhn(uint16x8_t a) {
+  uint16x8_t ones = vdupq_n_u16(0xffff);
+  return vsubhn_u16(ones, a);
+}