https://gcc.gnu.org/g:d14a5db124777b12fcaeada4236878031d5ca505
commit r16-2383-gd14a5db124777b12fcaeada4236878031d5ca505
Author: Kyrylo Tkachov <ktkac...@nvidia.com>
Date:   Thu Jul 17 03:51:31 2025 -0700

    aarch64: Avoid INS-(W|X)ZR instructions when optimising for speed

    For inserting zero into a vector lane we usually use an instruction like:

            ins     v0.h[2], wzr

    This, however, has not-so-great performance on some CPUs.
    On Grace, for example, it has a latency of 5 and a throughput of 1.
    The alternative sequence:

            movi    v31.8b, #0
            ins     v0.h[2], v31.h[0]

    is preferable because the MOVI-0 is often a zero-latency operation that
    is eliminated by the CPU frontend, and the lane-to-lane INS has a
    latency of 2 and a throughput of 4.
    We can avoid merging the two instructions into the
    aarch64_simd_vec_set_zero<mode> pattern by disabling that pattern when
    optimizing for speed.

    Thanks to wider benchmarking from Tamar, it makes sense to make this
    change for all tunings, so no RTX costs or tuning flags are introduced
    to control this in a more fine-grained manner.  They can easily be
    added in the future if needed for a particular CPU.

    Bootstrapped and tested on aarch64-none-linux-gnu.

    Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com>

    gcc/

            * config/aarch64/aarch64-simd.md (aarch64_simd_vec_set_zero<mode>):
            Enable only when optimizing for size.

    gcc/testsuite/

            * gcc.target/aarch64/simd/mf8_data_1.c (test_set_lane4,
            test_setq_lane4): Relax allowed assembly.
            * gcc.target/aarch64/vec-set-zero.c: Use -Os in flags.
            * gcc.target/aarch64/inszero_split_1.c: New test.
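(For illustration only, not part of the commit: a minimal C function that
exercises the changed pattern.  The function name is made up, and the exact
scratch register the compiler picks may differ.)

    #include <arm_neon.h>

    /* Zero lane 2 of a uint16x8_t.  With this patch, -O2 now emits:
               movi    v31.8b, #0
               ins     v0.h[2], v31.h[0]
       while -Os keeps the single zero-register insert:
               ins     v0.h[2], wzr  */
    uint16x8_t
    zero_lane2 (uint16x8_t a)
    {
      return vsetq_lane_u16 (0, a, 2);
    }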
Diff:
---
 gcc/config/aarch64/aarch64-simd.md                 |  5 ++++-
 gcc/testsuite/gcc.target/aarch64/inszero_split_1.c | 18 ++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/simd/mf8_data_1.c | 10 ++++++++++
 gcc/testsuite/gcc.target/aarch64/vec-set-zero.c    |  2 +-
 4 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 270cb2ff3a12..8b75c3d7f6d5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1190,13 +1190,16 @@
   [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")]
 )
 
+;; Inserting from the zero register into a vector lane is treated as an
+;; expensive GP->FP move on all CPUs.  Avoid it when optimizing for speed.
 (define_insn "aarch64_simd_vec_set_zero<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
 	(vec_merge:VALL_F16
 	    (match_operand:VALL_F16 1 "register_operand" "0")
 	    (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "")
 	    (match_operand:SI 2 "immediate_operand" "i")))]
-  "TARGET_SIMD && aarch64_exact_log2_inverse (<nunits>, operands[2]) >= 0"
+  "TARGET_SIMD && aarch64_exact_log2_inverse (<nunits>, operands[2]) >= 0
+   && optimize_function_for_size_p (cfun)"
 {
   int elt = ENDIAN_LANE_N (<nunits>, aarch64_exact_log2_inverse (<nunits>,
diff --git a/gcc/testsuite/gcc.target/aarch64/inszero_split_1.c b/gcc/testsuite/gcc.target/aarch64/inszero_split_1.c
new file mode 100644
index 000000000000..5c739bd7bb1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/inszero_split_1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/* Avoid INS from WZR register when optimizing for speed. */
+
+#include <arm_neon.h>
+
+/*
+** foo:
+**	movi?	[vdz]([0-9]+)\.?(?:[0-9]*[bhsd])?, #?0
+**	ins	v0.h\[2\], v(\1).h\[0\]
+**	ret
+*/
+uint16x8_t foo(uint16x8_t a) {
+  a[2] = 0;
+  return a;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/mf8_data_1.c b/gcc/testsuite/gcc.target/aarch64/simd/mf8_data_1.c
index a3fd9b800e1e..79d1ccf6f7d5 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/mf8_data_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/mf8_data_1.c
@@ -1016,7 +1016,12 @@ mfloat8x8_t test_set_lane3(mfloat8x8_t a, const mfloat8_t *ptr)
 
 /*
 ** test_set_lane4:
+** (
 **	ins	v0.b\[6\], wzr
+** |
+**	movi?	[vdz]([0-9]+)\.?(?:[0-9]*[bhsd])?, #?0
+**	ins	v0.b\[6\], v(\1).b\[0\]
+** )
 **	ret
 */
 mfloat8x8_t test_set_lane4(mfloat8x8_t a)
@@ -1056,7 +1061,12 @@ mfloat8x16_t test_setq_lane3(mfloat8x16_t a, const mfloat8_t *ptr)
 
 /*
 ** test_setq_lane4:
+** (
 **	ins	v0.b\[14\], wzr
+** |
+**	movi?	[vdz]([0-9]+)\.?(?:[0-9]*[bhsd])?, #?0
+**	ins	v0.b\[14\], v(\1).b\[0\]
+** )
 **	ret
 */
 mfloat8x16_t test_setq_lane4(mfloat8x16_t a)
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c b/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c
index b34b902cf27b..ba4696e5840f 100644
--- a/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-set-zero.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2" } */
+/* { dg-options "-Os" } */
 
 #include "arm_neon.h"
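(From a patched build tree, the new test can be exercised on its own with
something like the following; the exact invocation depends on the build
setup.)

    make check-gcc RUNTESTFLAGS="aarch64.exp=inszero_split_1.c"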