Issue: GCC emits an unnecessary instruction to zero out the upper 64 bits of
the vector register, even though the preceding instruction already has that
effect.

Example:
int32x4_t foo_s32(int32_t a) {
     int32x4_t b = vcombine_s32(vdup_n_s32(a), vdup_n_s32(0));
     return b;
}
int16x8_t foo_s16(int16_t a) {
     int16x8_t b = vcombine_s16(vdup_n_s16(a), vdup_n_s16(0));
     return b;
}

Generates:
foo_s32(int):
     dup     v0.2s, w0
     fmov    d0, d0
     ret
foo_s16(short):
     dup     v0.4h, w0
     fmov    d0, d0
     ret

The generated code is correct, but the fmov instructions are not required: a
dup that writes only the lower 64 bits of the vector register already zeroes
the upper half. In the first function, for example, the register holds
0|0|a|a right after the dup, so moving its lower 64 bits back into d0 has no
further effect.
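
With the fix applied, the expected output would keep only the dup (intended
codegen shown for illustration, not a captured compiler dump):
foo_s32(int):
     dup     v0.2s, w0
     ret
foo_s16(short):
     dup     v0.4h, w0
     ret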

Fix: teach the compiler's combine pass to recognize such a sequence and emit a
single instruction for it. This is done by adding the <vczle>/<vczbe> subst
attributes to the patterns involved, so that combine can replace the pair of
instructions (the duplicate or zero-extend plus the concatenation with zero)
with one insn.
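
For the int32x4_t case, the RTL that combine has to match is, schematically
(illustrative sketch, not an actual combine dump; on big endian the vec_concat
operands are swapped, which is what the separate <vczbe> form handles):
  (set (reg:V4SI v0)
       (vec_concat:V4SI
         (vec_duplicate:V2SI (reg:SI w0))
         (const_vector:V2SI [(const_int 0) (const_int 0)])))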

gcc/ChangeLog:

        * config/aarch64/aarch64-simd.md (aarch64_simd_dup<mode>): Rename to...
        (aarch64_simd_dup<mode><vczle><vczbe>): ... this, so that a vec_duplicate
        followed by a vec_concat with zero is optimized into a single dup.
        (aarch64_combine_optimize<vczle><vczbe>): New pattern; optimize a
        zero_extend followed by a vec_concat with zero.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/simd/vcombine_vdup_optimize.c: New test.
        * gcc.target/aarch64/simd/vcombine_zero_extend_optimize.c: New test.

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 0d5b02a739fa74724d6dc8b658638d55b8db6890..5e0f44a5d961d26e37b5b536b00ad9180e80341e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -90,7 +90,7 @@ (define_expand "movmisalign<mode>"
      operands[1] = force_reg (<MODE>mode, operands[1]);
  })

-(define_insn "aarch64_simd_dup<mode>"
+(define_insn "aarch64_simd_dup<mode><vczle><vczbe>"
    [(set (match_operand:VDQ_I 0 "register_operand")
        (vec_duplicate:VDQ_I
        (match_operand:<VEL> 1 "register_operand")))]
@@ -112,6 +112,31 @@ (define_insn "aarch64_simd_dup<mode>"
    }
  )

+;; This is a pattern that will optimize a sequence
+;;     y = zero_extend (x)
+;;     z = vec_concat (y, 0)
+;; into a single assembly instruction (instead of multiple fmov). This is
+;; achieved using the define_subst, which will expand the pattern to a
+;; new one with the vector concatenation within it.
+;;
+;; It is similar to "*zero_extendsidi2_aarch64" from aarch64.md, but this
+;; version provides an optimization.
+
+(define_insn "aarch64_combine_optimize<vczle><vczbe>"
+  [(set (match_operand:DI 0 "register_operand")
+       (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand"))
+  )]
+  "TARGET_FLOAT"
+  {@ [ cons: =0 , 1 ; attrs: type , arch ]
+     [ r       , r ; mov_reg   , *    ] uxtw\t%0, %w1
+     [ r       , m ; load_4    , *    ] ldr\t%w0, %1
+     [ w       , r ; f_mcr     , fp   ] fmov\t%s0, %w1
+     [ w       , m ; f_loads   , fp   ] ldr\t%s0, %1
+     [ r       , w ; f_mrc     , fp   ] fmov\t%w0, %s1
+     [ w       , w ; fmov      , fp   ] fmov\t%s0, %s1
+  }
+)
+
  (define_insn "@aarch64_dup_lane<mode>"
    [(set (match_operand:VALL_F16 0 "register_operand" "=w")
        (vec_duplicate:VALL_F16
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vcombine_vdup_optimize.c b/gcc/testsuite/gcc.target/aarch64/simd/vcombine_vdup_optimize.c
new file mode 100644
index 0000000000000000000000000000000000000000..31930a25b09e5d656387cb6dd7d8bb969c14f56b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vcombine_vdup_optimize.c
@@ -0,0 +1,25 @@
+/* Test to check that vcombine(vdup(i), vdup(0)) does not generate
+   unnecessary assembly instructions. */
+
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+int32x4_t foo_s32(int32_t a) {
+    int32x4_t b = vcombine_s32(vdup_n_s32(a), vdup_n_s32(0));
+    return b;
+}
+
+int16x8_t foo_s16(int16_t a) {
+    int16x8_t b = vcombine_s16(vdup_n_s16(a), vdup_n_s16(0));
+    return b;
+}
+
+int8x16_t foo_s8(int8_t a) {
+    int8x16_t b = vcombine_s8(vdup_n_s8(a), vdup_n_s8(0));
+    return b;
+}
+
+/* { dg-final { scan-assembler {dup[^\n]*\n} } } */
+/* { dg-final { scan-assembler-not {fmov[^\n]*\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vcombine_zero_extend_optimize.c b/gcc/testsuite/gcc.target/aarch64/simd/vcombine_zero_extend_optimize.c
new file mode 100644
index 0000000000000000000000000000000000000000..aed7e90c0f65674e2b6ae989c27674ee4696bbf0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vcombine_zero_extend_optimize.c
@@ -0,0 +1,15 @@
+/* Test to check that vcombine(vdup(a), vdup(0)), where a requires a zero
+   extension, does not generate unnecessary fmov instructions. */
+
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+int64x2_t foo(unsigned a) {
+  int64x2_t b = vcombine_s64(vdup_n_s64(a), vdup_n_s64(0));
+  return b;
+}
+
+/* { dg-final { scan-assembler {fmov[^\n]*\n} } } */
+/* { dg-final { scan-assembler-not {fmov[^\n]*\n[ \t]*fmov[^\n]*\n} } } */
