Issue: GCC emits an unnecessary instruction to zero out the upper 64 bits of
the vector, even though the previous instruction already has that effect.
Example:
int32x4_t foo_s32(int32_t a) {
int32x4_t b = vcombine_s32(vdup_n_s32(a), vdup_n_s32(0));
return b;
}
int16x8_t foo_s16(int16_t a) {
int16x8_t b = vcombine_s16(vdup_n_s16(a), vdup_n_s16(0));
return b;
}
Generates:
foo_s32(int):
dup v0.2s, w0
fmov d0, d0
ret
foo_s16(short):
dup v0.4h, w0
fmov d0, d0
ret
The generated code is correct, but the fmov instructions are not required: on
AArch64, a dup that writes only the lower 64 bits of a vector register
implicitly zeroes the upper 64 bits. In the first function, for example, the
register already holds 0|0|a|a after the dup, so the fmov d0, d0 (which copies
the lower 64 bits and zeroes the upper 64 bits) has no further effect.
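With the redundant fmov removed, the intended output is just the dup (a sketch
of the expected codegen, not verified compiler output):
foo_s32(int):
        dup     v0.2s, w0
        ret
foo_s16(short):
        dup     v0.4h, w0
        ret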
Fix: Let the compiler's combine pass merge such a pair of instructions into a
single one. This is done by annotating the relevant patterns with the
<vczle>/<vczbe> subst attributes, which make define_subst generate pattern
variants whose result is wrapped in a vec_concat with zero; combine can then
match the whole vec_dup/vec_concat (or zero_extend/vec_concat) sequence and
emit one instruction.
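For illustration, the kind of define_subst behind such an annotation looks
roughly like the sketch below. The subst name and the concrete V2SI/V4SI modes
are illustrative only (the real substs in aarch64-simd.md are mode-iterated
and come in little- and big-endian variants); aarch64_simd_or_scalar_imm_zero
is the existing zero predicate. Applying it to an annotated pattern generates
a second variant whose destination is the 128-bit concatenation of the
original 64-bit result with zero, which is the form combine builds for
vcombine(x, vdup_n(0)):

;; Illustrative sketch only -- not the exact subst definition from
;; aarch64-simd.md.  Rewrites a pattern that sets a 64-bit vector
;; destination into one whose destination is that value concatenated
;; with zero in the high half (little-endian lane order).
(define_subst "illustrative_vec_concat_zero_le"
  [(set (match_operand:V2SI 0)
        (match_operand:V2SI 1))]
  "!BYTES_BIG_ENDIAN"
  [(set (match_operand:V4SI 0)
        (vec_concat:V4SI
          (match_dup 1)
          (match_operand:V2SI 2 "aarch64_simd_or_scalar_imm_zero")))])

;; The matching subst attribute supplies the name suffix: empty when the
;; subst is not applied, "_vec_concatz_le" for the generated variant.
(define_subst_attr "vczle_sketch" "illustrative_vec_concat_zero_le"
                   "" "_vec_concatz_le")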
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md (aarch64_simd_dup<mode>): Rename to...
(aarch64_simd_dup<mode><vczle><vczbe>): ... this.  Optimize a vec_dup
followed by a vec_concat with zero.
(aarch64_combine_optimize<vczle><vczbe>): New pattern.  Optimize a
zero_extend followed by a vec_concat with zero.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/vcombine_vdup_optimize.c: New test.
* gcc.target/aarch64/simd/vcombine_zero_extend_optimize.c: New test.
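For the zero_extend test added below, the intended codegen is a single fmov:
writing the 32-bit S register both zero-extends the scalar and clears the rest
of the 128-bit vector register (again a sketch of the expected output, not
verified compiler output):
foo(unsigned int):
        fmov    s0, w0
        ret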
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 0d5b02a739fa74724d6dc8b658638d55b8db6890..5e0f44a5d961d26e37b5b536b00ad9180e80341e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -90,7 +90,7 @@ (define_expand "movmisalign<mode>"
operands[1] = force_reg (<MODE>mode, operands[1]);
})
-(define_insn "aarch64_simd_dup<mode>"
+(define_insn "aarch64_simd_dup<mode><vczle><vczbe>"
[(set (match_operand:VDQ_I 0 "register_operand")
(vec_duplicate:VDQ_I
(match_operand:<VEL> 1 "register_operand")))]
@@ -112,6 +112,31 @@ (define_insn "aarch64_simd_dup<mode>"
}
)
+;; This is a pattern that will optimize a sequence
+;; y = zero_extend (x)
+;; z = vec_concat (y, 0)
+;; into a single assembly instruction (instead of multiple fmov).  This is
+;; achieved using the define_subst, which will expand the pattern to a
+;; new one with the vector concatenation within it.
+;;
+;; It is similar to "*zero_extendsidi2_aarch64" from aarch64.md, but this
+;; version provides an optimization.
+
+(define_insn "aarch64_combine_optimize<vczle><vczbe>"
+ [(set (match_operand:DI 0 "register_operand")
+ (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand"))
+ )]
+ "TARGET_FLOAT"
+ {@ [ cons: =0 , 1 ; attrs: type , arch ]
+ [ r , r ; mov_reg , * ] fmov\t%s0, %w1
+ [ r , m ; load_4 , * ] ldr\t%w0, %1
+ [ w , r ; f_mcr , fp ] fmov\t%s0, %w1
+ [ w , m ; f_loads , fp ] ldr\t%s0, %1
+ [ r , w ; f_mrc , fp ] fmov\t%w0, %s1
+ [ w , w ; fmov , fp ] fmov\t%s0, %s1
+ }
+)
+
(define_insn "@aarch64_dup_lane<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w")
(vec_duplicate:VALL_F16
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vcombine_vdup_optimize.c b/gcc/testsuite/gcc.target/aarch64/simd/vcombine_vdup_optimize.c
new file mode 100644
index 0000000000000000000000000000000000000000..31930a25b09e5d656387cb6dd7d8bb969c14f56b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vcombine_vdup_optimize.c
@@ -0,0 +1,25 @@
+/* Test to check that vcombine(vdup(i), vdup(0)) does not generate
+unnecessary assembly instructions. */
+
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+int32x4_t foo_s32(int32_t a) {
+ int32x4_t b = vcombine_s32(vdup_n_s32(a), vdup_n_s32(0));
+ return b;
+}
+
+int16x8_t foo_s16(int16_t a) {
+ int16x8_t b = vcombine_s16(vdup_n_s16(a), vdup_n_s16(0));
+ return b;
+}
+
+int8x16_t foo_s8(int8_t a) {
+ int8x16_t b = vcombine_s8(vdup_n_s8(a), vdup_n_s8(0));
+ return b;
+}
+
+/* { dg-final { scan-assembler {dup[^\n]*\n} } } */
+/* { dg-final { scan-assembler-not {fmov[^\n]*\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vcombine_zero_extend_optimize.c b/gcc/testsuite/gcc.target/aarch64/simd/vcombine_zero_extend_optimize.c
new file mode 100644
index 0000000000000000000000000000000000000000..aed7e90c0f65674e2b6ae989c27674ee4696bbf0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vcombine_zero_extend_optimize.c
@@ -0,0 +1,15 @@
+/* Test to check that vcombine of a zero-extended scalar with zero does not
+generate unnecessary fmov instructions. */
+
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+int64x2_t foo(unsigned a) {
+ int64x2_t b = vcombine_s64(vdup_n_s64(a), vdup_n_s64(0));
+ return b;
+}
+
+/* { dg-final { scan-assembler {fmov[^\n]*\n} } } */
+/* { dg-final { scan-assembler-not {fmov[^\n]*\n[ \t]*fmov[^\n]*\n} } } */