From: Artemiy Volkov <[email protected]>
Presently, when compiling:
int16x8_t foo( int16x8_t x )
{
return vcombine_s16( vget_high_s16( x ), vget_low_s16( x ) );
}
we produce:
foo:
dup d31, v0.d[1]
uzp1 v0.2d, v31.2d, v0.2d
ret
instead of the more efficient:
foo:
ext v0.16b, v0.16b, v0.16b, #8
ret
This happens because the vec_select expression used to extract the upper
half of the vector does not get combined into an insn, and thus has to
be materialized in another register. To fix this, add an insn pattern
for a vec_concat taking a vec_select as one of the arguments.
Additionally, provide an equivalent pattern for big-endian targets.
This patch also includes a new test file to cover this transformation.
Bootstrapped and regtested on aarch64-linux-gnu, and additionally
regtested on aarch64_be-linux-gnu, no issues.
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md
(*aarch64_combine_high_low_internal<mode>):
New insn.
(*aarch64_combine_high_low_internal_be<mode>): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/combine_ext.c: New test.
---
gcc/config/aarch64/aarch64-simd.md | 27 +++++++++++
.../gcc.target/aarch64/simd/combine_ext.c | 47 +++++++++++++++++++
2 files changed, 74 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/combine_ext.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 0d5b02a739f..309c5ad3e3d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4423,6 +4423,33 @@
}
)
+;; Concatenate the architectural high half of operand 1 (a vec_select) with
+;; operand 2 as the new high half; one EXT by 8 bytes performs both steps.
+
+(define_insn "*aarch64_combine_high_low_internal<mode>"
+  [(set (match_operand:<VDBL> 0 "register_operand" "=w")
+        (vec_concat:<VDBL>
+          (vec_select:VDC                              ; result low half
+            (match_operand:<VDBL> 1 "register_operand" "w")
+            (match_operand:<VDBL> 3 "vect_par_cnst_hi_half"))
+          (match_operand:VDC 2 "register_operand" "w")))] ; result high half
+  "TARGET_SIMD && !BYTES_BIG_ENDIAN"   ; EXT is an Advanced SIMD insn
+  "ext\\t%0.16b, %1.16b, %2.16b, #8"
+  [(set_attr "type" "neon_ext<q>")]
+)
+
+(define_insn "*aarch64_combine_high_low_internal_be<mode>"
+  [(set (match_operand:<VDBL> 0 "register_operand" "=w")
+        (vec_concat:<VDBL>
+          (match_operand:VDC 1 "register_operand" "w") ; arch. high half
+          (vec_select:VDC                              ; arch. low half
+            (match_operand:<VDBL> 2 "register_operand" "w")
+            (match_operand:<VDBL> 3 "vect_par_cnst_hi_half"))))]
+  "TARGET_SIMD && BYTES_BIG_ENDIAN"    ; EXT is an Advanced SIMD insn
+  "ext\\t%0.16b, %2.16b, %1.16b, #8"   ; Vn = operand 2, Vm = operand 1
+  [(set_attr "type" "neon_ext<q>")]
+)
+
;; In this insn, operand 1 should be low, and operand 2 the high part of the
;; dest vector.
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/combine_ext.c b/gcc/testsuite/gcc.target/aarch64/simd/combine_ext.c
new file mode 100644
index 00000000000..27bcf310e19
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/combine_ext.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+
+#include <arm_neon.h>
+
+#ifndef TEST_COMBINE_HIGH_LOW_1 /* Swap the two halves of X.  */
+#define TEST_COMBINE_HIGH_LOW_1(VTYPE, SFX)				\
+  VTYPE rev_##VTYPE##_1 (VTYPE x)					\
+  {									\
+    return vcombine_##SFX (vget_high_##SFX (x), vget_low_##SFX (x));	\
+  }
+#endif /* TEST_COMBINE_HIGH_LOW_1 */
+
+#ifndef TEST_COMBINE_HIGH_LOW_2 /* High half of X, then low half of Y.  */
+#define TEST_COMBINE_HIGH_LOW_2(VTYPE, SFX)				\
+  VTYPE rev_##VTYPE##_2 (VTYPE x, VTYPE y)				\
+  {									\
+    return vcombine_##SFX (vget_high_##SFX (x), vget_low_##SFX (y));	\
+  }
+#endif /* TEST_COMBINE_HIGH_LOW_2 */
+
+
+TEST_COMBINE_HIGH_LOW_1 (int8x16_t, s8)
+TEST_COMBINE_HIGH_LOW_1 (int16x8_t, s16)
+TEST_COMBINE_HIGH_LOW_1 (int32x4_t, s32)
+TEST_COMBINE_HIGH_LOW_1 (int64x2_t, s64)
+TEST_COMBINE_HIGH_LOW_1 (uint8x16_t, u8)
+TEST_COMBINE_HIGH_LOW_1 (uint16x8_t, u16)
+TEST_COMBINE_HIGH_LOW_1 (uint32x4_t, u32)
+TEST_COMBINE_HIGH_LOW_1 (uint64x2_t, u64)
+TEST_COMBINE_HIGH_LOW_1 (float16x8_t, f16)
+TEST_COMBINE_HIGH_LOW_1 (float32x4_t, f32)
+
+TEST_COMBINE_HIGH_LOW_2 (int8x16_t, s8)
+TEST_COMBINE_HIGH_LOW_2 (int16x8_t, s16)
+TEST_COMBINE_HIGH_LOW_2 (int32x4_t, s32)
+TEST_COMBINE_HIGH_LOW_2 (int64x2_t, s64)
+TEST_COMBINE_HIGH_LOW_2 (uint8x16_t, u8)
+TEST_COMBINE_HIGH_LOW_2 (uint16x8_t, u16)
+TEST_COMBINE_HIGH_LOW_2 (uint32x4_t, u32)
+TEST_COMBINE_HIGH_LOW_2 (uint64x2_t, u64)
+TEST_COMBINE_HIGH_LOW_2 (float16x8_t, f16)
+TEST_COMBINE_HIGH_LOW_2 (float32x4_t, f32)
+
+/* Intrinsics are architectural, so EXT operands match on LE and BE.  */
+/* { dg-final { scan-assembler-times {ext\tv0.16b, v0.16b, v0.16b} 10 } } */
+/* { dg-final { scan-assembler-times {ext\tv0.16b, v0.16b, v1.16b} 10 } } */
--
2.43.0