Hi all,
This patch fixes the failing gcc.dg/vect/slp-reduc-sad-2.c testcase on
aarch64
by implementing a vec_init optab that can handle two half-width vectors
producing a full-width one
by concatenating them.
In the gcc.dg/vect/slp-reduc-sad-2.c case it's a V8QI reg concatenated
with a V8QI const_vector of zeroes.
This can be implemented efficiently using the aarch64_combinez pattern
that just loads a D-register to make
use of the implicit zero-extending semantics of that load.
Otherwise it concatenates the two vectors using aarch64_simd_combine.
With this patch I'm seeing the effect from richi's original patch that
added gcc.dg/vect/slp-reduc-sad-2.c on aarch64
and 525.x264_r improves by about 1.5%.
Bootstrapped and tested on aarch64-none-linux-gnu. Also tested on
aarch64_be-none-elf.
Ok for trunk?
Thanks,
Kyrill
2019-10-05 Kyrylo Tkachov <kyrylo.tkac...@arm.com>
PR tree-optimization/90332
* config/aarch64/aarch64.c (aarch64_expand_vector_init):
Handle VALS containing two vectors.
* config/aarch64/aarch64-simd.md (*aarch64_combinez<mode>): Rename
to...
(@aarch64_combinez<mode>): ... This.
(*aarch64_combinez_be<mode>): Rename to...
(@aarch64_combinez_be<mode>): ... This.
(vec_init<mode><Vhalf>): New define_expand.
* config/aarch64/iterators.md (Vhalf): Handle V8HF.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 8e1afe1704490cf399ba46e68acc8131cc932259..7d917584d5bce612ce2ebff68fd3e67b57e0d403 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3211,7 +3211,7 @@
;; In this insn, operand 1 should be low, and operand 2 the high part of the
;; dest vector.
-(define_insn "*aarch64_combinez<mode>"
+(define_insn "@aarch64_combinez<mode>"
[(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
(vec_concat:<VDBL>
(match_operand:VDC 1 "general_operand" "w,?r,m")
@@ -3225,7 +3225,7 @@
(set_attr "arch" "simd,fp,simd")]
)
-(define_insn "*aarch64_combinez_be<mode>"
+(define_insn "@aarch64_combinez_be<mode>"
[(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
(vec_concat:<VDBL>
(match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero")
@@ -5954,6 +5954,15 @@
DONE;
})
+(define_expand "vec_init<mode><Vhalf>"
+ [(match_operand:VQ_NO2E 0 "register_operand" "")
+ (match_operand 1 "" "")]
+ "TARGET_SIMD"
+{
+ aarch64_expand_vector_init (operands[0], operands[1]);
+ DONE;
+})
+
(define_insn "*aarch64_simd_ld1r<mode>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w")
(vec_duplicate:VALL_F16
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0c2c17ed8269923723d066b250974ee1ff423d26..52c933cfdac20c5c566c13ae2528f039efda4c46 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -15075,6 +15075,43 @@ aarch64_expand_vector_init (rtx target, rtx vals)
rtx v0 = XVECEXP (vals, 0, 0);
bool all_same = true;
+ /* This is a special vec_init<M><N> where N is not an element mode but a
+ vector mode with half the elements of M. We expect to find two entries
+     of mode N in VALS and we must put their concatenation into TARGET.  */
+ if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
+ {
+ rtx lo = XVECEXP (vals, 0, 0);
+ rtx hi = XVECEXP (vals, 0, 1);
+ machine_mode narrow_mode = GET_MODE (lo);
+ gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
+ gcc_assert (narrow_mode == GET_MODE (hi));
+
+ /* When we want to concatenate a half-width vector with zeroes we can
+ use the aarch64_combinez[_be] patterns. Just make sure that the
+ zeroes are in the right half. */
+ if (BYTES_BIG_ENDIAN
+ && aarch64_simd_imm_zero (lo, narrow_mode)
+ && general_operand (hi, narrow_mode))
+ emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
+ else if (!BYTES_BIG_ENDIAN
+ && aarch64_simd_imm_zero (hi, narrow_mode)
+ && general_operand (lo, narrow_mode))
+ emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
+ else
+ {
+ /* Else create the two half-width registers and combine them. */
+ if (!REG_P (lo))
+ lo = force_reg (GET_MODE (lo), lo);
+ if (!REG_P (hi))
+ hi = force_reg (GET_MODE (hi), hi);
+
+ if (BYTES_BIG_ENDIAN)
+ std::swap (lo, hi);
+ emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
+ }
+ return;
+ }
+
/* Count the number of variable elements to initialise. */
for (int i = 0; i < n_elts; ++i)
{
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index d59d72833c527da80af1ac8dd7264c8dd86047c7..c495e76b3b73e3a19603330989121fce51c3fe35 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -771,6 +771,7 @@
;; Half modes of all vector modes, in lower-case.
(define_mode_attr Vhalf [(V8QI "v4qi") (V16QI "v8qi")
(V4HI "v2hi") (V8HI "v4hi")
+ (V8HF "v4hf")
(V2SI "si") (V4SI "v2si")
(V2DI "di") (V2SF "sf")
(V4SF "v2sf") (V2DF "df")])