Jennifer Schmitz <[email protected]> writes:
> For the test case
> int32_t foo (svint32_t x)
> {
> svbool_t pg = svpfalse ();
> return svlastb_s32 (pg, x);
> }
> compiled with -O3 -mcpu=grace -msve-vector-bits=128, GCC produced:
> foo:
> pfalse p3.b
> lastb w0, p3, z0.s
> ret
> when it could use a Neon lane extract instead:
> foo:
> umov w0, v0.s[3]
> ret
>
> We implemented this optimization by guarding the emission of
> pfalse+lastb in the pattern vec_extract<mode><Vel> with the condition
> known_gt (BYTES_PER_SVE_VECTOR, 16). Thus, when the last element is
> extracted in 128-bit VLS, the pattern *vec_extract<mode><Vel>_v128 is
> used instead.
>
> The patch was bootstrapped and tested on aarch64-linux-gnu with no regressions.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <[email protected]>
>
> gcc/
> * config/aarch64/aarch64-sve.md (vec_extract<mode><Vel>):
> Prevent the emission of pfalse+lastb for 128-bit VLS.
>
> gcc/testsuite/
> * gcc.target/aarch64/sve/extract_last_128.c: New test.
OK, thanks.
Richard
> ---
> gcc/config/aarch64/aarch64-sve.md | 7 ++--
> .../gcc.target/aarch64/sve/extract_last_128.c | 33 +++++++++++++++++++
> 2 files changed, 37 insertions(+), 3 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/extract_last_128.c
>
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 3dbd65986ec..824bd877e47 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -2969,10 +2969,11 @@
> {
> poly_int64 val;
> if (poly_int_rtx_p (operands[2], &val)
> - && known_eq (val, GET_MODE_NUNITS (<MODE>mode) - 1))
> + && known_eq (val, GET_MODE_NUNITS (<MODE>mode) - 1)
> + && known_gt (BYTES_PER_SVE_VECTOR, 16))
> {
> - /* The last element can be extracted with a LASTB and a false
> - predicate. */
> + /* Extract the last element with a LASTB and a false predicate.
> + Exclude 128-bit VLS to use *vec_extract<mode><Vel>_v128. */
> rtx sel = aarch64_pfalse_reg (<VPRED>mode);
> emit_insn (gen_extract_last_<mode> (operands[0], sel, operands[1]));
> DONE;
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/extract_last_128.c b/gcc/testsuite/gcc.target/aarch64/sve/extract_last_128.c
> new file mode 100644
> index 00000000000..71d3561ec60
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/extract_last_128.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -msve-vector-bits=128" } */
> +
> +#include <arm_sve.h>
> +
> +#define TEST(TYPE, TY) \
> + TYPE exract_last_##TY (sv##TYPE x) \
> + { \
> + svbool_t pg = svpfalse (); \
> + return svlastb_##TY (pg, x); \
> + }
> +
> +TEST(bfloat16_t, bf16)
> +TEST(float16_t, f16)
> +TEST(float32_t, f32)
> +TEST(float64_t, f64)
> +TEST(int8_t, s8)
> +TEST(int16_t, s16)
> +TEST(int32_t, s32)
> +TEST(int64_t, s64)
> +TEST(uint8_t, u8)
> +TEST(uint16_t, u16)
> +TEST(uint32_t, u32)
> +TEST(uint64_t, u64)
> +
> +/* { dg-final { scan-assembler-times {\tdup\th0, v0\.h\[7\]} 2 } } */
> +/* { dg-final { scan-assembler-times {\tdup\ts0, v0\.s\[3\]} 1 } } */
> +/* { dg-final { scan-assembler-times {\tdup\td0, v0\.d\[1\]} 1 } } */
> +/* { dg-final { scan-assembler-times {\tumov\tw0, v0\.h\[7\]} 2 } } */
> +/* { dg-final { scan-assembler-times {\tumov\tw0, v0\.b\[15\]} 2 } } */
> +/* { dg-final { scan-assembler-times {\tumov\tw0, v0\.s\[3\]} 2 } } */
> +/* { dg-final { scan-assembler-times {\tumov\tx0, v0\.d\[1\]} 2 } } */
> +/* { dg-final { scan-assembler-not "lastb" } } */
> \ No newline at end of file