Pengxuan Zheng <quic_pzh...@quicinc.com> writes:
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 15f08cebeb1..98ce85dfdae 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23621,6 +23621,36 @@ aarch64_simd_valid_and_imm (rtx op)
>    return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_AND);
>  }
>  
> +/* Return true if OP is a valid SIMD and immediate which allows the and be

s/and be/and to be/

> +   optimized as fmov.  If ELT_SIZE is nonnull, it represents the size of the
> +   register for fmov.  */

Maybe rename this to ELT_BITSIZE (see below), and say:

  If ELT_BITSIZE is nonnull, use it to return the number of bits to move.
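
For concreteness, the full comment might then read something like this
(just a sketch, folding in the "and to be" fix above):

  /* Return true if OP is a valid SIMD and immediate which allows the and
     to be optimized as fmov.  If ELT_BITSIZE is nonnull, use it to return
     the number of bits to move.  */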

> +bool
> +aarch64_simd_valid_and_imm_fmov (rtx op, unsigned int *elt_size)
> +{
> +  machine_mode mode = GET_MODE (op);
> +  gcc_assert (!aarch64_sve_mode_p (mode));
> +
> +  auto_vec<target_unit, 16> buffer;
> +  unsigned int n_bytes = GET_MODE_SIZE (mode).to_constant ();
> +  buffer.reserve (n_bytes);
> +
> +  bool ok = native_encode_rtx (mode, op, buffer, 0, n_bytes);
> +  gcc_assert (ok);
> +
> +  auto mask = native_decode_int (buffer, 0, n_bytes, n_bytes * BITS_PER_UNIT);
> +  int set_bit = wi::exact_log2 (mask + 1);
> +  if ((set_bit == 16 && TARGET_SIMD_F16INST)
> +      || set_bit == 32
> +      || set_bit == 64)
> +    {
> +      if (elt_size)
> +     *elt_size = set_bit / BITS_PER_UNIT;

I didn't notice last time that the only consumer multiplies by BITS_PER_UNIT
again, so how about making this:

  *elt_bitsize = set_bit;

and removing the later multiplication.
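
The consumer in aarch64_output_fmov would then simplify to something like
(untested sketch, with the local variable renamed to match):

  unsigned int elt_bitsize;

  is_valid = aarch64_simd_valid_and_imm_fmov (const_vector, &elt_bitsize);
  gcc_assert (is_valid);

  element_char = sizetochar (elt_bitsize);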

Please leave 24 hours for others to comment, but otherwise the patch is
ok with those changes, thanks.

Richard

> +      return true;
> +    }
> +
> +  return false;
> +}
> +
>  /* Return true if OP is a valid SIMD xor immediate for SVE.  */
>  bool
>  aarch64_simd_valid_xor_imm (rtx op)
> @@ -25757,6 +25787,26 @@ aarch64_float_const_representable_p (rtx x)
>    return aarch64_real_float_const_representable_p (r);
>  }
>  
> +/* Returns the string with the fmov instruction which is equivalent to an and
> +   instruction with the SIMD immediate CONST_VECTOR.  */
> +char*
> +aarch64_output_fmov (rtx const_vector)
> +{
> +  bool is_valid;
> +  static char templ[40];
> +  char element_char;
> +  unsigned int elt_size;
> +
> +  is_valid = aarch64_simd_valid_and_imm_fmov (const_vector, &elt_size);
> +  gcc_assert (is_valid);
> +
> +  element_char = sizetochar (elt_size * BITS_PER_UNIT);
> +  snprintf (templ, sizeof (templ), "fmov\t%%%c0, %%%c1",
> +         element_char, element_char);
> +
> +  return templ;
> +}
> +
>  /* Returns the string with the instruction for the SIMD immediate
>   * CONST_VECTOR of MODE and WIDTH.  WHICH selects a move, and(bic) or orr.  */
>  char*
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index e8321c4d2fb..e9f69f823a6 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -466,6 +466,13 @@ (define_constraint "Do"
>   (and (match_code "const_vector")
>        (match_test "aarch64_simd_valid_orr_imm (op)")))
>  
> +(define_constraint "Df"
> +  "@internal
> +   A constraint that matches a vector of immediates for and which can be
> +   optimized as fmov."
> + (and (match_code "const_vector")
> +      (match_test "aarch64_simd_valid_and_imm_fmov (op)")))
> +
>  (define_constraint "Db"
>    "@internal
>     A constraint that matches vector of immediates for and/bic."
> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> index 1ab1c696c62..2c6af831eae 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -123,7 +123,8 @@ (define_predicate "aarch64_reg_or_orr_imm"
>  (define_predicate "aarch64_reg_or_and_imm"
>     (ior (match_operand 0 "register_operand")
>       (and (match_code "const_vector")
> -          (match_test "aarch64_simd_valid_and_imm (op)"))))
> +          (ior (match_test "aarch64_simd_valid_and_imm (op)")
> +               (match_test "aarch64_simd_valid_and_imm_fmov (op)")))))
>  
>  (define_predicate "aarch64_reg_or_xor_imm"
>     (ior (match_operand 0 "register_operand")
> diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-1-be.c b/gcc/testsuite/gcc.target/aarch64/fmov-1-be.c
> new file mode 100644
> index 00000000000..65dd4f52d09
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fmov-1-be.c
> @@ -0,0 +1,149 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbig-endian" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +typedef int v2si __attribute__ ((vector_size (8)));
> +typedef float v2sf __attribute__ ((vector_size (8)));
> +typedef short v4hi __attribute__ ((vector_size (8)));
> +typedef char v8qi __attribute__ ((vector_size (8)));
> +typedef long v2di __attribute__ ((vector_size (16)));
> +typedef double v2df __attribute__ ((vector_size (16)));
> +typedef int v4si __attribute__ ((vector_size (16)));
> +typedef float v4sf __attribute__ ((vector_size (16)));
> +typedef short v8hi __attribute__ ((vector_size (16)));
> +typedef char v16qi __attribute__ ((vector_size (16)));
> +
> +/*
> +** f_v4hi:
> +**   fmov    s0, s0
> +**   ret
> +*/
> +v4hi
> +f_v4hi (v4hi x)
> +{
> +  return x & (v4hi){ 0, 0, 0xffff, 0xffff };
> +}
> +
> +/*
> +** g_v4hi:
> +**   movi    d([0-9]+), 0xffff00000000ffff
> +**   and     v0.8b, v0.8b, v\1.8b
> +**   ret
> +*/
> +v4hi
> +g_v4hi (v4hi x)
> +{
> +  return x & (v4hi){ 0xffff, 0, 0, 0xffff };
> +}
> +
> +/*
> +** f_v8hi:
> +**   fmov    s0, s0
> +**   ret
> +*/
> +v8hi
> +f_v8hi (v8hi x)
> +{
> +  return x & (v8hi){ 0, 0, 0, 0, 0, 0, 0xffff, 0xffff };
> +}
> +
> +/*
> +** g_v8hi:
> +**   fmov    d0, d0
> +**   ret
> +*/
> +v8hi
> +g_v8hi (v8hi x)
> +{
> +  return x & (v8hi){ 0, 0, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff };
> +}
> +
> +/*
> +** f_v2si:
> +**   fmov    s0, s0
> +**   ret
> +*/
> +v2si
> +f_v2si (v2si x)
> +{
> +  return x & (v2si){ 0, 0xffffffff };
> +}
> +
> +/*
> +** f_v2di:
> +**   fmov    d0, d0
> +**   ret
> +*/
> +v2di
> +f_v2di (v2di x)
> +{
> +  return x & (v2di){ 0, 0xffffffffffffffff };
> +}
> +
> +/*
> +** g_v2di:
> +**   fmov    s0, s0
> +**   ret
> +*/
> +v2di
> +g_v2di (v2di x)
> +{
> +  return x & (v2di){ 0, 0xffffffff };
> +}
> +
> +/*
> +** f_v4si:
> +**   fmov    s0, s0
> +**   ret
> +*/
> +v4si
> +f_v4si (v4si x)
> +{
> +  return x & (v4si){ 0, 0, 0, 0xffffffff };
> +}
> +
> +/*
> +** h_v4si:
> +**   fmov    d0, d0
> +**   ret
> +*/
> +v4si
> +h_v4si (v4si x)
> +{
> +  return x & (v4si){ 0, 0, 0xffffffff, 0xffffffff };
> +}
> +
> +/*
> +** f_v8qi:
> +**   fmov    s0, s0
> +**   ret
> +*/
> +v8qi
> +f_v8qi (v8qi x)
> +{
> +  return x & (v8qi){ 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff };
> +}
> +
> +/*
> +** f_v16qi:
> +**   fmov    d0, d0
> +**   ret
> +*/
> +v16qi
> +f_v16qi (v16qi x)
> +{
> +  return x & (v16qi){ 0,    0,    0,    0,    0,    0,    0,    0,
> +                   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
> +}
> +
> +/*
> +** g_v16qi:
> +**   fmov    s0, s0
> +**   ret
> +*/
> +v16qi
> +g_v16qi (v16qi x)
> +{
> +  return x & (v16qi){ 0, 0, 0, 0, 0,    0,    0,    0,
> +                   0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff };
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-1-le.c b/gcc/testsuite/gcc.target/aarch64/fmov-1-le.c
> new file mode 100644
> index 00000000000..d969e2ab6b1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fmov-1-le.c
> @@ -0,0 +1,149 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mlittle-endian" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +typedef int v2si __attribute__ ((vector_size (8)));
> +typedef float v2sf __attribute__ ((vector_size (8)));
> +typedef short v4hi __attribute__ ((vector_size (8)));
> +typedef char v8qi __attribute__ ((vector_size (8)));
> +typedef long v2di __attribute__ ((vector_size (16)));
> +typedef double v2df __attribute__ ((vector_size (16)));
> +typedef int v4si __attribute__ ((vector_size (16)));
> +typedef float v4sf __attribute__ ((vector_size (16)));
> +typedef short v8hi __attribute__ ((vector_size (16)));
> +typedef char v16qi __attribute__ ((vector_size (16)));
> +
> +/*
> +** f_v4hi:
> +**   fmov    s0, s0
> +**   ret
> +*/
> +v4hi
> +f_v4hi (v4hi x)
> +{
> +  return x & (v4hi){ 0xffff, 0xffff, 0, 0 };
> +}
> +
> +/*
> +** g_v4hi:
> +**   movi    d([0-9]+), 0xffff00000000ffff
> +**   and     v0.8b, v0.8b, v\1.8b
> +**   ret
> +*/
> +v4hi
> +g_v4hi (v4hi x)
> +{
> +  return x & (v4hi){ 0xffff, 0, 0, 0xffff };
> +}
> +
> +/*
> +** f_v8hi:
> +**   fmov    s0, s0
> +**   ret
> +*/
> +v8hi
> +f_v8hi (v8hi x)
> +{
> +  return x & (v8hi){ 0xffff, 0xffff, 0, 0, 0, 0, 0, 0 };
> +}
> +
> +/*
> +** g_v8hi:
> +**   fmov    d0, d0
> +**   ret
> +*/
> +v8hi
> +g_v8hi (v8hi x)
> +{
> +  return x & (v8hi){ 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 };
> +}
> +
> +/*
> +** f_v2si:
> +**   fmov    s0, s0
> +**   ret
> +*/
> +v2si
> +f_v2si (v2si x)
> +{
> +  return x & (v2si){ 0xffffffff, 0 };
> +}
> +
> +/*
> +** f_v2di:
> +**   fmov    d0, d0
> +**   ret
> +*/
> +v2di
> +f_v2di (v2di x)
> +{
> +  return x & (v2di){ 0xffffffffffffffff, 0 };
> +}
> +
> +/*
> +** g_v2di:
> +**   fmov    s0, s0
> +**   ret
> +*/
> +v2di
> +g_v2di (v2di x)
> +{
> +  return x & (v2di){ 0xffffffff, 0 };
> +}
> +
> +/*
> +** f_v4si:
> +**   fmov    s0, s0
> +**   ret
> +*/
> +v4si
> +f_v4si (v4si x)
> +{
> +  return x & (v4si){ 0xffffffff, 0, 0, 0 };
> +}
> +
> +/*
> +** h_v4si:
> +**   fmov    d0, d0
> +**   ret
> +*/
> +v4si
> +h_v4si (v4si x)
> +{
> +  return x & (v4si){ 0xffffffff, 0xffffffff, 0, 0 };
> +}
> +
> +/*
> +** f_v8qi:
> +**   fmov    s0, s0
> +**   ret
> +*/
> +v8qi
> +f_v8qi (v8qi x)
> +{
> +  return x & (v8qi){ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 };
> +}
> +
> +/*
> +** f_v16qi:
> +**   fmov    d0, d0
> +**   ret
> +*/
> +v16qi
> +f_v16qi (v16qi x)
> +{
> +  return x & (v16qi){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> +                   0,    0,    0,    0,    0,    0,    0,    0 };
> +}
> +
> +/*
> +** g_v16qi:
> +**   fmov    s0, s0
> +**   ret
> +*/
> +v16qi
> +g_v16qi (v16qi x)
> +{
> +  return x & (v16qi){ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
> +                   0,    0,    0,    0,    0, 0, 0, 0 };
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-2-be.c b/gcc/testsuite/gcc.target/aarch64/fmov-2-be.c
> new file mode 100644
> index 00000000000..1e38066b4cf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fmov-2-be.c
> @@ -0,0 +1,90 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbig-endian" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#pragma GCC target ("arch=armv8.2-a+fp16")
> +
> +typedef int v2si __attribute__ ((vector_size (8)));
> +typedef short v4hi __attribute__ ((vector_size (8)));
> +typedef char v8qi __attribute__ ((vector_size (8)));
> +typedef long v2di __attribute__ ((vector_size (16)));
> +typedef int v4si __attribute__ ((vector_size (16)));
> +typedef short v8hi __attribute__ ((vector_size (16)));
> +typedef char v16qi __attribute__ ((vector_size (16)));
> +
> +/*
> +** f_v2di:
> +**   fmov    h0, h0
> +**   ret
> +*/
> +v2di
> +f_v2di (v2di x)
> +{
> +  return x & (v2di){ 0, 0xffff };
> +}
> +
> +/*
> +** f_v4si:
> +**   fmov    h0, h0
> +**   ret
> +*/
> +v4si
> +f_v4si (v4si x)
> +{
> +  return x & (v4si){ 0, 0, 0, 0xffff };
> +}
> +
> +/*
> +** f_v2si:
> +**   fmov    h0, h0
> +**   ret
> +*/
> +v2si
> +f_v2si (v2si x)
> +{
> +  return x & (v2si){ 0, 0xffff };
> +}
> +
> +/*
> +** f_v8hi:
> +**   fmov    h0, h0
> +**   ret
> +*/
> +v8hi
> +f_v8hi (v8hi x)
> +{
> +  return x & (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0xffff };
> +}
> +
> +/*
> +** f_v4hi:
> +**   fmov    h0, h0
> +**   ret
> +*/
> +v4hi
> +f_v4hi (v4hi x)
> +{
> +  return x & (v4hi){ 0, 0, 0, 0xffff };
> +}
> +
> +/*
> +** f_v16qi:
> +**   fmov    h0, h0
> +**   ret
> +*/
> +v16qi
> +f_v16qi (v16qi x)
> +{
> +  return x & (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff };
> +}
> +
> +/*
> +** f_v8qi:
> +**   fmov    h0, h0
> +**   ret
> +*/
> +v8qi
> +f_v8qi (v8qi x)
> +{
> +  return x & (v8qi){ 0, 0, 0, 0, 0, 0, 0xff, 0xff };
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-2-le.c b/gcc/testsuite/gcc.target/aarch64/fmov-2-le.c
> new file mode 100644
> index 00000000000..7627680a0b2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fmov-2-le.c
> @@ -0,0 +1,90 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mlittle-endian" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#pragma GCC target ("arch=armv8.2-a+fp16")
> +
> +typedef int v2si __attribute__ ((vector_size (8)));
> +typedef short v4hi __attribute__ ((vector_size (8)));
> +typedef char v8qi __attribute__ ((vector_size (8)));
> +typedef long v2di __attribute__ ((vector_size (16)));
> +typedef int v4si __attribute__ ((vector_size (16)));
> +typedef short v8hi __attribute__ ((vector_size (16)));
> +typedef char v16qi __attribute__ ((vector_size (16)));
> +
> +/*
> +** f_v2di:
> +**   fmov    h0, h0
> +**   ret
> +*/
> +v2di
> +f_v2di (v2di x)
> +{
> +  return x & (v2di){ 0xffff, 0 };
> +}
> +
> +/*
> +** f_v4si:
> +**   fmov    h0, h0
> +**   ret
> +*/
> +v4si
> +f_v4si (v4si x)
> +{
> +  return x & (v4si){ 0xffff, 0, 0, 0 };
> +}
> +
> +/*
> +** f_v2si:
> +**   fmov    h0, h0
> +**   ret
> +*/
> +v2si
> +f_v2si (v2si x)
> +{
> +  return x & (v2si){ 0xffff, 0 };
> +}
> +
> +/*
> +** f_v8hi:
> +**   fmov    h0, h0
> +**   ret
> +*/
> +v8hi
> +f_v8hi (v8hi x)
> +{
> +  return x & (v8hi){ 0xffff, 0, 0, 0, 0, 0, 0, 0 };
> +}
> +
> +/*
> +** f_v4hi:
> +**   fmov    h0, h0
> +**   ret
> +*/
> +v4hi
> +f_v4hi (v4hi x)
> +{
> +  return x & (v4hi){ 0xffff, 0, 0, 0 };
> +}
> +
> +/*
> +** f_v16qi:
> +**   fmov    h0, h0
> +**   ret
> +*/
> +v16qi
> +f_v16qi (v16qi x)
> +{
> +  return x & (v16qi){ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
> +}
> +
> +/*
> +** f_v8qi:
> +**   fmov    h0, h0
> +**   ret
> +*/
> +v8qi
> +f_v8qi (v8qi x)
> +{
> +  return x & (v8qi){ 0xff, 0xff, 0, 0, 0, 0, 0, 0 };
> +}
