Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns

2021-06-14 Thread Christophe Lyon via Gcc-patches
On Mon, 14 Jun 2021 at 18:01, Richard Sandiford
 wrote:
>
> Christophe Lyon  writes:
> > On Fri, 11 Jun 2021 at 18:25, Richard Sandiford
> >  wrote:
> >>
> >> Christophe Lyon  writes:
> >> > Thanks for the feedback. How about v2 attached?
> >> > Do you want me to merge neon_vec_unpack and
> >> > mve_vec_unpack  and only have different assembly?
> >> > if (TARGET_HAVE_MVE)
> >> >   return "vmovlb.%# %q0, %q1";
> >> > else
> >> >   return "vmovlb.%# %q0, %q1";
> >>
> >> I think it'd be better to keep them separate, given that they have
> >> different attributes.
> >>
> >> > @@ -2199,10 +,23 @@ (define_insn "mve_vmovnbq_"
> >> >[(set_attr "type" "mve_move")
> >> >  ])
> >> >
> >> > +;; vmovnb pattern used by the vec_pack_trunc expander to avoid the
> >> > +;; need for an extra register.
> >>
> >> “to avoid the need for an uninitailized input operand” might
> >> be clearer.
> >>
> > Done
> >
> >> > +(define_insn "@mve_vec_pack_trunc_lo_"
> >> > +  [
> >> > +   (set (match_operand: 0 "s_register_operand" "=w")
> >> > + (unspec: [(match_operand:MVE_5 1 
> >> > "s_register_operand" "w")]
> >> > +  VMOVNBQ_S))
> >> > +  ]
> >> > +  "TARGET_HAVE_MVE"
> >> > +  "vmovnb.i%# %q0, %q1"
> >> > +  [(set_attr "type" "mve_move")
> >> > +])
> >> > +
> >> > […]
> >> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> >> > index 430a92ce966..3afb3e1d891 100644
> >> > --- a/gcc/config/arm/vec-common.md
> >> > +++ b/gcc/config/arm/vec-common.md
> >> > @@ -632,3 +632,91 @@ (define_expand "clz2"
> >> >"ARM_HAVE__ARITH
> >> > && !TARGET_REALLY_IWMMXT"
> >> >  )
> >> > +
> >> > +;; vmovl[tb] are not available for V4SI on MVE
> >> > +(define_expand "vec_unpack_hi_"
> >> > +  [(match_operand: 0 "register_operand")
> >> > +   (SE: (match_operand:VU 1 "register_operand"))]
> >> > + "ARM_HAVE__ARITH
> >> > +  && !TARGET_REALLY_IWMMXT
> >> > +  && ! (mode == V4SImode && TARGET_HAVE_MVE)
> >> > +  && !BYTES_BIG_ENDIAN"
> >> > +  {
> >> > +rtvec v = rtvec_alloc (/2);
> >> > +rtx t1;
> >> > +int i;
> >> > +for (i = 0; i < (/2); i++)
> >> > +  RTVEC_ELT (v, i) = GEN_INT ((/2) + i);
> >> > +
> >> > +t1 = gen_rtx_PARALLEL (mode, v);
> >> > +
> >> > +if (TARGET_NEON)
> >> > +  emit_insn (gen_neon_vec_unpack_hi_ (operands[0],
> >> > + operands[1],
> >> > + t1));
> >> > +else
> >> > +  emit_insn (gen_mve_vec_unpack_hi_ (operands[0],
> >> > +operands[1],
> >> > +t1));
> >> > +DONE;
> >> > +  }
> >> > +)
> >>
> >> Since the patterns are the same for both MVE and Neon (a good thing!),
> >> it would be simpler to write this as:
> >>
> >> (define_insn "mve_vec_unpack_lo_"
> > I guess you mean define_expand?
>
> Yes, sorry.
>
> >>   [(set (match_operand: 0 "register_operand" "=w")
> >> (SE: (vec_select:
> >>   (match_operand:VU 1 "register_operand" "w")
> >>   (match_dup 2]
> >>   …
> >>
> >> and then set operands[2] to what is t1 above.  There would then be
> >> no need to call emit_insn & DONE, and we could use the natural
> >> iterator (MVE_3) for the MVE patterns.
> >>
> >> Same idea for the lo version.
> >>
> >> > […]
> >> > +;; vmovn[tb] are not available for V2DI on MVE
> >> > +(define_expand "vec_pack_trunc_"
> >> > + [(set (match_operand: 0 "register_operand")
> >> > +   (vec_concat:
> >> > + (truncate:
> >> > + (match_operand:VN 1 "register_operand"))
> >> > + (truncate:
> >> > + (match_operand:VN 2 "register_operand"]
> >> > + "ARM_HAVE__ARITH
> >> > +  && !TARGET_REALLY_IWMMXT
> >> > +  && ! (mode == V2DImode && TARGET_HAVE_MVE)
> >> > +  && !BYTES_BIG_ENDIAN"
> >> > + {
> >> > +   if (TARGET_NEON)
> >> > + {
> >> > +   emit_insn (gen_neon_quad_vec_pack_trunc_ (operands[0], 
> >> > operands[1],
> >> > +operands[2]));
> >> > + }
> >> > +   else
> >> > + {
> >> > +   rtx tmpreg = gen_reg_rtx (mode);
> >> > +   emit_insn (gen_mve_vec_pack_trunc_lo (mode, tmpreg, 
> >> > operands[1]));
> >> > +   emit_insn (gen_mve_vmovntq (VMOVNTQ_S, mode,
> >> > +tmpreg, tmpreg, operands[2]));
> >> > +   emit_move_insn (operands[0], tmpreg);
> >>
> >> vmovntq can assign directly to operands[0], without a separate move.
> >>
> >> OK with those changes, thanks.
> >
> > Just to be sure, here is v3 which hopefully matches what you meant.
>
> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > index 430a92ce966..1d6d4379855 100644
> > --- a/gcc/config/arm/vec-common.md
> > +++ b/gcc/config/arm/vec-common.md
> > @@ -632,3 +632,78 @@ (define_expand "clz2"
> >"ARM_HAVE__ARITH
> > && 

Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns

2021-06-14 Thread Richard Sandiford via Gcc-patches
Christophe Lyon  writes:
> On Fri, 11 Jun 2021 at 18:25, Richard Sandiford
>  wrote:
>>
>> Christophe Lyon  writes:
>> > Thanks for the feedback. How about v2 attached?
>> > Do you want me to merge neon_vec_unpack and
>> > mve_vec_unpack  and only have different assembly?
>> > if (TARGET_HAVE_MVE)
>> >   return "vmovlb.%# %q0, %q1";
>> > else
>> >   return "vmovlb.%# %q0, %q1";
>>
>> I think it'd be better to keep them separate, given that they have
>> different attributes.
>>
>> > @@ -2199,10 +,23 @@ (define_insn "mve_vmovnbq_"
>> >[(set_attr "type" "mve_move")
>> >  ])
>> >
>> > +;; vmovnb pattern used by the vec_pack_trunc expander to avoid the
>> > +;; need for an extra register.
>>
>> “to avoid the need for an uninitailized input operand” might
>> be clearer.
>>
> Done
>
>> > +(define_insn "@mve_vec_pack_trunc_lo_"
>> > +  [
>> > +   (set (match_operand: 0 "s_register_operand" "=w")
>> > + (unspec: [(match_operand:MVE_5 1 "s_register_operand" 
>> > "w")]
>> > +  VMOVNBQ_S))
>> > +  ]
>> > +  "TARGET_HAVE_MVE"
>> > +  "vmovnb.i%# %q0, %q1"
>> > +  [(set_attr "type" "mve_move")
>> > +])
>> > +
>> > […]
>> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
>> > index 430a92ce966..3afb3e1d891 100644
>> > --- a/gcc/config/arm/vec-common.md
>> > +++ b/gcc/config/arm/vec-common.md
>> > @@ -632,3 +632,91 @@ (define_expand "clz2"
>> >"ARM_HAVE__ARITH
>> > && !TARGET_REALLY_IWMMXT"
>> >  )
>> > +
>> > +;; vmovl[tb] are not available for V4SI on MVE
>> > +(define_expand "vec_unpack_hi_"
>> > +  [(match_operand: 0 "register_operand")
>> > +   (SE: (match_operand:VU 1 "register_operand"))]
>> > + "ARM_HAVE__ARITH
>> > +  && !TARGET_REALLY_IWMMXT
>> > +  && ! (mode == V4SImode && TARGET_HAVE_MVE)
>> > +  && !BYTES_BIG_ENDIAN"
>> > +  {
>> > +rtvec v = rtvec_alloc (/2);
>> > +rtx t1;
>> > +int i;
>> > +for (i = 0; i < (/2); i++)
>> > +  RTVEC_ELT (v, i) = GEN_INT ((/2) + i);
>> > +
>> > +t1 = gen_rtx_PARALLEL (mode, v);
>> > +
>> > +if (TARGET_NEON)
>> > +  emit_insn (gen_neon_vec_unpack_hi_ (operands[0],
>> > + operands[1],
>> > + t1));
>> > +else
>> > +  emit_insn (gen_mve_vec_unpack_hi_ (operands[0],
>> > +operands[1],
>> > +t1));
>> > +DONE;
>> > +  }
>> > +)
>>
>> Since the patterns are the same for both MVE and Neon (a good thing!),
>> it would be simpler to write this as:
>>
>> (define_insn "mve_vec_unpack_lo_"
> I guess you mean define_expand?

Yes, sorry.

>>   [(set (match_operand: 0 "register_operand" "=w")
>> (SE: (vec_select:
>>   (match_operand:VU 1 "register_operand" "w")
>>   (match_dup 2]
>>   …
>>
>> and then set operands[2] to what is t1 above.  There would then be
>> no need to call emit_insn & DONE, and we could use the natural
>> iterator (MVE_3) for the MVE patterns.
>>
>> Same idea for the lo version.
>>
>> > […]
>> > +;; vmovn[tb] are not available for V2DI on MVE
>> > +(define_expand "vec_pack_trunc_"
>> > + [(set (match_operand: 0 "register_operand")
>> > +   (vec_concat:
>> > + (truncate:
>> > + (match_operand:VN 1 "register_operand"))
>> > + (truncate:
>> > + (match_operand:VN 2 "register_operand"]
>> > + "ARM_HAVE__ARITH
>> > +  && !TARGET_REALLY_IWMMXT
>> > +  && ! (mode == V2DImode && TARGET_HAVE_MVE)
>> > +  && !BYTES_BIG_ENDIAN"
>> > + {
>> > +   if (TARGET_NEON)
>> > + {
>> > +   emit_insn (gen_neon_quad_vec_pack_trunc_ (operands[0], 
>> > operands[1],
>> > +operands[2]));
>> > + }
>> > +   else
>> > + {
>> > +   rtx tmpreg = gen_reg_rtx (mode);
>> > +   emit_insn (gen_mve_vec_pack_trunc_lo (mode, tmpreg, 
>> > operands[1]));
>> > +   emit_insn (gen_mve_vmovntq (VMOVNTQ_S, mode,
>> > +tmpreg, tmpreg, operands[2]));
>> > +   emit_move_insn (operands[0], tmpreg);
>>
>> vmovntq can assign directly to operands[0], without a separate move.
>>
>> OK with those changes, thanks.
>
> Just to be sure, here is v3 which hopefully matches what you meant.

> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> index 430a92ce966..1d6d4379855 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -632,3 +632,78 @@ (define_expand "clz2"
>"ARM_HAVE__ARITH
> && !TARGET_REALLY_IWMMXT"
>  )
> +
> +;; vmovl[tb] are not available for V4SI on MVE
> +(define_expand "vec_unpack_hi_"
> +  [(set (match_operand: 0 "register_operand")
> +   (SE: (vec_select:
> + (match_operand:VU 1 "register_operand")
> + (match_dup 2]

The formatting of the vec_unpack_lo_ 

Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns

2021-06-14 Thread Christophe Lyon via Gcc-patches
On Fri, 11 Jun 2021 at 18:25, Richard Sandiford
 wrote:
>
> Christophe Lyon  writes:
> > Thanks for the feedback. How about v2 attached?
> > Do you want me to merge neon_vec_unpack and
> > mve_vec_unpack  and only have different assembly?
> > if (TARGET_HAVE_MVE)
> >   return "vmovlb.%# %q0, %q1";
> > else
> >   return "vmovlb.%# %q0, %q1";
>
> I think it'd be better to keep them separate, given that they have
> different attributes.
>
> > @@ -2199,10 +,23 @@ (define_insn "mve_vmovnbq_"
> >[(set_attr "type" "mve_move")
> >  ])
> >
> > +;; vmovnb pattern used by the vec_pack_trunc expander to avoid the
> > +;; need for an extra register.
>
> “to avoid the need for an uninitailized input operand” might
> be clearer.
>
Done

> > +(define_insn "@mve_vec_pack_trunc_lo_"
> > +  [
> > +   (set (match_operand: 0 "s_register_operand" "=w")
> > + (unspec: [(match_operand:MVE_5 1 "s_register_operand" 
> > "w")]
> > +  VMOVNBQ_S))
> > +  ]
> > +  "TARGET_HAVE_MVE"
> > +  "vmovnb.i%# %q0, %q1"
> > +  [(set_attr "type" "mve_move")
> > +])
> > +
> > […]
> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> > index 430a92ce966..3afb3e1d891 100644
> > --- a/gcc/config/arm/vec-common.md
> > +++ b/gcc/config/arm/vec-common.md
> > @@ -632,3 +632,91 @@ (define_expand "clz2"
> >"ARM_HAVE__ARITH
> > && !TARGET_REALLY_IWMMXT"
> >  )
> > +
> > +;; vmovl[tb] are not available for V4SI on MVE
> > +(define_expand "vec_unpack_hi_"
> > +  [(match_operand: 0 "register_operand")
> > +   (SE: (match_operand:VU 1 "register_operand"))]
> > + "ARM_HAVE__ARITH
> > +  && !TARGET_REALLY_IWMMXT
> > +  && ! (mode == V4SImode && TARGET_HAVE_MVE)
> > +  && !BYTES_BIG_ENDIAN"
> > +  {
> > +rtvec v = rtvec_alloc (/2);
> > +rtx t1;
> > +int i;
> > +for (i = 0; i < (/2); i++)
> > +  RTVEC_ELT (v, i) = GEN_INT ((/2) + i);
> > +
> > +t1 = gen_rtx_PARALLEL (mode, v);
> > +
> > +if (TARGET_NEON)
> > +  emit_insn (gen_neon_vec_unpack_hi_ (operands[0],
> > + operands[1],
> > + t1));
> > +else
> > +  emit_insn (gen_mve_vec_unpack_hi_ (operands[0],
> > +operands[1],
> > +t1));
> > +DONE;
> > +  }
> > +)
>
> Since the patterns are the same for both MVE and Neon (a good thing!),
> it would be simpler to write this as:
>
> (define_insn "mve_vec_unpack_lo_"
I guess you mean define_expand?

>   [(set (match_operand: 0 "register_operand" "=w")
> (SE: (vec_select:
>   (match_operand:VU 1 "register_operand" "w")
>   (match_dup 2]
>   …
>
> and then set operands[2] to what is t1 above.  There would then be
> no need to call emit_insn & DONE, and we could use the natural
> iterator (MVE_3) for the MVE patterns.
>
> Same idea for the lo version.
>
> > […]
> > +;; vmovn[tb] are not available for V2DI on MVE
> > +(define_expand "vec_pack_trunc_"
> > + [(set (match_operand: 0 "register_operand")
> > +   (vec_concat:
> > + (truncate:
> > + (match_operand:VN 1 "register_operand"))
> > + (truncate:
> > + (match_operand:VN 2 "register_operand"]
> > + "ARM_HAVE__ARITH
> > +  && !TARGET_REALLY_IWMMXT
> > +  && ! (mode == V2DImode && TARGET_HAVE_MVE)
> > +  && !BYTES_BIG_ENDIAN"
> > + {
> > +   if (TARGET_NEON)
> > + {
> > +   emit_insn (gen_neon_quad_vec_pack_trunc_ (operands[0], 
> > operands[1],
> > +operands[2]));
> > + }
> > +   else
> > + {
> > +   rtx tmpreg = gen_reg_rtx (mode);
> > +   emit_insn (gen_mve_vec_pack_trunc_lo (mode, tmpreg, 
> > operands[1]));
> > +   emit_insn (gen_mve_vmovntq (VMOVNTQ_S, mode,
> > +tmpreg, tmpreg, operands[2]));
> > +   emit_move_insn (operands[0], tmpreg);
>
> vmovntq can assign directly to operands[0], without a separate move.
>
> OK with those changes, thanks.

Just to be sure, here is v3 which hopefully matches what you meant.
I updated the commit message and removed:
< I did not prefix them with '@', because I couldn't find how to call
< gen_mve_vec_unpack_[lo|hi] with  as first parameter.  Anyway, we
< can keep the VU iterator instead of MVE_3 since the offending V4SI
< mode is disabled in the expander.

OK?

Thanks

Christophe

>
> Richard
>
> > + }
> > +   DONE;
> > + }
> > +)
> > […]
From 9c1afae67249cd8c67cedd5eb852361a4d89d0b5 Mon Sep 17 00:00:00 2001
From: Christophe Lyon 
Date: Thu, 3 Jun 2021 14:35:50 +
Subject: [PATCH v3] arm: Auto-vectorization for MVE: add pack/unpack patterns

This patch adds vec_unpack_hi_, vec_unpack_lo_,
vec_pack_trunc_ patterns for MVE.

It does so by moving the unpack patterns from neon.md to
vec-common.md, while adding them support for MVE. The pack 

Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns

2021-06-11 Thread Richard Sandiford via Gcc-patches
Christophe Lyon  writes:
> Thanks for the feedback. How about v2 attached?
> Do you want me to merge neon_vec_unpack and
> mve_vec_unpack  and only have different assembly?
> if (TARGET_HAVE_MVE)
>   return "vmovlb.%# %q0, %q1";
> else
>   return "vmovlb.%# %q0, %q1";

I think it'd be better to keep them separate, given that they have
different attributes.

> @@ -2199,10 +,23 @@ (define_insn "mve_vmovnbq_"
>[(set_attr "type" "mve_move")
>  ])
>  
> +;; vmovnb pattern used by the vec_pack_trunc expander to avoid the
> +;; need for an extra register.

“to avoid the need for an uninitailized input operand” might
be clearer.

> +(define_insn "@mve_vec_pack_trunc_lo_"
> +  [
> +   (set (match_operand: 0 "s_register_operand" "=w")
> + (unspec: [(match_operand:MVE_5 1 "s_register_operand" 
> "w")]
> +  VMOVNBQ_S))
> +  ]
> +  "TARGET_HAVE_MVE"
> +  "vmovnb.i%# %q0, %q1"
> +  [(set_attr "type" "mve_move")
> +])
> +
> […]
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> index 430a92ce966..3afb3e1d891 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -632,3 +632,91 @@ (define_expand "clz2"
>"ARM_HAVE__ARITH
> && !TARGET_REALLY_IWMMXT"
>  )
> +
> +;; vmovl[tb] are not available for V4SI on MVE
> +(define_expand "vec_unpack_hi_"
> +  [(match_operand: 0 "register_operand")
> +   (SE: (match_operand:VU 1 "register_operand"))]
> + "ARM_HAVE__ARITH
> +  && !TARGET_REALLY_IWMMXT
> +  && ! (mode == V4SImode && TARGET_HAVE_MVE)
> +  && !BYTES_BIG_ENDIAN"
> +  {
> +rtvec v = rtvec_alloc (/2);
> +rtx t1;
> +int i;
> +for (i = 0; i < (/2); i++)
> +  RTVEC_ELT (v, i) = GEN_INT ((/2) + i);
> +
> +t1 = gen_rtx_PARALLEL (mode, v);
> +
> +if (TARGET_NEON)
> +  emit_insn (gen_neon_vec_unpack_hi_ (operands[0],
> + operands[1],
> + t1));
> +else
> +  emit_insn (gen_mve_vec_unpack_hi_ (operands[0],
> +operands[1],
> +t1));
> +DONE;
> +  }
> +)

Since the patterns are the same for both MVE and Neon (a good thing!),
it would be simpler to write this as:

(define_insn "mve_vec_unpack_lo_"
  [(set (match_operand: 0 "register_operand" "=w")
(SE: (vec_select:
  (match_operand:VU 1 "register_operand" "w")
  (match_dup 2]
  …

and then set operands[2] to what is t1 above.  There would then be
no need to call emit_insn & DONE, and we could use the natural
iterator (MVE_3) for the MVE patterns.

Same idea for the lo version.

> […]
> +;; vmovn[tb] are not available for V2DI on MVE
> +(define_expand "vec_pack_trunc_"
> + [(set (match_operand: 0 "register_operand")
> +   (vec_concat:
> + (truncate:
> + (match_operand:VN 1 "register_operand"))
> + (truncate:
> + (match_operand:VN 2 "register_operand"]
> + "ARM_HAVE__ARITH
> +  && !TARGET_REALLY_IWMMXT
> +  && ! (mode == V2DImode && TARGET_HAVE_MVE)
> +  && !BYTES_BIG_ENDIAN"
> + {
> +   if (TARGET_NEON)
> + {
> +   emit_insn (gen_neon_quad_vec_pack_trunc_ (operands[0], 
> operands[1],
> +operands[2]));
> + }
> +   else
> + {
> +   rtx tmpreg = gen_reg_rtx (mode);
> +   emit_insn (gen_mve_vec_pack_trunc_lo (mode, tmpreg, 
> operands[1]));
> +   emit_insn (gen_mve_vmovntq (VMOVNTQ_S, mode,
> +tmpreg, tmpreg, operands[2]));
> +   emit_move_insn (operands[0], tmpreg);

vmovntq can assign directly to operands[0], without a separate move.

OK with those changes, thanks.

Richard

> + }
> +   DONE;
> + }
> +)
> […]


Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns

2021-06-11 Thread Christophe Lyon via Gcc-patches
On Fri, 11 Jun 2021 at 12:20, Richard Sandiford
 wrote:
>
> Christophe Lyon  writes:
> > In the meantime, I tried to make some progress, and looked at how
> > things work on aarch64.
> >
> > This led me to define (in mve.md):
> >
> > (define_insn "@mve_vec_pack_trunc_lo_"
> >  [(set (match_operand: 0 "register_operand" "=w")
> >(truncate: (match_operand:MVE_5 1 "register_operand" 
> > "w")))]
> >  "TARGET_HAVE_MVE"
> >  "vmovnb.i   %q0, %q1\;"
> >   [(set_attr "type" "mve_move")]
> > )
> >
> > (define_insn "@mve_vec_pack_trunc_hi_"
> >  [(set (match_operand: 0 "register_operand" "=w")
> >(vec_concat:
> > (match_operand: 1 "register_operand" "0")
> > (truncate:
> > (match_operand:MVE_5 2 "register_operand" "w"]
> >  "TARGET_HAVE_MVE"
> >  "vmovnt.i   %q0, %q2\;"
> >   [(set_attr "type" "mve_move")]
> > )
> >
> > and in vec-common.md, for
> > (define_expand "vec_pack_trunc_"
> >  [(set (match_operand: 0 "register_operand")
> >(vec_concat:
> > (truncate:
> > (match_operand:VN 1 "register_operand"))
> > (truncate:
> > (match_operand:VN 2 "register_operand"]
> >
> > I expand this for MVE:
> >   rtx tmpreg = gen_reg_rtx (mode);
> >   emit_insn (gen_mve_vec_pack_trunc_lo (mode, tmpreg, 
> > operands[1]));
> >   emit_insn (gen_mve_vec_pack_trunc_hi (mode, operands[0],
> > tmpreg, operands[2]));
> >
> > I am getting an error in reload:
> > error: unable to generate reloads for:
> > (insn 10 9 11 2 (set (reg:V4HI 122 [ vect__4.16 ])
> > (truncate:V4HI (reg:V4SI 118 [ vect__4.16 ])))
> > "/gcc.target/arm/simd/mve-vec-pack.c":17:112 3609
> > {mve_vec_pack_trunc_lo_v4si}
> >  (expr_list:REG_DEAD (reg:V4SI 118 [ vect__4.16 ])
> > (nil)))
> >
> > The next insn is:
> > (insn 11 10 12 2 (set (reg:V8HI 121 [ vect__7.18 ])
> > (vec_concat:V8HI (reg:V4HI 122 [ vect__4.16 ])
> > (truncate:V4HI (reg:V4SI 119 [ vect__4.17 ]
> > "/gcc.target/arm/simd/mve-vec-pack.c":17:112 3611
> > {mve_vec_pack_trunc_hi_v4si}
> >  (expr_list:REG_DEAD (reg:V4HI 122 [ vect__4.16 ])
> > (expr_list:REG_DEAD (reg:V4SI 119 [ vect__4.17 ])
> > (nil
> >
> > What is causing the reload error?
>
> For this to work on MVE, we'd need to allow the 64-bit V_narrow modes to be
> stored in MVE registers.  I'm not sure off-hand whether allowing that
> would be a good idea or not.
>
> If we continue to allow only 128-bit vectors in MVE registers then
> we'll need to continue to use unspecs instead of truncate and vec_concat.
>

Thanks for the feedback. How about v2 attached?
Do you want me to merge neon_vec_unpack and
mve_vec_unpack  and only have different assembly?
if (TARGET_HAVE_MVE)
  return "vmovlb.%# %q0, %q1";
else
  return "vmovlb.%# %q0, %q1";


> Thanks,
> Richard
From 5abc6196c934438421e293b6b4733a98ec80e982 Mon Sep 17 00:00:00 2001
From: Christophe Lyon 
Date: Thu, 3 Jun 2021 14:35:50 +
Subject: [PATCH v2] arm: Auto-vectorization for MVE: add pack/unpack patterns

This patch adds vec_unpack_hi_, vec_unpack_lo_,
vec_pack_trunc_ patterns for MVE.

It does so by moving the unpack patterns from neon.md to
vec-common.md, while adding them support for MVE. The pack expander is
derived from the Neon one (which in turn is renamed into
neon_quad_vec_pack_trunc_).

The patch introduces mve_vec_unpack_lo_ and
mve_vec_unpack_hi_ which are similar to their Neon
counterparts, except for the assembly syntax.

I did not prefix them with '@', because I couldn't find how to call
gen_mve_vec_unpack_[lo|hi] with  as first parameter.  Anyway, we
can keep the VU iterator instead of MVE_3 since the offending V4SI
mode is disabled in the expander.

The patch introduces mve_vec_pack_trunc_lo_ to avoid the need for a
zero-initialized temporary, which is needed if the
vec_pack_trunc_ expander calls @mve_vmovn[bt]q_
instead.

With this patch, we can now vectorize the 16 and 8-bit versions of
vclz and vshl, although the generated code could still be improved.
For test_clz_s16, we now generate
vldrh.16q3, [r1]
vmovlb.s16   q2, q3
vmovlt.s16   q3, q3
vclz.i32  q2, q2
vclz.i32  q3, q3
vmovnb.i32  q1, q2
vmovnt.i32  q1, q3
vstrh.16q1, [r0]
which could be improved to
vldrh.16q3, [r1]
	vclz.i16	q1, q3
vstrh.16q1, [r0]
if we could avoid the need for unpack/pack steps.

For reference, clang-12 generates:
	vldrh.s32   q0, [r1]
	vldrh.s32   q1, [r1, #8]
	vclz.i32q0, q0
	vstrh.32q0, [r0]
	vclz.i32q0, q1
	vstrh.32q0, [r0, #8]

2021-06-11  Christophe Lyon  

	gcc/
	* config/arm/mve.md (mve_vec_unpack_lo_): New pattern.
	(mve_vec_unpack_hi_): New pattern.
	(@mve_vec_pack_trunc_lo_): New pattern.
	(mve_vmovntq_): Prefix with '@'.
	* config/arm/neon.md (vec_unpack_hi_): Move to
	

Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns

2021-06-11 Thread Richard Sandiford via Gcc-patches
Christophe Lyon  writes:
> In the meantime, I tried to make some progress, and looked at how
> things work on aarch64.
>
> This led me to define (in mve.md):
>
> (define_insn "@mve_vec_pack_trunc_lo_"
>  [(set (match_operand: 0 "register_operand" "=w")
>(truncate: (match_operand:MVE_5 1 "register_operand" "w")))]
>  "TARGET_HAVE_MVE"
>  "vmovnb.i   %q0, %q1\;"
>   [(set_attr "type" "mve_move")]
> )
>
> (define_insn "@mve_vec_pack_trunc_hi_"
>  [(set (match_operand: 0 "register_operand" "=w")
>(vec_concat:
> (match_operand: 1 "register_operand" "0")
> (truncate:
> (match_operand:MVE_5 2 "register_operand" "w"]
>  "TARGET_HAVE_MVE"
>  "vmovnt.i   %q0, %q2\;"
>   [(set_attr "type" "mve_move")]
> )
>
> and in vec-common.md, for
> (define_expand "vec_pack_trunc_"
>  [(set (match_operand: 0 "register_operand")
>(vec_concat:
> (truncate:
> (match_operand:VN 1 "register_operand"))
> (truncate:
> (match_operand:VN 2 "register_operand"]
>
> I expand this for MVE:
>   rtx tmpreg = gen_reg_rtx (mode);
>   emit_insn (gen_mve_vec_pack_trunc_lo (mode, tmpreg, operands[1]));
>   emit_insn (gen_mve_vec_pack_trunc_hi (mode, operands[0],
> tmpreg, operands[2]));
>
> I am getting an error in reload:
> error: unable to generate reloads for:
> (insn 10 9 11 2 (set (reg:V4HI 122 [ vect__4.16 ])
> (truncate:V4HI (reg:V4SI 118 [ vect__4.16 ])))
> "/gcc.target/arm/simd/mve-vec-pack.c":17:112 3609
> {mve_vec_pack_trunc_lo_v4si}
>  (expr_list:REG_DEAD (reg:V4SI 118 [ vect__4.16 ])
> (nil)))
>
> The next insn is:
> (insn 11 10 12 2 (set (reg:V8HI 121 [ vect__7.18 ])
> (vec_concat:V8HI (reg:V4HI 122 [ vect__4.16 ])
> (truncate:V4HI (reg:V4SI 119 [ vect__4.17 ]
> "/gcc.target/arm/simd/mve-vec-pack.c":17:112 3611
> {mve_vec_pack_trunc_hi_v4si}
>  (expr_list:REG_DEAD (reg:V4HI 122 [ vect__4.16 ])
> (expr_list:REG_DEAD (reg:V4SI 119 [ vect__4.17 ])
> (nil
>
> What is causing the reload error?

For this to work on MVE, we'd need to allow the 64-bit V_narrow modes to be
stored in MVE registers.  I'm not sure off-hand whether allowing that
would be a good idea or not.

If we continue to allow only 128-bit vectors in MVE registers then
we'll need to continue to use unspecs instead of truncate and vec_concat.

Thanks,
Richard


Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns

2021-06-11 Thread Christophe Lyon via Gcc-patches
On Fri, 11 Jun 2021 at 11:38, Richard Sandiford
 wrote:
>
> Christophe Lyon  writes:
> > On Tue, 8 Jun 2021 at 14:10, Richard Sandiford
> >  wrote:
> >>
> >> Christophe Lyon  writes:
> >> > This patch adds vec_unpack_hi_, vec_unpack_lo_,
> >> > vec_pack_trunc_ patterns for MVE.
> >> >
> >> > It does so by moving the unpack patterns from neon.md to
> >> > vec-common.md, while adding them support for MVE. The pack expander is
> >> > derived from the Neon one (which in turn is renamed into
> >> > neon_quad_vec_pack_trunc_).
> >> >
> >> > The patch introduces mve_vec_pack_trunc_ to avoid the need for a
> >> > zero-initialized temporary, which is needed if the
> >> > vec_pack_trunc_ expander calls @mve_vmovn[bt]q_
> >> > instead.
> >> >
> >> > With this patch, we can now vectorize the 16 and 8-bit versions of
> >> > vclz and vshl, although the generated code could still be improved.
> >> > For test_clz_s16, we now generate
> >> > vldrh.16q3, [r1]
> >> > vmovlb.s16   q2, q3
> >> > vmovlt.s16   q3, q3
> >> > vclz.i32  q2, q2
> >> > vclz.i32  q3, q3
> >> > vmovnb.i32  q1, q2
> >> > vmovnt.i32  q1, q3
> >> > vstrh.16q1, [r0]
> >> > which could be improved to
> >> > vldrh.16q3, [r1]
> >> >   vclz.i16q1, q3
> >> > vstrh.16q1, [r0]
> >> > if we could avoid the need for unpack/pack steps.
> >>
> >> Yeah, there was a PR about fixing this for popcount.  I guess the same
> >> approach would apply here too.
> >>
> >> > For reference, clang-12 generates:
> >> >   vldrh.s32   q0, [r1]
> >> >   vldrh.s32   q1, [r1, #8]
> >> >   vclz.i32q0, q0
> >> >   vstrh.32q0, [r0]
> >> >   vclz.i32q0, q1
> >> >   vstrh.32q0, [r0, #8]
> >> >
> >> > 2021-06-03  Christophe Lyon  
> >> >
> >> >   gcc/
> >> >   * config/arm/mve.md (mve_vmovltq_): Prefix with '@'.
> >> >   (mve_vmovlbq_): Likewise.
> >> >   (mve_vmovnbq_): Likewise.
> >> >   (mve_vmovntq_): Likewise.
> >> >   (@mve_vec_pack_trunc_): New pattern.
> >> >   * config/arm/neon.md (vec_unpack_hi_): Move to
> >> >   vec-common.md.
> >> >   (vec_unpack_lo_): Likewise.
> >> >   (vec_pack_trunc_): Rename to
> >> >   neon_quad_vec_pack_trunc_.
> >> >   * config/arm/vec-common.md (vec_unpack_hi_): New
> >> >   pattern.
> >> >   (vec_unpack_lo_): New.
> >> >   (vec_pack_trunc_): New.
> >> >
> >> >   gcc/testsuite/
> >> >   * gcc.target/arm/simd/mve-vclz.c: Update expected results.
> >> >   * gcc.target/arm/simd/mve-vshl.c: Likewise.
> >> > ---
> >> >  gcc/config/arm/mve.md| 20 -
> >> >  gcc/config/arm/neon.md   | 39 +
> >> >  gcc/config/arm/vec-common.md | 89 
> >> >  gcc/testsuite/gcc.target/arm/simd/mve-vclz.c |  7 +-
> >> >  gcc/testsuite/gcc.target/arm/simd/mve-vshl.c |  5 +-
> >> >  5 files changed, 114 insertions(+), 46 deletions(-)
> >> >
> >> > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> >> > index 99e46d0bc69..b18292c07d3 100644
> >> > --- a/gcc/config/arm/mve.md
> >> > +++ b/gcc/config/arm/mve.md
> >> > @@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_"
> >> >  ;;
> >> >  ;; [vmovltq_u, vmovltq_s])
> >> >  ;;
> >> > -(define_insn "mve_vmovltq_"
> >> > +(define_insn "@mve_vmovltq_"
> >> >[
> >> > (set (match_operand: 0 "s_register_operand" "=w")
> >> >   (unspec: [(match_operand:MVE_3 1 
> >> > "s_register_operand" "w")]
> >> > @@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_"
> >> >  ;;
> >> >  ;; [vmovlbq_s, vmovlbq_u])
> >> >  ;;
> >> > -(define_insn "mve_vmovlbq_"
> >> > +(define_insn "@mve_vmovlbq_"
> >> >[
> >> > (set (match_operand: 0 "s_register_operand" "=w")
> >> >   (unspec: [(match_operand:MVE_3 1 
> >> > "s_register_operand" "w")]
> >> > @@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s"
> >> >  ;;
> >> >  ;; [vmovnbq_u, vmovnbq_s])
> >> >  ;;
> >> > -(define_insn "mve_vmovnbq_"
> >> > +(define_insn "@mve_vmovnbq_"
> >> >[
> >> > (set (match_operand: 0 "s_register_operand" "=w")
> >> >   (unspec: [(match_operand: 1 
> >> > "s_register_operand" "0")
> >> > @@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_"
> >> >  ;;
> >> >  ;; [vmovntq_s, vmovntq_u])
> >> >  ;;
> >> > -(define_insn "mve_vmovntq_"
> >> > +(define_insn "@mve_vmovntq_"
> >> >[
> >> > (set (match_operand: 0 "s_register_operand" "=w")
> >> >   (unspec: [(match_operand: 1 
> >> > "s_register_operand" "0")
> >> > @@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_"
> >> >[(set_attr "type" "mve_move")
> >> >  ])
> >> >
> >> > +(define_insn "@mve_vec_pack_trunc_"
> >> > + [(set (match_operand: 0 "register_operand" "=")
> >> > +   (vec_concat:
> >> > + (truncate:
> >> > + (match_operand:MVE_5 1 "register_operand" "w"))
> >> > + (truncate:
> >> > +  

Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns

2021-06-11 Thread Richard Sandiford via Gcc-patches
Christophe Lyon  writes:
> On Tue, 8 Jun 2021 at 14:10, Richard Sandiford
>  wrote:
>>
>> Christophe Lyon  writes:
>> > This patch adds vec_unpack_hi_, vec_unpack_lo_,
>> > vec_pack_trunc_ patterns for MVE.
>> >
>> > It does so by moving the unpack patterns from neon.md to
>> > vec-common.md, while adding them support for MVE. The pack expander is
>> > derived from the Neon one (which in turn is renamed into
>> > neon_quad_vec_pack_trunc_).
>> >
>> > The patch introduces mve_vec_pack_trunc_ to avoid the need for a
>> > zero-initialized temporary, which is needed if the
>> > vec_pack_trunc_ expander calls @mve_vmovn[bt]q_
>> > instead.
>> >
>> > With this patch, we can now vectorize the 16 and 8-bit versions of
>> > vclz and vshl, although the generated code could still be improved.
>> > For test_clz_s16, we now generate
>> > vldrh.16q3, [r1]
>> > vmovlb.s16   q2, q3
>> > vmovlt.s16   q3, q3
>> > vclz.i32  q2, q2
>> > vclz.i32  q3, q3
>> > vmovnb.i32  q1, q2
>> > vmovnt.i32  q1, q3
>> > vstrh.16q1, [r0]
>> > which could be improved to
>> > vldrh.16q3, [r1]
>> >   vclz.i16q1, q3
>> > vstrh.16q1, [r0]
>> > if we could avoid the need for unpack/pack steps.
>>
>> Yeah, there was a PR about fixing this for popcount.  I guess the same
>> approach would apply here too.
>>
>> > For reference, clang-12 generates:
>> >   vldrh.s32   q0, [r1]
>> >   vldrh.s32   q1, [r1, #8]
>> >   vclz.i32q0, q0
>> >   vstrh.32q0, [r0]
>> >   vclz.i32q0, q1
>> >   vstrh.32q0, [r0, #8]
>> >
>> > 2021-06-03  Christophe Lyon  
>> >
>> >   gcc/
>> >   * config/arm/mve.md (mve_vmovltq_): Prefix with '@'.
>> >   (mve_vmovlbq_): Likewise.
>> >   (mve_vmovnbq_): Likewise.
>> >   (mve_vmovntq_): Likewise.
>> >   (@mve_vec_pack_trunc_): New pattern.
>> >   * config/arm/neon.md (vec_unpack_hi_): Move to
>> >   vec-common.md.
>> >   (vec_unpack_lo_): Likewise.
>> >   (vec_pack_trunc_): Rename to
>> >   neon_quad_vec_pack_trunc_.
>> >   * config/arm/vec-common.md (vec_unpack_hi_): New
>> >   pattern.
>> >   (vec_unpack_lo_): New.
>> >   (vec_pack_trunc_): New.
>> >
>> >   gcc/testsuite/
>> >   * gcc.target/arm/simd/mve-vclz.c: Update expected results.
>> >   * gcc.target/arm/simd/mve-vshl.c: Likewise.
>> > ---
>> >  gcc/config/arm/mve.md| 20 -
>> >  gcc/config/arm/neon.md   | 39 +
>> >  gcc/config/arm/vec-common.md | 89 
>> >  gcc/testsuite/gcc.target/arm/simd/mve-vclz.c |  7 +-
>> >  gcc/testsuite/gcc.target/arm/simd/mve-vshl.c |  5 +-
>> >  5 files changed, 114 insertions(+), 46 deletions(-)
>> >
>> > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
>> > index 99e46d0bc69..b18292c07d3 100644
>> > --- a/gcc/config/arm/mve.md
>> > +++ b/gcc/config/arm/mve.md
>> > @@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_"
>> >  ;;
>> >  ;; [vmovltq_u, vmovltq_s])
>> >  ;;
>> > -(define_insn "mve_vmovltq_"
>> > +(define_insn "@mve_vmovltq_"
>> >[
>> > (set (match_operand: 0 "s_register_operand" "=w")
>> >   (unspec: [(match_operand:MVE_3 1 
>> > "s_register_operand" "w")]
>> > @@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_"
>> >  ;;
>> >  ;; [vmovlbq_s, vmovlbq_u])
>> >  ;;
>> > -(define_insn "mve_vmovlbq_"
>> > +(define_insn "@mve_vmovlbq_"
>> >[
>> > (set (match_operand: 0 "s_register_operand" "=w")
>> >   (unspec: [(match_operand:MVE_3 1 
>> > "s_register_operand" "w")]
>> > @@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s"
>> >  ;;
>> >  ;; [vmovnbq_u, vmovnbq_s])
>> >  ;;
>> > -(define_insn "mve_vmovnbq_"
>> > +(define_insn "@mve_vmovnbq_"
>> >[
>> > (set (match_operand: 0 "s_register_operand" "=w")
>> >   (unspec: [(match_operand: 1 
>> > "s_register_operand" "0")
>> > @@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_"
>> >  ;;
>> >  ;; [vmovntq_s, vmovntq_u])
>> >  ;;
>> > -(define_insn "mve_vmovntq_"
>> > +(define_insn "@mve_vmovntq_"
>> >[
>> > (set (match_operand: 0 "s_register_operand" "=w")
>> >   (unspec: [(match_operand: 1 
>> > "s_register_operand" "0")
>> > @@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_"
>> >[(set_attr "type" "mve_move")
>> >  ])
>> >
>> > +(define_insn "@mve_vec_pack_trunc_"
>> > + [(set (match_operand: 0 "register_operand" "=")
>> > +   (vec_concat:
>> > + (truncate:
>> > + (match_operand:MVE_5 1 "register_operand" "w"))
>> > + (truncate:
>> > + (match_operand:MVE_5 2 "register_operand" "w"]
>> > + "TARGET_HAVE_MVE"
>> > + "vmovnb.i%q0, %q1\;vmovnt.i   %q0, %q2"
>> > +  [(set_attr "type" "mve_move")]
>> > +)
>> > +
>>
>> I realise this is (like you say) based on the neon.md pattern,
>> but we should use separate vmovnb and 

Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns

2021-06-10 Thread Christophe Lyon via Gcc-patches
On Thu, 10 Jun 2021 at 11:50, Christophe Lyon
 wrote:
>
> On Tue, 8 Jun 2021 at 14:10, Richard Sandiford
>  wrote:
> >
> > Christophe Lyon  writes:
> > > This patch adds vec_unpack_hi_, vec_unpack_lo_,
> > > vec_pack_trunc_ patterns for MVE.
> > >
> > > It does so by moving the unpack patterns from neon.md to
> > > vec-common.md, while adding them support for MVE. The pack expander is
> > > derived from the Neon one (which in turn is renamed into
> > > neon_quad_vec_pack_trunc_).
> > >
> > > The patch introduces mve_vec_pack_trunc_ to avoid the need for a
> > > zero-initialized temporary, which is needed if the
> > > vec_pack_trunc_ expander calls @mve_vmovn[bt]q_
> > > instead.
> > >
> > > With this patch, we can now vectorize the 16 and 8-bit versions of
> > > vclz and vshl, although the generated code could still be improved.
> > > For test_clz_s16, we now generate
> > > vldrh.16q3, [r1]
> > > vmovlb.s16   q2, q3
> > > vmovlt.s16   q3, q3
> > > vclz.i32  q2, q2
> > > vclz.i32  q3, q3
> > > vmovnb.i32  q1, q2
> > > vmovnt.i32  q1, q3
> > > vstrh.16q1, [r0]
> > > which could be improved to
> > > vldrh.16q3, [r1]
> > >   vclz.i16q1, q3
> > > vstrh.16q1, [r0]
> > > if we could avoid the need for unpack/pack steps.
> >
> > Yeah, there was a PR about fixing this for popcount.  I guess the same
> > approach would apply here too.
> >
> > > For reference, clang-12 generates:
> > >   vldrh.s32   q0, [r1]
> > >   vldrh.s32   q1, [r1, #8]
> > >   vclz.i32q0, q0
> > >   vstrh.32q0, [r0]
> > >   vclz.i32q0, q1
> > >   vstrh.32q0, [r0, #8]
> > >
> > > 2021-06-03  Christophe Lyon  
> > >
> > >   gcc/
> > >   * config/arm/mve.md (mve_vmovltq_): Prefix with '@'.
> > >   (mve_vmovlbq_): Likewise.
> > >   (mve_vmovnbq_): Likewise.
> > >   (mve_vmovntq_): Likewise.
> > >   (@mve_vec_pack_trunc_): New pattern.
> > >   * config/arm/neon.md (vec_unpack_hi_): Move to
> > >   vec-common.md.
> > >   (vec_unpack_lo_): Likewise.
> > >   (vec_pack_trunc_): Rename to
> > >   neon_quad_vec_pack_trunc_.
> > >   * config/arm/vec-common.md (vec_unpack_hi_): New
> > >   pattern.
> > >   (vec_unpack_lo_): New.
> > >   (vec_pack_trunc_): New.
> > >
> > >   gcc/testsuite/
> > >   * gcc.target/arm/simd/mve-vclz.c: Update expected results.
> > >   * gcc.target/arm/simd/mve-vshl.c: Likewise.
> > > ---
> > >  gcc/config/arm/mve.md| 20 -
> > >  gcc/config/arm/neon.md   | 39 +
> > >  gcc/config/arm/vec-common.md | 89 
> > >  gcc/testsuite/gcc.target/arm/simd/mve-vclz.c |  7 +-
> > >  gcc/testsuite/gcc.target/arm/simd/mve-vshl.c |  5 +-
> > >  5 files changed, 114 insertions(+), 46 deletions(-)
> > >
> > > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> > > index 99e46d0bc69..b18292c07d3 100644
> > > --- a/gcc/config/arm/mve.md
> > > +++ b/gcc/config/arm/mve.md
> > > @@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_"
> > >  ;;
> > >  ;; [vmovltq_u, vmovltq_s])
> > >  ;;
> > > -(define_insn "mve_vmovltq_"
> > > +(define_insn "@mve_vmovltq_"
> > >[
> > > (set (match_operand: 0 "s_register_operand" "=w")
> > >   (unspec: [(match_operand:MVE_3 1 
> > > "s_register_operand" "w")]
> > > @@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_"
> > >  ;;
> > >  ;; [vmovlbq_s, vmovlbq_u])
> > >  ;;
> > > -(define_insn "mve_vmovlbq_"
> > > +(define_insn "@mve_vmovlbq_"
> > >[
> > > (set (match_operand: 0 "s_register_operand" "=w")
> > >   (unspec: [(match_operand:MVE_3 1 
> > > "s_register_operand" "w")]
> > > @@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s"
> > >  ;;
> > >  ;; [vmovnbq_u, vmovnbq_s])
> > >  ;;
> > > -(define_insn "mve_vmovnbq_"
> > > +(define_insn "@mve_vmovnbq_"
> > >[
> > > (set (match_operand: 0 "s_register_operand" "=w")
> > >   (unspec: [(match_operand: 1 
> > > "s_register_operand" "0")
> > > @@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_"
> > >  ;;
> > >  ;; [vmovntq_s, vmovntq_u])
> > >  ;;
> > > -(define_insn "mve_vmovntq_"
> > > +(define_insn "@mve_vmovntq_"
> > >[
> > > (set (match_operand: 0 "s_register_operand" "=w")
> > >   (unspec: [(match_operand: 1 
> > > "s_register_operand" "0")
> > > @@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_"
> > >[(set_attr "type" "mve_move")
> > >  ])
> > >
> > > +(define_insn "@mve_vec_pack_trunc_"
> > > + [(set (match_operand: 0 "register_operand" "=")
> > > +   (vec_concat:
> > > + (truncate:
> > > + (match_operand:MVE_5 1 "register_operand" "w"))
> > > + (truncate:
> > > + (match_operand:MVE_5 2 "register_operand" "w"]
> > > + "TARGET_HAVE_MVE"
> > > + "vmovnb.i%q0, %q1\;vmovnt.i   %q0, %q2"
> > > + 

Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns

2021-06-10 Thread Christophe Lyon via Gcc-patches
On Tue, 8 Jun 2021 at 14:10, Richard Sandiford
 wrote:
>
> Christophe Lyon  writes:
> > This patch adds vec_unpack_hi_, vec_unpack_lo_,
> > vec_pack_trunc_ patterns for MVE.
> >
> > It does so by moving the unpack patterns from neon.md to
> > vec-common.md, while adding them support for MVE. The pack expander is
> > derived from the Neon one (which in turn is renamed into
> > neon_quad_vec_pack_trunc_).
> >
> > The patch introduces mve_vec_pack_trunc_ to avoid the need for a
> > zero-initialized temporary, which is needed if the
> > vec_pack_trunc_ expander calls @mve_vmovn[bt]q_
> > instead.
> >
> > With this patch, we can now vectorize the 16 and 8-bit versions of
> > vclz and vshl, although the generated code could still be improved.
> > For test_clz_s16, we now generate
> > vldrh.16q3, [r1]
> > vmovlb.s16   q2, q3
> > vmovlt.s16   q3, q3
> > vclz.i32  q2, q2
> > vclz.i32  q3, q3
> > vmovnb.i32  q1, q2
> > vmovnt.i32  q1, q3
> > vstrh.16q1, [r0]
> > which could be improved to
> > vldrh.16q3, [r1]
> >   vclz.i16q1, q3
> > vstrh.16q1, [r0]
> > if we could avoid the need for unpack/pack steps.
>
> Yeah, there was a PR about fixing this for popcount.  I guess the same
> approach would apply here too.
>
> > For reference, clang-12 generates:
> >   vldrh.s32   q0, [r1]
> >   vldrh.s32   q1, [r1, #8]
> >   vclz.i32q0, q0
> >   vstrh.32q0, [r0]
> >   vclz.i32q0, q1
> >   vstrh.32q0, [r0, #8]
> >
> > 2021-06-03  Christophe Lyon  
> >
> >   gcc/
> >   * config/arm/mve.md (mve_vmovltq_): Prefix with '@'.
> >   (mve_vmovlbq_): Likewise.
> >   (mve_vmovnbq_): Likewise.
> >   (mve_vmovntq_): Likewise.
> >   (@mve_vec_pack_trunc_): New pattern.
> >   * config/arm/neon.md (vec_unpack_hi_): Move to
> >   vec-common.md.
> >   (vec_unpack_lo_): Likewise.
> >   (vec_pack_trunc_): Rename to
> >   neon_quad_vec_pack_trunc_.
> >   * config/arm/vec-common.md (vec_unpack_hi_): New
> >   pattern.
> >   (vec_unpack_lo_): New.
> >   (vec_pack_trunc_): New.
> >
> >   gcc/testsuite/
> >   * gcc.target/arm/simd/mve-vclz.c: Update expected results.
> >   * gcc.target/arm/simd/mve-vshl.c: Likewise.
> > ---
> >  gcc/config/arm/mve.md| 20 -
> >  gcc/config/arm/neon.md   | 39 +
> >  gcc/config/arm/vec-common.md | 89 
> >  gcc/testsuite/gcc.target/arm/simd/mve-vclz.c |  7 +-
> >  gcc/testsuite/gcc.target/arm/simd/mve-vshl.c |  5 +-
> >  5 files changed, 114 insertions(+), 46 deletions(-)
> >
> > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> > index 99e46d0bc69..b18292c07d3 100644
> > --- a/gcc/config/arm/mve.md
> > +++ b/gcc/config/arm/mve.md
> > @@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_"
> >  ;;
> >  ;; [vmovltq_u, vmovltq_s])
> >  ;;
> > -(define_insn "mve_vmovltq_"
> > +(define_insn "@mve_vmovltq_"
> >[
> > (set (match_operand: 0 "s_register_operand" "=w")
> >   (unspec: [(match_operand:MVE_3 1 "s_register_operand" 
> > "w")]
> > @@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_"
> >  ;;
> >  ;; [vmovlbq_s, vmovlbq_u])
> >  ;;
> > -(define_insn "mve_vmovlbq_"
> > +(define_insn "@mve_vmovlbq_"
> >[
> > (set (match_operand: 0 "s_register_operand" "=w")
> >   (unspec: [(match_operand:MVE_3 1 "s_register_operand" 
> > "w")]
> > @@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s"
> >  ;;
> >  ;; [vmovnbq_u, vmovnbq_s])
> >  ;;
> > -(define_insn "mve_vmovnbq_"
> > +(define_insn "@mve_vmovnbq_"
> >[
> > (set (match_operand: 0 "s_register_operand" "=w")
> >   (unspec: [(match_operand: 1 
> > "s_register_operand" "0")
> > @@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_"
> >  ;;
> >  ;; [vmovntq_s, vmovntq_u])
> >  ;;
> > -(define_insn "mve_vmovntq_"
> > +(define_insn "@mve_vmovntq_"
> >[
> > (set (match_operand: 0 "s_register_operand" "=w")
> >   (unspec: [(match_operand: 1 
> > "s_register_operand" "0")
> > @@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_"
> >[(set_attr "type" "mve_move")
> >  ])
> >
> > +(define_insn "@mve_vec_pack_trunc_"
> > + [(set (match_operand: 0 "register_operand" "=")
> > +   (vec_concat:
> > + (truncate:
> > + (match_operand:MVE_5 1 "register_operand" "w"))
> > + (truncate:
> > + (match_operand:MVE_5 2 "register_operand" "w"]
> > + "TARGET_HAVE_MVE"
> > + "vmovnb.i%q0, %q1\;vmovnt.i   %q0, %q2"
> > +  [(set_attr "type" "mve_move")]
> > +)
> > +
>
> I realise this is (like you say) based on the neon.md pattern,
> but we should use separate vmovnb and vmovnt instructions instead
> of putting two instructions into a single pattern.
>
> One specific advantage to using separate patterns is that it would
> avoid the 

Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns

2021-06-08 Thread Richard Sandiford via Gcc-patches
Christophe Lyon  writes:
> This patch adds vec_unpack_hi_, vec_unpack_lo_,
> vec_pack_trunc_ patterns for MVE.
>
> It does so by moving the unpack patterns from neon.md to
> vec-common.md, while adding them support for MVE. The pack expander is
> derived from the Neon one (which in turn is renamed into
> neon_quad_vec_pack_trunc_).
>
> The patch introduces mve_vec_pack_trunc_ to avoid the need for a
> zero-initialized temporary, which is needed if the
> vec_pack_trunc_ expander calls @mve_vmovn[bt]q_
> instead.
>
> With this patch, we can now vectorize the 16 and 8-bit versions of
> vclz and vshl, although the generated code could still be improved.
> For test_clz_s16, we now generate
> vldrh.16q3, [r1]
> vmovlb.s16   q2, q3
> vmovlt.s16   q3, q3
> vclz.i32  q2, q2
> vclz.i32  q3, q3
> vmovnb.i32  q1, q2
> vmovnt.i32  q1, q3
> vstrh.16q1, [r0]
> which could be improved to
> vldrh.16q3, [r1]
>   vclz.i16q1, q3
> vstrh.16q1, [r0]
> if we could avoid the need for unpack/pack steps.

Yeah, there was a PR about fixing this for popcount.  I guess the same
approach would apply here too.

> For reference, clang-12 generates:
>   vldrh.s32   q0, [r1]
>   vldrh.s32   q1, [r1, #8]
>   vclz.i32q0, q0
>   vstrh.32q0, [r0]
>   vclz.i32q0, q1
>   vstrh.32q0, [r0, #8]
>
> 2021-06-03  Christophe Lyon  
>
>   gcc/
>   * config/arm/mve.md (mve_vmovltq_): Prefix with '@'.
>   (mve_vmovlbq_): Likewise.
>   (mve_vmovnbq_): Likewise.
>   (mve_vmovntq_): Likewise.
>   (@mve_vec_pack_trunc_): New pattern.
>   * config/arm/neon.md (vec_unpack_hi_): Move to
>   vec-common.md.
>   (vec_unpack_lo_): Likewise.
>   (vec_pack_trunc_): Rename to
>   neon_quad_vec_pack_trunc_.
>   * config/arm/vec-common.md (vec_unpack_hi_): New
>   pattern.
>   (vec_unpack_lo_): New.
>   (vec_pack_trunc_): New.
>
>   gcc/testsuite/
>   * gcc.target/arm/simd/mve-vclz.c: Update expected results.
>   * gcc.target/arm/simd/mve-vshl.c: Likewise.
> ---
>  gcc/config/arm/mve.md| 20 -
>  gcc/config/arm/neon.md   | 39 +
>  gcc/config/arm/vec-common.md | 89 
>  gcc/testsuite/gcc.target/arm/simd/mve-vclz.c |  7 +-
>  gcc/testsuite/gcc.target/arm/simd/mve-vshl.c |  5 +-
>  5 files changed, 114 insertions(+), 46 deletions(-)
>
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index 99e46d0bc69..b18292c07d3 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_"
>  ;;
>  ;; [vmovltq_u, vmovltq_s])
>  ;;
> -(define_insn "mve_vmovltq_"
> +(define_insn "@mve_vmovltq_"
>[
> (set (match_operand: 0 "s_register_operand" "=w")
>   (unspec: [(match_operand:MVE_3 1 "s_register_operand" 
> "w")]
> @@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_"
>  ;;
>  ;; [vmovlbq_s, vmovlbq_u])
>  ;;
> -(define_insn "mve_vmovlbq_"
> +(define_insn "@mve_vmovlbq_"
>[
> (set (match_operand: 0 "s_register_operand" "=w")
>   (unspec: [(match_operand:MVE_3 1 "s_register_operand" 
> "w")]
> @@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s"
>  ;;
>  ;; [vmovnbq_u, vmovnbq_s])
>  ;;
> -(define_insn "mve_vmovnbq_"
> +(define_insn "@mve_vmovnbq_"
>[
> (set (match_operand: 0 "s_register_operand" "=w")
>   (unspec: [(match_operand: 1 
> "s_register_operand" "0")
> @@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_"
>  ;;
>  ;; [vmovntq_s, vmovntq_u])
>  ;;
> -(define_insn "mve_vmovntq_"
> +(define_insn "@mve_vmovntq_"
>[
> (set (match_operand: 0 "s_register_operand" "=w")
>   (unspec: [(match_operand: 1 
> "s_register_operand" "0")
> @@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_"
>[(set_attr "type" "mve_move")
>  ])
>  
> +(define_insn "@mve_vec_pack_trunc_"
> + [(set (match_operand: 0 "register_operand" "=")
> +   (vec_concat:
> + (truncate:
> + (match_operand:MVE_5 1 "register_operand" "w"))
> + (truncate:
> + (match_operand:MVE_5 2 "register_operand" "w"]
> + "TARGET_HAVE_MVE"
> + "vmovnb.i%q0, %q1\;vmovnt.i   %q0, %q2"
> +  [(set_attr "type" "mve_move")]
> +)
> +

I realise this is (like you say) based on the neon.md pattern,
but we should use separate vmovnb and vmovnt instructions instead
of putting two instructions into a single pattern.

One specific advantage to using separate patterns is that it would
avoid the imprecision of the earlyclobber: the output only conflicts
with operand 1 and can be tied to operand 2.

>  ;;
>  ;; [vmulq_f])
>  ;;
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index 0fdffaf4ec4..392d9607919 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -5924,43 +5924,6 @@ 

[PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns

2021-06-07 Thread Christophe Lyon via Gcc-patches
This patch adds vec_unpack_hi_, vec_unpack_lo_,
vec_pack_trunc_ patterns for MVE.

It does so by moving the unpack patterns from neon.md to
vec-common.md, while adding them support for MVE. The pack expander is
derived from the Neon one (which in turn is renamed into
neon_quad_vec_pack_trunc_).

The patch introduces mve_vec_pack_trunc_ to avoid the need for a
zero-initialized temporary, which is needed if the
vec_pack_trunc_ expander calls @mve_vmovn[bt]q_
instead.

With this patch, we can now vectorize the 16 and 8-bit versions of
vclz and vshl, although the generated code could still be improved.
For test_clz_s16, we now generate
vldrh.16q3, [r1]
vmovlb.s16   q2, q3
vmovlt.s16   q3, q3
vclz.i32  q2, q2
vclz.i32  q3, q3
vmovnb.i32  q1, q2
vmovnt.i32  q1, q3
vstrh.16q1, [r0]
which could be improved to
vldrh.16q3, [r1]
vclz.i16q1, q3
vstrh.16q1, [r0]
if we could avoid the need for unpack/pack steps.

For reference, clang-12 generates:
vldrh.s32   q0, [r1]
vldrh.s32   q1, [r1, #8]
vclz.i32q0, q0
vstrh.32q0, [r0]
vclz.i32q0, q1
vstrh.32q0, [r0, #8]

2021-06-03  Christophe Lyon  

gcc/
* config/arm/mve.md (mve_vmovltq_): Prefix with '@'.
(mve_vmovlbq_): Likewise.
(mve_vmovnbq_): Likewise.
(mve_vmovntq_): Likewise.
(@mve_vec_pack_trunc_): New pattern.
* config/arm/neon.md (vec_unpack_hi_): Move to
vec-common.md.
(vec_unpack_lo_): Likewise.
(vec_pack_trunc_): Rename to
neon_quad_vec_pack_trunc_.
* config/arm/vec-common.md (vec_unpack_hi_): New
pattern.
(vec_unpack_lo_): New.
(vec_pack_trunc_): New.

gcc/testsuite/
* gcc.target/arm/simd/mve-vclz.c: Update expected results.
* gcc.target/arm/simd/mve-vshl.c: Likewise.
---
 gcc/config/arm/mve.md| 20 -
 gcc/config/arm/neon.md   | 39 +
 gcc/config/arm/vec-common.md | 89 
 gcc/testsuite/gcc.target/arm/simd/mve-vclz.c |  7 +-
 gcc/testsuite/gcc.target/arm/simd/mve-vshl.c |  5 +-
 5 files changed, 114 insertions(+), 46 deletions(-)

diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 99e46d0bc69..b18292c07d3 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_"
 ;;
 ;; [vmovltq_u, vmovltq_s])
 ;;
-(define_insn "mve_vmovltq_"
+(define_insn "@mve_vmovltq_"
   [
(set (match_operand: 0 "s_register_operand" "=w")
(unspec: [(match_operand:MVE_3 1 "s_register_operand" 
"w")]
@@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_"
 ;;
 ;; [vmovlbq_s, vmovlbq_u])
 ;;
-(define_insn "mve_vmovlbq_"
+(define_insn "@mve_vmovlbq_"
   [
(set (match_operand: 0 "s_register_operand" "=w")
(unspec: [(match_operand:MVE_3 1 "s_register_operand" 
"w")]
@@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s"
 ;;
 ;; [vmovnbq_u, vmovnbq_s])
 ;;
-(define_insn "mve_vmovnbq_"
+(define_insn "@mve_vmovnbq_"
   [
(set (match_operand: 0 "s_register_operand" "=w")
(unspec: [(match_operand: 1 
"s_register_operand" "0")
@@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_"
 ;;
 ;; [vmovntq_s, vmovntq_u])
 ;;
-(define_insn "mve_vmovntq_"
+(define_insn "@mve_vmovntq_"
   [
(set (match_operand: 0 "s_register_operand" "=w")
(unspec: [(match_operand: 1 
"s_register_operand" "0")
@@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_"
   [(set_attr "type" "mve_move")
 ])
 
+(define_insn "@mve_vec_pack_trunc_"
+ [(set (match_operand: 0 "register_operand" "=")
+   (vec_concat:
+   (truncate:
+   (match_operand:MVE_5 1 "register_operand" "w"))
+   (truncate:
+   (match_operand:MVE_5 2 "register_operand" "w"]
+ "TARGET_HAVE_MVE"
+ "vmovnb.i  %q0, %q1\;vmovnt.i   %q0, %q2"
+  [(set_attr "type" "mve_move")]
+)
+
 ;;
 ;; [vmulq_f])
 ;;
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 0fdffaf4ec4..392d9607919 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -5924,43 +5924,6 @@ (define_insn "neon_vec_unpack_hi_"
   [(set_attr "type" "neon_shift_imm_long")]
 )
 
-(define_expand "vec_unpack_hi_"
-  [(match_operand: 0 "register_operand")
-   (SE: (match_operand:VU 1 "register_operand"))]
- "TARGET_NEON && !BYTES_BIG_ENDIAN"
-  {
-   rtvec v = rtvec_alloc (/2)  ;
-   rtx t1;
-   int i;
-   for (i = 0; i < (/2); i++)
- RTVEC_ELT (v, i) = GEN_INT ((/2) + i);
-  
-   t1 = gen_rtx_PARALLEL (mode, v);
-   emit_insn (gen_neon_vec_unpack_hi_ (operands[0], 
- operands[1], 
-t1));
-   DONE;
-  }
-)
-
-(define_expand "vec_unpack_lo_"
-  [(match_operand: 0 "register_operand")
-