Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns
On Mon, 14 Jun 2021 at 18:01, Richard Sandiford wrote: > > Christophe Lyon writes: > > On Fri, 11 Jun 2021 at 18:25, Richard Sandiford > > wrote: > >> > >> Christophe Lyon writes: > >> > Thanks for the feedback. How about v2 attached? > >> > Do you want me to merge neon_vec_unpack and > >> > mve_vec_unpack and only have different assembly? > >> > if (TARGET_HAVE_MVE) > >> > return "vmovlb.%# %q0, %q1"; > >> > else > >> > return "vmovlb.%# %q0, %q1"; > >> > >> I think it'd be better to keep them separate, given that they have > >> different attributes. > >> > >> > @@ -2199,10 +,23 @@ (define_insn "mve_vmovnbq_" > >> >[(set_attr "type" "mve_move") > >> > ]) > >> > > >> > +;; vmovnb pattern used by the vec_pack_trunc expander to avoid the > >> > +;; need for an extra register. > >> > >> “to avoid the need for an uninitailized input operand” might > >> be clearer. > >> > > Done > > > >> > +(define_insn "@mve_vec_pack_trunc_lo_" > >> > + [ > >> > + (set (match_operand: 0 "s_register_operand" "=w") > >> > + (unspec: [(match_operand:MVE_5 1 > >> > "s_register_operand" "w")] > >> > + VMOVNBQ_S)) > >> > + ] > >> > + "TARGET_HAVE_MVE" > >> > + "vmovnb.i%# %q0, %q1" > >> > + [(set_attr "type" "mve_move") > >> > +]) > >> > + > >> > […] > >> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md > >> > index 430a92ce966..3afb3e1d891 100644 > >> > --- a/gcc/config/arm/vec-common.md > >> > +++ b/gcc/config/arm/vec-common.md > >> > @@ -632,3 +632,91 @@ (define_expand "clz2" > >> >"ARM_HAVE__ARITH > >> > && !TARGET_REALLY_IWMMXT" > >> > ) > >> > + > >> > +;; vmovl[tb] are not available for V4SI on MVE > >> > +(define_expand "vec_unpack_hi_" > >> > + [(match_operand: 0 "register_operand") > >> > + (SE: (match_operand:VU 1 "register_operand"))] > >> > + "ARM_HAVE__ARITH > >> > + && !TARGET_REALLY_IWMMXT > >> > + && ! (mode == V4SImode && TARGET_HAVE_MVE) > >> > + && !BYTES_BIG_ENDIAN" > >> > + { > >> > +rtvec v = rtvec_alloc (/2); > >> > +rtx t1; > >> > +int i; > >> > +for (i = 0; i < (/2); i++) > >> > + RTVEC_ELT (v, i) = GEN_INT ((/2) + i); > >> > + > >> > +t1 = gen_rtx_PARALLEL (mode, v); > >> > + > >> > +if (TARGET_NEON) > >> > + emit_insn (gen_neon_vec_unpack_hi_ (operands[0], > >> > + operands[1], > >> > + t1)); > >> > +else > >> > + emit_insn (gen_mve_vec_unpack_hi_ (operands[0], > >> > +operands[1], > >> > +t1)); > >> > +DONE; > >> > + } > >> > +) > >> > >> Since the patterns are the same for both MVE and Neon (a good thing!), > >> it would be simpler to write this as: > >> > >> (define_insn "mve_vec_unpack_lo_" > > I guess you mean define_expand? > > Yes, sorry. > > >> [(set (match_operand: 0 "register_operand" "=w") > >> (SE: (vec_select: > >> (match_operand:VU 1 "register_operand" "w") > >> (match_dup 2] > >> … > >> > >> and then set operands[2] to what is t1 above. There would then be > >> no need to call emit_insn & DONE, and we could use the natural > >> iterator (MVE_3) for the MVE patterns. > >> > >> Same idea for the lo version. > >> > >> > […] > >> > +;; vmovn[tb] are not available for V2DI on MVE > >> > +(define_expand "vec_pack_trunc_" > >> > + [(set (match_operand: 0 "register_operand") > >> > + (vec_concat: > >> > + (truncate: > >> > + (match_operand:VN 1 "register_operand")) > >> > + (truncate: > >> > + (match_operand:VN 2 "register_operand"] > >> > + "ARM_HAVE__ARITH > >> > + && !TARGET_REALLY_IWMMXT > >> > + && ! (mode == V2DImode && TARGET_HAVE_MVE) > >> > + && !BYTES_BIG_ENDIAN" > >> > + { > >> > + if (TARGET_NEON) > >> > + { > >> > + emit_insn (gen_neon_quad_vec_pack_trunc_ (operands[0], > >> > operands[1], > >> > +operands[2])); > >> > + } > >> > + else > >> > + { > >> > + rtx tmpreg = gen_reg_rtx (mode); > >> > + emit_insn (gen_mve_vec_pack_trunc_lo (mode, tmpreg, > >> > operands[1])); > >> > + emit_insn (gen_mve_vmovntq (VMOVNTQ_S, mode, > >> > +tmpreg, tmpreg, operands[2])); > >> > + emit_move_insn (operands[0], tmpreg); > >> > >> vmovntq can assign directly to operands[0], without a separate move. > >> > >> OK with those changes, thanks. > > > > Just to be sure, here is v3 which hopefully matches what you meant. > > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md > > index 430a92ce966..1d6d4379855 100644 > > --- a/gcc/config/arm/vec-common.md > > +++ b/gcc/config/arm/vec-common.md > > @@ -632,3 +632,78 @@ (define_expand "clz2" > >"ARM_HAVE__ARITH > > &&
Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns
Christophe Lyon writes: > On Fri, 11 Jun 2021 at 18:25, Richard Sandiford > wrote: >> >> Christophe Lyon writes: >> > Thanks for the feedback. How about v2 attached? >> > Do you want me to merge neon_vec_unpack and >> > mve_vec_unpack and only have different assembly? >> > if (TARGET_HAVE_MVE) >> > return "vmovlb.%# %q0, %q1"; >> > else >> > return "vmovlb.%# %q0, %q1"; >> >> I think it'd be better to keep them separate, given that they have >> different attributes. >> >> > @@ -2199,10 +,23 @@ (define_insn "mve_vmovnbq_" >> >[(set_attr "type" "mve_move") >> > ]) >> > >> > +;; vmovnb pattern used by the vec_pack_trunc expander to avoid the >> > +;; need for an extra register. >> >> “to avoid the need for an uninitailized input operand” might >> be clearer. >> > Done > >> > +(define_insn "@mve_vec_pack_trunc_lo_" >> > + [ >> > + (set (match_operand: 0 "s_register_operand" "=w") >> > + (unspec: [(match_operand:MVE_5 1 "s_register_operand" >> > "w")] >> > + VMOVNBQ_S)) >> > + ] >> > + "TARGET_HAVE_MVE" >> > + "vmovnb.i%# %q0, %q1" >> > + [(set_attr "type" "mve_move") >> > +]) >> > + >> > […] >> > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md >> > index 430a92ce966..3afb3e1d891 100644 >> > --- a/gcc/config/arm/vec-common.md >> > +++ b/gcc/config/arm/vec-common.md >> > @@ -632,3 +632,91 @@ (define_expand "clz2" >> >"ARM_HAVE__ARITH >> > && !TARGET_REALLY_IWMMXT" >> > ) >> > + >> > +;; vmovl[tb] are not available for V4SI on MVE >> > +(define_expand "vec_unpack_hi_" >> > + [(match_operand: 0 "register_operand") >> > + (SE: (match_operand:VU 1 "register_operand"))] >> > + "ARM_HAVE__ARITH >> > + && !TARGET_REALLY_IWMMXT >> > + && ! (mode == V4SImode && TARGET_HAVE_MVE) >> > + && !BYTES_BIG_ENDIAN" >> > + { >> > +rtvec v = rtvec_alloc (/2); >> > +rtx t1; >> > +int i; >> > +for (i = 0; i < (/2); i++) >> > + RTVEC_ELT (v, i) = GEN_INT ((/2) + i); >> > + >> > +t1 = gen_rtx_PARALLEL (mode, v); >> > + >> > +if (TARGET_NEON) >> > + emit_insn (gen_neon_vec_unpack_hi_ (operands[0], >> > + operands[1], >> > + t1)); >> > +else >> > + emit_insn (gen_mve_vec_unpack_hi_ (operands[0], >> > +operands[1], >> > +t1)); >> > +DONE; >> > + } >> > +) >> >> Since the patterns are the same for both MVE and Neon (a good thing!), >> it would be simpler to write this as: >> >> (define_insn "mve_vec_unpack_lo_" > I guess you mean define_expand? Yes, sorry. >> [(set (match_operand: 0 "register_operand" "=w") >> (SE: (vec_select: >> (match_operand:VU 1 "register_operand" "w") >> (match_dup 2] >> … >> >> and then set operands[2] to what is t1 above. There would then be >> no need to call emit_insn & DONE, and we could use the natural >> iterator (MVE_3) for the MVE patterns. >> >> Same idea for the lo version. >> >> > […] >> > +;; vmovn[tb] are not available for V2DI on MVE >> > +(define_expand "vec_pack_trunc_" >> > + [(set (match_operand: 0 "register_operand") >> > + (vec_concat: >> > + (truncate: >> > + (match_operand:VN 1 "register_operand")) >> > + (truncate: >> > + (match_operand:VN 2 "register_operand"] >> > + "ARM_HAVE__ARITH >> > + && !TARGET_REALLY_IWMMXT >> > + && ! (mode == V2DImode && TARGET_HAVE_MVE) >> > + && !BYTES_BIG_ENDIAN" >> > + { >> > + if (TARGET_NEON) >> > + { >> > + emit_insn (gen_neon_quad_vec_pack_trunc_ (operands[0], >> > operands[1], >> > +operands[2])); >> > + } >> > + else >> > + { >> > + rtx tmpreg = gen_reg_rtx (mode); >> > + emit_insn (gen_mve_vec_pack_trunc_lo (mode, tmpreg, >> > operands[1])); >> > + emit_insn (gen_mve_vmovntq (VMOVNTQ_S, mode, >> > +tmpreg, tmpreg, operands[2])); >> > + emit_move_insn (operands[0], tmpreg); >> >> vmovntq can assign directly to operands[0], without a separate move. >> >> OK with those changes, thanks. > > Just to be sure, here is v3 which hopefully matches what you meant. > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md > index 430a92ce966..1d6d4379855 100644 > --- a/gcc/config/arm/vec-common.md > +++ b/gcc/config/arm/vec-common.md > @@ -632,3 +632,78 @@ (define_expand "clz2" >"ARM_HAVE__ARITH > && !TARGET_REALLY_IWMMXT" > ) > + > +;; vmovl[tb] are not available for V4SI on MVE > +(define_expand "vec_unpack_hi_" > + [(set (match_operand: 0 "register_operand") > + (SE: (vec_select: > + (match_operand:VU 1 "register_operand") > + (match_dup 2] The formatting of the vec_unpack_lo_
Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns
On Fri, 11 Jun 2021 at 18:25, Richard Sandiford wrote: > > Christophe Lyon writes: > > Thanks for the feedback. How about v2 attached? > > Do you want me to merge neon_vec_unpack and > > mve_vec_unpack and only have different assembly? > > if (TARGET_HAVE_MVE) > > return "vmovlb.%# %q0, %q1"; > > else > > return "vmovlb.%# %q0, %q1"; > > I think it'd be better to keep them separate, given that they have > different attributes. > > > @@ -2199,10 +,23 @@ (define_insn "mve_vmovnbq_" > >[(set_attr "type" "mve_move") > > ]) > > > > +;; vmovnb pattern used by the vec_pack_trunc expander to avoid the > > +;; need for an extra register. > > “to avoid the need for an uninitailized input operand” might > be clearer. > Done > > +(define_insn "@mve_vec_pack_trunc_lo_" > > + [ > > + (set (match_operand: 0 "s_register_operand" "=w") > > + (unspec: [(match_operand:MVE_5 1 "s_register_operand" > > "w")] > > + VMOVNBQ_S)) > > + ] > > + "TARGET_HAVE_MVE" > > + "vmovnb.i%# %q0, %q1" > > + [(set_attr "type" "mve_move") > > +]) > > + > > […] > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md > > index 430a92ce966..3afb3e1d891 100644 > > --- a/gcc/config/arm/vec-common.md > > +++ b/gcc/config/arm/vec-common.md > > @@ -632,3 +632,91 @@ (define_expand "clz2" > >"ARM_HAVE__ARITH > > && !TARGET_REALLY_IWMMXT" > > ) > > + > > +;; vmovl[tb] are not available for V4SI on MVE > > +(define_expand "vec_unpack_hi_" > > + [(match_operand: 0 "register_operand") > > + (SE: (match_operand:VU 1 "register_operand"))] > > + "ARM_HAVE__ARITH > > + && !TARGET_REALLY_IWMMXT > > + && ! (mode == V4SImode && TARGET_HAVE_MVE) > > + && !BYTES_BIG_ENDIAN" > > + { > > +rtvec v = rtvec_alloc (/2); > > +rtx t1; > > +int i; > > +for (i = 0; i < (/2); i++) > > + RTVEC_ELT (v, i) = GEN_INT ((/2) + i); > > + > > +t1 = gen_rtx_PARALLEL (mode, v); > > + > > +if (TARGET_NEON) > > + emit_insn (gen_neon_vec_unpack_hi_ (operands[0], > > + operands[1], > > + t1)); > > +else > > + emit_insn (gen_mve_vec_unpack_hi_ (operands[0], > > +operands[1], > > +t1)); > > +DONE; > > + } > > +) > > Since the patterns are the same for both MVE and Neon (a good thing!), > it would be simpler to write this as: > > (define_insn "mve_vec_unpack_lo_" I guess you mean define_expand? > [(set (match_operand: 0 "register_operand" "=w") > (SE: (vec_select: > (match_operand:VU 1 "register_operand" "w") > (match_dup 2] > … > > and then set operands[2] to what is t1 above. There would then be > no need to call emit_insn & DONE, and we could use the natural > iterator (MVE_3) for the MVE patterns. > > Same idea for the lo version. > > > […] > > +;; vmovn[tb] are not available for V2DI on MVE > > +(define_expand "vec_pack_trunc_" > > + [(set (match_operand: 0 "register_operand") > > + (vec_concat: > > + (truncate: > > + (match_operand:VN 1 "register_operand")) > > + (truncate: > > + (match_operand:VN 2 "register_operand"] > > + "ARM_HAVE__ARITH > > + && !TARGET_REALLY_IWMMXT > > + && ! (mode == V2DImode && TARGET_HAVE_MVE) > > + && !BYTES_BIG_ENDIAN" > > + { > > + if (TARGET_NEON) > > + { > > + emit_insn (gen_neon_quad_vec_pack_trunc_ (operands[0], > > operands[1], > > +operands[2])); > > + } > > + else > > + { > > + rtx tmpreg = gen_reg_rtx (mode); > > + emit_insn (gen_mve_vec_pack_trunc_lo (mode, tmpreg, > > operands[1])); > > + emit_insn (gen_mve_vmovntq (VMOVNTQ_S, mode, > > +tmpreg, tmpreg, operands[2])); > > + emit_move_insn (operands[0], tmpreg); > > vmovntq can assign directly to operands[0], without a separate move. > > OK with those changes, thanks. Just to be sure, here is v3 which hopefully matches what you meant. I updated the commit message and removed: < I did not prefix them with '@', because I couldn't find how to call < gen_mve_vec_unpack_[lo|hi] with as first parameter. Anyway, we < can keep the VU iterator instead of MVE_3 since the offending V4SI < mode is disabled in the expander. OK? Thanks Christophe > > Richard > > > + } > > + DONE; > > + } > > +) > > […] From 9c1afae67249cd8c67cedd5eb852361a4d89d0b5 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Thu, 3 Jun 2021 14:35:50 + Subject: [PATCH v3] arm: Auto-vectorization for MVE: add pack/unpack patterns This patch adds vec_unpack_hi_, vec_unpack_lo_, vec_pack_trunc_ patterns for MVE. It does so by moving the unpack patterns from neon.md to vec-common.md, while adding them support for MVE. The pack
Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns
Christophe Lyon writes: > Thanks for the feedback. How about v2 attached? > Do you want me to merge neon_vec_unpack and > mve_vec_unpack and only have different assembly? > if (TARGET_HAVE_MVE) > return "vmovlb.%# %q0, %q1"; > else > return "vmovlb.%# %q0, %q1"; I think it'd be better to keep them separate, given that they have different attributes. > @@ -2199,10 +,23 @@ (define_insn "mve_vmovnbq_" >[(set_attr "type" "mve_move") > ]) > > +;; vmovnb pattern used by the vec_pack_trunc expander to avoid the > +;; need for an extra register. “to avoid the need for an uninitailized input operand” might be clearer. > +(define_insn "@mve_vec_pack_trunc_lo_" > + [ > + (set (match_operand: 0 "s_register_operand" "=w") > + (unspec: [(match_operand:MVE_5 1 "s_register_operand" > "w")] > + VMOVNBQ_S)) > + ] > + "TARGET_HAVE_MVE" > + "vmovnb.i%# %q0, %q1" > + [(set_attr "type" "mve_move") > +]) > + > […] > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md > index 430a92ce966..3afb3e1d891 100644 > --- a/gcc/config/arm/vec-common.md > +++ b/gcc/config/arm/vec-common.md > @@ -632,3 +632,91 @@ (define_expand "clz2" >"ARM_HAVE__ARITH > && !TARGET_REALLY_IWMMXT" > ) > + > +;; vmovl[tb] are not available for V4SI on MVE > +(define_expand "vec_unpack_hi_" > + [(match_operand: 0 "register_operand") > + (SE: (match_operand:VU 1 "register_operand"))] > + "ARM_HAVE__ARITH > + && !TARGET_REALLY_IWMMXT > + && ! (mode == V4SImode && TARGET_HAVE_MVE) > + && !BYTES_BIG_ENDIAN" > + { > +rtvec v = rtvec_alloc (/2); > +rtx t1; > +int i; > +for (i = 0; i < (/2); i++) > + RTVEC_ELT (v, i) = GEN_INT ((/2) + i); > + > +t1 = gen_rtx_PARALLEL (mode, v); > + > +if (TARGET_NEON) > + emit_insn (gen_neon_vec_unpack_hi_ (operands[0], > + operands[1], > + t1)); > +else > + emit_insn (gen_mve_vec_unpack_hi_ (operands[0], > +operands[1], > +t1)); > +DONE; > + } > +) Since the patterns are the same for both MVE and Neon (a good thing!), it would be simpler to write this as: (define_insn "mve_vec_unpack_lo_" [(set (match_operand: 0 "register_operand" "=w") (SE: (vec_select: (match_operand:VU 1 "register_operand" "w") (match_dup 2] … and then set operands[2] to what is t1 above. There would then be no need to call emit_insn & DONE, and we could use the natural iterator (MVE_3) for the MVE patterns. Same idea for the lo version. > […] > +;; vmovn[tb] are not available for V2DI on MVE > +(define_expand "vec_pack_trunc_" > + [(set (match_operand: 0 "register_operand") > + (vec_concat: > + (truncate: > + (match_operand:VN 1 "register_operand")) > + (truncate: > + (match_operand:VN 2 "register_operand"] > + "ARM_HAVE__ARITH > + && !TARGET_REALLY_IWMMXT > + && ! (mode == V2DImode && TARGET_HAVE_MVE) > + && !BYTES_BIG_ENDIAN" > + { > + if (TARGET_NEON) > + { > + emit_insn (gen_neon_quad_vec_pack_trunc_ (operands[0], > operands[1], > +operands[2])); > + } > + else > + { > + rtx tmpreg = gen_reg_rtx (mode); > + emit_insn (gen_mve_vec_pack_trunc_lo (mode, tmpreg, > operands[1])); > + emit_insn (gen_mve_vmovntq (VMOVNTQ_S, mode, > +tmpreg, tmpreg, operands[2])); > + emit_move_insn (operands[0], tmpreg); vmovntq can assign directly to operands[0], without a separate move. OK with those changes, thanks. Richard > + } > + DONE; > + } > +) > […]
Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns
On Fri, 11 Jun 2021 at 12:20, Richard Sandiford wrote: > > Christophe Lyon writes: > > In the meantime, I tried to make some progress, and looked at how > > things work on aarch64. > > > > This led me to define (in mve.md): > > > > (define_insn "@mve_vec_pack_trunc_lo_" > > [(set (match_operand: 0 "register_operand" "=w") > >(truncate: (match_operand:MVE_5 1 "register_operand" > > "w")))] > > "TARGET_HAVE_MVE" > > "vmovnb.i %q0, %q1\;" > > [(set_attr "type" "mve_move")] > > ) > > > > (define_insn "@mve_vec_pack_trunc_hi_" > > [(set (match_operand: 0 "register_operand" "=w") > >(vec_concat: > > (match_operand: 1 "register_operand" "0") > > (truncate: > > (match_operand:MVE_5 2 "register_operand" "w"] > > "TARGET_HAVE_MVE" > > "vmovnt.i %q0, %q2\;" > > [(set_attr "type" "mve_move")] > > ) > > > > and in vec-common.md, for > > (define_expand "vec_pack_trunc_" > > [(set (match_operand: 0 "register_operand") > >(vec_concat: > > (truncate: > > (match_operand:VN 1 "register_operand")) > > (truncate: > > (match_operand:VN 2 "register_operand"] > > > > I expand this for MVE: > > rtx tmpreg = gen_reg_rtx (mode); > > emit_insn (gen_mve_vec_pack_trunc_lo (mode, tmpreg, > > operands[1])); > > emit_insn (gen_mve_vec_pack_trunc_hi (mode, operands[0], > > tmpreg, operands[2])); > > > > I am getting an error in reload: > > error: unable to generate reloads for: > > (insn 10 9 11 2 (set (reg:V4HI 122 [ vect__4.16 ]) > > (truncate:V4HI (reg:V4SI 118 [ vect__4.16 ]))) > > "/gcc.target/arm/simd/mve-vec-pack.c":17:112 3609 > > {mve_vec_pack_trunc_lo_v4si} > > (expr_list:REG_DEAD (reg:V4SI 118 [ vect__4.16 ]) > > (nil))) > > > > The next insn is: > > (insn 11 10 12 2 (set (reg:V8HI 121 [ vect__7.18 ]) > > (vec_concat:V8HI (reg:V4HI 122 [ vect__4.16 ]) > > (truncate:V4HI (reg:V4SI 119 [ vect__4.17 ] > > "/gcc.target/arm/simd/mve-vec-pack.c":17:112 3611 > > {mve_vec_pack_trunc_hi_v4si} > > (expr_list:REG_DEAD (reg:V4HI 122 [ vect__4.16 ]) > > (expr_list:REG_DEAD (reg:V4SI 119 [ vect__4.17 ]) > > (nil > > > > What is causing the reload error? > > For this to work on MVE, we'd need to allow the 64-bit V_narrow modes to be > stored in MVE registers. I'm not sure off-hand whether allowing that > would be a good idea or not. > > If we continue to allow only 128-bit vectors in MVE registers then > we'll need to continue to use unspecs instead of truncate and vec_concat. > Thanks for the feedback. How about v2 attached? Do you want me to merge neon_vec_unpack and mve_vec_unpack and only have different assembly? if (TARGET_HAVE_MVE) return "vmovlb.%# %q0, %q1"; else return "vmovlb.%# %q0, %q1"; > Thanks, > Richard From 5abc6196c934438421e293b6b4733a98ec80e982 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Thu, 3 Jun 2021 14:35:50 + Subject: [PATCH v2] arm: Auto-vectorization for MVE: add pack/unpack patterns This patch adds vec_unpack_hi_, vec_unpack_lo_, vec_pack_trunc_ patterns for MVE. It does so by moving the unpack patterns from neon.md to vec-common.md, while adding them support for MVE. The pack expander is derived from the Neon one (which in turn is renamed into neon_quad_vec_pack_trunc_). The patch introduces mve_vec_unpack_lo_ and mve_vec_unpack_hi_ which are similar to their Neon counterparts, except for the assembly syntax. I did not prefix them with '@', because I couldn't find how to call gen_mve_vec_unpack_[lo|hi] with as first parameter. Anyway, we can keep the VU iterator instead of MVE_3 since the offending V4SI mode is disabled in the expander. The patch introduces mve_vec_pack_trunc_lo_ to avoid the need for a zero-initialized temporary, which is needed if the vec_pack_trunc_ expander calls @mve_vmovn[bt]q_ instead. With this patch, we can now vectorize the 16 and 8-bit versions of vclz and vshl, although the generated code could still be improved. For test_clz_s16, we now generate vldrh.16q3, [r1] vmovlb.s16 q2, q3 vmovlt.s16 q3, q3 vclz.i32 q2, q2 vclz.i32 q3, q3 vmovnb.i32 q1, q2 vmovnt.i32 q1, q3 vstrh.16q1, [r0] which could be improved to vldrh.16q3, [r1] vclz.i16 q1, q3 vstrh.16q1, [r0] if we could avoid the need for unpack/pack steps. For reference, clang-12 generates: vldrh.s32 q0, [r1] vldrh.s32 q1, [r1, #8] vclz.i32q0, q0 vstrh.32q0, [r0] vclz.i32q0, q1 vstrh.32q0, [r0, #8] 2021-06-11 Christophe Lyon gcc/ * config/arm/mve.md (mve_vec_unpack_lo_): New pattern. (mve_vec_unpack_hi_): New pattern. (@mve_vec_pack_trunc_lo_): New pattern. (mve_vmovntq_): Prefix with '@'. * config/arm/neon.md (vec_unpack_hi_): Move to
Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns
Christophe Lyon writes: > In the meantime, I tried to make some progress, and looked at how > things work on aarch64. > > This led me to define (in mve.md): > > (define_insn "@mve_vec_pack_trunc_lo_" > [(set (match_operand: 0 "register_operand" "=w") >(truncate: (match_operand:MVE_5 1 "register_operand" "w")))] > "TARGET_HAVE_MVE" > "vmovnb.i %q0, %q1\;" > [(set_attr "type" "mve_move")] > ) > > (define_insn "@mve_vec_pack_trunc_hi_" > [(set (match_operand: 0 "register_operand" "=w") >(vec_concat: > (match_operand: 1 "register_operand" "0") > (truncate: > (match_operand:MVE_5 2 "register_operand" "w"] > "TARGET_HAVE_MVE" > "vmovnt.i %q0, %q2\;" > [(set_attr "type" "mve_move")] > ) > > and in vec-common.md, for > (define_expand "vec_pack_trunc_" > [(set (match_operand: 0 "register_operand") >(vec_concat: > (truncate: > (match_operand:VN 1 "register_operand")) > (truncate: > (match_operand:VN 2 "register_operand"] > > I expand this for MVE: > rtx tmpreg = gen_reg_rtx (mode); > emit_insn (gen_mve_vec_pack_trunc_lo (mode, tmpreg, operands[1])); > emit_insn (gen_mve_vec_pack_trunc_hi (mode, operands[0], > tmpreg, operands[2])); > > I am getting an error in reload: > error: unable to generate reloads for: > (insn 10 9 11 2 (set (reg:V4HI 122 [ vect__4.16 ]) > (truncate:V4HI (reg:V4SI 118 [ vect__4.16 ]))) > "/gcc.target/arm/simd/mve-vec-pack.c":17:112 3609 > {mve_vec_pack_trunc_lo_v4si} > (expr_list:REG_DEAD (reg:V4SI 118 [ vect__4.16 ]) > (nil))) > > The next insn is: > (insn 11 10 12 2 (set (reg:V8HI 121 [ vect__7.18 ]) > (vec_concat:V8HI (reg:V4HI 122 [ vect__4.16 ]) > (truncate:V4HI (reg:V4SI 119 [ vect__4.17 ] > "/gcc.target/arm/simd/mve-vec-pack.c":17:112 3611 > {mve_vec_pack_trunc_hi_v4si} > (expr_list:REG_DEAD (reg:V4HI 122 [ vect__4.16 ]) > (expr_list:REG_DEAD (reg:V4SI 119 [ vect__4.17 ]) > (nil > > What is causing the reload error? For this to work on MVE, we'd need to allow the 64-bit V_narrow modes to be stored in MVE registers. I'm not sure off-hand whether allowing that would be a good idea or not. If we continue to allow only 128-bit vectors in MVE registers then we'll need to continue to use unspecs instead of truncate and vec_concat. Thanks, Richard
Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns
On Fri, 11 Jun 2021 at 11:38, Richard Sandiford wrote: > > Christophe Lyon writes: > > On Tue, 8 Jun 2021 at 14:10, Richard Sandiford > > wrote: > >> > >> Christophe Lyon writes: > >> > This patch adds vec_unpack_hi_, vec_unpack_lo_, > >> > vec_pack_trunc_ patterns for MVE. > >> > > >> > It does so by moving the unpack patterns from neon.md to > >> > vec-common.md, while adding them support for MVE. The pack expander is > >> > derived from the Neon one (which in turn is renamed into > >> > neon_quad_vec_pack_trunc_). > >> > > >> > The patch introduces mve_vec_pack_trunc_ to avoid the need for a > >> > zero-initialized temporary, which is needed if the > >> > vec_pack_trunc_ expander calls @mve_vmovn[bt]q_ > >> > instead. > >> > > >> > With this patch, we can now vectorize the 16 and 8-bit versions of > >> > vclz and vshl, although the generated code could still be improved. > >> > For test_clz_s16, we now generate > >> > vldrh.16q3, [r1] > >> > vmovlb.s16 q2, q3 > >> > vmovlt.s16 q3, q3 > >> > vclz.i32 q2, q2 > >> > vclz.i32 q3, q3 > >> > vmovnb.i32 q1, q2 > >> > vmovnt.i32 q1, q3 > >> > vstrh.16q1, [r0] > >> > which could be improved to > >> > vldrh.16q3, [r1] > >> > vclz.i16q1, q3 > >> > vstrh.16q1, [r0] > >> > if we could avoid the need for unpack/pack steps. > >> > >> Yeah, there was a PR about fixing this for popcount. I guess the same > >> approach would apply here too. > >> > >> > For reference, clang-12 generates: > >> > vldrh.s32 q0, [r1] > >> > vldrh.s32 q1, [r1, #8] > >> > vclz.i32q0, q0 > >> > vstrh.32q0, [r0] > >> > vclz.i32q0, q1 > >> > vstrh.32q0, [r0, #8] > >> > > >> > 2021-06-03 Christophe Lyon > >> > > >> > gcc/ > >> > * config/arm/mve.md (mve_vmovltq_): Prefix with '@'. > >> > (mve_vmovlbq_): Likewise. > >> > (mve_vmovnbq_): Likewise. > >> > (mve_vmovntq_): Likewise. > >> > (@mve_vec_pack_trunc_): New pattern. > >> > * config/arm/neon.md (vec_unpack_hi_): Move to > >> > vec-common.md. > >> > (vec_unpack_lo_): Likewise. > >> > (vec_pack_trunc_): Rename to > >> > neon_quad_vec_pack_trunc_. > >> > * config/arm/vec-common.md (vec_unpack_hi_): New > >> > pattern. > >> > (vec_unpack_lo_): New. > >> > (vec_pack_trunc_): New. > >> > > >> > gcc/testsuite/ > >> > * gcc.target/arm/simd/mve-vclz.c: Update expected results. > >> > * gcc.target/arm/simd/mve-vshl.c: Likewise. > >> > --- > >> > gcc/config/arm/mve.md| 20 - > >> > gcc/config/arm/neon.md | 39 + > >> > gcc/config/arm/vec-common.md | 89 > >> > gcc/testsuite/gcc.target/arm/simd/mve-vclz.c | 7 +- > >> > gcc/testsuite/gcc.target/arm/simd/mve-vshl.c | 5 +- > >> > 5 files changed, 114 insertions(+), 46 deletions(-) > >> > > >> > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md > >> > index 99e46d0bc69..b18292c07d3 100644 > >> > --- a/gcc/config/arm/mve.md > >> > +++ b/gcc/config/arm/mve.md > >> > @@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_" > >> > ;; > >> > ;; [vmovltq_u, vmovltq_s]) > >> > ;; > >> > -(define_insn "mve_vmovltq_" > >> > +(define_insn "@mve_vmovltq_" > >> >[ > >> > (set (match_operand: 0 "s_register_operand" "=w") > >> > (unspec: [(match_operand:MVE_3 1 > >> > "s_register_operand" "w")] > >> > @@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_" > >> > ;; > >> > ;; [vmovlbq_s, vmovlbq_u]) > >> > ;; > >> > -(define_insn "mve_vmovlbq_" > >> > +(define_insn "@mve_vmovlbq_" > >> >[ > >> > (set (match_operand: 0 "s_register_operand" "=w") > >> > (unspec: [(match_operand:MVE_3 1 > >> > "s_register_operand" "w")] > >> > @@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s" > >> > ;; > >> > ;; [vmovnbq_u, vmovnbq_s]) > >> > ;; > >> > -(define_insn "mve_vmovnbq_" > >> > +(define_insn "@mve_vmovnbq_" > >> >[ > >> > (set (match_operand: 0 "s_register_operand" "=w") > >> > (unspec: [(match_operand: 1 > >> > "s_register_operand" "0") > >> > @@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_" > >> > ;; > >> > ;; [vmovntq_s, vmovntq_u]) > >> > ;; > >> > -(define_insn "mve_vmovntq_" > >> > +(define_insn "@mve_vmovntq_" > >> >[ > >> > (set (match_operand: 0 "s_register_operand" "=w") > >> > (unspec: [(match_operand: 1 > >> > "s_register_operand" "0") > >> > @@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_" > >> >[(set_attr "type" "mve_move") > >> > ]) > >> > > >> > +(define_insn "@mve_vec_pack_trunc_" > >> > + [(set (match_operand: 0 "register_operand" "=") > >> > + (vec_concat: > >> > + (truncate: > >> > + (match_operand:MVE_5 1 "register_operand" "w")) > >> > + (truncate: > >> > +
Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns
Christophe Lyon writes: > On Tue, 8 Jun 2021 at 14:10, Richard Sandiford > wrote: >> >> Christophe Lyon writes: >> > This patch adds vec_unpack_hi_, vec_unpack_lo_, >> > vec_pack_trunc_ patterns for MVE. >> > >> > It does so by moving the unpack patterns from neon.md to >> > vec-common.md, while adding them support for MVE. The pack expander is >> > derived from the Neon one (which in turn is renamed into >> > neon_quad_vec_pack_trunc_). >> > >> > The patch introduces mve_vec_pack_trunc_ to avoid the need for a >> > zero-initialized temporary, which is needed if the >> > vec_pack_trunc_ expander calls @mve_vmovn[bt]q_ >> > instead. >> > >> > With this patch, we can now vectorize the 16 and 8-bit versions of >> > vclz and vshl, although the generated code could still be improved. >> > For test_clz_s16, we now generate >> > vldrh.16q3, [r1] >> > vmovlb.s16 q2, q3 >> > vmovlt.s16 q3, q3 >> > vclz.i32 q2, q2 >> > vclz.i32 q3, q3 >> > vmovnb.i32 q1, q2 >> > vmovnt.i32 q1, q3 >> > vstrh.16q1, [r0] >> > which could be improved to >> > vldrh.16q3, [r1] >> > vclz.i16q1, q3 >> > vstrh.16q1, [r0] >> > if we could avoid the need for unpack/pack steps. >> >> Yeah, there was a PR about fixing this for popcount. I guess the same >> approach would apply here too. >> >> > For reference, clang-12 generates: >> > vldrh.s32 q0, [r1] >> > vldrh.s32 q1, [r1, #8] >> > vclz.i32q0, q0 >> > vstrh.32q0, [r0] >> > vclz.i32q0, q1 >> > vstrh.32q0, [r0, #8] >> > >> > 2021-06-03 Christophe Lyon >> > >> > gcc/ >> > * config/arm/mve.md (mve_vmovltq_): Prefix with '@'. >> > (mve_vmovlbq_): Likewise. >> > (mve_vmovnbq_): Likewise. >> > (mve_vmovntq_): Likewise. >> > (@mve_vec_pack_trunc_): New pattern. >> > * config/arm/neon.md (vec_unpack_hi_): Move to >> > vec-common.md. >> > (vec_unpack_lo_): Likewise. >> > (vec_pack_trunc_): Rename to >> > neon_quad_vec_pack_trunc_. >> > * config/arm/vec-common.md (vec_unpack_hi_): New >> > pattern. >> > (vec_unpack_lo_): New. >> > (vec_pack_trunc_): New. >> > >> > gcc/testsuite/ >> > * gcc.target/arm/simd/mve-vclz.c: Update expected results. >> > * gcc.target/arm/simd/mve-vshl.c: Likewise. >> > --- >> > gcc/config/arm/mve.md| 20 - >> > gcc/config/arm/neon.md | 39 + >> > gcc/config/arm/vec-common.md | 89 >> > gcc/testsuite/gcc.target/arm/simd/mve-vclz.c | 7 +- >> > gcc/testsuite/gcc.target/arm/simd/mve-vshl.c | 5 +- >> > 5 files changed, 114 insertions(+), 46 deletions(-) >> > >> > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md >> > index 99e46d0bc69..b18292c07d3 100644 >> > --- a/gcc/config/arm/mve.md >> > +++ b/gcc/config/arm/mve.md >> > @@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_" >> > ;; >> > ;; [vmovltq_u, vmovltq_s]) >> > ;; >> > -(define_insn "mve_vmovltq_" >> > +(define_insn "@mve_vmovltq_" >> >[ >> > (set (match_operand: 0 "s_register_operand" "=w") >> > (unspec: [(match_operand:MVE_3 1 >> > "s_register_operand" "w")] >> > @@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_" >> > ;; >> > ;; [vmovlbq_s, vmovlbq_u]) >> > ;; >> > -(define_insn "mve_vmovlbq_" >> > +(define_insn "@mve_vmovlbq_" >> >[ >> > (set (match_operand: 0 "s_register_operand" "=w") >> > (unspec: [(match_operand:MVE_3 1 >> > "s_register_operand" "w")] >> > @@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s" >> > ;; >> > ;; [vmovnbq_u, vmovnbq_s]) >> > ;; >> > -(define_insn "mve_vmovnbq_" >> > +(define_insn "@mve_vmovnbq_" >> >[ >> > (set (match_operand: 0 "s_register_operand" "=w") >> > (unspec: [(match_operand: 1 >> > "s_register_operand" "0") >> > @@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_" >> > ;; >> > ;; [vmovntq_s, vmovntq_u]) >> > ;; >> > -(define_insn "mve_vmovntq_" >> > +(define_insn "@mve_vmovntq_" >> >[ >> > (set (match_operand: 0 "s_register_operand" "=w") >> > (unspec: [(match_operand: 1 >> > "s_register_operand" "0") >> > @@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_" >> >[(set_attr "type" "mve_move") >> > ]) >> > >> > +(define_insn "@mve_vec_pack_trunc_" >> > + [(set (match_operand: 0 "register_operand" "=") >> > + (vec_concat: >> > + (truncate: >> > + (match_operand:MVE_5 1 "register_operand" "w")) >> > + (truncate: >> > + (match_operand:MVE_5 2 "register_operand" "w"] >> > + "TARGET_HAVE_MVE" >> > + "vmovnb.i%q0, %q1\;vmovnt.i %q0, %q2" >> > + [(set_attr "type" "mve_move")] >> > +) >> > + >> >> I realise this is (like you say) based on the neon.md pattern, >> but we should use separate vmovnb and
Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns
On Thu, 10 Jun 2021 at 11:50, Christophe Lyon wrote: > > On Tue, 8 Jun 2021 at 14:10, Richard Sandiford > wrote: > > > > Christophe Lyon writes: > > > This patch adds vec_unpack_hi_, vec_unpack_lo_, > > > vec_pack_trunc_ patterns for MVE. > > > > > > It does so by moving the unpack patterns from neon.md to > > > vec-common.md, while adding them support for MVE. The pack expander is > > > derived from the Neon one (which in turn is renamed into > > > neon_quad_vec_pack_trunc_). > > > > > > The patch introduces mve_vec_pack_trunc_ to avoid the need for a > > > zero-initialized temporary, which is needed if the > > > vec_pack_trunc_ expander calls @mve_vmovn[bt]q_ > > > instead. > > > > > > With this patch, we can now vectorize the 16 and 8-bit versions of > > > vclz and vshl, although the generated code could still be improved. > > > For test_clz_s16, we now generate > > > vldrh.16q3, [r1] > > > vmovlb.s16 q2, q3 > > > vmovlt.s16 q3, q3 > > > vclz.i32 q2, q2 > > > vclz.i32 q3, q3 > > > vmovnb.i32 q1, q2 > > > vmovnt.i32 q1, q3 > > > vstrh.16q1, [r0] > > > which could be improved to > > > vldrh.16q3, [r1] > > > vclz.i16q1, q3 > > > vstrh.16q1, [r0] > > > if we could avoid the need for unpack/pack steps. > > > > Yeah, there was a PR about fixing this for popcount. I guess the same > > approach would apply here too. > > > > > For reference, clang-12 generates: > > > vldrh.s32 q0, [r1] > > > vldrh.s32 q1, [r1, #8] > > > vclz.i32q0, q0 > > > vstrh.32q0, [r0] > > > vclz.i32q0, q1 > > > vstrh.32q0, [r0, #8] > > > > > > 2021-06-03 Christophe Lyon > > > > > > gcc/ > > > * config/arm/mve.md (mve_vmovltq_): Prefix with '@'. > > > (mve_vmovlbq_): Likewise. > > > (mve_vmovnbq_): Likewise. > > > (mve_vmovntq_): Likewise. > > > (@mve_vec_pack_trunc_): New pattern. > > > * config/arm/neon.md (vec_unpack_hi_): Move to > > > vec-common.md. > > > (vec_unpack_lo_): Likewise. > > > (vec_pack_trunc_): Rename to > > > neon_quad_vec_pack_trunc_. > > > * config/arm/vec-common.md (vec_unpack_hi_): New > > > pattern. > > > (vec_unpack_lo_): New. > > > (vec_pack_trunc_): New. > > > > > > gcc/testsuite/ > > > * gcc.target/arm/simd/mve-vclz.c: Update expected results. > > > * gcc.target/arm/simd/mve-vshl.c: Likewise. > > > --- > > > gcc/config/arm/mve.md| 20 - > > > gcc/config/arm/neon.md | 39 + > > > gcc/config/arm/vec-common.md | 89 > > > gcc/testsuite/gcc.target/arm/simd/mve-vclz.c | 7 +- > > > gcc/testsuite/gcc.target/arm/simd/mve-vshl.c | 5 +- > > > 5 files changed, 114 insertions(+), 46 deletions(-) > > > > > > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md > > > index 99e46d0bc69..b18292c07d3 100644 > > > --- a/gcc/config/arm/mve.md > > > +++ b/gcc/config/arm/mve.md > > > @@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_" > > > ;; > > > ;; [vmovltq_u, vmovltq_s]) > > > ;; > > > -(define_insn "mve_vmovltq_" > > > +(define_insn "@mve_vmovltq_" > > >[ > > > (set (match_operand: 0 "s_register_operand" "=w") > > > (unspec: [(match_operand:MVE_3 1 > > > "s_register_operand" "w")] > > > @@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_" > > > ;; > > > ;; [vmovlbq_s, vmovlbq_u]) > > > ;; > > > -(define_insn "mve_vmovlbq_" > > > +(define_insn "@mve_vmovlbq_" > > >[ > > > (set (match_operand: 0 "s_register_operand" "=w") > > > (unspec: [(match_operand:MVE_3 1 > > > "s_register_operand" "w")] > > > @@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s" > > > ;; > > > ;; [vmovnbq_u, vmovnbq_s]) > > > ;; > > > -(define_insn "mve_vmovnbq_" > > > +(define_insn "@mve_vmovnbq_" > > >[ > > > (set (match_operand: 0 "s_register_operand" "=w") > > > (unspec: [(match_operand: 1 > > > "s_register_operand" "0") > > > @@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_" > > > ;; > > > ;; [vmovntq_s, vmovntq_u]) > > > ;; > > > -(define_insn "mve_vmovntq_" > > > +(define_insn "@mve_vmovntq_" > > >[ > > > (set (match_operand: 0 "s_register_operand" "=w") > > > (unspec: [(match_operand: 1 > > > "s_register_operand" "0") > > > @@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_" > > >[(set_attr "type" "mve_move") > > > ]) > > > > > > +(define_insn "@mve_vec_pack_trunc_" > > > + [(set (match_operand: 0 "register_operand" "=") > > > + (vec_concat: > > > + (truncate: > > > + (match_operand:MVE_5 1 "register_operand" "w")) > > > + (truncate: > > > + (match_operand:MVE_5 2 "register_operand" "w"] > > > + "TARGET_HAVE_MVE" > > > + "vmovnb.i%q0, %q1\;vmovnt.i %q0, %q2" > > > +
Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns
On Tue, 8 Jun 2021 at 14:10, Richard Sandiford wrote: > > Christophe Lyon writes: > > This patch adds vec_unpack_hi_, vec_unpack_lo_, > > vec_pack_trunc_ patterns for MVE. > > > > It does so by moving the unpack patterns from neon.md to > > vec-common.md, while adding them support for MVE. The pack expander is > > derived from the Neon one (which in turn is renamed into > > neon_quad_vec_pack_trunc_). > > > > The patch introduces mve_vec_pack_trunc_ to avoid the need for a > > zero-initialized temporary, which is needed if the > > vec_pack_trunc_ expander calls @mve_vmovn[bt]q_ > > instead. > > > > With this patch, we can now vectorize the 16 and 8-bit versions of > > vclz and vshl, although the generated code could still be improved. > > For test_clz_s16, we now generate > > vldrh.16q3, [r1] > > vmovlb.s16 q2, q3 > > vmovlt.s16 q3, q3 > > vclz.i32 q2, q2 > > vclz.i32 q3, q3 > > vmovnb.i32 q1, q2 > > vmovnt.i32 q1, q3 > > vstrh.16q1, [r0] > > which could be improved to > > vldrh.16q3, [r1] > > vclz.i16q1, q3 > > vstrh.16q1, [r0] > > if we could avoid the need for unpack/pack steps. > > Yeah, there was a PR about fixing this for popcount. I guess the same > approach would apply here too. > > > For reference, clang-12 generates: > > vldrh.s32 q0, [r1] > > vldrh.s32 q1, [r1, #8] > > vclz.i32q0, q0 > > vstrh.32q0, [r0] > > vclz.i32q0, q1 > > vstrh.32q0, [r0, #8] > > > > 2021-06-03 Christophe Lyon > > > > gcc/ > > * config/arm/mve.md (mve_vmovltq_): Prefix with '@'. > > (mve_vmovlbq_): Likewise. > > (mve_vmovnbq_): Likewise. > > (mve_vmovntq_): Likewise. > > (@mve_vec_pack_trunc_): New pattern. > > * config/arm/neon.md (vec_unpack_hi_): Move to > > vec-common.md. > > (vec_unpack_lo_): Likewise. > > (vec_pack_trunc_): Rename to > > neon_quad_vec_pack_trunc_. > > * config/arm/vec-common.md (vec_unpack_hi_): New > > pattern. > > (vec_unpack_lo_): New. > > (vec_pack_trunc_): New. > > > > gcc/testsuite/ > > * gcc.target/arm/simd/mve-vclz.c: Update expected results. > > * gcc.target/arm/simd/mve-vshl.c: Likewise. > > --- > > gcc/config/arm/mve.md| 20 - > > gcc/config/arm/neon.md | 39 + > > gcc/config/arm/vec-common.md | 89 > > gcc/testsuite/gcc.target/arm/simd/mve-vclz.c | 7 +- > > gcc/testsuite/gcc.target/arm/simd/mve-vshl.c | 5 +- > > 5 files changed, 114 insertions(+), 46 deletions(-) > > > > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md > > index 99e46d0bc69..b18292c07d3 100644 > > --- a/gcc/config/arm/mve.md > > +++ b/gcc/config/arm/mve.md > > @@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_" > > ;; > > ;; [vmovltq_u, vmovltq_s]) > > ;; > > -(define_insn "mve_vmovltq_" > > +(define_insn "@mve_vmovltq_" > >[ > > (set (match_operand: 0 "s_register_operand" "=w") > > (unspec: [(match_operand:MVE_3 1 "s_register_operand" > > "w")] > > @@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_" > > ;; > > ;; [vmovlbq_s, vmovlbq_u]) > > ;; > > -(define_insn "mve_vmovlbq_" > > +(define_insn "@mve_vmovlbq_" > >[ > > (set (match_operand: 0 "s_register_operand" "=w") > > (unspec: [(match_operand:MVE_3 1 "s_register_operand" > > "w")] > > @@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s" > > ;; > > ;; [vmovnbq_u, vmovnbq_s]) > > ;; > > -(define_insn "mve_vmovnbq_" > > +(define_insn "@mve_vmovnbq_" > >[ > > (set (match_operand: 0 "s_register_operand" "=w") > > (unspec: [(match_operand: 1 > > "s_register_operand" "0") > > @@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_" > > ;; > > ;; [vmovntq_s, vmovntq_u]) > > ;; > > -(define_insn "mve_vmovntq_" > > +(define_insn "@mve_vmovntq_" > >[ > > (set (match_operand: 0 "s_register_operand" "=w") > > (unspec: [(match_operand: 1 > > "s_register_operand" "0") > > @@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_" > >[(set_attr "type" "mve_move") > > ]) > > > > +(define_insn "@mve_vec_pack_trunc_" > > + [(set (match_operand: 0 "register_operand" "=") > > + (vec_concat: > > + (truncate: > > + (match_operand:MVE_5 1 "register_operand" "w")) > > + (truncate: > > + (match_operand:MVE_5 2 "register_operand" "w"] > > + "TARGET_HAVE_MVE" > > + "vmovnb.i%q0, %q1\;vmovnt.i %q0, %q2" > > + [(set_attr "type" "mve_move")] > > +) > > + > > I realise this is (like you say) based on the neon.md pattern, > but we should use separate vmovnb and vmovnt instructions instead > of putting two instructions into a single pattern. > > One specific advantage to using separate patterns is that it would > avoid the
Re: [PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns
Christophe Lyon writes: > This patch adds vec_unpack_hi_, vec_unpack_lo_, > vec_pack_trunc_ patterns for MVE. > > It does so by moving the unpack patterns from neon.md to > vec-common.md, while adding them support for MVE. The pack expander is > derived from the Neon one (which in turn is renamed into > neon_quad_vec_pack_trunc_). > > The patch introduces mve_vec_pack_trunc_ to avoid the need for a > zero-initialized temporary, which is needed if the > vec_pack_trunc_ expander calls @mve_vmovn[bt]q_ > instead. > > With this patch, we can now vectorize the 16 and 8-bit versions of > vclz and vshl, although the generated code could still be improved. > For test_clz_s16, we now generate > vldrh.16q3, [r1] > vmovlb.s16 q2, q3 > vmovlt.s16 q3, q3 > vclz.i32 q2, q2 > vclz.i32 q3, q3 > vmovnb.i32 q1, q2 > vmovnt.i32 q1, q3 > vstrh.16q1, [r0] > which could be improved to > vldrh.16q3, [r1] > vclz.i16q1, q3 > vstrh.16q1, [r0] > if we could avoid the need for unpack/pack steps. Yeah, there was a PR about fixing this for popcount. I guess the same approach would apply here too. > For reference, clang-12 generates: > vldrh.s32 q0, [r1] > vldrh.s32 q1, [r1, #8] > vclz.i32q0, q0 > vstrh.32q0, [r0] > vclz.i32q0, q1 > vstrh.32q0, [r0, #8] > > 2021-06-03 Christophe Lyon > > gcc/ > * config/arm/mve.md (mve_vmovltq_): Prefix with '@'. > (mve_vmovlbq_): Likewise. > (mve_vmovnbq_): Likewise. > (mve_vmovntq_): Likewise. > (@mve_vec_pack_trunc_): New pattern. > * config/arm/neon.md (vec_unpack_hi_): Move to > vec-common.md. > (vec_unpack_lo_): Likewise. > (vec_pack_trunc_): Rename to > neon_quad_vec_pack_trunc_. > * config/arm/vec-common.md (vec_unpack_hi_): New > pattern. > (vec_unpack_lo_): New. > (vec_pack_trunc_): New. > > gcc/testsuite/ > * gcc.target/arm/simd/mve-vclz.c: Update expected results. > * gcc.target/arm/simd/mve-vshl.c: Likewise. > --- > gcc/config/arm/mve.md| 20 - > gcc/config/arm/neon.md | 39 + > gcc/config/arm/vec-common.md | 89 > gcc/testsuite/gcc.target/arm/simd/mve-vclz.c | 7 +- > gcc/testsuite/gcc.target/arm/simd/mve-vshl.c | 5 +- > 5 files changed, 114 insertions(+), 46 deletions(-) > > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md > index 99e46d0bc69..b18292c07d3 100644 > --- a/gcc/config/arm/mve.md > +++ b/gcc/config/arm/mve.md > @@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_" > ;; > ;; [vmovltq_u, vmovltq_s]) > ;; > -(define_insn "mve_vmovltq_" > +(define_insn "@mve_vmovltq_" >[ > (set (match_operand: 0 "s_register_operand" "=w") > (unspec: [(match_operand:MVE_3 1 "s_register_operand" > "w")] > @@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_" > ;; > ;; [vmovlbq_s, vmovlbq_u]) > ;; > -(define_insn "mve_vmovlbq_" > +(define_insn "@mve_vmovlbq_" >[ > (set (match_operand: 0 "s_register_operand" "=w") > (unspec: [(match_operand:MVE_3 1 "s_register_operand" > "w")] > @@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s" > ;; > ;; [vmovnbq_u, vmovnbq_s]) > ;; > -(define_insn "mve_vmovnbq_" > +(define_insn "@mve_vmovnbq_" >[ > (set (match_operand: 0 "s_register_operand" "=w") > (unspec: [(match_operand: 1 > "s_register_operand" "0") > @@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_" > ;; > ;; [vmovntq_s, vmovntq_u]) > ;; > -(define_insn "mve_vmovntq_" > +(define_insn "@mve_vmovntq_" >[ > (set (match_operand: 0 "s_register_operand" "=w") > (unspec: [(match_operand: 1 > "s_register_operand" "0") > @@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_" >[(set_attr "type" "mve_move") > ]) > > +(define_insn "@mve_vec_pack_trunc_" > + [(set (match_operand: 0 "register_operand" "=") > + (vec_concat: > + (truncate: > + (match_operand:MVE_5 1 "register_operand" "w")) > + (truncate: > + (match_operand:MVE_5 2 "register_operand" "w"] > + "TARGET_HAVE_MVE" > + "vmovnb.i%q0, %q1\;vmovnt.i %q0, %q2" > + [(set_attr "type" "mve_move")] > +) > + I realise this is (like you say) based on the neon.md pattern, but we should use separate vmovnb and vmovnt instructions instead of putting two instructions into a single pattern. One specific advantage to using separate patterns is that it would avoid the imprecision of the earlyclobber: the output only conflicts with operand 1 and can be tied to operand 2. > ;; > ;; [vmulq_f]) > ;; > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md > index 0fdffaf4ec4..392d9607919 100644 > --- a/gcc/config/arm/neon.md > +++ b/gcc/config/arm/neon.md > @@ -5924,43 +5924,6 @@
[PATCH 2/2] arm: Auto-vectorization for MVE: add pack/unpack patterns
This patch adds vec_unpack_hi_, vec_unpack_lo_, vec_pack_trunc_ patterns for MVE. It does so by moving the unpack patterns from neon.md to vec-common.md, while adding them support for MVE. The pack expander is derived from the Neon one (which in turn is renamed into neon_quad_vec_pack_trunc_). The patch introduces mve_vec_pack_trunc_ to avoid the need for a zero-initialized temporary, which is needed if the vec_pack_trunc_ expander calls @mve_vmovn[bt]q_ instead. With this patch, we can now vectorize the 16 and 8-bit versions of vclz and vshl, although the generated code could still be improved. For test_clz_s16, we now generate vldrh.16q3, [r1] vmovlb.s16 q2, q3 vmovlt.s16 q3, q3 vclz.i32 q2, q2 vclz.i32 q3, q3 vmovnb.i32 q1, q2 vmovnt.i32 q1, q3 vstrh.16q1, [r0] which could be improved to vldrh.16q3, [r1] vclz.i16q1, q3 vstrh.16q1, [r0] if we could avoid the need for unpack/pack steps. For reference, clang-12 generates: vldrh.s32 q0, [r1] vldrh.s32 q1, [r1, #8] vclz.i32q0, q0 vstrh.32q0, [r0] vclz.i32q0, q1 vstrh.32q0, [r0, #8] 2021-06-03 Christophe Lyon gcc/ * config/arm/mve.md (mve_vmovltq_): Prefix with '@'. (mve_vmovlbq_): Likewise. (mve_vmovnbq_): Likewise. (mve_vmovntq_): Likewise. (@mve_vec_pack_trunc_): New pattern. * config/arm/neon.md (vec_unpack_hi_): Move to vec-common.md. (vec_unpack_lo_): Likewise. (vec_pack_trunc_): Rename to neon_quad_vec_pack_trunc_. * config/arm/vec-common.md (vec_unpack_hi_): New pattern. (vec_unpack_lo_): New. (vec_pack_trunc_): New. gcc/testsuite/ * gcc.target/arm/simd/mve-vclz.c: Update expected results. * gcc.target/arm/simd/mve-vshl.c: Likewise. --- gcc/config/arm/mve.md| 20 - gcc/config/arm/neon.md | 39 + gcc/config/arm/vec-common.md | 89 gcc/testsuite/gcc.target/arm/simd/mve-vclz.c | 7 +- gcc/testsuite/gcc.target/arm/simd/mve-vshl.c | 5 +- 5 files changed, 114 insertions(+), 46 deletions(-) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 99e46d0bc69..b18292c07d3 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -510,7 +510,7 @@ (define_insn "mve_vrev32q_" ;; ;; [vmovltq_u, vmovltq_s]) ;; -(define_insn "mve_vmovltq_" +(define_insn "@mve_vmovltq_" [ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand:MVE_3 1 "s_register_operand" "w")] @@ -524,7 +524,7 @@ (define_insn "mve_vmovltq_" ;; ;; [vmovlbq_s, vmovlbq_u]) ;; -(define_insn "mve_vmovlbq_" +(define_insn "@mve_vmovlbq_" [ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand:MVE_3 1 "s_register_operand" "w")] @@ -2187,7 +2187,7 @@ (define_insn "mve_vmlsldavxq_s" ;; ;; [vmovnbq_u, vmovnbq_s]) ;; -(define_insn "mve_vmovnbq_" +(define_insn "@mve_vmovnbq_" [ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") @@ -2202,7 +2202,7 @@ (define_insn "mve_vmovnbq_" ;; ;; [vmovntq_s, vmovntq_u]) ;; -(define_insn "mve_vmovntq_" +(define_insn "@mve_vmovntq_" [ (set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand: 1 "s_register_operand" "0") @@ -2214,6 +2214,18 @@ (define_insn "mve_vmovntq_" [(set_attr "type" "mve_move") ]) +(define_insn "@mve_vec_pack_trunc_" + [(set (match_operand: 0 "register_operand" "=") + (vec_concat: + (truncate: + (match_operand:MVE_5 1 "register_operand" "w")) + (truncate: + (match_operand:MVE_5 2 "register_operand" "w"] + "TARGET_HAVE_MVE" + "vmovnb.i %q0, %q1\;vmovnt.i %q0, %q2" + [(set_attr "type" "mve_move")] +) + ;; ;; [vmulq_f]) ;; diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 0fdffaf4ec4..392d9607919 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -5924,43 +5924,6 @@ (define_insn "neon_vec_unpack_hi_" [(set_attr "type" "neon_shift_imm_long")] ) -(define_expand "vec_unpack_hi_" - [(match_operand: 0 "register_operand") - (SE: (match_operand:VU 1 "register_operand"))] - "TARGET_NEON && !BYTES_BIG_ENDIAN" - { - rtvec v = rtvec_alloc (/2) ; - rtx t1; - int i; - for (i = 0; i < (/2); i++) - RTVEC_ELT (v, i) = GEN_INT ((/2) + i); - - t1 = gen_rtx_PARALLEL (mode, v); - emit_insn (gen_neon_vec_unpack_hi_ (operands[0], - operands[1], -t1)); - DONE; - } -) - -(define_expand "vec_unpack_lo_" - [(match_operand: 0 "register_operand") -