On 18/08/15 08:53, Michael Collison wrote: > > This patch is designed to address code that was not being vectorized due to > missing widening patterns in the ARM backend. Code such as: > > int t6(int len, void * dummy, short * __restrict x) > { > len = len & ~31; > int result = 0; > __asm volatile (""); > for (int i = 0; i < len; i++) > result += x[i]; > return result; > } > > Validated on arm-none-eabi, arm-none-linux-gnueabi, arm-none-linux-gnueabihf, > and armeb-none-linux-gnueabihf. > > There is one regression on gcc.dg/vect/slp-reduc-3.c that only occurs when > -flto is enabled: > > gcc.dg/vect/slp-reduc-3.c -flto -ffat-lto-objects scan-tree-dump-times vect > "vectorizing stmts using SLP" 1 > gcc.dg/vect/slp-reduc-3.c scan-tree-dump-times vect "vectorizing stmts using > SLP" 1 >
Interesting, though not sure why that happens without some digging further. > > I could use some feedback on whether this is a regression or issue with the > test case. > ------------------------------------------------------------------------------------------------------------- > 2015-08-18 Michael Collison <michael.colli...@linaro.org> > > * config/arm/neon.md (widen_<us>sum<mode>): New patterns > where mode is VQI to improve mixed mode vectorization. > * config/arm/unspec.md: Add new unspecs: UNSPEC_VZERO_EXTEND and > UNSPEC_VSIGN_EXTEND. > * gcc.target/arm/neon-vaddws16.c: New test. > * gcc.target/arm/neon-vaddws32.c: New test. > * gcc.target/arm/neon-vaddwu16.c: New test. > * gcc.target/arm/neon-vaddwu32.c: New test. > * gcc.target/arm/neon-vaddwu8.c: New test. > > > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md > index 654d9d5..50cb409 100644 > --- a/gcc/config/arm/neon.md > +++ b/gcc/config/arm/neon.md > @@ -1174,6 +1174,27 @@ > > ;; Widening operations > > +(define_insn_and_split "widen_ssum<mode>3" > + [(set (match_operand:<V_double_width> 0 "s_register_operand" "=&w") > + (plus:<V_double_width> (unspec:<V_double_width> > + [(match_operand:VQI 1 "s_register_operand" "w")] > + UNSPEC_VSIGN_EXTEND) > + (match_operand:<V_double_width> 2 "s_register_operand" > "0")))] > + "TARGET_NEON" > + "#" > + "&& reload_completed" I notice widen_ssum and widen_usum do not have any documentation with it - can you look to provide some kind of followup documentation for these patterns in md.texi while you are here ? > + [(const_int 0)] > +{ > + rtx loreg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, > 0); > + rtx hireg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, > GET_MODE_SIZE (<V_HALF>mode)); > + > + emit_insn (gen_widen_ssum<V_half>3 (operands[0], loreg, operands[2])); > + emit_insn (gen_widen_ssum<V_half>3 (operands[0], hireg, operands[2])); > + DONE; > + } > + [(set_attr "type" "neon_add_widen") > + (set_attr "length" "8")]) Isn't it better to expand this into (set (reg:V4SI reg) (plus:V4SI (sign_extend:V4SI (vec_select:V4HI (reg:V8HI ...) (parallel:V8HI (const_vector { 4, 5, 6, 7}))) (reg:V4SI reg))) (set (reg:V4SI reg) (plus:V4SI (sign_extend: V4SI (vec_select:V4HI (reg:V8HI) (parallel: V8HI (const_vector { 0, 1, 2, 3})))) That way we can "combine" cases where we have this kind of expressions from the intrinsics - I'm wondering about combinations from vmovl / vadd / vget_low ? I'd like us to avoid unspecs where we can... regards Ramana > + > (define_insn "widen_ssum<mode>3" > [(set (match_operand:<V_widen> 0 "s_register_operand" "=w") > (plus:<V_widen> (sign_extend:<V_widen> > @@ -1184,6 +1205,27 @@ > [(set_attr "type" "neon_add_widen")] > ) > > +(define_insn_and_split "widen_usum<mode>3" > + [(set (match_operand:<V_double_width> 0 "s_register_operand" "=&w") > + (plus:<V_double_width> (unspec:<V_double_width> > + [(match_operand:VQI 1 "s_register_operand" "w")] > + UNSPEC_VZERO_EXTEND) > + (match_operand:<V_double_width> 2 "s_register_operand" > "0")))] > + "TARGET_NEON" > + "#" > + "&& reload_completed" > + [(const_int 0)] > +{ > + rtx loreg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, > 0); > + rtx hireg = simplify_gen_subreg (<V_HALF>mode, operands[1], <MODE>mode, > GET_MODE_SIZE (<V_HALF>mode)); > + > + emit_insn (gen_widen_usum<V_half>3 (operands[0], loreg, operands[2])); > + emit_insn (gen_widen_usum<V_half>3 (operands[0], hireg, operands[2])); > + DONE; > + } > + [(set_attr "type" "neon_add_widen") > + (set_attr "length" "8")]) > + > (define_insn "widen_usum<mode>3" > [(set (match_operand:<V_widen> 0 "s_register_operand" "=w") > (plus:<V_widen> (zero_extend:<V_widen> > diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md > index 0ec2c48..e9cf836 100644 > --- a/gcc/config/arm/unspecs.md > +++ b/gcc/config/arm/unspecs.md > @@ -358,5 +358,7 @@ > UNSPEC_NVRINTX > UNSPEC_NVRINTA > UNSPEC_NVRINTN > + UNSPEC_VZERO_EXTEND > + UNSPEC_VSIGN_EXTEND > ]) > > diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c > b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c > new file mode 100644 > index 0000000..ed10669 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c > @@ -0,0 +1,21 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target arm_neon_hw } */ > +/* { dg-add-options arm_neon_ok } */ > +/* { dg-options "-O3" } */ > + > + > +int > +t6(int len, void * dummy, short * __restrict x) > +{ > + len = len & ~31; > + int result = 0; > + __asm volatile (""); > + for (int i = 0; i < len; i++) > + result += x[i]; > + return result; > +} > + > +/* { dg-final { scan-assembler "vaddw\.s16" } } */ > + > + > + > diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c > b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c > new file mode 100644 > index 0000000..94bf0c9 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c > @@ -0,0 +1,19 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target arm_neon_hw } */ > +/* { dg-add-options arm_neon_ok } */ > +/* { dg-options "-O3" } */ > + > +int > +t6(int len, void * dummy, int * __restrict x) > +{ > + len = len & ~31; > + long long result = 0; > + __asm volatile (""); > + for (int i = 0; i < len; i++) > + result += x[i]; > + return result; > +} > + > +/* { dg-final { scan-assembler "vaddw\.s32" } } */ > + > + > diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c > b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c > new file mode 100644 > index 0000000..98f8768 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target arm_neon_hw } */ > +/* { dg-add-options arm_neon_ok } */ > +/* { dg-options "-O3" } */ > + > + > +int > +t6(int len, void * dummy, unsigned short * __restrict x) > +{ > + len = len & ~31; > + unsigned int result = 0; > + __asm volatile (""); > + for (int i = 0; i < len; i++) > + result += x[i]; > + return result; > +} > + > +/* { dg-final { scan-assembler "vaddw.u16" } } */ > diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c > b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c > new file mode 100644 > index 0000000..2e9af56 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target arm_neon_hw } */ > +/* { dg-add-options arm_neon_ok } */ > +/* { dg-options "-O3" } */ > + > +int > +t6(int len, void * dummy, unsigned int * __restrict x) > +{ > + len = len & ~31; > + unsigned long long result = 0; > + __asm volatile (""); > + for (int i = 0; i < len; i++) > + result += x[i]; > + return result; > +} > + > +/* { dg-final { scan-assembler "vaddw\.u32" } } */ > + > diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c > b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c > new file mode 100644 > index 0000000..de2ad8a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c > @@ -0,0 +1,21 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target arm_neon_hw } */ > +/* { dg-add-options arm_neon_ok } */ > +/* { dg-options "-O3" } */ > + > + > +int > +t6(int len, void * dummy, char * __restrict x) > +{ > + len = len & ~31; > + unsigned short result = 0; > + __asm volatile (""); > + for (int i = 0; i < len; i++) > + result += x[i]; > + return result; > +} > + > +/* { dg-final { scan-assembler "vaddw\.u8" } } */ > + > + > +