Re: [PATCH] Fixup nb_iterations_upper_bound adjustment for vectorized loops

2016-04-29 Thread Ilya Enkovich
On 28 Apr 15:59, Richard Biener wrote:
> On Thu, Apr 28, 2016 at 3:26 PM, Ilya Enkovich  wrote:
> > On 27 Apr 16:05, Richard Biener wrote:
> >> >>
> >> >> I'd like to see testcases covering the corner-cases - have them have
> >> >> upper bound estimates by adjusting known array sizes and also cover
> >> >> the case of peeling for gaps.
> >> >
> >> > OK, I'll make more tests.
> >> > Thanks,
> >> > Ilya
> >> >
> >> >>
> >> >> Richard.
> >> >>
> >
> > Could you please look at new tests?  I added one simple case with
> > known array size and similar tests with a peeling for gaps w/ and
> > w/o vector iteration peeled.
> >
> > Checked new tests with RUNTESTFLAGS="vect.exp=vect-nb-iter-ub-* 
> > --target_board=unix{-m32,}
> > on x86_64-pc-linux-gnu.  OK for trunk?
> 
> Can you make the new testcases runtime ones, thus check that the
> vectorized outcome
> is ok (so we don't forget any trailing iterations)?
> 
> Ok with that change.
> 
> Richard.
> 

Thanks for review!  Here is a version I'm going to apply.  I also changed tests
to be sse2 instead of avx512bw to make them more usabe on current HW.

Thanks,
Ilya
--
gcc/

2016-04-29  Ilya Enkovich  

* tree-vect-loop.c (vect_transform_loop): Fix
nb_iterations_upper_bound computation for vectorized loop.

gcc/testsuite/

2016-04-29  Ilya Enkovich  

* gcc.target/i386/vect-unpack-2.c (avx512bw_test): Avoid
optimization of vector loop.
* gcc.target/i386/vect-unpack-3.c: New test.
* gcc.dg/vect/vect-nb-iter-ub-1.c: New test.
* gcc.dg/vect/vect-nb-iter-ub-2.c: New test.
* gcc.dg/vect/vect-nb-iter-ub-3.c: New test.


diff --git a/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-1.c
new file mode 100644
index 000..456866d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-1.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse2 { target { i?86-*-* x86_64-*-* } } } */
+/* { dg-additional-options "-msse2 -fdump-tree-cunroll-details" { target { 
i?86-*-* x86_64-*-* } } } */
+
+int ii[31];
+char cc[31] =
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 };
+
+void __attribute__((noinline,noclone))
+foo (int s)
+{
+  int i;
+  for (i = 0; i < s; i++)
+ii[i] = (int) cc[i];
+}
+
+int main (int argc, const char **argv)
+{
+  int i;
+  foo (31);
+  for (i = 0; i < 31; i++)
+if (ii[i] != i)
+  __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target { i?86-*-* 
x86_64-*-* } } } } */
+/* { dg-final { scan-tree-dump "loop turned into non-loop; it never loops" 
"cunroll" { target { i?86-*-* x86_64-*-* } } } } */
+/* { dg-final { scan-tree-dump-not "loop with 2 iterations completely 
unrolled" "cunroll" { target { i?86-*-* x86_64-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-2.c 
b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-2.c
new file mode 100644
index 000..cf1c1ef
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-2.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse2 { target { i?86-*-* x86_64-*-* } } } */
+/* { dg-additional-options "-msse2 -fdump-tree-cunroll-details" { target { 
i?86-*-* x86_64-*-* } } } */
+
+int ii[32];
+char cc[66] =
+  { 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0,
+10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0, 16, 0, 17, 0, 18, 0, 19, 0,
+20, 0, 21, 0, 22, 0, 23, 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0,
+30, 0, 31, 0 };
+
+void __attribute__((noinline,noclone))
+foo (int s)
+{
+  int i;
+   for (i = 0; i < s; i++)
+ ii[i] = (int) cc[i*2];
+}
+
+int main (int argc, const char **argv)
+{
+  int i;
+  foo (32);
+  for (i = 0; i < 32; i++)
+if (ii[i] != i)
+  __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target { i?86-*-* 
x86_64-*-* } } } } */
+/* { dg-final { scan-tree-dump "loop turned into non-loop; it never loops" 
"cunroll" { target { i?86-*-* x86_64-*-* } } } } */
+/* { dg-final { scan-tree-dump-not "loop with 2 iterations completely 
unrolled" "cunroll" { target { i?86-*-* x86_64-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-3.c 
b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-3.c
new file mode 100644
index 000..d8fe307
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-3.c
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse2 { target { i?86-*-* x86_64-*-* } } } */
+/* { dg-additional-options "-msse2 -fdump-tree-cunroll-details" { target { 
i?86-*-* x86_64-*-* } } } */
+
+int ii[33];
+char cc[66] =
+  { 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0,
+10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0, 16, 0, 17, 0, 18, 0, 19, 0,
+20, 0, 21, 0, 22, 0, 23, 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0,
+30, 

Re: [PATCH] Fixup nb_iterations_upper_bound adjustment for vectorized loops

2016-04-28 Thread Richard Biener
On Thu, Apr 28, 2016 at 3:26 PM, Ilya Enkovich  wrote:
> On 27 Apr 16:05, Richard Biener wrote:
>> >>
>> >> I'd like to see testcases covering the corner-cases - have them have
>> >> upper bound estimates by adjusting known array sizes and also cover
>> >> the case of peeling for gaps.
>> >
>> > OK, I'll make more tests.
>> > Thanks,
>> > Ilya
>> >
>> >>
>> >> Richard.
>> >>
>
> Could you please look at new tests?  I added one simple case with
> known array size and similar tests with a peeling for gaps w/ and
> w/o vector iteration peeled.
>
> Checked new tests with RUNTESTFLAGS="vect.exp=vect-nb-iter-ub-* 
> --target_board=unix{-m32,}
> on x86_64-pc-linux-gnu.  OK for trunk?

Can you make the new testcases runtime ones, thus check that the
vectorized outcome
is ok (so we don't forget any trailing iterations)?

Ok with that change.

Richard.

> Thanks,
> Ilya
> --
> gcc/
>
> 2016-04-28  Ilya Enkovich  
>
> * tree-vect-loop.c (vect_transform_loop): Fix
> nb_iterations_upper_bound computation for vectorized loop.
>
> gcc/testsuite/
>
> 2016-04-28  Ilya Enkovich  
>
> * gcc.target/i386/vect-unpack-2.c (avx512bw_test): Avoid
> optimization of vector loop.
> * gcc.target/i386/vect-unpack-3.c: New test.
> * gcc.dg/vect/vect-nb-iter-ub-1.c: New test.
> * gcc.dg/vect/vect-nb-iter-ub-2.c: New test.
> * gcc.dg/vect/vect-nb-iter-ub-3.c: New test.
>
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-1.c 
> b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-1.c
> new file mode 100644
> index 000..b7504a8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-1.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-mavx512bw -fdump-tree-cunroll-details" { target 
> { i?86-*-* x86_64-*-* } } } */
> +
> +int ii[127];
> +char cc[127];
> +
> +void
> +foo (int s)
> +{
> +  int i;
> +   for (i = 0; i < s; i++)
> + ii[i] = (int) cc[i];
> +}
> +
> +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target { 
> i?86-*-* x86_64-*-* } } } } */
> +/* { dg-final { scan-tree-dump "loop turned into non-loop; it never loops" 
> "cunroll" { target { i?86-*-* x86_64-*-* } } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-2.c 
> b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-2.c
> new file mode 100644
> index 000..5332636
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-2.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-mavx512bw -fdump-tree-cunroll-details" { target 
> { i?86-*-* x86_64-*-* } } } */
> +
> +int ii[128];
> +char cc[256];
> +
> +void
> +foo (int s)
> +{
> +  int i;
> +   for (i = 0; i < s; i++)
> + ii[i] = (int) cc[i*2];
> +}
> +
> +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target { 
> i?86-*-* x86_64-*-* } } } } */
> +/* { dg-final { scan-tree-dump "loop turned into non-loop; it never loops" 
> "cunroll" { target { i?86-*-* x86_64-*-* } } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-3.c 
> b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-3.c
> new file mode 100644
> index 000..5610f6a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-3.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-mavx512bw -fdump-tree-cunroll-details" { target 
> { i?86-*-* x86_64-*-* } } } */
> +
> +int ii[130];
> +char cc[258];
> +
> +void
> +foo (int s)
> +{
> +  int i;
> +   for (i = 0; i < s; i++)
> + ii[i] = (int) cc[i*2];
> +}
> +
> +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target { 
> i?86-*-* x86_64-*-* } } } } */
> +/* { dg-final { scan-tree-dump-not "loop turned into non-loop; it never 
> loops" "cunroll" { target { i?86-*-* x86_64-*-* } } } } */
> diff --git a/gcc/testsuite/gcc.target/i386/vect-unpack-2.c 
> b/gcc/testsuite/gcc.target/i386/vect-unpack-2.c
> index 4825248..51c518e 100644
> --- a/gcc/testsuite/gcc.target/i386/vect-unpack-2.c
> +++ b/gcc/testsuite/gcc.target/i386/vect-unpack-2.c
> @@ -6,19 +6,22 @@
>
>  #define N 120
>  signed int yy[1];
> +signed char zz[1];
>
>  void
> -__attribute__ ((noinline)) foo (signed char s)
> +__attribute__ ((noinline,noclone)) foo (int s)
>  {
> -   signed char i;
> +   int i;
> for (i = 0; i < s; i++)
> - yy[i] = (signed int) i;
> + yy[i] = zz[i];
>  }
>
>  void
>  avx512bw_test ()
>  {
>signed char i;
> +  for (i = 0; i < N; i++)
> +zz[i] = i;
>foo (N);
>for (i = 0; i < N; i++)
>  if ( (signed int)i != yy [i] )
> diff --git a/gcc/testsuite/gcc.target/i386/vect-unpack-3.c 
> b/gcc/testsuite/gcc.target/i386/vect-unpack-3.c
> new file mode 100644
> index 000..eb8a93e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect-unpack-3.c
> @@ -0,0 +1,29 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fdump-tree-vect-details -ftree-vectorize -ffast-math 
> -mavx512bw -save-temps" } */
> +/* 

Re: [PATCH] Fixup nb_iterations_upper_bound adjustment for vectorized loops

2016-04-28 Thread Ilya Enkovich
On 27 Apr 16:05, Richard Biener wrote:
> >>
> >> I'd like to see testcases covering the corner-cases - have them have
> >> upper bound estimates by adjusting known array sizes and also cover
> >> the case of peeling for gaps.
> >
> > OK, I'll make more tests.
> > Thanks,
> > Ilya
> >
> >>
> >> Richard.
> >>

Could you please look at new tests?  I added one simple case with
known array size and similar tests with a peeling for gaps w/ and
w/o vector iteration peeled.

Checked new tests with RUNTESTFLAGS="vect.exp=vect-nb-iter-ub-* 
--target_board=unix{-m32,}
on x86_64-pc-linux-gnu.  OK for trunk?

Thanks,
Ilya
--
gcc/

2016-04-28  Ilya Enkovich  

* tree-vect-loop.c (vect_transform_loop): Fix
nb_iterations_upper_bound computation for vectorized loop.

gcc/testsuite/

2016-04-28  Ilya Enkovich  

* gcc.target/i386/vect-unpack-2.c (avx512bw_test): Avoid
optimization of vector loop.
* gcc.target/i386/vect-unpack-3.c: New test.
* gcc.dg/vect/vect-nb-iter-ub-1.c: New test.
* gcc.dg/vect/vect-nb-iter-ub-2.c: New test.
* gcc.dg/vect/vect-nb-iter-ub-3.c: New test.


diff --git a/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-1.c
new file mode 100644
index 000..b7504a8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-mavx512bw -fdump-tree-cunroll-details" { target { 
i?86-*-* x86_64-*-* } } } */
+
+int ii[127];
+char cc[127];
+
+void
+foo (int s)
+{
+  int i;
+   for (i = 0; i < s; i++)
+ ii[i] = (int) cc[i];
+}
+
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target { i?86-*-* 
x86_64-*-* } } } } */
+/* { dg-final { scan-tree-dump "loop turned into non-loop; it never loops" 
"cunroll" { target { i?86-*-* x86_64-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-2.c 
b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-2.c
new file mode 100644
index 000..5332636
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-mavx512bw -fdump-tree-cunroll-details" { target { 
i?86-*-* x86_64-*-* } } } */
+
+int ii[128];
+char cc[256];
+
+void
+foo (int s)
+{
+  int i;
+   for (i = 0; i < s; i++)
+ ii[i] = (int) cc[i*2];
+}
+
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target { i?86-*-* 
x86_64-*-* } } } } */
+/* { dg-final { scan-tree-dump "loop turned into non-loop; it never loops" 
"cunroll" { target { i?86-*-* x86_64-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-3.c 
b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-3.c
new file mode 100644
index 000..5610f6a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-3.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-mavx512bw -fdump-tree-cunroll-details" { target { 
i?86-*-* x86_64-*-* } } } */
+
+int ii[130];
+char cc[258];
+
+void
+foo (int s)
+{
+  int i;
+   for (i = 0; i < s; i++)
+ ii[i] = (int) cc[i*2];
+}
+
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target { i?86-*-* 
x86_64-*-* } } } } */
+/* { dg-final { scan-tree-dump-not "loop turned into non-loop; it never loops" 
"cunroll" { target { i?86-*-* x86_64-*-* } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-unpack-2.c 
b/gcc/testsuite/gcc.target/i386/vect-unpack-2.c
index 4825248..51c518e 100644
--- a/gcc/testsuite/gcc.target/i386/vect-unpack-2.c
+++ b/gcc/testsuite/gcc.target/i386/vect-unpack-2.c
@@ -6,19 +6,22 @@
 
 #define N 120
 signed int yy[1];
+signed char zz[1];
 
 void
-__attribute__ ((noinline)) foo (signed char s)
+__attribute__ ((noinline,noclone)) foo (int s)
 {
-   signed char i;
+   int i;
for (i = 0; i < s; i++)
- yy[i] = (signed int) i;
+ yy[i] = zz[i];
 }
 
 void
 avx512bw_test ()
 {
   signed char i;
+  for (i = 0; i < N; i++)
+zz[i] = i;
   foo (N);
   for (i = 0; i < N; i++)
 if ( (signed int)i != yy [i] )
diff --git a/gcc/testsuite/gcc.target/i386/vect-unpack-3.c 
b/gcc/testsuite/gcc.target/i386/vect-unpack-3.c
new file mode 100644
index 000..eb8a93e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-unpack-3.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-tree-vect-details -ftree-vectorize -ffast-math 
-mavx512bw -save-temps" } */
+/* { dg-require-effective-target avx512bw } */
+
+#include "avx512bw-check.h"
+
+#define N 120
+signed int yy[1];
+
+void
+__attribute__ ((noinline)) foo (signed char s)
+{
+   signed char i;
+   for (i = 0; i < s; i++)
+ yy[i] = (signed int) i;
+}
+
+void
+avx512bw_test ()
+{
+  signed char i;
+  foo (N);
+  for (i = 0; i < N; i++)
+if ( (signed int)i != yy [i] )
+  abort ();
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-assembler-not "vpmovsxbw\[ \\t\]+\[^\n\]*%zmm" } } */
diff 

Re: [PATCH] Fixup nb_iterations_upper_bound adjustment for vectorized loops

2016-04-27 Thread Richard Biener
On Tue, Apr 26, 2016 at 2:29 PM, Ilya Enkovich  wrote:
> 2016-04-22 10:13 GMT+03:00 Richard Biener :
>> On Thu, Apr 21, 2016 at 6:09 PM, Ilya Enkovich  
>> wrote:
>>> Hi,
>>>
>>> Currently when loop is vectorized we adjust its nb_iterations_upper_bound
>>> by dividing it by VF.  This is incorrect since nb_iterations_upper_bound
>>> is upper bound for ( - 1) and therefore simple
>>> dividing it by VF in many cases gives us bounds greater than a real one.
>>> Correct value would be ((nb_iterations_upper_bound + 1) / VF - 1).
>>
>> Yeah, that seems correct.
>>
>>> Also decrement due to peeling for gaps should happen before we scale it
>>> by VF because peeling applies to a scalar loop, not vectorized one.
>>
>> That's not true - PEELING_FOR_GAPs is so that the last _vector_ iteration
>> is peeled as scalar operations.  We do not account for the amount
>> of known prologue peeling (if peeling for alignment and the misalignment
>> is known at compile-time) - that would be peeling of scalar iterations.
>
> My initial patch didn't change anything for PEELING_FOR_GAP and it caused
> a runfail for one of SPEC2006 benchmarks.  My investigation showed number
> of vector iterations calculation doesn't match nb_iterations_upper_bound
> adjustment in a way PEELING_FOR_GAP is accounted.
>
> Looking into vect_generate_tmps_on_preheader I see:
>
> /* If epilogue loop is required because of data accesses with gaps, we
>subtract one iteration from the total number of iterations here for
>correct calculation of RATIO.  */
>
> And then we decrement loop counter before dividing it by VF to compute
> ratio and ratio_mult_vf.  This doesn't match nb_iterations_upper_bound
> update and that's why I fixed it.  This resolved runfail for me.
>
> Thus ratio_mult_vf computation conflicts with your statement we peel a
> vector iteration.

Hum.  I stand corrected.  So yes, we remove the last vector iteration if
there are not already epilogue iterations.

>>
>> But it would be interesting to know why we need the != 0 check - static
>> cost modelling should have disabled vectorization if the vectorized body
>> isn't run.
>>
>>> This patch modifies nb_iterations_upper_bound computation to resolve
>>> these issues.
>>
>> You do not adjust the ->nb_iterations_estimate accordingly.
>>
>>> Running regression testing I got one fail due to optimized loop. Heres
>>> is a loop:
>>>
>>> foo (signed char s)
>>> {
>>>   signed char i;
>>>   for (i = 0; i < s; i++)
>>> yy[i] = (signed int) i;
>>> }
>>>
>>> Here we vectorize for AVX512 using VF=64.  Original loop has max 127
>>> iterations and therefore vectorized loop may be executed only once.
>>> With the patch applied compiler detects it and transforms loop into
>>> BB with just stores of constants vectors into yy.  Test was adjusted
>>> to increase number of possible iterations.  A copy of test was added
>>> to check we can optimize out the original loop.
>>>
>>> Bootstrapped and regtested on x86_64-pc-linux-gnu.  OK for trunk?
>>
>> I'd like to see testcases covering the corner-cases - have them have
>> upper bound estimates by adjusting known array sizes and also cover
>> the case of peeling for gaps.
>
> OK, I'll make more tests.

Thanks,
Richard.

> Thanks,
> Ilya
>
>>
>> Richard.
>>


Re: [PATCH] Fixup nb_iterations_upper_bound adjustment for vectorized loops

2016-04-26 Thread Ilya Enkovich
2016-04-22 10:13 GMT+03:00 Richard Biener :
> On Thu, Apr 21, 2016 at 6:09 PM, Ilya Enkovich  wrote:
>> Hi,
>>
>> Currently when loop is vectorized we adjust its nb_iterations_upper_bound
>> by dividing it by VF.  This is incorrect since nb_iterations_upper_bound
>> is upper bound for ( - 1) and therefore simple
>> dividing it by VF in many cases gives us bounds greater than a real one.
>> Correct value would be ((nb_iterations_upper_bound + 1) / VF - 1).
>
> Yeah, that seems correct.
>
>> Also decrement due to peeling for gaps should happen before we scale it
>> by VF because peeling applies to a scalar loop, not vectorized one.
>
> That's not true - PEELING_FOR_GAPs is so that the last _vector_ iteration
> is peeled as scalar operations.  We do not account for the amount
> of known prologue peeling (if peeling for alignment and the misalignment
> is known at compile-time) - that would be peeling of scalar iterations.

My initial patch didn't change anything for PEELING_FOR_GAP and it caused
a runfail for one of SPEC2006 benchmarks.  My investigation showed number
of vector iterations calculation doesn't match nb_iterations_upper_bound
adjustment in a way PEELING_FOR_GAP is accounted.

Looking into vect_generate_tmps_on_preheader I see:

/* If epilogue loop is required because of data accesses with gaps, we
   subtract one iteration from the total number of iterations here for
   correct calculation of RATIO.  */

And then we decrement loop counter before dividing it by VF to compute
ratio and ratio_mult_vf.  This doesn't match nb_iterations_upper_bound
update and that's why I fixed it.  This resolved runfail for me.

Thus ratio_mult_vf computation conflicts with your statement we peel a
vector iteration.

>
> But it would be interesting to know why we need the != 0 check - static
> cost modelling should have disabled vectorization if the vectorized body
> isn't run.
>
>> This patch modifies nb_iterations_upper_bound computation to resolve
>> these issues.
>
> You do not adjust the ->nb_iterations_estimate accordingly.
>
>> Running regression testing I got one fail due to optimized loop. Heres
>> is a loop:
>>
>> foo (signed char s)
>> {
>>   signed char i;
>>   for (i = 0; i < s; i++)
>> yy[i] = (signed int) i;
>> }
>>
>> Here we vectorize for AVX512 using VF=64.  Original loop has max 127
>> iterations and therefore vectorized loop may be executed only once.
>> With the patch applied compiler detects it and transforms loop into
>> BB with just stores of constants vectors into yy.  Test was adjusted
>> to increase number of possible iterations.  A copy of test was added
>> to check we can optimize out the original loop.
>>
>> Bootstrapped and regtested on x86_64-pc-linux-gnu.  OK for trunk?
>
> I'd like to see testcases covering the corner-cases - have them have
> upper bound estimates by adjusting known array sizes and also cover
> the case of peeling for gaps.

OK, I'll make more tests.

Thanks,
Ilya

>
> Richard.
>


Re: [PATCH] Fixup nb_iterations_upper_bound adjustment for vectorized loops

2016-04-22 Thread Richard Biener
On Thu, Apr 21, 2016 at 6:09 PM, Ilya Enkovich  wrote:
> Hi,
>
> Currently when loop is vectorized we adjust its nb_iterations_upper_bound
> by dividing it by VF.  This is incorrect since nb_iterations_upper_bound
> is upper bound for ( - 1) and therefore simple
> dividing it by VF in many cases gives us bounds greater than a real one.
> Correct value would be ((nb_iterations_upper_bound + 1) / VF - 1).

Yeah, that seems correct.

> Also decrement due to peeling for gaps should happen before we scale it
> by VF because peeling applies to a scalar loop, not vectorized one.

That's not true - PEELING_FOR_GAPs is so that the last _vector_ iteration
is peeled as scalar operations.  We do not account for the amount
of known prologue peeling (if peeling for alignment and the misalignment
is known at compile-time) - that would be peeling of scalar iterations.

But it would be interesting to know why we need the != 0 check - static
cost modelling should have disabled vectorization if the vectorized body
isn't run.

> This patch modifies nb_iterations_upper_bound computation to resolve
> these issues.

You do not adjust the ->nb_iterations_estimate accordingly.

> Running regression testing I got one fail due to optimized loop. Heres
> is a loop:
>
> foo (signed char s)
> {
>   signed char i;
>   for (i = 0; i < s; i++)
> yy[i] = (signed int) i;
> }
>
> Here we vectorize for AVX512 using VF=64.  Original loop has max 127
> iterations and therefore vectorized loop may be executed only once.
> With the patch applied compiler detects it and transforms loop into
> BB with just stores of constants vectors into yy.  Test was adjusted
> to increase number of possible iterations.  A copy of test was added
> to check we can optimize out the original loop.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu.  OK for trunk?

I'd like to see testcases covering the corner-cases - have them have
upper bound estimates by adjusting known array sizes and also cover
the case of peeling for gaps.

Richard.

> Thanks,
> Ilya
> --
> gcc/
>
> 2016-04-21  Ilya Enkovich  
>
> * tree-vect-loop.c (vect_transform_loop): Fix
> nb_iterations_upper_bound computation for vectorized loop.
>
> gcc/testsuite/
>
> 2016-04-21  Ilya Enkovich  
>
> * gcc.target/i386/vect-unpack-2.c (avx512bw_test): Avoid
> optimization of vector loop.
> * gcc.target/i386/vect-unpack-3.c: New test.
>
>
> diff --git a/gcc/testsuite/gcc.target/i386/vect-unpack-2.c 
> b/gcc/testsuite/gcc.target/i386/vect-unpack-2.c
> index 4825248..51c518e 100644
> --- a/gcc/testsuite/gcc.target/i386/vect-unpack-2.c
> +++ b/gcc/testsuite/gcc.target/i386/vect-unpack-2.c
> @@ -6,19 +6,22 @@
>
>  #define N 120
>  signed int yy[1];
> +signed char zz[1];
>
>  void
> -__attribute__ ((noinline)) foo (signed char s)
> +__attribute__ ((noinline,noclone)) foo (int s)
>  {
> -   signed char i;
> +   int i;
> for (i = 0; i < s; i++)
> - yy[i] = (signed int) i;
> + yy[i] = zz[i];
>  }
>
>  void
>  avx512bw_test ()
>  {
>signed char i;
> +  for (i = 0; i < N; i++)
> +zz[i] = i;
>foo (N);
>for (i = 0; i < N; i++)
>  if ( (signed int)i != yy [i] )
> diff --git a/gcc/testsuite/gcc.target/i386/vect-unpack-3.c 
> b/gcc/testsuite/gcc.target/i386/vect-unpack-3.c
> new file mode 100644
> index 000..eb8a93e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect-unpack-3.c
> @@ -0,0 +1,29 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fdump-tree-vect-details -ftree-vectorize -ffast-math 
> -mavx512bw -save-temps" } */
> +/* { dg-require-effective-target avx512bw } */
> +
> +#include "avx512bw-check.h"
> +
> +#define N 120
> +signed int yy[1];
> +
> +void
> +__attribute__ ((noinline)) foo (signed char s)
> +{
> +   signed char i;
> +   for (i = 0; i < s; i++)
> + yy[i] = (signed int) i;
> +}
> +
> +void
> +avx512bw_test ()
> +{
> +  signed char i;
> +  foo (N);
> +  for (i = 0; i < N; i++)
> +if ( (signed int)i != yy [i] )
> +  abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-assembler-not "vpmovsxbw\[ \\t\]+\[^\n\]*%zmm" } } */
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index d813b86..da98211 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -6921,11 +6921,13 @@ vect_transform_loop (loop_vec_info loop_vinfo)
>/* Reduce loop iterations by the vectorization factor.  */
>scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
>   expected_iterations / vectorization_factor);
> -  loop->nb_iterations_upper_bound
> -= wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
>if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
>&& loop->nb_iterations_upper_bound != 0)
>  loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
> +  

[PATCH] Fixup nb_iterations_upper_bound adjustment for vectorized loops

2016-04-21 Thread Ilya Enkovich
Hi,

Currently when loop is vectorized we adjust its nb_iterations_upper_bound
by dividing it by VF.  This is incorrect since nb_iterations_upper_bound
is upper bound for ( - 1) and therefore simple
dividing it by VF in many cases gives us bounds greater than a real one.
Correct value would be ((nb_iterations_upper_bound + 1) / VF - 1).

Also decrement due to peeling for gaps should happen before we scale it
by VF because peeling applies to a scalar loop, not vectorized one.

This patch modifies nb_iterations_upper_bound computation to resolve
these issues.

Running regression testing I got one fail due to optimized loop. Heres
is a loop:

foo (signed char s)
{
  signed char i;
  for (i = 0; i < s; i++)
yy[i] = (signed int) i;
}

Here we vectorize for AVX512 using VF=64.  Original loop has max 127
iterations and therefore vectorized loop may be executed only once.
With the patch applied compiler detects it and transforms loop into
BB with just stores of constants vectors into yy.  Test was adjusted
to increase number of possible iterations.  A copy of test was added
to check we can optimize out the original loop.

Bootstrapped and regtested on x86_64-pc-linux-gnu.  OK for trunk?

Thanks,
Ilya
--
gcc/

2016-04-21  Ilya Enkovich  

* tree-vect-loop.c (vect_transform_loop): Fix
nb_iterations_upper_bound computation for vectorized loop.

gcc/testsuite/

2016-04-21  Ilya Enkovich  

* gcc.target/i386/vect-unpack-2.c (avx512bw_test): Avoid
optimization of vector loop.
* gcc.target/i386/vect-unpack-3.c: New test.


diff --git a/gcc/testsuite/gcc.target/i386/vect-unpack-2.c 
b/gcc/testsuite/gcc.target/i386/vect-unpack-2.c
index 4825248..51c518e 100644
--- a/gcc/testsuite/gcc.target/i386/vect-unpack-2.c
+++ b/gcc/testsuite/gcc.target/i386/vect-unpack-2.c
@@ -6,19 +6,22 @@
 
 #define N 120
 signed int yy[1];
+signed char zz[1];
 
 void
-__attribute__ ((noinline)) foo (signed char s)
+__attribute__ ((noinline,noclone)) foo (int s)
 {
-   signed char i;
+   int i;
for (i = 0; i < s; i++)
- yy[i] = (signed int) i;
+ yy[i] = zz[i];
 }
 
 void
 avx512bw_test ()
 {
   signed char i;
+  for (i = 0; i < N; i++)
+zz[i] = i;
   foo (N);
   for (i = 0; i < N; i++)
 if ( (signed int)i != yy [i] )
diff --git a/gcc/testsuite/gcc.target/i386/vect-unpack-3.c 
b/gcc/testsuite/gcc.target/i386/vect-unpack-3.c
new file mode 100644
index 000..eb8a93e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-unpack-3.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-tree-vect-details -ftree-vectorize -ffast-math 
-mavx512bw -save-temps" } */
+/* { dg-require-effective-target avx512bw } */
+
+#include "avx512bw-check.h"
+
+#define N 120
+signed int yy[1];
+
+void
+__attribute__ ((noinline)) foo (signed char s)
+{
+   signed char i;
+   for (i = 0; i < s; i++)
+ yy[i] = (signed int) i;
+}
+
+void
+avx512bw_test ()
+{
+  signed char i;
+  foo (N);
+  for (i = 0; i < N; i++)
+if ( (signed int)i != yy [i] )
+  abort ();
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-assembler-not "vpmovsxbw\[ \\t\]+\[^\n\]*%zmm" } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index d813b86..da98211 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -6921,11 +6921,13 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   /* Reduce loop iterations by the vectorization factor.  */
   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
  expected_iterations / vectorization_factor);
-  loop->nb_iterations_upper_bound
-= wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
   && loop->nb_iterations_upper_bound != 0)
 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
+  loop->nb_iterations_upper_bound
+= wi::udiv_floor (loop->nb_iterations_upper_bound + 1,
+ vectorization_factor) - 1;
+
   if (loop->any_estimate)
 {
   loop->nb_iterations_estimate