Re: [PATCH GCC13 backport] Avoid compile time hog on vect_peel_nonlinear_iv_init for nonlinear induction vec_step_op_mul when iteration count is too big.

2023-10-26 Thread Richard Biener



> Am 24.10.2023 um 13:22 schrieb liuhongt :
> 
> This is the backport patch for the releases/gcc-13 branch; the original
> patch for main trunk is at [1].
> The only difference between this backport patch and [1] is that GCC 13
> doesn't support auto_mpz, so this patch uses mpz_init/mpz_clear manually.
> 
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-October/633661.html
> 
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
> Ok for backport to releases/gcc-13?

Ok.

Richard 

> There's a loop in vect_peel_nonlinear_iv_init to compute init_expr *
> pow (step_expr, skip_niters). When skip_niters is too big, that loop is a
> compile-time hog. To avoid that, optimize init_expr * pow (step_expr,
> skip_niters) to init_expr << (exact_log2 (step_expr) * skip_niters) when
> step_expr is a power of 2; otherwise give up on vectorization when
> skip_niters >= TYPE_PRECISION (TREE_TYPE (init_expr)).
> 
> Also give up on vectorization when niters_skip is negative, which is used
> for fully masked loops.
> 
> gcc/ChangeLog:
> 
>	PR tree-optimization/111820
>	PR tree-optimization/111833
>	* tree-vect-loop-manip.cc (vect_can_peel_nonlinear_iv_p): Give
>	up vectorization for nonlinear iv vect_step_op_mul when
>	step_expr is not exact_log2 and niters is greater than
>	TYPE_PRECISION (TREE_TYPE (step_expr)). Also don't vectorize
>	for negative niters_skip which will be used by fully masked
>	loop.
>	(vect_can_advance_ivs_p): Pass whole phi_info to
>	vect_can_peel_nonlinear_iv_p.
>	* tree-vect-loop.cc (vect_peel_nonlinear_iv_init): Optimize
>	init_expr * pow (step_expr, skipn) to init_expr
>	<< (log2 (step_expr) * skipn) when step_expr is exact_log2.
> 
> gcc/testsuite/ChangeLog:
> 
>	* gcc.target/i386/pr111820-1.c: New test.
>	* gcc.target/i386/pr111820-2.c: New test.
>	* gcc.target/i386/pr111820-3.c: New test.
>	* gcc.target/i386/pr103144-mul-1.c: Adjust testcase.
>	* gcc.target/i386/pr103144-mul-2.c: Adjust testcase.
> ---
> .../gcc.target/i386/pr103144-mul-1.c  |  8 +++---
> .../gcc.target/i386/pr103144-mul-2.c  |  8 +++---
> gcc/testsuite/gcc.target/i386/pr111820-1.c| 16 +++
> gcc/testsuite/gcc.target/i386/pr111820-2.c| 16 +++
> gcc/testsuite/gcc.target/i386/pr111820-3.c| 16 +++
> gcc/tree-vect-loop-manip.cc   | 28 +--
> gcc/tree-vect-loop.cc | 21 +++---
> 7 files changed, 98 insertions(+), 15 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-2.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-3.c
> 
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
> index 640c34fd959..913d7737dcd 100644
> --- a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
> @@ -11,7 +11,7 @@ foo_mul (int* a, int b)
>   for (int i = 0; i != N; i++)
> {
>   a[i] = b;
> -  b *= 3;
> +  b *= 4;
> }
> }
> 
> @@ -23,7 +23,7 @@ foo_mul_const (int* a)
>   for (int i = 0; i != N; i++)
> {
>   a[i] = b;
> -  b *= 3;
> +  b *= 4;
> }
> }
> 
> @@ -34,7 +34,7 @@ foo_mul_peel (int* a, int b)
>   for (int i = 0; i != 39; i++)
> {
>   a[i] = b;
> -  b *= 3;
> +  b *= 4;
> }
> }
> 
> @@ -46,6 +46,6 @@ foo_mul_peel_const (int* a)
>   for (int i = 0; i != 39; i++)
> {
>   a[i] = b;
> -  b *= 3;
> +  b *= 4;
> }
> }
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
> index 39fdea3a69d..b2ff186e335 100644
> --- a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
> @@ -16,12 +16,12 @@ avx2_test (void)
> 
>   __builtin_memset (epi32_exp, 0, N * sizeof (int));
>   int b = 8;
> -  v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, b * 729, b * 2187 };
> +  v8si init = __extension__(v8si) { b, b * 4, b * 16, b * 64, b * 256, b * 1024, b * 4096, b * 16384 };
> 
>   for (int i = 0; i != N / 8; i++)
> {
>   memcpy (epi32_exp + i * 8, &init, 32);
> -  init *= 6561;
> +  init *= 65536;
> }
> 
>   foo_mul (epi32_dst, b);
> @@ -32,11 +32,11 @@ avx2_test (void)
>   if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * 4) != 0)
> __builtin_abort ();
> 
> -  init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 };
> +  init = __extension__(v8si) { 1, 4, 16, 64, 256, 1024, 4096, 16384 };
>   for (int i = 0; i != N / 8; i++)
> {
>   memcpy (epi32_exp + i * 8, &init, 32);
> -  init *= 6561;
> +  init *= 65536;
> }
> 
>   foo_mul_const (epi32_dst);
> diff --git a/gcc/testsuite/gcc.target/i386/pr111820-1.c b/gcc/testsuite/gcc.target/i386/pr111820-1.c
> new file mode 100644
> index 000..50e960c39d4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr111820-1.c

[PATCH GCC13 backport] Avoid compile time hog on vect_peel_nonlinear_iv_init for nonlinear induction vec_step_op_mul when iteration count is too big.

2023-10-24 Thread liuhongt
This is the backport patch for the releases/gcc-13 branch; the original
patch for main trunk is at [1].
The only difference between this backport patch and [1] is that GCC 13
doesn't support auto_mpz, so this patch uses mpz_init/mpz_clear manually.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-October/633661.html

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for backport to releases/gcc-13?

There's a loop in vect_peel_nonlinear_iv_init to compute init_expr *
pow (step_expr, skip_niters). When skip_niters is too big, that loop is a
compile-time hog. To avoid that, optimize init_expr * pow (step_expr,
skip_niters) to init_expr << (exact_log2 (step_expr) * skip_niters) when
step_expr is a power of 2; otherwise give up on vectorization when
skip_niters >= TYPE_PRECISION (TREE_TYPE (init_expr)).

Also give up on vectorization when niters_skip is negative, which is used
for fully masked loops.
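
For illustration, below is a minimal standalone sketch (plain C, not GCC
internals; the helper names and the use of __builtin_ctz are invented for
this example) of why the shift form is equivalent for a power-of-two step
and needs no loop over skip_niters:

/* Minimal sketch, not GCC code: peel_init_mul mirrors the old loop that
   multiplies skip_niters times, peel_init_shift computes the same wrapped
   value with a single shift when step is a power of two.  */
#include <assert.h>
#include <stdint.h>

static uint32_t
peel_init_mul (uint32_t init, uint32_t step, uint64_t skip_niters)
{
  uint32_t v = init;
  for (uint64_t i = 0; i < skip_niters; i++)   /* compile-time hog when huge.  */
    v *= step;
  return v;
}

static uint32_t
peel_init_shift (uint32_t init, uint32_t step, uint64_t skip_niters)
{
  uint32_t log2_step = __builtin_ctz (step);   /* step must be a power of 2.  */
  uint64_t shift = (uint64_t) log2_step * skip_niters;
  /* Shifting past the type precision leaves no bits, so the result is 0.  */
  return shift >= 32 ? 0 : init << shift;
}

int
main (void)
{
  assert (peel_init_mul (8, 4, 10) == peel_init_shift (8, 4, 10));
  assert (peel_init_shift (8, 4, 1000000000) == 0);   /* no loop needed.  */
  return 0;
}

Once log2 (step_expr) * skip_niters reaches the type precision the wrapped
product is known to be zero, so the power-of-two case never has to iterate;
for a non-power-of-two step the patch instead gives up once skip_niters >=
TYPE_PRECISION (TREE_TYPE (init_expr)).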

gcc/ChangeLog:

	PR tree-optimization/111820
	PR tree-optimization/111833
	* tree-vect-loop-manip.cc (vect_can_peel_nonlinear_iv_p): Give
	up vectorization for nonlinear iv vect_step_op_mul when
	step_expr is not exact_log2 and niters is greater than
	TYPE_PRECISION (TREE_TYPE (step_expr)). Also don't vectorize
	for negative niters_skip which will be used by fully masked
	loop.
	(vect_can_advance_ivs_p): Pass whole phi_info to
	vect_can_peel_nonlinear_iv_p.
	* tree-vect-loop.cc (vect_peel_nonlinear_iv_init): Optimize
	init_expr * pow (step_expr, skipn) to init_expr
	<< (log2 (step_expr) * skipn) when step_expr is exact_log2.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr111820-1.c: New test.
	* gcc.target/i386/pr111820-2.c: New test.
	* gcc.target/i386/pr111820-3.c: New test.
	* gcc.target/i386/pr103144-mul-1.c: Adjust testcase.
	* gcc.target/i386/pr103144-mul-2.c: Adjust testcase.
---
 .../gcc.target/i386/pr103144-mul-1.c  |  8 +++---
 .../gcc.target/i386/pr103144-mul-2.c  |  8 +++---
 gcc/testsuite/gcc.target/i386/pr111820-1.c| 16 +++
 gcc/testsuite/gcc.target/i386/pr111820-2.c| 16 +++
 gcc/testsuite/gcc.target/i386/pr111820-3.c| 16 +++
 gcc/tree-vect-loop-manip.cc   | 28 +--
 gcc/tree-vect-loop.cc | 21 +++---
 7 files changed, 98 insertions(+), 15 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-3.c

diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
index 640c34fd959..913d7737dcd 100644
--- a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
@@ -11,7 +11,7 @@ foo_mul (int* a, int b)
   for (int i = 0; i != N; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
 
@@ -23,7 +23,7 @@ foo_mul_const (int* a)
   for (int i = 0; i != N; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
 
@@ -34,7 +34,7 @@ foo_mul_peel (int* a, int b)
   for (int i = 0; i != 39; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
 
@@ -46,6 +46,6 @@ foo_mul_peel_const (int* a)
   for (int i = 0; i != 39; i++)
 {
   a[i] = b;
-  b *= 3;
+  b *= 4;
 }
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
index 39fdea3a69d..b2ff186e335 100644
--- a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c
@@ -16,12 +16,12 @@ avx2_test (void)
 
   __builtin_memset (epi32_exp, 0, N * sizeof (int));
   int b = 8;
-  v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, b * 729, b * 2187 };
+  v8si init = __extension__(v8si) { b, b * 4, b * 16, b * 64, b * 256, b * 1024, b * 4096, b * 16384 };
 
   for (int i = 0; i != N / 8; i++)
 {
   memcpy (epi32_exp + i * 8, &init, 32);
-  init *= 6561;
+  init *= 65536;
 }
 
   foo_mul (epi32_dst, b);
@@ -32,11 +32,11 @@ avx2_test (void)
   if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * 4) != 0)
 __builtin_abort ();
 
-  init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 };
+  init = __extension__(v8si) { 1, 4, 16, 64, 256, 1024, 4096, 16384 };
   for (int i = 0; i != N / 8; i++)
 {
   memcpy (epi32_exp + i * 8, &init, 32);
-  init *= 6561;
+  init *= 65536;
 }
 
   foo_mul_const (epi32_dst);
diff --git a/gcc/testsuite/gcc.target/i386/pr111820-1.c b/gcc/testsuite/gcc.target/i386/pr111820-1.c
new file mode 100644
index 000..50e960c39d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111820-1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -fno-tree-vrp -Wno-aggressive-loop-optimizations -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump "Avoid