Hi,
this patch fixes the compilation of self-dependent loops in oacc kernels
regions.
First, consider a simple vector addition:
...
#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
{
for (COUNTERTYPE ii = 0; ii < N; ii++)
c[ii] = a[ii] + b[ii];
}
...
Until now (or rather, until the introduction of
transform_to_exit_first_loop_alt), the loop body of such a loop was
parallelized and executed by different gangs, but the last iteration of
the loop body was executed by all the gangs (due to
transform_to_exit_first_loop). This did not lead to wrong results for
this loop, because executing the statement 'c[N-1] = a[N-1] + b[N-1]'
once or 32 times did not change the results.
For self-dependent loops, however, we do get wrong results:
...
#pragma acc kernels copy (c[0:N])
{
for (COUNTERTYPE ii = 0; ii < N; ii++)
c[ii] = c[ii] + ii + 1;
}
...
The introduction of transform_to_exit_first_loop_alt fixed that
correctness problem for this example. But
transform_to_exit_first_loop_alt does not always succeed. This patch
makes parallelization fail if transform_to_exit_first_loop_alt fails,
making sure we don't run into the same problem again.
Furthermore, the patch replaces pass_copy_prop with pass_fre. I found
this necessary at some point and added it to the patch, but I can no
longer reproduce the need for it, so I'll revert that bit ASAP.
Committed to gomp-4_0-branch.
Thanks,
- Tom
Only use transform_to_exit_first_loop_alt for kernels
2015-05-28 Tom de Vries <t...@codesourcery.com>
* passes.def: Replace pass_copy_prop with pass_fre. Surround with
pass_tree_loop_done and pass_tree_loop_init.
* tree-parloops.c (gen_parallel_loop): For oacc kernels, set n_threads
to 1 (i.e. bail out of parallelization) if
try_transform_to_exit_first_loop_alt fails.
* c-c++-common/goacc/kernels-loop-3.c: New test.
* testsuite/libgomp.oacc-c-c++-common/kernels-loop-3.c: New test.
diff --git a/gcc/passes.def b/gcc/passes.def
index da497ed..6be71db3 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -96,7 +96,9 @@ along with GCC; see the file COPYING3. If not see
NEXT_PASS (pass_fre);
NEXT_PASS (pass_tree_loop_init);
NEXT_PASS (pass_lim);
- NEXT_PASS (pass_copy_prop);
+ NEXT_PASS (pass_tree_loop_done);
+ NEXT_PASS (pass_fre);
+ NEXT_PASS (pass_tree_loop_init);
NEXT_PASS (pass_scev_cprop);
NEXT_PASS (pass_parallelize_loops_oacc_kernels);
NEXT_PASS (pass_expand_omp_ssa);
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-3.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3.c
new file mode 100644
index 0000000..57375db
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3.c
@@ -0,0 +1,48 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int i;
+
+ unsigned int *__restrict c;
+
+ c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ c[i] = i * 2;
+
+#pragma acc kernels copy (c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = c[ii] + ii + 1;
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != i * 2 + i + 1)
+ abort ();
+
+ free (c);
+
+ return 0;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)pragma omp target oacc_parallel.*num_gangs\\(32\\)" 1 "parloops_oacc_kernels" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index f698eea..72877ee 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -2338,6 +2338,9 @@ gen_parallel_loop (struct loop *loop,
iterations of the loop by one. */
if (!try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
{
+ if (oacc_kernels_p)
+ n_threads = 1;
+
/* Fall back on the method that handles more cases, but duplicates the
loop body: move the exit condition of LOOP to the beginning of its
header, and duplicate the part of the last iteration that gets disabled
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-3.c
new file mode 100644
index 0000000..7084711
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-3.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2" } */
+
+#include <stdlib.h>
+
+#define N (1024 * 512)
+#define COUNTERTYPE unsigned int
+
+int
+main (void)
+{
+ unsigned int i;
+
+ unsigned int *__restrict c;
+
+ c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ c[i] = i * 2;
+
+#pragma acc kernels copy (c[0:N])
+ {
+ for (COUNTERTYPE ii = 0; ii < N; ii++)
+ c[ii] = c[ii] + ii + 1;
+ }
+
+ for (COUNTERTYPE i = 0; i < N; i++)
+ if (c[i] != i * 2 + i + 1)
+ abort ();
+
+ free (c);
+
+ return 0;
+}
--
1.9.1