[committed] Address compiler diagnostics in libgomp.oacc-c-c++-common/pr87835.c (was: [committed][nvptx, libgomp] Fix map_push)

2019-05-08 Thread Thomas Schwinge
Hi!

On Wed, 23 Jan 2019 09:19:33 +0100, Tom de Vries  wrote:
> The map field of a struct ptx_stream is [...]

> The current implemention gets at least the first and most basic scenario 
> wrong:
> [...]

> This problem causes the test-case asyncwait-1.c to fail intermittently on some
> systems.  The pr87835.c test-case added here is a a minimized and modified
> version of asyncwait-1.c (avoiding the kernel construct) that is more likely 
> to
> fail.

Indeed, with one OpenACC directive fixed (see below), I've been able to
reliably reproduce the failure, too, for all optimization levels I tried.

> Fix this by rewriting map_pop more robustly, by: [...]

Thanks, belatedly.

Regarding the test case:

> --- /dev/null
> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr87835.c
> @@ -0,0 +1,62 @@
> +/* { dg-do run { target openacc_nvidia_accel_selected } } */
> +/* { dg-additional-options "-lcuda" } */
> +
> +#include 
> +#include 
> +#include "cuda.h"
> +
> +#include 
> +
> +#define n 128
> +
> +int
> +main (void)
> +{
> +  CUresult r;
> +  CUstream stream1;
> +  int N = n;
> +  int a[n];
> +  int b[n];

source-gcc/libgomp/testsuite/libgomp.oacc-c-c++-common/pr87835.c:19:7: 
warning: unused variable 'b' [-Wunused-variable]
   19 |   int b[n];
  |   ^

> +  int c[n];
> +
> +  acc_init (acc_device_nvidia);
> +
> +  r = cuStreamCreate (, CU_STREAM_NON_BLOCKING);
> +  if (r != CUDA_SUCCESS)
> +{
> +  fprintf (stderr, "cuStreamCreate failed: %d\n", r);
> +  abort ();
> +}
> +
> +  acc_set_cuda_stream (1, stream1);
> +
> +  for (int i = 0; i < n; i++)
> +{
> +  a[i] = 3;
> +  c[i] = 0;
> +}
> +
> +#pragma acc data copy (a, b, c) copyin (N)
> +  {
> +#pragma acc parallel async (1)
> +;
> +
> +#pragma acc parallel async (1) num_gangs (320)
> +#pragma loop gang

source-gcc/libgomp/testsuite/libgomp.oacc-c-c++-common/pr87835.c:45: 
warning: ignoring #pragma loop gang [-Wunknown-pragmas]
   45 | #pragma loop gang
  | 

> +for (int ii = 0; ii < N; ii++)
> +  c[ii] = (a[ii] + a[N - ii - 1]);
> +
> +#pragma acc parallel async (1)
> +#pragma acc loop seq
> +for (int ii = 0; ii < n; ii++)
> +  a[ii] = 6;
> +
> +#pragma acc wait (1)
> +  }
> +
> +  for (int i = 0; i < n; i++)
> +if (c[i] != 6)
> +  abort ();
> +
> +  return 0;
> +}

Addressed on trunk in r271004, and on gcc-9-branch in r271005, see
attached.


Grüße
 Thomas


From 253ef38b3c248b69e8ab493b19b1585f291c9843 Mon Sep 17 00:00:00 2001
From: tschwinge 
Date: Wed, 8 May 2019 10:01:30 +
Subject: [PATCH] Address compiler diagnostics in
 libgomp.oacc-c-c++-common/pr87835.c

source-gcc/libgomp/testsuite/libgomp.oacc-c-c++-common/pr87835.c: In function 'main':
source-gcc/libgomp/testsuite/libgomp.oacc-c-c++-common/pr87835.c:45: warning: ignoring #pragma loop gang [-Wunknown-pragmas]
   45 | #pragma loop gang
  |
source-gcc/libgomp/testsuite/libgomp.oacc-c-c++-common/pr87835.c:19:7: warning: unused variable 'b' [-Wunused-variable]
   19 |   int b[n];
  |   ^

	libgomp/
	PR target/87835
	* testsuite/libgomp.oacc-c-c++-common/pr87835.c: Update.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@271004 138bc75d-0d04-0410-961f-82ee72b054a4
---
 libgomp/ChangeLog | 5 +
 libgomp/testsuite/libgomp.oacc-c-c++-common/pr87835.c | 5 ++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index 64e0a8ad8df..a8ce3c241fc 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,8 @@
+2019-05-07  Thomas Schwinge  
+
+	PR target/87835
+	* testsuite/libgomp.oacc-c-c++-common/pr87835.c: Update.
+
 2019-05-06  Thomas Schwinge  
 
 	* oacc-parallel.c: Add comments to legacy entry points (GCC 5).
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr87835.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr87835.c
index 310a485e74f..88c2c7763cc 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr87835.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr87835.c
@@ -16,7 +16,6 @@ main (void)
   CUstream stream1;
   int N = n;
   int a[n];
-  int b[n];
   int c[n];
 
   acc_init (acc_device_nvidia);
@@ -36,13 +35,13 @@ main (void)
   c[i] = 0;
 }
 
-#pragma acc data copy (a, b, c) copyin (N)
+#pragma acc data copy (a, c) copyin (N)
   {
 #pragma acc parallel async (1)
 ;
 
 #pragma acc parallel async (1) num_gangs (320)
-#pragma loop gang
+#pragma acc loop gang
 for (int ii = 0; ii < N; ii++)
   c[ii] = (a[ii] + a[N - ii - 1]);
 
-- 
2.17.1

From 9f852e24d6d75f00ccca80acb5a6804912a33282 Mon Sep 17 00:00:00 2001
From: tschwinge 
Date: Wed, 8 May 2019 10:03:04 +
Subject: [PATCH] Address compiler diagnostics in
 libgomp.oacc-c-c++-common/pr87835.c

source-gcc/libgomp/testsuite/libgomp.oacc-c-c++-common/pr87835.c: In function 'main':

[committed][nvptx, libgomp] Fix map_push

2019-01-23 Thread Tom de Vries
Hi,

The map field of a struct ptx_stream is a FIFO.  The FIFO is implemented as a
single linked list, with pop-from-the-front semantics.

The function map_pop pops an element, either by:
- deallocating the element, if there is more than one element
- or marking the element inactive, if there's only one element

The responsibility of map_push is to push an element to the back, as well as
selecting the element to push, by:
- allocating an element, or
- reusing the element at the front if inactive and big enough, or
- dropping the element at the front if inactive and not big enough, and
  allocating one that's big enough

The current implemention gets at least the first and most basic scenario wrong:

> map = cuda_map_create (size);

We create an element, and assign it to map.

> for (t = s->map; t->next != NULL; t = t->next)
>   ;

We determine the last element in the fifo.

> t->next = map;

We append the new element.

> s->map = map;

But here, we throw away the rest of the FIFO, and declare the FIFO to be just
the new element.

This problem causes the test-case asyncwait-1.c to fail intermittently on some
systems.  The pr87835.c test-case added here is a a minimized and modified
version of asyncwait-1.c (avoiding the kernel construct) that is more likely to
fail.

Fix this by rewriting map_pop more robustly, by:
- seperating the function in two phases: select element, push element
- when reusing or dropping an element, making sure that the element is cleanly
  popped from the queue
- rewriting the push element part in such a way that it can handle all cases
  without needing if statements, such that each line is exercised for each of
  the three cases.

Committed to trunk.

Thanks,
- Tom

[nvptx, libgomp] Fix map_push

2019-01-22  Tom de Vries  

PR target/87835
* plugin/plugin-nvptx.c (map_push): Fix adding of allocated element.
* testsuite/libgomp.oacc-c-c++-common/pr87835.c: New test.

---
 libgomp/plugin/plugin-nvptx.c  | 47 +---
 .../testsuite/libgomp.oacc-c-c++-common/pr87835.c  | 62 ++
 2 files changed, 91 insertions(+), 18 deletions(-)

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index dd2bcf3083f..a220560b189 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -296,35 +296,46 @@ map_pop (struct ptx_stream *s)
 static CUdeviceptr
 map_push (struct ptx_stream *s, size_t size)
 {
-  struct cuda_map *map = NULL, *t = NULL;
+  struct cuda_map *map = NULL;
+  struct cuda_map **t;
 
   assert (s);
   assert (s->map);
 
-  /* Each PTX stream requires a separate data region to store the
- launch arguments for cuLaunchKernel.  Allocate a new
- cuda_map and push it to the end of the list.  */
+  /* Select an element to push.  */
   if (s->map->active)
+map = cuda_map_create (size);
+  else
 {
-  map = cuda_map_create (size);
+  /* Pop the inactive front element.  */
+  struct cuda_map *pop = s->map;
+  s->map = pop->next;
+  pop->next = NULL;
 
-  for (t = s->map; t->next != NULL; t = t->next)
-   ;
+  if (pop->size < size)
+   {
+ cuda_map_destroy (pop);
 
-  t->next = map;
-}
-  else if (s->map->size < size)
-{
-  cuda_map_destroy (s->map);
-  map = cuda_map_create (size);
+ map = cuda_map_create (size);
+   }
+  else
+   map = pop;
 }
-  else
-map = s->map;
 
-  s->map = map;
-  s->map->active = true;
+  /* Check that the element is as expected.  */
+  assert (map->next == NULL);
+  assert (!map->active);
+
+  /* Mark the element active.  */
+  map->active = true;
+
+  /* Push the element to the back of the list.  */
+  for (t = >map; (*t) != NULL; t = &(*t)->next)
+;
+  assert (t != NULL && *t == NULL);
+  *t = map;
 
-  return s->map->d;
+  return map->d;
 }
 
 /* Target data function launch information.  */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr87835.c 
b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr87835.c
new file mode 100644
index 000..310a485e74f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr87835.c
@@ -0,0 +1,62 @@
+/* { dg-do run { target openacc_nvidia_accel_selected } } */
+/* { dg-additional-options "-lcuda" } */
+
+#include 
+#include 
+#include "cuda.h"
+
+#include 
+
+#define n 128
+
+int
+main (void)
+{
+  CUresult r;
+  CUstream stream1;
+  int N = n;
+  int a[n];
+  int b[n];
+  int c[n];
+
+  acc_init (acc_device_nvidia);
+
+  r = cuStreamCreate (, CU_STREAM_NON_BLOCKING);
+  if (r != CUDA_SUCCESS)
+{
+  fprintf (stderr, "cuStreamCreate failed: %d\n", r);
+  abort ();
+}
+
+  acc_set_cuda_stream (1, stream1);
+
+  for (int i = 0; i < n; i++)
+{
+  a[i] = 3;
+  c[i] = 0;
+}
+
+#pragma acc data copy (a, b, c) copyin (N)
+  {
+#pragma acc parallel async (1)
+;
+
+#pragma acc parallel async (1) num_gangs (320)
+#pragma loop gang
+for (int ii = 0; ii <