At present, libgomp uses CUDA unified memory (a host-mapped buffer
allocated with cuMemAllocHost) only to pass the struct containing the
pointers to the data mappings to the offloaded functions. I'm not sure
why unified memory is needed here, given that the buffer is still
managed explicitly by the plugin.
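
For reference, here is the change in allocation strategy, condensed
from the patch below into a minimal sketch; the function names are
mine, and a live CUDA context plus error checking are assumed:

  #include <cuda.h>
  #include <unistd.h>	/* For getpagesize.  */

  /* Old scheme (removed by the patch): one page of host-mapped memory
     per stream, reachable through both a host pointer (*H) and the
     returned device pointer.  */
  static CUdeviceptr
  old_scheme (void **h)
  {
    CUdeviceptr d;
    cuMemAllocHost (h, getpagesize ());
    cuMemHostGetDevicePointer (&d, *h, 0);
    return d;
  }

  /* New scheme: plain device memory, filled by an explicit
     host-to-device copy of the argument block at launch time.  */
  static CUdeviceptr
  new_scheme (const void *args, size_t size)
  {
    CUdeviceptr d;
    cuMemAlloc (&d, size);
    cuMemcpyHtoD (d, args, size);
    return d;
  }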

This patch removes the use of CUDA unified memory from the nvptx
plugin, replacing it with ordinary device memory and an explicit
host-to-device copy of the kernel arguments. I don't recall observing
any reduction in performance. Moreover, we'd eventually like to
eliminate the struct containing all of the pointers to the offloaded
data mappings and instead pass those pointers as individual function
arguments to cuLaunchKernel directly, as sketched below.
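
To make that concrete, here is a rough sketch of what such a launch
could look like. This is not part of this patch, the helper is
hypothetical, and the PTX kernels would first have to be generated to
accept their arguments individually:

  #include <alloca.h>
  #include <cuda.h>

  /* Today the plugin passes a single kernel argument: the device
     address of the block holding all of the mapping pointers.  The
     idea is to skip that block and pass each pointer on its own via
     cuLaunchKernel's kernelParams array.  */
  static CUresult
  launch_with_individual_args (CUfunction fn, CUstream stream,
			       void **devaddrs, size_t mapnum,
			       unsigned teams, unsigned threads)
  {
    size_t i;
    /* kernelParams[i] must point to the storage holding argument i;
       here each argument is itself a device pointer.  */
    void **kargs = alloca (mapnum * sizeof (void *));

    for (i = 0; i < mapnum; i++)
      kargs[i] = &devaddrs[i];

    return cuLaunchKernel (fn, teams, 1, 1,	/* Grid dims.  */
			   32, threads, 1,	/* Block dims.  */
			   0, stream, kargs, NULL);
  }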

Is this patch OK for trunk? I bootstrapped and regression-tested it on
x86_64 with nvptx offloading.

Thanks,
Cesar
[PATCH] [nvptx] Remove use of CUDA unified memory in libgomp

2018-XX-YY  Cesar Philippidis  <ce...@codesourcery.com>

	libgomp/
	* plugin/plugin-nvptx.c (struct cuda_map): New.
	(struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev,
	h_tail with (cuda_map *) map.
	(cuda_map_create): New function.
	(cuda_map_destroy): New function.
	(map_init): Update to use a linked list of cuda_map objects.
	(map_fini): Likewise.
	(map_pop): Likewise.
	(map_push): Likewise.  Return CUdeviceptr instead of void.
	(init_streams_for_device): Remove stale references to removed
	ptx_stream members.
	(select_stream_for_async): Likewise.
	(nvptx_exec): Update call to map_push, guarding it with
	ptx_event_lock.  Build the kernel argument array in a host
	buffer and copy it to the device with cuMemcpyHtoD.

(cherry picked from gomp-4_0-branch r242614)
---
 libgomp/plugin/plugin-nvptx.c | 167 +++++++++++++++++++++++-------------------
 1 file changed, 90 insertions(+), 77 deletions(-)

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 1237ea10..d79ddf1 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -200,20 +200,20 @@ cuda_error (CUresult r)
 static unsigned int instantiated_devices = 0;
 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 
+struct cuda_map
+{
+  CUdeviceptr d;
+  size_t size;
+  bool active;
+  struct cuda_map *next;
+};
+
 struct ptx_stream
 {
   CUstream stream;
   pthread_t host_thread;
   bool multithreaded;
-
-  CUdeviceptr d;
-  void *h;
-  void *h_begin;
-  void *h_end;
-  void *h_next;
-  void *h_prev;
-  void *h_tail;
-
+  struct cuda_map *map;
   struct ptx_stream *next;
 };
 
@@ -225,101 +225,114 @@ struct nvptx_thread
   struct ptx_device *ptx_dev;
 };
 
+static struct cuda_map *
+cuda_map_create (size_t size)
+{
+  struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
+
+  assert (map);
+
+  map->next = NULL;
+  map->size = size;
+  map->active = false;
+
+  CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
+  assert (map->d);
+
+  return map;
+}
+
+static void
+cuda_map_destroy (struct cuda_map *map)
+{
+  CUDA_CALL_ASSERT (cuMemFree, map->d);
+  free (map);
+}
+
+/* The following map_* routines manage the CUDA device memory that
+   contains the data mapping arguments for cuLaunchKernel.  Each
+   asynchronous PTX stream may have multiple pending kernel
+   invocations, which are launched in a FIFO order.  As such, the map
+   routines maintain a queue of cuLaunchKernel arguments.
+
+   Calls to map_push and map_pop must be guarded by ptx_event_lock.
+   Likewise, calls to map_init and map_fini are guarded by
+   ptx_dev_lock inside GOMP_OFFLOAD_init_device and
+   GOMP_OFFLOAD_fini_device, respectively.  */
+
 static bool
 map_init (struct ptx_stream *s)
 {
   int size = getpagesize ();
 
   assert (s);
-  assert (!s->d);
-  assert (!s->h);
-
-  CUDA_CALL (cuMemAllocHost, &s->h, size);
-  CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
 
-  assert (s->h);
+  s->map = cuda_map_create (size);
 
-  s->h_begin = s->h;
-  s->h_end = s->h_begin + size;
-  s->h_next = s->h_prev = s->h_tail = s->h_begin;
-
-  assert (s->h_next);
-  assert (s->h_end);
   return true;
 }
 
 static bool
 map_fini (struct ptx_stream *s)
 {
-  CUDA_CALL (cuMemFreeHost, s->h);
+  assert (s->map->next == NULL);
+  assert (!s->map->active);
+
+  cuda_map_destroy (s->map);
+
   return true;
 }
 
 static void
 map_pop (struct ptx_stream *s)
 {
-  assert (s != NULL);
-  assert (s->h_next);
-  assert (s->h_prev);
-  assert (s->h_tail);
-
-  s->h_tail = s->h_next;
-
-  if (s->h_tail >= s->h_end)
-    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
+  struct cuda_map *next;
 
-  if (s->h_next == s->h_tail)
-    s->h_prev = s->h_next;
+  assert (s != NULL);
 
-  assert (s->h_next >= s->h_begin);
-  assert (s->h_tail >= s->h_begin);
-  assert (s->h_prev >= s->h_begin);
+  if (s->map->next == NULL)
+    {
+      s->map->active = false;
+      return;
+    }
 
-  assert (s->h_next <= s->h_end);
-  assert (s->h_tail <= s->h_end);
-  assert (s->h_prev <= s->h_end);
+  next = s->map->next;
+  cuda_map_destroy (s->map);
+  s->map = next;
 }
 
-static void
-map_push (struct ptx_stream *s, size_t size, void **h, void **d)
+static CUdeviceptr
+map_push (struct ptx_stream *s, size_t size)
 {
-  int left;
-  int offset;
+  struct cuda_map *map = NULL, *t = NULL;
 
-  assert (s != NULL);
+  assert (s);
+  assert (s->map);
 
-  left = s->h_end - s->h_next;
+  /* Each pending kernel launch on this stream needs its own data
+     region for its cuLaunchKernel arguments.  If the current map is
+     still active, allocate a new cuda_map and append it to the list.  */
+  if (s->map->active)
+    {
+      map = cuda_map_create (size);
 
-  assert (s->h_prev);
-  assert (s->h_next);
+      for (t = s->map; t->next != NULL; t = t->next)
+	;
 
-  if (size >= left)
+      t->next = map;
+    }
+  else if (s->map->size < size)
     {
-      assert (s->h_next == s->h_prev);
-      s->h_next = s->h_prev = s->h_tail = s->h_begin;
+      cuda_map_destroy (s->map);
+      map = cuda_map_create (size);
+      s->map = map;
     }
+  else
+    map = s->map;
 
-  assert (s->h_next);
-
-  offset = s->h_next - s->h;
-
-  *d = (void *)(s->d + offset);
-  *h = (void *)(s->h + offset);
-
-  s->h_prev = s->h_next;
-  s->h_next += size;
-
-  assert (s->h_prev);
-  assert (s->h_next);
-
-  assert (s->h_next >= s->h_begin);
-  assert (s->h_tail >= s->h_begin);
-  assert (s->h_prev >= s->h_begin);
-  assert (s->h_next <= s->h_end);
-  assert (s->h_tail <= s->h_end);
-  assert (s->h_prev <= s->h_end);
+  map->active = true;
 
-  return;
+  return map->d;
 }
 
 /* Target data function launch information.  */
@@ -451,8 +464,6 @@ init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
   null_stream->stream = NULL;
   null_stream->host_thread = pthread_self ();
   null_stream->multithreaded = true;
-  null_stream->d = (CUdeviceptr) NULL;
-  null_stream->h = NULL;
   if (!map_init (null_stream))
     return false;
 
@@ -587,8 +598,6 @@ select_stream_for_async (int async, pthread_t thread, bool create,
 	  s->host_thread = thread;
 	  s->multithreaded = false;
 
-	  s->d = (CUdeviceptr) NULL;
-	  s->h = NULL;
 	  if (!map_init (s))
 	    {
 	      pthread_mutex_unlock (&ptx_dev->stream_lock);
@@ -1123,7 +1132,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   int i;
   struct ptx_stream *dev_str;
   void *kargs[1];
-  void *hp, *dp;
+  void *hp;
+  CUdeviceptr dp;
   struct nvptx_thread *nvthd = nvptx_thread ();
   int warp_size = nvthd->ptx_dev->warp_size;
   const char *maybe_abort_msg = "(perhaps abort was called)";
@@ -1270,17 +1280,20 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
-  /* This reserves a chunk of a pre-allocated page of memory mapped on both
-     the host and the device. HP is a host pointer to the new chunk, and DP is
-     the corresponding device pointer.  */
-  map_push (dev_str, mapnum * sizeof (void *), &hp, &dp);
+  /* This reserves a chunk of device memory to hold the pointers to
+     the data mappings for this launch.  DP is the device pointer to
+     the new chunk.  */
+  pthread_mutex_lock (&ptx_event_lock);
+  dp = map_push (dev_str, mapnum * sizeof (void *));
+  pthread_mutex_unlock (&ptx_event_lock);
 
   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
 
-  /* Copy the array of arguments to the mapped page.  */
+  /* Build the array of kernel arguments in a temporary host buffer.  */
+  hp = alloca (sizeof (void *) * mapnum);
   for (i = 0; i < mapnum; i++)
     ((void **) hp)[i] = devaddrs[i];
 
-  /* Copy the (device) pointers to arguments to the device (dp and hp might in
-     fact have the same value on a unified-memory system).  */
-  CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
+  /* Copy the (device) argument pointers from the host buffer to the
+     device.  */
+  CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
 		    mapnum * sizeof (void *));
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 		     " gangs=%u, workers=%u, vectors=%u\n",
-- 
2.7.4
