On 07/31/2018 05:27 PM, Cesar Philippidis wrote:
> At present, libgomp is using CUDA unified memory only as a buffer pass
> to the struct containing the pointers to the data mappings to the
> offloaded functions. I'm not sure why unified memory is needed here if
> it is still being managed explicitly by the driver.
> 
> This patch removes the use of CUDA unified memory from the driver. I
> don't recall observing any reduction in performance. Besides,
> eventually, we'd like to eliminate the struct containing all pointers to
> the offloaded data mappings and pass those pointers as individual
> function arguments to cuLaunchKernel directly.
> 
> Is this patch OK for trunk? I bootstrapped and regression tested it for
> x86_64 with nvptx offloading.
> 
> Thanks,
> Cesar
> 
> 
> 0001-nvptx-Remove-use-of-CUDA-unified-memory-in-libgomp.patch
> 
> 
> [PATCH] [nvptx] Remove use of CUDA unified memory in libgomp
> 
> 2018-XX-YY  Cesar Philippidis  <ce...@codesourcery.com>
> 
>       libgomp/
>       * plugin/plugin-nvptx.c (struct cuda_map): New.
>       (struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev,
>       h_tail with (cuda_map *) map.
>       (cuda_map_create): New function.
>       (cuda_map_destroy): New function.
>       (map_init): Update to use a linked list of cuda_map objects.
>       (map_fini): Likewise.
>       (map_pop): Likewise.
>       (map_push): Likewise.  Return CUdeviceptr instead of void.
>       (init_streams_for_device): Remove stales references to ptx_stream
>       members.
>       (select_stream_for_async): Likewise.
>       (nvptx_exec): Update call to map_init.
> 
> (cherry picked from gomp-4_0-branch r242614)
> ---
>  libgomp/plugin/plugin-nvptx.c | 167 
> +++++++++++++++++++++++-------------------
>  1 file changed, 90 insertions(+), 77 deletions(-)
> 
> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
> index 1237ea10..d79ddf1 100644
> --- a/libgomp/plugin/plugin-nvptx.c
> +++ b/libgomp/plugin/plugin-nvptx.c
> @@ -200,20 +200,20 @@ cuda_error (CUresult r)
>  static unsigned int instantiated_devices = 0;
>  static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
>  
> +struct cuda_map
> +{
> +  CUdeviceptr d;
> +  size_t size;
> +  bool active;
> +  struct cuda_map *next;
> +};
> +
>  struct ptx_stream
>  {
>    CUstream stream;
>    pthread_t host_thread;
>    bool multithreaded;
> -
> -  CUdeviceptr d;
> -  void *h;
> -  void *h_begin;
> -  void *h_end;
> -  void *h_next;
> -  void *h_prev;
> -  void *h_tail;
> -
> +  struct cuda_map *map;
>    struct ptx_stream *next;
>  };
>  
> @@ -225,101 +225,114 @@ struct nvptx_thread
>    struct ptx_device *ptx_dev;
>  };
>  
> +static struct cuda_map *
> +cuda_map_create (size_t size)
> +{
> +  struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
> +
> +  assert (map);
> +
> +  map->next = NULL;
> +  map->size = size;
> +  map->active = false;
> +
> +  CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
> +  assert (map->d);
> +
> +  return map;
> +}
> +
> +static void
> +cuda_map_destroy (struct cuda_map *map)
> +{
> +  CUDA_CALL_ASSERT (cuMemFree, map->d);
> +  free (map);
> +}
> +
> +/* The following map_* routines manage the CUDA device memory that
> +   contains the data mapping arguments for cuLaunchKernel.  Each
> +   asynchronous PTX stream may have multiple pending kernel
> +   invocations, which are launched in a FIFO order.  As such, the map
> +   routines maintains a queue of cuLaunchKernel arguments.
> +
> +   Calls to map_push and map_pop must be guarded by ptx_event_lock.
> +   Likewise, calls to map_init and map_fini are guarded by
> +   ptx_dev_lock inside GOMP_OFFLOAD_init_device and
> +   GOMP_OFFLOAD_fini_device, respectively.  */
> +
>  static bool
>  map_init (struct ptx_stream *s)
>  {
>    int size = getpagesize ();
>  
>    assert (s);
> -  assert (!s->d);
> -  assert (!s->h);
> -
> -  CUDA_CALL (cuMemAllocHost, &s->h, size);
> -  CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
>  
> -  assert (s->h);
> +  s->map = cuda_map_create (size);
>  
> -  s->h_begin = s->h;
> -  s->h_end = s->h_begin + size;
> -  s->h_next = s->h_prev = s->h_tail = s->h_begin;
> -
> -  assert (s->h_next);
> -  assert (s->h_end);
>    return true;
>  }
>  
>  static bool
>  map_fini (struct ptx_stream *s)
>  {
> -  CUDA_CALL (cuMemFreeHost, s->h);
> +  assert (s->map->next == NULL);
> +  assert (!s->map->active);
> +
> +  cuda_map_destroy (s->map);
> +
>    return true;
>  }
>  
>  static void
>  map_pop (struct ptx_stream *s)
>  {
> -  assert (s != NULL);
> -  assert (s->h_next);
> -  assert (s->h_prev);
> -  assert (s->h_tail);
> -
> -  s->h_tail = s->h_next;
> -
> -  if (s->h_tail >= s->h_end)
> -    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
> +  struct cuda_map *next;
>  
> -  if (s->h_next == s->h_tail)
> -    s->h_prev = s->h_next;
> +  assert (s != NULL);
>  
> -  assert (s->h_next >= s->h_begin);
> -  assert (s->h_tail >= s->h_begin);
> -  assert (s->h_prev >= s->h_begin);
> +  if (s->map->next == NULL)
> +    {
> +      s->map->active = false;
> +      return;
> +    }
>  
> -  assert (s->h_next <= s->h_end);
> -  assert (s->h_tail <= s->h_end);
> -  assert (s->h_prev <= s->h_end);
> +  next = s->map->next;
> +  cuda_map_destroy (s->map);
> +  s->map = next;
>  }
>  
> -static void
> -map_push (struct ptx_stream *s, size_t size, void **h, void **d)
> +static CUdeviceptr
> +map_push (struct ptx_stream *s, size_t size)
>  {
> -  int left;
> -  int offset;
> +  struct cuda_map *map = NULL, *t = NULL;
>  
> -  assert (s != NULL);
> +  assert (s);
> +  assert (s->map);
>  
> -  left = s->h_end - s->h_next;
> +  /* Each PTX stream requires a separate data region to store the
> +     launch arguments for cuLaunchKernel.  Allocate a new
> +     cuda_map and push it to the end of the list.  */
> +  if (s->map->active)
> +    {
> +      map = cuda_map_create (size);
>  
> -  assert (s->h_prev);
> -  assert (s->h_next);
> +      for (t = s->map; t->next != NULL; t = t->next)
> +     ;
>  
> -  if (size >= left)
> +      t->next = map;
> +    }
> +  else if (s->map->size < size)
>      {
> -      assert (s->h_next == s->h_prev);
> -      s->h_next = s->h_prev = s->h_tail = s->h_begin;
> +      cuda_map_destroy (s->map);
> +      map = cuda_map_create (size);
>      }
> +  else
> +    map = s->map;
>  
> -  assert (s->h_next);
> -
> -  offset = s->h_next - s->h;
> -
> -  *d = (void *)(s->d + offset);
> -  *h = (void *)(s->h + offset);
> -
> -  s->h_prev = s->h_next;
> -  s->h_next += size;
> -
> -  assert (s->h_prev);
> -  assert (s->h_next);
> -
> -  assert (s->h_next >= s->h_begin);
> -  assert (s->h_tail >= s->h_begin);
> -  assert (s->h_prev >= s->h_begin);
> -  assert (s->h_next <= s->h_end);
> -  assert (s->h_tail <= s->h_end);
> -  assert (s->h_prev <= s->h_end);
> +  s->map = map;
> +  s->map->active = true;
>  
> -  return;
> +  return s->map->d;
>  }
>  
>  /* Target data function launch information.  */
> @@ -451,8 +464,6 @@ init_streams_for_device (struct ptx_device *ptx_dev, int 
> concurrency)
>    null_stream->stream = NULL;
>    null_stream->host_thread = pthread_self ();
>    null_stream->multithreaded = true;
> -  null_stream->d = (CUdeviceptr) NULL;
> -  null_stream->h = NULL;
>    if (!map_init (null_stream))
>      return false;
>  
> @@ -587,8 +598,6 @@ select_stream_for_async (int async, pthread_t thread, 
> bool create,
>         s->host_thread = thread;
>         s->multithreaded = false;
>  
> -       s->d = (CUdeviceptr) NULL;
> -       s->h = NULL;
>         if (!map_init (s))
>           {
>             pthread_mutex_unlock (&ptx_dev->stream_lock);
> @@ -1123,7 +1132,8 @@ nvptx_exec (void (*fn), size_t mapnum, void 
> **hostaddrs, void **devaddrs,
>    int i;
>    struct ptx_stream *dev_str;
>    void *kargs[1];
> -  void *hp, *dp;
> +  void *hp;
> +  CUdeviceptr dp;
>    struct nvptx_thread *nvthd = nvptx_thread ();
>    int warp_size = nvthd->ptx_dev->warp_size;
>    const char *maybe_abort_msg = "(perhaps abort was called)";
> @@ -1270,17 +1280,20 @@ nvptx_exec (void (*fn), size_t mapnum, void 
> **hostaddrs, void **devaddrs,
>    /* This reserves a chunk of a pre-allocated page of memory mapped on both
>       the host and the device. HP is a host pointer to the new chunk, and DP 
> is
>       the corresponding device pointer.  */
> -  map_push (dev_str, mapnum * sizeof (void *), &hp, &dp);
> +  pthread_mutex_lock (&ptx_event_lock);
> +  dp = map_push (dev_str, mapnum * sizeof (void *));
> +  pthread_mutex_unlock (&ptx_event_lock);
>  
>    GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
>  
>    /* Copy the array of arguments to the mapped page.  */
> +  hp = alloca(sizeof(void *) * mapnum);
>    for (i = 0; i < mapnum; i++)
>      ((void **) hp)[i] = devaddrs[i];
>  
>    /* Copy the (device) pointers to arguments to the device (dp and hp might 
> in
>       fact have the same value on a unified-memory system).  */

This comment needs to be updated, right?

> -  CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
> +  CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
>                   mapnum * sizeof (void *));
>    GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
>                    " gangs=%u, workers=%u, vectors=%u\n",
> -- 2.7.4
> 

Otherwise OK.

Thanks,
- Tom

Reply via email to