Hi Tobias! On 2023-07-25T23:45:54+0200, Tobias Burnus <tob...@codesourcery.com> wrote: > The attached patch calls CUDA's cuMemcpy2D and cuMemcpy3D > for omp_target_memcpy_rect{,_async} for dim=2/dim=3. This should > speed up the data transfer for noncontiguous data.
ACK, thanks. > While being there, I ended up adding support for device to other device > copying; while potentially slow, it is still better than not being able to > copy - and with shared-memory, it shouldn't be that bad. Makes sense, I guess. > Comments, suggestions, remarks? > If there are none, will commit it... You're so quick -- I'm so slow... ;-) I've not verified all the logic in here, but I've got a few comments. > Disclaimer: While I have done correctness tests (system with two nvptx GPUs, > I have not done any performance tests. Well, we should, eventually. > (I also tested it without offloading > configured, but that's rather boring.) > OpenMP: Call cuMemcpy2D/cuMemcpy3D for nvptx for omp_target_memcpy_rect > > When copying a 2D or 3D rectangular memory block, the performance is > better when using CUDA's cuMemcpy2D/cuMemcpy3D instead of copying the > data one by one. That's what this commit does. So you've actually done some performance verification? > Additionally, it permits device-to-device copies, if necessary using a > temporary variable on the host. > --- a/include/cuda/cuda.h > +++ b/include/cuda/cuda.h I note that you're not actually using everything you're adding here. (..., but I understand you're simply adding everything that relates to these 'cuMemcpy[...]' routines -- OK as far as I'm concerned.) 
> @@ -47,6 +47,7 @@ typedef void *CUevent; > typedef void *CUfunction; > typedef void *CUlinkState; > typedef void *CUmodule; > +typedef void *CUarray; > typedef size_t (*CUoccupancyB2DSize)(int); > typedef void *CUstream; > > @@ -54,7 +55,10 @@ typedef enum { > CUDA_SUCCESS = 0, > CUDA_ERROR_INVALID_VALUE = 1, > CUDA_ERROR_OUT_OF_MEMORY = 2, > + CUDA_ERROR_NOT_INITIALIZED = 3, > + CUDA_ERROR_DEINITIALIZED = 4, > CUDA_ERROR_INVALID_CONTEXT = 201, > + CUDA_ERROR_INVALID_HANDLE = 400, > CUDA_ERROR_NOT_FOUND = 500, > CUDA_ERROR_NOT_READY = 600, > CUDA_ERROR_LAUNCH_FAILED = 719, > @@ -126,6 +130,75 @@ typedef enum { > CU_LIMIT_MALLOC_HEAP_SIZE = 0x02, > } CUlimit; > > +typedef enum { > + CU_MEMORYTYPE_HOST = 0x01, > + CU_MEMORYTYPE_DEVICE = 0x02, > + CU_MEMORYTYPE_ARRAY = 0x03, > + CU_MEMORYTYPE_UNIFIED = 0x04 > +} CUmemorytype; > + > +typedef struct { > + size_t srcXInBytes, srcY; > + CUmemorytype srcMemoryType; > + const void *srcHost; > + CUdeviceptr srcDevice; > + CUarray srcArray; > + size_t srcPitch; > + > + size_t dstXInBytes, dstY; > + CUmemorytype dstMemoryType; > + const void *dstHost; That last one isn't 'const'. ;-) > + CUdeviceptr dstDevice; > + CUarray dstArray; > + size_t dstPitch; > + > + size_t WidthInBytes, Height; > +} CUDA_MEMCPY2D; > + > +typedef struct { > + size_t srcXInBytes, srcY, srcZ; > + size_t srcLOD; > + CUmemorytype srcMemoryType; > + const void *srcHost; > + CUdeviceptr srcDevice; > + CUarray srcArray; > + void *dummy; A 'cuda.h' that I looked at calls that last one 'reserved0', with comment "Must be NULL". > + size_t srcPitch, srcHeight; > + > + size_t dstXInBytes, dstY, dstZ; > + size_t dstLOD; > + CUmemorytype dstMemoryType; > + const void *dstHost; Again, not 'const'. > + CUdeviceptr dstDevice; > + CUarray dstArray; > + void *dummy2; Similar to above: 'reserved1', with comment "Must be NULL". 
> + size_t dstPitch, dstHeight; > + > + size_t WidthInBytes, Height, Depth; > +} CUDA_MEMCPY3D; > + > +typedef struct { > + size_t srcXInBytes, srcY, srcZ; > + size_t srcLOD; > + CUmemorytype srcMemoryType; > + const void *srcHost; > + CUdeviceptr srcDevice; > + CUarray srcArray; > + CUcontext srcContext; > + size_t srcPitch, srcHeight; > + > + size_t dstXInBytes, dstY, dstZ; > + size_t dstLOD; > + CUmemorytype dstMemoryType; > + const void *dstHost; > + CUdeviceptr dstDevice; > + CUarray dstArray; > + CUcontext dstContext; > + size_t dstPitch, dstHeight; > + > + size_t WidthInBytes, Height, Depth; > +} CUDA_MEMCPY3D_PEER; > + > #define cuCtxCreate cuCtxCreate_v2 > CUresult cuCtxCreate (CUcontext *, unsigned, CUdevice); > #define cuCtxDestroy cuCtxDestroy_v2 > @@ -183,6 +256,18 @@ CUresult cuMemcpyDtoHAsync (void *, CUdeviceptr, size_t, > CUstream); > CUresult cuMemcpyHtoD (CUdeviceptr, const void *, size_t); > #define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2 > CUresult cuMemcpyHtoDAsync (CUdeviceptr, const void *, size_t, CUstream); > +#define cuMemcpy2D cuMemcpy2D_v2 > +CUresult cuMemcpy2D (const CUDA_MEMCPY2D *); > +#define cuMemcpy2DAsync cuMemcpy2DAsync_v2 > +CUresult cuMemcpy2DAsync (const CUDA_MEMCPY2D *, CUstream); > +#define cuMemcpy2DUnaligned cuMemcpy2DUnaligned_v2 > +CUresult cuMemcpy2DUnaligned (const CUDA_MEMCPY2D *); > +#define cuMemcpy3D cuMemcpy3D_v2 > +CUresult cuMemcpy3D (const CUDA_MEMCPY3D *); > +#define cuMemcpy3DAsync cuMemcpy3DAsync_v2 > +CUresult cuMemcpy3DAsync (const CUDA_MEMCPY3D *, CUstream); > +CUresult cuMemcpy3DPeer (const CUDA_MEMCPY3D_PEER *); > +CUresult cuMemcpy3DPeerAsync (const CUDA_MEMCPY3D_PEER *, CUstream); So these are the 'v2' variants, unconditionally. (Accompanied by the 'v2' data types defined above.) But that'll be OK, as I see these already in CUDA 6.5 'cuda.h'. 
> --- a/libgomp/libgomp-plugin.h > +++ b/libgomp/libgomp-plugin.h > +extern int GOMP_OFFLOAD_memcpy2d (int, int, size_t, size_t, > + void*, size_t, size_t, size_t, > + const void*, size_t, size_t, size_t); > +extern int GOMP_OFFLOAD_memcpy3d (int, int, size_t, size_t, size_t, void *, > + size_t, size_t, size_t, size_t, size_t, > + const void *, size_t, size_t, size_t, size_t, > + size_t); Oh, wow. ;-) > --- a/libgomp/plugin/plugin-nvptx.c > +++ b/libgomp/plugin/plugin-nvptx.c > @@ -1781,6 +1781,122 @@ GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void > *src, size_t n) > return true; > } > > +int > +GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size, > + size_t dim0_len, void *dst, size_t dst_offset1_size, > + size_t dst_offset0_len, size_t dst_dim1_size, > + const void *src, size_t src_offset1_size, > + size_t src_offset0_len, size_t src_dim1_size) > +{ > + if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : > dst_ord)) > + return false; > + > + /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. 
*/ > + > + CUDA_MEMCPY2D data; > + data.WidthInBytes = dim1_size; > + data.Height = dim0_len; > + > + if (dst_ord == -1) > + { > + data.dstMemoryType = CU_MEMORYTYPE_HOST; > + data.dstHost = dst; > + } > + else > + { > + data.dstMemoryType = CU_MEMORYTYPE_DEVICE; > + data.dstDevice = (CUdeviceptr) dst; > + } > + data.dstPitch = dst_dim1_size; > + data.dstXInBytes = dst_offset1_size; > + data.dstY = dst_offset0_len; > + > + if (src_ord == -1) > + { > + data.srcMemoryType = CU_MEMORYTYPE_HOST; > + data.srcHost = src; > + } > + else > + { > + data.srcMemoryType = CU_MEMORYTYPE_DEVICE; > + data.srcDevice = (CUdeviceptr) src; > + } > + data.srcPitch = src_dim1_size; > + data.srcXInBytes = src_offset1_size; > + data.srcY = src_offset0_len; > + > + CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data); > + if (res == CUDA_ERROR_INVALID_VALUE) > + /* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device > + for (some) memory not allocated by cuMemAllocPitch, cuMemcpy2D fails > + with an error; try the slower cuMemcpy2DUnaligned now. */ > + CUDA_CALL (cuMemcpy2DUnaligned, &data); > + else if (res != CUDA_SUCCESS) > + { > + GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res)); > + return false; > + } > + return true; > +} > + > +int > +GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size, > + size_t dim1_len, size_t dim0_len, void *dst, > + size_t dst_offset2_size, size_t dst_offset1_len, > + size_t dst_offset0_len, size_t dst_dim2_size, > + size_t dst_dim1_len, const void *src, > + size_t src_offset2_size, size_t src_offset1_len, > + size_t src_offset0_len, size_t src_dim2_size, > + size_t src_dim1_len) > +{ > + if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : > dst_ord)) > + return false; > + > + /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. 
 */ > + CUDA_MEMCPY3D data; I note that this doesn't adhere to the two "Must be NULL" remarks from above -- but I'm confused, because, for example, on <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4b5238975579f002c0199a3800ca44df> "cuMemcpy3D", there also are no such remarks. (... in contrast to: "srcLOD and dstLOD members of the CUDA_MEMCPY3D structure must be set to 0", which you do set accordingly.) Maybe just 'memset' the whole thing to '0', for good measure? > + data.WidthInBytes = dim2_size; > + data.Height = dim1_len; > + data.Depth = dim0_len; > + > + if (dst_ord == -1) > + { > + data.dstMemoryType = CU_MEMORYTYPE_HOST; > + data.dstHost = dst; > + } > + else > + { > + data.dstMemoryType = CU_MEMORYTYPE_DEVICE; > + data.dstDevice = (CUdeviceptr) dst; > + } > + data.dstPitch = dst_dim2_size; > + data.dstHeight = dst_dim1_len; > + data.dstXInBytes = dst_offset2_size; > + data.dstY = dst_offset1_len; > + data.dstZ = dst_offset0_len; > + data.dstLOD = 0; > + > + if (src_ord == -1) > + { > + data.srcMemoryType = CU_MEMORYTYPE_HOST; > + data.srcHost = src; > + } > + else > + { > + data.srcMemoryType = CU_MEMORYTYPE_DEVICE; > + data.srcDevice = (CUdeviceptr) src; > + } > + data.srcPitch = src_dim2_size; > + data.srcHeight = src_dim1_len; > + data.srcXInBytes = src_offset2_size; > + data.srcY = src_offset1_len; > + data.srcZ = src_offset0_len; > + data.srcLOD = 0; > + > + CUDA_CALL (cuMemcpy3D, &data); > + return true; > +} > --- a/libgomp/target.c > +++ b/libgomp/target.c > @@ -4526,7 +4526,8 @@ omp_target_memcpy_rect_worker (void *dst, const void > *src, size_t element_size, > const size_t *dst_dimensions, > const size_t *src_dimensions, > struct gomp_device_descr *dst_devicep, > - struct gomp_device_descr *src_devicep) > + struct gomp_device_descr *src_devicep, > + size_t *tmp_size, void **tmp) > { > size_t dst_slice = element_size; > size_t src_slice = element_size; > @@ -4539,36 +4540,120 @@ 
omp_target_memcpy_rect_worker (void *dst, const void > *src, size_t element_size, > || __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off) > || __builtin_mul_overflow (element_size, src_offsets[0], &src_off)) > return EINVAL; > - if (dst_devicep == NULL && src_devicep == NULL) > - { > - memcpy ((char *) dst + dst_off, (const char *) src + src_off, > - length); > - ret = 1; > - } > - else if (src_devicep == NULL) > - ret = dst_devicep->host2dev_func (dst_devicep->target_id, > + if (src_devicep != NULL && src_devicep == dst_devicep) > + ret = src_devicep->dev2dev_func (src_devicep->target_id, > + (char *) dst + dst_off, > + (const char *) src + src_off, > + length); So you moved up the intra-device case up here... > + else if (src_devicep != NULL > + && (dst_devicep == NULL > + || (dst_devicep->capabilities > + & GOMP_OFFLOAD_CAP_SHARED_MEM))) Are these 'GOMP_OFFLOAD_CAP_SHARED_MEM' actually reachable, given that 'omp_target_memcpy_check' (via 'omp_target_memcpy_rect_check') clears out the device to 'NULL' for 'GOMP_OFFLOAD_CAP_SHARED_MEM'? In other words, was the original code already doing what you've implemented here? 
> + ret = src_devicep->dev2host_func (src_devicep->target_id, > (char *) dst + dst_off, > (const char *) src + src_off, > length); > - else if (dst_devicep == NULL) > - ret = src_devicep->dev2host_func (src_devicep->target_id, > + else if (dst_devicep != NULL > + && (src_devicep == NULL > + || (src_devicep->capabilities > + & GOMP_OFFLOAD_CAP_SHARED_MEM))) > + ret = dst_devicep->host2dev_func (dst_devicep->target_id, > (char *) dst + dst_off, > (const char *) src + src_off, > length); > + else if (dst_devicep == NULL && src_devicep == NULL) > + { > + memcpy ((char *) dst + dst_off, (const char *) src + src_off, > + length); > + ret = 1; > + } > else if (src_devicep == dst_devicep) > ret = src_devicep->dev2dev_func (src_devicep->target_id, > (char *) dst + dst_off, > (const char *) src + src_off, > length); ..., but also left the intra-device case here -- which should now be dead code here? > else > - ret = 0; > + { > + if (*tmp_size == 0) > + { > + *tmp_size = length; > + *tmp = malloc (length); > + if (*tmp == NULL) > + return ENOMEM; > + } > + else if (*tmp_size < length) > + { > + *tmp_size = length; > + *tmp = realloc (*tmp, length); > + if (*tmp == NULL) > + return ENOMEM; If 'realloc' returns 'NULL', we should 'free' the original '*tmp'? Do we really need here the property here that if the re-allocation can't be done in-place, 'realloc' copies the original content to the new? In other words, should we just unconditionally 'free' and re-'malloc' here, instead of 'realloc'? I haven't looked whether the re-use of 'tmp' for multiple calls to this is then actually useful, or whether we should just always 'malloc', use, 'free' the buffer here? > + } > + ret = src_devicep->dev2host_func (src_devicep->target_id, *tmp, > + (const char *) src + src_off, > + length); > + if (ret == 1) > + ret = dst_devicep->host2dev_func (dst_devicep->target_id, > + (char *) dst + dst_off, *tmp, > + length); > + } > return ret ? 0 : EINVAL; (For later...) 
Is that what <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1ge1f5c7771544fee150ada8853c7cbf4a> "cuMemcpyPeer" would be used for? > } > > - /* FIXME: it would be nice to have some plugin function to handle > - num_dims == 2 and num_dims == 3 more efficiently. Larger ones can > - be handled in the generic recursion below, and for host-host it > - should be used even for any num_dims >= 2. */ > + /* host->device, device->host and same-device device->device. */ > + if (num_dims == 2 > + && ((src_devicep > + && src_devicep == dst_devicep > + && src_devicep->memcpy2d_func) > + || (!src_devicep != !dst_devicep > + && ((src_devicep && src_devicep->memcpy2d_func) > + || (dst_devicep && dst_devicep->memcpy2d_func))))) > + { > + size_t vol_sz1, dst_sz1, src_sz1, dst_off_sz1, src_off_sz1; > + int dst_id = dst_devicep ? dst_devicep->target_id : -1; > + int src_id = src_devicep ? src_devicep->target_id : -1; > + struct gomp_device_descr *devp = dst_devicep ? dst_devicep : > src_devicep; > + > + if (__builtin_mul_overflow (volume[1], element_size, &vol_sz1) > + || __builtin_mul_overflow (dst_dimensions[1], element_size, &dst_sz1) > + || __builtin_mul_overflow (src_dimensions[1], element_size, &src_sz1) > + || __builtin_mul_overflow (dst_offsets[1], element_size, &dst_off_sz1) > + || __builtin_mul_overflow (src_offsets[1], element_size, > + &src_off_sz1)) > + return EINVAL; > + ret = devp->memcpy2d_func (dst_id, src_id, vol_sz1, volume[0], > + dst, dst_off_sz1, dst_offsets[0], dst_sz1, > + src, src_off_sz1, src_offsets[0], src_sz1); > + if (ret != -1) > + return ret ? 0 : EINVAL; > + } > + else if (num_dims == 3 > + && ((src_devicep > + && src_devicep == dst_devicep > + && src_devicep->memcpy3d_func) > + || (!src_devicep != !dst_devicep > + && ((src_devicep && src_devicep->memcpy3d_func) > + || (dst_devicep && dst_devicep->memcpy3d_func))))) > + { > + size_t vol_sz2, dst_sz2, src_sz2, dst_off_sz2, src_off_sz2; > + int dst_id = dst_devicep ? 
dst_devicep->target_id : -1; > + int src_id = src_devicep ? src_devicep->target_id : -1; > + struct gomp_device_descr *devp = dst_devicep ? dst_devicep : > src_devicep; > + > + if (__builtin_mul_overflow (volume[2], element_size, &vol_sz2) > + || __builtin_mul_overflow (dst_dimensions[2], element_size, &dst_sz2) > + || __builtin_mul_overflow (src_dimensions[2], element_size, &src_sz2) > + || __builtin_mul_overflow (dst_offsets[2], element_size, &dst_off_sz2) > + || __builtin_mul_overflow (src_offsets[2], element_size, > + &src_off_sz2)) > + return EINVAL; > + ret = devp->memcpy3d_func (dst_id, src_id, vol_sz2, volume[1], > volume[0], > + dst, dst_off_sz2, dst_offsets[1], > + dst_offsets[0], dst_sz2, dst_dimensions[1], > + src, src_off_sz2, src_offsets[1], > + src_offsets[0], src_sz2, src_dimensions[1]); > + if (ret != -1) > + return ret ? 0 : EINVAL; > + } > > for (i = 1; i < num_dims; i++) > if (__builtin_mul_overflow (dst_slice, dst_dimensions[i], &dst_slice) > @@ -4585,7 +4670,7 @@ omp_target_memcpy_rect_worker (void *dst, const void > *src, size_t element_size, > volume + 1, dst_offsets + 1, > src_offsets + 1, dst_dimensions + 1, > src_dimensions + 1, dst_devicep, > - src_devicep); > + src_devicep, tmp_size, tmp); > if (ret) > return ret; > dst_off += dst_slice; > @@ -4608,9 +4693,6 @@ omp_target_memcpy_rect_check (void *dst, const void > *src, int dst_device_num, > if (ret) > return ret; > > - if (*src_devicep != NULL && *dst_devicep != NULL && *src_devicep != > *dst_devicep) > - return EINVAL; > - > return 0; > } > > @@ -4624,18 +4706,36 @@ omp_target_memcpy_rect_copy (void *dst, const void > *src, > struct gomp_device_descr *dst_devicep, > struct gomp_device_descr *src_devicep) > { > - if (src_devicep) > + size_t tmp_size = 0; > + void *tmp = NULL; > + bool lock_src; > + bool lock_dst; > + > + lock_src = (src_devicep > + && (!dst_devicep > + || src_devicep == dst_devicep > + || !(src_devicep->capabilities > + & GOMP_OFFLOAD_CAP_SHARED_MEM))); Similar doubt 
than above re "'GOMP_OFFLOAD_CAP_SHARED_MEM' actually reachable"? > + lock_dst = (dst_devicep > + && (!lock_src > + || (src_devicep != dst_devicep > + && !(dst_devicep->capabilities > + & GOMP_OFFLOAD_CAP_SHARED_MEM)))); > + if (lock_src) > gomp_mutex_lock (&src_devicep->lock); > - else if (dst_devicep) > + if (lock_dst) > gomp_mutex_lock (&dst_devicep->lock); (Pre-existing issue, and I've not myself tried to figure out the details at this time -- why do we actually lock the devices here, and in similar other places?) > int ret = omp_target_memcpy_rect_worker (dst, src, element_size, num_dims, > volume, dst_offsets, src_offsets, > dst_dimensions, src_dimensions, > - dst_devicep, src_devicep); > - if (src_devicep) > + dst_devicep, src_devicep, > + &tmp_size, &tmp); > + if (lock_src) > gomp_mutex_unlock (&src_devicep->lock); > - else if (dst_devicep) > + if (lock_dst) > gomp_mutex_unlock (&dst_devicep->lock); > + if (tmp) > + free (tmp); > > return ret; > } > @@ -4976,6 +5076,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr > *device, > DLSYM (free); > DLSYM (dev2host); > DLSYM (host2dev); > + DLSYM (memcpy2d); > + DLSYM (memcpy3d); With 'DLSYM' used here, won't that fail if these symbols don't actually exist (like for 'libgomp/plugin/plugin-gcn.c')? I'm attaching the humble beginnings of a follow-on patch; feel free to use. Grüße Thomas ----------------- Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
>From ed85b983970fb42be2e1db172a0d7e20c484ed06 Mon Sep 17 00:00:00 2001 From: Thomas Schwinge <tho...@codesourcery.com> Date: Thu, 27 Jul 2023 15:46:26 +0200 Subject: [PATCH] Re: OpenMP: Call cuMemcpy2D/cuMemcpy3D for nvptx for omp_target_memcpy_rect --- include/cuda/cuda.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h index 09c3c2b8dbe..d556a130379 100644 --- a/include/cuda/cuda.h +++ b/include/cuda/cuda.h @@ -147,7 +147,7 @@ typedef struct { size_t dstXInBytes, dstY; CUmemorytype dstMemoryType; - const void *dstHost; + void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; size_t dstPitch; @@ -162,16 +162,16 @@ typedef struct { const void *srcHost; CUdeviceptr srcDevice; CUarray srcArray; - void *dummy; + void *reserved0; size_t srcPitch, srcHeight; size_t dstXInBytes, dstY, dstZ; size_t dstLOD; CUmemorytype dstMemoryType; - const void *dstHost; + void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; - void *dummy2; + void *reserved1; size_t dstPitch, dstHeight; size_t WidthInBytes, Height, Depth; -- 2.34.1