Hi Tobias! On 2023-07-25T23:45:54+0200, Tobias Burnus <tob...@codesourcery.com> wrote: > The attached patch calls CUDA's cuMemcpy2D and cuMemcpy3D > for omp_target_memcpy_rect{,_async} for dim=2/dim=3. This should > speed up the data transfer for noncontiguous data.
ACK, thanks. > While being there, I ended up adding support for device to other device > copying; while potentially slow, it is still better than not being able to > copy - and with shared-memory, it shouldn't be that bad. Makes sense, I guess. > Comments, suggestions, remarks? > If there are none, will commit it... You're so quick -- I'm so slow... ;-) I've not verified all the logic in here, but I've got a few comments. > Disclaimer: While I have done correctness tests (system with two nvptx GPUs, > I have not done any performance tests. Well, we should, eventually. > (I also tested it without offloading > configured, but that's rather boring.) > OpenMP: Call cuMemcpy2D/cuMemcpy3D for nvptx for omp_target_memcpy_rect > > When copying a 2D or 3D rectangular memory block, the performance is > better when using CUDA's cuMemcpy2D/cuMemcpy3D instead of copying the > data one by one. That's what this commit does. So you've actually done some performance verification? > Additionally, it permits device-to-device copies, if necessary using a > temporary variable on the host. > --- a/include/cuda/cuda.h > +++ b/include/cuda/cuda.h I note that you're not actually using everything you're adding here. (..., but I understand you're simply adding everything that relates to these 'cuMemcpy[...]' routines -- OK as far as I'm concerned.) 
> @@ -47,6 +47,7 @@ typedef void *CUevent; > typedef void *CUfunction; > typedef void *CUlinkState; > typedef void *CUmodule; > +typedef void *CUarray; > typedef size_t (*CUoccupancyB2DSize)(int); > typedef void *CUstream; > > @@ -54,7 +55,10 @@ typedef enum { > CUDA_SUCCESS = 0, > CUDA_ERROR_INVALID_VALUE = 1, > CUDA_ERROR_OUT_OF_MEMORY = 2, > + CUDA_ERROR_NOT_INITIALIZED = 3, > + CUDA_ERROR_DEINITIALIZED = 4, > CUDA_ERROR_INVALID_CONTEXT = 201, > + CUDA_ERROR_INVALID_HANDLE = 400, > CUDA_ERROR_NOT_FOUND = 500, > CUDA_ERROR_NOT_READY = 600, > CUDA_ERROR_LAUNCH_FAILED = 719, > @@ -126,6 +130,75 @@ typedef enum { > CU_LIMIT_MALLOC_HEAP_SIZE = 0x02, > } CUlimit; > > +typedef enum { > + CU_MEMORYTYPE_HOST = 0x01, > + CU_MEMORYTYPE_DEVICE = 0x02, > + CU_MEMORYTYPE_ARRAY = 0x03, > + CU_MEMORYTYPE_UNIFIED = 0x04 > +} CUmemorytype; > + > +typedef struct { > + size_t srcXInBytes, srcY; > + CUmemorytype srcMemoryType; > + const void *srcHost; > + CUdeviceptr srcDevice; > + CUarray srcArray; > + size_t srcPitch; > + > + size_t dstXInBytes, dstY; > + CUmemorytype dstMemoryType; > + const void *dstHost; That last one isn't 'const'. ;-) > + CUdeviceptr dstDevice; > + CUarray dstArray; > + size_t dstPitch; > + > + size_t WidthInBytes, Height; > +} CUDA_MEMCPY2D; > + > +typedef struct { > + size_t srcXInBytes, srcY, srcZ; > + size_t srcLOD; > + CUmemorytype srcMemoryType; > + const void *srcHost; > + CUdeviceptr srcDevice; > + CUarray srcArray; > + void *dummy; A 'cuda.h' that I looked at calls that last one 'reserved0', with comment "Must be NULL". > + size_t srcPitch, srcHeight; > + > + size_t dstXInBytes, dstY, dstZ; > + size_t dstLOD; > + CUmemorytype dstMemoryType; > + const void *dstHost; Again, not 'const'. > + CUdeviceptr dstDevice; > + CUarray dstArray; > + void *dummy2; Similar to above: 'reserved1', with comment "Must be NULL". 
> + size_t dstPitch, dstHeight; > + > + size_t WidthInBytes, Height, Depth; > +} CUDA_MEMCPY3D; > + > +typedef struct { > + size_t srcXInBytes, srcY, srcZ; > + size_t srcLOD; > + CUmemorytype srcMemoryType; > + const void *srcHost; > + CUdeviceptr srcDevice; > + CUarray srcArray; > + CUcontext srcContext; > + size_t srcPitch, srcHeight; > + > + size_t dstXInBytes, dstY, dstZ; > + size_t dstLOD; > + CUmemorytype dstMemoryType; > + const void *dstHost; > + CUdeviceptr dstDevice; > + CUarray dstArray; > + CUcontext dstContext; > + size_t dstPitch, dstHeight; > + > + size_t WidthInBytes, Height, Depth; > +} CUDA_MEMCPY3D_PEER; > + > #define cuCtxCreate cuCtxCreate_v2 > CUresult cuCtxCreate (CUcontext *, unsigned, CUdevice); > #define cuCtxDestroy cuCtxDestroy_v2 > @@ -183,6 +256,18 @@ CUresult cuMemcpyDtoHAsync (void *, CUdeviceptr, size_t, > CUstream); > CUresult cuMemcpyHtoD (CUdeviceptr, const void *, size_t); > #define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2 > CUresult cuMemcpyHtoDAsync (CUdeviceptr, const void *, size_t, CUstream); > +#define cuMemcpy2D cuMemcpy2D_v2 > +CUresult cuMemcpy2D (const CUDA_MEMCPY2D *); > +#define cuMemcpy2DAsync cuMemcpy2DAsync_v2 > +CUresult cuMemcpy2DAsync (const CUDA_MEMCPY2D *, CUstream); > +#define cuMemcpy2DUnaligned cuMemcpy2DUnaligned_v2 > +CUresult cuMemcpy2DUnaligned (const CUDA_MEMCPY2D *); > +#define cuMemcpy3D cuMemcpy3D_v2 > +CUresult cuMemcpy3D (const CUDA_MEMCPY3D *); > +#define cuMemcpy3DAsync cuMemcpy3DAsync_v2 > +CUresult cuMemcpy3DAsync (const CUDA_MEMCPY3D *, CUstream); > +CUresult cuMemcpy3DPeer (const CUDA_MEMCPY3D_PEER *); > +CUresult cuMemcpy3DPeerAsync (const CUDA_MEMCPY3D_PEER *, CUstream); So these are the 'v2' variants, unconditionally. (Accompanied by the 'v2' data types defined above.) But that'll be OK, as I see these already in CUDA 6.5 'cuda.h'. 
> --- a/libgomp/libgomp-plugin.h > +++ b/libgomp/libgomp-plugin.h > +extern int GOMP_OFFLOAD_memcpy2d (int, int, size_t, size_t, > + void*, size_t, size_t, size_t, > + const void*, size_t, size_t, size_t); > +extern int GOMP_OFFLOAD_memcpy3d (int, int, size_t, size_t, size_t, void *, > + size_t, size_t, size_t, size_t, size_t, > + const void *, size_t, size_t, size_t, size_t, > + size_t); Oh, wow. ;-) > --- a/libgomp/plugin/plugin-nvptx.c > +++ b/libgomp/plugin/plugin-nvptx.c > @@ -1781,6 +1781,122 @@ GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void > *src, size_t n) > return true; > } > > +int > +GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size, > + size_t dim0_len, void *dst, size_t dst_offset1_size, > + size_t dst_offset0_len, size_t dst_dim1_size, > + const void *src, size_t src_offset1_size, > + size_t src_offset0_len, size_t src_dim1_size) > +{ > + if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : > dst_ord)) > + return false; > + > + /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. 
*/ > + > + CUDA_MEMCPY2D data; > + data.WidthInBytes = dim1_size; > + data.Height = dim0_len; > + > + if (dst_ord == -1) > + { > + data.dstMemoryType = CU_MEMORYTYPE_HOST; > + data.dstHost = dst; > + } > + else > + { > + data.dstMemoryType = CU_MEMORYTYPE_DEVICE; > + data.dstDevice = (CUdeviceptr) dst; > + } > + data.dstPitch = dst_dim1_size; > + data.dstXInBytes = dst_offset1_size; > + data.dstY = dst_offset0_len; > + > + if (src_ord == -1) > + { > + data.srcMemoryType = CU_MEMORYTYPE_HOST; > + data.srcHost = src; > + } > + else > + { > + data.srcMemoryType = CU_MEMORYTYPE_DEVICE; > + data.srcDevice = (CUdeviceptr) src; > + } > + data.srcPitch = src_dim1_size; > + data.srcXInBytes = src_offset1_size; > + data.srcY = src_offset0_len; > + > + CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data); > + if (res == CUDA_ERROR_INVALID_VALUE) > + /* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device > + for (some) memory not allocated by cuMemAllocPitch, cuMemcpy2D fails > + with an error; try the slower cuMemcpy2DUnaligned now. */ > + CUDA_CALL (cuMemcpy2DUnaligned, &data); > + else if (res != CUDA_SUCCESS) > + { > + GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res)); > + return false; > + } > + return true; > +} > + > +int > +GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size, > + size_t dim1_len, size_t dim0_len, void *dst, > + size_t dst_offset2_size, size_t dst_offset1_len, > + size_t dst_offset0_len, size_t dst_dim2_size, > + size_t dst_dim1_len, const void *src, > + size_t src_offset2_size, size_t src_offset1_len, > + size_t src_offset0_len, size_t src_dim2_size, > + size_t src_dim1_len) > +{ > + if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : > dst_ord)) > + return false; > + > + /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. 
 */ > + CUDA_MEMCPY3D data; I note that this doesn't adhere to the two "Must be NULL" remarks from above -- but I'm confused, because, for example, on <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4b5238975579f002c0199a3800ca44df> "cuMemcpy3D", there also are no such remarks. (... in contrast to: "srcLOD and dstLOD members of the CUDA_MEMCPY3D structure must be set to 0", which you do set accordingly.) Maybe just 'memset' the whole thing to '0', for good measure? > + data.WidthInBytes = dim2_size; > + data.Height = dim1_len; > + data.Depth = dim0_len; > + > + if (dst_ord == -1) > + { > + data.dstMemoryType = CU_MEMORYTYPE_HOST; > + data.dstHost = dst; > + } > + else > + { > + data.dstMemoryType = CU_MEMORYTYPE_DEVICE; > + data.dstDevice = (CUdeviceptr) dst; > + } > + data.dstPitch = dst_dim2_size; > + data.dstHeight = dst_dim1_len; > + data.dstXInBytes = dst_offset2_size; > + data.dstY = dst_offset1_len; > + data.dstZ = dst_offset0_len; > + data.dstLOD = 0; > + > + if (src_ord == -1) > + { > + data.srcMemoryType = CU_MEMORYTYPE_HOST; > + data.srcHost = src; > + } > + else > + { > + data.srcMemoryType = CU_MEMORYTYPE_DEVICE; > + data.srcDevice = (CUdeviceptr) src; > + } > + data.srcPitch = src_dim2_size; > + data.srcHeight = src_dim1_len; > + data.srcXInBytes = src_offset2_size; > + data.srcY = src_offset1_len; > + data.srcZ = src_offset0_len; > + data.srcLOD = 0; > + > + CUDA_CALL (cuMemcpy3D, &data); > + return true; > +} > --- a/libgomp/target.c > +++ b/libgomp/target.c > @@ -4526,7 +4526,8 @@ omp_target_memcpy_rect_worker (void *dst, const void > *src, size_t element_size, > const size_t *dst_dimensions, > const size_t *src_dimensions, > struct gomp_device_descr *dst_devicep, > - struct gomp_device_descr *src_devicep) > + struct gomp_device_descr *src_devicep, > + size_t *tmp_size, void **tmp) > { > size_t dst_slice = element_size; > size_t src_slice = element_size; > @@ -4539,36 +4540,120 @@ 
omp_target_memcpy_rect_worker (void *dst, const void > *src, size_t element_size, > || __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off) > || __builtin_mul_overflow (element_size, src_offsets[0], &src_off)) > return EINVAL; > - if (dst_devicep == NULL && src_devicep == NULL) > - { > - memcpy ((char *) dst + dst_off, (const char *) src + src_off, > - length); > - ret = 1; > - } > - else if (src_devicep == NULL) > - ret = dst_devicep->host2dev_func (dst_devicep->target_id, > + if (src_devicep != NULL && src_devicep == dst_devicep) > + ret = src_devicep->dev2dev_func (src_devicep->target_id, > + (char *) dst + dst_off, > + (const char *) src + src_off, > + length); So you moved up the intra-device case up here... > + else if (src_devicep != NULL > + && (dst_devicep == NULL > + || (dst_devicep->capabilities > + & GOMP_OFFLOAD_CAP_SHARED_MEM))) Are these 'GOMP_OFFLOAD_CAP_SHARED_MEM' actually reachable, given that 'omp_target_memcpy_check' (via 'omp_target_memcpy_rect_check') clears out the device to 'NULL' for 'GOMP_OFFLOAD_CAP_SHARED_MEM'? In other words, was the original code already doing what you've implemented here? 
> + ret = src_devicep->dev2host_func (src_devicep->target_id, > (char *) dst + dst_off, > (const char *) src + src_off, > length); > - else if (dst_devicep == NULL) > - ret = src_devicep->dev2host_func (src_devicep->target_id, > + else if (dst_devicep != NULL > + && (src_devicep == NULL > + || (src_devicep->capabilities > + & GOMP_OFFLOAD_CAP_SHARED_MEM))) > + ret = dst_devicep->host2dev_func (dst_devicep->target_id, > (char *) dst + dst_off, > (const char *) src + src_off, > length); > + else if (dst_devicep == NULL && src_devicep == NULL) > + { > + memcpy ((char *) dst + dst_off, (const char *) src + src_off, > + length); > + ret = 1; > + } > else if (src_devicep == dst_devicep) > ret = src_devicep->dev2dev_func (src_devicep->target_id, > (char *) dst + dst_off, > (const char *) src + src_off, > length); ..., but also left the intra-device case here -- which should now be dead code here? > else > - ret = 0; > + { > + if (*tmp_size == 0) > + { > + *tmp_size = length; > + *tmp = malloc (length); > + if (*tmp == NULL) > + return ENOMEM; > + } > + else if (*tmp_size < length) > + { > + *tmp_size = length; > + *tmp = realloc (*tmp, length); > + if (*tmp == NULL) > + return ENOMEM; If 'realloc' returns 'NULL', we should 'free' the original '*tmp'? Do we really need here the property here that if the re-allocation can't be done in-place, 'realloc' copies the original content to the new? In other words, should we just unconditionally 'free' and re-'malloc' here, instead of 'realloc'? I haven't looked whether the re-use of 'tmp' for multiple calls to this is then actually useful, or whether we should just always 'malloc', use, 'free' the buffer here? > + } > + ret = src_devicep->dev2host_func (src_devicep->target_id, *tmp, > + (const char *) src + src_off, > + length); > + if (ret == 1) > + ret = dst_devicep->host2dev_func (dst_devicep->target_id, > + (char *) dst + dst_off, *tmp, > + length); > + } > return ret ? 0 : EINVAL; (For later...) 
Is that what <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1ge1f5c7771544fee150ada8853c7cbf4a> "cuMemcpyPeer" would be used for? > } > > - /* FIXME: it would be nice to have some plugin function to handle > - num_dims == 2 and num_dims == 3 more efficiently. Larger ones can > - be handled in the generic recursion below, and for host-host it > - should be used even for any num_dims >= 2. */ > + /* host->device, device->host and same-device device->device. */ > + if (num_dims == 2 > + && ((src_devicep > + && src_devicep == dst_devicep > + && src_devicep->memcpy2d_func) > + || (!src_devicep != !dst_devicep > + && ((src_devicep && src_devicep->memcpy2d_func) > + || (dst_devicep && dst_devicep->memcpy2d_func))))) > + { > + size_t vol_sz1, dst_sz1, src_sz1, dst_off_sz1, src_off_sz1; > + int dst_id = dst_devicep ? dst_devicep->target_id : -1; > + int src_id = src_devicep ? src_devicep->target_id : -1; > + struct gomp_device_descr *devp = dst_devicep ? dst_devicep : > src_devicep; > + > + if (__builtin_mul_overflow (volume[1], element_size, &vol_sz1) > + || __builtin_mul_overflow (dst_dimensions[1], element_size, &dst_sz1) > + || __builtin_mul_overflow (src_dimensions[1], element_size, &src_sz1) > + || __builtin_mul_overflow (dst_offsets[1], element_size, &dst_off_sz1) > + || __builtin_mul_overflow (src_offsets[1], element_size, > + &src_off_sz1)) > + return EINVAL; > + ret = devp->memcpy2d_func (dst_id, src_id, vol_sz1, volume[0], > + dst, dst_off_sz1, dst_offsets[0], dst_sz1, > + src, src_off_sz1, src_offsets[0], src_sz1); > + if (ret != -1) > + return ret ? 0 : EINVAL; > + } > + else if (num_dims == 3 > + && ((src_devicep > + && src_devicep == dst_devicep > + && src_devicep->memcpy3d_func) > + || (!src_devicep != !dst_devicep > + && ((src_devicep && src_devicep->memcpy3d_func) > + || (dst_devicep && dst_devicep->memcpy3d_func))))) > + { > + size_t vol_sz2, dst_sz2, src_sz2, dst_off_sz2, src_off_sz2; > + int dst_id = dst_devicep ? 
dst_devicep->target_id : -1; > + int src_id = src_devicep ? src_devicep->target_id : -1; > + struct gomp_device_descr *devp = dst_devicep ? dst_devicep : > src_devicep; > + > + if (__builtin_mul_overflow (volume[2], element_size, &vol_sz2) > + || __builtin_mul_overflow (dst_dimensions[2], element_size, &dst_sz2) > + || __builtin_mul_overflow (src_dimensions[2], element_size, &src_sz2) > + || __builtin_mul_overflow (dst_offsets[2], element_size, &dst_off_sz2) > + || __builtin_mul_overflow (src_offsets[2], element_size, > + &src_off_sz2)) > + return EINVAL; > + ret = devp->memcpy3d_func (dst_id, src_id, vol_sz2, volume[1], > volume[0], > + dst, dst_off_sz2, dst_offsets[1], > + dst_offsets[0], dst_sz2, dst_dimensions[1], > + src, src_off_sz2, src_offsets[1], > + src_offsets[0], src_sz2, src_dimensions[1]); > + if (ret != -1) > + return ret ? 0 : EINVAL; > + } > > for (i = 1; i < num_dims; i++) > if (__builtin_mul_overflow (dst_slice, dst_dimensions[i], &dst_slice) > @@ -4585,7 +4670,7 @@ omp_target_memcpy_rect_worker (void *dst, const void > *src, size_t element_size, > volume + 1, dst_offsets + 1, > src_offsets + 1, dst_dimensions + 1, > src_dimensions + 1, dst_devicep, > - src_devicep); > + src_devicep, tmp_size, tmp); > if (ret) > return ret; > dst_off += dst_slice; > @@ -4608,9 +4693,6 @@ omp_target_memcpy_rect_check (void *dst, const void > *src, int dst_device_num, > if (ret) > return ret; > > - if (*src_devicep != NULL && *dst_devicep != NULL && *src_devicep != > *dst_devicep) > - return EINVAL; > - > return 0; > } > > @@ -4624,18 +4706,36 @@ omp_target_memcpy_rect_copy (void *dst, const void > *src, > struct gomp_device_descr *dst_devicep, > struct gomp_device_descr *src_devicep) > { > - if (src_devicep) > + size_t tmp_size = 0; > + void *tmp = NULL; > + bool lock_src; > + bool lock_dst; > + > + lock_src = (src_devicep > + && (!dst_devicep > + || src_devicep == dst_devicep > + || !(src_devicep->capabilities > + & GOMP_OFFLOAD_CAP_SHARED_MEM))); Similar doubt 
than above re "'GOMP_OFFLOAD_CAP_SHARED_MEM' actually reachable"? > + lock_dst = (dst_devicep > + && (!lock_src > + || (src_devicep != dst_devicep > + && !(dst_devicep->capabilities > + & GOMP_OFFLOAD_CAP_SHARED_MEM)))); > + if (lock_src) > gomp_mutex_lock (&src_devicep->lock); > - else if (dst_devicep) > + if (lock_dst) > gomp_mutex_lock (&dst_devicep->lock); (Pre-existing issue, and I've not myself tried to figure out the details at this time -- why do we actually lock the devices here, and in similar other places?) > int ret = omp_target_memcpy_rect_worker (dst, src, element_size, num_dims, > volume, dst_offsets, src_offsets, > dst_dimensions, src_dimensions, > - dst_devicep, src_devicep); > - if (src_devicep) > + dst_devicep, src_devicep, > + &tmp_size, &tmp); > + if (lock_src) > gomp_mutex_unlock (&src_devicep->lock); > - else if (dst_devicep) > + if (lock_dst) > gomp_mutex_unlock (&dst_devicep->lock); > + if (tmp) > + free (tmp); > > return ret; > } > @@ -4976,6 +5076,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr > *device, > DLSYM (free); > DLSYM (dev2host); > DLSYM (host2dev); > + DLSYM (memcpy2d); > + DLSYM (memcpy3d); With 'DLSYM' used here, won't that fail if these symbols don't actually exist (like for 'libgomp/plugin/plugin-gcn.c')? I'm attaching the humble beginnings of a follow-on patch; feel free to use. Grüße Thomas ----------------- Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
>From ed85b983970fb42be2e1db172a0d7e20c484ed06 Mon Sep 17 00:00:00 2001 From: Thomas Schwinge <tho...@codesourcery.com> Date: Thu, 27 Jul 2023 15:46:26 +0200 Subject: [PATCH] Re: OpenMP: Call cuMemcpy2D/cuMemcpy3D for nvptx for omp_target_memcpy_rect --- include/cuda/cuda.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h index 09c3c2b8dbe..d556a130379 100644 --- a/include/cuda/cuda.h +++ b/include/cuda/cuda.h @@ -147,7 +147,7 @@ typedef struct { size_t dstXInBytes, dstY; CUmemorytype dstMemoryType; - const void *dstHost; + void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; size_t dstPitch; @@ -162,16 +162,16 @@ typedef struct { const void *srcHost; CUdeviceptr srcDevice; CUarray srcArray; - void *dummy; + void *reserved0; size_t srcPitch, srcHeight; size_t dstXInBytes, dstY, dstZ; size_t dstLOD; CUmemorytype dstMemoryType; - const void *dstHost; + void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; - void *dummy2; + void *reserved1; size_t dstPitch, dstHeight; size_t WidthInBytes, Height, Depth; -- 2.34.1