Hi Julian!

On 2023-09-06T02:34:30-0700, Julian Brown <jul...@codesourcery.com> wrote:
> This patch works around behaviour of the 2D and 3D memcpy operations in
> the CUDA driver runtime.  Particularly in Fortran, the "base pointer"
> of an array (used for either source or destination of a host/device copy)
> may lie outside of data that is actually stored on the device.  The fix
> is to make sure that we use the first element of data to be transferred
> instead, and adjust parameters accordingly.

Do you (a) have a stand-alone test case for this (that is, one not depending
on your other pending patches, so that this could go in directly --
together with the before-FAIL test case)?  Do you (b) know if this is a
bug in our use of the CUDA Driver API or rather in CUDA itself?  If the
latter, have you reported this to Nvidia?

(From a quick read, I couldn't work this out from the documentation of
"cuMemcpy2D" etc.:
<https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g27f885b30c34cc20a663a671dbf6fc27>)


Grüße
 Thomas


> 2023-09-05  Julian Brown  <jul...@codesourcery.com>
>
> libgomp/
>       * plugin/plugin-nvptx.c (GOMP_OFFLOAD_memcpy2d): Adjust parameters to
>       avoid out-of-bounds array checks in CUDA runtime.
>       (GOMP_OFFLOAD_memcpy3d): Likewise.
> ---
>  libgomp/plugin/plugin-nvptx.c | 67 +++++++++++++++++++++++++++++++++++
>  1 file changed, 67 insertions(+)
>
> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
> index 00d4241ae02b..cefe288a8aab 100644
> --- a/libgomp/plugin/plugin-nvptx.c
> +++ b/libgomp/plugin/plugin-nvptx.c
> @@ -1827,6 +1827,35 @@ GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
>    data.srcXInBytes = src_offset1_size;
>    data.srcY = src_offset0_len;
>
> +  if (data.srcXInBytes != 0 || data.srcY != 0)
> +    {
> +      /* Adjust origin to the actual array data, else the CUDA 2D memory
> +      copy API calls below may fail to validate source/dest pointers
> +      correctly (especially for Fortran where the "virtual origin" of an
> +      array is often outside the stored data).  */
> +      if (src_ord == -1)
> +     data.srcHost = (const void *) ((const char *) data.srcHost
> +                                   + data.srcY * data.srcPitch
> +                                   + data.srcXInBytes);
> +      else
> +     data.srcDevice += data.srcY * data.srcPitch + data.srcXInBytes;
> +      data.srcXInBytes = 0;
> +      data.srcY = 0;
> +    }
> +
> +  if (data.dstXInBytes != 0 || data.dstY != 0)
> +    {
> +      /* As above.  */
> +      if (dst_ord == -1)
> +     data.dstHost = (void *) ((char *) data.dstHost
> +                              + data.dstY * data.dstPitch
> +                              + data.dstXInBytes);
> +      else
> +     data.dstDevice += data.dstY * data.dstPitch + data.dstXInBytes;
> +      data.dstXInBytes = 0;
> +      data.dstY = 0;
> +    }
> +
>    CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data);
>    if (res == CUDA_ERROR_INVALID_VALUE)
>      /* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device
> @@ -1895,6 +1924,44 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
>    data.srcY = src_offset1_len;
>    data.srcZ = src_offset0_len;
>
> +  if (data.srcXInBytes != 0 || data.srcY != 0 || data.srcZ != 0)
> +    {
> +      /* Adjust origin to the actual array data, else the CUDA 3D memory
> +      copy API call below may fail to validate source/dest pointers
> +      correctly (especially for Fortran where the "virtual origin" of an
> +      array is often outside the stored data).  */
> +      if (src_ord == -1)
> +     data.srcHost
> +       = (const void *) ((const char *) data.srcHost
> +                         + (data.srcZ * data.srcHeight + data.srcY)
> +                           * data.srcPitch
> +                         + data.srcXInBytes);
> +      else
> +     data.srcDevice
> +       += (data.srcZ * data.srcHeight + data.srcY) * data.srcPitch
> +          + data.srcXInBytes;
> +      data.srcXInBytes = 0;
> +      data.srcY = 0;
> +      data.srcZ = 0;
> +    }
> +
> +  if (data.dstXInBytes != 0 || data.dstY != 0 || data.dstZ != 0)
> +    {
> +      /* As above.  */
> +      if (dst_ord == -1)
> +     data.dstHost = (void *) ((char *) data.dstHost
> +                              + (data.dstZ * data.dstHeight + data.dstY)
> +                                * data.dstPitch
> +                              + data.dstXInBytes);
> +      else
> +     data.dstDevice
> +       += (data.dstZ * data.dstHeight + data.dstY) * data.dstPitch
> +          + data.dstXInBytes;
> +      data.dstXInBytes = 0;
> +      data.dstY = 0;
> +      data.dstZ = 0;
> +    }
> +
>    CUDA_CALL (cuMemcpy3D, &data);
>    return true;
>  }
> --
> 2.41.0
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955

Reply via email to