On 08/07/2018 06:52 AM, Cesar Philippidis wrote:
> I attached an updated version of the CUDA driver patch, although I
> haven't rebased it against your changes yet. It still needs to be tested
> against CUDA 5.5 using the system's/Nvidia's cuda.h. But I wanted to give
> you an update.
>
> Does this patch look OK, at least after testing completes? I removed the
> tests for CUDA_ONE_CALL_MAYBE_NULL, because the newer CUDA API isn't
> supported in the older drivers.
I've finally finished testing this patch. Apart from a couple of
regressions with CUDA 5.5 in libgomp.oacc-c-c++-common/lib-75.c,
lib-76.c and lib-79.c, the results came back clean.
This patch has been tested in the following ways using a K40 GPU:
* Using GCC's cuda.h with CUDA 9.2 drivers.
* Using cuda.h from CUDA 5.5 and Nvidia drivers 331.133 (supports CUDA
6.0) and the driver from CUDA 8.0.
* Using cuda.h from CUDA 8.0.
As mentioned before, GCC's cuda.h defines CUDA_VERSION as 8000, so
using it against CUDA 5.5 caused a conflict due to the missing
cuLinkAddData_v2 symbol.
Note how the usage of cuOccupancyMaxPotentialBlockSize is guarded by
checking the value of CUDA_VERSION. I don't really like this, but
it's a necessary evil of maintaining backwards compatibility.
Is this patch OK for trunk?
Thanks,
Cesar
[nvptx] Use CUDA driver API to select default runtime launch geometry
2018-08-YY Cesar Philippidis <[email protected]>
libgomp/
plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
(cuDriverGetVersion): Declare.
(cuOccupancyMaxPotentialBlockSize): Declare.
plugin/cuda-lib.def (cuDriverGetVersion): New CUDA_ONE_CALL entry.
(cuOccupancyMaxPotentialBlockSize): Likewise.
plugin/plugin-nvptx.c (ptx_device): Add driver_version member.
(nvptx_open_device): Initialize it.
(nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
default num_gangs and num_workers when the driver supports it.
---
libgomp/plugin/cuda-lib.def | 2 ++
libgomp/plugin/cuda/cuda.h | 4 ++++
libgomp/plugin/plugin-nvptx.c | 40 +++++++++++++++++++++++++++++++++++++++-
3 files changed, 45 insertions(+), 1 deletion(-)
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index be8e3b3..f2433e1 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -2,6 +2,7 @@ CUDA_ONE_CALL (cuCtxCreate)
CUDA_ONE_CALL (cuCtxDestroy)
CUDA_ONE_CALL (cuCtxGetCurrent)
CUDA_ONE_CALL (cuCtxGetDevice)
+CUDA_ONE_CALL (cuDriverGetVersion)
CUDA_ONE_CALL (cuCtxPopCurrent)
CUDA_ONE_CALL (cuCtxPushCurrent)
CUDA_ONE_CALL (cuCtxSynchronize)
@@ -39,6 +40,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
CUDA_ONE_CALL (cuModuleLoad)
CUDA_ONE_CALL (cuModuleLoadData)
CUDA_ONE_CALL (cuModuleUnload)
+CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize)
CUDA_ONE_CALL (cuStreamCreate)
CUDA_ONE_CALL (cuStreamDestroy)
CUDA_ONE_CALL (cuStreamQuery)
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825..3a790e6 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
typedef void *CUfunction;
typedef void *CUlinkState;
typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
typedef void *CUstream;
typedef enum {
@@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void);
CUresult cuDeviceGet (CUdevice *, int);
CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
CUresult cuDeviceGetCount (int *);
+CUresult cuDriverGetVersion(int *);
CUresult cuEventCreate (CUevent *, unsigned);
#define cuEventDestroy cuEventDestroy_v2
CUresult cuEventDestroy (CUevent);
@@ -170,6 +172,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
CUresult cuModuleLoad (CUmodule *, const char *);
CUresult cuModuleLoadData (CUmodule *, const void *);
CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+ CUoccupancyB2DSize, size_t, int);
CUresult cuStreamCreate (CUstream *, unsigned);
#define cuStreamDestroy cuStreamDestroy_v2
CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 825470a..b0ccf0b 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -376,6 +376,7 @@ struct ptx_device
int max_threads_per_block;
int max_threads_per_multiprocessor;
int default_dims[GOMP_DIM_MAX];
+ int driver_version;
struct ptx_image_data *images; /* Images loaded on device. */
pthread_mutex_t image_lock; /* Lock for above list. */
@@ -687,6 +688,7 @@ nvptx_open_device (int n)
ptx_dev->ord = n;
ptx_dev->dev = dev;
ptx_dev->ctx_shared = false;
+ ptx_dev->driver_version = 0;
r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
@@ -780,6 +782,9 @@ nvptx_open_device (int n)
for (int i = 0; i != GOMP_DIM_MAX; i++)
ptx_dev->default_dims[i] = 0;
+ CUDA_CALL_ERET (NULL, cuDriverGetVersion, &pi);
+ ptx_dev->driver_version = pi;
+
ptx_dev->images = NULL;
pthread_mutex_init (&ptx_dev->image_lock, NULL);
@@ -1173,11 +1178,44 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
{
bool default_dim_p[GOMP_DIM_MAX];
+ int vectors = nvthd->ptx_dev->default_dims[GOMP_DIM_VECTOR];
+ int workers = nvthd->ptx_dev->default_dims[GOMP_DIM_WORKER];
+ int gangs = nvthd->ptx_dev->default_dims[GOMP_DIM_GANG];
+
+ /* The CUDA driver occupancy calculator is only available on
+ CUDA version 6.5 (6050) and newer. */
+#if (CUDA_VERSION >= 6050)
+ if (nvthd->ptx_dev->driver_version > 6050)
+ {
+ int grids, blocks;
+ CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
+ &blocks, function, NULL, 0,
+ dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
+ GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
+ "grid = %d, block = %d\n", grids, blocks);
+
+ /* Keep the num_gangs proportional to the block size. The
+ constant factor 2 is there to prevent threads from
+ idling when there is sufficient work for them. */
+ if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_GANG) == 0)
+ gangs = 2 * grids * (blocks / warp_size);
+
+ if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_WORKER) == 0)
+ workers = blocks / vectors;
+ }
+#endif
+
for (i = 0; i != GOMP_DIM_MAX; i++)
{
default_dim_p[i] = !dims[i];
if (default_dim_p[i])
- dims[i] = nvthd->ptx_dev->default_dims[i];
+ switch (i)
+ {
+ case GOMP_DIM_GANG: dims[i] = gangs; break;
+ case GOMP_DIM_WORKER: dims[i] = workers; break;
+ case GOMP_DIM_VECTOR: dims[i] = vectors; break;
+ default: GOMP_PLUGIN_fatal ("invalid dim");
+ }
}
if (default_dim_p[GOMP_DIM_VECTOR])
--
2.7.4