On 07/31/2018 04:58 PM, Cesar Philippidis wrote: > The attached patch teaches libgomp how to use the CUDA thread occupancy > calculator built into the CUDA driver. Despite both being based off the > CUDA thread occupancy spreadsheet distributed with CUDA, the built in > occupancy calculator differs from the occupancy calculator in og8 in two > key ways. First, og8 launches twice the number of gangs as the driver > thread occupancy calculator. This was my attempt at preventing threads > from idling, and it operates on a similar principle of running 'make > -jN', where N is twice the number of CPU threads.
You're saying the two methods are different, and that the difference between the two methods is a factor two, which is a heuristic you added yourself on top of one of the methods, which implies that in fact the two methods are identical. Is my understanding correct here? > Second, whereas og8 > always attempts to maximize the CUDA block size, the driver may select a > smaller block, which effectively decreases num_workers. > So, do I understand it correctly that using the function cuOccupancyMaxPotentialBlockSize gives us "minimum block size that can achieve the maximum occupancy" or some such and og8 gives us "maximum block size"? > In terms of performance, there really isn't that much of a difference > between the CUDA driver's occupancy calculator and og8's. However, on > the tests that are impacted, they are generally within a factor of two > from one another, with some tests running faster with the driver > occupancy calculator and others with og8's. > Ack. Well, until we understand that in more detail, going with the driver's occupancy calculator seems the right thing to do. > Unfortunately, support for the CUDA driver API isn't universal; it's > only available in CUDA version 6.5 (or 6050) and newer. In this patch, > I'm exploiting the fact that init_cuda_lib only checks for errors on the > last library function initialized. That sounds incorrect to me. In init_cuda_lib I see: ... # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call) # define CUDA_ONE_CALL_1(call) \ cuda_lib.call = dlsym (h, #call); \ if (cuda_lib.call == NULL) \ return false; CUDA_CALLS ... so in fact every library function is checked. Have you tested this with pre-6.5 CUDA? I think we need to add and handle: ... CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize) ... > Therefore it guards the usage of > > cuOccupancyMaxPotentialBlockSizeWithFlags > > by checking driver_version. 
If we allow the cuOccupancyMaxPotentialBlockSize field to be NULL, we can test for NULL, which seems a simpler solution than testing the version. > If the driver occupancy calculator isn't > available, it falls back to the existing defaults. Maybe the og8 thread > occupancy would make a better default for older versions of CUDA, but > that's a patch for another day. > Agreed. > Is this patch OK for trunk? The patch doesn't build in a setup with --enable-offload-targets=nvptx-none and without cuda, that enables usage of plugin/cuda/cuda.h: ... /data/offload-nvptx/src/libgomp/plugin/plugin-nvptx.c:98:16: error: ‘cuOccupancyMaxPotentialBlockSize’ undeclared here (not in a function); did you mean ‘cuOccupancyMaxPotentialBlockSizeWithFlags’? CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize) \ ... > @@ -1220,11 +1227,39 @@ nvptx_exec (void (*fn), size_t mapnum, void > **hostaddrs, void **devaddrs, > > { > bool default_dim_p[GOMP_DIM_MAX]; > + int vectors = nvthd->ptx_dev->default_dims[GOMP_DIM_VECTOR]; > + int workers = nvthd->ptx_dev->default_dims[GOMP_DIM_WORKER]; > + int gangs = nvthd->ptx_dev->default_dims[GOMP_DIM_GANG]; > + > + /* The CUDA driver occupancy calculator is only available on > + CUDA version 6.5 (6050) and newer. */ > + if (nvthd->ptx_dev->driver_version > 6050) > + { > + int grids, blocks; > + CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids, > + &blocks, function, NULL, 0, > + dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]); > + GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: " > + "grid = %d, block = %d\n", grids, blocks); > + > + if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_GANG) == 0) You should use gomp_openacc_dims[0]. > + gangs = grids * (blocks / warp_size); So, we launch with gangs == grids * workers ? Is that intentional? > + > + if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_WORKER) == 0) > + workers = blocks / vectors; Also, the new default calculation is not nicely separated from the fallback default calculation. 
I've updated the patch with a cleaner separation, attached and built without CUDA, but untested. Thanks, - Tom
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h index 4799825bda2..67a475b695e 100644 --- a/libgomp/plugin/cuda/cuda.h +++ b/libgomp/plugin/cuda/cuda.h @@ -44,6 +44,7 @@ typedef void *CUevent; typedef void *CUfunction; typedef void *CUlinkState; typedef void *CUmodule; +typedef size_t (*CUoccupancyB2DSize)(int); typedef void *CUstream; typedef enum { @@ -170,6 +171,9 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *); CUresult cuModuleLoad (CUmodule *, const char *); CUresult cuModuleLoadData (CUmodule *, const void *); CUresult cuModuleUnload (CUmodule); +CUresult cuOccupancyMaxPotentialBlockSize (int *, int *, CUfunction, + CUoccupancyB2DSize, size_t, + int); CUresult cuStreamCreate (CUstream *, unsigned); #define cuStreamDestroy cuStreamDestroy_v2 CUresult cuStreamDestroy (CUstream); diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index b6ec5f88d59..47518f1a73f 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -94,6 +94,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal) \ CUDA_ONE_CALL (cuModuleLoad) \ CUDA_ONE_CALL (cuModuleLoadData) \ CUDA_ONE_CALL (cuModuleUnload) \ +CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize) \ CUDA_ONE_CALL (cuStreamCreate) \ CUDA_ONE_CALL (cuStreamDestroy) \ CUDA_ONE_CALL (cuStreamQuery) \ @@ -101,6 +102,7 @@ CUDA_ONE_CALL (cuStreamSynchronize) \ CUDA_ONE_CALL (cuStreamWaitEvent) # define CUDA_ONE_CALL(call) \ __typeof (call) *call; +# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL(call) struct cuda_lib_s { CUDA_CALLS } cuda_lib; @@ -122,10 +124,12 @@ init_cuda_lib (void) if (h == NULL) return false; # undef CUDA_ONE_CALL -# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call) -# define CUDA_ONE_CALL_1(call) \ +# undef CUDA_ONE_CALL_MAYBE_NULL +# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, true) +# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, false) +# define CUDA_ONE_CALL_1(call, check) \ cuda_lib.call = 
dlsym (h, #call); \ - if (cuda_lib.call == NULL) \ + if (check && cuda_lib.call == NULL) \ return false; CUDA_CALLS cuda_lib_inited = true; @@ -1221,10 +1225,44 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, { bool default_dim_p[GOMP_DIM_MAX]; for (i = 0; i != GOMP_DIM_MAX; i++) + default_dim_p[i] = !dims[i]; + + if (cuda_lib.cuOccupancyMaxPotentialBlockSize == NULL) + { + for (i = 0; i != GOMP_DIM_MAX; i++) + if (default_dim_p[i]) + dims[i] = nvthd->ptx_dev->default_dims[i]; + } + else { - default_dim_p[i] = !dims[i]; - if (default_dim_p[i]) - dims[i] = nvthd->ptx_dev->default_dims[i]; + int vectors = gomp_openacc_dims[GOMP_DIM_VECTOR]; + int workers = gomp_openacc_dims[GOMP_DIM_WORKER]; + int gangs = gomp_openacc_dims[GOMP_DIM_GANG]; + int grids, blocks; + CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids, + &blocks, function, NULL, 0, + dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]); + GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: " + "grid = %d, block = %d\n", grids, blocks); + + if (vectors == 0) + vectors = warp_size; + + if (workers == 0) + workers = blocks / vectors; + + if (gangs == 0) + gangs = grids * workers; + + for (i = 0; i != GOMP_DIM_MAX; i++) + if (default_dim_p[i]) + switch (i) + { + case GOMP_DIM_GANG: dims[i] = gangs; break; + case GOMP_DIM_WORKER: dims[i] = workers; break; + case GOMP_DIM_VECTOR: dims[i] = vectors; break; + default: GOMP_PLUGIN_fatal ("invalid dim"); + } } if (default_dim_p[GOMP_DIM_VECTOR])