On 07/31/2018 04:58 PM, Cesar Philippidis wrote:
> The attached patch teaches libgomp how to use the CUDA thread occupancy
> calculator built into the CUDA driver. Despite both being based off the
> CUDA thread occupancy spreadsheet distributed with CUDA, the built in
> occupancy calculator differs from the occupancy calculator in og8 in two
> key ways. First, og8 launches twice the number of gangs as the driver
> thread occupancy calculator. This was my attempt at preventing threads
> from idling, and it operating on a similar principle of running 'make
> -jN', where N is twice the number of CPU threads.

You're saying the two methods are different, and that the difference
between the two methods is a factor two, which is a heuristic you added
yourself on top of one of the methods, which implies that in fact the
two methods are identical. Is my understanding correct here?

> Second, whereas og8
> always attempts to maximize the CUDA block size, the driver may select a
> smaller block, which effectively decreases num_workers.
> 

So, do I understand it correctly that using the function
cuOccupancyMaxPotentialBlockSize gives us "minimum block size that can
achieve the maximum occupancy" or some such and og8 gives us "maximum
block size"?

> In terms of performance, there really isn't that much of a difference
> between the CUDA driver's occupancy calculator and og8's. However, on
> the tests that are impacted, they are generally within a factor of two
> from one another, with some tests running faster with the driver
> occupancy calculator and others with og8's.
> 

Ack. Well, until we understand that in more detail, going with the
driver's occupancy calculator seems the right thing to do.

> Unfortunately, support for the CUDA driver API isn't universal; it's
> only available in CUDA version 6.5 (or 6050) and newer. In this patch,
> I'm exploiting the fact that init_cuda_lib only checks for errors on the
> last library function initialized.

That sounds incorrect to me. In init_cuda_lib I see:
...
# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
# define CUDA_ONE_CALL_1(call) \
  cuda_lib.call = dlsym (h, #call);     \
  if (cuda_lib.call == NULL)            \
    return false;
  CUDA_CALLS
...
so in fact every library function is checked. Have you tested this with
pre-6.5 CUDA?

I think we need to add and handle:
...
  CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
...

> Therefore it guards the usage of
> 
>   cuOccupancyMaxPotentialBlockSizeWithFlags
> 
> by checking driver_version.

If we allow the cuOccupancyMaxPotentialBlockSize field to be NULL, we
can test for NULL, which seems a simpler solution than testing the version.

> If the driver occupancy calculator isn't
> available, it falls back to the existing defaults. Maybe the og8 thread
> occupancy would make a better default for older versions of CUDA, but
> that's a patch for another day.
> 

Agreed.

> Is this patch OK for trunk?

The patch doesn't build in a setup with
--enable-offload-targets=nvptx-none and without cuda, which enables usage
of plugin/cuda/cuda.h:
...
/data/offload-nvptx/src/libgomp/plugin/plugin-nvptx.c:98:16: error:
‘cuOccupancyMaxPotentialBlockSize’ undeclared here (not in a function);
did you mean ‘cuOccupancyMaxPotentialBlockSizeWithFlags’?
 CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize) \
...

> @@ -1220,11 +1227,39 @@ nvptx_exec (void (*fn), size_t mapnum, void 
> **hostaddrs, void **devaddrs,
>  
>        {
>       bool default_dim_p[GOMP_DIM_MAX];
> +     int vectors = nvthd->ptx_dev->default_dims[GOMP_DIM_VECTOR];
> +     int workers = nvthd->ptx_dev->default_dims[GOMP_DIM_WORKER];
> +     int gangs = nvthd->ptx_dev->default_dims[GOMP_DIM_GANG];
> +
> +     /* The CUDA driver occupancy calculator is only available on
> +        CUDA version 6.5 (6050) and newer.  */
> +     if (nvthd->ptx_dev->driver_version > 6050)
> +       {
> +         int grids, blocks;
> +         CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
> +                           &blocks, function, NULL, 0,
> +                           dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
> +         GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
> +                            "grid = %d, block = %d\n", grids, blocks);
> +


> +         if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_GANG) == 0)

You should use gomp_openacc_dims[0].

> +           gangs = grids * (blocks / warp_size);

So, we launch with gangs == grids * workers ? Is that intentional?

> +
> +         if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_WORKER) == 0)
> +           workers = blocks / vectors;

Also, the new default calculation is not nicely separated from the
fallback default calculation.  I've updated the patch with a cleaner
separation, attached and built without cuda but untested.

Thanks,
- Tom
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825bda2..67a475b695e 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
 typedef void *CUfunction;
 typedef void *CUlinkState;
 typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
 typedef void *CUstream;
 
 typedef enum {
@@ -170,6 +171,9 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
 CUresult cuModuleLoad (CUmodule *, const char *);
 CUresult cuModuleLoadData (CUmodule *, const void *);
 CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize (int *, int *, CUfunction,
+					   CUoccupancyB2DSize, size_t,
+					   int);
 CUresult cuStreamCreate (CUstream *, unsigned);
 #define cuStreamDestroy cuStreamDestroy_v2
 CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index b6ec5f88d59..47518f1a73f 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -94,6 +94,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)	\
 CUDA_ONE_CALL (cuModuleLoad)		\
 CUDA_ONE_CALL (cuModuleLoadData)	\
 CUDA_ONE_CALL (cuModuleUnload)		\
+CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize) \
 CUDA_ONE_CALL (cuStreamCreate)		\
 CUDA_ONE_CALL (cuStreamDestroy)		\
 CUDA_ONE_CALL (cuStreamQuery)		\
@@ -101,6 +102,7 @@ CUDA_ONE_CALL (cuStreamSynchronize)	\
 CUDA_ONE_CALL (cuStreamWaitEvent)
 # define CUDA_ONE_CALL(call) \
   __typeof (call) *call;
+# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL(call)
 struct cuda_lib_s {
   CUDA_CALLS
 } cuda_lib;
@@ -122,10 +124,12 @@ init_cuda_lib (void)
   if (h == NULL)
     return false;
 # undef CUDA_ONE_CALL
-# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
-# define CUDA_ONE_CALL_1(call) \
+# undef CUDA_ONE_CALL_MAYBE_NULL
+# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, true)
+# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, false)
+# define CUDA_ONE_CALL_1(call, check)		\
   cuda_lib.call = dlsym (h, #call);	\
-  if (cuda_lib.call == NULL)		\
+  if (check && cuda_lib.call == NULL)		\
     return false;
   CUDA_CALLS
   cuda_lib_inited = true;
@@ -1221,10 +1225,44 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
       {
 	bool default_dim_p[GOMP_DIM_MAX];
 	for (i = 0; i != GOMP_DIM_MAX; i++)
+	  default_dim_p[i] = !dims[i];
+
+	if (cuda_lib.cuOccupancyMaxPotentialBlockSize == NULL)
+	  {
+	    for (i = 0; i != GOMP_DIM_MAX; i++)
+	      if (default_dim_p[i])
+		dims[i] = nvthd->ptx_dev->default_dims[i];
+	  }
+	else
 	  {
-	    default_dim_p[i] = !dims[i];
-	    if (default_dim_p[i])
-	      dims[i] = nvthd->ptx_dev->default_dims[i];
+	    int vectors = gomp_openacc_dims[GOMP_DIM_VECTOR];
+	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
+	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
+	    int grids, blocks;
+	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
+			      &blocks, function, NULL, 0,
+			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
+	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
+			       "grid = %d, block = %d\n", grids, blocks);
+
+	    if (vectors == 0)
+	      vectors = warp_size;
+
+	    if (workers == 0)
+	      workers = blocks / vectors;
+
+	    if (gangs == 0)
+	      gangs = grids * workers;
+
+	    for (i = 0; i != GOMP_DIM_MAX; i++)
+	      if (default_dim_p[i])
+		switch (i)
+		  {
+		  case GOMP_DIM_GANG: dims[i] = gangs; break;
+		  case GOMP_DIM_WORKER: dims[i] = workers; break;
+		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
+		  default: GOMP_PLUGIN_fatal ("invalid dim");
+		  }
 	  }
 
 	if (default_dim_p[GOMP_DIM_VECTOR])

Reply via email to