This patch does the following:

 * Adjusts the default num_gangs to utilize more of the GPU hardware.
 * Teaches libgomp to emit a diagnostic when num_workers isn't supported.

According to the (admittedly confusing) CUDA literature, the previous
num_gangs default wasn't fully utilizing the GPU hardware.  The
previous strategy was to set num_gangs to the number of SMs, but each
SM can execute multiple CUDA blocks concurrently.  In this patch, I'm
using a relaxed version of the formulas from Nvidia's CUDA Occupancy
Calculator spreadsheet to determine num_gangs.  More specifically,
since we're not using shared memory that extensively right now, I've
omitted the shared-memory constraint from the formula.
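
To illustrate, here's a self-contained sketch of that occupancy
arithmetic (not part of the patch).  The two granularities are the
SM_30+ values used in the patch; warp_size, rf_size, cpu_size,
dev_size, and reg_used are hypothetical stand-ins for the values the
plugin queries at run time via cuDeviceGetAttribute and
cuFuncGetAttribute:

#include <stdio.h>

int
main (void)
{
  const int reg_granularity = 256;  /* Register allocation granularity.  */
  const int warp_granularity = 4;   /* Warps are allocated in groups of 4.  */
  int warp_size = 32;               /* CU_DEVICE_ATTRIBUTE_WARP_SIZE.  */
  int rf_size = 65536;              /* Registers per multiprocessor.  */
  int cpu_size = 2048;              /* Max threads per multiprocessor.  */
  int dev_size = 16;                /* Number of multiprocessors.  */
  int reg_used = 40;                /* Registers per thread.  */

  /* Registers consumed by one warp, rounded up to the allocation
     granularity.  */
  int reg_per_warp = ((reg_used * warp_size + reg_granularity - 1)
                      / reg_granularity) * reg_granularity;

  /* Threads that fit in one SM under the register budget, rounded
     down to a multiple of warp_granularity warps, then capped by the
     hardware thread limit.  */
  int threads_per_sm = (rf_size / reg_per_warp / warp_granularity)
                       * warp_granularity * warp_size;
  if (threads_per_sm > cpu_size)
    threads_per_sm = cpu_size;

  /* Two gangs per resident warp, across all SMs.  */
  int num_gangs = 2 * threads_per_sm / warp_size * dev_size;

  /* With these numbers: reg_per_warp=1280, threads_per_sm=1536,
     num_gangs=1536.  */
  printf ("reg_per_warp=%d threads_per_sm=%d num_gangs=%d\n",
          reg_per_warp, threads_per_sm, num_gangs);
  return 0;
}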

While I was at it, I also taught the nvptx plugin to emit a diagnostic
at run time when the hardware doesn't have enough registers to support
the requested num_workers.  There are two problems here.  1) The
register file is a resource shared by all of the threads in an SM, so
the more registers each thread uses, the fewer threads a CUDA block
can contain.  2) In order to eliminate MIN_EXPRs in the for-loop
branches of worker-partitioned loops, the nvptx BE currently
hard-codes the default num_workers to 32 for any parallel region that
doesn't contain an explicit num_workers clause.  When I disabled that
optimization, I observed a 2.5x slowdown in CloverLeaf.  So rather
than disabling the optimization, I taught the runtime to give the end
user a performance hint, e.g., recompile your program with
-fopenacc-dim=-:num_workers.
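
For a concrete picture of when the new diagnostic fires, here's a
similarly hedged standalone sketch (again not part of the patch, with
hypothetical device numbers): a kernel using 128 registers per thread
on a 64K-register SM leaves room for only 16 warps per block, so the
hard-coded default of 32 workers can't launch, and the runtime
suggests a usable value instead.

#include <stdio.h>
#include <stdlib.h>

/* Reject num_workers requests that exceed what the register file
   allows.  The device numbers are hypothetical; the real code queries
   them via cuDeviceGetAttribute and cuFuncGetAttribute.  */
static void
check_workers (int requested_workers)
{
  const int reg_granularity = 256, warp_granularity = 4;
  int warp_size = 32, rf_size = 65536, block_size = 1024, cpu_size = 2048;
  int reg_used = 128;  /* A register-hungry kernel.  */

  int reg_per_warp = ((reg_used * warp_size + reg_granularity - 1)
                      / reg_granularity) * reg_granularity;
  int threads_per_sm = (rf_size / reg_per_warp / warp_granularity)
                       * warp_granularity * warp_size;
  if (threads_per_sm > cpu_size)
    threads_per_sm = cpu_size;

  /* A block can't use more threads than fit in one SM.  */
  int threads_per_block
    = threads_per_sm > block_size ? block_size : threads_per_sm;
  int max_workers = threads_per_block / warp_size;

  if (requested_workers > max_workers)
    {
      fprintf (stderr, "insufficient registers; retry with "
               "num_workers = %d or -fopenacc-dim=-:%d\n",
               max_workers, max_workers);
      exit (1);
    }
}

int
main (void)
{
  check_workers (32);  /* The hard-coded default; fails, max is 16.  */
  return 0;
}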

This patch has been applied to gomp-4_0-branch.

Cesar
2017-02-13  Cesar Philippidis  <ce...@codesourcery.com>

	libgomp/
	* plugin/plugin-nvptx.c (nvptx_exec): Adjust the default num_gangs.
	Add diagnostic when the hardware cannot support the requested
	num_workers.


diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index d1261b4..8c696eb 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -899,6 +899,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   CUdeviceptr dp;
   struct nvptx_thread *nvthd = nvptx_thread ();
   const char *maybe_abort_msg = "(perhaps abort was called)";
+  static int warp_size, block_size, dev_size, cpu_size, rf_size, sm_size;
 
   function = targ_fn->fn;
 
@@ -917,90 +918,145 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 	seen_zero = 1;
     }
 
-  if (seen_zero)
-    {
-      /* See if the user provided GOMP_OPENACC_DIM environment
-	 variable to specify runtime defaults. */
-      static int default_dims[GOMP_DIM_MAX];
+  /* Both reg_granularity and warp_granularity were extracted from
+     the "Register Allocation Granularity" entry in Nvidia's CUDA
+     Occupancy Calculator spreadsheet.  Specifically, these values
+     apply to SM_30+ targets.  */
+  const int reg_granularity = 256;
+  const int warp_granularity = 4;
 
-      pthread_mutex_lock (&ptx_dev_lock);
-      if (!default_dims[0])
+  /* See if the user provided GOMP_OPENACC_DIM environment variable to
+     specify runtime defaults. */
+  static int default_dims[GOMP_DIM_MAX];
+
+  pthread_mutex_lock (&ptx_dev_lock);
+  if (!default_dims[0])
+    {
+      /* We only read the environment variable once.  You can't
+	 change it in the middle of execution.  The syntax is
+	 the same as for the -fopenacc-dim compilation option.  */
+      const char *env_var = getenv ("GOMP_OPENACC_DIM");
+      if (env_var)
 	{
-	  /* We only read the environment variable once.  You can't
-	     change it in the middle of execution.  The syntax  is
-	     the same as for the -fopenacc-dim compilation option.  */
-	  const char *env_var = getenv ("GOMP_OPENACC_DIM");
-	  if (env_var)
-	    {
-	      const char *pos = env_var;
+	  const char *pos = env_var;
 
-	      for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
+	  for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
+	    {
+	      if (i && *pos++ != ':')
+		break;
+	      if (*pos != ':')
 		{
-		  if (i && *pos++ != ':')
+		  const char *eptr;
+
+		  errno = 0;
+		  long val = strtol (pos, (char **)&eptr, 10);
+		  if (errno || val < 0 || (unsigned)val != val)
 		    break;
-		  if (*pos != ':')
-		    {
-		      const char *eptr;
-
-		      errno = 0;
-		      long val = strtol (pos, (char **)&eptr, 10);
-		      if (errno || val < 0 || (unsigned)val != val)
-			break;
-		      default_dims[i] = (int)val;
-		      pos = eptr;
-		    }
+		  default_dims[i] = (int)val;
+		  pos = eptr;
 		}
 	    }
+	}
 
-	  int warp_size, block_size, dev_size, cpu_size;
-	  CUdevice dev = nvptx_thread()->ptx_dev->dev;
-	  /* 32 is the default for known hardware.  */
-	  int gang = 0, worker = 32, vector = 32;
-	  CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
-
-	  cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
-	  cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
-	  cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
-	  cu_tpm  = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
-
-	  if (cuDeviceGetAttribute (&block_size, cu_tpb, dev) == CUDA_SUCCESS
-	      && cuDeviceGetAttribute (&warp_size, cu_ws, dev) == CUDA_SUCCESS
-	      && cuDeviceGetAttribute (&dev_size, cu_mpc, dev) == CUDA_SUCCESS
-	      && cuDeviceGetAttribute (&cpu_size, cu_tpm, dev)  == CUDA_SUCCESS)
-	    {
-	      GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
-				 " dev_size=%d, cpu_size=%d\n",
-				 warp_size, block_size, dev_size, cpu_size);
-	      gang = (cpu_size / block_size) * dev_size;
-	      worker = block_size / warp_size;
-	      vector = warp_size;
-	    }
-
-	  /* There is no upper bound on the gang size.  The best size
-	     matches the hardware configuration.  Logical gangs are
-	     scheduled onto physical hardware.  To maximize usage, we
-	     should guess a large number.  */
-	  if (default_dims[GOMP_DIM_GANG] < 1)
-	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
-	  /* The worker size must not exceed the hardware.  */
-	  if (default_dims[GOMP_DIM_WORKER] < 1
-	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
-	    default_dims[GOMP_DIM_WORKER] = worker;
-	  /* The vector size must exactly match the hardware.  */
-	  if (default_dims[GOMP_DIM_VECTOR] < 1
-	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
-	    default_dims[GOMP_DIM_VECTOR] = vector;
-
-	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
-			     default_dims[GOMP_DIM_GANG],
-			     default_dims[GOMP_DIM_WORKER],
-			     default_dims[GOMP_DIM_VECTOR]);
+      CUdevice dev = nvptx_thread()->ptx_dev->dev;
+      /* 32 is the default for known hardware.  */
+      int gang = 0, worker = 32, vector = 32;
+      CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm, cu_rf, cu_sm;
+
+      cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
+      cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
+      cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
+      cu_tpm  = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
+      cu_rf  = CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR;
+      cu_sm  = CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR;
+
+      if (cuDeviceGetAttribute (&block_size, cu_tpb, dev) == CUDA_SUCCESS
+	  && cuDeviceGetAttribute (&warp_size, cu_ws, dev) == CUDA_SUCCESS
+	  && cuDeviceGetAttribute (&dev_size, cu_mpc, dev) == CUDA_SUCCESS
+	  && cuDeviceGetAttribute (&cpu_size, cu_tpm, dev) == CUDA_SUCCESS
+	  && cuDeviceGetAttribute (&rf_size, cu_rf, dev)  == CUDA_SUCCESS
+	  && cuDeviceGetAttribute (&sm_size, cu_sm, dev)  == CUDA_SUCCESS)
+	{
+	  GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
+			     " dev_size=%d, cpu_size=%d, regfile_size=%d,"
+			     " smem_size=%d\n",
+			     warp_size, block_size, dev_size, cpu_size,
+			     rf_size, sm_size);
+	  gang = (cpu_size / block_size) * dev_size;
+	  worker = block_size / warp_size;
+	  vector = warp_size;
 	}
-      pthread_mutex_unlock (&ptx_dev_lock);
 
+      /* There is no upper bound on the gang size.  Logical gangs are
+	 scheduled onto physical hardware, so to maximize usage, the
+	 default num_gangs is computed later, once the function's
+	 register usage is known.  */
+      /* The worker size must not exceed the hardware.  */
+      if (default_dims[GOMP_DIM_WORKER] < 1
+	  || (default_dims[GOMP_DIM_WORKER] > worker && gang))
+	default_dims[GOMP_DIM_WORKER] = worker;
+      /* The vector size must exactly match the hardware.  */
+      if (default_dims[GOMP_DIM_VECTOR] < 1
+	  || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
+	default_dims[GOMP_DIM_VECTOR] = vector;
+
+      GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
+			 default_dims[GOMP_DIM_GANG],
+			 default_dims[GOMP_DIM_WORKER],
+			 default_dims[GOMP_DIM_VECTOR]);
+    }
+  pthread_mutex_unlock (&ptx_dev_lock);
+
+  int reg_used = -1;  /* Dummy value.  */
+  cuFuncGetAttribute (&reg_used, CU_FUNC_ATTRIBUTE_NUM_REGS, function);
+
+  int reg_per_warp = ((reg_used * warp_size + reg_granularity - 1)
+		      / reg_granularity) * reg_granularity;
+
+  int threads_per_sm = (rf_size / reg_per_warp / warp_granularity)
+    * warp_granularity * warp_size;
+
+  if (threads_per_sm > cpu_size)
+    threads_per_sm = cpu_size;
+
+  if (seen_zero)
+    {
       for (i = 0; i != GOMP_DIM_MAX; i++)
 	if (!dims[i])
-	  dims[i] = default_dims[i];
+	  {
+	    if (default_dims[i] > 0)
+	      dims[i] = default_dims[i];
+	    else
+	      switch (i)
+		{
+		case GOMP_DIM_GANG:
+		  dims[i] = 2 * threads_per_sm / warp_size * dev_size;
+		  break;
+		case GOMP_DIM_WORKER:
+		case GOMP_DIM_VECTOR:
+		  dims[i] = warp_size;
+		  break;
+		default:
+		  abort ();
+		}
+	  }
+    }
+
+  /* Check if the accelerator has sufficient hardware resources to
+     launch the offloaded kernel.  */
+  if (dims[GOMP_DIM_WORKER] > 1)
+    {
+      int threads_per_block = threads_per_sm > block_size
+	? block_size : threads_per_sm;
+
+      threads_per_block /= warp_size;
+
+      if (dims[GOMP_DIM_WORKER] > threads_per_block)
+	GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources "
+			   "to launch '%s'; recompile the program with "
+			   "'num_workers = %d' on that offloaded region or "
+			   "'-fopenacc-dim=-:%d'.\n",
+			   targ_fn->launch->fn, threads_per_block,
+			   threads_per_block);
     }
 
   /* This reserves a chunk of a pre-allocated page of memory mapped on both
