This is the libgomp plugin side of omp_get_wtime support on NVPTX.  Query the
GPU clock frequency and store the corresponding tick duration in the
__nvptx_clocktick variable of the loaded device image.

At the moment the CUDA driver sets the GPU to a fixed frequency when a CUDA
context is created (the default is the highest non-boost frequency, but it can
be altered with the nvidia-smi utility), so as long as dynamic boost is not
implemented and thermal throttling does not happen, the queried value should
correspond to the actual rate at which %clock64 advances.  However, on GTX
Titan we observed that the driver reports a GPU frequency midway between the
actual frequency and the boost frequency -- we consider that a driver bug.
Thus, the implementation comes with the caveat that device-side time
measurements are less reliable than host-side ones.
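
For illustration, the device side could consume __nvptx_clocktick roughly as
follows; this is a minimal sketch rather than the actual libgomp device code,
and it assumes the nvptx "r" inline-asm constraint is usable for the 64-bit
%clock64 read:

  /* Seconds per %clock64 tick; written by the plugin via
     cuModuleGetGlobal + cuMemcpyHtoD in nvptx_set_clocktick below.  */
  double __nvptx_clocktick;

  double
  omp_get_wtime (void)
  {
    unsigned long long clock;
    /* Read the per-multiprocessor 64-bit cycle counter.  */
    asm volatile ("mov.u64 %0, %%clock64;" : "=r" (clock));
    return clock * __nvptx_clocktick;
  }

  double
  omp_get_wtick (void)
  {
    return __nvptx_clocktick;
  }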

        * plugin/plugin-nvptx.c (struct ptx_device): New field (clock_khz).
        (nvptx_open_device): Set it.
        (nvptx_set_clocktick): New.  Use it...
        (GOMP_OFFLOAD_load_image): ...here.
---
 libgomp/ChangeLog.gomp-nvptx  |  7 +++++++
 libgomp/plugin/plugin-nvptx.c | 28 +++++++++++++++++++++++++++-
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index e687586..87e0494 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -287,8 +287,9 @@ struct ptx_device
   bool overlap;
   bool map;
   bool concur;
-  int  mode;
   bool mkern;
+  int  mode;
+  int clock_khz;
 
   struct ptx_image_data *images;  /* Images loaded on device.  */
   pthread_mutex_t image_lock;     /* Lock for above list.  */
@@ -641,6 +642,12 @@ nvptx_open_device (int n)
 
   ptx_dev->mkern = pi;
 
+  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+  ptx_dev->clock_khz = pi;
+
   r = cuDeviceGetAttribute (&async_engines,
                            CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
   if (r != CUDA_SUCCESS)
@@ -1505,6 +1512,23 @@ GOMP_OFFLOAD_version (void)
   return GOMP_VERSION;
 }
 
+/* Initialize __nvptx_clocktick, if present in MODULE.  */
+
+static void
+nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
+{
+  CUdeviceptr dptr;
+  CUresult r = cuModuleGetGlobal (&dptr, NULL, module, "__nvptx_clocktick");
+  if (r == CUDA_ERROR_NOT_FOUND)
+    return;
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
+  double __nvptx_clocktick = 1e-3 / dev->clock_khz;
+  r = cuMemcpyHtoD (dptr, &__nvptx_clocktick, sizeof (__nvptx_clocktick));
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
+}
+
 /* Load the (partial) program described by TARGET_DATA to device
    number ORD.  Allocate and return TARGET_TABLE.  */
 
@@ -1590,6 +1614,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
       targ_tbl->end = targ_tbl->start + bytes;
     }
 
+  nvptx_set_clocktick (module, dev);
+
   return fn_entries + var_entries;
 }
 

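For reference, CU_DEVICE_ATTRIBUTE_CLOCK_RATE is reported in kHz, so the tick
duration is 1e-3 / clock_khz seconds; e.g. a 705500 kHz clock gives roughly
1.417e-9 s per %clock64 tick.  The query can be reproduced with a standalone
driver-API test program along these lines (illustrative only, not part of the
patch):

  #include <stdio.h>
  #include <cuda.h>

  int
  main (void)
  {
    CUdevice dev;
    int clock_khz;

    /* Initialize the driver API and take the first device.  */
    if (cuInit (0) != CUDA_SUCCESS || cuDeviceGet (&dev, 0) != CUDA_SUCCESS)
      return 1;

    /* Peak clock frequency of the device, in kHz.  */
    if (cuDeviceGetAttribute (&clock_khz, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev)
        != CUDA_SUCCESS)
      return 1;

    printf ("clock rate: %d kHz, clocktick: %g s\n",
            clock_khz, 1e-3 / clock_khz);
    return 0;
  }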