This is the libgomp plugin side of omp_clock_wtime support on NVPTX. Query GPU frequency and copy the value into the device image.
At the moment the CUDA driver sets the GPU to a fixed frequency when a CUDA context is created (the default is to use the highest non-boost frequency, but it can be altered with the nvidia-smi utility), so as long as dynamic boost is not implemented and thermal throttling does not happen, the queried value should correspond to the actual frequency of %clock64 updates. However, on a GTX Titan we observed that the driver returns a GPU frequency that is midway between the actual frequency and the boost frequency -- we consider that a driver bug. Thus, the implementation comes with the caveat that device-side measurements are less reliable than host-side ones. * plugin/plugin-nvptx.c (struct ptx_device): New field (clock_khz). (nvptx_open_device): Set it. (nvptx_set_clocktick): New. Use it... (GOMP_OFFLOAD_load_image): ...here. --- libgomp/ChangeLog.gomp-nvptx | 7 +++++++ libgomp/plugin/plugin-nvptx.c | 28 +++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index e687586..87e0494 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -287,8 +287,9 @@ struct ptx_device bool overlap; bool map; bool concur; - int mode; bool mkern; + int mode; + int clock_khz; struct ptx_image_data *images; /* Images loaded on device. */ pthread_mutex_t image_lock; /* Lock for above list. */ @@ -641,6 +642,12 @@ nvptx_open_device (int n) ptx_dev->mkern = pi; + r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r)); + + ptx_dev->clock_khz = pi; + r = cuDeviceGetAttribute (&async_engines, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); if (r != CUDA_SUCCESS) @@ -1505,6 +1512,23 @@ GOMP_OFFLOAD_version (void) return GOMP_VERSION; } +/* Initialize __nvptx_clocktick, if present in MODULE. 
*/ + +static void +nvptx_set_clocktick (CUmodule module, struct ptx_device *dev) +{ + CUdeviceptr dptr; + CUresult r = cuModuleGetGlobal (&dptr, NULL, module, "__nvptx_clocktick"); + if (r == CUDA_ERROR_NOT_FOUND) + return; + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r)); + double __nvptx_clocktick = 1e-3 / dev->clock_khz; + r = cuMemcpyHtoD (dptr, &__nvptx_clocktick, sizeof (__nvptx_clocktick)); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r)); +} + /* Load the (partial) program described by TARGET_DATA to device number ORD. Allocate and return TARGET_TABLE. */ @@ -1590,6 +1614,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, targ_tbl->end = targ_tbl->start + bytes; } + nvptx_set_clocktick (module, dev); + return fn_entries + var_entries; }