Hi,
When using CUDA 9 nvprof with an OpenACC executable, the executable hangs.
The scenario resulting in the hang is as follows:
1. goacc_lazy_initialize calls gomp_mutex_lock (&acc_device_lock)
2. goacc_lazy_initialize calls acc_init_1
3. acc_init_1 calls goacc_profiling_dispatch (&prof_info,
&device_init_event_info, &api_info);
4. goacc_profiling_dispatch calls the registered callback in the cuda
profiling library
5. the registered callback calls acc_get_device_type
6. acc_get_device_type calls gomp_mutex_lock (&acc_device_lock)
7. The lock is not recursive, so we have deadlock
The registered callback in CUDA 8 does not call acc_get_device_type, so
the hang doesn't occur there.
This patch fixes the hang by detecting in acc_get_device_type that the
calling thread is the one currently initializing the OpenACC part of the
libgomp library, and returning acc_device_none in that case, which is a
legal value given that the OpenACC standard states "If the device type
has not yet been selected, the value acc_device_none may be returned".
Committed to og7 branch.
Thanks,
- Tom
Fix hang when running oacc exec with CUDA 9.0 nvprof
2018-02-15 Tom de Vries <t...@codesourcery.com>
* oacc-init.c (acc_init_state_lock, acc_init_state, acc_init_thread):
New variables.
(acc_init_1): Set acc_init_thread to pthread_self (). Set
acc_init_state to initializing at the start, and to initialized at the
end.
(self_initializing_p): New function.
(acc_get_device_type): Return acc_device_none if called by thread that
is currently executing acc_init_1.
---
libgomp/oacc-init.c | 33 +++++++++++++++++++++++++++++++++
2 files changed, 44 insertions(+)
diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
index 6dada0b..d8348c0 100644
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -40,6 +40,11 @@
static gomp_mutex_t acc_device_lock;
+static gomp_mutex_t acc_init_state_lock; /* Held around every access to acc_init_state and acc_init_thread.  */
+static enum { uninitialized, initializing, initialized } acc_init_state
+ = uninitialized; /* acc_init_1 sets this to initializing on entry and to initialized before its final return.  */
+static pthread_t acc_init_thread; /* Set to pthread_self () on entry to acc_init_1.  */
+
/* A cached version of the dispatcher for the global "current" accelerator type,
e.g. used as the default when creating new host threads. This is the
device-type equivalent of goacc_device_num (which specifies which device to
@@ -220,6 +225,11 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs)
static struct gomp_device_descr *
acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
{
+ gomp_mutex_lock (&acc_init_state_lock);
+ acc_init_state = initializing;
+ acc_init_thread = pthread_self ();
+ gomp_mutex_unlock (&acc_init_state_lock);
+
bool check_not_nested_p;
if (implicit)
{
@@ -312,6 +322,9 @@ acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
&api_info);
}
+ gomp_mutex_lock (&acc_init_state_lock);
+ acc_init_state = initialized;
+ gomp_mutex_unlock (&acc_init_state_lock);
return base_dev;
}
@@ -644,6 +657,17 @@ acc_set_device_type (acc_device_t d)
ialias (acc_set_device_type)
+static bool
+self_initializing_p (void) /* Return true iff acc_init_state is initializing and acc_init_thread is the calling thread.  */
+{
+ bool res;
+ gomp_mutex_lock (&acc_init_state_lock); /* Read both variables under the same lock acc_init_1 writes them under.  */
+ res = (acc_init_state == initializing
+ && pthread_equal (acc_init_thread, pthread_self ())); /* pthread_t values are compared with pthread_equal, not ==.  */
+ gomp_mutex_unlock (&acc_init_state_lock);
+ return res;
+}
+
acc_device_t
acc_get_device_type (void)
{
@@ -653,6 +677,15 @@ acc_get_device_type (void)
if (thr && thr->base_dev)
res = acc_device_type (thr->base_dev->type);
+ else if (self_initializing_p ())
+ /* The Cuda libaccinj64.so version 9.0+ calls acc_get_device_type during the
+ acc_ev_device_init_start event callback, which is dispatched during
+ acc_init_1. Trying to lock acc_device_lock during such a call (as we do
+ in the else clause below), will result in deadlock, since the lock has
+ already been taken by the acc_init_1 caller. We work around this problem
+ by using the acc_get_device_type property "If the device type has not yet
+ been selected, the value acc_device_none may be returned". */
+ ;
else
{
acc_prof_info prof_info;