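This bug was encountered in Slurm version 17.02.3.
There is a race condition in which the pthread condition variables are
destroyed in acct_gather_profile_endpoll() before the watching threads
have exited, which can cause stepd to hang on job exit. This patch moves
the pthread_cond_destroy() calls into acct_gather_profile_fini(), after
the timer thread has been joined.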
From 8fda001f9477223d584fe7a42b664b9973e431ac Mon Sep 17 00:00:00 2001
From: Hongjia Cao <[email protected]>
Date: Thu, 22 Jun 2017 20:45:48 +0800
Subject: [PATCH] fix of stepd hang on job exit
---
src/common/slurm_acct_gather_profile.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/common/slurm_acct_gather_profile.c b/src/common/slurm_acct_gather_profile.c
index 3281170644..37f672494b 100644
--- a/src/common/slurm_acct_gather_profile.c
+++ b/src/common/slurm_acct_gather_profile.c
@@ -267,6 +267,10 @@ extern int acct_gather_profile_fini(void)
pthread_join(timer_thread_id, NULL);
}
+ for (i=0; i < PROFILE_CNT; i++) {
+ pthread_cond_destroy(&acct_gather_profile_timer[i].notify);
+ }
+
rc = plugin_context_destroy(g_context);
g_context = NULL;
done:
@@ -524,7 +528,6 @@ extern void acct_gather_profile_endpoll(void)
slurm_mutex_lock(&acct_gather_profile_timer[i].notify_mutex);
slurm_cond_signal(&acct_gather_profile_timer[i].notify);
slurm_mutex_unlock(&acct_gather_profile_timer[i].notify_mutex);
- pthread_cond_destroy(&acct_gather_profile_timer[i].notify);
acct_gather_profile_timer[i].freq = 0;
switch (i) {
case PROFILE_ENERGY:
--
2.11.0