This bug was encountered in Slurm version 17.02.3.

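There is a race condition in which the pthread condition variables are
destroyed in acct_gather_profile_endpoll() before the threads waiting on
them have exited, which can cause the stepd to hang on job exit. The patch
below moves the pthread_cond_destroy() calls into acct_gather_profile_fini(),
after the timer thread has been joined.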
From 8fda001f9477223d584fe7a42b664b9973e431ac Mon Sep 17 00:00:00 2001
From: Hongjia Cao <[email protected]>
Date: Thu, 22 Jun 2017 20:45:48 +0800
Subject: [PATCH] fix of stepd hang on job exit

---
 src/common/slurm_acct_gather_profile.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/common/slurm_acct_gather_profile.c b/src/common/slurm_acct_gather_profile.c
index 3281170644..37f672494b 100644
--- a/src/common/slurm_acct_gather_profile.c
+++ b/src/common/slurm_acct_gather_profile.c
@@ -267,6 +267,10 @@ extern int acct_gather_profile_fini(void)
 		pthread_join(timer_thread_id, NULL);
 	}
 
+	for (i=0; i < PROFILE_CNT; i++) {
+		pthread_cond_destroy(&acct_gather_profile_timer[i].notify);
+	}
+
 	rc = plugin_context_destroy(g_context);
 	g_context = NULL;
 done:
@@ -524,7 +528,6 @@ extern void acct_gather_profile_endpoll(void)
 		slurm_mutex_lock(&acct_gather_profile_timer[i].notify_mutex);
 		slurm_cond_signal(&acct_gather_profile_timer[i].notify);
 		slurm_mutex_unlock(&acct_gather_profile_timer[i].notify_mutex);
-		pthread_cond_destroy(&acct_gather_profile_timer[i].notify);
 		acct_gather_profile_timer[i].freq = 0;
 		switch (i) {
 		case PROFILE_ENERGY:
-- 
2.11.0

Reply via email to