Hello,
We want to introduce a new behavior in the way slurmd uses the
HealthCheckProgram. The idea is to avoid a race condition between the
first HealthCheckProgram run and the node accepting jobs. The slurmd
daemon will initialize and then loop on HealthCheckProgram execution
before registering with slurmctld. It will stay in this loop until the
HealthCheckProgram returns successfully (the node remains DOWN in the
meantime).
On our clusters we are using NHC as a HealthCheckProgram. NHC drains
the node if it fails and removes the drain if it is successful; this
behavior fits well with our purpose. It permits us to start
slurmd at boot without setting up a complex boot sequence in the init
system: slurmd just waits for the node to be ready before registering.
The HealthCheckProgram is not run during slurmd startup if
HealthCheckInterval is 0.
We are looking for comments and feedback on this proposed behavior and
would like to know if something like this could be included in the next
15.08 release or 16.05.
regards,
Thomas
Patch (on 15.08.8):
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index 309c91d..912b1fe 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -2534,10 +2534,8 @@ _rpc_health_check(slurm_msg_t *msg)
send_registration_msg(SLURM_SUCCESS, false);
}
- if ((rc == SLURM_SUCCESS) && (conf->health_check_program)) {
- char *env[1] = { NULL };
- rc = run_script("health_check", conf->health_check_program,
- 0, 60, env, 0);
+ if (rc == SLURM_SUCCESS) {
+ rc = run_script_health_check();
}
/* Take this opportunity to enforce any job memory limits */
diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c
index dfb416f..b9e15fc 100644
--- a/src/slurmd/slurmd/slurmd.c
+++ b/src/slurmd/slurmd/slurmd.c
@@ -104,6 +104,7 @@
#include "src/slurmd/common/core_spec_plugin.h"
#include "src/slurmd/common/job_container_plugin.h"
#include "src/slurmd/common/proctrack.h"
+#include "src/slurmd/common/run_script.h"
#include "src/slurmd/common/slurmd_cgroup.h"
#include "src/slurmd/common/xcpuinfo.h"
#include "src/slurmd/slurmd/get_mach_stat.h"
@@ -119,6 +120,8 @@
#define MAX_THREADS 256
+#define HEALTH_RETRY_DELAY 10
+
#define _free_and_set(__dst, __src) \
xfree(__dst); __dst = __src
@@ -203,6 +206,7 @@ static void _update_nice(void);
static void _usage(void);
static int _validate_and_convert_cpu_list(void);
static void _wait_for_all_threads(int secs);
+static void _wait_health_check(void);
int
@@ -363,6 +367,10 @@ main (int argc, char *argv[])
if (slurmd_plugstack_init())
fatal("failed to initialize slurmd_plugstack");
+ /* Wait for a successful health check if
+ * HealthCheckInterval != 0 */
+ _wait_health_check();
+
_spawn_registration_engine();
msg_aggr_sender_init(conf->hostname, conf->port,
conf->msg_aggr_window_time,
@@ -1002,6 +1010,8 @@ _read_config(void)
conf->mem_limit_enforce = cf->mem_limit_enforce;
+ conf->health_check_interval = cf->health_check_interval;
+
slurm_mutex_unlock(&conf->config_mutex);
slurm_conf_unlock();
}
@@ -2290,3 +2300,40 @@ static void _resource_spec_fini(void)
FREE_NULL_BITMAP(res_core_bitmap);
FREE_NULL_BITMAP(res_cpu_bitmap);
}
+
+/*
+ * Wait for the health check to execute successfully, re-running it
+ * every HEALTH_RETRY_DELAY seconds and logging each failed attempt.
+ * Return immediately if a shutdown has been requested or
+ * if the HealthCheckInterval is 0.
+ */
+static void _wait_health_check(void)
+{
+ while (!_shutdown &&
+ (conf->health_check_interval != 0 ) &&
+ (run_script_health_check() != SLURM_SUCCESS)) {
+ info ("Node Health Check failed, retrying in %ds...",
+ HEALTH_RETRY_DELAY);
+ sleep(HEALTH_RETRY_DELAY);
+ }
+}
+
+/*
+ * Run the configured health check program
+ *
+ * Returns the run_script() result. If no health check program
+ * is configured (conf->health_check_program is NULL), returns
+ * SLURM_SUCCESS without running anything.
+ */
+extern int run_script_health_check(void)
+{
+ int rc = SLURM_SUCCESS;
+
+ if (conf->health_check_program) {
+ char *env[1] = { NULL };
+ rc = run_script("health_check", conf->health_check_program,
+ 0, 60, env, 0);
+ }
+
+ return rc;
+}
diff --git a/src/slurmd/slurmd/slurmd.h b/src/slurmd/slurmd/slurmd.h
index d253815..c58a54a 100644
--- a/src/slurmd/slurmd/slurmd.h
+++ b/src/slurmd/slurmd/slurmd.h
@@ -125,7 +125,8 @@ typedef struct slurmd_config {
char *logfile; /* slurmd logfile, if any */
char *spooldir; /* SlurmdSpoolDir */
char *pidfile; /* PidFile location */
- char *health_check_program; /* run on RPC request */
+ char *health_check_program; /* run on RPC request or at start */
+ uint64_t health_check_interval; /* Interval between runs */
char *tmpfs; /* directory of tmp FS */
char *pubkey; /* location of job cred public key */
char *epilog; /* Path to Epilog script */
@@ -194,5 +195,8 @@ int send_registration_msg(uint32_t status, bool startup);
*/
int save_cred_state(slurm_cred_ctx_t vctx);
+/* Run the health check program if configured
+ */
+int run_script_health_check(void);
#endif /* !_SLURMD_H */
--
Thomas HAMEL
github: hmlth