Hello,

We want to introduce a new behavior in the way slurmd uses the HealthCheckProgram. The idea is to avoid a race condition between the first HealthCheckProgram run and the node accepting jobs. The slurmd daemon will initialize and then loop on HealthCheckProgram execution before registering with slurmctld. It will stay in this loop until the HealthCheckProgram returns successfully (the node is still DOWN).

On our clusters we are using NHC as a HealthCheckProgram. NHC drains the node if the check fails and removes the drain if it is successful, so this behavior fits well with our purpose. It permits us to start slurmd at boot without setting up a complex boot sequence in the init system: slurmd just waits for the node to be ready before registering.

The HealthCheckProgram is not run during slurmd startup if HealthCheckInterval is 0.

We are looking for comments and feedback on this proposed behavior and would like to know if something like this could be included in the next 15.08 release or 16.05.

Regards,

Thomas

Patch (on 15.08.8):

diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index 309c91d..912b1fe 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -2534,10 +2534,8 @@ _rpc_health_check(slurm_msg_t *msg)
                send_registration_msg(SLURM_SUCCESS, false);
        }

-       if ((rc == SLURM_SUCCESS) && (conf->health_check_program)) {
-               char *env[1] = { NULL };
- rc = run_script("health_check", conf->health_check_program,
-                               0, 60, env, 0);
+       if (rc == SLURM_SUCCESS) {
+               rc = run_script_health_check();
        }

        /* Take this opportunity to enforce any job memory limits */
diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c
index dfb416f..b9e15fc 100644
--- a/src/slurmd/slurmd/slurmd.c
+++ b/src/slurmd/slurmd/slurmd.c
@@ -104,6 +104,7 @@
 #include "src/slurmd/common/core_spec_plugin.h"
 #include "src/slurmd/common/job_container_plugin.h"
 #include "src/slurmd/common/proctrack.h"
+#include "src/slurmd/common/run_script.h"
 #include "src/slurmd/common/slurmd_cgroup.h"
 #include "src/slurmd/common/xcpuinfo.h"
 #include "src/slurmd/slurmd/get_mach_stat.h"
@@ -119,6 +120,8 @@

 #define MAX_THREADS            256

+#define HEALTH_RETRY_DELAY 10
+
 #define _free_and_set(__dst, __src) \
        xfree(__dst); __dst = __src

@@ -203,6 +206,7 @@ static void      _update_nice(void);
 static void      _usage(void);
 static int       _validate_and_convert_cpu_list(void);
 static void      _wait_for_all_threads(int secs);
+static void      _wait_health_check(void);


 int
@@ -363,6 +367,10 @@ main (int argc, char *argv[])
        if (slurmd_plugstack_init())
                fatal("failed to initialize slurmd_plugstack");

+       /* Wait for a successful health check if
+        * HealthCheckInterval != 0 */
+       _wait_health_check();
+
        _spawn_registration_engine();
        msg_aggr_sender_init(conf->hostname, conf->port,
                             conf->msg_aggr_window_time,
@@ -1002,6 +1010,8 @@ _read_config(void)

        conf->mem_limit_enforce = cf->mem_limit_enforce;

+       conf->health_check_interval = cf->health_check_interval;
+
        slurm_mutex_unlock(&conf->config_mutex);
        slurm_conf_unlock();
 }
@@ -2290,3 +2300,40 @@ static void _resource_spec_fini(void)
        FREE_NULL_BITMAP(res_core_bitmap);
        FREE_NULL_BITMAP(res_cpu_bitmap);
 }
+
+/*
+ * Wait for health check to execute successfully
+ *
+ * Return immediately if a shutdown has been requested or
+ * if the HealthCheckInterval is 0.
+ */
+static void _wait_health_check(void)
+{
+       while (!_shutdown &&
+           (conf->health_check_interval != 0 ) &&
+           (run_script_health_check() != SLURM_SUCCESS)) {
+               info ("Node Health Check failed, retrying in %ds...",
+                   HEALTH_RETRY_DELAY);
+               sleep(HEALTH_RETRY_DELAY);
+  }
+}
+
+/*
+ * Run the configured health check program
+ *
+ * Returns the run result. If the health check program
+ * is not defined, returns success immediately.
+ *
+ */
+extern int run_script_health_check(void)
+{
+       int rc = SLURM_SUCCESS;
+
+       if (conf->health_check_program) {
+               char *env[1] = { NULL };
+ rc = run_script("health_check", conf->health_check_program,
+                   0, 60, env, 0);
+       }
+
+  return rc;
+}
diff --git a/src/slurmd/slurmd/slurmd.h b/src/slurmd/slurmd/slurmd.h
index d253815..c58a54a 100644
--- a/src/slurmd/slurmd/slurmd.h
+++ b/src/slurmd/slurmd/slurmd.h
@@ -125,7 +125,8 @@ typedef struct slurmd_config {
        char      *logfile;              /* slurmd logfile, if any */
        char      *spooldir;             /* SlurmdSpoolDir */
        char      *pidfile;              /* PidFile location */
-       char      *health_check_program; /* run on RPC request */
+       char      *health_check_program; /* run on RPC request or at start */
+       uint64_t  health_check_interval; /* Interval between runs */
        char      *tmpfs;                /* directory of tmp FS */
        char      *pubkey;               /* location of job cred public key */
        char      *epilog;               /* Path to Epilog script */
@@ -194,5 +195,8 @@ int send_registration_msg(uint32_t status, bool startup);
  */
 int save_cred_state(slurm_cred_ctx_t vctx);

+/* Run the health check program if configured
+ */
+int run_script_health_check(void);

 #endif /* !_SLURMD_H */

--
Thomas HAMEL
github: hmlth

Reply via email to