Hello there!
>On Monday, July 18, at 17:27 I received:
>>Your best solution today would be to use the "drain" node state flag
>>and the "reason" field (scontrol update nodename=... state=drain
>>reason=maint). Another program would need to notice when the
>>node was drained (no running jobs), perform the maintenance, and
>>resume the node (scontrol update nodename=... state=resume). This
>>could be better automated, but this is what exists today and I know
>>some sites use this.
> I see... it's not the right way to do it, as nodes will be out of service
>right after the 'scontrol' command for every node in that case, and nobody
>can even queue a job... so it really needs some deeper development then.
>I'll make a solution and send it to the list when it's ready and tested. :)
The patch for on-the-fly maintenance is attached. Here is how it works:
1) we have new parameter in slurm.conf:
RebootProgram
that parameter holds the command which will be run on compute nodes to
perform the actual reboot - for example, "/sbin/shutdown -r now"
2) we execute (as administrator) command:
scontrol reboot_nodes
3) every node (as soon as it goes idle) will attain MAINT state and go to
reboot; if node has netboot then it should get upgraded/bugfixed after
the reboot and returned to work in the cluster
4) nodes which have running jobs will continue to receive queued jobs but
are also scheduled to reboot, so new jobs will run after the reboot
5) nodes which are in maintenance reservation are not affected by this.
So far it has only been tested on a test cluster; production clusters
have not been upgraded yet. Everyone is welcome to test this and send me
your complaints or questions.
No documentation changes were made either, and no RPC change was made,
so the RebootProgram parameter is not yet printed by
scontrol show config
but an appropriate note is there, see the patch. Also, the member
reboot_program was not placed in alphabetical order in slurm_ctl_conf_t,
as doing so breaks existing plugins; that should be done only with a
version change (we still use the 2.2 branch).
Hope to see it accepted into next SLURM version. :)
>>> There is a need sometimes to do some softupgrades on nodes - for
>>> example, for kernel upgrade due to security fixes. Such kind of upgrades
>>> should not affect cluster functionality but nodes have to be rebooted as
>>> soon as no job is running on them without anything else - i.e. node marked as
>>> wanted reboot but still able to accept queued jobs and when current job
>>> ends then node rebooted, unmarked and ready to get next job. As such work
>>> may affect whole cluster to do it manually is inappropriate and also
>>> there is no need to stop whole cluster for maintenance and even we should
>>> not stop it as it's not critical issue for the cluster.
>>> I found no solution for this in SLURM. Have I missed something or
>>> should I make such functionality (probably using NODE_STATE_MAINT flag)
>>> as our tech.support highly require this?
With best wishes.
Andriy.
diff -ur slurm-2.2.6/slurm/slurm.h.in slurm-2.2.6.reboot/slurm/slurm.h.in
--- slurm-2.2.6/slurm/slurm.h.in 2011-05-27 21:25:04.000000000 +0300
+++ slurm-2.2.6.reboot/slurm/slurm.h.in 2011-08-16 19:31:05.000000000 +0300
@@ -1849,6 +1849,8 @@
uint16_t z_16; /* reserved for future use */
uint32_t z_32; /* reserved for future use */
char *z_char; /* reserved for future use */
+ /* should be placed above but kept here for compatibility with plugins */
+ char *reboot_program; /* program to reboot the node */
} slurm_ctl_conf_t;
typedef struct slurmd_status_msg {
diff -ur slurm-2.2.6/src/api/config_info.c slurm-2.2.6.reboot/src/api/config_info.c
--- slurm-2.2.6/src/api/config_info.c 2011-05-27 21:25:04.000000000 +0300
+++ slurm-2.2.6.reboot/src/api/config_info.c 2011-08-12 15:39:44.000000000 +0300
@@ -754,6 +754,11 @@
list_append(ret_list, key_pair);
key_pair = xmalloc(sizeof(config_key_pair_t));
+ key_pair->name = xstrdup("RebootProgram");
+ key_pair->value = xstrdup(slurm_ctl_conf_ptr->reboot_program);
+ list_append(ret_list, key_pair);
+
+ key_pair = xmalloc(sizeof(config_key_pair_t));
key_pair->name = xstrdup("ResumeProgram");
key_pair->value = xstrdup(slurm_ctl_conf_ptr->resume_program);
list_append(ret_list, key_pair);
diff -ur slurm-2.2.6/src/common/forward.c slurm-2.2.6.reboot/src/common/forward.c
--- slurm-2.2.6/src/common/forward.c 2011-05-27 21:25:05.000000000 +0300
+++ slurm-2.2.6.reboot/src/common/forward.c 2011-08-17 15:25:49.000000000 +0300
@@ -176,7 +176,8 @@
}
if ((fwd_msg->header.msg_type == REQUEST_SHUTDOWN) ||
- (fwd_msg->header.msg_type == REQUEST_RECONFIGURE)) {
+ (fwd_msg->header.msg_type == REQUEST_RECONFIGURE) ||
+ (fwd_msg->header.msg_type == REQUEST_REBOOT_NODES)) {
slurm_mutex_lock(fwd_msg->forward_mutex);
ret_data_info = xmalloc(sizeof(ret_data_info_t));
list_push(fwd_msg->ret_list, ret_data_info);
diff -ur slurm-2.2.6/src/common/read_config.c slurm-2.2.6.reboot/src/common/read_config.c
--- slurm-2.2.6/src/common/read_config.c 2011-05-27 21:25:05.000000000 +0300
+++ slurm-2.2.6.reboot/src/common/read_config.c 2011-08-12 15:45:15.000000000 +0300
@@ -229,6 +229,7 @@
{"PropagatePrioProcess", S_P_UINT16},
{"PropagateResourceLimitsExcept", S_P_STRING},
{"PropagateResourceLimits", S_P_STRING},
+ {"RebootProgram", S_P_STRING},
{"ResumeProgram", S_P_STRING},
{"ResumeRate", S_P_UINT16},
{"ResumeTimeout", S_P_UINT16},
@@ -1507,6 +1508,7 @@
xfree (ctl_conf_ptr->prolog_slurmctld);
xfree (ctl_conf_ptr->propagate_rlimits);
xfree (ctl_conf_ptr->propagate_rlimits_except);
+ xfree (ctl_conf_ptr->reboot_program);
xfree (ctl_conf_ptr->resume_program);
xfree (ctl_conf_ptr->salloc_default_command);
xfree (ctl_conf_ptr->sched_logfile);
@@ -1621,6 +1623,7 @@
ctl_conf_ptr->propagate_prio_process = (uint16_t) NO_VAL;
xfree (ctl_conf_ptr->propagate_rlimits);
xfree (ctl_conf_ptr->propagate_rlimits_except);
+ xfree (ctl_conf_ptr->reboot_program);
ctl_conf_ptr->resume_timeout = 0;
xfree (ctl_conf_ptr->resume_program);
ctl_conf_ptr->resume_rate = (uint16_t) NO_VAL;
@@ -2507,6 +2510,8 @@
if (!s_p_get_uint16(&conf->resume_timeout, "ResumeTimeout", hashtbl))
conf->resume_timeout = DEFAULT_RESUME_TIMEOUT;
+ s_p_get_string(&conf->reboot_program, "RebootProgram", hashtbl);
+
s_p_get_string(&conf->salloc_default_command, "SallocDefaultCommand",
hashtbl);
diff -ur slurm-2.2.6/src/common/slurm_protocol_defs.c slurm-2.2.6.reboot/src/common/slurm_protocol_defs.c
--- slurm-2.2.6/src/common/slurm_protocol_defs.c 2011-05-27 21:25:05.000000000 +0300
+++ slurm-2.2.6.reboot/src/common/slurm_protocol_defs.c 2011-08-17 17:16:49.000000000 +0300
@@ -1194,7 +1194,8 @@
if (maint_flag) {
if (no_resp_flag)
return "MAINT*";
- return "MAINT";
+ if (base != NODE_STATE_ALLOCATED)
+ return "MAINT";
}
if (drain_flag) {
if (comp_flag || (base == NODE_STATE_ALLOCATED)) {
@@ -1308,7 +1309,8 @@
if (maint_flag) {
if (no_resp_flag)
return "MAINT*";
- return "MAINT";
+ if (inx != NODE_STATE_ALLOCATED)
+ return "MAINT";
}
if (drain_flag) {
if (comp_flag || (inx == NODE_STATE_ALLOCATED)) {
@@ -2257,6 +2259,7 @@
case REQUEST_HEALTH_CHECK:
case ACCOUNTING_FIRST_REG:
case REQUEST_TOPO_INFO:
+ case REQUEST_REBOOT_NODES:
/* No body to free */
break;
case ACCOUNTING_UPDATE_MSG:
diff -ur slurm-2.2.6/src/common/slurm_protocol_defs.h slurm-2.2.6.reboot/src/common/slurm_protocol_defs.h
--- slurm-2.2.6/src/common/slurm_protocol_defs.h 2011-05-27 21:25:05.000000000 +0300
+++ slurm-2.2.6.reboot/src/common/slurm_protocol_defs.h 2011-07-19 15:47:53.000000000 +0300
@@ -179,6 +179,7 @@
REQUEST_HEALTH_CHECK,
REQUEST_TAKEOVER,
REQUEST_SET_SCHEDLOG_LEVEL,
+ REQUEST_REBOOT_NODES,
REQUEST_BUILD_INFO = 2001,
RESPONSE_BUILD_INFO,
diff -ur slurm-2.2.6/src/common/slurm_protocol_pack.c slurm-2.2.6.reboot/src/common/slurm_protocol_pack.c
--- slurm-2.2.6/src/common/slurm_protocol_pack.c 2011-05-27 21:25:05.000000000 +0300
+++ slurm-2.2.6.reboot/src/common/slurm_protocol_pack.c 2011-08-17 17:46:55.000000000 +0300
@@ -721,6 +721,7 @@
case REQUEST_HEALTH_CHECK:
case ACCOUNTING_FIRST_REG:
case REQUEST_TOPO_INFO:
+ case REQUEST_REBOOT_NODES:
/* Message contains no body/information */
break;
case REQUEST_SHUTDOWN:
@@ -1198,6 +1199,7 @@
case REQUEST_HEALTH_CHECK:
case ACCOUNTING_FIRST_REG:
case REQUEST_TOPO_INFO:
+ case REQUEST_REBOOT_NODES:
/* Message contains no body/information */
break;
case REQUEST_SHUTDOWN:
@@ -3976,6 +3978,8 @@
packstr(build_ptr->propagate_rlimits, buffer);
packstr(build_ptr->propagate_rlimits_except, buffer);
+ /* should be added with protocol_version increase!
+ packstr(build_ptr->reboot_program, buffer); */
packstr(build_ptr->resume_program, buffer);
pack16(build_ptr->resume_rate, buffer);
pack16(build_ptr->resume_timeout, buffer);
@@ -4426,6 +4430,9 @@
safe_unpackstr_xmalloc(&build_ptr->propagate_rlimits_except,
&uint32_tmp, buffer);
+ /* should be added with protocol_version increase!
+ safe_unpackstr_xmalloc(&build_ptr->reboot_program, &uint32_tmp,
+ buffer); */
safe_unpackstr_xmalloc(&build_ptr->resume_program,
&uint32_tmp, buffer);
safe_unpack16(&build_ptr->resume_rate, buffer);
diff -ur slurm-2.2.6/src/scontrol/scontrol.c slurm-2.2.6.reboot/src/scontrol/scontrol.c
--- slurm-2.2.6/src/scontrol/scontrol.c 2011-05-27 21:25:06.000000000 +0300
+++ slurm-2.2.6.reboot/src/scontrol/scontrol.c 2011-08-15 18:43:29.000000000 +0300
@@ -526,6 +526,28 @@
}
/*
+ * _reboot_nodes - issue RPC to have computing nodes reboot when idle
+ * RET 0 or a slurm error code
+ */
+int _reboot_nodes(void)
+{
+ int rc;
+ slurm_msg_t req;
+
+ slurm_msg_t_init(&req);
+
+ req.msg_type = REQUEST_REBOOT_NODES;
+
+ if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0)
+ return SLURM_ERROR;
+
+ if (rc)
+ slurm_seterrno_ret(rc);
+
+ return rc;
+}
+
+/*
* _process_command - process the user's command
* IN argc - count of arguments
* IN argv - the arguments
@@ -708,6 +730,20 @@
}
exit_flag = 1;
}
+ else if (strncasecmp (tag, "reboot_nodes", MAX(taglen, 3)) == 0) {
+ if (argc > 1) {
+ exit_code = 1;
+ fprintf (stderr,
+ "too many arguments for keyword:%s\n",
+ tag);
+ }
+ error_code = _reboot_nodes();
+ if (error_code) {
+ exit_code = 1;
+ if (quiet_flag != 1)
+ slurm_perror ("scontrol_reboot_nodes error");
+ }
+ }
else if (strncasecmp (tag, "reconfigure", MAX(taglen, 3)) == 0) {
if (argc > 2) {
exit_code = 1;
diff -ur slurm-2.2.6/src/slurmctld/controller.c slurm-2.2.6.reboot/src/slurmctld/controller.c
--- slurm-2.2.6/src/slurmctld/controller.c 2011-05-27 21:25:06.000000000 +0300
+++ slurm-2.2.6.reboot/src/slurmctld/controller.c 2011-08-17 15:13:49.000000000 +0300
@@ -160,6 +160,7 @@
bool ping_nodes_now = false;
uint32_t cluster_cpus = 0;
int with_slurmdbd = 0;
+bool want_nodes_reboot = true;
/* Local variables */
static int daemonize = DEFAULT_DAEMONIZE;
@@ -1252,6 +1253,55 @@
unlock_slurmctld(job_write_lock);
}
+static void _queue_reboot_msg(void)
+{
+ agent_arg_t *reboot_agent_args = NULL;
+ struct node_record *node_ptr;
+ char *host_str;
+ time_t now = time(NULL);
+ int i;
+ bool want_reboot;
+
+ want_nodes_reboot = false;
+ for (i = 0, node_ptr = node_record_table_ptr;
+ i < node_record_count; i++, node_ptr++) {
+ if (!IS_NODE_MAINT(node_ptr) || /* do it only if node */
+ is_node_in_maint_reservation(i)) /*isn't in reservation */
+ continue;
+ want_nodes_reboot = true; /* mark it for the next cycle */
+ if (IS_NODE_IDLE(node_ptr) && !IS_NODE_NO_RESPOND(node_ptr) &&
+ !IS_NODE_POWER_UP(node_ptr)) /* only active idle nodes */
+ want_reboot = true;
+ else if (IS_NODE_FUTURE(node_ptr) &&
+ (node_ptr->last_response == (time_t) 0))
+ want_reboot = true; /* system just restarted */
+ else
+ want_reboot = false;
+ if (!want_reboot)
+ continue;
+ if (reboot_agent_args == NULL) {
+ reboot_agent_args = xmalloc(sizeof(agent_arg_t));
+ reboot_agent_args->msg_type = REQUEST_REBOOT_NODES;
+ reboot_agent_args->retry = 0;
+ reboot_agent_args->hostlist = hostlist_create("");
+ }
+ hostlist_push(reboot_agent_args->hostlist, node_ptr->name);
+ reboot_agent_args->node_count++;
+ node_ptr->node_state = NODE_STATE_FUTURE |
+ (node_ptr->node_state & NODE_STATE_FLAGS);
+ node_ptr->last_response = now;
+ }
+ if (reboot_agent_args != NULL) {
+ hostlist_uniq(reboot_agent_args->hostlist);
+ host_str = hostlist_ranged_string_xmalloc(
+ reboot_agent_args->hostlist);
+ debug("Queuing reboot request for nodes %s", host_str);
+ xfree(host_str);
+ agent_queue_request(reboot_agent_args);
+ last_node_update = now;
+ }
+}
+
/*
* _slurmctld_background - process slurmctld background activities
* purge defunct job records, save state, schedule jobs, and
@@ -1274,6 +1324,7 @@
static time_t last_node_acct;
static time_t last_ctld_bu_ping;
static time_t last_uid_update;
+ static time_t last_reboot_msg_time;
static bool ping_msg_sent = false;
time_t now;
int no_resp_msg_interval, ping_interval, purge_job_interval;
@@ -1309,7 +1360,7 @@
last_purge_job_time = last_trigger = last_health_check_time = now;
last_timelimit_time = last_assert_primary_time = now;
last_no_resp_msg_time = last_resv_time = last_ctld_bu_ping = now;
- last_uid_update = now;
+ last_uid_update = last_reboot_msg_time = now;
if ((slurmctld_conf.min_job_age > 0) &&
(slurmctld_conf.min_job_age < PURGE_JOB_INTERVAL)) {
@@ -1440,6 +1491,14 @@
unlock_slurmctld(job_read_lock);
}
+ if (want_nodes_reboot && (now > last_reboot_msg_time)) {
+ now = time(NULL);
+ last_reboot_msg_time = now;
+ lock_slurmctld(node_write_lock);
+ _queue_reboot_msg();
+ unlock_slurmctld(node_write_lock);
+ }
+
/* Process any pending agent work */
agent_retry(RPC_RETRY_INTERVAL, true);
diff -ur slurm-2.2.6/src/slurmctld/node_mgr.c slurm-2.2.6.reboot/src/slurmctld/node_mgr.c
--- slurm-2.2.6/src/slurmctld/node_mgr.c 2011-05-27 21:25:06.000000000 +0300
+++ slurm-2.2.6.reboot/src/slurmctld/node_mgr.c 2011-08-17 17:36:17.000000000 +0300
@@ -69,6 +69,7 @@
#include "src/slurmctld/locks.h"
#include "src/slurmctld/ping_nodes.h"
#include "src/slurmctld/proc_req.h"
+#include "src/slurmctld/reservation.h"
#include "src/slurmctld/sched_plugin.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/state_save.h"
@@ -609,7 +610,8 @@
if (((show_flags & SHOW_ALL) == 0) && (uid != 0) &&
(_node_is_hidden(node_ptr)))
hidden = true;
- else if (IS_NODE_FUTURE(node_ptr))
+ else if (IS_NODE_FUTURE(node_ptr) &&
+ !IS_NODE_MAINT(node_ptr)) /* reboot req sent */
hidden = true;
else if ((node_ptr->name == NULL) ||
(node_ptr->name[0] == '\0'))
@@ -1629,10 +1631,14 @@
#endif
}
} else {
- if (IS_NODE_UNKNOWN(node_ptr)) {
+ if (IS_NODE_UNKNOWN(node_ptr) || IS_NODE_FUTURE(node_ptr)) {
reset_job_priority();
debug("validate_node_specs: node %s has registered",
reg_msg->node_name);
+ if (IS_NODE_FUTURE(node_ptr) &&
+ IS_NODE_MAINT(node_ptr) &&
+ !is_node_in_maint_reservation(node_inx))
+ node_flags &= (~NODE_STATE_MAINT);
if (reg_msg->job_count) {
node_ptr->node_state = NODE_STATE_ALLOCATED |
node_flags;
@@ -2073,6 +2079,8 @@
reset_job_priority();
node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
node_ptr->node_state &= (~NODE_STATE_POWER_UP);
+ if (!is_node_in_maint_reservation(node_inx))
+ node_ptr->node_state &= (~NODE_STATE_MAINT);
last_node_update = now;
}
node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
diff -ur slurm-2.2.6/src/slurmctld/proc_req.c slurm-2.2.6.reboot/src/slurmctld/proc_req.c
--- slurm-2.2.6/src/slurmctld/proc_req.c 2011-05-27 21:25:06.000000000 +0300
+++ slurm-2.2.6.reboot/src/slurmctld/proc_req.c 2011-08-15 19:55:59.000000000 +0300
@@ -124,6 +124,7 @@
inline static void _slurm_rpc_job_alloc_info(slurm_msg_t * msg);
inline static void _slurm_rpc_job_alloc_info_lite(slurm_msg_t * msg);
inline static void _slurm_rpc_ping(slurm_msg_t * msg);
+inline static void _slurm_rpc_reboot_nodes(slurm_msg_t * msg);
inline static void _slurm_rpc_reconfigure_controller(slurm_msg_t * msg);
inline static void _slurm_rpc_resv_create(slurm_msg_t * msg);
inline static void _slurm_rpc_resv_update(slurm_msg_t * msg);
@@ -398,6 +399,10 @@
_slurm_rpc_get_topo(msg);
/* No body to free */
break;
+ case REQUEST_REBOOT_NODES:
+ _slurm_rpc_reboot_nodes(msg);
+ /* No body to free */
+ break;
default:
error("invalid RPC msg_type=%d", msg->msg_type);
slurm_send_rc_msg(msg, EINVAL);
@@ -530,6 +535,7 @@
conf_ptr->propagate_rlimits_except = xstrdup(conf->
propagate_rlimits_except);
+ conf_ptr->reboot_program = xstrdup(conf->reboot_program);
conf_ptr->resume_program = xstrdup(conf->resume_program);
conf_ptr->resume_rate = conf->resume_rate;
conf_ptr->resume_timeout = conf->resume_timeout;
@@ -3809,6 +3815,42 @@
slurm_send_rc_msg(msg, rc);
}
+/* _slurm_rpc_reboot_nodes - process RPC to schedule nodes reboot */
+inline static void _slurm_rpc_reboot_nodes(slurm_msg_t * msg)
+{
+ struct node_record *node_ptr;
+ int i;
+ uid_t uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
+ /* Locks: write node lock */
+ slurmctld_lock_t node_write_lock = {
+ NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };
+ DEF_TIMERS;
+
+ START_TIMER;
+ debug2("Processing RPC: REQUEST_REBOOT_NODES from uid=%d", uid);
+ if (!validate_super_user(uid)) {
+ error("Security violation, REBOOT_NODES RPC from uid=%d", uid);
+ slurm_send_rc_msg(msg, EACCES);
+ return;
+ }
+
+ /* do RPC call */
+ lock_slurmctld(node_write_lock);
+ for (i = 0, node_ptr = node_record_table_ptr;
+ i < node_record_count; i++, node_ptr++) {
+ if (IS_NODE_MAINT(node_ptr)) /* already on maintenance */
+ continue;
+ if (IS_NODE_FUTURE(node_ptr) || IS_NODE_DOWN(node_ptr))
+ continue;
+ node_ptr->node_state |= NODE_STATE_MAINT;
+ want_nodes_reboot = true;
+ }
+ unlock_slurmctld(node_write_lock);
+
+ END_TIMER2("_slurm_rpc_reboot_nodes");
+ slurm_send_rc_msg(msg, SLURM_SUCCESS);
+}
+
inline static void _slurm_rpc_accounting_first_reg(slurm_msg_t *msg)
{
uid_t uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
diff -ur slurm-2.2.6/src/slurmctld/reservation.c slurm-2.2.6.reboot/src/slurmctld/reservation.c
--- slurm-2.2.6/src/slurmctld/reservation.c 2011-05-27 21:25:06.000000000 +0300
+++ slurm-2.2.6.reboot/src/slurmctld/reservation.c 2011-08-15 19:56:51.000000000 +0300
@@ -3125,6 +3125,32 @@
list_iterator_destroy(iter);
}
+/* checks if node within node_record_table_ptr is in maint reservation */
+extern bool is_node_in_maint_reservation(int nodenum)
+{
+ bool res = false;
+ ListIterator iter;
+ slurmctld_resv_t *resv_ptr;
+
+ if (nodenum < 0 || nodenum >= node_record_count || !resv_list)
+ return false;
+
+ iter = list_iterator_create(resv_list);
+ if (!iter)
+ fatal("malloc: list_iterator_create");
+ while ((resv_ptr = (slurmctld_resv_t *) list_next(iter))) {
+ if ((resv_ptr->flags & RESERVE_FLAG_MAINT) == 0)
+ continue;
+ if (bit_test(resv_ptr->node_bitmap, nodenum)) {
+ res = true;
+ break;
+ }
+ }
+ list_iterator_destroy(iter);
+
+ return res;
+}
+
extern void update_assocs_in_resvs(void)
{
slurmctld_resv_t *resv_ptr = NULL;
diff -ur slurm-2.2.6/src/slurmctld/reservation.h slurm-2.2.6.reboot/src/slurmctld/reservation.h
--- slurm-2.2.6/src/slurmctld/reservation.h 2011-05-27 21:25:06.000000000 +0300
+++ slurm-2.2.6.reboot/src/slurmctld/reservation.h 2011-07-19 14:00:26.000000000 +0300
@@ -73,6 +73,9 @@
/* Set or clear NODE_STATE_MAINT for node_state as needed */
extern void set_node_maint_mode(void);
+/* checks if node within node_record_table_ptr is in maint reservation */
+extern bool is_node_in_maint_reservation(int nodenum);
+
/* After an assocation has been added or removed update the lists. */
extern void update_assocs_in_resvs(void);
diff -ur slurm-2.2.6/src/slurmctld/slurmctld.h slurm-2.2.6.reboot/src/slurmctld/slurmctld.h
--- slurm-2.2.6/src/slurmctld/slurmctld.h 2011-05-27 21:25:06.000000000 +0300
+++ slurm-2.2.6.reboot/src/slurmctld/slurmctld.h 2011-08-15 17:39:20.000000000 +0300
@@ -183,6 +183,7 @@
\*****************************************************************************/
extern uint32_t total_cpus; /* count of CPUs in the entire cluster */
extern bool ping_nodes_now; /* if set, ping nodes immediately */
+extern bool want_nodes_reboot; /* if set, check for idle nodes */
/*****************************************************************************\
* NODE states and bitmaps
diff -ur slurm-2.2.6/src/slurmd/slurmd/req.c slurm-2.2.6.reboot/src/slurmd/slurmd/req.c
--- slurm-2.2.6/src/slurmd/slurmd/req.c 2011-05-27 21:25:06.000000000 +0300
+++ slurm-2.2.6.reboot/src/slurmd/slurmd/req.c 2011-08-17 14:04:32.000000000 +0300
@@ -148,6 +148,7 @@
static void _rpc_update_time(slurm_msg_t *);
static void _rpc_shutdown(slurm_msg_t *msg);
static void _rpc_reconfig(slurm_msg_t *msg);
+static void _rpc_reboot(slurm_msg_t *msg);
static void _rpc_pid2jid(slurm_msg_t *msg);
static int _rpc_file_bcast(slurm_msg_t *msg);
static int _rpc_ping(slurm_msg_t *);
@@ -307,6 +308,10 @@
last_slurmctld_msg = time(NULL);
/* No body to free */
break;
+ case REQUEST_REBOOT_NODES:
+ _rpc_reboot(msg);
+ /* No body to free */
+ break;
case REQUEST_NODE_REGISTRATION_STATUS:
/* Treat as ping (for slurmctld agent, just return SUCCESS) */
rc = _rpc_ping(msg);
@@ -1487,6 +1492,44 @@
/* Never return a message, slurmctld does not expect one */
}
+static void
+_rpc_reboot(slurm_msg_t *msg)
+{
+ char *reboot_program, *sp;
+ slurm_ctl_conf_t *cfg;
+ uid_t req_uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
+ int exit_code, rc = SLURM_ERROR;
+
+ if (!_slurm_authorized_user(req_uid))
+ error("Security violation, reboot RPC from uid %d",
+ req_uid);
+ else {
+ cfg = slurm_conf_lock();
+ reboot_program = cfg->reboot_program;
+ if (reboot_program) {
+ sp = strchr(reboot_program, ' ');
+ if (sp)
+ sp = xstrndup(reboot_program,
+ (sp - reboot_program));
+ else
+ sp = xstrdup(reboot_program);
+ if (access(sp, R_OK | X_OK) < 0)
+ error("Cannot run RebootProgram [%s]: %m", sp);
+ else if ((exit_code = system(reboot_program)))
+ error("system(%s) returned %d", reboot_program,
+ exit_code);
+ else
+ rc = SLURM_SUCCESS;
+ xfree(sp);
+ } else
+ error("RebootProgram isn't defined in config");
+ slurm_conf_unlock();
+ }
+
+ /* Never return a message, slurmctld does not expect one */
+ /* slurm_send_rc_msg(msg, rc); */
+}
+
static void _job_limits_free(void *x)
{
xfree(x);