Hello there!

    We found that simple command such as 'srun hostname' executes at
random - from 1 second to tens.  After a lot of research the cause of
that strange behavior was found - process of power save runs in own
thread and gets not always in sync with scheduler.  Sure, if we had the
power management fully power off nodes that delay would be unnoticed
(and, probably it's why it was unnoticed before) but our power save
function just manipulate CPU frequency and such so delay 20 seconds looks
strange enough.  As soon the cause was found the source is patched and
patch is in attachment.

    Andriy.
diff -udpr slurm-2.2.3.plus2/src/slurmctld/node_mgr.c slurm-2.2.3.plus3/src/slurmctld/node_mgr.c
--- slurm-2.2.3.plus2/src/slurmctld/node_mgr.c	2011-03-30 17:35:58.000000000 +0300
+++ slurm-2.2.3.plus3/src/slurmctld/node_mgr.c	2011-05-16 15:35:10.000000000 +0300
@@ -779,7 +779,6 @@ int update_node ( update_node_msg_t * up
 		return ESLURM_INVALID_NODE_NAME;
 	}
 
-	last_node_update = now;
 	while ( (this_node_name = hostlist_shift (host_list)) ) {
 		int err_code = 0;
 		state_val = update_node_msg->node_state;
@@ -976,6 +975,7 @@ int update_node ( update_node_msg_t * up
 		free (this_node_name);
 	}
 	hostlist_destroy (host_list);
+	last_node_update = now;
 
 	if ((error_code == 0) && (update_node_msg->features)) {
 		error_code = _update_node_features(update_node_msg->node_names,
@@ -1355,7 +1355,6 @@ extern int drain_nodes ( char *nodes, ch
 		return ESLURM_INVALID_NODE_NAME;
 	}
 
-	last_node_update = time (NULL);
 	while ( (this_node_name = hostlist_shift (host_list)) ) {
 		node_ptr = find_node_record (this_node_name);
 		node_inx = node_ptr - node_record_table_ptr;
@@ -1395,6 +1394,7 @@ extern int drain_nodes ( char *nodes, ch
 
 		free (this_node_name);
 	}
+	last_node_update = time (NULL);
 
 	hostlist_destroy (host_list);
 	return error_code;
@@ -1589,10 +1589,10 @@ extern int validate_node_specs(slurm_nod
 	reg_msg->os = NULL;	/* Nothing left to free */
 
 	if (IS_NODE_NO_RESPOND(node_ptr)) {
-		last_node_update = time (NULL);
 		reset_job_priority();
 		node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
 		node_ptr->node_state &= (~NODE_STATE_POWER_UP);
+		last_node_update = time (NULL);
 	}
 	node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
 	if (error_code) {
@@ -1600,22 +1600,21 @@ extern int validate_node_specs(slurm_nod
 			error ("Setting node %s state to DOWN",
 				reg_msg->node_name);
 		}
-		last_node_update = time (NULL);
 		set_node_down(reg_msg->node_name, reason_down);
+		last_node_update = time (NULL);
 	} else if (reg_msg->status == ESLURMD_PROLOG_FAILED) {
 		if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
 #ifdef HAVE_BG
 			info("Prolog failure on node %s", reg_msg->node_name);
 #else
-			last_node_update = time (NULL);
 			error("Prolog failure on node %s, state to DOWN",
 				reg_msg->node_name);
 			set_node_down(reg_msg->node_name, "Prolog failed");
+			last_node_update = time (NULL);
 #endif
 		}
 	} else {
 		if (IS_NODE_UNKNOWN(node_ptr)) {
-			last_node_update = now;
 			reset_job_priority();
 			debug("validate_node_specs: node %s has registered",
 				reg_msg->node_name);
@@ -1627,6 +1626,7 @@ extern int validate_node_specs(slurm_nod
 					node_flags;
 				node_ptr->last_idle = now;
 			}
+			last_node_update = now;
 			if (!IS_NODE_DRAIN(node_ptr)
 			    && !IS_NODE_FAIL(node_ptr)) {
 				xfree(node_ptr->reason);
@@ -1641,7 +1641,6 @@ extern int validate_node_specs(slurm_nod
 			     (node_ptr->reason != NULL) &&
 			     (strncmp(node_ptr->reason, "Not responding", 14)
 					== 0)))) {
-			last_node_update = now;
 			if (reg_msg->job_count) {
 				node_ptr->node_state = NODE_STATE_ALLOCATED |
 					node_flags;
@@ -1654,6 +1653,7 @@ extern int validate_node_specs(slurm_nod
 			     reg_msg->node_name);
 			reset_job_priority();
 			trigger_node_up(node_ptr);
+			last_node_update = now;
 			if (!IS_NODE_DRAIN(node_ptr)
 			    && !IS_NODE_FAIL(node_ptr)) {
 				xfree(node_ptr->reason);
@@ -1665,7 +1665,6 @@ extern int validate_node_specs(slurm_nod
 		} else if (node_ptr->last_response
 			   && (node_ptr->boot_time > node_ptr->last_response)
 			   && (slurmctld_conf.ret2service != 2)) {
-			last_node_update = now;
 			if(!node_ptr->reason) {
 				node_ptr->reason_time = now;
 				node_ptr->reason_uid =
@@ -1677,20 +1676,20 @@ extern int validate_node_specs(slurm_nod
 			     reg_msg->node_name);
 			_make_node_down(node_ptr, last_node_update);
 			kill_running_job_by_node_name(reg_msg->node_name);
+			last_node_update = now;
 			reg_msg->job_count = 0;
 		} else if (IS_NODE_ALLOCATED(node_ptr) &&
 			   (reg_msg->job_count == 0)) {	/* job vanished */
-			last_node_update = now;
 			node_ptr->node_state = NODE_STATE_IDLE | node_flags;
 			node_ptr->last_idle = now;
+			last_node_update = now;
 		} else if (IS_NODE_COMPLETING(node_ptr) &&
 			   (reg_msg->job_count == 0)) {	/* job already done */
-			last_node_update = now;
 			node_ptr->node_state &= (~NODE_STATE_COMPLETING);
+			last_node_update = now;
 			bit_clear(cg_node_bitmap, node_inx);
 		} else if (IS_NODE_IDLE(node_ptr) &&
 			   (reg_msg->job_count != 0)) {
-			last_node_update = now;
 			if (node_ptr->run_job_cnt != 0) {
 				node_ptr->node_state = NODE_STATE_ALLOCATED |
 						       node_flags;
@@ -1707,6 +1706,7 @@ extern int validate_node_specs(slurm_nod
 				node_ptr->node_state |= NODE_STATE_COMPLETING;
 				bit_set(cg_node_bitmap, node_inx);
 			}
+			last_node_update = now;
 		}
 
 		select_g_update_node_config(node_inx);
@@ -1990,8 +1990,8 @@ extern int validate_nodes_via_front_end(
 	}
 
 	if (update_node_state) {
-		last_node_update = time (NULL);
 		reset_job_priority();
+		last_node_update = time (NULL);
 	}
 	return error_code;
 }
@@ -2054,20 +2054,20 @@ static void _node_did_resp(struct node_r
 	resp_state = node_ptr->node_state & NODE_STATE_NO_RESPOND;
 	if (resp_state) {
 		info("Node %s now responding", node_ptr->name);
-		last_node_update = now;
 		reset_job_priority();
 		node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
 		node_ptr->node_state &= (~NODE_STATE_POWER_UP);
+		last_node_update = now;
 	}
 	node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
 	if (IS_NODE_UNKNOWN(node_ptr)) {
-		last_node_update = now;
 		node_ptr->last_idle = now;
 		if (node_ptr->run_job_cnt) {
 			node_ptr->node_state = NODE_STATE_ALLOCATED |
 					       node_flags;
 		} else
 			node_ptr->node_state = NODE_STATE_IDLE | node_flags;
+		last_node_update = now;
 		if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
 			clusteracct_storage_g_node_up(acct_db_conn,
 						      node_ptr, now);
@@ -2077,12 +2077,12 @@ static void _node_did_resp(struct node_r
 	    (slurmctld_conf.ret2service == 1) &&
 	    (node_ptr->reason != NULL) &&
 	    (strncmp(node_ptr->reason, "Not responding", 14) == 0)) {
-		last_node_update = now;
 		node_ptr->last_idle = now;
 		node_ptr->node_state = NODE_STATE_IDLE | node_flags;
 		info("node_did_resp: node %s returned to service",
 			node_ptr->name);
 		trigger_node_up(node_ptr);
+		last_node_update = now;
 		if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
 			xfree(node_ptr->reason);
 			node_ptr->reason_time = 0;
@@ -2185,9 +2185,9 @@ static void _node_not_resp (struct node_
 			node_ptr->name);
 		return;
 	}
-	last_node_update = time (NULL);
 	bit_clear (avail_node_bitmap, i);
 	node_ptr->node_state |= NODE_STATE_NO_RESPOND;
+	last_node_update = time (NULL);
 	return;
 }
 #endif
@@ -2334,8 +2334,6 @@ extern void make_node_alloc(struct node_
 	int inx = node_ptr - node_record_table_ptr;
 	uint16_t node_flags;
 
-	last_node_update = time (NULL);
-
 	(node_ptr->run_job_cnt)++;
 	bit_clear(idle_node_bitmap, inx);
 	if (job_ptr->details && (job_ptr->details->shared == 0)) {
@@ -2348,6 +2346,8 @@ extern void make_node_alloc(struct node_
 	xfree(node_ptr->reason);
 	node_ptr->reason_time = 0;
 	node_ptr->reason_uid = NO_VAL;
+
+	last_node_update = time (NULL);
 }
 
 /* make_node_comp - flag specified node as completing a job
@@ -2363,7 +2363,6 @@ extern void make_node_comp(struct node_r
 	time_t now = time(NULL);
 
 	xassert(node_ptr);
-	last_node_update = now;
 	if (suspended) {
 		if (node_ptr->sus_job_cnt)
 			(node_ptr->sus_job_cnt)--;
@@ -2417,6 +2416,7 @@ extern void make_node_comp(struct node_r
 		node_ptr->node_state = NODE_STATE_IDLE | node_flags;
 		node_ptr->last_idle = now;
 	}
+	last_node_update = now;
 }
 
 /* _make_node_down - flag specified node as down */
@@ -2426,7 +2426,6 @@ static void _make_node_down(struct node_
 	uint16_t node_flags;
 
 	xassert(node_ptr);
-	last_node_update = time (NULL);
 	node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
 	node_flags &= (~NODE_STATE_COMPLETING);
 	node_ptr->node_state = NODE_STATE_DOWN | node_flags;
@@ -2437,6 +2436,7 @@ static void _make_node_down(struct node_
 	bit_clear (up_node_bitmap,    inx);
 	select_g_update_node_state(inx, node_ptr->node_state);
 	trigger_node_down(node_ptr);
+	last_node_update = time (NULL);
 	clusteracct_storage_g_node_down(acct_db_conn,
 					node_ptr, event_time, NULL,
 					node_ptr->reason_uid);
@@ -2513,7 +2513,6 @@ void make_node_idle(struct node_record *
 		}
 	}
 
-	last_node_update = now;
 	if (node_ptr->comp_job_cnt == 0) {
 		node_ptr->node_state &= (~NODE_STATE_COMPLETING);
 		bit_clear(cg_node_bitmap, inx);
@@ -2551,6 +2550,7 @@ void make_node_idle(struct node_record *
 		bit_set(up_node_bitmap, inx);
 		node_ptr->last_idle = now;
 	}
+	last_node_update = now;
 }
 
 extern int send_nodes_to_accounting(time_t event_time)
diff -udpr slurm-2.2.3.plus2/src/slurmctld/node_scheduler.c slurm-2.2.3.plus3/src/slurmctld/node_scheduler.c
--- slurm-2.2.3.plus2/src/slurmctld/node_scheduler.c	2011-01-26 01:29:10.000000000 +0200
+++ slurm-2.2.3.plus3/src/slurmctld/node_scheduler.c	2011-05-18 12:55:50.000000000 +0300
@@ -130,13 +130,13 @@ extern void allocate_nodes(struct job_re
 {
 	int i;
 
-	last_node_update = time(NULL);
-
 	for (i = 0; i < node_record_count; i++) {
 		if (bit_test(job_ptr->node_bitmap, i))
 			make_node_alloc(&node_record_table_ptr[i], job_ptr);
 	}
 
+	last_node_update = time(NULL);
+
 	license_job_get(job_ptr);
 	return;
 }
@@ -1991,10 +1991,10 @@ extern void re_kill_job(struct job_recor
 			if (node_ptr->comp_job_cnt)
 				(node_ptr->comp_job_cnt)--;
 			if ((--job_ptr->node_cnt) == 0) {
-				last_node_update = time(NULL);
 				job_ptr->job_state &= (~JOB_COMPLETING);
 				delete_step_records(job_ptr, 0);
 				slurm_sched_schedule();
+				last_node_update = time(NULL);
 			}
 			continue;
 		}
diff -udpr slurm-2.2.3.plus2/src/slurmctld/power_save.c slurm-2.2.3.plus3/src/slurmctld/power_save.c
--- slurm-2.2.3.plus2/src/slurmctld/power_save.c	2011-05-17 13:49:04.000000000 +0300
+++ slurm-2.2.3.plus3/src/slurmctld/power_save.c	2011-05-18 12:56:43.000000000 +0300
@@ -95,7 +95,7 @@ int   suspend_cnt,   resume_cnt;
 float suspend_cnt_f, resume_cnt_f;
 
 static void  _clear_power_config(void);
-static void  _do_power_work(void);
+static void  _do_power_work(time_t now);
 static void  _do_resume(char *host);
 static void  _do_suspend(char *host);
 static int   _init_power_config(void);
@@ -108,11 +108,11 @@ static void  _shutdown_power(void);
 static bool  _valid_prog(char *file_name);
 
 /* Perform any power change work to nodes */
-static void _do_power_work(void)
+static void _do_power_work(time_t now)
 {
 	static time_t last_log = 0, last_work_scan = 0;
 	int i, wake_cnt = 0, sleep_cnt = 0, susp_total = 0;
-	time_t now = time(NULL), delta_t;
+	time_t delta_t;
 	uint16_t susp_state;
 	bitstr_t *wake_node_bitmap = NULL, *sleep_node_bitmap = NULL;
 	struct node_record *node_ptr;
@@ -212,7 +212,8 @@ static void _do_power_work(void)
 			error("power_save: bitmap2nodename");
 		xfree(nodes);
 		FREE_NULL_BITMAP(sleep_node_bitmap);
-		last_node_update = now;
+		/* last_node_update could be changed already by another thread!
+		last_node_update = now; */
 	}
 
 	if (wake_node_bitmap) {
@@ -224,7 +225,8 @@ static void _do_power_work(void)
 			error("power_save: bitmap2nodename");
 		xfree(nodes);
 		FREE_NULL_BITMAP(wake_node_bitmap);
-		last_node_update = now;
+		/* last_node_update could be changed already by another thread!
+		last_node_update = now; */
 	}
 }
 
@@ -370,7 +372,8 @@ static int  _reap_procs(void)
 			if (rc != 0) {
 				error("power_save: program exit status of %d",
 				      rc);
-			}
+			} else
+				ping_nodes_now = true;
 		} else if (WIFSIGNALED(status)) {
 			error("power_save: program signalled: %s",
 			      strsignal(WTERMSIG(status)));
@@ -648,7 +651,7 @@ static void *_init_power_save(void *arg)
 		if ((last_node_update >= last_power_scan) ||
 		    (now >= (last_power_scan + 60))) {
 			lock_slurmctld(node_write_lock);
-			_do_power_work();
+			_do_power_work(now);
 			unlock_slurmctld(node_write_lock);
 			last_power_scan = now;
 		}

Reply via email to