Hello there!
We found that simple command such as 'srun hostname' executes at
random - from 1 second to tens. After a lot of research the cause of
that strange behavior was found - process of power save runs in own
thread and gets not always in sync with scheduler. Sure, if we had the
power management fully power off nodes that delay would be unnoticed
(and, probably it's why it was unnoticed before) but our power save
function just manipulate CPU frequency and such so delay 20 seconds looks
strange enough. As soon the cause was found the source is patched and
patch is in attachment.
Andriy.diff -udpr slurm-2.2.3.plus2/src/slurmctld/node_mgr.c slurm-2.2.3.plus3/src/slurmctld/node_mgr.c
--- slurm-2.2.3.plus2/src/slurmctld/node_mgr.c 2011-03-30 17:35:58.000000000 +0300
+++ slurm-2.2.3.plus3/src/slurmctld/node_mgr.c 2011-05-16 15:35:10.000000000 +0300
@@ -779,7 +779,6 @@ int update_node ( update_node_msg_t * up
return ESLURM_INVALID_NODE_NAME;
}
- last_node_update = now;
while ( (this_node_name = hostlist_shift (host_list)) ) {
int err_code = 0;
state_val = update_node_msg->node_state;
@@ -976,6 +975,7 @@ int update_node ( update_node_msg_t * up
free (this_node_name);
}
hostlist_destroy (host_list);
+ last_node_update = now;
if ((error_code == 0) && (update_node_msg->features)) {
error_code = _update_node_features(update_node_msg->node_names,
@@ -1355,7 +1355,6 @@ extern int drain_nodes ( char *nodes, ch
return ESLURM_INVALID_NODE_NAME;
}
- last_node_update = time (NULL);
while ( (this_node_name = hostlist_shift (host_list)) ) {
node_ptr = find_node_record (this_node_name);
node_inx = node_ptr - node_record_table_ptr;
@@ -1395,6 +1394,7 @@ extern int drain_nodes ( char *nodes, ch
free (this_node_name);
}
+ last_node_update = time (NULL);
hostlist_destroy (host_list);
return error_code;
@@ -1589,10 +1589,10 @@ extern int validate_node_specs(slurm_nod
reg_msg->os = NULL; /* Nothing left to free */
if (IS_NODE_NO_RESPOND(node_ptr)) {
- last_node_update = time (NULL);
reset_job_priority();
node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
node_ptr->node_state &= (~NODE_STATE_POWER_UP);
+ last_node_update = time (NULL);
}
node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
if (error_code) {
@@ -1600,22 +1600,21 @@ extern int validate_node_specs(slurm_nod
error ("Setting node %s state to DOWN",
reg_msg->node_name);
}
- last_node_update = time (NULL);
set_node_down(reg_msg->node_name, reason_down);
+ last_node_update = time (NULL);
} else if (reg_msg->status == ESLURMD_PROLOG_FAILED) {
if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
#ifdef HAVE_BG
info("Prolog failure on node %s", reg_msg->node_name);
#else
- last_node_update = time (NULL);
error("Prolog failure on node %s, state to DOWN",
reg_msg->node_name);
set_node_down(reg_msg->node_name, "Prolog failed");
+ last_node_update = time (NULL);
#endif
}
} else {
if (IS_NODE_UNKNOWN(node_ptr)) {
- last_node_update = now;
reset_job_priority();
debug("validate_node_specs: node %s has registered",
reg_msg->node_name);
@@ -1627,6 +1626,7 @@ extern int validate_node_specs(slurm_nod
node_flags;
node_ptr->last_idle = now;
}
+ last_node_update = now;
if (!IS_NODE_DRAIN(node_ptr)
&& !IS_NODE_FAIL(node_ptr)) {
xfree(node_ptr->reason);
@@ -1641,7 +1641,6 @@ extern int validate_node_specs(slurm_nod
(node_ptr->reason != NULL) &&
(strncmp(node_ptr->reason, "Not responding", 14)
== 0)))) {
- last_node_update = now;
if (reg_msg->job_count) {
node_ptr->node_state = NODE_STATE_ALLOCATED |
node_flags;
@@ -1654,6 +1653,7 @@ extern int validate_node_specs(slurm_nod
reg_msg->node_name);
reset_job_priority();
trigger_node_up(node_ptr);
+ last_node_update = now;
if (!IS_NODE_DRAIN(node_ptr)
&& !IS_NODE_FAIL(node_ptr)) {
xfree(node_ptr->reason);
@@ -1665,7 +1665,6 @@ extern int validate_node_specs(slurm_nod
} else if (node_ptr->last_response
&& (node_ptr->boot_time > node_ptr->last_response)
&& (slurmctld_conf.ret2service != 2)) {
- last_node_update = now;
if(!node_ptr->reason) {
node_ptr->reason_time = now;
node_ptr->reason_uid =
@@ -1677,20 +1676,20 @@ extern int validate_node_specs(slurm_nod
reg_msg->node_name);
_make_node_down(node_ptr, last_node_update);
kill_running_job_by_node_name(reg_msg->node_name);
+ last_node_update = now;
reg_msg->job_count = 0;
} else if (IS_NODE_ALLOCATED(node_ptr) &&
(reg_msg->job_count == 0)) { /* job vanished */
- last_node_update = now;
node_ptr->node_state = NODE_STATE_IDLE | node_flags;
node_ptr->last_idle = now;
+ last_node_update = now;
} else if (IS_NODE_COMPLETING(node_ptr) &&
(reg_msg->job_count == 0)) { /* job already done */
- last_node_update = now;
node_ptr->node_state &= (~NODE_STATE_COMPLETING);
+ last_node_update = now;
bit_clear(cg_node_bitmap, node_inx);
} else if (IS_NODE_IDLE(node_ptr) &&
(reg_msg->job_count != 0)) {
- last_node_update = now;
if (node_ptr->run_job_cnt != 0) {
node_ptr->node_state = NODE_STATE_ALLOCATED |
node_flags;
@@ -1707,6 +1706,7 @@ extern int validate_node_specs(slurm_nod
node_ptr->node_state |= NODE_STATE_COMPLETING;
bit_set(cg_node_bitmap, node_inx);
}
+ last_node_update = now;
}
select_g_update_node_config(node_inx);
@@ -1990,8 +1990,8 @@ extern int validate_nodes_via_front_end(
}
if (update_node_state) {
- last_node_update = time (NULL);
reset_job_priority();
+ last_node_update = time (NULL);
}
return error_code;
}
@@ -2054,20 +2054,20 @@ static void _node_did_resp(struct node_r
resp_state = node_ptr->node_state & NODE_STATE_NO_RESPOND;
if (resp_state) {
info("Node %s now responding", node_ptr->name);
- last_node_update = now;
reset_job_priority();
node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
node_ptr->node_state &= (~NODE_STATE_POWER_UP);
+ last_node_update = now;
}
node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
if (IS_NODE_UNKNOWN(node_ptr)) {
- last_node_update = now;
node_ptr->last_idle = now;
if (node_ptr->run_job_cnt) {
node_ptr->node_state = NODE_STATE_ALLOCATED |
node_flags;
} else
node_ptr->node_state = NODE_STATE_IDLE | node_flags;
+ last_node_update = now;
if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
clusteracct_storage_g_node_up(acct_db_conn,
node_ptr, now);
@@ -2077,12 +2077,12 @@ static void _node_did_resp(struct node_r
(slurmctld_conf.ret2service == 1) &&
(node_ptr->reason != NULL) &&
(strncmp(node_ptr->reason, "Not responding", 14) == 0)) {
- last_node_update = now;
node_ptr->last_idle = now;
node_ptr->node_state = NODE_STATE_IDLE | node_flags;
info("node_did_resp: node %s returned to service",
node_ptr->name);
trigger_node_up(node_ptr);
+ last_node_update = now;
if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
xfree(node_ptr->reason);
node_ptr->reason_time = 0;
@@ -2185,9 +2185,9 @@ static void _node_not_resp (struct node_
node_ptr->name);
return;
}
- last_node_update = time (NULL);
bit_clear (avail_node_bitmap, i);
node_ptr->node_state |= NODE_STATE_NO_RESPOND;
+ last_node_update = time (NULL);
return;
}
#endif
@@ -2334,8 +2334,6 @@ extern void make_node_alloc(struct node_
int inx = node_ptr - node_record_table_ptr;
uint16_t node_flags;
- last_node_update = time (NULL);
-
(node_ptr->run_job_cnt)++;
bit_clear(idle_node_bitmap, inx);
if (job_ptr->details && (job_ptr->details->shared == 0)) {
@@ -2348,6 +2346,8 @@ extern void make_node_alloc(struct node_
xfree(node_ptr->reason);
node_ptr->reason_time = 0;
node_ptr->reason_uid = NO_VAL;
+
+ last_node_update = time (NULL);
}
/* make_node_comp - flag specified node as completing a job
@@ -2363,7 +2363,6 @@ extern void make_node_comp(struct node_r
time_t now = time(NULL);
xassert(node_ptr);
- last_node_update = now;
if (suspended) {
if (node_ptr->sus_job_cnt)
(node_ptr->sus_job_cnt)--;
@@ -2417,6 +2416,7 @@ extern void make_node_comp(struct node_r
node_ptr->node_state = NODE_STATE_IDLE | node_flags;
node_ptr->last_idle = now;
}
+ last_node_update = now;
}
/* _make_node_down - flag specified node as down */
@@ -2426,7 +2426,6 @@ static void _make_node_down(struct node_
uint16_t node_flags;
xassert(node_ptr);
- last_node_update = time (NULL);
node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
node_flags &= (~NODE_STATE_COMPLETING);
node_ptr->node_state = NODE_STATE_DOWN | node_flags;
@@ -2437,6 +2436,7 @@ static void _make_node_down(struct node_
bit_clear (up_node_bitmap, inx);
select_g_update_node_state(inx, node_ptr->node_state);
trigger_node_down(node_ptr);
+ last_node_update = time (NULL);
clusteracct_storage_g_node_down(acct_db_conn,
node_ptr, event_time, NULL,
node_ptr->reason_uid);
@@ -2513,7 +2513,6 @@ void make_node_idle(struct node_record *
}
}
- last_node_update = now;
if (node_ptr->comp_job_cnt == 0) {
node_ptr->node_state &= (~NODE_STATE_COMPLETING);
bit_clear(cg_node_bitmap, inx);
@@ -2551,6 +2550,7 @@ void make_node_idle(struct node_record *
bit_set(up_node_bitmap, inx);
node_ptr->last_idle = now;
}
+ last_node_update = now;
}
extern int send_nodes_to_accounting(time_t event_time)
diff -udpr slurm-2.2.3.plus2/src/slurmctld/node_scheduler.c slurm-2.2.3.plus3/src/slurmctld/node_scheduler.c
--- slurm-2.2.3.plus2/src/slurmctld/node_scheduler.c 2011-01-26 01:29:10.000000000 +0200
+++ slurm-2.2.3.plus3/src/slurmctld/node_scheduler.c 2011-05-18 12:55:50.000000000 +0300
@@ -130,13 +130,13 @@ extern void allocate_nodes(struct job_re
{
int i;
- last_node_update = time(NULL);
-
for (i = 0; i < node_record_count; i++) {
if (bit_test(job_ptr->node_bitmap, i))
make_node_alloc(&node_record_table_ptr[i], job_ptr);
}
+ last_node_update = time(NULL);
+
license_job_get(job_ptr);
return;
}
@@ -1991,10 +1991,10 @@ extern void re_kill_job(struct job_recor
if (node_ptr->comp_job_cnt)
(node_ptr->comp_job_cnt)--;
if ((--job_ptr->node_cnt) == 0) {
- last_node_update = time(NULL);
job_ptr->job_state &= (~JOB_COMPLETING);
delete_step_records(job_ptr, 0);
slurm_sched_schedule();
+ last_node_update = time(NULL);
}
continue;
}
diff -udpr slurm-2.2.3.plus2/src/slurmctld/power_save.c slurm-2.2.3.plus3/src/slurmctld/power_save.c
--- slurm-2.2.3.plus2/src/slurmctld/power_save.c 2011-05-17 13:49:04.000000000 +0300
+++ slurm-2.2.3.plus3/src/slurmctld/power_save.c 2011-05-18 12:56:43.000000000 +0300
@@ -95,7 +95,7 @@ int suspend_cnt, resume_cnt;
float suspend_cnt_f, resume_cnt_f;
static void _clear_power_config(void);
-static void _do_power_work(void);
+static void _do_power_work(time_t now);
static void _do_resume(char *host);
static void _do_suspend(char *host);
static int _init_power_config(void);
@@ -108,11 +108,11 @@ static void _shutdown_power(void);
static bool _valid_prog(char *file_name);
/* Perform any power change work to nodes */
-static void _do_power_work(void)
+static void _do_power_work(time_t now)
{
static time_t last_log = 0, last_work_scan = 0;
int i, wake_cnt = 0, sleep_cnt = 0, susp_total = 0;
- time_t now = time(NULL), delta_t;
+ time_t delta_t;
uint16_t susp_state;
bitstr_t *wake_node_bitmap = NULL, *sleep_node_bitmap = NULL;
struct node_record *node_ptr;
@@ -212,7 +212,8 @@ static void _do_power_work(void)
error("power_save: bitmap2nodename");
xfree(nodes);
FREE_NULL_BITMAP(sleep_node_bitmap);
- last_node_update = now;
+ /* last_node_update could be changed already by another thread!
+ last_node_update = now; */
}
if (wake_node_bitmap) {
@@ -224,7 +225,8 @@ static void _do_power_work(void)
error("power_save: bitmap2nodename");
xfree(nodes);
FREE_NULL_BITMAP(wake_node_bitmap);
- last_node_update = now;
+ /* last_node_update could be changed already by another thread!
+ last_node_update = now; */
}
}
@@ -370,7 +372,8 @@ static int _reap_procs(void)
if (rc != 0) {
error("power_save: program exit status of %d",
rc);
- }
+ } else
+ ping_nodes_now = true;
} else if (WIFSIGNALED(status)) {
error("power_save: program signalled: %s",
strsignal(WTERMSIG(status)));
@@ -648,7 +651,7 @@ static void *_init_power_save(void *arg)
if ((last_node_update >= last_power_scan) ||
(now >= (last_power_scan + 60))) {
lock_slurmctld(node_write_lock);
- _do_power_work();
+ _do_power_work(now);
unlock_slurmctld(node_write_lock);
last_power_scan = now;
}