Hi!
We have a problem with backfill. Jobs are not backfilled because backfill does not get through the complete backlog of jobs in the queue before it is interrupted and starts all over again. We sometimes have lots of jobs in the queue, of various sizes and from various users, and even with idle nodes short jobs will not start because of this.
I have made a patch for backfill that adds a configuration option (bf_continue) to let backfill resume from the JobID at which the previous cycle stopped.
This will make backfill look at the whole queue eventually. Best regards, Magnus -- Magnus Jonsson, Developer, HPC2N, Umeå Universitet
diff -r -u a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c
--- a/src/plugins/sched/backfill/backfill.c 2013-02-05 23:59:05.000000000 +0100
+++ b/src/plugins/sched/backfill/backfill.c 2013-03-01 10:31:24.000000000 +0100
@@ -125,6 +125,7 @@
static int backfill_window = BACKFILL_WINDOW;
static int max_backfill_job_cnt = 50;
static int max_backfill_job_per_user = 0;
+static bool backfill_continue = false;
/*********************** local functions *********************/
static void _add_reservation(uint32_t start_time, uint32_t end_reserve,
@@ -410,6 +411,18 @@
max_backfill_job_per_user);
}
+ /* bf_continue=1 makes backfill continue where it was if interrupted;
+ * bf_continue=0 disables this. Any other value is a fatal error. */
+ if (sched_params && (strstr(sched_params, "bf_continue="))) {
+ if (strstr(sched_params, "bf_continue=1")) {
+ backfill_continue = true;
+ } else if (strstr(sched_params, "bf_continue=0")) {
+ backfill_continue = false;
+ } else {
+ fatal("Invalid bf_continue (use only 0 or 1)");
+ }
+ }
+
xfree(sched_params);
}
@@ -530,6 +543,8 @@
uint32_t *uid = NULL, nuser = 0;
uint16_t *njobs = NULL;
bool already_counted;
+ static uint32_t last_job_id=0;
+ bool last_job_id_found = false;
#ifdef HAVE_CRAY
/*
@@ -597,12 +612,33 @@
uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t));
njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t));
}
+ /*
+ * Reset last_job_id if not using bf_continue
+ */
+ if (!backfill_continue) {
+ last_job_id = 0;
+ }
+ if (last_job_id == 0) {
+ last_job_id_found = true;
+ }
while ((job_queue_rec = (job_queue_rec_t *)
list_pop_bottom(job_queue, sort_job_queue2))) {
job_test_count++;
job_ptr = job_queue_rec->job_ptr;
part_ptr = job_queue_rec->part_ptr;
xfree(job_queue_rec);
+
+ /*
+ * Skip every job up to and including the one saved in last_job_id
+ */
+ if (backfill_continue && !last_job_id_found) {
+ if (last_job_id == job_ptr->job_id) {
+ last_job_id_found = true;
+ last_job_id = 0;
+ }
+ continue;
+ }
+
if (!IS_JOB_PENDING(job_ptr))
continue; /* started in other partition */
job_ptr->part_ptr = part_ptr;
@@ -783,6 +819,10 @@
"breaking out after testing %d "
"jobs", job_test_count);
}
+ /*
+ * Save last JobID for next turn
+ */
+ last_job_id = job_ptr->job_id;
rc = 1;
break;
}
@@ -865,6 +905,10 @@
if (node_space_recs >= max_backfill_job_cnt) {
/* Already have too many jobs to deal with */
+ /*
+ * Save last JobID for next turn
+ */
+ last_job_id = job_ptr->job_id;
break;
}
@@ -890,6 +934,15 @@
if (debug_flags & DEBUG_FLAG_BACKFILL)
_dump_node_space_table(node_space);
}
+
+ /*
+ * Reset last_job_id if the end of the queue was reached
+ * without finding the job saved by the previous cycle
+ */
+ if (!last_job_id_found) {
+ debug("backfill: last_job_id=%d (reached end of queue without finding old job)",last_job_id);
+ last_job_id = 0;
+ }
xfree(uid);
xfree(njobs);
FREE_NULL_BITMAP(avail_bitmap);
smime.p7s
Description: S/MIME Cryptographic Signature
