From e9a1a4246c476c54f5f4d3281ba3f726cbdd8c82 Mon Sep 17 00:00:00 2001
From: Trey Dockendorf <treydock@tamu.edu>
Date: Tue, 19 Aug 2014 18:03:36 -0500
Subject: [PATCH] Steve's patch for allowing SUSPEND preemption with assumption that swap is available

---
 src/plugins/select/cons_res/select_cons_res.c |  218 +++++++++++++++++--------
 1 files changed, 150 insertions(+), 68 deletions(-)

diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c
index 82734b3..cc896d4 100644
--- a/src/plugins/select/cons_res/select_cons_res.c
+++ b/src/plugins/select/cons_res/select_cons_res.c
@@ -188,6 +188,7 @@ static bool select_state_initializing = true;
 static int select_node_cnt = 0;
 static int preempt_reorder_cnt = 1;
 static bool preempt_strict_order = false;
+static bool assume_swap = true;
 
 struct select_nodeinfo {
 	uint16_t magic;		/* magic number */
@@ -1186,6 +1187,11 @@ static int _rm_job_from_res(struct part_res_record *part_record_ptr,
 			} else {
 				node_usage[i].alloc_memory -=
 					job->memory_allocated[n];
+ 				debug3("_rm_job_from_res: job %d mem %d nodemem %d",
+				       job_ptr->job_id,
+				       job->memory_allocated[n],
+				       node_usage[i].alloc_memory);
+
 			}
 		}
 	}
@@ -1467,10 +1473,12 @@ static int _test_only(struct job_record *job_ptr, bitstr_t *bitmap,
 		}
 	}
 
+ 	debug3("_test_only: %d cr_job_test()",job_ptr->job_id);
 	rc = cr_job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes,
 			 SELECT_MODE_TEST_ONLY, tmp_cr_type, job_node_req,
 			 select_node_cnt, select_part_record,
 			 select_node_usage, NULL);
+ 	debug3("_test_only: %d cr_job_test() rc=%d",job_ptr->job_id,rc);
 	return rc;
 }
 
@@ -1498,20 +1506,17 @@ static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
 		    List preemptee_candidates, List *preemptee_job_list,
 		    bitstr_t *exc_core_bitmap)
 {
-	int rc;
+	int rc = SLURM_SUCCESS;
 	bitstr_t *orig_map = NULL, *save_bitmap;
 	struct job_record *tmp_job_ptr;
 	ListIterator job_iterator, preemptee_iterator;
-	struct part_res_record *future_part;
-	struct node_use_record *future_usage;
+	struct part_res_record *future_part=NULL;
+	struct node_use_record *future_usage=NULL;
 	bool remove_some_jobs = false;
 	uint16_t pass_count = 0;
 	uint16_t mode;
 	uint16_t tmp_cr_type = cr_type;
 
-	save_bitmap = bit_copy(bitmap);
-top:	orig_map = bit_copy(save_bitmap);
-
 	if (job_ptr->part_ptr->cr_type) {
 		if (((cr_type & CR_SOCKET) || (cr_type & CR_CORE)) &&
 		    (cr_type & CR_ALLOCATE_FULL_SOCKET)) {
@@ -1524,28 +1529,53 @@ top:	orig_map = bit_copy(save_bitmap);
 		}
 	}
 
+	/* call cr_job_test() early and get out if successful */
+	/*
 	rc = cr_job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes,
 			 SELECT_MODE_RUN_NOW, tmp_cr_type, job_node_req,
 			 select_node_cnt, select_part_record,
 			 select_node_usage, exc_core_bitmap);
+	debug3("_run_now: %d cr_job_test0()=%d\n",job_ptr->job_id, rc);
+	if (rc == SLURM_SUCCESS)
+	  return rc;
+	*/
+
+	save_bitmap = bit_copy(bitmap);
+top:	orig_map = bit_copy(save_bitmap);
+
+	/* Setup simulated future partition and node usage */
+	if (future_part == NULL) {
+	  future_part = _dup_part_data(select_part_record);
+	  if (future_part == NULL) {
+	    FREE_NULL_BITMAP(orig_map);
+	    FREE_NULL_BITMAP(save_bitmap);
+	    return SLURM_ERROR;
+	  }
+	}
+	if (future_usage == NULL) {
+	  future_usage = _dup_node_usage(select_node_usage);
+	  if (future_usage == NULL) {
+	    _destroy_part_data(future_part);
+	    FREE_NULL_BITMAP(orig_map);
+	    FREE_NULL_BITMAP(save_bitmap);
+	    return SLURM_ERROR;
+	  }
+	}
+	/*
+	 * The early test above is commented out, so this is the first test;
+	 * it runs against the simulated copies and again on each "goto top".
+	 */
+	rc = cr_job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes,
+			 SELECT_MODE_RUN_NOW, tmp_cr_type, job_node_req,
+			 select_node_cnt, future_part,
+			 future_usage, exc_core_bitmap);
+	debug3("_run_now: %d cr_job_test1()=%d\n",job_ptr->job_id, rc);
 
 	if ((rc != SLURM_SUCCESS) && preemptee_candidates) {
 		int preemptee_cand_cnt = list_count(preemptee_candidates);
+		debug3("_run_now: %u removing up to %d preemptable jobs",
+		       job_ptr->job_id, preemptee_cand_cnt);
 		/* Remove preemptable jobs from simulated environment */
-		future_part = _dup_part_data(select_part_record);
-		if (future_part == NULL) {
-			FREE_NULL_BITMAP(orig_map);
-			FREE_NULL_BITMAP(save_bitmap);
-			return SLURM_ERROR;
-		}
-		future_usage = _dup_node_usage(select_node_usage);
-		if (future_usage == NULL) {
-			_destroy_part_data(future_part);
-			FREE_NULL_BITMAP(orig_map);
-			FREE_NULL_BITMAP(save_bitmap);
-			return SLURM_ERROR;
-		}
-
 		job_iterator = list_iterator_create(preemptee_candidates);
 		while ((tmp_job_ptr = (struct job_record *)
 			list_next(job_iterator))) {
@@ -1553,28 +1583,46 @@ top:	orig_map = bit_copy(save_bitmap);
 			    !IS_JOB_SUSPENDED(tmp_job_ptr))
 				continue;
 			mode = slurm_job_preempt_mode(tmp_job_ptr);
+			debug3("_run_now: %d candidate %d mode %d",
+			       job_ptr->job_id,
+			       tmp_job_ptr->job_id,
+			       mode);
 			if ((mode != PREEMPT_MODE_REQUEUE)    &&
+			    (mode != PREEMPT_MODE_SUSPEND)    &&
 			    (mode != PREEMPT_MODE_CHECKPOINT) &&
 			    (mode != PREEMPT_MODE_CANCEL))
-				continue;	/* can't remove job */
+			  continue;	/* can't remove job */
+			if (mode == PREEMPT_MODE_SUSPEND && !assume_swap)
+			  continue;
 			/* Remove preemptable job now */
-			_rm_job_from_res(future_part, future_usage,
+			rc = _rm_job_from_res(future_part, future_usage,
 					 tmp_job_ptr, 0);
+			debug3("_run_now: %d _rm_job_from_res(%d)=%d",
+			       job_ptr->job_id, tmp_job_ptr->job_id, rc);
 			bit_or(bitmap, orig_map);
 			rc = cr_job_test(job_ptr, bitmap, min_nodes,
 					 max_nodes, req_nodes,
 					 SELECT_MODE_WILL_RUN,
+					 /* SELECT_MODE_RUN_NOW, */
 					 tmp_cr_type, job_node_req,
 					 select_node_cnt,
 					 future_part, future_usage,
 					 exc_core_bitmap);
-			tmp_job_ptr->details->usable_nodes = 0;
+			debug3("_run_now: %d cr_job_test2(%d)=%d",
+			       job_ptr->job_id, tmp_job_ptr->job_id, rc);
+			tmp_job_ptr->details->usable_nodes = 99998;
 			if (rc != SLURM_SUCCESS)
 				continue;
 
 			if ((pass_count++ > preempt_reorder_cnt) ||
-			    (preemptee_cand_cnt <= pass_count))
+			    (preemptee_cand_cnt <= pass_count)) {
+			  debug3("_run_now: %u break loop: pass_count=%d, preempt_reorder_cnt=%d, preemptee_cand_cnt=%d",
+				 job_ptr->job_id,
+				 pass_count,
+				 preempt_reorder_cnt,
+				 preemptee_cand_cnt);
 				break;
+			}
 
 			/* Reorder preemption candidates to minimize number
 			 * of preempted jobs and their priorities. */
@@ -1584,6 +1632,7 @@ top:	orig_map = bit_copy(save_bitmap);
 				 * jobs. */
 				tmp_job_ptr = (struct job_record *)
 					      list_remove(job_iterator);
+				debug3("_run_now: %u preempt_strict_order list_prepend(preemptee_candidates,%u)",job_ptr->job_id,tmp_job_ptr->job_id);
 				list_prepend(preemptee_candidates, tmp_job_ptr);
 			} else {
 				/* Set the last job's usable count to a large
@@ -1607,53 +1656,71 @@ top:	orig_map = bit_copy(save_bitmap);
 					list_next(job_iterator))) {
 					tmp_job_ptr->details->usable_nodes = 0;
 				}
+				debug3("_run_now: %u resorting list",
+				       job_ptr->job_id);
 				list_sort(preemptee_candidates,
 					  (ListCmpF)_sort_usable_nodes_dec);
 			}
 			FREE_NULL_BITMAP(orig_map);
 			list_iterator_destroy(job_iterator);
-			_destroy_part_data(future_part);
-			_destroy_node_data(future_usage, NULL);
+			debug3("_run_now: %d goto top", job_ptr->job_id);
 			goto top;
-		}
+		} /* while(tmp_job_ptr) */
 		list_iterator_destroy(job_iterator);
+	} /* if ((rc != SLURM_SUCCESS) && preemptee_candidates) */
 
-		if ((rc == SLURM_SUCCESS) && preemptee_job_list &&
-		    preemptee_candidates) {
-			/* Build list of preemptee jobs whose resources are
-			 * actually used */
-			if (*preemptee_job_list == NULL) {
-				*preemptee_job_list = list_create(NULL);
-			}
-			preemptee_iterator = list_iterator_create(
-				preemptee_candidates);
-			while ((tmp_job_ptr = (struct job_record *)
-				list_next(preemptee_iterator))) {
-				mode = slurm_job_preempt_mode(tmp_job_ptr);
-				if ((mode != PREEMPT_MODE_REQUEUE)    &&
-				    (mode != PREEMPT_MODE_CHECKPOINT) &&
-				    (mode != PREEMPT_MODE_CANCEL))
-					continue;
-				if (bit_overlap(bitmap,
-						tmp_job_ptr->node_bitmap) == 0)
-					continue;
-				list_append(*preemptee_job_list,
-					    tmp_job_ptr);
-				remove_some_jobs = true;
-			}
-			list_iterator_destroy(preemptee_iterator);
-			if (!remove_some_jobs) {
-				list_destroy(*preemptee_job_list);
-				*preemptee_job_list = NULL;
-			}
-		}
 
-		_destroy_part_data(future_part);
-		_destroy_node_data(future_usage, NULL);
-	}
+	if ((rc == SLURM_SUCCESS) && preemptee_job_list &&
+	    preemptee_candidates) {
+	  /* Build list of preemptee jobs whose resources are
+	   * actually used */
+	  debug3("_run_now: %u building preemptee list",
+		 job_ptr->job_id);
+	  if (*preemptee_job_list == NULL) {
+	    debug3("_run_now: %u creating new preemptee_job_list",
+		   job_ptr->job_id);
+	    *preemptee_job_list = list_create(NULL);
+	  }
+	  preemptee_iterator = list_iterator_create(
+						    preemptee_candidates);
+	  while ((tmp_job_ptr = (struct job_record *)
+		  list_next(preemptee_iterator))) {
+	    mode = slurm_job_preempt_mode(tmp_job_ptr);
+	    if ((mode != PREEMPT_MODE_REQUEUE)    &&
+		(mode != PREEMPT_MODE_SUSPEND)    &&
+		(mode != PREEMPT_MODE_CHECKPOINT) &&
+		(mode != PREEMPT_MODE_CANCEL))
+	      continue;
+	    if (mode == PREEMPT_MODE_SUSPEND &&
+		!assume_swap)
+	      continue;
+	    /*
+	    if (bit_overlap(bitmap,
+			    tmp_job_ptr->node_bitmap) == 0)
+	      continue;
+	    */
+	    if (tmp_job_ptr->details->usable_nodes != 99998)
+	      continue;
+	    debug3("_run_now: %u appending preemptee %u",
+		   job_ptr->job_id,
+		   tmp_job_ptr->job_id);
+	    list_append(*preemptee_job_list,
+			tmp_job_ptr);
+	    remove_some_jobs = true;
+	  }
+	  list_iterator_destroy(preemptee_iterator);
+	  if (!remove_some_jobs) {
+	    list_destroy(*preemptee_job_list);
+	    *preemptee_job_list = NULL;
+	  }
+	} /* if (rc==SLURM_SUCCESS && preemptee_candidates) */
+
+	_destroy_part_data(future_part);
+	_destroy_node_data(future_usage, NULL);
 	FREE_NULL_BITMAP(orig_map);
 	FREE_NULL_BITMAP(save_bitmap);
 
+	debug3("_run_now: %d returning %d", job_ptr->job_id, rc);
 	return rc;
 }
 
@@ -1695,6 +1762,7 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
 			 SELECT_MODE_WILL_RUN, tmp_cr_type, job_node_req,
 			 select_node_cnt, select_part_record,
 			 select_node_usage, exc_core_bitmap);
+	debug3("_will_run_test: %u cr_job_test1()=%d",job_ptr->job_id,rc);
 	if (rc == SLURM_SUCCESS) {
 		FREE_NULL_BITMAP(orig_map);
 		job_ptr->start_time = time(NULL);
@@ -1729,16 +1797,21 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
 			continue;
 		}
 		if (_is_preemptable(tmp_job_ptr, preemptee_candidates)) {
+		        int r;
 			uint16_t mode = slurm_job_preempt_mode(tmp_job_ptr);
 			if (mode == PREEMPT_MODE_OFF)
 				continue;
-			if (mode == PREEMPT_MODE_SUSPEND)
+			if (mode == PREEMPT_MODE_SUSPEND && !assume_swap)
 				action = 2;	/* remove cores, keep memory */
 			else
 				action = 0;	/* remove cores and memory */
 			/* Remove preemptable job now */
-			_rm_job_from_res(future_part, future_usage,
+			debug3("_will_run_test: %u _rm_job_from_res %u act %d",
+			       job_ptr->job_id,tmp_job_ptr->job_id, action);
+			r = _rm_job_from_res(future_part, future_usage,
 					 tmp_job_ptr, action);
+			debug3("_will_run_test: %u _rm_job_from_res %u rc=%d",
+			       job_ptr->job_id,tmp_job_ptr->job_id, r);
 		} else
 			list_append(cr_job_list, tmp_job_ptr);
 	}
@@ -1751,6 +1824,8 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
 				 req_nodes, SELECT_MODE_WILL_RUN, tmp_cr_type,
 				 job_node_req, select_node_cnt, future_part,
 				 future_usage, exc_core_bitmap);
+		debug3("_will_run_test: %u cr_job_test2()=%d",
+		       job_ptr->job_id,rc);
 		if (rc == SLURM_SUCCESS) {
 			/* Actual start time will actually be later than "now",
 			 * but return "now" for backfill scheduler to
@@ -1770,8 +1845,8 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
 			ovrlap = bit_overlap(bitmap, tmp_job_ptr->node_bitmap);
 			if (ovrlap == 0)	/* job has no usable nodes */
 				continue;	/* skip it */
-			debug2("cons_res: _will_run_test, job %u: overlap=%d",
-			       tmp_job_ptr->job_id, ovrlap);
+			debug2("cons_res: _will_run_test %u, job %u: overlap=%d",
+			       job_ptr->job_id, tmp_job_ptr->job_id, ovrlap);
 			_rm_job_from_res(future_part, future_usage,
 					 tmp_job_ptr, 0);
 			rc = cr_job_test(job_ptr, bitmap, min_nodes,
@@ -1780,6 +1855,8 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
 					 job_node_req, select_node_cnt,
 					 future_part, future_usage,
 					 exc_core_bitmap);
+			debug3("_will_run_test: %u cr_job_test3(%u)=%d",
+			       job_ptr->job_id, tmp_job_ptr->job_id, rc);
 			if (rc == SLURM_SUCCESS) {
 				if (tmp_job_ptr->end_time <= now)
 					job_ptr->start_time = now + 1;
@@ -1806,6 +1883,9 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
 			if (bit_overlap(bitmap,
 					tmp_job_ptr->node_bitmap) == 0)
 				continue;
+ 			debug3("_will_run_test: %u appending preemptee %u",
+			       job_ptr->job_id,
+			       tmp_job_ptr->job_id);
 			list_append(*preemptee_job_list, tmp_job_ptr);
 		}
 		list_iterator_destroy(preemptee_iterator);
@@ -1815,6 +1895,7 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
 	_destroy_part_data(future_part);
 	_destroy_node_data(future_usage, NULL);
 	FREE_NULL_BITMAP(orig_map);
+	debug3("_will_run_test: %u returning rc=%d", job_ptr->job_id, rc);
 	return rc;
 }
 
@@ -2038,7 +2119,7 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap,
 
 	xassert(bitmap);
 
-	debug2("select_p_job_test for job %u", job_ptr->job_id);
+	debug2("select_p_job_test for job %u mode %u", job_ptr->job_id, mode);
 	if (!debug_check) {
 		debug_check = true;
 		if (slurm_get_debug_flags() & DEBUG_FLAG_CPU_BIND)
@@ -2083,6 +2164,7 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap,
 	} else
 		fatal("select_p_job_test: Mode %d is invalid", mode);
 
+	debug3("select_p_job_test: %u rc = %d",job_ptr->job_id, rc);
 	if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
 		if (job_ptr->job_resrcs)
 			log_job_resources(job_ptr->job_id, job_ptr->job_resrcs);
@@ -2182,9 +2264,9 @@ extern int select_p_job_suspend(struct job_record *job_ptr, bool indf_susp)
 
 	if (!indf_susp)
 		return SLURM_SUCCESS;
-
+	debug3("select_p_job_suspend: %u", job_ptr->job_id);
 	return _rm_job_from_res(select_part_record, select_node_usage,
-				job_ptr, 2);
+				job_ptr, assume_swap ? 0 : 2);
 }
 
 /* See NOTE with select_p_job_suspend above */
@@ -2194,8 +2276,8 @@ extern int select_p_job_resume(struct job_record *job_ptr, bool indf_susp)
 
 	if (!indf_susp)
 		return SLURM_SUCCESS;
-
-	return _add_job_to_res(job_ptr, 2);
+	debug3("select_p_job_resume: %u", job_ptr->job_id);
+	return _add_job_to_res(job_ptr, assume_swap ? 0 : 2);
 }
 
 
-- 
1.7.1

