At CCNI, we use backfill scheduling on all our systems. However, we have
found that users typically do not specify a time limit for their jobs, so
the scheduler assumes the maximum from QoS/user limits/partition
limits/etc. This really hurts backfilling, since the scheduler remains
ignorant of short jobs.

Attached is a small patch I wrote containing a job submit plugin and a
new error message. The plugin rejects a job submission when it is
missing a time limit and provides the user with a clear and distinct
error.

I've just re-tested and the patch applies and builds cleanly on the
slurm-2.5, slurm-2.6, and master branches.

Please let me know if you find this useful, run into problems, or have
suggestions/improvements. Thanks.

-- 
Daniel M. Weeks
Systems Programmer
Computational Center for Nanotechnology Innovations
Rensselaer Polytechnic Institute
Troy, NY 12180
518-276-4458
diff --git a/configure.ac b/configure.ac
index 609534b..beb14cb 100644
--- a/configure.ac
+++ b/configure.ac
@@ -501,6 +503,7 @@ AC_CONFIG_FILES([Makefile
		 src/plugins/job_submit/logging/Makefile
		 src/plugins/job_submit/lua/Makefile
		 src/plugins/job_submit/partition/Makefile
+		 src/plugins/job_submit/require_timelimit/Makefile
		 src/plugins/launch/Makefile
		 src/plugins/launch/aprun/Makefile
		 src/plugins/launch/poe/Makefile
diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h
index 7f8bb72..01267c3 100644
--- a/slurm/slurm_errno.h
+++ b/slurm/slurm_errno.h
@@ -257,7 +257,10 @@ enum {
 	ESLURM_JOBS_RUNNING_ON_ASSOC,
 	ESLURM_CLUSTER_DELETED,
 	ESLURM_ONE_CHANGE,
-	ESLURM_BAD_NAME
+	ESLURM_BAD_NAME,
+
+	/* require_timelimit custom errors */
+	ESLURM_MISSING_TIME_LIMIT       = 8000
 };
 
 /* look up an errno value */
diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c
index 24f5018..28834fd 100644
--- a/src/common/slurm_errno.c
+++ b/src/common/slurm_errno.c
@@ -391,7 +391,11 @@ static slurm_errtab_t slurm_errtab[] = {
 	{ ESLURM_ONE_CHANGE,
 	  "Can only change one at a time"                       },
 	{ ESLURM_BAD_NAME,
-	  "Unacceptable name given. (No '.' in name allowed)"   }
+	  "Unacceptable name given. (No '.' in name allowed)"   },
+
+	/* require_timelimit custom errors */
+	{ ESLURM_MISSING_TIME_LIMIT,
+	  "Missing time limit"                                  }
 };
 
 /*
diff --git a/src/plugins/job_submit/Makefile.am b/src/plugins/job_submit/Makefile.am
index e35d4fe..c0cc646 100644
--- a/src/plugins/job_submit/Makefile.am
+++ b/src/plugins/job_submit/Makefile.am
@@ -1,3 +1,3 @@
 # Makefile for job_submit plugins
 
-SUBDIRS = all_partitions cnode defaults logging lua partition
+SUBDIRS = all_partitions cnode defaults logging lua partition require_timelimit
diff --git a/src/plugins/job_submit/require_timelimit/Makefile.am b/src/plugins/job_submit/require_timelimit/Makefile.am
new file mode 100644
index 0000000..117103a
--- /dev/null
+++ b/src/plugins/job_submit/require_timelimit/Makefile.am
@@ -0,0 +1,13 @@
+# Makefile for job_submit/require_timelimit plugin
+
+AUTOMAKE_OPTIONS = foreign
+
+PLUGIN_FLAGS = -module -avoid-version --export-dynamic
+
+INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common
+
+pkglib_LTLIBRARIES = job_submit_require_timelimit.la
+
+# Job submit require_timelimit plugin.
+job_submit_require_timelimit_la_SOURCES = job_submit_require_timelimit.c
+job_submit_require_timelimit_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS)
diff --git a/src/plugins/job_submit/require_timelimit/job_submit_require_timelimit.c b/src/plugins/job_submit/require_timelimit/job_submit_require_timelimit.c
new file mode 100644
index 0000000..32367d7
--- /dev/null
+++ b/src/plugins/job_submit/require_timelimit/job_submit_require_timelimit.c
@@ -0,0 +1,34 @@
+#include <slurm/slurm.h>
+#include <slurm/slurm_errno.h>
+
+#include "src/slurmctld/slurmctld.h"
+
+const char plugin_name[]="Require time limit jobsubmit plugin";	/* human-readable description of this plugin */
+const char plugin_type[]="job_submit/require_timelimit";	/* same path component as the plugin's directory under src/plugins/ */
+const uint32_t plugin_version   = 100;	/* NOTE(review): presumably consumed by Slurm's plugin loader -- confirm */
+const uint32_t min_plug_version = 100;	/* NOTE(review): presumably the minimum compatible plugin version -- confirm */
+
+int job_submit(struct job_descriptor *job_desc, uint32_t submit_uid)
+{
+	/* Reject a submission whose time limit is unset or INFINITE. No job id exists yet (=NO_VAL), so the uid is logged. */
+
+	if (job_desc->time_limit == NO_VAL) {	/* time_limit left at NO_VAL: treated as "missing" */
+		info("Missing time limit for job by uid:%u", submit_uid);
+		return ESLURM_MISSING_TIME_LIMIT;
+	} else if (job_desc->time_limit == INFINITE) {	/* an INFINITE limit is refused as well */
+		info("Bad time limit for job by uid:%u", submit_uid);
+		return ESLURM_INVALID_TIME_LIMIT;
+	}
+
+	return SLURM_SUCCESS;	/* any other time_limit value is accepted by this plugin */
+}
+
+int job_modify(struct job_descriptor *job_desc, struct job_record *job_ptr, uint32_t submit_uid)
+{	/* Refuse a modification that sets the time limit to INFINITE; job_ptr and submit_uid are not used here. */
+	if (job_desc->time_limit == INFINITE) {
+		info("Bad replacement time limit for %u", job_desc->job_id);	/* logs the job id taken from the request */
+		return ESLURM_INVALID_TIME_LIMIT;
+	}
+
+	return SLURM_SUCCESS;	/* every other value, including NO_VAL, passes through */
+}

Reply via email to