At CCNI, we use backfill scheduling on all our systems. However, we have
found that users typically do not specify a time limit for their jobs, so
the scheduler assumes the maximum allowed by the QoS, user, partition,
or other limits. This really hurts backfilling since the scheduler remains
ignorant of short jobs.
Attached is a small patch I wrote containing a job submit plugin and a
new error message. The plugin rejects a job submission when it is
missing a time limit and provides the user with a clear and distinct
error. It also rejects a time limit of INFINITE, both at submission and
when an existing job is modified.
I've just re-tested and the patch applies and builds cleanly on the
slurm-2.5, slurm-2.6, and master branches.
Please let me know if you find this useful, run across problems, or have
suggestions/improvements. Thanks.
--
Daniel M. Weeks
Systems Programmer
Computational Center for Nanotechnology Innovations
Rensselaer Polytechnic Institute
Troy, NY 12180
518-276-4458
diff --git a/configure.ac b/configure.ac
index 609534b..beb14cb 100644
--- a/configure.ac
+++ b/configure.ac
@@ -501,6 +503,7 @@ AC_CONFIG_FILES([Makefile
src/plugins/job_submit/logging/Makefile
src/plugins/job_submit/lua/Makefile
src/plugins/job_submit/partition/Makefile
+ src/plugins/job_submit/require_timelimit/Makefile
src/plugins/launch/Makefile
src/plugins/launch/aprun/Makefile
src/plugins/launch/poe/Makefile
diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h
index 7f8bb72..01267c3 100644
--- a/slurm/slurm_errno.h
+++ b/slurm/slurm_errno.h
@@ -257,7 +257,10 @@ enum {
ESLURM_JOBS_RUNNING_ON_ASSOC,
ESLURM_CLUSTER_DELETED,
ESLURM_ONE_CHANGE,
- ESLURM_BAD_NAME
+ ESLURM_BAD_NAME,
+
+ /* require_timelimit custom errors */
+ ESLURM_MISSING_TIME_LIMIT = 8000
};
/* look up an errno value */
diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c
index 24f5018..28834fd 100644
--- a/src/common/slurm_errno.c
+++ b/src/common/slurm_errno.c
@@ -391,7 +391,11 @@ static slurm_errtab_t slurm_errtab[] = {
{ ESLURM_ONE_CHANGE,
"Can only change one at a time" },
{ ESLURM_BAD_NAME,
- "Unacceptable name given. (No '.' in name allowed)" }
+ "Unacceptable name given. (No '.' in name allowed)" },
+
+ /* require_timelimit custom errors */
+ { ESLURM_MISSING_TIME_LIMIT,
+ "Missing time limit" }
};
/*
diff --git a/src/plugins/job_submit/Makefile.am b/src/plugins/job_submit/Makefile.am
index e35d4fe..c0cc646 100644
--- a/src/plugins/job_submit/Makefile.am
+++ b/src/plugins/job_submit/Makefile.am
@@ -1,3 +1,3 @@
# Makefile for job_submit plugins
-SUBDIRS = all_partitions cnode defaults logging lua partition
+SUBDIRS = all_partitions cnode defaults logging lua partition require_timelimit
diff --git a/src/plugins/job_submit/require_timelimit/Makefile.am b/src/plugins/job_submit/require_timelimit/Makefile.am
new file mode 100644
index 0000000..117103a
--- /dev/null
+++ b/src/plugins/job_submit/require_timelimit/Makefile.am
@@ -0,0 +1,13 @@
+# Makefile for job_submit/require_timelimit plugin
+
+AUTOMAKE_OPTIONS = foreign
+
+PLUGIN_FLAGS = -module -avoid-version --export-dynamic
+
+INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common
+
+pkglib_LTLIBRARIES = job_submit_require_timelimit.la
+
+# Job submit require_timelimit plugin.
+job_submit_require_timelimit_la_SOURCES = job_submit_require_timelimit.c
+job_submit_require_timelimit_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS)
diff --git a/src/plugins/job_submit/require_timelimit/job_submit_require_timelimit.c b/src/plugins/job_submit/require_timelimit/job_submit_require_timelimit.c
new file mode 100644
index 0000000..32367d7
--- /dev/null
+++ b/src/plugins/job_submit/require_timelimit/job_submit_require_timelimit.c
@@ -0,0 +1,34 @@
+#include <slurm/slurm.h>
+#include <slurm/slurm_errno.h>
+
+#include "src/slurmctld/slurmctld.h"
+
+const char plugin_name[]="Require time limit jobsubmit plugin"; /* descriptive name of this plugin */
+const char plugin_type[]="job_submit/require_timelimit"; /* mirrors the src/plugins/job_submit/require_timelimit directory */
+const uint32_t plugin_version = 100; /* NOTE(review): presumably read by the Slurm plugin loader -- confirm */
+const uint32_t min_plug_version = 100; /* NOTE(review): presumably the minimum plugin API version accepted -- confirm */
+
+int job_submit(struct job_descriptor *job_desc, uint32_t submit_uid)
+{
+	/* Refuse a submission whose time limit is unset (NO_VAL) or INFINITE. */
+	/* NOTE: no job id actually exists yet (=NO_VAL); messages name the uid. */
+	if (job_desc->time_limit == NO_VAL) {
+		info("Missing time limit for job by uid:%u", submit_uid);
+		return ESLURM_MISSING_TIME_LIMIT;
+	}
+	if (job_desc->time_limit == INFINITE) {
+		info("Bad time limit for job by uid:%u", submit_uid);
+		return ESLURM_INVALID_TIME_LIMIT;
+	}
+	return SLURM_SUCCESS;
+}
+
+int job_modify(struct job_descriptor *job_desc, struct job_record *job_ptr, uint32_t submit_uid)
+{
+	/* Only an INFINITE replacement limit is refused; anything else passes. */
+	if (job_desc->time_limit != INFINITE) {
+		return SLURM_SUCCESS;
+	}
+	info("Bad replacement time limit for %u", job_desc->job_id);
+	return ESLURM_INVALID_TIME_LIMIT;
+}