Hi all,
I noticed something odd when I was testing some resource limit stuff.
- AccountingStorageEnforce set to 'associations', or turned off completely
(previously it had been 'limits')
- some associations previously had GrpCPUMins limits set, from earlier testing
- jobs were still being killed when their usage went over the limit, e.g. with
this message:
"Job 759 timed out, assoc 452 is at or exceeds group max cpu minutes limit
5940 with 5956 for account testing"
Looking at src/slurmctld/job_mgr.c it seems that the job_time_limit() function
wasn't actually checking to see if limits were being enforced before killing the
job.
Attached is a patch which checks to see if limits or qos are enforced before
killing the job. I've tested it with 2.4.3 and it does what I expect - haven't
tried 2.4.4, but the job_time_limit() logic seems to be the same.
Thanks,
Paddy
--
Paddy Doyle
Trinity Centre for High Performance Computing,
Lloyd Building, Trinity College Dublin, Dublin 2, Ireland.
Phone: +353-1-896-3725
http://www.tchpc.tcd.ie/
diff -ru slurm-2.4.3-orig/src/slurmctld/job_mgr.c slurm-2.4.3/src/slurmctld/job_mgr.c
--- slurm-2.4.3-orig/src/slurmctld/job_mgr.c 2012-09-18 22:15:12.000000000 +0100
+++ slurm-2.4.3/src/slurmctld/job_mgr.c 2012-11-06 16:42:39.000000000 +0000
@@ -5147,7 +5147,8 @@
usage_mins = (uint64_t)(qos->usage->usage_raw / 60.0);
wall_mins = qos->usage->grp_used_wall / 60;
- if ((qos->grp_cpu_mins != (uint64_t)INFINITE)
+ if ((accounting_enforce & ACCOUNTING_ENFORCE_QOS)
+ && (qos->grp_cpu_mins != (uint64_t)INFINITE)
&& (usage_mins >= qos->grp_cpu_mins)) {
last_job_update = now;
info("Job %u timed out, "
@@ -5162,7 +5163,8 @@
goto job_failed;
}
- if ((qos->grp_wall != INFINITE)
+ if ((accounting_enforce & ACCOUNTING_ENFORCE_QOS)
+ && (qos->grp_wall != INFINITE)
&& (wall_mins >= qos->grp_wall)) {
last_job_update = now;
info("Job %u timed out, "
@@ -5175,7 +5177,8 @@
goto job_failed;
}
- if ((qos->max_cpu_mins_pj != (uint64_t)INFINITE)
+ if ((accounting_enforce & ACCOUNTING_ENFORCE_QOS)
+ && (qos->max_cpu_mins_pj != (uint64_t)INFINITE)
&& (job_cpu_usage_mins >= qos->max_cpu_mins_pj)) {
last_job_update = now;
info("Job %u timed out, "
@@ -5196,7 +5199,8 @@
usage_mins = (uint64_t)(assoc->usage->usage_raw / 60.0);
wall_mins = assoc->usage->grp_used_wall / 60;
- if ((qos && (qos->grp_cpu_mins == INFINITE))
+ if ((accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
+ && (qos && (qos->grp_cpu_mins == INFINITE))
&& (assoc->grp_cpu_mins != (uint64_t)INFINITE)
&& (usage_mins >= assoc->grp_cpu_mins)) {
info("Job %u timed out, "
@@ -5211,7 +5215,8 @@
break;
}
- if ((qos && (qos->grp_wall == INFINITE))
+ if ((accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
+ && (qos && (qos->grp_wall == INFINITE))
&& (assoc->grp_wall != INFINITE)
&& (wall_mins >= assoc->grp_wall)) {
info("Job %u timed out, "
@@ -5225,7 +5230,8 @@
break;
}
- if ((qos && (qos->max_cpu_mins_pj == INFINITE))
+ if ((accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
+ && (qos && (qos->max_cpu_mins_pj == INFINITE))
&& (assoc->max_cpu_mins_pj != (uint64_t)INFINITE)
&& (job_cpu_usage_mins >= assoc->max_cpu_mins_pj)) {
info("Job %u timed out, "