Hi all,

I noticed something odd when I was testing some resource limit stuff.

- AccountingStorageEnforce set to 'associations', or turned off completely
  (previously it had been 'limits')
- some associations previously had GrpCPUMins limits set, from earlier testing
- jobs were still being killed when their usage went over the limit, e.g. with
  this message:

  "Job 759 timed out, assoc 452 is at or exceeds group max cpu minutes limit
   5940 with 5956 for account testing"

Looking at src/slurmctld/job_mgr.c it seems that the job_time_limit() function
wasn't actually checking to see if limits were being enforced before killing the
job.

Please see the attached patch, which checks whether limits or QOS are enforced
before killing the job. I've tested it with 2.4.3 and it does what I expect - I
haven't tried 2.4.4, but the job_time_limit() logic seems to be the same.

Thanks,
Paddy

-- 
Paddy Doyle
Trinity Centre for High Performance Computing,
Lloyd Building, Trinity College Dublin, Dublin 2, Ireland.
Phone: +353-1-896-3725
http://www.tchpc.tcd.ie/
diff -ru slurm-2.4.3-orig/src/slurmctld/job_mgr.c slurm-2.4.3/src/slurmctld/job_mgr.c
--- slurm-2.4.3-orig/src/slurmctld/job_mgr.c    2012-09-18 22:15:12.000000000 +0100
+++ slurm-2.4.3/src/slurmctld/job_mgr.c 2012-11-06 16:42:39.000000000 +0000
@@ -5147,7 +5147,8 @@
                        usage_mins = (uint64_t)(qos->usage->usage_raw / 60.0);
                        wall_mins = qos->usage->grp_used_wall / 60;
 
-                       if ((qos->grp_cpu_mins != (uint64_t)INFINITE)
+                       if ((accounting_enforce & ACCOUNTING_ENFORCE_QOS)
+                           && (qos->grp_cpu_mins != (uint64_t)INFINITE)
                            && (usage_mins >= qos->grp_cpu_mins)) {
                                last_job_update = now;
                                info("Job %u timed out, "
@@ -5162,7 +5163,8 @@
                                goto job_failed;
                        }
 
-                       if ((qos->grp_wall != INFINITE)
+                       if ((accounting_enforce & ACCOUNTING_ENFORCE_QOS)
+                           && (qos->grp_wall != INFINITE)
                            && (wall_mins >= qos->grp_wall)) {
                                last_job_update = now;
                                info("Job %u timed out, "
@@ -5175,7 +5177,8 @@
                                goto job_failed;
                        }
 
-                       if ((qos->max_cpu_mins_pj != (uint64_t)INFINITE)
+                       if ((accounting_enforce & ACCOUNTING_ENFORCE_QOS)
+                           && (qos->max_cpu_mins_pj != (uint64_t)INFINITE)
                            && (job_cpu_usage_mins >= qos->max_cpu_mins_pj)) {
                                last_job_update = now;
                                info("Job %u timed out, "
@@ -5196,7 +5199,8 @@
                        usage_mins = (uint64_t)(assoc->usage->usage_raw / 60.0);
                        wall_mins = assoc->usage->grp_used_wall / 60;
 
-                       if ((qos && (qos->grp_cpu_mins == INFINITE))
+                       if ((accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
+                           && (qos && (qos->grp_cpu_mins == INFINITE))
                            && (assoc->grp_cpu_mins != (uint64_t)INFINITE)
                            && (usage_mins >= assoc->grp_cpu_mins)) {
                                info("Job %u timed out, "
@@ -5211,7 +5215,8 @@
                                break;
                        }
 
-                       if ((qos && (qos->grp_wall == INFINITE))
+                       if ((accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
+                           && (qos && (qos->grp_wall == INFINITE))
                            && (assoc->grp_wall != INFINITE)
                            && (wall_mins >= assoc->grp_wall)) {
                                info("Job %u timed out, "
@@ -5225,7 +5230,8 @@
                                break;
                        }
 
-                       if ((qos && (qos->max_cpu_mins_pj == INFINITE))
+                       if ((accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
+                           && (qos && (qos->max_cpu_mins_pj == INFINITE))
                            && (assoc->max_cpu_mins_pj != (uint64_t)INFINITE)
                            && (job_cpu_usage_mins >= assoc->max_cpu_mins_pj)) {
                                info("Job %u timed out, "

Reply via email to