Forgot to mention: it's Slurm version 15.08.7.

On Fri, May 20, 2016 at 09:03:06AM +0100, Paddy Doyle wrote:

> Hi all,
> 
> We're seeing a really strange scheduling issue on one of our clusters, whereby
> jobs are not being scheduled, even though there are many idle nodes.
> 
> In fact there were 19 idle nodes, with the highest-priority job needing only 1;
> the next few needed 6 nodes each, and so on.
> 
> Turning up Debug logging showed lots of "best_fit topology failure: no switch
> currently has sufficient resource to satisfy the request" messages in the logs,
> but even so the 'max_switch_wait' (which we haven't set and so should default
> to 300 seconds) doesn't seem to be honoured.
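
Aside: we haven't set SchedulerParameters at all, so max_switch_wait (and the
bf_* backfill options) should be at their defaults. To double-check what the
controller is actually using, and the topology it has built from topology.conf,
something like this should do it:

root@kelvin01:/etc/slurm # scontrol show config | grep -i SchedulerParameters
root@kelvin01:/etc/slurm # scontrol show topology
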
> 
> This morning there are 25 idle nodes, with the top priority job needing 6.
> 
> I'll copy in the slurm.conf, topology.conf, and some relevant logs and queue
> snapshots.
> 
> Any help would be appreciated.
> 
> Thanks,
> Paddy
> 
> 
> 
> 
> 
> #############################################################
> # the cluster and queue state yesterday evening:
> #############################################################
> 
> root@kelvin01:/etc/slurm # sinfo
> PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
> compute      up 3-00:00:00      1 drain* kelvin-n027
> compute      up 3-00:00:00     76  alloc kelvin-n[001-026,028-045,049-054,072-094,096-098]
> compute      up 3-00:00:00     19   idle kelvin-n[046-048,055-067,069-071]
> debug*       up      30:00      2   idle kelvin-n[099-100]
> root@kelvin01:/etc/slurm # squeue --start | head
>              JOBID PARTITION     NAME     USER ST          START_TIME  NODES SCHEDNODES           NODELIST(REASON)
>              86801   compute debo_ben  aaaaaaa PD 2016-05-20T12:20:35      1 kelvin-n039          (Priority)
>              86677   compute GdDC_25_   bbbbbb PD 2016-05-20T21:24:29      6 (null)               (Resources)
>              86678   compute GdDC_25_   bbbbbb PD 2016-05-20T21:24:29      6 (null)               (Priority)
>              86679   compute        B  ccccccc PD 2016-05-20T21:24:29      8 (null)               (Priority)
>              86680   compute       BA  ccccccc PD 2016-05-20T21:24:29      8 (null)               (Priority)
>              86682   compute GdDC_5_9   bbbbbb PD 2016-05-20T21:24:29      6 (null)               (Priority)
>              86683   compute GdDC_5_9   bbbbbb PD 2016-05-20T21:24:29      6 (null)               (Priority)
>              86684   compute GdDC_5_9   bbbbbb PD 2016-05-20T21:24:29      6 (null)               (Priority)
>              86685   compute GdDC_10_   bbbbbb PD 2016-05-20T21:24:29      6 (null)               (Priority)
> root@kelvin01:/etc/slurm # squeue -tr
>              JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
>              86699   compute antiferr  aaaaaaa  R 2-05:41:33      2 kelvin-n[019,021]
>              86700   compute   ferro3  aaaaaaa  R 2-05:40:03      2 kelvin-n[030-031]
>              86745   compute lco_int_  ddddddd  R   19:41:24      6 kelvin-n[040,075,084,096-098]
>              86681   compute lco_int_  ddddddd  R    9:33:37      6 kelvin-n[013,017-018,022-024]
>              86729   compute lco_int_  ddddddd  R    9:21:36      6 kelvin-n[049-054]
>              86765   compute debo_ace  aaaaaaa  R    9:04:06      1 kelvin-n072
>              86766   compute   acetic  aaaaaaa  R    6:40:31      1 kelvin-n038
>              86793   compute lco_int_  ddddddd  R    6:37:31      6 kelvin-n[039,041-045]
>              86662   compute        B  eeeeeee  R    4:37:31     16 kelvin-n[032-035,077,079,085-094]
>              86810   compute    C_opt  fffffff  R    3:47:40      6 kelvin-n[007-012]
>              86808   compute lco_int_  ddddddd  R    3:01:55      6 kelvin-n[016,020,036-037,082-083]
>              86674   compute GdDC_20_   bbbbbb  R    2:52:57      6 kelvin-n[073-074,076,078,080-081]
>              86675   compute GdDC_20_   bbbbbb  R    2:45:57      6 kelvin-n[001-006]
>              86676   compute GdDC_25_   bbbbbb  R    1:09:55      6 kelvin-n[014-015,025-026,028-029]
> 
> 
> #############################################################
> # snips from logs at that time:
> #############################################################
> 
> [2016-05-19T18:58:24.658] backfill: beginning
> [2016-05-19T18:58:24.658] debug:  backfill: 45 jobs to backfill
> [2016-05-19T18:58:24.658] backfill test for JobID=86677 Prio=12823347 Partition=compute
> [2016-05-19T18:58:24.658] debug:  job 86677: best_fit topology failure: no switch currently has sufficient resource to satisfy the request
> [2016-05-19T18:58:24.658] debug:  job 86677: best_fit topology failure: no switch currently has sufficient resource to satisfy the request
> [2016-05-19T18:58:24.658] debug:  job 86677: best_fit topology failure: no switch currently has sufficient resource to satisfy the request
> [2016-05-19T18:58:24.658] Job 86677 to start at 2016-05-20T21:24:29, end at 2016-05-22T21:24:00 on kelvin-n[013,017-019,021-022]
> [2016-05-19T18:58:24.658] backfill test for JobID=86678 Prio=12823322 Partition=compute
> [2016-05-19T18:58:24.658] debug:  job 86678: best_fit topology failure: no switch currently has sufficient resource to satisfy the request
> [2016-05-19T18:58:24.658] debug:  job 86678: best_fit topology failure: no switch currently has sufficient resource to satisfy the request
> [2016-05-19T18:58:24.658] debug:  job 86678: best_fit topology failure: no switch currently has sufficient resource to satisfy the request
> [2016-05-19T18:58:24.658] Job 86678 to start at 2016-05-20T21:24:29, end at 2016-05-22T21:24:00 on kelvin-n[013,017-019,021-022]
> [2016-05-19T18:58:24.658] backfill test for JobID=86679 Prio=12662420 Partition=compute
> [2016-05-19T18:58:24.659] debug:  job 86679: best_fit topology failure: no switch currently has sufficient resource to satisfy the request
> [2016-05-19T18:58:24.659] debug:  job 86679: best_fit topology failure: no switch currently has sufficient resource to satisfy the request
> [2016-05-19T18:58:24.659] debug:  job 86679: best_fit topology failure: no switch currently has sufficient resource to satisfy the request
> [2016-05-19T18:58:24.659] Job 86679 to start at 2016-05-20T21:24:29, end at 2016-05-23T21:24:00 on kelvin-n[013,017-019,021-024]
> [2016-05-19T18:58:24.659] backfill test for JobID=86680 Prio=12662172 Partition=compute
> [2016-05-19T18:58:24.659] debug:  job 86680: best_fit topology failure: no switch currently has sufficient resource to satisfy the request
> [2016-05-19T18:58:24.659] debug:  job 86680: best_fit topology failure: no switch currently has sufficient resource to satisfy the request
> [2016-05-19T18:58:24.659] debug:  job 86680: best_fit topology failure: no switch currently has sufficient resource to satisfy the request
> [2016-05-19T18:58:24.659] Job 86680 to start at 2016-05-20T21:24:29, end at 2016-05-23T21:24:00 on kelvin-n[013,017-019,021-024]
> [2016-05-19T18:58:24.659] backfill test for JobID=86731 Prio=12564061 Partition=compute
> [2016-05-19T18:58:24.659] Job 86731 to start at 2016-05-21T16:05:09, end at 2016-05-24T16:05:00 on kelvin-n[013,017-019,021-024,030-031,073-074,076,078,080-081]
> [2016-05-19T18:58:24.659] backfill test for JobID=86682 Prio=12286095 Partition=compute
> [2016-05-19T18:58:24.659] debug:  job 86682: best_fit topology failure: no switch currently has sufficient resource to satisfy the request
> [2016-05-19T18:58:24.659] debug:  job 86682: best_fit topology failure: no switch currently has sufficient resource to satisfy the request
> [2016-05-19T18:58:24.659] debug:  job 86682: best_fit topology failure: no switch currently has sufficient resource to satisfy the request
> [2016-05-19T18:58:24.659] Job 86682 to start at 2016-05-20T21:24:29, end at 2016-05-22T21:24:00 on kelvin-n[013,017-019,021-022]
> etc
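
If it would help to see more detail from the backfill loop, I believe the
backfill-specific debug flag can be toggled on the running slurmctld (going
from the scontrol man page here, so treat it as a sketch rather than something
we've already run):

scontrol setdebugflags +backfill
scontrol setdebug debug2

or, equivalently, DebugFlags=Backfill in slurm.conf.
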
> 
> 
> 
> #############################################################
> # it's even worse this morning: 25 idle nodes!
> #############################################################
> 
> root@kelvin01:/etc/slurm # sinfo
> PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
> compute      up 3-00:00:00      1 drain* kelvin-n027
> compute      up 3-00:00:00     70  alloc kelvin-n[001-026,028-038,040,049-054,072-094,096-098]
> compute      up 3-00:00:00     25   idle kelvin-n[039,041-048,055-067,069-071]
> debug*       up      30:00      2   idle kelvin-n[099-100]
> root@kelvin01:/etc/slurm # squeue --start | head
>              JOBID PARTITION     NAME     USER ST          START_TIME  NODES SCHEDNODES           NODELIST(REASON)
>              86678   compute GdDC_25_   lucida PD 2016-05-20T21:24:00      6 kelvin-n[039,041-045 (Priority)
>              86679   compute        B  shuklag PD 2016-05-20T21:24:00      8 kelvin-n[055-062]    (Priority)
>              86680   compute       BA  shuklag PD 2016-05-20T21:24:00      8 kelvin-n[063-067,069 (Priority)
>              86682   compute GdDC_5_9   lucida PD 2016-05-20T21:24:00      6 kelvin-n[049-054]    (Priority)
>              86683   compute GdDC_5_9   lucida PD 2016-05-20T21:24:29      6 kelvin-n[023,030-031 (Priority)
>              86801   compute debo_ben  tandons PD 2016-05-20T21:24:29      1 kelvin-n024          (Priority)
>              86818   compute    C_opt  watsong PD 2016-05-20T21:24:29      6 kelvin-n[013,017-019 (Resources)
>              86684   compute GdDC_5_9   lucida PD 2016-05-21T16:05:09      6 (null)               (Priority)
>              86685   compute GdDC_10_   lucida PD 2016-05-21T16:05:09      6 (null)               (Priority)
> root@kelvin01:/etc/slurm # squeue -tr -l
> Fri May 20 08:59:35 2016
>              JOBID PARTITION     NAME     USER    STATE       TIME TIME_LIMI  NODES NODELIST(REASON)
>              86699   compute antiferr  tandons  RUNNING 2-19:43:02 3-00:00:00      2 kelvin-n[019,021]
>              86700   compute   ferro3  tandons  RUNNING 2-19:41:32 3-00:00:00      2 kelvin-n[030-031]
>              86745   compute lco_int_  gavinai  RUNNING 1-09:42:53 3-00:00:00      6 kelvin-n[040,075,084,096-098]
>              86681   compute lco_int_  gavinai  RUNNING   23:35:06 1-12:00:00      6 kelvin-n[013,017-018,022-024]
>              86729   compute lco_int_  gavinai  RUNNING   23:23:05 1-00:00:00      6 kelvin-n[049-054]
>              86765   compute debo_ace  tandons  RUNNING   23:05:35 3-00:00:00      1 kelvin-n072
>              86766   compute   acetic  tandons  RUNNING   20:42:00 3-00:00:00      1 kelvin-n038
>              86662   compute        B  montese  RUNNING   18:39:00 3-00:00:00     16 kelvin-n[032-035,077,079,085-094]
>              86808   compute lco_int_  gavinai  RUNNING   17:03:24 3-00:00:00      6 kelvin-n[016,020,036-037,082-083]
>              86674   compute GdDC_20_   lucida  RUNNING   16:54:26 2-00:00:00      6 kelvin-n[073-074,076,078,080-081]
>              86675   compute GdDC_20_   lucida  RUNNING   16:47:26 2-00:00:00      6 kelvin-n[001-006]
>              86676   compute GdDC_25_   lucida  RUNNING   15:11:24 2-00:00:00      6 kelvin-n[014-015,025-026,028-029]
>              86677   compute GdDC_25_   lucida  RUNNING   13:16:34 2-00:00:00      6 kelvin-n[007-012]
> 
> 
> 
> 
> -- 
> Paddy Doyle
> Trinity Centre for High Performance Computing,
> Lloyd Building, Trinity College Dublin, Dublin 2, Ireland.
> Phone: +353-1-896-3725
> http://www.tchpc.tcd.ie/

> #
> # Example slurm.conf file. Please run configurator.html
> # (in doc/html) to build a configuration file customized
> # for your environment.
> #
> #
> # slurm.conf file generated by configurator.html.
> #
> # See the slurm.conf man page for more information.
> #
> ClusterName=kelvin
> ControlMachine=kelvin01
> ControlAddr=192.168.19.254
> BackupController=kelvin-n001
> BackupAddr=192.168.16.1
> #
> SlurmUser=root
> #SlurmdUser=root
> SlurmctldPort=6817
> SlurmdPort=6818
> AuthType=auth/munge
> EnforcePartLimits=YES
> JobRequeue=1
> #JobCredentialPrivateKey=
> #JobCredentialPublicCertificate=
> StateSaveLocation=/var/slurm_state/kelvin
> #SlurmdSpoolDir=/tmp/slurmd
> SwitchType=switch/none
> MpiDefault=none
> SlurmctldPidFile=/var/run/slurmctld.pid
> SlurmdPidFile=/var/run/slurmd.pid
> #ProctrackType=proctrack/pgid
> ProctrackType=proctrack/cgroup
> #PluginDir=
> CacheGroups=0
> #FirstJobId=
> ReturnToService=1
> #MaxJobCount=
> #PlugStackConfig=
> #PropagatePrioProcess=
> #PropagateResourceLimits=
> #PropagateResourceLimitsExcept=
> #PropagateResourceLimits=NONE
> PropagateResourceLimitsExcept=CPU,RSS,DATA,AS
> Prolog=/etc/slurm/prolog
> PrologFlags=Alloc
> Epilog=/etc/slurm/slurm.epilog.clean
> EpilogSlurmctld=/etc/slurm/slurm.epilogslurmctld
> #SrunProlog=
> #SrunEpilog=
> #TaskProlog=
> #TaskEpilog=
> #TaskPlugin=
> TaskPlugin=task/cgroup
> #TrackWCKey=no
> #TreeWidth=50
> #TmpFs=
> #UsePAM=
> #
> # TIMERS
> SlurmctldTimeout=300
> SlurmdTimeout=300
> HealthCheckInterval=3600
> HealthCheckProgram=/etc/slurm/slurm.healthcheck
> InactiveLimit=0
> MinJobAge=300
> KillWait=30
> Waittime=0
> RebootProgram=/sbin/reboot
> #
> # SCHEDULING
> SchedulerType=sched/backfill
> #SchedulerType=sched/wiki
> SchedulerPort=7321
> SelectType=select/cons_res
> SelectTypeParameters=CR_Core_Memory
> #SchedulerAuth=
> #SchedulerPort=
> #SchedulerRootFilter=
> FastSchedule=0
> #PriorityType=priority/multifactor
> #PriorityDecayHalfLife=14-0
> #PriorityUsageResetPeriod=14-0
> #PriorityWeightFairshare=100000
> #PriorityWeightAge=1000
> #PriorityWeightPartition=10000
> #PriorityWeightJobSize=1000
> #PriorityMaxAge=1-0
> #
> # LOGGING
> SlurmctldDebug=3
> SlurmctldLogFile=/var/log/slurm.log
> SlurmdDebug=3
> SlurmdLogFile=/var/log/slurm.log
> JobCompType=jobcomp/none
> #JobCompLoc=
> #
> # ACCOUNTING
> #JobAcctGatherType=jobacct_gather/linux
> #JobAcctGatherFrequency=30
> #
> # LOGGING AND ACCOUNTING
> #AccountingStorageEnforce=0
> #AccountingStorageEnforce=limits
> AccountingStorageEnforce=safe # don't start a job unless there's enough balance
> AccountingStorageHost=service01
> #AccountingStorageLoc=
> #AccountingStoragePass=
> #AccountingStoragePort=
> AccountingStorageType=accounting_storage/slurmdbd
> #AccountingStorageUser=
> #JobCompHost=
> #JobCompLoc=
> #JobCompPass=
> #JobCompPort=
> #JobCompUser=
> JobAcctGatherFrequency=30
> JobAcctGatherType=jobacct_gather/cgroup
> #AccountingStorageType=accounting_storage/slurmdbd
> #AccountingStorageHost=
> #AccountingStorageLoc=
> #AccountingStoragePass=
> #AccountingStorageUser=
> 
> 
> 
> # Activate the Multi-factor Job Priority Plugin with decay
> PriorityType=priority/multifactor
> 
> # apply decay of 2 weeks
> #PriorityDecayHalfLife=14-0
> # for slurm-bank 1.2
> #PriorityDecayHalfLife=0
> # for slurm-bank 1.3
> PriorityDecayHalfLife=14-0
> 
> 
> # reset usage after 28 days
> #PriorityUsageResetPeriod=MONTHLY
> PriorityUsageResetPeriod=NONE
> 
> # The larger the job, the greater its job size priority.
> #PriorityFavorSmall=YES
> 
> # The job's age factor reaches 1.0 after waiting in the
> # queue for 2 weeks.
> PriorityMaxAge=14-0
> 
> # re-calc priority
> PriorityCalcPeriod=00:01:00
> 
> # This next group determines the weighting of each of the
> # components of the Multi-factor Job Priority Plugin.
> # The default value for each of the following is 1.
> PriorityWeightAge=10000000
> PriorityWeightFairshare=10000000
> PriorityWeightJobSize=10000000
> PriorityWeightPartition=10000000
> PriorityWeightQOS=0 # don't use the qos factor
> 
> 
> # describe the node's memory (only one of the two following options is allowed)
> #DefMemPerCPU=1900
> DefMemPerNode=23000
> 
> MaxMemPerNode=24000
> 
> # turn on the topology/tree plugin
> TopologyPlugin=topology/tree
> 
> # COMPUTE NODES
> #NodeName=DEFAULT State=UNKNOWN Feature=debug Sockets=2 CoresPerSocket=6 ThreadsPerCore=1
> NodeName=kelvin-n[001-067,069-094,096-100] RealMemory=24020 Sockets=2 CoresPerSocket=6 ThreadsPerCore=1 State=UNKNOWN
> #NodeName=kelvin-n[001-067,069,070,072-094,096-100] RealMemory=24020 Sockets=2 CoresPerSocket=6 ThreadsPerCore=1 State=UNKNOWN
> #NodeName=kelvin-n[001-070,072-100] RealMemory=24020 Sockets=2 CoresPerSocket=6 ThreadsPerCore=1 State=UNKNOWN
> #NodeName=kelvin-n099 RealMemory=19980 Sockets=2 CoresPerSocket=6 ThreadsPerCore=1 State=UNKNOWN
> 
> 
> #PartitionName=compute   Nodes=kelvin-n[001-096] Default=NO MaxTime=72:00:00 State=UP
> #PartitionName=debug     Nodes=kelvin-n[097-100] Default=YES MaxTime=3:00:00 State=UP
> #PartitionName=compute   Nodes=kelvin-n[001-067,069,070,072-094,096-098] Default=NO DefaultTime=01:00:00 MaxTime=72:00:00 State=UP Shared=Exclusive
> PartitionName=compute   Nodes=kelvin-n[001-067,069-094,096-098] Default=NO DefaultTime=01:00:00 MaxTime=72:00:00 State=UP Shared=Exclusive
> PartitionName=debug     Nodes=kelvin-n[099-100] Default=YES DefaultTime=00:30:00 MaxTime=00:30:00 State=UP Shared=Exclusive
> 

> 
> # Rack C-04[42]
> SwitchName=kelvinibsw03 Nodes=kelvin-n[025-048]
> 
> # Rack C-02[42]
> SwitchName=kelvinibsw04 Nodes=kelvin-n[073-094,096-100]
> 
> # Rack C-02[17]
> SwitchName=kelvinibsw05 Nodes=kelvin-n[049-067,069-072]
> 
> # Rack C-04[17]
> SwitchName=kelvinibsw06 Nodes=kelvin-n[001-024]
> 
> # Rack C-03[6] (top-level switch)
> SwitchName=kelvinibsw01 Switches=kelvinibsw03,kelvinibsw04,kelvinibsw05,kelvinibsw06
> # (and kelvin01,io03,io04,io06)
> 
> # Rack C-03[7] (top-level switch)
> SwitchName=kelvinibsw02 Switches=kelvinibsw03,kelvinibsw04,kelvinibsw05,kelvinibsw06
> # (and io01,io02,io05)
> 
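
One more note: SchedulerParameters isn't set anywhere in the slurm.conf above,
so max_switch_wait should be at its 300-second default. If it would help to
rule that out, I assume making it explicit would look something like this (not
in our config at the moment, just a sketch):

SchedulerParameters=max_switch_wait=300
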


-- 
Paddy Doyle
Trinity Centre for High Performance Computing,
Lloyd Building, Trinity College Dublin, Dublin 2, Ireland.
Phone: +353-1-896-3725
http://www.tchpc.tcd.ie/
