Hi all,
We're seeing a really strange scheduling issue on one of our clusters, whereby
jobs are not being scheduled, even though there are many idle nodes.
In fact there were 19 idle nodes, with the first priority job only needing 1;
the next few needed 6 nodes etc.
Turning up Debug logging showed lots of "best_fit topology failure: no switch
currently has sufficient resource to satisfy the request" messages in the logs,
but even so the 'max_switch_wait' (which we haven't set and so should default to
300 seconds) doesn't seem to be honoured.
This morning there are 25 idle nodes, with the top priority job needing 6.
I'll copy in the slurm.conf, topology.conf, and some relevant logs and queue
snapshots.
Any help would be appreciated.
Thanks,
Paddy
#############################################################
# the cluster and queue state yesterday evening:
#############################################################
root@kelvin01:/etc/slurm # sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
compute up 3-00:00:00 1 drain* kelvin-n027
compute up 3-00:00:00 76 alloc
kelvin-n[001-026,028-045,049-054,072-094,096-098]
compute up 3-00:00:00 19 idle kelvin-n[046-048,055-067,069-071]
debug* up 30:00 2 idle kelvin-n[099-100]
root@kelvin01:/etc/slurm # squeue --start | head
JOBID PARTITION NAME USER ST START_TIME NODES
SCHEDNODES NODELIST(REASON)
86801 compute debo_ben aaaaaaa PD 2016-05-20T12:20:35 1
kelvin-n039 (Priority)
86677 compute GdDC_25_ bbbbbb PD 2016-05-20T21:24:29 6
(null) (Resources)
86678 compute GdDC_25_ bbbbbb PD 2016-05-20T21:24:29 6
(null) (Priority)
86679 compute B ccccccc PD 2016-05-20T21:24:29 8
(null) (Priority)
86680 compute BA ccccccc PD 2016-05-20T21:24:29 8
(null) (Priority)
86682 compute GdDC_5_9 bbbbbb PD 2016-05-20T21:24:29 6
(null) (Priority)
86683 compute GdDC_5_9 bbbbbb PD 2016-05-20T21:24:29 6
(null) (Priority)
86684 compute GdDC_5_9 bbbbbb PD 2016-05-20T21:24:29 6
(null) (Priority)
86685 compute GdDC_10_ bbbbbb PD 2016-05-20T21:24:29 6
(null) (Priority)
root@kelvin01:/etc/slurm # squeue -tr
JOBID PARTITION NAME USER ST TIME NODES
NODELIST(REASON)
86699 compute antiferr aaaaaaa R 2-05:41:33 2
kelvin-n[019,021]
86700 compute ferro3 aaaaaaa R 2-05:40:03 2
kelvin-n[030-031]
86745 compute lco_int_ ddddddd R 19:41:24 6
kelvin-n[040,075,084,096-098]
86681 compute lco_int_ ddddddd R 9:33:37 6
kelvin-n[013,017-018,022-024]
86729 compute lco_int_ ddddddd R 9:21:36 6
kelvin-n[049-054]
86765 compute debo_ace aaaaaaa R 9:04:06 1 kelvin-n072
86766 compute acetic aaaaaaa R 6:40:31 1 kelvin-n038
86793 compute lco_int_ ddddddd R 6:37:31 6
kelvin-n[039,041-045]
86662 compute B eeeeeee R 4:37:31 16
kelvin-n[032-035,077,079,085-094]
86810 compute C_opt fffffff R 3:47:40 6
kelvin-n[007-012]
86808 compute lco_int_ ddddddd R 3:01:55 6
kelvin-n[016,020,036-037,082-083]
86674 compute GdDC_20_ bbbbbb R 2:52:57 6
kelvin-n[073-074,076,078,080-081]
86675 compute GdDC_20_ bbbbbb R 2:45:57 6
kelvin-n[001-006]
86676 compute GdDC_25_ bbbbbb R 1:09:55 6
kelvin-n[014-015,025-026,028-029]
#############################################################
# snips from logs at that time:
#############################################################
[2016-05-19T18:58:24.658] backfill: beginning
[2016-05-19T18:58:24.658] debug: backfill: 45 jobs to backfill
[2016-05-19T18:58:24.658] backfill test for JobID=86677 Prio=12823347
Partition=compute
[2016-05-19T18:58:24.658] debug: job 86677: best_fit topology failure: no
switch currently has sufficient resource to satisfy the request
[2016-05-19T18:58:24.658] debug: job 86677: best_fit topology failure: no
switch currently has sufficient resource to satisfy the request
[2016-05-19T18:58:24.658] debug: job 86677: best_fit topology failure: no
switch currently has sufficient resource to satisfy the request
[2016-05-19T18:58:24.658] Job 86677 to start at 2016-05-20T21:24:29, end at
2016-05-22T21:24:00 on kelvin-n[013,017-019,021-022]
[2016-05-19T18:58:24.658] backfill test for JobID=86678 Prio=12823322
Partition=compute
[2016-05-19T18:58:24.658] debug: job 86678: best_fit topology failure: no
switch currently has sufficient resource to satisfy the request
[2016-05-19T18:58:24.658] debug: job 86678: best_fit topology failure: no
switch currently has sufficient resource to satisfy the request
[2016-05-19T18:58:24.658] debug: job 86678: best_fit topology failure: no
switch currently has sufficient resource to satisfy the request
[2016-05-19T18:58:24.658] Job 86678 to start at 2016-05-20T21:24:29, end at
2016-05-22T21:24:00 on kelvin-n[013,017-019,021-022]
[2016-05-19T18:58:24.658] backfill test for JobID=86679 Prio=12662420
Partition=compute
[2016-05-19T18:58:24.659] debug: job 86679: best_fit topology failure: no
switch currently has sufficient resource to satisfy the request
[2016-05-19T18:58:24.659] debug: job 86679: best_fit topology failure: no
switch currently has sufficient resource to satisfy the request
[2016-05-19T18:58:24.659] debug: job 86679: best_fit topology failure: no
switch currently has sufficient resource to satisfy the request
[2016-05-19T18:58:24.659] Job 86679 to start at 2016-05-20T21:24:29, end at
2016-05-23T21:24:00 on kelvin-n[013,017-019,021-024]
[2016-05-19T18:58:24.659] backfill test for JobID=86680 Prio=12662172
Partition=compute
[2016-05-19T18:58:24.659] debug: job 86680: best_fit topology failure: no
switch currently has sufficient resource to satisfy the request
[2016-05-19T18:58:24.659] debug: job 86680: best_fit topology failure: no
switch currently has sufficient resource to satisfy the request
[2016-05-19T18:58:24.659] debug: job 86680: best_fit topology failure: no
switch currently has sufficient resource to satisfy the request
[2016-05-19T18:58:24.659] Job 86680 to start at 2016-05-20T21:24:29, end at
2016-05-23T21:24:00 on kelvin-n[013,017-019,021-024]
[2016-05-19T18:58:24.659] backfill test for JobID=86731 Prio=12564061
Partition=compute
[2016-05-19T18:58:24.659] Job 86731 to start at 2016-05-21T16:05:09, end at
2016-05-24T16:05:00 on
kelvin-n[013,017-019,021-024,030-031,073-074,076,078,080-081]
[2016-05-19T18:58:24.659] backfill test for JobID=86682 Prio=12286095
Partition=compute
[2016-05-19T18:58:24.659] debug: job 86682: best_fit topology failure: no
switch currently has sufficient resource to satisfy the request
[2016-05-19T18:58:24.659] debug: job 86682: best_fit topology failure: no
switch currently has sufficient resource to satisfy the request
[2016-05-19T18:58:24.659] debug: job 86682: best_fit topology failure: no
switch currently has sufficient resource to satisfy the request
[2016-05-19T18:58:24.659] Job 86682 to start at 2016-05-20T21:24:29, end at
2016-05-22T21:24:00 on kelvin-n[013,017-019,021-022]
etc
#############################################################
# it's even worse this morning: 25 idle nodes!
#############################################################
root@kelvin01:/etc/slurm # sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
compute up 3-00:00:00 1 drain* kelvin-n027
compute up 3-00:00:00 70 alloc
kelvin-n[001-026,028-038,040,049-054,072-094,096-098]
compute up 3-00:00:00 25 idle kelvin-n[039,041-048,055-067,069-071]
debug* up 30:00 2 idle kelvin-n[099-100]
root@kelvin01:/etc/slurm # squeue --start | head
JOBID PARTITION NAME USER ST START_TIME NODES
SCHEDNODES NODELIST(REASON)
86678 compute GdDC_25_ lucida PD 2016-05-20T21:24:00 6
kelvin-n[039,041-045 (Priority)
86679 compute B shuklag PD 2016-05-20T21:24:00 8
kelvin-n[055-062] (Priority)
86680 compute BA shuklag PD 2016-05-20T21:24:00 8
kelvin-n[063-067,069 (Priority)
86682 compute GdDC_5_9 lucida PD 2016-05-20T21:24:00 6
kelvin-n[049-054] (Priority)
86683 compute GdDC_5_9 lucida PD 2016-05-20T21:24:29 6
kelvin-n[023,030-031 (Priority)
86801 compute debo_ben tandons PD 2016-05-20T21:24:29 1
kelvin-n024 (Priority)
86818 compute C_opt watsong PD 2016-05-20T21:24:29 6
kelvin-n[013,017-019 (Resources)
86684 compute GdDC_5_9 lucida PD 2016-05-21T16:05:09 6
(null) (Priority)
86685 compute GdDC_10_ lucida PD 2016-05-21T16:05:09 6
(null) (Priority)
root@kelvin01:/etc/slurm # squeue -tr -l
Fri May 20 08:59:35 2016
JOBID PARTITION NAME USER STATE TIME TIME_LIMI
NODES NODELIST(REASON)
86699 compute antiferr tandons RUNNING 2-19:43:02 3-00:00:00
2 kelvin-n[019,021]
86700 compute ferro3 tandons RUNNING 2-19:41:32 3-00:00:00
2 kelvin-n[030-031]
86745 compute lco_int_ gavinai RUNNING 1-09:42:53 3-00:00:00
6 kelvin-n[040,075,084,096-098]
86681 compute lco_int_ gavinai RUNNING 23:35:06 1-12:00:00
6 kelvin-n[013,017-018,022-024]
86729 compute lco_int_ gavinai RUNNING 23:23:05 1-00:00:00
6 kelvin-n[049-054]
86765 compute debo_ace tandons RUNNING 23:05:35 3-00:00:00
1 kelvin-n072
86766 compute acetic tandons RUNNING 20:42:00 3-00:00:00
1 kelvin-n038
86662 compute B montese RUNNING 18:39:00 3-00:00:00
16 kelvin-n[032-035,077,079,085-094]
86808 compute lco_int_ gavinai RUNNING 17:03:24 3-00:00:00
6 kelvin-n[016,020,036-037,082-083]
86674 compute GdDC_20_ lucida RUNNING 16:54:26 2-00:00:00
6 kelvin-n[073-074,076,078,080-081]
86675 compute GdDC_20_ lucida RUNNING 16:47:26 2-00:00:00
6 kelvin-n[001-006]
86676 compute GdDC_25_ lucida RUNNING 15:11:24 2-00:00:00
6 kelvin-n[014-015,025-026,028-029]
86677 compute GdDC_25_ lucida RUNNING 13:16:34 2-00:00:00
6 kelvin-n[007-012]
--
Paddy Doyle
Trinity Centre for High Performance Computing,
Lloyd Building, Trinity College Dublin, Dublin 2, Ireland.
Phone: +353-1-896-3725
http://www.tchpc.tcd.ie/
#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName=kelvin
ControlMachine=kelvin01
ControlAddr=192.168.19.254
BackupController=kelvin-n001
BackupAddr=192.168.16.1
#
SlurmUser=root
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
EnforcePartLimits=YES
JobRequeue=1
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/slurm_state/kelvin
#SlurmdSpoolDir=/tmp/slurmd
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
#ProctrackType=proctrack/pgid
ProctrackType=proctrack/cgroup
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=1
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#PropagateResourceLimits=NONE
PropagateResourceLimitsExcept=CPU,RSS,DATA,AS
Prolog=/etc/slurm/prolog
PrologFlags=Alloc
Epilog=/etc/slurm/slurm.epilog.clean
EpilogSlurmctld=/etc/slurm/slurm.epilogslurmctld
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
TaskPlugin=task/cgroup
#TrackWCKey=no
#TreeWidth=50
#TmpFs=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
HealthCheckInterval=3600
HealthCheckProgram=/etc/slurm/slurm.healthcheck
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
RebootProgram=/sbin/reboot
#
# SCHEDULING
SchedulerType=sched/backfill
#SchedulerType=sched/wiki
SchedulerPort=7321
SelectType=select/cons_res
SelectTypeParameters=CR_Core_Memory
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
FastSchedule=0
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurm.log
JobCompType=jobcomp/none
#JobCompLoc=
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageEnforce=limits
AccountingStorageEnforce=safe  # don't start a job unless there's enough balance
AccountingStorageHost=service01
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStoragePort=
AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageUser=
#JobCompHost=
#JobCompLoc=
#JobCompPass=
#JobCompPort=
#JobCompUser=
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/cgroup
#AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
# Activate the Multi-factor Job Priority Plugin with decay
PriorityType=priority/multifactor
# apply decay of 2 weeks
#PriorityDecayHalfLife=14-0
# for slurm-bank 1.2
#PriorityDecayHalfLife=0
# for slurm-bank 1.3
PriorityDecayHalfLife=14-0
# reset usage after 28 days
#PriorityUsageResetPeriod=MONTHLY
PriorityUsageResetPeriod=NONE
# The larger the job, the greater its job size priority.
#PriorityFavorSmall=YES
# The job's age factor reaches 1.0 after waiting in the
# queue for 2 weeks.
PriorityMaxAge=14-0
# re-calc priority
PriorityCalcPeriod=00:01:00
# This next group determines the weighting of each of the
# components of the Multi-factor Job Priority Plugin.
# The default value for each of the following is 1.
PriorityWeightAge=10000000
PriorityWeightFairshare=10000000
PriorityWeightJobSize=10000000
PriorityWeightPartition=10000000
PriorityWeightQOS=0 # don't use the qos factor
# describe the node's memory (only one of the two following options is allowed)
#DefMemPerCPU=1900
DefMemPerNode=23000
MaxMemPerNode=24000
# turn on the topology/tree plugin
TopologyPlugin=topology/tree
# COMPUTE NODES
#NodeName=DEFAULT State=UNKNOWN Feature=debug Sockets=2 CoresPerSocket=6 ThreadsPerCore=1
NodeName=kelvin-n[001-067,069-094,096-100] RealMemory=24020 Sockets=2 CoresPerSocket=6 ThreadsPerCore=1 State=UNKNOWN
#NodeName=kelvin-n[001-067,069,070,072-094,096-100] RealMemory=24020 Sockets=2 CoresPerSocket=6 ThreadsPerCore=1 State=UNKNOWN
#NodeName=kelvin-n[001-070,072-100] RealMemory=24020 Sockets=2 CoresPerSocket=6 ThreadsPerCore=1 State=UNKNOWN
#NodeName=kelvin-n099 RealMemory=19980 Sockets=2 CoresPerSocket=6 ThreadsPerCore=1 State=UNKNOWN
#PartitionName=compute Nodes=kelvin-n[001-096] Default=NO MaxTime=72:00:00 State=UP
#PartitionName=debug Nodes=kelvin-n[097-100] Default=YES MaxTime=3:00:00 State=UP
#PartitionName=compute Nodes=kelvin-n[001-067,069,070,072-094,096-098] Default=NO DefaultTime=01:00:00 MaxTime=72:00:00 State=UP Shared=Exclusive
PartitionName=compute Nodes=kelvin-n[001-067,069-094,096-098] Default=NO DefaultTime=01:00:00 MaxTime=72:00:00 State=UP Shared=Exclusive
PartitionName=debug Nodes=kelvin-n[099-100] Default=YES DefaultTime=00:30:00 MaxTime=00:30:00 State=UP Shared=Exclusive
# Rack C-04[42]
SwitchName=kelvinibsw03 Nodes=kelvin-n[025-048]
# Rack C-02[42]
SwitchName=kelvinibsw04 Nodes=kelvin-n[073-094,096-100]
# Rack C-02[17]
SwitchName=kelvinibsw05 Nodes=kelvin-n[049-067,069-072]
# Rack C-04[17]
SwitchName=kelvinibsw06 Nodes=kelvin-n[001-024]
# Rack C-03[6] (top-level switch)
SwitchName=kelvinibsw01 Switches=kelvinibsw03,kelvinibsw04,kelvinibsw05,kelvinibsw06
# (and kelvin01,io03,io04,io06)
# Rack C-03[7] (top-level switch)
SwitchName=kelvinibsw02 Switches=kelvinibsw03,kelvinibsw04,kelvinibsw05,kelvinibsw06
# (and io01,io02,io05)