I'm running into an issue on Slurm 14.03.3, and now 14.03.4, where partition-based 
preemption using PreemptMode=SUSPEND,GANG is simply not working.

My full slurm.conf is below.  The relevant lines:

PreemptMode=SUSPEND,GANG
PreemptType=preempt/partition_prio
SchedulerType=sched/backfill
SelectType=select/cons_res
SelectTypeParameters=CR_Core_Memory,CR_CORE_DEFAULT_DIST_BLOCK

PartitionName=DEFAULT Nodes=c[0101-0104],c[0133-0134],c[0237-0238],c[0925-0926]n[1-2],c[0935-0936] DefMemPerCPU=1900
PartitionName=hi Nodes=c0101 Priority=3 State=UP
PartitionName=low Nodes=c0101 Priority=2 State=UP
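
In case it matters, the values the running slurmctld is actually using (as opposed to 
just what is in the file) can be confirmed with something like:

$ scontrol show config | grep -iE 'Preempt|SchedulerType|SelectType'   # live settings from slurmctld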

While debugging the preemption issue I found that SUSPEND stops working when only 
certain nodes are put in "Nodes=" on the partitions.

$ scontrol update PartitionName=low Nodes=c[0101-0104],c[0133-0134],c[0237-0238],c[0925-0926]n[1-2],c[0935-0936]
$ scontrol update PartitionName=hi Nodes=c[0101-0104]

$ sbatch -o /dev/null --array 1-224 -N1 -p low --qos background bin/sleep.sh
Submitted batch job 1322
$ sbatch -o /dev/null --array 1-20 -N1 -p hi --qos background bin/sleep.sh
Submitted batch job 1546

$ squeue -t S
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
$ squeue -t PD
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
       1546_[1-20]        hi sleep.sh treydock PD       0:00      1 (Resources)
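
At this point the hi array just sits pending with Reason=Resources and nothing in the 
low partition gets suspended. A couple of places that might be worth checking (just a 
sketch of where to look, not output from my cluster) are the live partition records and 
the pending job record:

$ scontrol show partition hi    # check the Priority= and PreemptMode= values in use
$ scontrol show partition low
$ scontrol show job 1546        # pending hi array; shows the Reason field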

$ scontrol update PartitionName=hi Nodes=c[0101-0104],c[0133-0134],c[0237-0238],c[0925-0926]n[1-2],c[0935-0936]
$ squeue -t S
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           1322_43       low sleep.sh treydock  S       1:13      1 c0133
           1322_44       low sleep.sh treydock  S       1:13      1 c0133
           1322_45       low sleep.sh treydock  S       1:13      1 c0133
           1322_46       low sleep.sh treydock  S       1:13      1 c0133
           1322_47       low sleep.sh treydock  S       1:13      1 c0133
           1322_48       low sleep.sh treydock  S       1:13      1 c0133
           1322_49       low sleep.sh treydock  S       1:13      1 c0133
           1322_50       low sleep.sh treydock  S       1:13      1 c0133
           1322_51       low sleep.sh treydock  S       1:13      1 c0133
           1322_52       low sleep.sh treydock  S       1:13      1 c0133
           1322_33       low sleep.sh treydock  S       1:14      1 c0133
           1322_34       low sleep.sh treydock  S       1:14      1 c0133
           1322_35       low sleep.sh treydock  S       1:14      1 c0133
           1322_36       low sleep.sh treydock  S       1:14      1 c0133
           1322_37       low sleep.sh treydock  S       1:14      1 c0133
           1322_38       low sleep.sh treydock  S       1:14      1 c0133
           1322_39       low sleep.sh treydock  S       1:14      1 c0133
           1322_40       low sleep.sh treydock  S       1:14      1 c0133
           1322_41       low sleep.sh treydock  S       1:14      1 c0133
           1322_42       low sleep.sh treydock  S       1:14      1 c0133

Now what makes this VERY troubling is that I can set Nodes=c[0133-0134] for 
PartitionName=hi and SUSPEND works.  It's only c0101-c0104 that trigger the problem.  
Those are our oldest compute nodes, and the only ones that have just GigE (no IB).  I'm 
not sure what about those nodes is breaking SLURM's ability to SUSPEND, or whether it's 
something specific to SLURM and these nodes are merely the trigger.
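
In case it helps narrow this down, the two sets of nodes can be compared directly as 
slurmctld sees them, e.g. (plain bash with process substitution; c0101 and c0133 picked 
from the partitions above):

$ scontrol show node c0101    # GigE-only Harpertown node where SUSPEND fails
$ scontrol show node c0133    # IB node where SUSPEND works
$ diff <(scontrol show node c0101) <(scontrol show node c0133)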

$ scontrol update PartitionName=low Nodes=c[0101-0104],c[0133-0134],c[0237-0238],c[0925-0926]n[1-2],c[0935-0936]
$ scontrol update PartitionName=hi Nodes=c[0101-0104]

$ sbatch -o /dev/null --array 1-224 -N1 -p low --qos background bin/sleep.sh
Submitted batch job 1810
$ sbatch -o /dev/null --array 1-20 -N1 -p hi --qos background bin/sleep.sh
Submitted batch job 2034

$ squeue -t S
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
$ squeue -t PD
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
       2034_[1-20]        hi sleep.sh treydock PD       0:00      1 (Resources)

$ scontrol update PartitionName=hi Nodes=c[0133-0134]
$ squeue -t S
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           1810_33       low sleep.sh treydock  S       1:32      1 c0133
           1810_34       low sleep.sh treydock  S       1:32      1 c0133
           1810_35       low sleep.sh treydock  S       1:32      1 c0133
           1810_36       low sleep.sh treydock  S       1:32      1 c0133
           1810_37       low sleep.sh treydock  S       1:32      1 c0133
           1810_38       low sleep.sh treydock  S       1:32      1 c0133
           1810_39       low sleep.sh treydock  S       1:32      1 c0133
           1810_40       low sleep.sh treydock  S       1:32      1 c0133
           1810_41       low sleep.sh treydock  S       1:32      1 c0133
           1810_42       low sleep.sh treydock  S       1:32      1 c0133
           1810_43       low sleep.sh treydock  S       1:32      1 c0133
           1810_44       low sleep.sh treydock  S       1:32      1 c0133
           1810_45       low sleep.sh treydock  S       1:32      1 c0133
           1810_46       low sleep.sh treydock  S       1:32      1 c0133
           1810_47       low sleep.sh treydock  S       1:32      1 c0133
           1810_48       low sleep.sh treydock  S       1:32      1 c0133
           1810_49       low sleep.sh treydock  S       1:32      1 c0133
           1810_50       low sleep.sh treydock  S       1:32      1 c0133
           1810_51       low sleep.sh treydock  S       1:32      1 c0133
           1810_52       low sleep.sh treydock  S       1:32      1 c0133

Thanks
- Trey

slurm.conf:

AccountingStorageEnforce=limits,qos
AccountingStorageHost=<OMIT>
AccountingStoragePort=6819
AccountingStorageType=accounting_storage/slurmdbd
AccountingStoreJobComment=YES
AuthType=auth/munge
CacheGroups=0
CheckpointType=checkpoint/none
ClusterName=brazos
CompleteWait=0
ControlMachine=<OMIT>
CryptoType=crypto/munge
DefaultStorageHost=<OMIT>
DefaultStoragePort=6819
DefaultStorageType=slurmdbd
DisableRootJobs=NO
Epilog=/home/slurm/epilog
EpilogMsgTime=2000
FastSchedule=1
FirstJobId=1
GetEnvTimeout=2
GroupUpdateForce=0
GroupUpdateTime=600
HealthCheckInterval=0
HealthCheckNodeState=IDLE
InactiveLimit=0
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
JobCheckpointDir=/var/lib/slurm/checkpoint
JobCompType=jobcomp/none
JobRequeue=1
JobSubmitPlugins=lua
KillOnBadExit=0
KillWait=15
MailProg=/usr/bin/Mail
MaxJobCount=25000
MaxJobId=2147483647
MaxMemPerCPU=0
MaxMemPerNode=0
MaxStepCount=40000
MaxTasksPerNode=128
MessageTimeout=10
MinJobAge=300
MpiDefault=pmi2
MpiParams=ports=30000-39999
OverTimeLimit=0
PlugStackConfig=/home/slurm/conf/plugstack.conf
PluginDir=/usr/lib64/slurm
PreemptMode=SUSPEND,GANG
PreemptType=preempt/partition_prio
PriorityCalcPeriod=5
PriorityDecayHalfLife=7-0
PriorityFavorSmall=NO
PriorityMaxAge=7-0
PriorityType=priority/multifactor
PriorityUsageResetPeriod=NONE
PriorityWeightAge=2000000
PriorityWeightFairshare=4000000
PriorityWeightJobSize=1000000
PriorityWeightPartition=5000000
PriorityWeightQOS=10000000
ProctrackType=proctrack/linuxproc
Prolog=/home/slurm/prolog
PropagatePrioProcess=0
PropagateResourceLimits=NONE
ResvOverRun=0
ReturnToService=2
SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_res
SelectTypeParameters=CR_Core_Memory,CR_CORE_DEFAULT_DIST_BLOCK
SlurmSchedLogFile=/var/log/slurm/slurmsched.log
SlurmSchedLogLevel=0
SlurmUser=slurm
SlurmctldDebug=debug5
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
SlurmctldPort=6817
SlurmctldTimeout=300
SlurmdDebug=debug5
SlurmdLogFile=/var/log/slurm/slurmd.log
SlurmdPidFile=/var/run/slurm/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmdTimeout=300
SlurmdUser=root
StateSaveLocation=/var/lib/slurm/state
SwitchType=switch/none
TaskPlugin=task/affinity
TaskPluginParam=Sched
TaskProlog=/home/slurm/taskprolog
TmpFS=/tmp
TopologyPlugin=topology/none
TrackWCKey=no
TreeWidth=50
UsePAM=0
VSizeFactor=0
WaitTime=0

NodeName=c0101 NodeAddr=192.168.200.10 CPUs=8 Sockets=2 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=15900 TmpDisk=16000 Feature=core8,mem16gb,gig,harpertown State=UNKNOWN
NodeName=c0102 NodeAddr=192.168.200.11 CPUs=8 Sockets=2 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=15900 TmpDisk=16000 Feature=core8,mem16gb,gig,harpertown State=UNKNOWN
NodeName=c0103 NodeAddr=192.168.200.12 CPUs=8 Sockets=2 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=15900 TmpDisk=16000 Feature=core8,mem16gb,gig,harpertown State=UNKNOWN
NodeName=c0104 NodeAddr=192.168.200.13 CPUs=8 Sockets=2 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=15900 TmpDisk=16000 Feature=core8,mem16gb,gig,harpertown State=UNKNOWN
NodeName=c0133 NodeAddr=192.168.200.42 CPUs=32 Sockets=4 CoresPerSocket=8 ThreadsPerCore=1 RealMemory=129000 TmpDisk=16000 Feature=core32,mem128gb,ib_ddr,piledriver,abu_dhabi State=UNKNOWN
NodeName=c0134 NodeAddr=192.168.200.43 CPUs=32 Sockets=4 CoresPerSocket=8 ThreadsPerCore=1 RealMemory=129000 TmpDisk=16000 Feature=core32,mem128gb,ib_ddr,piledriver,abu_dhabi State=UNKNOWN
NodeName=c0237 NodeAddr=192.168.200.87 CPUs=32 Sockets=4 CoresPerSocket=8 ThreadsPerCore=1 RealMemory=129000 TmpDisk=16000 Feature=core32,mem128gb,ib_ddr,bulldozer,interlagos State=UNKNOWN
NodeName=c0238 NodeAddr=192.168.200.88 CPUs=32 Sockets=4 CoresPerSocket=8 ThreadsPerCore=1 RealMemory=129000 TmpDisk=16000 Feature=core32,mem128gb,ib_ddr,bulldozer,interlagos State=UNKNOWN
NodeName=c0925n1 NodeAddr=192.168.201.67 CPUs=8 Sockets=2 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=32100 TmpDisk=16000 Feature=core8,mem32gb,ib_ddr,k10,shanghai State=UNKNOWN
NodeName=c0925n2 NodeAddr=192.168.201.68 CPUs=8 Sockets=2 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=32100 TmpDisk=16000 Feature=core8,mem32gb,ib_ddr,k10,shanghai State=UNKNOWN
NodeName=c0926n1 NodeAddr=192.168.201.69 CPUs=8 Sockets=2 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=32100 TmpDisk=16000 Feature=core8,mem32gb,ib_ddr,k10,shanghai State=UNKNOWN
NodeName=c0926n2 NodeAddr=192.168.201.70 CPUs=8 Sockets=2 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=32100 TmpDisk=16000 Feature=core8,mem32gb,ib_ddr,k10,shanghai State=UNKNOWN
NodeName=c0935 NodeAddr=192.168.201.97 CPUs=32 Sockets=4 CoresPerSocket=8 ThreadsPerCore=1 RealMemory=64300 TmpDisk=16000 Feature=core32,mem64gb,ib_ddr,piledriver,abu_dhabi State=UNKNOWN
NodeName=c0936 NodeAddr=192.168.201.98 CPUs=32 Sockets=4 CoresPerSocket=8 ThreadsPerCore=1 RealMemory=64300 TmpDisk=16000 Feature=core32,mem64gb,ib_ddr,piledriver,abu_dhabi State=UNKNOWN

PartitionName=DEFAULT Nodes=c[0101-0104],c[0133-0134],c[0237-0238],c[0925-0926]n[1-2],c[0935-0936] DefMemPerCPU=1900
PartitionName=admin Priority=100 AllowGroups=mgmt,root MaxTime=72:00:00 State=UP
PartitionName=hi Nodes=c0101 Priority=3 State=UP
PartitionName=low Nodes=c0101 Priority=2 State=UP
PartitionName=hepx Nodes=c[0101-0104] Priority=99 AllowQOS=hepx AllowGroups=hepx,mgmt MaxTime=120:00:00 PreemptMode=OFF State=UP
PartitionName=idhmc Nodes=c[0101-0104] Priority=99 AllowQOS=idhmc AllowGroups=idhmc,mgmt MaxTime=72:00:00 PreemptMode=OFF State=UP
PartitionName=mpi Nodes=c[0133-0134],c[0237-0238],c[0925-0926]n[1-2],c[0935-0936] Priority=50 AllowQOS=mpi DenyAccounts=hepx,idhmc MinNodes=2 MaxTime=48:00:00 PreemptMode=OFF State=UP
PartitionName=serial Default=YES Nodes=c[0101-0104] Priority=50 AllowQOS=serial,aglife DenyAccounts=hepx,idhmc MaxNodes=1 MaxTime=72:00:00 PreemptMode=OFF State=UP
PartitionName=background Priority=1 AllowQOS=background,hepx_bg,idhmc_bg MaxNodes=1 MaxTime=96:00:00 State=UP
PartitionName=grid Priority=1 AllowQOS=hepx_bg AllowGroups=hepx,cms,suragrid,mgmt MaxNodes=1 MaxTime=72:00:00 State=UP

=============================

Trey Dockendorf 
Systems Analyst I 
Texas A&M University 
Academy for Advanced Telecommunications and Learning Technologies 
Phone: (979)458-2396 
Email: [email protected] 
Jabber: [email protected]
