It seems that "scontrol update nodename=foo state=drain" does not
prevent new jobs from starting on the node.  We've verified this on our
test cluster with an unpatched slurm 2.2.3.

Because we always put a maintenance reservation on the nodes we drain,
we hadn't noticed this earlier.
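(For reference, the workaround we use is a maintenance reservation
along these lines; the reservation name here is just an example:)

```shell
# Example of the maintenance reservation we place on a node before
# draining it.  The name "maint_compute-0-1" is made up; adjust the
# node list and user as appropriate.
scontrol create reservation reservationname=maint_compute-0-1 \
    starttime=now duration=UNLIMITED flags=maint \
    nodes=compute-0-1 users=root
```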

I assume there must be something strange in our configuration that
triggers this behaviour; otherwise it would surely have been discovered
already.  See the attached slurm.conf below.


Here is a demonstration:

teflon 774(1)# scontrol show node compute-0-1
NodeName=compute-0-1 Arch=x86_64 CoresPerSocket=2
   CPUAlloc=4 CPUErr=0 CPUTot=4 Features=intel,rack0,ib,sse
   Gres=(null)
   OS=Linux RealMemory=3018 Sockets=2
   State=ALLOCATED ThreadsPerCore=1 TmpDisk=10000 Weight=794
   BootTime=2010-06-07T13:30:58 SlurmdStartTime=2011-03-24T11:17:53
   Reason=(null)

teflon 775(1)# squeue -n compute-0-1
  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
  10221    normal arraytes      bhm   R       1:51      1 compute-0-1
  10222    normal  10221.1      bhm   R       1:51      1 compute-0-1
  10219    normal arraytes      bhm   R       1:52      1 compute-0-1
  10220    normal  10219.1      bhm   R       1:52      1 compute-0-1
teflon 776(1)# scontrol update nodename=compute-0-1 state=drain reason=testing
teflon 777(1)# scontrol show node compute-0-1
NodeName=compute-0-1 Arch=x86_64 CoresPerSocket=2
   CPUAlloc=4 CPUErr=0 CPUTot=4 Features=intel,rack0,ib,sse
   Gres=(null)
   OS=Linux RealMemory=3018 Sockets=2
   State=ALLOCATED+DRAIN ThreadsPerCore=1 TmpDisk=10000 Weight=794
   BootTime=2010-06-07T13:30:58 SlurmdStartTime=2011-03-24T11:17:53
   Reason=testing [root@2011-03-24T12:27:39]

teflon 778(1)# squeue -n compute-0-1
  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
  10221    normal arraytes      bhm   R       2:29      1 compute-0-1
  10222    normal  10221.1      bhm   R       2:29      1 compute-0-1
  10219    normal arraytes      bhm   R       2:30      1 compute-0-1
  10220    normal  10219.1      bhm   R       2:30      1 compute-0-1
teflon 779(1)# scancel 10222
teflon 780(1)# squeue -n compute-0-1
  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
  10221    normal arraytes      bhm   R       2:41      1 compute-0-1
  10219    normal arraytes      bhm   R       2:42      1 compute-0-1
  10220    normal  10219.1      bhm   R       2:42      1 compute-0-1
  10272    normal 10221.11      bhm   R       0:03      1 compute-0-1

So job 10272 started after job 10222 was cancelled.

We thought slurm might be allowing short jobs to start if they would
finish before the longest-running job finishes (according to its --time
specification), so we cancelled the two jobs with the longer --time
limits (10219 and 10221).  But new jobs continue to start:

teflon 781(1)# squeue -n compute-0-1
  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
  10324    normal 10219.21      bhm   R       0:23      1 compute-0-1
  10221    normal arraytes      bhm   R       5:26      1 compute-0-1
  10219    normal arraytes      bhm   R       5:27      1 compute-0-1
  10272    normal 10221.11      bhm   R       2:48      1 compute-0-1
teflon 782(1)# scancel 10219
teflon 783(1)# squeue -n compute-0-1
  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
  10373    normal 10223.28      bhm   R       0:02      1 compute-0-1
  10374    normal 10229.30      bhm   R       0:02      1 compute-0-1
  10221    normal arraytes      bhm   R       5:47      1 compute-0-1
  10272    normal 10221.11      bhm   R       3:09      1 compute-0-1
teflon 784(1)# scancel 10221
teflon 785(1)# squeue -n compute-0-1
  JOBID PARTITION     NAME     USER  ST       TIME  NODES NODELIST(REASON)
  10391    normal 10223.32      bhm   R       0:01      1 compute-0-1
  10393    normal 10226.32      bhm   R       0:01      1 compute-0-1
  10373    normal 10223.28      bhm   R       0:18      1 compute-0-1
  10374    normal 10229.30      bhm   R       0:18      1 compute-0-1


Is this a bug, or are we misunderstanding something here?
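(In case it helps anyone reproduce this: the drain flag is also visible
via sinfo while the new jobs keep landing on the node. The expected
state shown below is our reading of the ALLOCATED+DRAIN state above,
not captured output:)

```shell
# Show the node's scheduler-visible state; with ALLOCATED+DRAIN this
# should report "draining", yet new job steps are still scheduled.
sinfo -n compute-0-1 -o "%N %T"
```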


## slurm.conf: main configuration file for SLURM
## $Id: slurm_2.2.conf,v 1.26 2011/02/15 00:41:56 root Exp $

## FIXME: check GroupUpdate*, TopologyPlugin,
## UnkillableStepProgram, UsePAM


###
### Cluster
###

ClusterName=titan
#default: AuthType=auth/munge
#default: CryptoType=crypto/munge
SlurmctldPort=6817
SlurmdPort=6818
TmpFs=/work
#default: TreeWidth=50 FIXME: try ceil(sqrt(#nodes))
TreeWidth=5

## Timers:
#default: MessageTimeout=10
## FIXME: should be reduced when/if we see that slurmd is behaving:
SlurmdTimeout=36000
WaitTime=0


###
### Slurmctld
###

ControlMachine=teflon  # should have been blaster
#default: MinJobAge=300
SlurmUser=slurm
StateSaveLocation=/state/partition1/slurm/slurmstate


###
### Nodes
###

FastSchedule=2
HealthCheckInterval=300
HealthCheckProgram=/sbin/healthcheck
ReturnToService=1
Nodename=DEFAULT CoresPerSocket=2 Sockets=2 RealMemory=3949 State=unknown TmpDisk=10000 Weight=2027
PartitionName=DEFAULT MaxTime=Infinite State=up Shared=NO
Include /etc/slurm/slurmnodes.conf
#FIXME: test Gres (does it need a plugin?)


###
### Jobs
###

PropagateResourceLimits=NONE
DefMemPerCPU=500
EnforcePartLimits=yes
#default: InactiveLimit=0
JobFileAppend=1
#default: JobRequeue=1
JobSubmitPlugins=lua
#default: MaxJobCount=10000
#default: MpiDefault=none #FIXME: openmpi?
#default: OverTimeLimit=0
VSizeFactor=150

## Prologs/Epilogs
## teflon-note: prologs/epilogs are on /site/sbin instead of /sbin
# run by slurmctld as SlurmUser on ControlMachine before granting a job allocation:
#PrologSlurmctld=
# run by slurmd on each node prior to the first job step on the node:
Prolog=/site/sbin/slurmprolog
# run by srun on the node running srun, prior to the launch of a job step:
#SrunProlog=
# run as user for each task prior to initiating the task:
TaskProlog=/site/sbin/taskprolog
# run as user for each task after the task finishes:
#TaskEpilog=
# run by srun on the node running srun, after a job step finishes:
#SrunEpilog=
# run as root on each node when a job has completed:
Epilog=/site/sbin/slurmepilog
# run as SlurmUser on ControlMachine after the allocation is released:
#EpilogSlurmctld=


###
### Job Priority
###

PriorityType=priority/multifactor
#default: PriorityCalcPeriod=5
#default: PriorityDecayHalfLife=7-0 #(7 days)
#default: PriorityUsageResetPeriod=NONE
#default: PriorityMaxAge=7-0 #(7 days)
#default: PriorityFavorSmall=no
PriorityWeightAge=10000
#default: PriorityWeightFairshare=0
PriorityWeightJobSize=1000
#default: PriorityWeightPartition=0
PriorityWeightQOS=10000


###
### Scheduling
###

SchedulerType=sched/backfill
#default: 
SchedulerParameters=default_queue_depth=100,defer=?,bf_interval=30,bf_window=1440,max_job_bf=50
SelectType=select/cons_res
SelectTypeParameters=CR_CPU_Memory # FIXME: perhaps Core!
PreemptMode=requeue
#PreemptMode=checkpoint         # FIXME: cancels if checkpoint is not possible!
PreemptType=preempt/qos
CompleteWait=32                 # KillWait + 2
#default: KillWait=30


###
### Checkpointing
###

# ************** WARNING ***********************
# *** ENABLING/DISABLING THIS KILLS ALL JOBS ***
# **********************************************
CheckpointType=checkpoint/blcr
# blcr & xlch: HongJia Cao <[email protected]>
# ompi: "performed with Indiana University while they
#        were adding checkpoint support to ompi"
# ompi: only supports steps.  Must be restarted manually
# xlch: does not support batch jobs (i.e., only support steps?)
JobCheckpointDir=/state/partition1/slurm/checkpoint
# FIXME: check scontrol: ImageDir/sbatch: --checkpoint/SLURM_CHECKPOINT_DIR


###
### Logging
###

SlurmctldDebug=5
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmSchedLogLevel=1
SlurmSchedLogFile=/var/log/slurm/sched.log
SlurmdDebug=5
SlurmdLogFile=/var/log/slurm/slurmd.log
#default: DebugFlags=
DebugFlags=Backfill


###
### Accounting (Slurmdbd)
###

AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=blaster
JobAcctGatherType=jobacct_gather/linux
#default: JobAcctGatherFrequency=30
ProctrackType=proctrack/linuxproc # FIXME: check out cgroup
AccountingStorageEnforce=limits,qos
# combination of associations < limits < wckeys, qos
# $Id: slurmnodes.conf,v 1.27 2011/02/12 17:01:40 root Exp $

# weight = 1000*IB + 1*mem/core + 10*cores
# (default = 2027 = ib + default mem & cores)

Nodename=compute-0-0 Feature=intel,rack0,hugemem Weight=1027
Nodename=compute-0-1 RealMemory=3018 Feature=intel,rack0,ib Weight=794
Nodename=compute-0-2 Feature=intel,rack0,hugemem Weight=1027
Nodename=compute-0-3 Feature=intel,rack0,hugemem Weight=1027
Nodename=compute-0-4 Feature=intel,rack0,ib Weight=1027
Nodename=compute-0-5 RealMemory=3018 Feature=intel,rack0,ib Weight=1794
Nodename=compute-0-6 Feature=intel,rack0,ib
Nodename=compute-0-7 Feature=intel,rack0,ib Weight=1027
Nodename=compute-0-8 RealMemory=3018 Feature=intel,rack0,ib Weight=794
Nodename=compute-0-9 RealMemory=3018 Feature=intel,rack0,ib Weight=794
Nodename=compute-0-10 RealMemory=3018 Feature=intel,rack0,ib Weight=1794
Nodename=compute-0-11 Feature=intel,rack0,ib
Nodename=compute-0-12 RealMemory=2011 Feature=intel,rack0,ib Weight=1542
Nodename=compute-0-13 Feature=intel,rack0,ib
Nodename=compute-0-14 Feature=intel,rack0,ib
Nodename=compute-0-15 RealMemory=3018 Feature=intel,rack0,ib Weight=1794
Nodename=compute-1-0 CoresPerSocket=1 Sockets=1 RealMemory=2010 Feature=amd,rack1 Weight=2020
###Nodename=compute-1-1
###Nodename=compute-3-0

#FIXME: MaxTime?
PartitionName=normal Default=yes Priority=1000 Nodes=compute-0-[1,4-15],compute-1-0
PartitionName=hugemem Default=no Priority=1000 Nodes=compute-0-[0,2,3]
PartitionName=lowpri Default=no Priority=0 Nodes=compute-0-[0-15],compute-1-0

-- 
Regards,
Bjørn-Helge Mevik, dr. scient,
Research Computing Services, University of Oslo
