Hi,

I have been configuring our system using Slurm and everything seems fine, but
'suspend,gang' does not work for QOS preemption. My tests with 'cancel' and
'requeue' worked well, but a lower-QOS job is not preempted when
'preemptmode=suspend,gang' is set.
My slurm.conf is as follows:


# Put this file on all nodes of your cluster.

# See the slurm.conf man page for more information.

#

ControlMachine=Alex-test-1

#ControlAddr=

#BackupController=

#BackupAddr=

#

AuthType=auth/munge

CacheGroups=0

#CheckpointType=checkpoint/none

CryptoType=crypto/munge

#DisableRootJobs=NO

#EnforcePartLimits=NO

#Epilog=

#EpilogSlurmctld=

#FirstJobId=1

#MaxJobId=999999

#GresTypes=

#GroupUpdateForce=0

#GroupUpdateTime=600

#JobCheckpointDir=/var/slurm/checkpoint

#JobCredentialPrivateKey=

#JobCredentialPublicCertificate=

#JobFileAppend=0

#JobRequeue=1

#JobSubmitPlugins=1

#KillOnBadExit=0

#LaunchType=launch/slurm

#Licenses=foo*4,bar

#MailProg=/bin/mail

#MaxJobCount=5000

#MaxStepCount=40000

#MaxTasksPerNode=128

MpiDefault=none

#MpiParams=ports=#-#

#PluginDir=

#PlugStackConfig=

#PrivateData=jobs

ProctrackType=proctrack/linuxproc

#Prolog=

#PrologFlags=

#PrologSlurmctld=

#PropagatePrioProcess=0

#PropagateResourceLimits=

#PropagateResourceLimitsExcept=

#RebootProgram=

ReturnToService=0

#SallocDefaultCommand=

SlurmctldPidFile=/var/run/slurmctld.pid

SlurmctldPort=6817

SlurmdPidFile=/var/run/slurmd.pid

SlurmdPort=6818

SlurmdSpoolDir=/var/spool/slurmd

SlurmUser=slurm

#SlurmdUser=root

#SrunEpilog=

#SrunProlog=

StateSaveLocation=/var/spool/slurmctld

SwitchType=switch/none

#TaskEpilog=

TaskPlugin=task/affinity

TaskPluginParam=Sched

#TaskProlog=

#TopologyPlugin=topology/tree

#TmpFS=/tmp

#TrackWCKey=no

#TreeWidth=

#UnkillableStepProgram=

#UsePAM=0

#

#

# TIMERS

#BatchStartTimeout=10

#CompleteWait=0

#EpilogMsgTime=2000

#GetEnvTimeout=2

#HealthCheckInterval=0

#HealthCheckProgram=

InactiveLimit=0

KillWait=30

#MessageTimeout=10

#ResvOverRun=0

MinJobAge=300

#OverTimeLimit=0

SlurmctldTimeout=120

SlurmdTimeout=300

#UnkillableStepTimeout=60

#VSizeFactor=0

Waittime=0

#

#

# SCHEDULING

#DefMemPerCPU=0

FastSchedule=1

#MaxMemPerCPU=20

#SchedulerRootFilter=1

#SchedulerTimeSlice=30

SchedulerType=sched/backfill

SchedulerPort=7321

SchedulerParameters=preempt_strict_order,preempt_reorder_count=3

SelectType=select/cons_res

SelectTypeParameters=CR_Core_Memory

PreemptMode=Suspend,Gang

PreemptType=preempt/qos

#

#

# JOB PRIORITY

#PriorityFlags=

PriorityType=priority/multifactor

#PriorityDecayHalfLife=

#PriorityCalcPeriod=

#PriorityFavorSmall=

#PriorityMaxAge=

#PriorityUsageResetPeriod=

#PriorityWeightAge=

#PriorityWeightFairshare=

#PriorityWeightJobSize=

#PriorityWeightPartition=

PriorityWeightQOS=2000

#

#

# LOGGING AND ACCOUNTING

AccountingStorageEnforce=qos

#AccountingStorageHost=

AccountingStorageLoc=/var/log/slurm/accounting

#AccountingStoragePass=

#AccountingStoragePort=

AccountingStorageType=accounting_storage/slurmdbd

#AccountingStorageUser=

AccountingStoreJobComment=YES

ClusterName=terra

#DebugFlags=

#JobCompHost=

JobCompLoc=/var/log/slurm/job_completions

#JobCompPass=

#JobCompPort=

JobCompType=jobcomp/filetxt

#JobCompUser=

#JobContainerType=job_container/none

JobAcctGatherFrequency=30

JobAcctGatherType=jobacct_gather/linux

SlurmctldDebug=3

#SlurmctldLogFile=

SlurmdDebug=3

#SlurmdLogFile=

#SlurmSchedLogFile=

#SlurmSchedLogLevel=

#

#

# POWER SAVE SUPPORT FOR IDLE NODES (optional)

#SuspendProgram=

#ResumeProgram=

#SuspendTimeout=

#ResumeTimeout=

#ResumeRate=

#SuspendExcNodes=

#SuspendExcParts=

#SuspendRate=

#SuspendTime=

#

#

# COMPUTE NODES

NodeName=Alex-test-1 RealMemory=3700 Sockets=2 CoresPerSocket=1 ThreadsPerCore=1 State=UNKNOWN

PartitionName=debug Nodes=Alex-test-1 Default=YES MaxTime=INFINITE State=UP


I appreciate your help.


Alex

Reply via email to