Hi, I have been configuring our system using Slurm and everything seems fine, except that 'suspend,gang' does not work for QOS preemption. My tests with 'cancel' and 'requeue' worked well, but a lower-QOS job with 'preemptmode=suspend,gang' is not preempted. My slurm.conf is as follows:
# Put this file on all nodes of your cluster. # See the slurm.conf man page for more information. # ControlMachine=Alex-test-1 #ControlAddr= #BackupController= #BackupAddr= # AuthType=auth/munge CacheGroups=0 #CheckpointType=checkpoint/none CryptoType=crypto/munge #DisableRootJobs=NO #EnforcePartLimits=NO #Epilog= #EpilogSlurmctld= #FirstJobId=1 #MaxJobId=999999 #GresTypes= #GroupUpdateForce=0 #GroupUpdateTime=600 #JobCheckpointDir=/var/slurm/checkpoint #JobCredentialPrivateKey= #JobCredentialPublicCertificate= #JobFileAppend=0 #JobRequeue=1 #JobSubmitPlugins=1 #KillOnBadExit=0 #LaunchType=launch/slurm #Licenses=foo*4,bar #MailProg=/bin/mail #MaxJobCount=5000 #MaxStepCount=40000 #MaxTasksPerNode=128 MpiDefault=none #MpiParams=ports=#-# #PluginDir= #PlugStackConfig= #PrivateData=jobs ProctrackType=proctrack/linuxproc #Prolog= #PrologFlags= #PrologSlurmctld= #PropagatePrioProcess=0 #PropagateResourceLimits= #PropagateResourceLimitsExcept= #RebootProgram= ReturnToService=0 #SallocDefaultCommand= SlurmctldPidFile=/var/run/slurmctld.pid SlurmctldPort=6817 SlurmdPidFile=/var/run/slurmd.pid SlurmdPort=6818 SlurmdSpoolDir=/var/spool/slurmd SlurmUser=slurm #SlurmdUser=root #SrunEpilog= #SrunProlog= StateSaveLocation=/var/spool/slurmctld SwitchType=switch/none #TaskEpilog= TaskPlugin=task/affinity TaskPluginParam=Sched #TaskProlog= #TopologyPlugin=topology/tree #TmpFS=/tmp #TrackWCKey=no #TreeWidth= #UnkillableStepProgram= #UsePAM=0 # # # TIMERS #BatchStartTimeout=10 #CompleteWait=0 #EpilogMsgTime=2000 #GetEnvTimeout=2 #HealthCheckInterval=0 #HealthCheckProgram= InactiveLimit=0 KillWait=30 #MessageTimeout=10 #ResvOverRun=0 MinJobAge=300 #OverTimeLimit=0 SlurmctldTimeout=120 SlurmdTimeout=300 #UnkillableStepTimeout=60 #VSizeFactor=0 Waittime=0 # # # SCHEDULING #DefMemPerCPU=0 FastSchedule=1 #MaxMemPerCPU=20 #SchedulerRootFilter=1 #SchedulerTimeSlice=30 SchedulerType=sched/backfill SchedulerPort=7321 SchedulerParameters=preempt_strict_order,preempt_reorder_count=3 
SelectType=select/cons_res SelectTypeParameters=CR_Core_Memory PreemptMode=Suspend,Gang PreemptType=preempt/qos # # # JOB PRIORITY #PriorityFlags= PriorityType=priority/multifactor #PriorityDecayHalfLife= #PriorityCalcPeriod= #PriorityFavorSmall= #PriorityMaxAge= #PriorityUsageResetPeriod= #PriorityWeightAge= #PriorityWeightFairshare= #PriorityWeightJobSize= #PriorityWeightPartition= PriorityWeightQOS=2000 # # # LOGGING AND ACCOUNTING AccountingStorageEnforce=qos #AccountingStorageHost= AccountingStorageLoc=/var/log/slurm/accounting #AccountingStoragePass= #AccountingStoragePort= AccountingStorageType=accounting_storage/slurmdbd #AccountingStorageUser= AccountingStoreJobComment=YES ClusterName=terra #DebugFlags= #JobCompHost= JobCompLoc=/var/log/slurm/job_completions #JobCompPass= #JobCompPort= JobCompType=jobcomp/filetxt #JobCompUser= #JobContainerType=job_container/none JobAcctGatherFrequency=30 JobAcctGatherType=jobacct_gather/linux SlurmctldDebug=3 #SlurmctldLogFile= SlurmdDebug=3 #SlurmdLogFile= #SlurmSchedLogFile= #SlurmSchedLogLevel= # # # POWER SAVE SUPPORT FOR IDLE NODES (optional) #SuspendProgram= #ResumeProgram= #SuspendTimeout= #ResumeTimeout= #ResumeRate= #SuspendExcNodes= #SuspendExcParts= #SuspendRate= #SuspendTime= # # # COMPUTE NODES NodeName=Alex-test-1 RealMemory=3700 Sockets=2 CoresPerSocket=1 ThreadsPerCore=1 State=UNKNOWN PartitionName=debug Nodes=Alex-test-1 Default=YES MaxTime=INFINITE State=UP I appreciate your help. Alex
