Hi all,
I am successfully launching ~1.5K jobs in about 5 minutes, and Slurm appears
to pick the jobs up and schedule them.

All the nodes in my environment have either 2 or 4 cores.

When scheduling jobs onto cores, Slurm appears to use some sort of sequential
allocation, so multiple jobs land on the same machine before it moves on to
the next one. Is there a way to place one job on every node before any node
receives a second job?
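
For reference, here is a sketch of the kind of change I was wondering about.
It assumes the least-loaded-node options (CR_LLN for SelectTypeParameters, or
LLN=YES on a partition) exist in my Slurm version, which I have not verified:

# Prefer the least loaded node when allocating, so jobs spread
# across nodes instead of filling one node before the next.
SelectType=select/cons_res
SelectTypeParameters=CR_Core,CR_LLN
# Or, limited to a single partition (rest of the definition unchanged):
#PartitionName=all_part LLN=YES ...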

Also, when multiple jobs are scheduled on the same node, they appear to
actually run on the same core. I am using select/cons_res with CR_Core. Could
someone please take a look at my slurm.conf below and tell me whether I am
doing something wrong?
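
One thing I was unsure about while writing this: my config has
TaskPlugin=task/none, and my (possibly wrong) understanding is that without a
binding task plugin the cores that cons_res allocates are not actually
enforced on the processes, so the OS scheduler can run them anywhere on the
node. A sketch of what I was considering, assuming the task/affinity plugin is
built for these nodes:

# Bind each task to the cores allocated to its job,
# instead of leaving placement to the OS scheduler.
TaskPlugin=task/affinity
TaskPluginParam=Cores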

Thanks in advance
# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ControlMachine=z21
#ControlAddr=
BackupController=z18
#BackupAddr=
#
AuthType=auth/none
#AuthType=auth/munge
CacheGroups=0
#CheckpointType=checkpoint/none
CryptoType=crypto/munge
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#PrologSlurmctld=
#FirstJobId=1
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobCheckpointDir=/var/slurm/checkpoint
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=1
#KillOnBadExit=0
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxTasksPerNode=128
MpiDefault=none
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
#ProctrackType=proctrack/pgid
ProctrackType=proctrack/linuxproc
#Prolog=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
ReturnToService=1
#SallocDefaultCommand=
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6820-6823
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/tmp/slurmd
SlurmUser=slurm
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/tmp
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/none
#TaskPluginParam=
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFs=/tmp
#TrackWCKey=no
#TreeWidth=327
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
MessageTimeout=100
#ResvOverRun=0
#MinJobAge=20
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
FastSchedule=0
#MaxMemPerCPU=0
#SchedulerRootFilter=1
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SchedulerPort=7321
SchedulerParameters=max_job_bf=50,interval=30
#SelectType=select/linear
SelectType=select/cons_res
SelectTypeParameters=CR_Core
#
#
# JOB PRIORITY
#PriorityType=priority/basic
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
AccountingStorageHost=z21
AccountingStorageLoc=slurm_job_acc
AccountingStoragePass=slurm
AccountingStoragePort=3306
AccountingStorageType=accounting_storage/mysql
AccountingStorageUser=slurm
ClusterName=cluster
#DebugFlags=
JobCompHost=z21
JobCompLoc=slurm_job_comp
JobCompPass=slurm
JobCompPort=3306
JobCompType=jobcomp/mysql
JobCompUser=slurm
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
SlurmctldDebug=2
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=7
SlurmdLogFile=/var/log/slurm/slurmd.log.%h
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
NodeName=z[23,24,26,28-39] Procs=4 Sockets=1 CoresPerSocket=4 ThreadsPerCore=1
NodeName=z[25,27] Procs=2 Sockets=1 CoresPerSocket=2 ThreadsPerCore=1
NodeName=obi[23-40] Procs=4 Sockets=1 CoresPerSocket=4 ThreadsPerCore=1
NodeName=w[001-108] Procs=2 Sockets=1 CoresPerSocket=2 ThreadsPerCore=1
NodeName=y[002-010,012-025,027-038] Procs=2 Sockets=1 CoresPerSocket=2 ThreadsPerCore=1
NodeName=y[040-062,064-108,111-119,121-186] Procs=4 Sockets=1 CoresPerSocket=4 ThreadsPerCore=1
PartitionName=z_part Nodes=z[23-39] Default=NO Shared=NO MaxNodes=1 MaxTime=INFINITE State=UP
#PartitionName=obi_part Nodes=obi[23-40] Default=NO Shared=NO MaxNodes=1 MaxTime=INFINITE State=UP
PartitionName=w_part Nodes=w[001-108] Default=NO Shared=NO MaxNodes=1 MaxTime=INFINITE State=UP
PartitionName=y_part Nodes=y[002-010,012-025,027-038,040-062,064-108,111-119,121-186] Default=NO Shared=NO MaxNodes=1 MaxTime=INFINITE State=UP
#PartitionName=all_part Nodes=w[001-108],y[2-10,12-25,27-38,41-62,64-108,111-119,121-186] Default=YES MaxNodes=1 MaxTime=INFINITE State=UP
#PartitionName=all_part Nodes=z[23-39,41-46],obi[23-40],w[1-108],y[2-10,12-25,27-38,41-62,64-108,111-119,121-186] Shared=NO Default=YES MaxNodes=1 MaxTime=INFINITE State=UP
PartitionName=all_part Nodes=z[23-39],w[001-108],y[002-010,012-025,027-038,040-062,064-108,111-119,121-186] Shared=NO Default=YES MaxTime=INFINITE State=UP
