So I made the following changes to slurm.conf.

# The four settings changed by the author. Each of them also appears, with the
# same value, in the full slurm.conf listing further down in this message.
# NOTE(review): SchedulerParameters sub-option names differ between Slurm
# releases — confirm max_job_bf and interval are accepted by the installed version.
SchedulerParameters=max_job_bf=100,interval=30
MinJobAge=20
MessageTimeout=30
MaxJobCount=50000

I also changed the parameter
# NOTE(review): the full slurm.conf pasted below shows SlurmctldPort=6817-6820,
# not 6820-6823 — confirm which range is actually deployed. The 6820-6823 range
# also includes 6821, the value assigned to SlurmdPort in the listing below.
SlurmctldPort=6820-6823

This way slurmctld takes requests on multiple ports.

Now when I launch jobs, they never run; they always stay in the Pending state.
The contents of slurm.conf are below. Thanks in advance.


# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
# Controller hosts: z21 is the primary controller, z18 is configured as backup.
ControlMachine=z21
#ControlAddr=
BackupController=z18
#BackupAddr=
#
# NOTE(review): authentication is set to auth/none while the auth/munge line is
# commented out and CryptoType below is crypto/munge — confirm this mix is
# intentional and that munge is actually running on every node.
AuthType=auth/none
#AuthType=auth/munge
CacheGroups=0
#CheckpointType=checkpoint/none
CryptoType=crypto/munge
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#PrologSlurmctld=
#FirstJobId=1
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobCheckpointDir=/var/slurm/checkpoint
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=1
#KillOnBadExit=0
#Licenses=foo*4,bar
#MailProg=/bin/mail
# Raised as part of the changes described at the top of this message.
MaxJobCount=50000
#MaxTasksPerNode=128
MpiDefault=none
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
# proctrack/linuxproc is the active process-tracking plugin; the pgid variant
# is kept above only as a disabled alternative.
#ProctrackType=proctrack/pgid
ProctrackType=proctrack/linuxproc

#Prolog=
# (PrologSlurmctld is also listed, commented out, earlier in this section.)
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
ReturnToService=1
#SallocDefaultCommand=
SlurmctldPidFile=/var/run/slurmctld.pid
# NOTE(review): the message text above says this was changed to 6820-6823, but
# the value here is 6817-6820 — confirm which range the running daemons use.
# The range given here ends one port below SlurmdPort (6821); the 6820-6823
# range would include it.
SlurmctldPort=6817-6820
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6821
SlurmdSpoolDir=/tmp/slurmd
SlurmUser=slurm
#SrunEpilog=
#SrunProlog=
# NOTE(review): a BackupController (z18) is configured, and controller state is
# saved under /tmp. Presumably /tmp is local to each host and may be cleaned on
# reboot — verify that both controllers can reach the same state directory.
StateSaveLocation=/tmp
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/none
#TaskPluginParam=
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFs=/tmp
#TrackWCKey=no
#TreeWidth=327
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
# Active settings in this section: InactiveLimit, KillWait, MessageTimeout,
# MinJobAge, SlurmctldTimeout, SlurmdTimeout and Waittime. Everything else is
# commented out and therefore left at the Slurm default.
# NOTE(review): values are presumably in seconds — confirm against the
# slurm.conf man page for the installed release.
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
# MessageTimeout and MinJobAge are two of the values the author reports
# changing at the top of this message.
MessageTimeout=30
#ResvOverRun=0
MinJobAge=20
#OverTimeLimit=0
SlurmctldTimeout=120

SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
# Backfill scheduling with the select/linear node-selection plugin; the
# cons_res alternative and its parameters are kept below, disabled.
#DefMemPerCPU=0
FastSchedule=1
#MaxMemPerCPU=0
#SchedulerRootFilter=1
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
# NOTE(review): SchedulerPort is presumably only consulted by external (wiki)
# scheduler plugins and unused with sched/backfill — confirm.
SchedulerPort=7321
# Backfill tuning changed by the author (see the top of this message).
# NOTE(review): sub-option names differ between Slurm releases — confirm that
# max_job_bf and interval are recognised by the installed version.
SchedulerParameters=max_job_bf=100,interval=30
SelectType=select/linear
#SelectType=select/cons_res
#SelectTypeParameters=CR_CPU
#
#
# JOB PRIORITY
#PriorityType=priority/basic
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
# Both accounting storage and job-completion records go to MySQL on z21
# (port 3306), using separate database names but the same user.
# NOTE(review): the database password is stored here in plain text, and this
# file is meant to be distributed to all nodes — restrict file permissions or
# move the credentials elsewhere if the installed release allows it.
#AccountingStorageEnforce=0
AccountingStorageHost=z21
AccountingStorageLoc=slurm_job_acc
AccountingStoragePass=slurm
AccountingStoragePort=3306
AccountingStorageType=accounting_storage/mysql
AccountingStorageUser=slurm
ClusterName=cluster
#DebugFlags=
JobCompHost=z21
JobCompLoc=slurm_job_comp
JobCompPass=slurm
JobCompPort=3306
JobCompType=jobcomp/mysql
JobCompUser=slurm
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
# Both daemons log at debug level 7.
# NOTE(review): presumably a high-verbosity level meant for troubleshooting —
# consider lowering it once the pending-jobs problem is resolved.
SlurmctldDebug=7
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=7
# %h in the file name is presumably expanded to the node's host name — confirm.
SlurmdLogFile=/var/log/slurm/slurmd.log.%h
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
# Node definitions. Procs is the processor count configured for each group.
NodeName=z[23,24,26,28-39] Procs=4 State=UNKNOWN
NodeName=z[25,27] Procs=2 State=UNKNOWN
# The obi nodes are defined but belong to no active partition below
# (obi_part is commented out and the active all_part does not list them).
NodeName=obi[23-40] Procs=4 State=UNKNOWN
NodeName=w[001-108] Procs=2 State=UNKNOWN
NodeName=y[002-010,012-025,027-038] Procs=2 State=UNKNOWN
NodeName=y[040-062,064-108,111-119,121-186] Procs=4 State=UNKNOWN
#
# Partition definitions. Each definition must sit on ONE logical line: a bare
# "State=UP", "Nodes=..." or "Shared=NO ..." line is not a valid standalone
# entry, and the tail of a commented-out definition must be commented out too.
# The definitions below were previously hard-wrapped across several lines.
PartitionName=z_part Nodes=z[23-39] Default=NO Shared=NO MaxTime=INFINITE State=UP
#PartitionName=obi_part Nodes=obi[23-40] Default=NO Shared=NO MaxTime=INFINITE State=UP
PartitionName=w_part Nodes=w[001-108] Default=NO Shared=NO MaxTime=INFINITE State=UP
PartitionName=y_part Nodes=y[002-010,012-025,027-038,040-062,064-108,111-119,121-186] Default=NO Shared=NO MaxTime=INFINITE State=UP
# Earlier variants of the default partition, kept disabled for reference.
#PartitionName=all_part Nodes=w[001-108],y[2-10,12-25,27-38,41-62,64-108,111-119,121-186] Default=YES MaxTime=INFINITE State=UP
#PartitionName=all_part Nodes=z[23-39,41-46],obi[23-40],w[1-108],y[2-10,12-25,27-38,41-62,64-108,111-119,121-186] Shared=NO Default=YES MaxTime=INFINITE State=UP
# Active default partition: all z, w and y nodes defined above.
PartitionName=all_part Nodes=z[23-39],w[001-108],y[002-010,012-025,027-038,040-062,064-108,111-119,121-186] Shared=NO Default=YES MaxTime=INFINITE State=UP

Reply via email to