So I made the following changes to slurm.conf: SchedulerParameters=max_job_bf=100,interval=30, MinJobAge=20, MessageTimeout=30, and MaxJobCount=50000.
I also changed the parameter SlurmctldPort=6820-6823. This way slurmctld takes requests on multiple ports. Now when I launch jobs, they never run; they always stay in the Pending state. The contents of slurm.conf are below. Thanks in advance. # slurm.conf file generated by configurator.html. # Put this file on all nodes of your cluster. # See the slurm.conf man page for more information. # ControlMachine=z21 #ControlAddr= BackupController=z18 #BackupAddr= # AuthType=auth/none #AuthType=auth/munge CacheGroups=0 #CheckpointType=checkpoint/none CryptoType=crypto/munge #DisableRootJobs=NO #EnforcePartLimits=NO #Epilog= #PrologSlurmctld= #FirstJobId=1 #GresTypes= #GroupUpdateForce=0 #GroupUpdateTime=600 #JobCheckpointDir=/var/slurm/checkpoint #JobCredentialPrivateKey= #JobCredentialPublicCertificate= #JobFileAppend=0 #JobRequeue=1 #JobSubmitPlugins=1 #KillOnBadExit=0 #Licenses=foo*4,bar #MailProg=/bin/mail MaxJobCount=50000 #MaxTasksPerNode=128 MpiDefault=none #MpiParams=ports=#-# #PluginDir= #PlugStackConfig= #PrivateData=jobs #ProctrackType=proctrack/pgid ProctrackType=proctrack/linuxproc #Prolog= #PrologSlurmctld= #PropagatePrioProcess=0 #PropagateResourceLimits= #PropagateResourceLimitsExcept= ReturnToService=1 #SallocDefaultCommand= SlurmctldPidFile=/var/run/slurmctld.pid SlurmctldPort=6817-6820 SlurmdPidFile=/var/run/slurmd.pid SlurmdPort=6821 SlurmdSpoolDir=/tmp/slurmd SlurmUser=slurm #SrunEpilog= #SrunProlog= StateSaveLocation=/tmp SwitchType=switch/none #TaskEpilog= TaskPlugin=task/none #TaskPluginParam= #TaskProlog= #TopologyPlugin=topology/tree #TmpFs=/tmp #TrackWCKey=no #TreeWidth=327 #UnkillableStepProgram= #UsePAM=0 # # # TIMERS #BatchStartTimeout=10 #CompleteWait=0 #EpilogMsgTime=2000 #GetEnvTimeout=2 #HealthCheckInterval=0 #HealthCheckProgram= InactiveLimit=0 KillWait=30 MessageTimeout=30 #ResvOverRun=0 MinJobAge=20 #OverTimeLimit=0 SlurmctldTimeout=120 SlurmdTimeout=300 #UnkillableStepTimeout=60 #VSizeFactor=0 Waittime=0 # # # SCHEDULING #DefMemPerCPU=0 
FastSchedule=1 #MaxMemPerCPU=0 #SchedulerRootFilter=1 #SchedulerTimeSlice=30 SchedulerType=sched/backfill SchedulerPort=7321 SchedulerParameters=max_job_bf=100,interval=30 SelectType=select/linear #SelectType=select/cons_res #SelectTypeParameters=CR_CPU # # # JOB PRIORITY #PriorityType=priority/basic #PriorityDecayHalfLife= #PriorityCalcPeriod= #PriorityFavorSmall= #PriorityMaxAge= #PriorityUsageResetPeriod= #PriorityWeightAge= #PriorityWeightFairshare= #PriorityWeightJobSize= #PriorityWeightPartition= #PriorityWeightQOS= # # # LOGGING AND ACCOUNTING #AccountingStorageEnforce=0 AccountingStorageHost=z21 AccountingStorageLoc=slurm_job_acc AccountingStoragePass=slurm AccountingStoragePort=3306 AccountingStorageType=accounting_storage/mysql AccountingStorageUser=slurm ClusterName=cluster #DebugFlags= JobCompHost=z21 JobCompLoc=slurm_job_comp JobCompPass=slurm JobCompPort=3306 JobCompType=jobcomp/mysql JobCompUser=slurm JobAcctGatherFrequency=30 JobAcctGatherType=jobacct_gather/linux SlurmctldDebug=7 SlurmctldLogFile=/var/log/slurm/slurmctld.log SlurmdDebug=7 SlurmdLogFile=/var/log/slurm/slurmd.log.%h #SlurmSchedLogFile= #SlurmSchedLogLevel= # # # POWER SAVE SUPPORT FOR IDLE NODES (optional) #SuspendProgram= #ResumeProgram= #SuspendTimeout= #ResumeTimeout= #ResumeRate= #SuspendExcNodes= #SuspendExcParts= #SuspendRate= #SuspendTime= # # # COMPUTE NODES NodeName=z[23,24,26,28-39] Procs=4 State=UNKNOWN NodeName=z[25,27] Procs=2 State=UNKNOWN NodeName=obi[23-40] Procs=4 State=UNKNOWN NodeName=w[001-108] Procs=2 State=UNKNOWN NodeName=y[002-010,012-025,027-038] Procs=2 State=UNKNOWN NodeName=y[040-062,064-108,111-119,121-186] Procs=4 State=UNKNOWN PartitionName=z_part Nodes=z[23-39] Default=NO Shared=NO MaxTime=INFINITE State=UP #PartitionName=obi_part Nodes=obi[23-40] Default=NO Shared=NO MaxTime=INFINITE State=UP PartitionName=w_part Nodes=w[001-108] Default=NO Shared=NO MaxTime=INFINITE State=UP PartitionName=y_part 
Nodes=y[002-010,012-025,027-038,040-062,064-108,111-119,121-186] Default=NO Shared=NO MaxTime=INFINITE State=UP #PartitionName=all_part Nodes=w[001-108],y[2-10,12-25,27-38,41-62,64-108,111-119,121-186] Default=YES MaxTime=INFINITE State=UP #PartitionName=all_part Nodes=z[23-39,41-46],obi[23-40],w[1-108],y[2-10,12-25,27-38,41-62,64-108,111-119,121-186] Shared=NO Default=YES MaxTime=INFINITE State=UP PartitionName=all_part Nodes=z[23-39],w[001-108],y[002-010,012-025,027-038,040-062,064-108,111-119,121-186] Shared=NO Default=YES MaxTime=INFINITE State=UP
