Hi,

I'm currently testing slurm 2.5.6 with around 100 nodes configured to have 64 
slots each. When I use:

SchedulerType=sched/builtin

everything seems fine when running over 6000 jobs. However, when I try:

SchedulerType=sched/backfill

commands like "sinfo" and "qstat -q" time out after 100 seconds (I have 
MessageTimeout set to 100) and give errors like this:

[root@lcgvm24 ~]#  sinfo
sinfo: error: slurm_receive_msg: Socket timed out on send/recv operation
slurm_load_partitions: Socket timed out on send/recv operation

Job submission starts failing with a similar error:

sbatch: error: Batch job submission failed: Socket timed out on send/recv 
operation

Errors like this also appear in the slurmctld log file:

[2013-05-21T16:28:33+01:00] error: Munge decode failed: Expired credential
[2013-05-21T16:28:33+01:00] ENCODED: Tue May 21 16:23:16 2013

[2013-05-21T16:28:33+01:00] DECODED: Tue May 21 16:28:33 2013

[2013-05-21T16:28:33+01:00] error: authentication: Expired credential
[2013-05-21T16:28:33+01:00] error: slurm_receive_msg: Protocol authentication 
error
[2013-05-21T16:28:33+01:00] error: Munge decode failed: Expired credential

My full config file is shown below (*). I'm using test jobs which sleep for a 
random period of time between 600 and 6600 secs. I've noticed that if I 
increase them to sleep for a longer period of time, then slurm is fine until 
jobs start completing, at which point the problems above occur.

Is there anything I can do to make slurm more responsive when using 
sched/backfill?

Many Thanks,
Andrew.

(*)
#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName=ral_lcg2
ControlMachine=lcgvm24
#ControlAddr=
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/lib/slurm
SlurmdSpoolDir=/var/lib/slurmd
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
ProcTracktype=proctrack/linuxproc
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=2
MessageTimeout=100
MaxJobCount=30000
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFs=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
#SchedulerType=sched/builtin
#SchedulerParameters=defer,bf_interval=300
PriorityType=priority/multifactor
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType=select/cons_res
SelectTypeParameters=CR_Core
FastSchedule=2
PriorityDecayHalfLife=1440
PriorityCalcPeriod=10
PriorityUsageResetPeriod=NONE
PriorityMaxAge=2-0
PriorityWeightAge=100000
PriorityWeightFairshare=10000000
PriorityWeightJobSize=1
PriorityWeightPartition=1
PriorityWeightQOS=0
#
# LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurm/slurm.log
JobCompType=jobcomp/none
#JobCompLoc=
#
# ACCOUNTING
JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=lcgvm24
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
# COMPUTE NODES
NodeName=lcg0[729-842] Sockets=2 CoresPerSocket=32 ThreadsPerCore=1 RealMemory=16053 State=UNKNOWN
PartitionName=gridS Nodes=lcg0[729-842] Default=YES MaxTime=06:00:00 State=UP Shared=NO


-- 
Scanned by iCritical.

Reply via email to