Hi all,


I'm having a very strange problem:
a single network has 2 slurm clusters:
slurm01 is responsible for single big SMP node bignode
slurmhpc is responsible for many weaker nodes node[001-200]

both can ping each other, resolve correctly (dns seems correct, nslookup works on all)

slurmhpc is working correctly (no issues).
slurm01 will not manage bignode.
scontrol ping from bignode returns slurm01 as its primary, but scontrol ping from slurm01 returns slurmhpc as its primary (munge keys are different for the 2 clusters).
slurm01 uses accounting (slurmdbd) and it used to manage all nodes, but they have all been removed from slurm.conf
slurmhpc doesn't use any accounting, and its slurm.conf doesn't have slurm01 or bignode.

slurm01 resolves slurm01 and slurmhpc correctly (nslookup, ping) but for whatever reason keeps trying to connect to slurmhpc as its primary.

scontrol reconfig doesn't help,
service slurm stop ; service slurm startclean doesn't help.

The slurm.conf for slurm01 is included below:

# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
# Host that runs the slurmctld controller daemon for this cluster.
# NOTE(review): this file names slurm01 as the controller. If client commands
# run on slurm01 report a different primary, they are presumably reading some
# other slurm.conf (a different default path, or SLURM_CONF set in the
# environment) - verify which file is actually loaded.
ControlMachine=slurm01
#ControlAddr=
#BackupController=
#BackupAddr=
#
# Authenticate communication between Slurm components with MUNGE.
AuthType=auth/munge
CacheGroups=0
# Checkpointing is disabled (the BLCR alternative is left commented out).
CheckpointType=checkpoint/none
#CheckpointType=checkpoint/blcr
# Job credentials are signed with MUNGE as well.
CryptoType=crypto/munge
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobCheckpointDir=/var/slurm/checkpoint
# NOTE(review): CheckpointType is checkpoint/none above, so this directory is
# presumably unused - confirm before relying on it.
JobCheckpointDir=/slurm-tmp/checkpoint
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=1
#KillOnBadExit=0
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=5000
#MaxTasksPerNode=128
# No default MPI plugin; jobs must request one explicitly if needed.
MpiDefault=none
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
# Track the processes of a job by process group ID.
ProctrackType=proctrack/pgid
#Prolog=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
# 2 = a DOWN node returns to service as soon as it registers with a valid
# configuration (see the slurm.conf man page for the exact conditions).
ReturnToService=2
#SallocDefaultCommand=
SlurmctldPidFile=/var/run/slurmctld.pid
# Port the controller (slurmctld) listens on.
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
# Port the compute-node daemon (slurmd) listens on.
SlurmdPort=6818
SlurmdSpoolDir=/slurm-tmp/slurmd
# Account the controller daemon runs as.
SlurmUser=slurm
#SrunEpilog=
#SrunProlog=
# Directory where the controller saves its state.
# NOTE(review): the path name suggests temporary storage; confirm it survives
# reboots and is not shared with another cluster's controller.
StateSaveLocation=/slurm-tmp
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/none
#TaskPluginParam=
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFs=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
# Values in this section are in seconds unless the slurm.conf man page says
# otherwise for a given key.
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
# 0 disables the inactivity check for job allocations.
InactiveLimit=0
# Grace period between SIGTERM and SIGKILL once a job hits its time limit.
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
# How long a completed job's record is kept by the controller before purging.
MinJobAge=300
#OverTimeLimit=0
# How long a backup controller waits for the primary before taking over.
# No BackupController is configured in this file, so presumably inert - confirm.
SlurmctldTimeout=120
# How long the controller waits for a slurmd to respond before marking the
# node DOWN.
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
# 0 = schedule against the resources each node actually reports rather than
# only the values written in this file.
# NOTE(review): combined with select/cons_res below, the configured CPU count
# presumably still has to match the hardware - check the man page.
FastSchedule=0
#MaxMemPerCPU=0
#SchedulerRootFilter=1
#SchedulerTimeSlice=30
# Backfill scheduler: lower-priority jobs may start early when that does not
# delay higher-priority ones.
SchedulerType=sched/backfill
# NOTE(review): this port is presumably only consulted by external (wiki-style)
# schedulers and unused with sched/backfill - confirm.
SchedulerPort=7321
# Allocate consumable resources within a node instead of whole nodes.
SelectType=select/cons_res
#SelectTypeParameters=
#
#
# JOB PRIORITY
#PriorityType=priority/basic
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
# Enforce the limits stored in the accounting database on jobs.
AccountingStorageEnforce=limits
# The accounting service is contacted on this same machine.
AccountingStorageHost=localhost
AccountingStorageLoc=slurm_acct_db
#AccountingStoragePass=
AccountingStoragePort=6819
# Accounting goes through the slurmdbd daemon; the commented alternatives
# below are a direct database port and no accounting at all.
AccountingStorageType=accounting_storage/slurmdbd
#AccountingStoragePort=3306
#AccountingStorageType=accounting_storage/none
# Name this cluster is known by in accounting.
# NOTE(review): "cluster" is a generic value; confirm it matches the cluster
# registered in slurmdbd and is unique among clusters using that database.
ClusterName=cluster
AccountingStorageUser=slurm
#DebugFlags=
#JobCompHost=
# Job completion records are appended to this plain-text file.
JobCompLoc=/home/slurm/log/JobComp.log
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/filetxt
#JobCompUser=
# Sampling interval, in seconds, for per-job accounting data.
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
# Numeric verbosity levels and log destinations for the two daemons.
# NOTE(review): both logs live under /slurm-tmp, the same directory used as
# StateSaveLocation; confirm that is intended.
SlurmctldDebug=3
SlurmctldLogFile=/slurm-tmp/Slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/slurm-tmp/Slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
#

# The only node defined in this file: 4 sockets x 4 cores x 2 threads
# = 32 logical CPUs. State=UNKNOWN leaves the node's state to be established
# when its slurmd registers with the controller.
NodeName=bignode Sockets=4 CoresPerSocket=4 ThreadsPerCore=2 State=UNKNOWN
# Single default partition containing only bignode, with no job time limit.
PartitionName=bignode Nodes=bignode Default=YES MaxTime=INFINITE State=UP

Reply via email to