Hi Balaji –

 

We had the same experience with a nearly identical version upgrade and had to 
change our SelectTypeParameters value. We also had CR_Core_Memory prior to the 
upgrade. We found that SLURM would still schedule multiple jobs per node, but 
only for jobs that explicitly specified a memory value, and that isn't yet 
common practice among our users. So, until we change our users' behavior, we 
have switched the SLURM setting to CR_Core. Our full setting is:

 

SelectTypeParameters=CR_Core,CR_ONE_TASK_PER_CORE,CR_CORE_DEFAULT_DIST_BLOCK
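
For reference: under the CR_*_Memory options, a job submitted without a memory 
request is by default allocated the node's entire memory, which would explain 
why only jobs with an explicit request were packed together. A minimal sketch 
of a submission that can share a node under CR_Core_Memory (the script contents 
and values are placeholders):

#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=2048            # explicit per-node memory request in MB; without
                              # it the job is allocated the node's full memory
srun ./my_program             # placeholder executable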

 

You can read more here: http://slurm.schedmd.com/slurm.conf.html
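
If it helps to confirm what is actually being allocated, these standard Slurm 
commands show per-job memory (the job ID below is a placeholder):

# memory requested/allocated for one job
scontrol show job 12345 | grep -i mem

# per-job CPU and memory requests across the queue
squeue -o "%.8i %.9P %.10u %.6C %.8m %R"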

 

Best of luck,

 

--

Ed Swindelles

Manager of Advanced Computing

University of Connecticut

 

From: Balaji Deivam [mailto:balaji.dei...@seagate.com] 
Sent: Friday, September 16, 2016 4:45 PM
To: slurm-dev <slurm-dev@schedmd.com>
Subject: [slurm-dev] Slurm 15.08.12 - Issue after upgrading to 15.08 - only one 
job per node is running

 

Hello, 

 

It looks like only one job per node is running after the upgrade from 14.11 to 
15.08 this week. 

 

Below is the slurm.conf file; I am not sure whether any new parameter has to be 
defined to fix this. 

 

Could you please help me with this issue? 

 

NodeName=okdev1314 NodeAddr=10.9.137.84 SocketsPerBoard=4 CoresPerSocket=1 
ThreadsPerCore=1 State=UNKNOWN Gres=DIR_SIZE:2591387,Process:24 RealMemory=15947

NodeName=okdev1324 NodeAddr=10.9.137.94 SocketsPerBoard=4 CoresPerSocket=1 
ThreadsPerCore=1 State=UNKNOWN Gres=DIR_SIZE:1003126,Process:24 RealMemory=15947

NodeName=okdev1367 NodeAddr=10.9.137.167 SocketsPerBoard=2 CoresPerSocket=4 
ThreadsPerCore=2 State=UNKNOWN Gres=DIR_SIZE:1274,Process:24 RealMemory=775382

NodeName=okdev1368 NodeAddr=10.9.137.168 SocketsPerBoard=2 CoresPerSocket=4 
ThreadsPerCore=2 State=UNKNOWN Gres=DIR_SIZE:1274,Process:24 RealMemory=775382

NodeName=okdev1447 NodeAddr=10.9.137.207 SocketsPerBoard=2 CoresPerSocket=1 
ThreadsPerCore=1 State=UNKNOWN Gres=DIR_SIZE:708102,Process:24 RealMemory=15951

 

PartitionName=debug Nodes=okdev1324,okdev1314 Default=YES MaxTime=INFINITE 
State=UP shared=force:4

PartitionName=R Nodes=okdev1367,okdev1368  MaxTime=INFINITE State=UP 
shared=force:3

PartitionName=PY34 Nodes=okdev1368  MaxTime=INFINITE State=UP shared=force:4

PartitionName=PY27 Nodes=okdev1368  MaxTime=INFINITE State=UP shared=force:4

PartitionName=EM Nodes=okdev1447  MaxTime=INFINITE State=UP shared=force:4

PartitionName=admin Nodes=okdev1368,okdev1447,okdev1367,okdev1324,okdev1314  
MaxTime=INFINITE State=UP shared=force:5

 

Slurm.conf file:

 

 

okdev1314:/apps/slurm/etc/slurm>cat slurm.conf

# slurm.conf file generated by configurator.html.

# Put this file on all nodes of your cluster.

# See the slurm.conf man page for more information.

#

ControlMachine=okdev1315

ControlAddr=10.9.137.85

#BackupController=cloudlg016171

#BackupAddr=10.48.16.171

#

AuthType=auth/munge

CacheGroups=0

#CheckpointType=checkpoint/none

CryptoType=crypto/munge

#DisableRootJobs=NO

#EnforcePartLimits=NO

#Epilog=

#EpilogSlurmctld=

#FirstJobId=1

#MaxJobId=999999

GresTypes=DIR_SIZE,Process

#GroupUpdateForce=0

#GroupUpdateTime=600

#JobCheckpointDir=/var/slurm/checkpoint

#JobCredentialPrivateKey=

#JobCredentialPublicCertificate=

#JobFileAppend=0

#JobRequeue=1

#JobSubmitPlugins=1

#KillOnBadExit=0

#LaunchType=launch/slurm

#Licenses=foo*4,bar

#MailProg=/bin/mail

#MaxJobCount=10000

#MaxStepCount=40000

#MaxTasksPerNode=8

MpiDefault=none

#MpiParams=ports=#-#

#PluginDir=

#PlugStackConfig=

#PrivateData=jobs

ProctrackType=proctrack/linuxproc

#Prolog=

#PrologFlags=

#PrologSlurmctld=

#PropagatePrioProcess=0

#PropagateResourceLimits=

#PropagateResourceLimitsExcept=

#RebootProgram=

ReturnToService=1

#SallocDefaultCommand=

SlurmctldPidFile=/apps/slurm/var/run/slurmctld.pid

SlurmctldPort=6827

SlurmdPidFile=/apps/slurm/var/run/slurmd.pid

SlurmdPort=6828

SlurmdSpoolDir=/apps/slurm/var/spool/slurmd

SlurmUser=sassrv

SlurmdUser=sassrv

#SrunEpilog=

#SrunProlog=

StateSaveLocation=/apps/slurm/var/spool/slurmd

SwitchType=switch/none

#TaskEpilog=

TaskPlugin=task/none

#TaskPluginParam=

#TaskProlog=

#TopologyPlugin=topology/tree

#TmpFS=/tmp

#TrackWCKey=no

#TreeWidth=

#UnkillableStepProgram=

#UsePAM=0

#

#

# TIMERS

#BatchStartTimeout=10

#CompleteWait=0

#EpilogMsgTime=2000

#GetEnvTimeout=2

#HealthCheckInterval=0

#HealthCheckProgram=

InactiveLimit=0

KillWait=30

#MessageTimeout=10

#ResvOverRun=0

MinJobAge=600

#OverTimeLimit=0

SlurmctldTimeout=120

SlurmdTimeout=300

#UnkillableStepTimeout=60

#VSizeFactor=0

Waittime=0

#

#

# SCHEDULING

#DefMemPerCPU=0

FastSchedule=1

#MaxMemPerCPU=0

#SchedulerRootFilter=1

#SchedulerTimeSlice=30

SchedulerType=sched/backfill

SchedulerPort=7321

SelectType=select/cons_res

SelectTypeParameters=CR_Core_Memory,CR_LLN

SchedulerParameters=kill_invalid_depend,sched_interval=60,sched_max_job_start=12,sched_min_interval=500000,default_queue_depth=25

#

#

# JOB PRIORITY

#PriorityFlags=

#PriorityType=priority/basic

PriorityType=priority/multifactor

PriorityDecayHalfLife=00:30:00

PriorityCalcPeriod=3

#PriorityFavorSmall=

PriorityMaxAge=4-0

PriorityUsageResetPeriod=NOW

PriorityWeightAge=100000

PriorityWeightFairshare=500000

PriorityWeightJobSize=0

PriorityWeightPartition=0

PriorityWeightQOS=250000

#

#

# LOGGING AND ACCOUNTING

AccountingStorageEnforce=limits

AccountingStorageHost=okdev1315

AccountingStorageLoc=slurm

AccountingStoragePass=slurm

AccountingStoragePort=3306

AccountingStorageUser=slurm

AccountingStorageType=accounting_storage/mysql

AccountingStoreJobComment=YES

#AccountingStorageTRES=gres/dir_size

ClusterName=cluster

#DebugFlags=

#JobCompHost=

#JobCompLoc=

#JobCompPass=

#JobCompPort=

JobCompType=jobcomp/none

#JobCompUser=

#JobContainerType=job_container/none

JobAcctGatherFrequency=30

JobAcctGatherType=jobacct_gather/linux

SlurmctldDebug=3

SlurmctldLogFile=/apps/slurm/var/log/slurm/slurmctld.log

SlurmdDebug=3

SlurmdLogFile=/apps/slurm/var/log/slurm/slurmd_%h.log

#SlurmSchedLogFile=

#SlurmSchedLogLevel=

#

#

# POWER SAVE SUPPORT FOR IDLE NODES (optional)

#SuspendProgram=

#ResumeProgram=

#SuspendTimeout=

#ResumeTimeout=

#ResumeRate=

#SuspendExcNodes=

#SuspendExcParts=

#SuspendRate=

#SuspendTime=

#

#

# COMPUTE NODES 10.9.137.94

 

NodeName=okdev1314 NodeAddr=10.9.137.84 SocketsPerBoard=4 CoresPerSocket=1 
ThreadsPerCore=1 State=UNKNOWN Gres=DIR_SIZE:2591387,Process:24 RealMemory=15947

NodeName=okdev1324 NodeAddr=10.9.137.94 SocketsPerBoard=4 CoresPerSocket=1 
ThreadsPerCore=1 State=UNKNOWN Gres=DIR_SIZE:1003126,Process:24 RealMemory=15947

NodeName=okdev1367 NodeAddr=10.9.137.167 SocketsPerBoard=2 CoresPerSocket=4 
ThreadsPerCore=2 State=UNKNOWN Gres=DIR_SIZE:1274,Process:24 RealMemory=775382

NodeName=okdev1368 NodeAddr=10.9.137.168 SocketsPerBoard=2 CoresPerSocket=4 
ThreadsPerCore=2 State=UNKNOWN Gres=DIR_SIZE:1274,Process:24 RealMemory=775382

NodeName=okdev1447 NodeAddr=10.9.137.207 SocketsPerBoard=2 CoresPerSocket=1 
ThreadsPerCore=1 State=UNKNOWN Gres=DIR_SIZE:708102,Process:24 RealMemory=15951

 

PartitionName=debug Nodes=okdev1324,okdev1314 Default=YES MaxTime=INFINITE 
State=UP shared=force:4

PartitionName=R Nodes=okdev1367,okdev1368  MaxTime=INFINITE State=UP 
shared=force:3

PartitionName=PY34 Nodes=okdev1368  MaxTime=INFINITE State=UP shared=force:4

PartitionName=PY27 Nodes=okdev1368  MaxTime=INFINITE State=UP shared=force:4

PartitionName=EM Nodes=okdev1447  MaxTime=INFINITE State=UP shared=force:4

PartitionName=admin Nodes=okdev1368,okdev1447,okdev1367,okdev1324,okdev1314  
MaxTime=INFINITE State=UP shared=force:5

 

 

 

 

 

 

 

Thanks & Regards,

Balaji Deivam

Staff Analyst - Business Data Center

Seagate Technology - 389 Disc Drive, Longmont, CO 80503 | 720-684-3395
