
Balaji,

Try adding a "DefMemPerCPU" statement in your partition definitions,
e.g.:

PartitionName=PY34 Nodes=okdev1368 DefMemPerCPU=512 MaxTime=INFINITE State=UP shared=force:4
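
If I'm reading your config right, the likely culprit is
SelectTypeParameters=CR_Core_Memory: with memory treated as a
consumable resource, a job that does not request memory is allocated
the node's entire memory (DefMemPerCPU defaults to 0, i.e. unlimited),
so nothing else can be scheduled alongside it, which would explain the
"one job per node" behavior.  A per-partition (or cluster-wide)
DefMemPerCPU gives such jobs a sane default instead.

Once the edited slurm.conf is on all nodes, something along these
lines should apply the change and let you confirm it (partition name
taken from your example above; adjust the sizes to taste):

scontrol reconfigure
scontrol show partition PY34    # the DefMemPerCPU value should show up
sbatch -p PY34 -n 2 --mem-per-cpu=512 --wrap="sleep 300"
sbatch -p PY34 -n 2 --mem-per-cpu=512 --wrap="sleep 300"
squeue -p PY34 -o "%.10i %.4C %.8m %R"   # both should now share okdev1368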

HTH,
John DeSantis

On 09/16/2016 04:44 PM, Balaji Deivam wrote:
> Hello,
> 
> It looks like only one job per node is running after the upgrade
> from *14.11 to 15.08* this week.
> 
> Below is the slurm.conf file; I am not sure whether any new
> parameter has to be defined to fix this.
> 
> Could you please help me on this issue?
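> 
> (In case it helps, this is how node usage can be inspected;
> okdev1368 is just one example:)
> 
> sinfo -N -l
> scontrol show node okdev1368
> squeue -w okdev1368 -o "%.10i %.9P %.8u %.4C %.8m %R"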
> 
> *Slurm.conf file:*
> 
> 
> okdev1314:/apps/slurm/etc/slurm>cat slurm.conf
> # slurm.conf file generated by configurator.html.
> # Put this file on all nodes of your cluster.
> # See the slurm.conf man page for more information.
> #
> ControlMachine=okdev1315
> ControlAddr=10.9.137.85
> #BackupController=cloudlg016171
> #BackupAddr=10.48.16.171
> #
> AuthType=auth/munge
> CacheGroups=0
> #CheckpointType=checkpoint/none
> CryptoType=crypto/munge
> #DisableRootJobs=NO
> #EnforcePartLimits=NO
> #Epilog=
> #EpilogSlurmctld=
> #FirstJobId=1
> #MaxJobId=999999
> GresTypes=DIR_SIZE,Process
> #GroupUpdateForce=0
> #GroupUpdateTime=600
> #JobCheckpointDir=/var/slurm/checkpoint
> #JobCredentialPrivateKey=
> #JobCredentialPublicCertificate=
> #JobFileAppend=0
> #JobRequeue=1
> #JobSubmitPlugins=1
> #KillOnBadExit=0
> #LaunchType=launch/slurm
> #Licenses=foo*4,bar
> #MailProg=/bin/mail
> #MaxJobCount=10000
> #MaxStepCount=40000
> #MaxTasksPerNode=8
> MpiDefault=none
> #MpiParams=ports=#-#
> #PluginDir=
> #PlugStackConfig=
> #PrivateData=jobs
> ProctrackType=proctrack/linuxproc
> #Prolog=
> #PrologFlags=
> #PrologSlurmctld=
> #PropagatePrioProcess=0
> #PropagateResourceLimits=
> #PropagateResourceLimitsExcept=
> #RebootProgram=
> ReturnToService=1
> #SallocDefaultCommand=
> SlurmctldPidFile=/apps/slurm/var/run/slurmctld.pid
> SlurmctldPort=6827
> SlurmdPidFile=/apps/slurm/var/run/slurmd.pid
> SlurmdPort=6828
> SlurmdSpoolDir=/apps/slurm/var/spool/slurmd
> SlurmUser=sassrv
> SlurmdUser=sassrv
> #SrunEpilog=
> #SrunProlog=
> StateSaveLocation=/apps/slurm/var/spool/slurmd
> SwitchType=switch/none
> #TaskEpilog=
> TaskPlugin=task/none
> #TaskPluginParam=
> #TaskProlog=
> #TopologyPlugin=topology/tree
> #TmpFS=/tmp
> #TrackWCKey=no
> #TreeWidth=
> #UnkillableStepProgram=
> #UsePAM=0
> #
> #
> # TIMERS
> #BatchStartTimeout=10
> #CompleteWait=0
> #EpilogMsgTime=2000
> #GetEnvTimeout=2
> #HealthCheckInterval=0
> #HealthCheckProgram=
> InactiveLimit=0
> KillWait=30
> #MessageTimeout=10
> #ResvOverRun=0
> MinJobAge=600
> #OverTimeLimit=0
> SlurmctldTimeout=120
> SlurmdTimeout=300
> #UnkillableStepTimeout=60
> #VSizeFactor=0
> Waittime=0
> #
> #
> # SCHEDULING
> #DefMemPerCPU=0
> FastSchedule=1
> #MaxMemPerCPU=0
> #SchedulerRootFilter=1
> #SchedulerTimeSlice=30
> SchedulerType=sched/backfill
> SchedulerPort=7321
> SelectType=select/cons_res
> SelectTypeParameters=CR_Core_Memory,CR_LLN
> SchedulerParameters=kill_invalid_depend,sched_interval=60,sched_max_job_start=12,sched_min_interval=500000,default_queue_depth=25
> #
> #
> # JOB PRIORITY
> #PriorityFlags=
> #PriorityType=priority/basic
> PriorityType=priority/multifactor
> PriorityDecayHalfLife=00:30:00
> PriorityCalcPeriod=3
> #PriorityFavorSmall=
> PriorityMaxAge=4-0
> PriorityUsageResetPeriod=NOW
> PriorityWeightAge=100000
> PriorityWeightFairshare=500000
> PriorityWeightJobSize=0
> PriorityWeightPartition=0
> PriorityWeightQOS=250000
> #
> #
> # LOGGING AND ACCOUNTING
> AccountingStorageEnforce=limits
> AccountingStorageHost=okdev1315
> AccountingStorageLoc=slurm
> AccountingStoragePass=slurm
> AccountingStoragePort=3306
> AccountingStorageUser=slurm
> AccountingStorageType=accounting_storage/mysql
> AccountingStoreJobComment=YES
> #AccountingStorageTRES=gres/dir_size
> ClusterName=cluster
> #DebugFlags=
> #JobCompHost=
> #JobCompLoc=
> #JobCompPass=
> #JobCompPort=
> JobCompType=jobcomp/none
> #JobCompUser=
> #JobContainerType=job_container/none
> JobAcctGatherFrequency=30
> JobAcctGatherType=jobacct_gather/linux
> SlurmctldDebug=3
> SlurmctldLogFile=/apps/slurm/var/log/slurm/slurmctld.log
> SlurmdDebug=3
> SlurmdLogFile=/apps/slurm/var/log/slurm/slurmd_%h.log
> #SlurmSchedLogFile=
> #SlurmSchedLogLevel=
> #
> #
> # POWER SAVE SUPPORT FOR IDLE NODES (optional)
> #SuspendProgram=
> #ResumeProgram=
> #SuspendTimeout=
> #ResumeTimeout=
> #ResumeRate=
> #SuspendExcNodes=
> #SuspendExcParts=
> #SuspendRate=
> #SuspendTime=
> #
> #
> # COMPUTE NODES
> 
> NodeName=okdev1314 NodeAddr=10.9.137.84 SocketsPerBoard=4 CoresPerSocket=1 ThreadsPerCore=1 State=UNKNOWN Gres=DIR_SIZE:2591387,Process:24 RealMemory=15947
> NodeName=okdev1324 NodeAddr=10.9.137.94 SocketsPerBoard=4 CoresPerSocket=1 ThreadsPerCore=1 State=UNKNOWN Gres=DIR_SIZE:1003126,Process:24 RealMemory=15947
> NodeName=okdev1367 NodeAddr=10.9.137.167 SocketsPerBoard=2 CoresPerSocket=4 ThreadsPerCore=2 State=UNKNOWN Gres=DIR_SIZE:1274,Process:24 RealMemory=775382
> NodeName=okdev1368 NodeAddr=10.9.137.168 SocketsPerBoard=2 CoresPerSocket=4 ThreadsPerCore=2 State=UNKNOWN Gres=DIR_SIZE:1274,Process:24 RealMemory=775382
> NodeName=okdev1447 NodeAddr=10.9.137.207 SocketsPerBoard=2 CoresPerSocket=1 ThreadsPerCore=1 State=UNKNOWN Gres=DIR_SIZE:708102,Process:24 RealMemory=15951
> 
> PartitionName=debug Nodes=okdev1324,okdev1314 Default=YES MaxTime=INFINITE State=UP shared=force:4
> PartitionName=R Nodes=okdev1367,okdev1368 MaxTime=INFINITE State=UP shared=force:3
> PartitionName=PY34 Nodes=okdev1368 MaxTime=INFINITE State=UP shared=force:4
> PartitionName=PY27 Nodes=okdev1368 MaxTime=INFINITE State=UP shared=force:4
> PartitionName=EM Nodes=okdev1447 MaxTime=INFINITE State=UP shared=force:4
> PartitionName=admin Nodes=okdev1368,okdev1447,okdev1367,okdev1324,okdev1314 MaxTime=INFINITE State=UP shared=force:5
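> 
> (For reference, the per-node CPU and memory figures can be
> cross-checked against what slurmd detects locally on each box:
> 
> slurmd -C
> 
> which prints the NodeName=... CPUs=... RealMemory=... line slurmd
> itself would report.)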
> 
> Thanks & Regards,
> Balaji Deivam
> Staff Analyst - Business Data Center
> Seagate Technology - 389 Disc Drive, Longmont, CO 80503 | 720-684-3395
> 
