Hi,

You have set MaxMemPerCPU (16000 MB) lower than the 36G per CPU you are
requesting with --mem-per-cpu. Try raising that limit (or lowering your
request) and check if that solves the issue.
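For example (a minimal sketch; the 40960 value is only illustrative, in MB,
pick whatever fits your nodes), you could raise the limit in slurm.conf:

    MaxMemPerCPU=40960

then apply it on the controller and verify the active settings:

    scontrol reconfigure
    scontrol show config | grep -i MemPerCPU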

Regards,
Carlos

On Tue, Nov 1, 2016 at 10:27 PM, Chad Cropper <[email protected]>
wrote:

> SBATCH submissions are not using the --mem-per-cpu option for
> scheduling purposes. Also, the AllocMem value shown by scontrol show nodes
> reports DefMemPerCPU * the number of CPUs per task.
>
>
>
> My submit script options:
>
> #!/bin/bash
>
> #SBATCH -M cluster
>
> #SBATCH -A account
>
> #SBATCH --mail-type=end
>
> #SBATCH --mail-user=
>
> #SBATCH -J job1
>
> #SBATCH -e err.log
>
> #SBATCH -o out.log
>
> #SBATCH -p normal
>
> #SBATCH --time=24:00:00
>
> #SBATCH --begin=now
>
> #SBATCH -n 1
>
> #SBATCH -c 4
>
> #SBATCH --mem-per-cpu=36G
>
> export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
>
>
>
> My slurm.conf file:
>
> [root@titan-master1 ~]# cat /etc/slurm/slurm.conf | grep -v "#"
>
> ClusterName=titan
>
> SlurmUser=slurm
>
> SlurmctldPort=6817
>
> SlurmdPort=6818
>
> AuthType=auth/munge
>
> StateSaveLocation=/cm/shared/apps/slurm/var/cm/statesave
>
> SlurmdSpoolDir=/cm/local/apps/slurm/var/spool
>
> SwitchType=switch/none
>
> MpiDefault=none
>
> SlurmctldPidFile=/var/run/slurmctld.pid
>
> SlurmdPidFile=/var/run/slurmd.pid
>
> ProctrackType=proctrack/cgroup
>
> CacheGroups=0
>
> ReturnToService=2
>
> TaskPlugin=task/cgroup
>
> SlurmctldTimeout=300
>
> SlurmdTimeout=300
>
> InactiveLimit=0
>
> MinJobAge=300
>
> KillWait=30
>
> Waittime=0
>
> DefMemPerCPU=1024
>
> MaxMemPerCPU=16000
>
> SelectType=select/cons_res
>
> SelectTypeParameters=CR_Core_Memory
>
> PriorityType=priority/multifactor
>
> PriorityWeightFairshare=0
>
> PriorityWeightAge=1000
>
> PriorityWeightQOS=1000
>
> PriorityMaxAge=3-0
>
> PriorityWeightJobSize=1000
>
> PriorityFavorSmall=NO
>
> PriorityWeightPartition=10000
>
> SlurmctldDebug=3
>
> SlurmctldLogFile=/var/log/slurmctld
>
> SlurmdDebug=3
>
> SlurmdLogFile=/var/log/slurmd
>
>
>
>
>
> JobAcctGatherType=jobacct_gather/linux
>
> JobAcctGatherFrequency=task=30,network=30,filesystem=30
>
> AccountingStorageType=accounting_storage/slurmdbd
>
> AccountingStorageHost=usfit-hpcc-slurm1.global.internal
>
> AccountingStorageUser=slurm
>
> AccountingStorageLoc=slurm_acct_db
>
> AccountingStoragePass=kol2oja3vCkMAUrB
>
> AccountingStorageEnforce=limits
>
> PrivateData=cloud,nodes,reservations
>
>
>
> SchedulerType=sched/backfill
>
> ControlMachine=titan-master1
>
> ControlAddr=titan-master1
>
> NodeName=tnode03  Procs=32 CoresPerSocket=8 RealMemory=515800 Sockets=4
> ThreadsPerCore=1
>
> NodeName=tnode[01,02]  Procs=8 CoresPerSocket=4 RealMemory=96505 Sockets=2
> ThreadsPerCore=1
>
> PartitionName=low Default=YES MinNodes=1 AllowGroups=ALL Priority=1
> DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0
> PreemptMode=OFF ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO
> ExclusiveUser=NO State=UP Nodes=tnode[01-03]
>
> PartitionName=normal Default=NO MinNodes=1 AllowGroups=ALL Priority=5
> DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0
> PreemptMode=OFF ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO
> ExclusiveUser=NO Nodes=tnode[01-03]
>
> PartitionName=prd Default=NO MinNodes=1 AllowGroups=ALL Priority=15
> DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0
> PreemptMode=OFF ReqResv=NO AllowAccounts=picprd,dairyprd,beefprd,sc
> AllowQos=ALL LLN=NO ExclusiveUser=NO State=UP Nodes=tnode[01-03]
>
> PartitionName=long Default=NO MinNodes=1 AllowGroups=ALL Priority=1
> DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0
> PreemptMode=OFF ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO
> ExclusiveUser=NO State=UP Nodes=tnode[01-03]
>
> PartitionName=short Default=NO MinNodes=1 AllowGroups=ALL Priority=1
> DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0
> PreemptMode=OFF ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO
> ExclusiveUser=NO State=UP Nodes=tnode[01-03]
>
> GresTypes=gpu,mic
>
> PrologSlurmctld=/cm/local/apps/cmd/scripts/prolog-prejob
>
> Prolog=/cm/local/apps/cmd/scripts/prolog
>
> Epilog=/cm/local/apps/cmd/scripts/epilog
>
> FastSchedule=0
>
> SuspendTimeout=30
>
> ResumeTimeout=60
>
> SuspendProgram=/cm/local/apps/cluster-tools/wlm/scripts/slurmpoweroff
>
> ResumeProgram=/cm/local/apps/cluster-tools/wlm/scripts/slurmpoweron
>
>
>
>
>



--
Carles Fenoy
