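Memory specification is per node, not per job. With --mem=240000 you are asking for 240000 MB on each of the two nodes, which exceeds both the 128GB installed per node and the partition's MaxMemPerNode=124800. Keep --mem=120000 for the two-node job. See the srun man page for details.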

Quoting Sefa Arslan <[email protected]>:

>
> I have the following problem: each node I send my jobs to has 48 cores and
> 128GB of memory.
>
> I can run the following job:
>
> srun -N1 -n48 --mem=120000   -p cuda01 hostname
>
>
> I can also run the same job on each node by adding the --nodelist parameter
> to the above line.
>
> But when I double the size of the job, I get an error:
>
> srun -n96 -N2 --mem=240000   -p cuda01 hostname
>
> srun: error: Unable to allocate resources: Memory required by task is not available
>
>
> I have pasted the configurations below. What might be wrong with them?
>
> Thanks in advance.
>
>
>
> The partition info:
>
> PartitionName=cuda01
>
>     AllocNodes=ALL AllowGroups=ALL Default=NO
>
>     DefaultTime=NONE DisableRootJobs=NO GraceTime=0 Hidden=NO
>
>     MaxNodes=UNLIMITED MaxTime=UNLIMITED MinNodes=1
>
>     Nodes=lufer[119-120]
>
>     Priority=1500 RootOnly=NO ReqResv=NO Shared=FORCE:4 PreemptMode=GANG,SUSPEND
>
>     State=UP TotalCPUs=96 TotalNodes=2 DefMemPerCPU=2600 MaxMemPerNode=124800
>
>
> The controller config:
>
> Configuration data as of 2012-10-09T09:45:25
>
> AccountingStorageBackupHost = (null)
>
> AccountingStorageEnforce = associations,limits
>
> AccountingStorageHost   = mercan5
>
> AccountingStorageLoc    = N/A
>
> AccountingStoragePort   = 6819
>
> AccountingStorageType   = accounting_storage/slurmdbd
>
> AccountingStorageUser   = N/A
>
> AccountingStoreJobComment = YES
>
> AuthType                = auth/munge
>
> BackupAddr              = mercan6
>
> BackupController        = mercan6
>
> BatchStartTimeout       = 10 sec
>
> BOOT_TIME               = 2012-10-09T08:47:48
>
> CacheGroups             = 1
>
> CheckpointType          = checkpoint/none
>
> ClusterName             = linux
>
> CompleteWait            = 0 sec
>
> ControlAddr             = mercan5
>
> ControlMachine          = mercan5
>
> CryptoType              = crypto/munge
>
> DebugFlags              = NO_CONF_HASH
>
> DefMemPerNode           = UNLIMITED
>
> DisableRootJobs         = NO
>
> EnforcePartLimits       = YES
>
> Epilog                  = (null)
>
> EpilogMsgTime           = 2000 usec
>
> EpilogSlurmctld         = (null)
>
> FastSchedule            = 1
>
> FirstJobId              = 100000
>
> GetEnvTimeout           = 2 sec
>
> GresTypes               = gpu
>
> GroupUpdateForce        = 0
>
> GroupUpdateTime         = 600 sec
>
> HASH_VAL                = Different Ours=0x6f4a260a Slurmctld=0xed126f52
>
> HealthCheckInterval     = 0 sec
>
> HealthCheckProgram      = (null)
>
> InactiveLimit           = 0 sec
>
> JobAcctGatherFrequency  = 30 sec
>
> JobAcctGatherType       = jobacct_gather/linux
>
> JobCheckpointDir        = /var/slurm/checkpoint
>
> JobCompHost             = localhost
>
> JobCompLoc              = /var/log/slurm/job_completions
>
> JobCompPort             = 0
>
> JobCompType             = jobcomp/filetxt
>
> JobCompUser             = root
>
> JobCredentialPrivateKey = (null)
>
> JobCredentialPublicCertificate = (null)
>
> JobFileAppend           = 0
>
> JobRequeue              = 1
>
> JobSubmitPlugins        = (null)
>
> KillOnBadExit           = 0
>
> KillWait                = 30 sec
>
> Licenses                = (null)
>
> MailProg                = /bin/mail
>
> MaxJobCount             = 1000000
>
> MaxJobId                = 4294901760
>
> MaxMemPerNode           = UNLIMITED
>
> MaxStepCount            = 40000
>
> MaxTasksPerNode         = 128
>
> MessageTimeout          = 10 sec
>
> MinJobAge               = 300 sec
>
> MpiDefault              = none
>
> MpiParams               = (null)
>
> NEXT_JOB_ID             = 124381
>
> OverTimeLimit           = 0 min
>
> PluginDir               = /usr/lib64/slurm
>
> PlugStackConfig         = /etc/slurm/plugstack.conf
>
> PreemptMode             = GANG,SUSPEND
>
> PreemptType             = preempt/partition_prio
>
> PriorityDecayHalfLife   = 00:00:00
>
> PriorityCalcPeriod      = 00:05:00
>
> PriorityFavorSmall      = 1
>
> PriorityFlags           = 0
>
> PriorityMaxAge          = 14-00:00:00
>
> PriorityUsageResetPeriod = NONE
>
> PriorityType            = priority/multifactor
>
> PriorityWeightAge       = 1000
>
> PriorityWeightFairShare = 10000
>
> PriorityWeightJobSize   = 1000
>
> PriorityWeightPartition = 1000
>
> PriorityWeightQOS       = 0
>
> PrivateData             = jobs,usage,users,accounts,reservations
>
> ProctrackType           = proctrack/cgroup
>
> Prolog                  = (null)
>
> PrologSlurmctld         = (null)
>
> PropagatePrioProcess    = 0
>
> PropagateResourceLimits = (null)
>
> PropagateResourceLimitsExcept = MEMLOCK
>
> RebootProgram           = (null)
>
> ReconfigFlags           = (null)
>
> ResumeProgram           = (null)
>
> ResumeRate              = 300 nodes/min
>
> ResumeTimeout           = 60 sec
>
> ResvOverRun             = 0 min
>
> ReturnToService         = 2
>
> SallocDefaultCommand    = (null)
>
> SchedulerParameters     = (null)
>
> SchedulerPort           = 7321
>
> SchedulerRootFilter     = 1
>
> SchedulerTimeSlice      = 30 sec
>
> SchedulerType           = sched/builtin
>
> SelectType              = select/cons_res
>
> SelectTypeParameters    = CR_CPU_MEMORY
>
> SlurmUser               = root(0)
>
> SlurmctldDebug          = debug3
>
> SlurmctldLogFile        = /var/log/slurm/slurmctld.log
>
> SlurmSchedLogFile       = (null)
>
> SlurmctldPort           = 6817
>
> SlurmctldTimeout        = 300 sec
>
> SlurmdDebug             = debug3
>
> SlurmdLogFile           = /var/log/slurm/slurmd.log
>
> SlurmdPidFile           = /var/run/slurmd.pid
>
> SlurmdPort              = 6818
>
> SlurmdSpoolDir          = /tmp/slurmd
>
> SlurmdTimeout           = 300 sec
>
> SlurmdUser              = root(0)
>
> SlurmSchedLogLevel      = 0
>
> SlurmctldPidFile        = /var/run/slurmctld.pid
>
> SLURM_CONF              = /etc/slurm/slurm.conf
>
> SLURM_VERSION           = 2.4.1
>
> SrunEpilog              = (null)
>
> SrunProlog              = (null)
>
> StateSaveLocation       = /home_palamut1/slurm/slurm.state
>
> SuspendExcNodes         = (null)
>
> SuspendExcParts         = (null)
>
> SuspendProgram          = (null)
>
> SuspendRate             = 60 nodes/min
>
> SuspendTime             = NONE
>
> SuspendTimeout          = 30 sec
>
> SwitchType              = switch/none
>
> TaskEpilog              = (null)
>
> TaskPlugin              = task/cgroup
>
> TaskPluginParam         = (null type)
>
> TaskProlog              = (null)
>
> TmpFS                   = /tmp
>
> TopologyPlugin          = topology/none
>
> TrackWCKey              = 0
>
> TreeWidth               = 50
>
> UsePam                  = 1
>
> UnkillableStepProgram   = (null)
>
> UnkillableStepTimeout   = 60 sec
>
> VSizeFactor             = 0 percent
>
> WaitTime                = 0 sec
>

Reply via email to