The user is probably running within an existing salloc allocation. In
that case, srun would run a step within the existing job allocation
and sbatch would create a new job allocation.

Moe Jette
SchedMD

Quoting Sefa Arslan <[email protected]>:

>
> One of our users has the following problem.
> He submits his batch file via sbatch. The batch file is:
>
> #!/bin/bash
>
> #SBATCH -p partition-1
>
> #SBATCH -J test
>
> #SBATCH -N 2
>
> #SBATCH -n 48
>
> #SBATCH --output=job.%j.out
>
> #SBATCH --error=job.%j.err
>
> srun hostname
>
>
>
> The job is waiting due to resources (sometimes due to priority,
> although only 8%-10% of the partition is utilized):
>
> 138791 partition-1     test  mesahin  PD       0:00      2 (Resources)
>
>
> sprio:
>
> JOBID   PRIORITY        AGE  FAIRSHARE    JOBSIZE  PARTITION
>
> 138791       1990          0          0        991       1000
>
>
> But he is able to run the same job via srun
>
> srun -N 2 -n 48 -p partition-1 -J test hostname
>
>
>
> I cannot understand how a user is unable to run a job via sbatch while he
> can run the same job via srun. Do I have something wrong in my
> configuration?
> The Slurm version is slurm-2.4.1-1.
>
> PS: Only a few users have this problem. The others are able to run jobs
> both ways. And only 10% of the partition is utilized.
>
>
> scontrol show partition:
>
>      PartitionName=partition-1
>     AllocNodes=ALL AllowGroups=ALL Default=NO
>     DefaultTime=NONE DisableRootJobs=NO GraceTime=0 Hidden=NO
>     MaxNodes=UNLIMITED MaxTime=UNLIMITED MinNodes=1
>     Nodes=mercan[7-192]
>     Priority=1500 RootOnly=NO ReqResv=NO Shared=NO PreemptMode=GANG,SUSPEND
>     State=UP TotalCPUs=4464 TotalNodes=186 DefMemPerCPU=5000
> MaxMemPerNode=124800
>
>
> scontrol show config:
> Configuration data as of 2012-11-05T16:22:17
> AccountingStorageBackupHost = (null)
> AccountingStorageEnforce = associations,limits
> AccountingStorageHost   = mercan5
> AccountingStorageLoc    = N/A
> AccountingStoragePort   = 6819
> AccountingStorageType   = accounting_storage/slurmdbd
> AccountingStorageUser   = N/A
> AccountingStoreJobComment = YES
> AuthType                = auth/munge
> BackupAddr              = mercan6
> BackupController        = mercan6
> BatchStartTimeout       = 10 sec
> BOOT_TIME               = 2012-10-24T16:20:38
> CacheGroups             = 1
> CheckpointType          = checkpoint/none
> ClusterName             = linux
> CompleteWait            = 0 sec
> ControlAddr             = mercan5
> ControlMachine          = mercan5
> CryptoType              = crypto/munge
> DebugFlags              = NO_CONF_HASH
> DefMemPerNode           = UNLIMITED
> DisableRootJobs         = NO
> EnforcePartLimits       = YES
> Epilog                  = (null)
> EpilogMsgTime           = 2000 usec
> EpilogSlurmctld         = (null)
> FastSchedule            = 1
> FirstJobId              = 100000
> GetEnvTimeout           = 2 sec
> GresTypes               = gpu
> GroupUpdateForce        = 0
> GroupUpdateTime         = 600 sec
> HASH_VAL                = Match
> HealthCheckInterval     = 0 sec
> HealthCheckProgram      = (null)
> InactiveLimit           = 0 sec
> JobAcctGatherFrequency  = 30 sec
> JobAcctGatherType       = jobacct_gather/linux
> JobCheckpointDir        = /var/slurm/checkpoint
> JobCompHost             = localhost
> JobCompLoc              = /var/log/slurm/job_completions
> JobCompPort             = 0
> JobCompType             = jobcomp/filetxt
> JobCompUser             = root
> JobCredentialPrivateKey = (null)
> JobCredentialPublicCertificate = (null)
> JobFileAppend           = 0
> JobRequeue              = 1
> JobSubmitPlugins        = (null)
> KillOnBadExit           = 0
> KillWait                = 30 sec
> Licenses                = (null)
> MailProg                = /bin/mail
> MaxJobCount             = 1000000
> MaxJobId                = 4294901760
> MaxMemPerNode           = UNLIMITED
> MaxStepCount            = 40000
> MaxTasksPerNode         = 128
> MessageTimeout          = 10 sec
> MinJobAge               = 300 sec
> MpiDefault              = none
> MpiParams               = (null)
> NEXT_JOB_ID             = 138808
> OverTimeLimit           = 0 min
> PluginDir               = /usr/lib64/slurm
> PlugStackConfig         = /etc/slurm/plugstack.conf
> PreemptMode             = GANG,SUSPEND
> PreemptType             = preempt/partition_prio
> PriorityDecayHalfLife   = 00:00:00
> PriorityCalcPeriod      = 00:05:00
> PriorityFavorSmall      = 1
> PriorityFlags           = 0
> PriorityMaxAge          = 14-00:00:00
> PriorityUsageResetPeriod = NONE
> PriorityType            = priority/multifactor
> PriorityWeightAge       = 1000
> PriorityWeightFairShare = 10000
> PriorityWeightJobSize   = 1000
> PriorityWeightPartition = 1000
> PriorityWeightQOS       = 0
> PrivateData             = jobs,usage,users,accounts,reservations
> ProctrackType           = proctrack/cgroup
> Prolog                  = (null)
> PrologSlurmctld         = (null)
> PropagatePrioProcess    = 0
> PropagateResourceLimits = (null)
> PropagateResourceLimitsExcept = MEMLOCK
> RebootProgram           = (null)
> ReconfigFlags           = (null)
> ResumeProgram           = (null)
> ResumeRate              = 300 nodes/min
> ResumeTimeout           = 60 sec
> ResvOverRun             = 0 min
> ReturnToService         = 2
> SallocDefaultCommand    = (null)
> SchedulerParameters     = (null)
> SchedulerPort           = 7321
> SchedulerRootFilter     = 1
> SchedulerTimeSlice      = 30 sec
> SchedulerType           = sched/builtin
> SelectType              = select/cons_res
> SelectTypeParameters    = CR_CPU_MEMORY
> SlurmUser               = root(0)
> SlurmctldDebug          = debug3
> SlurmctldLogFile        = /var/log/slurm/slurmctld.log
> SlurmSchedLogFile       = (null)
> SlurmctldPort           = 6817
> SlurmctldTimeout        = 300 sec
> SlurmdDebug             = debug3
> SlurmdLogFile           = /var/log/slurm/slurmd.log
> SlurmdPidFile           = /var/run/slurmd.pid
> SlurmdPort              = 6818
> SlurmdSpoolDir          = /tmp/slurmd
> SlurmdTimeout           = 300 sec
> SlurmdUser              = root(0)
> SlurmSchedLogLevel      = 0
> SlurmctldPidFile        = /var/run/slurmctld.pid
> SLURM_CONF              = /etc/slurm/slurm.conf
> SLURM_VERSION           = 2.4.1
> SrunEpilog              = (null)
> SrunProlog              = (null)
> StateSaveLocation       = /home_palamut1/slurm/slurm.state
> SuspendExcNodes         = (null)
> SuspendExcParts         = (null)
> SuspendProgram          = (null)
> SuspendRate             = 60 nodes/min
> SuspendTime             = NONE
> SuspendTimeout          = 30 sec
> SwitchType              = switch/none
> TaskEpilog              = (null)
> TaskPlugin              = task/cgroup
> TaskPluginParam         = (null type)
> TaskProlog              = (null)
> TmpFS                   = /tmp
> TopologyPlugin          = topology/none
> TrackWCKey              = 0
> TreeWidth               = 50
> UsePam                  = 1
> UnkillableStepProgram   = (null)
> UnkillableStepTimeout   = 60 sec
> VSizeFactor             = 0 percent
> WaitTime                = 0 sec
>
> Slurmctld(primary/backup) at mercan5/mercan6 are UP/UP
>
>
>

Reply via email to