The squeue command should report the reason why the jobs are not running; for pending jobs the reason appears in the NODELIST(REASON) column of the default output.
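
For example, something along these lines should show why each job is still pending (the job ID below is a placeholder, and the exact format specifiers may vary slightly by Slurm version):

        # List pending jobs with the reason each one is waiting
        squeue --state=PENDING --format="%.10i %.9P %.20j %.8u %.10M %r"

        # Or inspect a single queued job and look at the Reason= field
        scontrol show job <jobid>

Typical reasons such as Resources, Priority, or ReqNodeNotAvail usually point at what is keeping the second node from being used.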

Quoting Tal Hazan <[email protected]>:

> Hi,
>
> We have two nodes, one in MIXED state and the second in IDLE state.
> Currently no tasks are running on the second node and no errors are
> showing up.
>
> Below are the submission script we use and our Slurm configuration:
>
> Currently over 1,000 jobs are queued.
>
> Submission script:
>         sbatch $ADDITIONAL --mail-type=FAIL [email protected] -J $jname_static --partition=regs <<-EOF
>         #!/bin/bash
>         #SBATCH --get-user-env
>         #SBATCH --cpu_bind=cores
>         #SBATCH --distribution=cyclic
>         #SBATCH --partition=regs
>         #SBATCH --time=0
>         srun -q -o out_%j_%t --cpu_bind=cores --ntasks-per-core=1 --ntasks=1 $*
>         EOF
>
> Show config:
> Configuration data as of 2012-05-10T20:31:02
> AccountingStorageBackupHost = (null)
> AccountingStorageEnforce = none
> AccountingStorageHost   = localhost
> AccountingStorageLoc    = N/A
> AccountingStoragePort   = 6819
> AccountingStorageType   = accounting_storage/slurmdbd
> AccountingStorageUser   = N/A
> AuthType                = auth/munge
> BackupAddr              = tlvhpcbcm2
> BackupController        = tlvhpcbcm2
> BatchStartTimeout       = 10 sec
> BOOT_TIME               = 2012-05-10T20:27:01
> CacheGroups             = 1
> CheckpointType          = checkpoint/none
> ClusterName             = slurm_cluster
> CompleteWait            = 0 sec
> ControlAddr             = tlvhpcbcm1
> ControlMachine          = tlvhpcbcm1
> CryptoType              = crypto/munge
> DebugFlags              = (null)
> DefMemPerCPU            = UNLIMITED
> DisableRootJobs         = NO
> EnforcePartLimits       = NO
> Epilog                  = (null)
> EpilogMsgTime           = 2000 usec
> EpilogSlurmctld         = (null)
> FastSchedule            = 0
> FirstJobId              = 1
> GetEnvTimeout           = 2 sec
> GresTypes               = gpu
> GroupUpdateForce        = 0
> GroupUpdateTime         = 600 sec
> HashVal                 = Match
> HealthCheckInterval     = 0 sec
> HealthCheckProgram      = (null)
> InactiveLimit           = 0 sec
> JobAcctGatherFrequency  = 30 sec
> JobAcctGatherType       = jobacct_gather/linux
> JobCheckpointDir        = /var/slurm/checkpoint
> JobCompHost             = localhost
> JobCompLoc              = /tmp/slurmCompLog
> JobCompPort             = 0
> JobCompType             = jobcomp/none
> JobCompUser             = root
> JobCredentialPrivateKey = (null)
> JobCredentialPublicCertificate = (null)
> JobFileAppend           = 0
> JobRequeue              = 1
> JobSubmitPlugins        = (null)
> KillOnBadExit           = 0
> KillWait                = 30 sec
> Licenses                = vcsruntime*6
> MailProg                = /bin/mail
> MaxJobCount             = 10000
> MaxMemPerCPU            = UNLIMITED
> MaxTasksPerNode         = 12
> MessageTimeout          = 10 sec
> MinJobAge               = 300 sec
> MpiDefault              = none
> MpiParams               = (null)
> NEXT_JOB_ID             = 18656
> OverTimeLimit           = 0 min
> PluginDir               = /cm/shared/apps/slurm/2.2.7/lib64/slurm
> PlugStackConfig         = /etc/slurm/plugstack.conf
> PreemptMode             = REQUEUE
> PreemptType             = preempt/partition_prio
> PriorityType            = priority/basic
> PrivateData             = none
> ProctrackType           = proctrack/pgid
> Prolog                  = (null)
> PrologSlurmctld         = /cm/local/apps/cmd/scripts/prolog
> PropagatePrioProcess    = 0
> PropagateResourceLimits = ALL
> PropagateResourceLimitsExcept = (null)
> ResumeProgram           = (null)
> ResumeRate              = 300 nodes/min
> ResumeTimeout           = 60 sec
> ResvOverRun             = 0 min
> ReturnToService         = 1
> SallocDefaultCommand    = (null)
> SchedulerParameters     = (null)
> SchedulerPort           = 7321
> SchedulerRootFilter     = 1
> SchedulerTimeSlice      = 30 sec
> SchedulerType           = sched/backfill
> SelectType              = select/cons_res
> SelectTypeParameters    = CR_CORE
> SlurmUser               = slurm(117)
> SlurmctldDebug          = 3
> SlurmctldLogFile        = /var/log/slurmctld
> SlurmSchedLogFile       = (null)
> SlurmctldPort           = 6817
> SlurmctldTimeout        = 20 sec
> SlurmdDebug             = 3
> SlurmdLogFile           = /var/log/slurmd
> SlurmdPidFile           = /var/run/slurmd.pid
> SlurmdPort              = 6818
> SlurmdSpoolDir          = /cm/local/apps/slurm/2.2.4/spool
> SlurmdTimeout           = 20 sec
> SlurmdUser              = root(0)
> SlurmSchedLogLevel      = 0
> SlurmctldPidFile        = /var/run/slurmctld.pid
> SLURM_CONF              = /etc/slurm/slurm.conf
> SLURM_VERSION           = 2.2.7
> SrunEpilog              = (null)
> SrunProlog              = (null)
> StateSaveLocation       = /cm/shared/apps/slurm/current/cm/statesave
> SuspendExcNodes         = (null)
> SuspendExcParts         = (null)
> SuspendProgram          = (null)
> SuspendRate             = 60 nodes/min
> SuspendTime             = NONE
> SuspendTimeout          = 30 sec
> SwitchType              = switch/none
> TaskEpilog              = (null)
> TaskPlugin              = task/affinity
> TaskPluginParam         = (null type)
> TaskProlog              = (null)
> TmpFS                   = /tmp
> TopologyPlugin          = topology/none
> TrackWCKey              = 0
> TreeWidth               = 50
> UsePam                  = 0
> UnkillableStepProgram   = (null)
> UnkillableStepTimeout   = 60 sec
> VSizeFactor             = 0 percent
> WaitTime                = 0 sec
>
> [root@tlvhpc root]# sinfo -l
> Thu May 10 20:31:19 2012
> PARTITION AVAIL  TIMELIMIT   JOB_SIZE ROOT SHARE     GROUPS  NODES      STATE NODELIST
> defq*        up   infinite 1-infinite   no    NO        all      1  allocated node001
> defq*        up   infinite 1-infinite   no    NO        all      1       idle node002
> regs         up   infinite 1-infinite   no    NO        all      1  allocated node001
> regs         up   infinite 1-infinite   no    NO        all      1       idle node002
>
> Scheduling pool data:
> -------------------------------------------------------------
> Pool        Memory  Cpus  Total Usable   Free  Other Traits
> -------------------------------------------------------------
> defq*      96865Mb    12      2      2      1
> regs       96865Mb    12      2      2      1
>
> slurm.conf:
>
> ClusterName=SLURM_CLUSTER
> #ControlAddr=
> #BackupAddr=
> #
> SlurmUser=slurm
> #SlurmdUser=root
> SlurmctldPort=6817
> SlurmdPort=6818
> AuthType=auth/munge
> #JobCredentialPrivateKey=
> #JobCredentialPublicCertificate=
> StateSaveLocation=/cm/shared/apps/slurm/current/cm/statesave
> SlurmdSpoolDir=/cm/local/apps/slurm/2.2.4/spool
> SwitchType=switch/none
> MpiDefault=none
> SlurmctldPidFile=/var/run/slurmctld.pid
> SlurmdPidFile=/var/run/slurmd.pid
> ProctrackType=proctrack/pgid
> #PluginDir=
> CacheGroups=1
> #FirstJobId=
> ReturnToService=1
> #MaxJobCount=
> #PlugStackConfig=
> #PropagatePrioProcess=
> #PropagateResourceLimits=
> #PropagateResourceLimitsExcept=
> #Prolog=
> PrologSlurmctld=/cm/local/apps/cmd/scripts/prolog
> #Epilog=
> #SrunProlog=
> #SrunEpilog=
> #TaskProlog=
> #TaskEpilog=
> TaskPlugin=task/affinity
> TaskPluginParam=sched
> #TaskPlugin=task/none
> #TaskPluginParam=Cores
> #TrackWCKey=no
> #TreeWidth=50
> #TmpFs=
> #UsePAM=
> #
> # TIMERS
> SlurmctldTimeout=20
> SlurmdTimeout=20
> InactiveLimit=0
> MinJobAge=300
> KillWait=30
> Waittime=0
> #
> # SCHEDULING
> #SchedulerAuth=
> #SchedulerPort=
> #SchedulerRootFilter=
> FastSchedule=0
> #PriorityType=priority/multifactor
> #PriorityDecayHalfLife=14-0
> #PriorityUsageResetPeriod=14-0
> #PriorityWeightFairshare=100000
> #PriorityWeightAge=1000
> #PriorityWeightPartition=10000
> #PriorityWeightJobSize=1000
> #PriorityMaxAge=1-0
> #
> # LOGGING
> SlurmctldDebug=3
> SlurmctldLogFile=/var/log/slurmctld
> SlurmdDebug=3
> SlurmdLogFile=/var/log/slurmd
> JobCompType=jobcomp/none
> JobCompLoc=/tmp/slurmCompLog
> #
> # ACCOUNTING
> JobAcctGatherType=jobacct_gather/linux
> JobAcctGatherFrequency=30
> #
> AccountingStorageType=accounting_storage/slurmdbd
> # AccountingStorageHost=localhost
> # AccountingStorageLoc=slurm_acct_db
> # AccountingStoragePass=SLURMDBD_USERPASS
> # AccountingStorageUser=slurm
> #
> # GENERIC RESOURCES
> GresTypes=gpu
> # BEGIN AUTOGENERATED SECTION -- DO NOT REMOVE
> # Scheduler
> SchedulerType=sched/backfill
> # Master nodes
> ControlMachine=tlvhpcbcm1
> ControlAddr=tlvhpcbcm1
> BackupController=tlvhpcbcm2
> BackupAddr=tlvhpcbcm2
> # Nodes
> PartitionName=defq Nodes=node[001,002] Default=YES MinNodes=1 MaxNodes=UNLIMITED MaxTime=UNLIMITED AllowGroups=ALL Priority=10 DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO
> PartitionName=regs Nodes=node[001,002] Default=NO MinNodes=1 MaxNodes=UNLIMITED MaxTime=UNLIMITED AllowGroups=ALL Priority=5 DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO
> # END AUTOGENERATED SECTION   -- DO NOT REMOVE
> # Plugins:
> SelectType=select/cons_res
> SelectTypeParameters=CR_Core
> MaxTasksPerNode=12
> NodeName=node[001,002] Procs=12 Sockets=2 CoresPerSocket=6 ThreadsPerCore=2 RealMemory=96865 TmpDisk=1922
> Licenses=vcsruntime*6
> PreemptType=preempt/partition_prio
> PreemptMode=REQUEUE
>
>
>
> Best Regards,
>
> Tal Hazan, IT Specialist
> DigitalOptics Corporation Israel Ltd.
> www.doc.com
> Mobile: +972-54-332-3338
> Desk:     +972-732-404-777
> 6a Habarzel st. Tel Aviv, 69710 Israel
>
