We are running Maui 3.3.1. We have 648 nodes defined in our system, and each
node has 16 cores. However, we could not get jobs that request
-lnodes=400:ppn=16 to run. "checkjob" shows the following:

Holds:    Defer
Messages:  exceeds available partition procs
PE:  6400.00  StartPriority:  2478
cannot select job 1051 for partition DEFAULT (job hold active)

We don't have any partitions defined in Maui. Here are the main
configuration details:

[root@carter-adm hu8]# qstat -a 1051

carter-adm.rcac.purdue.edu:

                                                                         Req'd  Req'd   Elap
Job ID               Username Queue    Jobname          SessID NDS   TSK Memory Time  S Time
-------------------- -------- -------- ---------------- ------ ----- --- ------ ----- - -----
1051.carter-adm.     mluisier workq    P6400               --    400 640     --  02:00 Q   --

[root@carter-adm hu8]# checkjob 1051


checking job 1051

State: Idle
Creds:  user:mluisier  group:ece  class:workq  qos:DEFAULT
WallTime: 00:00:00 of 2:00:00
SubmitTime: Mon Nov 21 09:07:36
  (Time Queued  Total: 11:00:40:20  Eligible: 00:00:00)

Total Tasks: 6400

Req[0]  TaskCount: 6400  Partition: ALL
Network: [NONE]  Memory >= 0  Disk >= 0  Swap >= 0
Opsys: [NONE]  Arch: [NONE]  Features: [carter]


IWD: [NONE]  Executable:  [NONE]
Bypass: 0  StartCount: 0
PartitionMask: [ALL]
Flags:       RESTARTABLE

Holds:    Defer
Messages:  exceeds available partition procs
PE:  6400.00  StartPriority:  2483
cannot select job 1051 for partition DEFAULT (job hold active)


[root@carter-adm hu8]#

[root@carter-adm maui]# cat maui.cfg
# maui.cfg 3.3.1

SERVERHOST            carter-adm.rcac.purdue.edu
# primary admin must be first in list
ADMIN1                root

# Resource Manager Definition

RMCFG[CARTER-ADM.RCAC.PURDUE.EDU] TYPE=PBS

# Allocation Manager Definition

AMCFG[bank]  TYPE=NONE

# full parameter docs at http://supercluster.org/mauidocs/a.fparameters.html
# use the 'schedctl -l' command to display current configuration

RMPOLLINTERVAL        00:00:30

SERVERPORT            42559
SERVERMODE            NORMAL

# Admin: http://supercluster.org/mauidocs/a.esecurity.html


LOGFILE               maui.log
LOGFILEMAXSIZE        10000000
LOGLEVEL              3

# Job Priority: http://supercluster.org/mauidocs/5.1jobprioritization.html

QUEUETIMEWEIGHT       1
#QUEUETIME             0

# FairShare: http://supercluster.org/mauidocs/6.3fairshare.html

#FSPOLICY              PSDEDICATED
#FSDEPTH               7
#FSINTERVAL            86400
#FSDECAY               0.80

# Throttling Policies: http://supercluster.org/mauidocs/6.2throttlingpolicies.html

# NONE SPECIFIED

# Backfill: http://supercluster.org/mauidocs/8.2backfill.html

#BACKFILLPOLICY        FIRSTFIT
#RESERVATIONPOLICY     CURRENTHIGHEST

# Node Allocation: http://supercluster.org/mauidocs/5.2nodeallocation.html

NODEALLOCATIONPOLICY  MINRESOURCE

# QOS: http://supercluster.org/mauidocs/7.3qos.html

# QOSCFG[hi]  PRIORITY=100 XFTARGET=100 FLAGS=PREEMPTOR:IGNMAXJOB
# QOSCFG[low] PRIORITY=-1000 FLAGS=PREEMPTEE

# Standing Reservations: http://supercluster.org/mauidocs/7.1.3standingreservations.html

# SRSTARTTIME[test] 8:00:00
# SRENDTIME[test]   17:00:00
# SRDAYS[test]      MON TUE WED THU FRI
# SRTASKCOUNT[test] 20
# SRMAXTIME[test]   0:30:00

# Creds: http://supercluster.org/mauidocs/6.1fairnessoverview.html

# USERCFG[DEFAULT]      FSTARGET=25.0
# USERCFG[john]         PRIORITY=100  FSTARGET=10.0-
# GROUPCFG[staff]       PRIORITY=1000 QLIST=hi:low QDEF=hi
# CLASSCFG[batch]       FLAGS=PREEMPTEE
# CLASSCFG[interactive] FLAGS=PREEMPTOR

###### following added by Shaomin ######

CREDWEIGHT   1
CLASSWEIGHT   1
CLASSCFG[batch]       PRIORITY=10
CLASSCFG[test]        PRIORITY=1000 MAXPROC=32 MAXJOB=3

#CLASSCFG[test]        PRIORITY=1000 MAXPROC=32 MAXJOB=3 HOSTLIST=carter-a638,carter-a639

SRCFG[test1] ACCESS=DEDICATED
SRCFG[test1] PERIOD=INFINITY
SRCFG[test1] CLASSLIST=test
SRCFG[test1] HOSTLIST=carter-a638,carter-a639

#QOSWEIGHT   1
#QOSCFG[batch]       PRIORITY=10
#QOSCFG[test]        PRIORITY=1000
[root@carter-adm maui]#

[root@carter-adm maui]# qstat -Qf workq
Queue: workq
    queue_type = Execution
    total_jobs = 1
    state_count = Transit:0 Queued:1 Held:0 Waiting:0 Running:0 Exiting:0
    resources_max.neednodes = carter
    resources_max.walltime = 168:00:00
    resources_default.neednodes = carter
    resources_default.nodes = 1
    resources_default.walltime = 04:00:00
    mtime = 1322577475
    enabled = True
    started = True

[root@carter-adm maui]#

[root@carter-adm maui]# pbsnodes carter-a000
carter-a000
     state = free
     np = 16
     properties = carter
     ntype = cluster
     status = rectime=1322837521,varattr=,jobs=,state=free,netload=680204799,gres=,loadave=0.07,ncpus=16,physmem=32841344kb,availmem=48525956kb,totmem=49618552kb,idletime=82118,nusers=0,nsessions=0,uname=Linux carter-a000.rcac.purdue.edu 2.6.32-131.12.1.el6.x86_64 #1 SMP Sun Jul 31 16:44:56 EDT 2011 x86_64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0
[root@carter-adm maui]#


Node definition examples:

carter-a000 np=16 carter
.....
carter-a647 np=16 carter

We are able to run jobs smaller than -lnodes=256:ppn=16.

Thanks for any help.

Shaomin
_______________________________________________
mauiusers mailing list
[email protected]
http://www.supercluster.org/mailman/listinfo/mauiusers

Reply via email to