Dear Torque & Maui users,
I'm cross-posting this to the maui and torque users lists as I'm not sure which of the two is causing the problem. I'm trying to make three separate queues for the three different types of node on our cluster, using a combination of qmgr commands of the form:
set queue firstgen resources_default.neednodes = firstgennodes
set node compute-0-0 properties = firstgennodes
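For reference, I'm submitting to the new queues with commands along these lines (myjob.cmd is just a placeholder for the real script; the second form is, as I understand it, the explicit long-hand of what the queue's neednodes default should be adding):
qsub -q firstgen myjob.cmd
qsub -q firstgen -l nodes=1:firstgennodes myjob.cmd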
I've found a fair few previous emails about this and have followed their solutions without success. I can submit the jobs to the queues, but they remain in a queued state. qstat -f gives:
[root@che-hydra /]# qstat -f 48691
Job Id: 48691.che-hydra.bham.ac.uk
Job_Name = allenega_p2000_g300_r2.cmd
Job_Owner = [email protected]
resources_used.cput = 529:06:32
resources_used.mem = 16872kb
resources_used.vmem = 245908kb
resources_used.walltime = 529:11:31
job_state = R
queue = default
server = che-hydra.bham.ac.uk
Checkpoint = u
ctime = Wed Sep 21 10:45:00 2011
Error_Path = che-hydra.bham.ac.uk:/home/jsmale/allenega/allenega_p2000_g30
0_r2.cmd.e48691
exec_host = compute-0-16/1
Hold_Types = n
Join_Path = n
Keep_Files = n
Mail_Points = a
mtime = Wed Sep 21 10:45:01 2011
Output_Path = che-hydra.bham.ac.uk:/home/jsmale/allenega/allenega_p2000_g3
00_r2.cmd.o48691
Priority = 0
qtime = Wed Sep 21 10:45:00 2011
Rerunable = True
Resource_List.neednodes = 1
Resource_List.nodect = 1
Resource_List.nodes = 1
session_id = 20687
substate = 42
Variable_List = PBS_O_HOME=/home/jsmale,PBS_O_LANG=en_US.iso885915,
PBS_O_LOGNAME=jsmale,
PBS_O_PATH=/home/jsmale/mctdh90.svn/bin/x86_64:/home/jsmale/mctdh90.s
vn/bin:/usr/lib64/openmpi/1.3.2-gcc/bin:/usr/kerberos/bin:/usr/java/la
test/bin:/usr/local/bin:/bin:/usr/bin:/opt/maui/bin:/opt/torque/bin:/o
pt/torque/sbin:/usr/share/pvm3/pvm3//bin/LINUX64:/opt/rocks/bin:/opt/r
ocks/sbin:/global64/pgi/linux86-64/10.4/bin:/user/worth/gaussian/bin:/
user/jsmale/bin:/home/gaussian/bin:~/mctdh90.svn/bin:/home/jsmale/bin,
PBS_O_MAIL=/var/spool/mail/jsmale,PBS_O_SHELL=/bin/bash,
PBS_SERVER=che-hydra.bham.ac.uk,PBS_O_HOST=che-hydra.bham.ac.uk,
PBS_O_WORKDIR=/home/jsmale/allenega,PBS_O_QUEUE=default
euser = jsmale
egroup = worth
hashname = 48691.che-hydra.bham.ac.uk
queue_rank = 35491
queue_type = E
etime = Wed Sep 21 10:45:00 2011
submit_args = allenega_p2000_g300_r2.cmd
start_time = Wed Sep 21 10:45:01 2011
start_count = 1
and checkjob gives the following:
[root@che-hydra /]# checkjob 48691
checking job 48691
State: Running
Creds: user:jsmale group:worth class:default qos:DEFAULT
WallTime: 22:01:12:16 of 99:23:59:59
SubmitTime: Wed Sep 21 10:45:00
(Time Queued Total: 00:00:01 Eligible: 00:00:00)
StartTime: Wed Sep 21 10:45:01
Total Tasks: 1
Req[0] TaskCount: 1 Partition: DEFAULT
Network: [NONE] Memory >= 0 Disk >= 0 Swap >= 0
Opsys: [NONE] Arch: [NONE] Features: [NONE]
NodeCount: 1
Allocated Nodes:
[compute-0-16:1]
IWD: [NONE] Executable: [NONE]
Bypass: 0 StartCount: 1
PartitionMask: [ALL]
Flags: RESTARTABLE
Reservation '48691' ( -INFINITY -> 77:22:47:56 Duration: 99:23:59:59)
PE: 1.00 StartPriority: 20
I'm not sure why the job isn't running; there doesn't seem to be any reason given in either the Maui or Torque (server and MOM) logs. Could anyone help me decipher the cause? The server configuration follows.
Top of maui.cfg file:
# maui.cfg 3.2.6p20
SERVERHOST che-hydra.bham.ac.uk
# primary admin must be first in list
ADMIN1 root
# Resource Manager Definition
RMCFG[che-hydra.bham.ac.uk] TYPE=PBS
# Allocation Manager Definition
AMCFG[bank] TYPE=NONE
# full parameter docs at http://supercluster.org/mauidocs/a.fparameters.html
# use the 'schedctl -l' command to display current configuration
RMPOLLINTERVAL 00:00:30
SERVERPORT 42559
SERVERMODE NORMAL
# Admin: http://supercluster.org/mauidocs/a.esecurity.html
LOGFILE maui.log
LOGFILEMAXSIZE 100000000
LOGLEVEL 3
# Setting up node information for throttling policies
#
NODECFG[compute-0-0] SPEED=1 MAXJOB=4 nodetype=firstgennodes
NODECFG[compute-0-1] SPEED=1 MAXJOB=4 nodetype=firstgennodes
NODECFG[compute-0-2] SPEED=1 MAXJOB=4 nodetype=firstgennodes
NODECFG[compute-0-3] SPEED=1 MAXJOB=4 nodetype=firstgennodes
NODECFG[compute-0-4] SPEED=1.2 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-5] SPEED=1.2 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-6] SPEED=1.2 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-7] SPEED=1.2 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-8] SPEED=1.4 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-9] SPEED=1.4 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-10] SPEED=1.4 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-11] SPEED=1.5 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-12] SPEED=1.5 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-13] SPEED=1.5 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-14] SPEED=1.5 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-15] SPEED=1.5 MAXJOB=8 nodetype=secondgennodes
NODECFG[compute-0-16] SPEED=1.7 MAXJOB=16 nodetype=thirdgennodes
NODECFG[compute-0-17] SPEED=1.7 MAXJOB=16 nodetype=thirdgennodes
NODECFG[compute-0-18] SPEED=1.7 MAXJOB=16 nodetype=thirdgennodes
NODECFG[compute-0-19] SPEED=1.7 MAXJOB=16 nodetype=thirdgennodes
# Setting up queue information to allow allocation to specific types of nodes via queues
CLASSCFG[firstgen] hostlist = compute-0-0,compute-0-1,compute-0-2,compute-0-3
CLASSCFG[secondgen] hostlist = compute-0-4,compute-0-5,compute-0-6,compute-0-7,compute-0-8,compute-0-9,compute-0-10,compute-0-11,compute-0-12,compute-0-13,compute-0-14,compute-0-15
CLASSCFG[thirdgen] hostlist = compute-0-16,compute-0-17,compute-0-18,compute-0-19
# Backfill: http://supercluster.org/mauidocs/8.2backfill.html
BACKFILLPOLICY FIRSTFIT
RESERVATIONPOLICY CURRENTHIGHEST
# Node Allocation: http://supercluster.org/mauidocs/5.2nodeallocation.html
NODEALLOCATIONPOLICY CPULOAD
Some Torque settings that might be of use:
[root@che-hydra]# pbsnodes    (output truncated; one example of each type of node shown)
compute-0-0
state = free
np = 4
properties = firstgennodes
ntype = cluster
status = opsys=linux,uname=Linux compute-0-0.local 2.6.18-164.6.1.el5 #1
SMP Tue Nov 3 16:12:36 EST 2009 x86_64,sessions=? 15201,nsessions=?
15201,nusers=0,idletime=13287377,totmem=9195716kb,availmem=8926420kb,physmem=8175600kb,ncpus=4,loadave=0.00,netload=54589282244,state=free,jobs=,varattr=,rectime=1318502240
compute-0-4
state = free
np = 8
properties = secondgennodes
ntype = cluster
status = opsys=linux,uname=Linux compute-0-4.local 2.6.18-164.6.1.el5 #1
SMP Tue Nov 3 16:12:36 EST 2009 x86_64,sessions=? 15201,nsessions=?
15201,nusers=0,idletime=21840103,totmem=17464156kb,availmem=17170748kb,physmem=16444040kb,ncpus=8,loadave=0.00,netload=494140575539,state=free,jobs=,varattr=,rectime=1318502242
compute-0-16
state = free
np = 16
properties = thirdgennodes
ntype = cluster
jobs = 0/48738.che-hydra.bham.ac.uk, 1/48691.che-hydra.bham.ac.uk,
3/48693.che-hydra.bham.ac.uk
status = opsys=linux,uname=Linux compute-0-16.local 2.6.18-164.6.1.el5 #1
SMP Tue Nov 3 16:12:36 EST 2009 x86_64,sessions=6691 20687
20764,nsessions=3,nusers=2,idletime=7342084,totmem=17461096kb,availmem=8761384kb,physmem=16440980kb,ncpus=16,loadave=3.08,netload=647310098799,state=free,jobs=48691.che-hydra.bham.ac.uk
48693.che-hydra.bham.ac.uk
48738.che-hydra.bham.ac.uk,varattr=,rectime=1318502257
[root@che-hydra]# qmgr -c "p s"
#
# Create queues and set their attributes.
#
#
# Create and define queue default
#
create queue default
set queue default queue_type = Execution
set queue default Priority = 100
set queue default resources_default.nodes = 1
set queue default enabled = True
set queue default started = True
#
# Create and define queue secondgen
#
create queue secondgen
set queue secondgen queue_type = Execution
set queue secondgen Priority = 100
set queue secondgen acl_host_enable = False
set queue secondgen acl_hosts = che-hydra+localhost
set queue secondgen resources_default.neednodes = secondgennodes
set queue secondgen resources_default.nodes = 1
set queue secondgen enabled = True
set queue secondgen started = True
#
# Create and define queue thirdgen
#
create queue thirdgen
set queue thirdgen queue_type = Execution
set queue thirdgen Priority = 100
set queue thirdgen acl_host_enable = False
set queue thirdgen acl_hosts = che-hydra+localhost
set queue thirdgen resources_default.neednodes = thirdgennodes
set queue thirdgen resources_default.nodes = 1
set queue thirdgen enabled = True
set queue thirdgen started = True
#
# Create and define queue firstgen
#
create queue firstgen
set queue firstgen queue_type = Execution
set queue firstgen Priority = 100
set queue firstgen acl_host_enable = False
set queue firstgen acl_hosts = che-hydra+localhost
set queue firstgen resources_default.neednodes = firstgennodes
set queue firstgen resources_default.nodes = 1
set queue firstgen enabled = True
set queue firstgen started = True
#
# Set server attributes.
#
set server scheduling = True
set server acl_host_enable = False
set server acl_hosts = che-hydra.bham.ac.uk
set server default_queue = default
set server log_events = 511
set server mail_from = adm
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server auto_node_np = True
set server next_job_number = 49702
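(In case it's relevant: I set the node properties through qmgr rather than by editing Torque's server_priv/nodes file directly, but as I understand it the equivalent entry there would be one line per node of the form below, with compute-0-0 shown as an example.)
compute-0-0 np=4 firstgennodes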
Jonathan Smale