Hi,
I set the Maui log level to 5 (written to maui.log) to try to figure out
which resource limit was being violated, causing my job to be killed. The
PBS script has the key line:
#PBS -l nodes=1:ppn=16
The openpbs/torque nodes file lists np=16 for the node that the
job was sent to, yet I see the following event in maui.log killing the
job:
MSysRegEvent(JOBRESVIOLATION: job '850416' in state 'Running' has
exceeded PROC resource limit (1618 > 100) (action CANCEL will be
taken)
What parameter (or lack thereof) is causing this to happen?
### maui.cfg ###
SERVERHOST pirserver
ADMIN1 root
RMCFG[PIRANHA] TYPE=PBS
AMCFG[bank] TYPE=NONE
RMPOLLINTERVAL 00:00:30
SERVERPORT 42559
SERVERMODE NORMAL
LOGFILE maui.log
LOGFILEMAXSIZE 10000000
LOGLEVEL 5
QUEUETIMEWEIGHT 1
FSPOLICY DEDICATEDPES%
FSINTERVAL 24:00:00
FSDEPTH 14
FSDECAY 0.85
FSWEIGHT 10
FSACCOUNTWEIGHT 1000
FSGROUPWEIGHT 300
FSUSERWEIGHT 300
RESCAP 10000
RESWEIGHT 20
PROCWEIGHT 200
NODEWEIGHT 20
MEMWEIGHT 0
BACKFILLPOLICY FIRSTFIT
RESERVATIONPOLICY CURRENTHIGHEST
NODEALLOCATIONPOLICY MINRESOURCE
CLASSCFG[DEFAULT] MAXIJOB=2000
ACCOUNTCFG[DEFAULT] MAXPROC=200
ACCOUNTCFG[DEFAULT] MAXIPROC=200
ACCOUNTCFG[DEFAULT] MAXIPROC=200
ACCOUNTCFG[DEFAULT] MAXJOB=200
ACCOUNTCFG[DEFAULT] MAXIJOB=200
ACCOUNTCFG[DEFAULT] MAXPS=34560000
ACCOUNTCFG[DEFAULT] MAXIPS=34560000
USERCFG[DEFAULT] FSTARGET=10
USERCFG[DEFAULT] MAXPROC=100
USERCFG[DEFAULT] MAXIPROC=100
USERCFG[DEFAULT] MAXJOB=100
USERCFG[DEFAULT] MAXIJOB=100
USERCFG[DEFAULT] MAXIPS=17280000
REJECTNEGPRIOJOBS FALSE
ENABLENEGJOBPRIORITY TRUE
ACCOUNTCFG[lab1_acct] FSTARGET=32
GROUPCFG[lab1] ADEF=lab1_acct
ACCOUNTCFG[lab2_acct] FSTARGET=8
GROUPCFG[lab2] ADEF=lab2_acct
ACCOUNTCFG[lab3_acct] FSTARGET=16
GROUPCFG[lab3] ADEF=lab3_acct
ACCOUNTCFG[lab4_acct] FSTARGET=44
GROUPCFG[lab4] ADEF=lab4_acct
ENFORCERESOURCELIMITS ON
RESOURCELIMITPOLICY MEM:ALWAYS:CANCEL PROC:ALWAYS:CANCEL
NODEMAXLOAD 20.0
NODELOADPOLICY ADJUSTSTATE
NODECFG[pir1] PROCSPEED=2930 SPEED=1.00
NODECFG[pir2] PROCSPEED=2930 SPEED=1.00
NODECFG[pir3] PROCSPEED=2930 SPEED=1.00
NODECFG[pir4] PROCSPEED=2930 SPEED=1.00
NODECFG[pir5] PROCSPEED=2930 SPEED=1.00
NODECFG[pir6] PROCSPEED=2930 SPEED=1.00
NODECFG[pir7] PROCSPEED=2930 SPEED=1.00
NODECFG[pir8] PROCSPEED=2930 SPEED=1.00
NODECFG[pir9] PROCSPEED=2930 SPEED=1.00
NODECFG[pir10] PROCSPEED=2930 SPEED=1.00
NODECFG[pir11] PROCSPEED=2930 SPEED=1.00
NODECFG[pir12] PROCSPEED=2930 SPEED=1.00
NODECFG[pir13] PROCSPEED=2930 SPEED=1.00
NODECFG[pir14] PROCSPEED=2930 SPEED=1.00
NODECFG[pir15] PROCSPEED=2930 SPEED=1.00
NODECFG[pir16] PROCSPEED=2930 SPEED=1.00
NODECFG[pir17] PROCSPEED=2930 SPEED=1.00
NODECFG[pir18] PROCSPEED=2930 SPEED=1.00
NODECFG[pir19] PROCSPEED=2930 SPEED=1.00
NODECFG[pir20] PROCSPEED=2930 SPEED=1.00
NODECFG[pir21] PROCSPEED=2930 SPEED=1.00
NODECFG[pir22] PROCSPEED=2930 SPEED=1.00
NODECFG[pir23] PROCSPEED=2930 SPEED=1.00
NODECFG[pir24] PROCSPEED=2930 SPEED=1.00
NODECFG[pir25] PROCSPEED=2930 SPEED=1.00
###
### openpbs queue config ###
Queue pir
queue_type = Execution
total_jobs = 0
state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0
max_running = 400
resources_default.neednodes = pir
resources_default.nodes = 1
acl_group_enable = True
acl_groups = lab4
acl_group_sloppy = True
mtime = Thu Dec 17 15:40:47 2009
resources_assigned.mem = 0b
resources_assigned.nodect = 0
enabled = True
started = True
###
### openpbs server attributes config ###
set server scheduling = True
set server acl_hosts = pirserver
set server managers = r...@pirserver
set server operators = r...@pirserver
set server default_queue = pir
set server log_events = 511
set server mail_from = root
set server query_other_jobs = True
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server mom_job_sync = True
set server keep_completed = 300
set server next_job_number = 850417
###
Thanks,
Sabuj Pattanayek
_______________________________________________
mauiusers mailing list
[email protected]
http://www.supercluster.org/mailman/listinfo/mauiusers