Hi,
I've set these additional resource maximums and minimums on the pir queue and
am also using ncpus=16 in the PBS script:
resources_max.ncpus = 100
resources_max.nodes = 12
resources_min.ncpus = 1
resources_min.nodes = 1
resources_default.ncpus = 1
resources_default.nodes = 1
This is what checkjob shows:
Req[0] TaskCount: 16 Partition: DEFAULT
Network: [NONE] Memory >= 0 Disk >= 0 Swap >= 0
Opsys: [NONE] Arch: [NONE] Features: [pir]
Dedicated Resources Per Task: PROCS: 1 MEM: 125M
Allocated Nodes:
[pir25:16]
This is what tracejob shows:
exec_host=pir25/15+pir25/14+pir25/13+pir25/12+pir25/11+pir25/10+pir25/9+pir25/8+pir25/7+pir25/6+pir25/5+pir25/4+pir25/3+pir25/2+pir25/1+pir25/0
Resource_List.mem=2000mb
Resource_List.ncpus=16 Resource_List.neednodes=pir25:ppn=16
Resource_List.nodect=1
Resource_List.nodes=1:ppn=16
Resource_List.walltime=00:10:00
12/23/2009 11:51:29 S Job deleted at request of r...@pirserver
12/23/2009 11:51:29 S Job sent signal SIGTERM on delete
12/23/2009 11:51:29 S Exit_status=143 resources_used.cput=00:03:56
resources_used.mem=3632kb resources_used.vmem=161840kb
resources_used.walltime=00:00:31
12/23/2009 11:51:29 A requestor=r...@pirserver
12/23/2009 11:51:29 A user=someUser group=lab4 jobname=nq2.pbs
queue=pir ctime=1261590657 qtime=1261590657 etime=1261590657
start=1261590658
owner=someu...@pirserver
exec_host=pir25/15+pir25/14+pir25/13+pir25/12+pir25/11+pir25/10+pir25/9+pir25/8+pir25/7+pir25/6+pir25/5+pir25/4+pir25/3+pir25/2+pir25/1+pir25/0
Resource_List.mem=2000mb
Resource_List.ncpus=16 Resource_List.neednodes=1:ppn=16
Resource_List.nodect=1
Resource_List.nodes=1:ppn=16
Resource_List.walltime=00:10:00 session=15719 end=1261590689
Exit_status=143
resources_used.cput=00:03:56
resources_used.mem=3632kb resources_used.vmem=161840kb
resources_used.walltime=00:00:31
and again here's the error:
12/23 11:49:00 INFO: job 850422 exceeds requested proc limit (15.86 > 1.00)
Any ideas on why these jobs keep getting killed?
On Wed, Dec 23, 2009 at 10:37 AM, Sabuj Pattanayek <[email protected]> wrote:
> Hi,
>
> I set the maui.log file to level 5 to try to figure out which resource
> limit was being violated causing my job to be killed. The PBS script
> has the key line:
>
> #PBS -l nodes=1:ppn=16
>
> The openpbs/torque node file has listed np=16 for the node that the
> job was sent to yet I see the following event in maui.log killing the
> job:
>
> MSysRegEvent(JOBRESVIOLATION: job '850416' in state 'Running' has
> exceeded PROC resource limit (1618 > 100) (action CANCEL will be
> taken)
>
> What parameter (or lack thereof) is causing this to happen?
>
> ### maui.cfg ###
>
> SERVERHOST pirserver
> ADMIN1 root
> RMCFG[PIRANHA] TYPE=PBS
> AMCFG[bank] TYPE=NONE
> RMPOLLINTERVAL 00:00:30
> SERVERPORT 42559
> SERVERMODE NORMAL
> LOGFILE maui.log
> LOGFILEMAXSIZE 10000000
> LOGLEVEL 5
> QUEUETIMEWEIGHT 1
> FSPOLICY DEDICATEDPES%
> FSINTERVAL 24:00:00
> FSDEPTH 14
> FSDECAY 0.85
> FSWEIGHT 10
> FSACCOUNTWEIGHT 1000
> FSGROUPWEIGHT 300
> FSUSERWEIGHT 300
> RESCAP 10000
> RESWEIGHT 20
> PROCWEIGHT 200
> NODEWEIGHT 20
> MEMWEIGHT 0
> BACKFILLPOLICY FIRSTFIT
> RESERVATIONPOLICY CURRENTHIGHEST
> NODEALLOCATIONPOLICY MINRESOURCE
> CLASSCFG[DEFAULT] MAXIJOB=2000
> ACCOUNTCFG[DEFAULT] MAXPROC=200
> ACCOUNTCFG[DEFAULT] MAXIPROC=200
> ACCOUNTCFG[DEFAULT] MAXIPROC=200
> ACCOUNTCFG[DEFAULT] MAXJOB=200
> ACCOUNTCFG[DEFAULT] MAXIJOB=200
> ACCOUNTCFG[DEFAULT] MAXPS=34560000
> ACCOUNTCFG[DEFAULT] MAXIPS=34560000
> USERCFG[DEFAULT] FSTARGET=10
> USERCFG[DEFAULT] MAXPROC=100
> USERCFG[DEFAULT] MAXIPROC=100
> USERCFG[DEFAULT] MAXJOB=100
> USERCFG[DEFAULT] MAXIJOB=100
> USERCFG[DEFAULT] MAXIPS=17280000
> REJECTNEGPRIOJOBS FALSE
> ENABLENEGJOBPRIORITY TRUE
> ACCOUNTCFG[lab1_acct] FSTARGET=32
> GROUPCFG[lab1] ADEF=lab1_acct
> ACCOUNTCFG[lab2_acct] FSTARGET=8
> GROUPCFG[lab2] ADEF=lab2_acct
> ACCOUNTCFG[lab3_acct] FSTARGET=16
> GROUPCFG[lab3] ADEF=lab3_acct
> ACCOUNTCFG[lab4_acct] FSTARGET=44
> GROUPCFG[lab4] ADEF=lab4_acct
> ENFORCERESOURCELIMITS ON
> RESOURCELIMITPOLICY MEM:ALWAYS:CANCEL PROC:ALWAYS:CANCEL
> NODEMAXLOAD 20.0
> NODELOADPOLICY ADJUSTSTATE
> NODECFG[pir1] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir2] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir3] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir4] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir5] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir6] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir7] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir8] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir9] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir10] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir11] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir12] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir13] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir14] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir15] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir16] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir17] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir18] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir19] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir20] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir21] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir22] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir23] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir24] PROCSPEED=2930 SPEED=1.00
> NODECFG[pir25] PROCSPEED=2930 SPEED=1.00
>
> ###
>
> ### openpbs queue config ###
>
> Queue pir
> queue_type = Execution
> total_jobs = 0
> state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0
> max_running = 400
> resources_default.neednodes = pir
> resources_default.nodes = 1
> acl_group_enable = True
> acl_groups = lab4
> acl_group_sloppy = True
> mtime = Thu Dec 17 15:40:47 2009
> resources_assigned.mem = 0b
> resources_assigned.nodect = 0
> enabled = True
> started = True
>
> ###
>
> ### openpbs server attributes config ###
>
> set server scheduling = True
> set server acl_hosts = pirserver
> set server managers = r...@pirserver
> set server operators = r...@pirserver
> set server default_queue = pir
> set server log_events = 511
> set server mail_from = root
> set server query_other_jobs = True
> set server scheduler_iteration = 600
> set server node_check_rate = 150
> set server tcp_timeout = 6
> set server mom_job_sync = True
> set server keep_completed = 300
> set server next_job_number = 850417
>
> ###
>
> Thanks,
> Sabuj Pattanayek
>
_______________________________________________
mauiusers mailing list
[email protected]
http://www.supercluster.org/mailman/listinfo/mauiusers