Hi Gus, here are the results of all the commands you mentioned:
[root@fe ~]# qmgr -c 'p s'
#
# Create queues and set their attributes.
#
#
# Create and define queue batch
#
create queue batch
set queue batch queue_type = Execution
set queue batch resources_default.nodes = 1
set queue batch resources_default.walltime = 2400:00:00
set queue batch enabled = True
set queue batch started = True
#
# Set server attributes.
#
set server scheduling = True
set server acl_hosts = fe
set server managers = root@fe
set server operators = root@fe
set server default_queue = batch
set server log_events = 511
set server mail_from = adm
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server mom_job_sync = True
set server keep_completed = 300
set server auto_node_np = True
set server next_job_number = 182
set server record_job_info = True
[root@fe ~]#
${TORQUE}/bin/pbsnodes
[root@fe ~]# pbsnodes
n10
state = free
np = 12
ntype = cluster
jobs = 0/121.fe
status =
rectime=1317298640,varattr=,jobs=121.fe,state=free,netload=261129374581,gres=,loadave=4.00,ncpus=12,physmem=16360208kb,availmem=62484756kb,totmem=83471736kb,idletime=63369,nusers=2,nsessions=2,sessions=4394 8087,uname=Linux n10 2.6.18-194.el5 #1 SMP Fri Apr 2 14:58:14 EDT 2010 x86_64,opsys=linux
mom_service_port = 15002
mom_manager_port = 15003
gpus = 0
n11
state = free
np = 12
ntype = cluster
jobs = 0/143.fe
status =
rectime=1317298637,varattr=,jobs=143.fe,state=free,netload=12864227236,gres=,loadave=8.00,ncpus=12,physmem=16360208kb,availmem=78708424kb,totmem=83469060kb,idletime=1354314,nusers=2,nsessions=2,sessions=4583 20253,uname=Linux n11 2.6.18-194.el5 #1 SMP Fri Apr 2 14:58:14 EDT 2010 x86_64,opsys=linux
mom_service_port = 15002
mom_manager_port = 15003
gpus = 0
n12
state = free
np = 12
ntype = cluster
jobs = 0/144.fe
status =
rectime=1317298647,varattr=,jobs=144.fe,state=free,netload=953102292987,gres=,loadave=8.01,ncpus=12,physmem=16360208kb,availmem=78740696kb,totmem=83469060kb,idletime=1168354,nusers=2,nsessions=2,sessions=4635 20289,uname=Linux n12 2.6.18-194.el5 #1 SMP Fri Apr 2 14:58:14 EDT 2010 x86_64,opsys=linux
mom_service_port = 15002
mom_manager_port = 15003
gpus = 0
n13
state = free
np = 12
ntype = cluster
jobs = 0/181.fe
status =
rectime=1317298672,varattr=,jobs=181.fe,state=free,netload=1010169147229,gres=,loadave=4.00,ncpus=12,physmem=15955108kb,availmem=81150100kb,totmem=83066636kb,idletime=138726,nusers=2,nsessions=2,sessions=4407 29186,uname=Linux n13 2.6.18-194.el5xen #1 SMP Fri Apr 2 15:34:40 EDT 2010 x86_64,opsys=linux
mom_service_port = 15002
mom_manager_port = 15003
gpus = 0
[root@fe ~]#
${MAUI}/bin/showconfig
[root@fe ~]# which showconfig
/usr/local/maui/bin/showconfig
[root@fe ~]# showconfig
# Maui version 3.3.1 (PID: 18407)
# global policies
REJECTNEGPRIOJOBS[0] FALSE
ENABLENEGJOBPRIORITY[0] FALSE
ENABLEMULTINODEJOBS[0] TRUE
ENABLEMULTIREQJOBS[0] FALSE
BFPRIORITYPOLICY[0] [NONE]
JOBPRIOACCRUALPOLICY QUEUEPOLICY
NODELOADPOLICY ADJUSTSTATE
USEMACHINESPEEDFORFS FALSE
USEMACHINESPEED FALSE
USESYSTEMQUEUETIME TRUE
USELOCALMACHINEPRIORITY FALSE
NODEUNTRACKEDLOADFACTOR 1.2
JOBNODEMATCHPOLICY[0] EXACTNODE
JOBMAXSTARTTIME[0] INFINITY
METAMAXTASKS[0] 0
NODESETPOLICY[0] [NONE]
NODESETATTRIBUTE[0] [NONE]
NODESETLIST[0]
NODESETDELAY[0] 00:00:00
NODESETPRIORITYTYPE[0] MINLOSS
NODESETTOLERANCE[0] 0.00
BACKFILLPOLICY[0] FIRSTFIT
BACKFILLDEPTH[0] 0
BACKFILLPROCFACTOR[0] 0
BACKFILLMAXSCHEDULES[0] 10000
BACKFILLMETRIC[0] PROCS
BFCHUNKDURATION[0] 00:00:00
BFCHUNKSIZE[0] 0
PREEMPTPOLICY[0] REQUEUE
MINADMINSTIME[0] 00:00:00
RESOURCELIMITPOLICY[0]
NODEAVAILABILITYPOLICY[0] COMBINED:[DEFAULT]
NODEALLOCATIONPOLICY[0] MINRESOURCE
TASKDISTRIBUTIONPOLICY[0] DEFAULT
RESERVATIONPOLICY[0] CURRENTHIGHEST
RESERVATIONRETRYTIME[0] 00:00:00
RESERVATIONTHRESHOLDTYPE[0] NONE
RESERVATIONTHRESHOLDVALUE[0] 0
FSPOLICY [NONE]
FSPOLICY [NONE]
FSINTERVAL 12:00:00
FSDEPTH 8
FSDECAY 1.00
# Priority Weights
SERVICEWEIGHT[0] 1
TARGETWEIGHT[0] 1
CREDWEIGHT[0] 1
ATTRWEIGHT[0] 1
FSWEIGHT[0] 1
RESWEIGHT[0] 1
USAGEWEIGHT[0] 1
QUEUETIMEWEIGHT[0] 1
XFACTORWEIGHT[0] 0
SPVIOLATIONWEIGHT[0] 0
BYPASSWEIGHT[0] 0
TARGETQUEUETIMEWEIGHT[0] 0
TARGETXFACTORWEIGHT[0] 0
USERWEIGHT[0] 0
GROUPWEIGHT[0] 0
ACCOUNTWEIGHT[0] 0
QOSWEIGHT[0] 0
CLASSWEIGHT[0] 0
FSUSERWEIGHT[0] 0
FSGROUPWEIGHT[0] 0
FSACCOUNTWEIGHT[0] 0
FSQOSWEIGHT[0] 0
FSCLASSWEIGHT[0] 0
ATTRATTRWEIGHT[0] 0
ATTRSTATEWEIGHT[0] 0
NODEWEIGHT[0] 0
PROCWEIGHT[0] 0
MEMWEIGHT[0] 0
SWAPWEIGHT[0] 0
DISKWEIGHT[0] 0
PSWEIGHT[0] 0
PEWEIGHT[0] 0
WALLTIMEWEIGHT[0] 0
UPROCWEIGHT[0] 0
UJOBWEIGHT[0] 0
CONSUMEDWEIGHT[0] 0
USAGEEXECUTIONTIMEWEIGHT[0] 0
REMAININGWEIGHT[0] 0
PERCENTWEIGHT[0] 0
XFMINWCLIMIT[0] 00:02:00
# partition DEFAULT policies
REJECTNEGPRIOJOBS[1] FALSE
ENABLENEGJOBPRIORITY[1] FALSE
ENABLEMULTINODEJOBS[1] TRUE
ENABLEMULTIREQJOBS[1] FALSE
BFPRIORITYPOLICY[1] [NONE]
JOBPRIOACCRUALPOLICY QUEUEPOLICY
NODELOADPOLICY ADJUSTSTATE
JOBNODEMATCHPOLICY[1]
JOBMAXSTARTTIME[1] INFINITY
METAMAXTASKS[1] 0
NODESETPOLICY[1] [NONE]
NODESETATTRIBUTE[1] [NONE]
NODESETLIST[1]
NODESETDELAY[1] 00:00:00
NODESETPRIORITYTYPE[1] MINLOSS
NODESETTOLERANCE[1] 0.00
# Priority Weights
XFMINWCLIMIT[1] 00:00:00
RMAUTHTYPE[0] CHECKSUM
CLASSCFG[[NONE]] DEFAULT.FEATURES=[NONE]
CLASSCFG[[ALL]] DEFAULT.FEATURES=[NONE]
CLASSCFG[batch] DEFAULT.FEATURES=[NONE]
QOSPRIORITY[0] 0
QOSQTWEIGHT[0] 0
QOSXFWEIGHT[0] 0
QOSTARGETXF[0] 0.00
QOSTARGETQT[0] 00:00:00
QOSFLAGS[0]
QOSPRIORITY[1] 0
QOSQTWEIGHT[1] 0
QOSXFWEIGHT[1] 0
QOSTARGETXF[1] 0.00
QOSTARGETQT[1] 00:00:00
QOSFLAGS[1]
# SERVER MODULES: MX
SERVERMODE NORMAL
SERVERNAME
SERVERHOST fe
SERVERPORT 42559
LOGFILE maui.log
LOGFILEMAXSIZE 10000000
LOGFILEROLLDEPTH 1
LOGLEVEL 3
LOGFACILITY fALL
SERVERHOMEDIR /usr/local/maui/
TOOLSDIR /usr/local/maui/tools/
LOGDIR /usr/local/maui/log/
STATDIR /usr/local/maui/stats/
LOCKFILE /usr/local/maui/maui.pid
SERVERCONFIGFILE /usr/local/maui/maui.cfg
CHECKPOINTFILE /usr/local/maui/maui.ck
CHECKPOINTINTERVAL 00:05:00
CHECKPOINTEXPIRATIONTIME 3:11:20:00
TRAPJOB
TRAPNODE
TRAPFUNCTION
RESDEPTH 24
RMPOLLINTERVAL 00:00:30
NODEACCESSPOLICY SHARED
ALLOCLOCALITYPOLICY [NONE]
SIMTIMEPOLICY [NONE]
ADMIN1 root
ADMINHOSTS ALL
NODEPOLLFREQUENCY 0
DISPLAYFLAGS
DEFAULTDOMAIN
DEFAULTCLASSLIST [DEFAULT:1]
FEATURENODETYPEHEADER
FEATUREPROCSPEEDHEADER
FEATUREPARTITIONHEADER
DEFERTIME 1:00:00
DEFERCOUNT 24
DEFERSTARTCOUNT 1
JOBPURGETIME 0
NODEPURGETIME 2140000000
APIFAILURETHRESHHOLD 6
NODESYNCTIME 600
JOBSYNCTIME 600
JOBMAXOVERRUN 00:10:00
NODEMAXLOAD 0.0
PLOTMINTIME 120
PLOTMAXTIME 245760
PLOTTIMESCALE 11
PLOTMINPROC 1
PLOTMAXPROC 512
PLOTPROCSCALE 9
SCHEDCFG[] MODE=NORMAL SERVER=fe:42559
# RM MODULES: PBS SSS WIKI NATIVE
RMCFG[FE] AUTHTYPE=CHECKSUM EPORT=15004 TIMEOUT=00:00:09 TYPE=PBS
SIMWORKLOADTRACEFILE workload
SIMRESOURCETRACEFILE resource
SIMAUTOSHUTDOWN OFF
SIMSTARTTIME 0
SIMSCALEJOBRUNTIME FALSE
SIMFLAGS
SIMJOBSUBMISSIONPOLICY CONSTANTJOBDEPTH
SIMINITIALQUEUEDEPTH 16
SIMWCACCURACY 0.00
SIMWCACCURACYCHANGE 0.00
SIMNODECOUNT 0
SIMNODECONFIGURATION NORMAL
SIMWCSCALINGPERCENT 100
SIMCOMRATE 0.10
SIMCOMTYPE ROUNDROBIN
COMINTRAFRAMECOST 0.30
COMINTERFRAMECOST 0.30
SIMSTOPITERATION -1
SIMEXITITERATION -1
[root@fe ~]# ps -ef |grep maui
root 18407 1 0 Sep28 ? 00:00:04 /usr/local/maui/sbin/maui
root 22527 22463 0 09:19 pts/2 00:00:00 grep maui
[root@fe ~]# service maui status
maui (pid 18407) is running...
[root@fe ~]# service pbs_server status
pbs_server (pid 4147) is running...
[root@fe ~]#
service pbs_sched status [just in case it is also running ...]
service pbs_mom status
service pbs status
None of those three services is installed.
Thank you very much
----------------------------------------------------
Ing. Fernando Caba
Director General de Telecomunicaciones
Universidad Nacional del Sur
http://www.dgt.uns.edu.ar
Tel/Fax: (54)-291-4595166
Tel: (54)-291-4595101 int. 2050
Avda. Alem 1253, (B8000CPB) Bahía Blanca - Argentina
----------------------------------------------------
On 28/09/2011 04:07 PM, Gus Correa wrote:
> Hi Fernando
>
> Did you restart maui after you changed maui.cfg? [service maui restart]
>
> Any chance that what you see is still residual from old jobs,
> submitted before you changed the maui configuration and job scripts
> [#PBS -l nodes=1:ppn=12]?
>
> For more help from everybody in the list,
> it may be useful if you send the output of:
>
> qmgr -c 'p s'
>
> ${TORQUE}/bin/pbsnodes
>
> ${MAUI}/bin/showconfig
>
> ps -ef |grep maui
>
> service maui status
> service pbs_server status
> service pbs_sched status [just in case it is also running ...]
> service pbs_mom status
> service pbs status
>
> I hope this helps,
> Gus Correa
>
>
> Fernando Caba wrote:
>> Hi everybody, thanks for all the answers.
>> I tried everything you pointed out:
>>
>> including
>> #PBS -l nodes=1:ppn=12
>>
>> adding
>>
>> JOBNODEMATCHPOLICY EXACTNODE
>>
>> to maui.cfg
>>
>> but none of this worked. I'm thinking the problem is in another
>> config parameter (maui or torque).
>>
>> I will keep reading about all of this.
>>
>> Thanks!!
>>
>>
>>
>> On 28/09/2011 12:33 PM, Gus Correa wrote:
>>> Hi Fernando
>>>
>>> Dennis already pointed out the first/main problem.
>>> Your Torque/PBS script is not requesting a specific number of nodes
>>> and cores/processors.
>>> You can ask for 12 processors, even if your MPI command doesn't
>>> use all of them:
>>>
>>> #PBS -l nodes=1:ppn=12
>>>
>>> [You can still do mpirun -np 8 if you want.]
>>>
>>> This will prevent two jobs from running on the same node [which seems
>>> to be your goal, if I understood it right].
>>>
>>> I also like to add the queue name [even if it is the default]
>>> and the job name [for documentation and stdout/stderr
>>> naming consistency]
>>>
>>> #PBS -q myqueue [whatever you called your queue]
>>> #PBS -N myjob [15 characters at most, the rest gets truncated]
>>>
>>> The #PBS clauses must be together and right after the #! /bin/sh line.
>>>
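>>> For instance, a minimal job script could look roughly like this
>>> [queue name, job name, and walltime here are only placeholders,
>>> adjust them to your site]:
>>>
>>> #!/bin/bash
>>> #PBS -q batch
>>> #PBS -N myvaspjob
>>> #PBS -l nodes=1:ppn=12
>>> #PBS -l walltime=24:00:00
>>>
>>> cd $PBS_O_WORKDIR
>>> mpirun -np 8 /usr/local/vasp/vasp
>>>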
>>> Ask your users to always add these lines to their jobs.
>>> There is a feature of Torque that allows you to write a wrapper
>>> that will add whatever you want to the job script,
>>> but if your pool of users is small
>>> you can just ask them to cooperate.
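>>>
>>> [The feature I mean is the qsub submit filter: a script that qsub
>>> runs on every job script, reading it on stdin and writing the
>>> possibly modified script to stdout. A rough, untested sketch only
>>> [check the Torque Admin Guide for how to point qsub at it, i.e. the
>>> SUBMITFILTER setting in torque.cfg]:
>>>
>>> #!/bin/bash
>>> # Read the whole job script from stdin.
>>> script=$(cat)
>>> if printf '%s\n' "$script" | grep -q '^#PBS .*nodes='; then
>>>     # The user already requested nodes/ppn; pass the script through unchanged.
>>>     printf '%s\n' "$script"
>>> else
>>>     # Inject a default node request right after the shebang line,
>>>     # since #PBS directives must come before any executable command.
>>>     printf '%s\n' "$script" | sed '1a #PBS -l nodes=1:ppn=12'   # GNU sed one-line append
>>> fi
>>> exit 0
>>> ]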
>>>
>>> Of course there is much more that you can add.
>>> 'man qsub' and 'man pbs_resources' are good sources of information,
>>> highly recommended reading.
>>>
>>>
>>> Then there is what Antonio Messina mentioned, the cpuset feature
>>> of Torque.
>>> I don't know if you installed Torque with this feature enabled.
>>> However, if you did, it will allow specific cores to be
>>> assigned to each process, which could allow node-sharing without
>>> jobs stepping on each other's toes.
>>> However:
>>> A) this requires a bit more setup [not a lot, check the
>>> list archives and the Torque Admin Guide]
>>> B) if your users are cooperative and request 12 processors for each job,
>>> and you're using the Maui 'JOBNODEMATCHPOLICY EXACTNODE' setting, each
>>> job will go to a single node anyway.
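>>>
>>> [If you want to pursue A, cpuset support is a compile-time option in
>>> Torque; roughly, as a sketch only, double check the Admin Guide for
>>> your Torque version:
>>>
>>> # when building Torque from source
>>> ./configure --enable-cpuset && make && make install
>>> # on each compute node, the cpuset pseudo-filesystem must be mounted
>>> mkdir -p /dev/cpuset
>>> mount -t cpuset none /dev/cpuset
>>> ]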
>>>
>>> BTW, did you restart Maui after you added 'JOBNODEMATCHPOLICY EXACTNODE'
>>> to the maui.cfg file?
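>>>
>>> [A quick way to confirm the setting was picked up after the restart,
>>> for example:
>>>
>>> service maui restart
>>> showconfig | grep JOBNODEMATCHPOLICY
>>> ]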
>>>
>>> I hope this helps,
>>> Gus Correa
>>>
>>>
>>> Fernando Caba wrote:
>>>> Hi Gus, my node file /var/spool/torque/server_priv/nodes looks like:
>>>>
>>>> [root@fe server_priv]# more nodes
>>>> n10 np=12
>>>> n11 np=12
>>>> n12 np=12
>>>> n13 np=12
>>>> [root@fe server_priv]#
>>>>
>>>> It is exactly as in your comment.
>>>>
>>>> My script:
>>>>
>>>> #!/bin/bash
>>>>
>>>> cd $PBS_O_WORKDIR
>>>>
>>>> mpirun -np 8 /usr/local/vasp/vasp
>>>>
>>>> launches 8 vasp processes on one node. If I start one more job (with -np 8),
>>>> the job will run on the same node (n13).
>>>> So if I start another job with -np 8
>>>> (or -np 4), it will also run on the same node, n13.
>>>>
>>>> I configured JOBNODEMATCHPOLICY EXACTNODE in maui.cfg,
>>>> but unfortunately the jobs still ran on node n13.
>>>> This is an example of the output of top:
>>>>
>>>> top - 00:05:53 up 14 days, 6:47, 1 user, load average: 4.18, 4.06, 4.09
>>>> Mem: 15955108k total, 13287888k used, 2667220k free, 142168k buffers
>>>> Swap: 67111528k total, 16672k used, 67094856k free, 11360332k cached
>>>>
>>>> PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
>>>> 21796 patricia 25 0 463m 291m 12m R 100.5 1.9 517:29.59 vasp
>>>> 21797 patricia 25 0 448m 276m 11m R 100.2 1.8 518:51.49 vasp
>>>> 21798 patricia 25 0 458m 287m 11m R 100.2 1.8 522:01.79 vasp
>>>> 21799 patricia 25 0 448m 276m 11m R 99.9 1.8 519:04.25 vasp
>>>> 1 root 15 0 10348 672 568 S 0.0 0.0 0:00.53 init
>>>> 2 root RT -5 0 0 0 S 0.0 0.0 0:00.06 migration/0
>>>> 3 root 34 19 0 0 0 S 0.0 0.0 0:00.00 ksoftirqd/0
>>>> 4 root RT -5 0 0 0 S 0.0 0.0 0:00.00 watchdog/0
>>>> 5 root RT -5 0 0 0 S 0.0 0.0 0:00.04 migration/1
>>>>
>>>> The job that generates those 4 vasp processes is:
>>>>
>>>> #!/bin/bash
>>>>
>>>> cd $PBS_O_WORKDIR
>>>>
>>>> mpirun -np 4 /usr/local/vasp/vasp
>>>>
>>>> Thanks
>>>>
>>>>
>>>>
>>>> On 27/09/2011 08:07 PM, Gus Correa wrote:
>>>>> Hi Fernando
>>>>>
>>>>> Did you try something like this in your
>>>>> ${TORQUE}/server_priv/nodes file?
>>>>>
>>>>> frontend np=12 [skip this line if the frontend is not to do job work]
>>>>> node1 np=12
>>>>> node2 np=12
>>>>> node3 np=12
>>>>> node4 np=12
>>>>>
>>>>> This is probably the first thing to do.
>>>>> It is not Maui, just plain Torque [actually pbs_server configuration].
>>>>>
>>>>> The lines above assume your nodes are called node1, ...
>>>>> and the head node is called frontend,
>>>>> in some name-resolvable manner [most likely
>>>>> in your /etc/hosts file, most likely pointing to the nodes'
>>>>> IP addresses in your cluster's private subnet, 192.168.X.X,
>>>>> 10.X.X.X or equivalent].
>>>>>
>>>>> The 'np=12' clause will allow at most 12 *processes* per node.
>>>>>
>>>>>
>>>>> [However, if VASP is *threaded*, say via OpenMP, then it won't
>>>>> prevent several threads from being launched by each process.
>>>>> To handle threaded jobs you can use some tricks, such as requesting
>>>>> more cores than processes.
>>>>> Sorry, I am not familiar enough with VASP to say more than this.]
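>>>>>
>>>>> [One such trick, purely as a sketch and assuming VASP were built with
>>>>> OpenMP [which may not apply to your build]: still request the whole
>>>>> node, but start fewer MPI processes and give each one a few threads:
>>>>>
>>>>> #PBS -l nodes=1:ppn=12
>>>>> export OMP_NUM_THREADS=3
>>>>> mpirun -np 4 /usr/local/vasp/vasp
>>>>> ]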
>>>>>
>>>>> I would suggest that you take a look at the Torque Admin Manual
>>>>> for more details:
>>>>> http://www.adaptivecomputing.com/resources/docs/torque/
>>>>>
>>>>> There are further controls in Maui, such as
>>>>> 'JOBNODEMATCHPOLICY EXACTNODE' in maui.cfg,
>>>>> for instance, if you want full nodes allocated to each job,
>>>>> as opposed to jobs sharing cores in a single node.
>>>>> However, these choices may come later.
>>>>> [You can change maui.cfg and restart the maui scheduler to
>>>>> test various changes.]
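>>>>>
>>>>> [As a sketch, the relevant line in maui.cfg, followed by a restart,
>>>>> would be something like:
>>>>>
>>>>> JOBNODEMATCHPOLICY EXACTNODE
>>>>>
>>>>> service maui restart
>>>>> ]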
>>>>>
>>>>> For Maui details see the Maui Admin Guide:
>>>>> http://www.adaptivecomputing.com/resources/docs/maui/index.php
>>>>>
>>>>> I hope this helps,
>>>>> Gus Correa
>>>>>
>>>>> Fernando Caba wrote:
>>>>>> Hi everybody, I am using Torque 3.0.1 and Maui 3.3.1 in a configuration
>>>>>> composed of a front end and 4 nodes (2 processors, 6 cores each),
>>>>>> totaling 48 cores.
>>>>>> I need to configure things so that each node runs no more than 12
>>>>>> processes (in particular we are using VASP), so we want no more than
>>>>>> 12 vasp processes per node.
>>>>>> How can I configure this? I'm quite confused after reading a lot of
>>>>>> Torque and Maui configuration information.
>>>>>>
>>>>>> Thanks in advance.
>>>>>>
_______________________________________________
mauiusers mailing list
[email protected]
http://www.supercluster.org/mailman/listinfo/mauiusers