What do you get when you do "qstat -f" on the job?  How many nodes is
it actually getting?
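
Something like this should show it (1234 is a placeholder job id; exec_host
lists the nodes/processors the job actually landed on, and Resource_List
shows what it asked for):

    qstat -f 1234 | egrep 'exec_host|Resource_List'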

On 9/18/07, Nilesh Mistry <[EMAIL PROTECTED]> wrote:
> Michael
>
> We have actually moved to a larger cluster of 64 nodes (50 quad-core and
> 14 dual Opterons), so 220 processors are available.  We are submitting a
> job that requires 64 threads, but still with the same result.  Here are
> the files you requested.  I have already posted this to the torque users
> list.
>
> ####### PBS SCRIPT START#######
>
> #!/bin/sh -f
> #PBS -l nodes=64
> #PBS -N scaling_test
> #PBS -e scaling_test.err
> #PBS -o scaling_test.log
> #PBS -j oe
> #PBS -l mem=64000mb
> #PBS -m abe
> #PBS -q parallel
>
> NCPU=`wc -l < $PBS_NODEFILE`
> echo ------------------------------------------------------
> echo ' This job is allocated on '${NCPU}' cpu(s)'
> echo 'Job is running on node(s): '
> cat $PBS_NODEFILE
> echo PBS: qsub is running on $PBS_O_HOST
> echo PBS: originating queue is $PBS_O_QUEUE
> echo PBS: executing queue is $PBS_QUEUE
> echo PBS: working directory is $PBS_O_WORKDIR
> echo PBS: execution mode is $PBS_ENVIRONMENT
> echo PBS: job identifier is $PBS_JOBID
> echo PBS: job name is $PBS_JOBNAME
> echo PBS: node file is $PBS_NODEFILE
> echo PBS: current home directory is $PBS_O_HOME
> echo PBS: PATH = $PBS_O_PATH
> echo ------------------------------------------------------
> SERVER=$PBS_O_HOST
> WORKDIR=$HOME/pbs/multi/scaling_test
> cd ${WORKDIR}
> cat $PBS_NODEFILE > nodes.list    # keep a copy of the allocated node list
> lamboot -s -H $PBS_NODEFILE       # boot the LAM/MPI daemons on the allocated nodes
> mpirun -np $NCPU /opt/fds/fds5_mpi scaling_test.fds
> lamhalt                           # shut the LAM daemons down after the run
>
> ####### PBS SCRIPT END #######
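>
> (A sanity check we could add just before the mpirun line, assuming fds5_mpi
> insists on exactly one MPI process per mesh as the error message suggests;
> MESHES=64 is only a guess at the mesh count in scaling_test.fds:)
>
> MESHES=64                        # assumed number of meshes in scaling_test.fds
> if [ "$NCPU" -ne "$MESHES" ]; then
>     echo "Allocated $NCPU processors but input defines $MESHES meshes" >&2
>     exit 1
> fi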
>
> ####### MAUI.CFG START #######
> # maui.cfg 3.2.6p14
>
> SERVERHOST              master.atar.senecac.on.ca
> # primary admin must be first in list
> ADMIN1                  root
> ADMIN3                  nilesh.mistry
>
>
> # Resource Manager Definition
>
> RMCFG[master.atar.senecac.on.ca] TYPE=PBS
>
> # Allocation Manager Definition
>
> AMCFG[bank]  TYPE=NONE
>
> # full parameter docs at http://clusterresources.com/mauidocs/a.fparameters.html
> # use the 'schedctl -l' command to display current configuration
>
> RMPOLLINTERVAL  00:01:00
>
> SERVERPORT            42559
> SERVERMODE            NORMAL
>
> # Admin: http://clusterresources.com/mauidocs/a.esecurity.html
>
>
> LOGFILE               maui.log
> LOGFILEMAXSIZE        10000000
> LOGLEVEL              4
> LOGFACILITY             fALL
>
> # Job Priority: http://clusterresources.com/mauidocs/5.1jobprioritization.html
>
> QUEUETIMEWEIGHT       1
>
> # FairShare: http://clusterresources.com/mauidocs/6.3fairshare.html
>
> #FSPOLICY              PSDEDICATED
> #FSDEPTH               7
> #FSINTERVAL            86400
> #FSDECAY               0.80
>
> # Throttling Policies: http://clusterresources.com/mauidocs/6.2throttlingpolicies.html
>
> # NONE SPECIFIED
>
> # Backfill: http://clusterresources.com/mauidocs/8.2backfill.html
>
> BACKFILLPOLICY  ON
> RESERVATIONPOLICY     CURRENTHIGHEST
>
> # the following are modified/added by Mehrdad 13 Sept 07
> #NODEACCESSPOLICY       DEDICATED
> NODEACCESSPOLICY        SHARED
> JOBNODEMATCHPOLICY   EXACTPROC
>
> # Node Allocation: http://clusterresources.com/mauidocs/5.2nodeallocation.html
>
> NODEALLOCATIONPOLICY  MINRESOURCE
>
> # QOS: http://clusterresources.com/mauidocs/7.3qos.html
>
> # QOSCFG[hi]  PRIORITY=100 XFTARGET=100 FLAGS=PREEMPTOR:IGNMAXJOB
> # QOSCFG[low] PRIORITY=-1000 FLAGS=PREEMPTEE
>
> # Standing Reservations: http://clusterresources.com/mauidocs/7.1.3standingreservations.html
>
> # SRSTARTTIME[test] 8:00:00
> # SRENDTIME[test]   17:00:00
> # SRDAYS[test]      MON TUE WED THU FRI
> # SRTASKCOUNT[test] 20
> # SRMAXTIME[test]   0:30:00
>
> # Creds: http://clusterresources.com/mauidocs/6.1fairnessoverview.html
>
> # USERCFG[DEFAULT]      FSTARGET=25.0
> # USERCFG[john]         PRIORITY=100  FSTARGET=10.0-
> # GROUPCFG[staff]       PRIORITY=1000 QLIST=hi:low QDEF=hi
> # CLASSCFG[batch]       FLAGS=PREEMPTEE
> # CLASSCFG[interactive] FLAGS=PREEMPTOR
> USERCFG[DEFAULT]        MAXJOB=4
> ####### MAUI.CFG  END #######
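>
> (To double-check what Maui itself thinks it allocated, these two commands
> should help; 1234 is again a placeholder job id:)
>
> checkjob 1234      # per-job view: allocated hosts and task count
> diagnose -n        # per-node view: configured vs. dedicated processors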
>
> ####### QMGR -c "PRINT SERVER MASTER" ########
> #
> # Create queues and set their attributes.
> #
> #
> # Create and define queue serial
> #
> create queue serial
> set queue serial queue_type = Execution
> set queue serial resources_max.cput = 1000:00:00
> set queue serial resources_max.mem = 3000mb
> set queue serial resources_max.ncpus = 1
> set queue serial resources_max.nodect = 1
> set queue serial resources_max.nodes = 1:ppn=1
> set queue serial resources_max.walltime = 1000:00:00
> set queue serial resources_default.cput = 336:00:00
> set queue serial resources_default.mem = 900mb
> set queue serial resources_default.ncpus = 1
> set queue serial resources_default.nodect = 1
> set queue serial resources_default.nodes = 1:ppn=1
> set queue serial enabled = True
> set queue serial started = True
> #
> # Create and define queue workq
> #
> create queue workq
> set queue workq queue_type = Execution
> set queue workq resources_max.cput = 10000:00:00
> set queue workq resources_max.ncpus = 200
> set queue workq resources_max.nodect = 64
> set queue workq resources_max.nodes = 200:ppn=4
> set queue workq resources_max.walltime = 10000:00:00
> set queue workq resources_min.cput = 00:00:01
> set queue workq resources_min.ncpus = 1
> set queue workq resources_min.nodect = 1
> set queue workq resources_min.walltime = 00:00:01
> set queue workq resources_default.cput = 10000:00:00
> set queue workq resources_default.nodect = 1
> set queue workq resources_default.walltime = 10000:00:00
> set queue workq enabled = True
> set queue workq started = True
> #
> # Create and define queue parallel
> #
> create queue parallel
> set queue parallel queue_type = Execution
> set queue parallel resources_max.cput = 10000:00:00
> set queue parallel resources_max.ncpus = 200
> set queue parallel resources_max.nodect = 64
> set queue parallel resources_max.nodes = 200:ppn=4
> set queue parallel resources_max.walltime = 10000:00:00
> set queue parallel resources_min.ncpus = 1
> set queue parallel resources_min.nodect = 1
> set queue parallel resources_default.ncpus = 1
> set queue parallel resources_default.nodect = 1
> set queue parallel resources_default.nodes = 1:ppn=1
> set queue parallel resources_default.walltime = 10000:00:00
> set queue parallel enabled = True
> set queue parallel started = True
> #
> # Set server attributes.
> #
> set server scheduling = True
> set server acl_host_enable = False
> set server acl_user_enable = False
> set server default_queue = serial
> set server log_events = 127
> set server mail_from = adm
> set server query_other_jobs = True
> set server resources_available.ncpus = 200
> set server resources_available.nodect = 64
> set server resources_available.nodes = 200
> set server resources_default.neednodes = 1
> set server resources_default.nodect = 1
> set server resources_default.nodes = 1
> set server resources_max.ncpus = 200
> set server resources_max.nodes = 200
> set server scheduler_iteration = 60
> set server node_check_rate = 150
> set server tcp_timeout = 6
> set server default_node = 1
> set server pbs_version = 2.0.0p8
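>
> (Two things worth trying from the command line, assuming the quad-core nodes
> are defined with np=4 in the Torque nodes file; neither is tested here, and
> scaling_test.pbs stands in for whatever the submit script is saved as:)
>
> pbsnodes -a | grep -c 'np = 4'    # count nodes Torque sees with 4 processors
> qsub -q parallel -l nodes=16:ppn=4 scaling_test.pbs    # 64 slots as 16 nodes x 4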
>
>
>
> Thanks
>
> Nilesh Mistry
> Academic Computing Services
> [EMAIL PROTECTED] & TEL Campus
> Seneca College Of Applied Arts & Technology
> 70 The Pond Road
> Toronto, Ontario
> M3J 3M6 Canada
> Phone 416 491 5050 ext 3788
> Fax 416 661 4695
> http://acs.senecac.on.ca
>
>
>
> Michael Edwards wrote:
> > We'd need your script and the qsub command you used, and possibly more
> > configuration information from maui and torque, to be of much help.
> >
> > I don't know that we have anyone who is deep into maui or torque right
> > now; you might also want to ask on the maui or torque lists.
> >
> > From the other posts you have made, this error seems to be one of those
> > general "Something is Broken" messages that could have many causes.
> >
> > On 9/17/07, Nilesh Mistry <[EMAIL PROTECTED]> wrote:
> >
> >> Hello
> >>
> >> I am having problems submitting a job that requires 23 threads.  I keep
> >> getting the following error:
> >>
> >> ERROR: Number of meshes not equal to number of thread
> >>
> >> Hardware:
> >> 10 quad core nodes (therefore 40 processors available)
> >>
> >> What do I need to ensure in my job queue (qmgr), maui (maui.cfg) and
> >> my submit script when using qsub?
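> >>
> >> (My understanding, which may be wrong: fds5_mpi refuses to start unless the
> >> number of MPI processes equals the number of meshes in the input file, so a
> >> 23-mesh input would need exactly this, with job.fds standing in for the real
> >> input file:)
> >>
> >> mpirun -np 23 /opt/fds/fds5_mpi job.fds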
> >>
> >> Any and all help is greatly appreciated.
> >>
> >> --
> >> Thanks
> >>
> >> Nilesh Mistry
> >> Academic Computing Services
> >> [EMAIL PROTECTED] & TEL Campus
> >> Seneca College Of Applied Arts & Technology
> >> 70 The Pond Road
> >> Toronto, Ontario
> >> M3J 3M6 Canada
> >> Phone 416 491 5050 ext 3788
> >> Fax 416 661 4695
> >> http://acs.senecac.on.ca
> >>
> >
> >
>
>
>
