Sorry I forgot to post my queue script

# cat qsub

#---------------rareCluster bdimer03job sample script --------
#
#PBS -S /bin/csh
#PBS -q workq
#PBS -N bdimer
#PBS -l nodes=1:fast:nm
#PBS -e bdimer.err
#PBS -o bdimer.out
#
# Insert proper node and walltime specifications above making sure ppn is
# set to either 1 or 2 so that you get either all single processor nodes
# or all dual processors nodes instead of a mixture
# Other node specifications such as memory can also be added
#
# Put in proper WORKDIR, FILENAME in the first two lines below, respectively
# Set SCRFLAG to either GLOBAL or LOCAL in third
# Set  NODES, PPN (either 1 or 2) in a manner consistent with the PBS -l
# line above
# Put in USER ID for use in scratch directory path
#
# Also make sure %nprocs and/or %nprocl and %mem are set properly in
# the com file
# Allow at least 64MB of the memory available per node for the OS
# On multiple node runs use %nprocl; max %mem is the lower value of 512MB or the
# actual amount of memory minus 64MB
# On single node runs use %nproc when using ppn=2; max %mem is based on memory
# of node, again allowing 64MB; the 512MB restriction does not apply
#
# ---------------------------------------------------------------------
# User settings
#   WORKDIR  directory holding the Gaussian input (.com) files; the .log
#            files and the Gau.hosts / tsnet.nodes host lists are written
#            here as well
#   SCRFLAG  GLOBAL -> the scratch directory is created once, from the
#                      node running this script
#            LOCAL  -> the scratch directory is created on every
#                      allocated host via ssh
#   NODES    1 runs g03; any other value runs g03l
#   USER     user id used in the scratch path /scratch/$USER/$PBS_JOBID
# ---------------------------------------------------------------------
setenv WORKDIR   /home/vuser/pdg/2007fall/1124
setenv SCRFLAG   GLOBAL
setenv NODES     1
setenv USER      vuser
#
#
# nothing below here should need to be changed to run unless you do not
# want to use the default version of Gaussian; in this case the
# g03root path will need to be changed
#

# Fail early, with a readable message, when the script is not running
# under PBS or when the settings above are unusable.
if (! $?PBS_JOBID) then
  echo "qsub script: PBS_JOBID is not set; submit this script with qsub"
  exit 1
endif
if (! $?PBS_NODEFILE) then
  echo "qsub script: PBS_NODEFILE is not set; submit this script with qsub"
  exit 1
endif
if (! -d "$WORKDIR") then
  echo "qsub script: WORKDIR $WORKDIR does not exist"
  exit 1
endif
switch ("$SCRFLAG")
  case GLOBAL:
  case LOCAL:
    breaksw
  default:
    echo "qsub script: SCRFLAG must be GLOBAL or LOCAL (got $SCRFLAG)"
    exit 1
endsw

cd "$WORKDIR"

setenv g03root /home/chemtools
set path = ($path $g03root/g03/linda7.1/intel-linux2.4/bin)

setenv MP_NEWJOB yes
setenv LINDA_CLC network
setenv LINDA_FLC network

# Collapse repeated adjacent host entries in the PBS node file so each
# host is listed once, then keep the list both as one-host-per-line
# (Gau.hosts) and as a single space-separated line (tsnet.nodes, NODELIST).
uniq $PBS_NODEFILE > $WORKDIR/Gau.hosts
set NODELIST = `xargs echo < $WORKDIR/Gau.hosts`
xargs echo < $WORKDIR/Gau.hosts > $WORKDIR/tsnet.nodes
echo $NODELIST

# Per-job scratch directory. GAUSS_SCRDIR is exported before g03.login is
# sourced, in the same order as the original script.
set job_scratch = /scratch/$USER/$PBS_JOBID
setenv GAUSS_SCRDIR  $job_scratch
source $g03root/g03/bsd/g03.login

if ("$SCRFLAG" == "GLOBAL") then
  mkdir -p $job_scratch
else
  # LOCAL: one ssh per distinct host (Gau.hosts), not one per node-file line
  foreach HOST (`cat $WORKDIR/Gau.hosts`)
    ssh $HOST mkdir -p $job_scratch
  end
endif

echo $GAUSS_SCRDIR
setenv GAUSS_LFLAGS "-nodelist '$NODELIST'"
echo $GAUSS_LFLAGS

# Run the program: one Gaussian run per input name listed in the foreach.
# A non-zero exit status is reported but does not stop the remaining
# inputs or the scratch cleanup.
foreach FILENAME (bdimer)
  if ("$NODES" == "1") then
    g03 <$WORKDIR/$FILENAME.com >$WORKDIR/$FILENAME.log
    set g03_status = $status
  else
    g03l <$WORKDIR/$FILENAME.com >$WORKDIR/$FILENAME.log
    set g03_status = $status
  endif
  if ($g03_status != 0) then
    echo "qsub script: Gaussian exited with status $g03_status for $FILENAME"
  endif
end

# Clean up scratch space (same path that was created above)
if ("$SCRFLAG" == "GLOBAL") then
  rm -rf $job_scratch
else
  foreach HOST (`cat $WORKDIR/Gau.hosts`)
    ssh $HOST rm -rf $job_scratch
  end
endif




On Wed, Mar 19, 2008 at 10:49 AM, Chunwei Han <[EMAIL PROTECTED]> wrote:
> Thank you for your kind reply
>
>  Right now no user can run any job on the server. I can ssh to any
>  node without a password. I tried rebooting the whole cluster while the
>  server was running, but that did not seem to help.
>
>  Here is some debug info as you suggested, I hope this helps :)
>
>  # checkjob 21208
>
>  checking job 21208
>
>  State: Idle  (User: vuser  Group: relab)
>  WallTime: 0:00:00 of   INFINITY
>  SubmitTime: Wed Mar 19 09:47:04
>   (Time Queued  Total: 0:02:48  Eligible: 0:00:00)
>
>  Total Tasks: 1
>
>  Req[0]  TaskCount: 1  Partition: ALL
>  Network: [NONE]  Memory >= 0  Disk >= 0  Swap NC 0
>  Opsys: [NONE]  Arch: [NONE]  Class: [workq 1]  Features: [nm][fast]
>
>
>  IWD: [NONE]  Executable:  [NONE]
>  QOS: DEFAULT  Bypass: 0  StartCount: 0
>  PartitionMask: [ALL]
>  Flags:       RESTARTABLE
>
>  job is deferred.  Reason:  NoResources  (exceeds available partition procs)
>  Holds:    Defer
>  PE:  1.00  StartPriority:  2
>  cannot select job 21208 for partition DEFAULT (job hold active)
>
>
>  # qstat -an
>
>  parellel:
>                                                             Req'd  Req'd   Elap
>  Job ID          Username Queue    Jobname    SessID NDS TSK Memory Time  S Time
>  --------------- -------- -------- ---------- ------ --- --- ------ ----- - -----
>  21208.parellel  vuser    workq    bdimer        --    1   1    --  10000 Q   --
>     --
>  # qstat -f
>  Job Id: 21208.parellel
>     Job_Name = bdimer
>     Job_Owner = [EMAIL PROTECTED]
>     job_state = Q
>     queue = workq
>     server = parellel
>     Checkpoint = u
>     ctime = Wed Mar 19 09:47:04 2008
>     Error_Path = parellel:/home/vuser/pdg/2007fall/1124/bdimer.err
>     Hold_Types = n
>     Join_Path = n
>     Keep_Files = n
>     Mail_Points = a
>     mtime = Wed Mar 19 09:47:04 2008
>     Output_Path = parellel:/home/vuser/pdg/2007fall/1124/bdimer.out
>     Priority = 0
>     qtime = Wed Mar 19 09:47:04 2008
>     Rerunable = True
>     Resource_List.cput = 10000:00:00
>     Resource_List.ncpus = 1
>     Resource_List.nodect = 1
>     Resource_List.nodes = 1:fast:nm
>     Resource_List.walltime = 10000:00:00
>     Shell_Path_List = /bin/csh
>     Variable_List = PBS_O_HOME=/home/vuser,PBS_O_LANG=en_US.UTF-8,
>         PBS_O_LOGNAME=vuser,
>         PBS_O_PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bi
>         
> n:/usr/X11R6/bin:/opt/env-switcher/bin:/opt/mpich-1.2.5.10-ch_p4-gcc/bi
>         
> n:/opt/hdf5-oscar-1.6.0/bin/:/opt/kernel_picker/bin:/opt/pvm3/lib:/opt/
>         
> pvm3/lib/LINUX:/opt/pvm3/bin/LINUX:/opt/c3-4/:/opt/pbs/bin:/opt/pbs/lib
>         
> /xpbs/bin:/home/chemtools/g03.c02/g03/bsd:/home/chemtools/g03.c02/g03/p
>         
> rivate:/home/chemtools/g03.c02/g03:/usr/pgi/linux86/bin:/usr/pgi/linux8
>         6/lib:/usr/pgi/linux86/include:/opt/maui/bin:/root/bin,
>         PBS_O_MAIL=/var/spool/mail/root,PBS_O_SHELL=/bin/bash,
>         PBS_O_HOST=parellel,PBS_O_WORKDIR=/home/vuser/pdg/2007fall/1124,
>         PBS_O_QUEUE=workq
>     etime = Wed Mar 19 09:47:04 2008
>
>
>
>  On Wed, Mar 19, 2008 at 3:09 AM,
>  <[EMAIL PROTECTED]> wrote:
>  > Send Oscar-users mailing list submissions to
>  >         oscar-users@lists.sourceforge.net
>  >
>  >  To subscribe or unsubscribe via the World Wide Web, visit
>  >         https://lists.sourceforge.net/lists/listinfo/oscar-users
>  >  or, via email, send a message with subject or body 'help' to
>  >         [EMAIL PROTECTED]
>  >
>  >  You can reach the person managing the list at
>  >         [EMAIL PROTECTED]
>  >
>  >  When replying, please edit your Subject line so it is more specific
>  >  than "Re: Contents of Oscar-users digest..."
>  >
>  >
>  >  Today's Topics:
>  >
>  >    1. How to make job run? (Chunwei Han)
>  >    2. Re: How to make job run? (Michael Edwards)
>  >    3. Re: How to make job run? (Greenseid, Joseph M.)
>  >    4. Re: How to make job run? (arun shankar)
>  >
>  >
>  >  ----------------------------------------------------------------------
>  >
>  >  Message: 1
>  >  Date: Tue, 18 Mar 2008 11:10:37 +0800
>  >  From: "Chunwei Han" <[EMAIL PROTECTED]>
>  >  Subject: [Oscar-users] How to make job run?
>  >  To: oscar-users@lists.sourceforge.net
>  >  Message-ID:
>  >         <[EMAIL PROTECTED]>
>  >  Content-Type: text/plain; charset=ISO-8859-1
>  >
>  >  Hey guys
>  >
>  >  Recently I take over the job as the cluster administrator. Unluckily
>  >  the former administrator did not leave any documents and I have no
>  >  experience at all. Right now the job can not be submitted. Here is
>  >  some debugger info:
>  >
>  >  # qstat
>  >  Job id           Name             User               Time Use S Queue
>  >  ---------------- ---------------- ------------------ -------- - -----
>  >  21205.parellel   bdimer           vuser                   0 Q workq
>  >  21207.parellel   gram0            vuser                   0 Q workq
>  >
>  >  # tracejob -n 10 21205
>  >
>  >  Job: 21205.parellel
>  >
>  >  03/17/2008 22:23:13  S    Job Queued at request of [EMAIL PROTECTED],
>  >  owner = [EMAIL PROTECTED], job name = bdimer, queue = workq
>  >  03/17/2008 22:23:13  A    queue=workq
>  >
>  >
>  >  So, how to make it run?
>  >  The version of OSCAR is 3.0
>  >
>  >
>  >
>  >  ------------------------------
>  >
>  >  Message: 2
>  >  Date: Tue, 18 Mar 2008 07:25:30 -0400
>  >  From: "Michael Edwards" <[EMAIL PROTECTED]>
>  >  Subject: Re: [Oscar-users] How to make job run?
>  >  To: oscar-users@lists.sourceforge.net
>  >  Message-ID:
>  >         <[EMAIL PROTECTED]>
>  >  Content-Type: text/plain; charset=ISO-8859-1
>  >
>  >  It could be any number of things.  Are other users able to run jobs on
>  >  the cluster?
>  >  When you started the cluster, did you boot the head node completely
>  >  before booting the cluster nodes?
>  >
>  >  Other than that, it would probably be an issue with either your user
>  >  permissions (can you ssh to a node without entering a password?) or
>  >  your queue script.  If you wanted to send your queue script it might
>  >  help.
>  >
>  >  On Mon, Mar 17, 2008 at 11:10 PM, Chunwei Han <[EMAIL PROTECTED]> wrote:
>  >  > Hey guys
>  >  >
>  >  >  Recently I take over the job as the cluster administrator. Unluckily
>  >  >  the former administrator did not leave any documents and I have no
>  >  >  experience at all. Right now the job can not be submitted. Here is
>  >  >  some debugger info:
>  >  >
>  >  >  # qstat
>  >  >  Job id           Name             User               Time Use S Queue
>  >  >  ---------------- ---------------- ------------------ -------- - -----
>  >  >  21205.parellel   bdimer           vuser                   0 Q workq
>  >  >  21207.parellel   gram0            vuser                   0 Q workq
>  >  >
>  >  >  # tracejob -n 10 21205
>  >  >
>  >  >  Job: 21205.parellel
>  >  >
>  >  >  03/17/2008 22:23:13  S    Job Queued at request of [EMAIL PROTECTED],
>  >  >  owner = [EMAIL PROTECTED], job name = bdimer, queue = workq
>  >  >  03/17/2008 22:23:13  A    queue=workq
>  >  >
>  >  >
>  >  >  So, how to make it run?
>  >  >  The version of OSCAR is 3.0
>  >  >
>  >  >  
> -------------------------------------------------------------------------
>  >  >  This SF.net email is sponsored by: Microsoft
>  >  >  Defy all challenges. Microsoft(R) Visual Studio 2008.
>  >  >  http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
>  >  >  _______________________________________________
>  >  >  Oscar-users mailing list
>  >  >  Oscar-users@lists.sourceforge.net
>  >  >  https://lists.sourceforge.net/lists/listinfo/oscar-users
>  >  >
>  >
>  >
>  >
>  >  ------------------------------
>  >
>  >  Message: 3
>  >  Date: Tue, 18 Mar 2008 08:10:30 -0500
>  >  From: "Greenseid, Joseph M." <[EMAIL PROTECTED]>
>  >  Subject: Re: [Oscar-users] How to make job run?
>  >  To: <oscar-users@lists.sourceforge.net>,
>  >         <oscar-users@lists.sourceforge.net>
>  >  Message-ID:
>  >         <[EMAIL PROTECTED]>
>  >  Content-Type: text/plain;       charset="iso-8859-1"
>  >
>  >  If you're using Maui for scheduling, try running the command `sudo 
> checkjob [job-id]` (or run as root if you don't have sudo set up).  This 
> gives more info than tracejob, and may tell you something a little more 
> helpful, like "not enough available resources," etc.
>  >
>  >  --Joe
>  >
>  >  ________________________________
>  >
>  >  From: [EMAIL PROTECTED] on behalf of Chunwei Han
>  >  Sent: Mon 3/17/2008 11:10 PM
>  >  To: oscar-users@lists.sourceforge.net
>  >  Subject: [Oscar-users] How to make job run?
>  >
>  >
>  >
>  >  Hey guys
>  >
>  >  Recently I take over the job as the cluster administrator. Unluckily
>  >  the former administrator did not leave any documents and I have no
>  >  experience at all. Right now the job can not be submitted. Here is
>  >  some debugger info:
>  >
>  >  # qstat
>  >  Job id           Name             User               Time Use S Queue
>  >  ---------------- ---------------- ------------------ -------- - -----
>  >  21205.parellel   bdimer           vuser                   0 Q workq
>  >  21207.parellel   gram0            vuser                   0 Q workq
>  >
>  >  # tracejob -n 10 21205
>  >
>  >  Job: 21205.parellel
>  >
>  >  03/17/2008 22:23:13  S    Job Queued at request of [EMAIL PROTECTED],
>  >  owner = [EMAIL PROTECTED], job name = bdimer, queue = workq
>  >  03/17/2008 22:23:13  A    queue=workq
>  >
>  >
>  >  So, how to make it run?
>  >  The version of OSCAR is 3.0
>  >
>  >  -------------------------------------------------------------------------
>  >  This SF.net email is sponsored by: Microsoft
>  >  Defy all challenges. Microsoft(R) Visual Studio 2008.
>  >  http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
>  >  _______________________________________________
>  >  Oscar-users mailing list
>  >  Oscar-users@lists.sourceforge.net
>  >  https://lists.sourceforge.net/lists/listinfo/oscar-users
>  >
>  >
>  >
>  >
>  >
>  >  ------------------------------
>  >
>  >  Message: 4
>  >  Date: Tue, 18 Mar 2008 07:48:13 -0700 (PDT)
>  >  From: arun shankar <[EMAIL PROTECTED]>
>  >  Subject: Re: [Oscar-users] How to make job run?
>  >  To: oscar-users@lists.sourceforge.net
>  >  Message-ID: <[EMAIL PROTECTED]>
>  >  Content-Type: text/plain; charset="us-ascii"
>  >
>  >  Try using the command ''qstat -atn'', which will give you more debugging 
> information ( extra comment line ), like ''Not Enough Resources Available'' 
> or ''License not valid''. Also try using the command ''qstat -f  <jobid>'' to 
> get more info.
>  >
>  >  If the comment says ''license not valid'', check for the license file ( i 
> guess it should be under <HOME directory>/server_priv/license_file if i am 
> not wrong )
>  >
>  >  Check the queue settings: # qmgr ( Enter )...Check for ''s q workq 
> started=true and s q workq enabled=true''; if either of these is false, make 
> it true.
>  >
>  >  Above are some steps you can perform if jobs go into queue state. Hope 
> this will be useful.
>  >
>  >  Regards
>  >  Arun
>  >
>  >
>  >
>  >  ----- Original Message ----
>  >  From: "Greenseid, Joseph M." <[EMAIL PROTECTED]>
>  >  To: oscar-users@lists.sourceforge.net; oscar-users@lists.sourceforge.net
>  >  Sent: Tuesday, March 18, 2008 9:10:30 PM
>  >  Subject: Re: [Oscar-users] How to make job run?
>  >
>  >  If you're using Maui for scheduling, try running the command `sudo 
> checkjob [job-id]` (or run as root if you don't have sudo set up).  This 
> gives more info than tracejob, and may tell you something a little more 
> helpful, like "not enough available resources," etc.
>  >
>  >  --Joe
>  >
>  >  ________________________________
>  >
>  >  From: [EMAIL PROTECTED] on behalf of Chunwei Han
>  >  Sent: Mon 3/17/2008 11:10 PM
>  >  To: oscar-users@lists.sourceforge.net
>  >  Subject: [Oscar-users] How to make job run?
>  >
>  >
>  >
>  >  Hey guys
>  >
>  >  Recently I take over the job as the cluster administrator. Unluckily
>  >  the former administrator did not leave any documents and I have no
>  >  experience at all. Right now the job can not be submitted. Here is
>  >  some debugger info:
>  >
>  >  # qstat
>  >  Job id          Name            User              Time Use S Queue
>  >  ---------------- ---------------- ------------------ -------- - -----
>  >  21205.parellel  bdimer          vuser                  0 Q workq
>  >  21207.parellel  gram0            vuser                  0 Q workq
>  >
>  >  # tracejob -n 10 21205
>  >
>  >  Job: 21205.parellel
>  >
>  >  03/17/2008 22:23:13  S    Job Queued at request of [EMAIL PROTECTED],
>  >  owner = [EMAIL PROTECTED], job name = bdimer, queue = workq
>  >  03/17/2008 22:23:13  A    queue=workq
>  >
>  >
>  >  So, how to make it run?
>  >  The version of OSCAR is 3.0
>  >
>  >  -------------------------------------------------------------------------
>  >  This SF.net email is sponsored by: Microsoft
>  >  Defy all challenges. Microsoft(R) Visual Studio 2008.
>  >  http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
>  >  _______________________________________________
>  >  Oscar-users mailing list
>  >  Oscar-users@lists.sourceforge.net
>  >  https://lists.sourceforge.net/lists/listinfo/oscar-users
>  >
>  >
>  >
>  >  -------------------------------------------------------------------------
>  >  This SF.net email is sponsored by: Microsoft
>  >  Defy all challenges. Microsoft(R) Visual Studio 2008.
>  >  http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
>  >  _______________________________________________
>  >  Oscar-users mailing list
>  >  Oscar-users@lists.sourceforge.net
>  >  https://lists.sourceforge.net/lists/listinfo/oscar-users
>  >
>  >
>  >       
> ____________________________________________________________________________________
>  >  Be a better friend, newshound, and
>  >  know-it-all with Yahoo! Mobile.  Try it now.  
> http://mobile.yahoo.com/;_ylt=Ahu06i62sR8HDtDypao8Wcj9tAcJ
>  >  -------------- next part --------------
>  >  An HTML attachment was scrubbed...
>  >
>  >  ------------------------------
>  >
>  >  -------------------------------------------------------------------------
>  >  This SF.net email is sponsored by: Microsoft
>  >  Defy all challenges. Microsoft(R) Visual Studio 2008.
>  >  http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
>  >
>  >  ------------------------------
>  >
>  >  _______________________________________________
>  >  Oscar-users mailing list
>  >  Oscar-users@lists.sourceforge.net
>  >  https://lists.sourceforge.net/lists/listinfo/oscar-users
>  >
>  >
>  >  End of Oscar-users Digest, Vol 22, Issue 7
>  >  ******************************************
>  >
>



-- 
+---------------------------------------------------------------------
| Chun-Wei Han
|
| College Of Chemistry and Molecular Engineering, Peking University
| Address : Room 809, New South Chemistry Building
| College of Chemistry and Molecular Engineering
| Peking University, Beijing 100871,
| P.R.China
| Office : +86-10-62751723
| E-Mail : [EMAIL PROTECTED]
+---------------------------------------------------------------------

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
Oscar-users mailing list
Oscar-users@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/oscar-users

Reply via email to