Hi,

Am 07.03.2014 um 12:28 schrieb Petar Penchev:

> I have a rocks-cluster 6.1 using OGS2011.11p1 and I am trying to use the
> PlatformMPI parallel libraries. My problem is that when I submit a job
> using qsub test.sh, the job starts only on one node with 16 processes
> and not on both nodes. The -pe pmpi, which I am using for now, is only a
> copy of mpi.

Does the definition of the PE pmpi also include -catch_rsh? Recent versions of
IBM/Platform-MPI can cope with a machine file in the MPICH(1) format, which is 
created by /usr/sge/mpi/startmpi.sh.

In addition you need the following settings for a tight integration. Please try:

...
export MPI_REMSH=rsh
export MPI_TMPDIR=$TMPDIR
mpiexec -np $NSLOTS -machinefile $TMPDIR/machines $BIN $ARGS

-- Reuti


> What am I missing? Does anyone have a working -pe submit script, or some
> hints on how to get this working?
> 
> Thanks in advance,
> Petar
> 
> [root@rocks mpi]# test.sh
> #!/bin/bash
> #$ -N lsdyna
> #$ -S /bin/bash
> #$ -pe pmpi 16
> #$ -cwd
> #$ -o lsdyna.out
> #$ -e lsdyna.err
> ###
> #$ -q test.q
> ### -notify
> export MPI_ROOT=/export/apps/platform_mpi
> export LD_LIBRARY_PATH=/export/apps/platform_mpi/lib/linux_amd64
> export PATH=/export/apps/platform_mpi/bin
> BIN="/export/apps/lsdyna/ls-dyna_mpp_s_r6_1_2_85274_x64_redhat54_ifort120_sse2_platformmpi.exe"
> ARGS="i=test.k"
> mpirun -np $NSLOTS $BIN $ARGS
> 
> 
> [root@rocks mpi]# qconf -sq test.q
> qname                 test.q
> hostlist              mnode01 mnode02
> seq_no                0
> load_thresholds       np_load_avg=1.75
> suspend_thresholds    NONE
> nsuspend              1
> suspend_interval      00:05:00
> priority              0
> min_cpu_interval      00:05:00
> processors            UNDEFINED
> qtype                 BATCH INTERACTIVE
> ckpt_list             NONE
> pe_list               pmpi
> rerun                 FALSE
> slots                 8
> tmpdir                /tmp
> shell                 /bin/bash
> prolog                NONE
> epilog                NONE
> shell_start_mode      unix_behavior
> starter_method        NONE
> suspend_method        NONE
> resume_method         NONE
> terminate_method      NONE
> notify                00:00:60
> owner_list            NONE
> user_lists            NONE
> xuser_lists           NONE
> subordinate_list      NONE
> complex_values        NONE
> projects              NONE
> xprojects             NONE
> calendar              NONE
> initial_state         default
> s_rt                  INFINITY
> h_rt                  INFINITY
> s_cpu                 INFINITY
> h_cpu                 INFINITY
> s_fsize               INFINITY
> h_fsize               INFINITY
> s_data                INFINITY
> h_data                INFINITY
> s_stack               INFINITY
> h_stack               INFINITY
> s_core                INFINITY
> h_core                INFINITY
> s_rss                 INFINITY
> h_rss                 INFINITY
> s_vmem                INFINITY
> h_vmem                INFINITY
> _______________________________________________
> users mailing list
> [email protected]
> https://gridengine.org/mailman/listinfo/users


_______________________________________________
users mailing list
[email protected]
https://gridengine.org/mailman/listinfo/users

Reply via email to