I tried the following code without CUDA, the error is still there:

#include "mpi.h"
#include <cstdlib>
#include <cstring>
#include <cmath>

int main(int argc, char **argv)
{
    // override command line arguments to make sure cudaengine get the
correct one
    char **argv_new = new char*[ argc + 2 ];
    for( int i = 0 ; i < argc ; i++ )
    {
        argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
        strcpy( argv_new[i], argv[i] );
    }
    argv_new[ argc   ] = new char[ 32 ];
    argv_new[ argc+1 ] = new char[ 32 ];
    strcpy( argv_new[argc],   "-device" );
    sprintf( argv_new[argc+1], "%d", 0 );
    argc += 2;
    argv = argv_new;

    MPI_Init(&argc,&argv);

    // do something...

    MPI_Finalize();

    for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
    delete [] argv;
}

At the end of the program the pointer stored in argv is exactly that of
argv_new so this should not be a problem. Manually inserting printf tells
me that the fault occured at MPI_Init. The code works fine if I use
MPI_Init(NULL,NULL) instead. The same code also compiles and runs without a
problem on my laptop with mpich2-1.4.

Best,
Yu-Hang



On Tue, Nov 12, 2013 at 11:18 AM, Matthieu Brucher <
matthieu.bruc...@gmail.com> wrote:

> Hi,
>
> Are you sure this is the correct code? This seems strange and not a good
> idea:
>
>    MPI_Init(&argc,&argv);
>
>     // do something...
>
>     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>     delete [] argv;
>
> Did you mean argc_new and argv_new instead?
> Do you have the same error without CUDA?
>
> Cheers,
>
> Matthieu
>
>
> 2013/11/12 Tang, Yu-Hang <yuhang_t...@brown.edu>:
> > Hi,
> >
> > I tried to augment the command line argument list by allocating my own
> list
> > of strings and passing them to MPI_Init, yet I got a segmentation fault
> for
> > both OpenMPI 1.6.3 and 1.7.2, while the code works fine with MPICH2. The
> > code is:
> >
> > #include "mpi.h"
> > #include "cuda_runtime.h"
> > #include <cstdlib>
> > #include <cstring>
> > #include <cmath>
> >
> > int main(int argc, char **argv)
> > {
> >     int device = 0;
> >     int skip = 0;
> >     bool skipmode = false;
> >     bool specified = false;
> >     for( int i = 0 ; i < argc ; i++ )
> >     {
> >         if ( strcmp( argv[i], "-device" ) == 0 )
> >         {
> >             i++;
> >             if ( argv[i][0] == '-' )
> >             {
> >                 skipmode = true;
> >                 skip = fabs( atoi( argv[i] ) );
> >             }
> >             else
> >             {
> >                 skipmode = false;
> >                 device = atoi( argv[i] );
> >             }
> >             specified = true;
> >         }
> >     }
> >
> >     if ( !specified || skipmode )
> >     {
> >         char* var;
> >         int dev_count, local_rank = 0;
> >         if ( (var = getenv("SLURM_LOCALID")) != NULL) local_rank =
> > atoi(var);
> >         else if( (var = getenv("MV2_COMM_WORLD_LOCAL_RANK"))  != NULL)
> > local_rank = atoi(var);
> >         else if( (var = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL)
> > local_rank = atoi(var);
> >         cudaGetDeviceCount( &dev_count );
> >         if ( skipmode )
> >         {
> >             device = 0;
> >             if ( device == skip ) local_rank++;
> >             while( local_rank-- > 0 )
> >             {
> >                 device = (++device) % dev_count;
> >                 if ( device == skip ) local_rank++;
> >             }
> >         }
> >         else device = local_rank % dev_count;
> >     }
> >
> >     // override command line arguments to make sure cudaengine get the
> > correct one
> >     char **argv_new = new char*[ argc + 2 ];
> >     for( int i = 0 ; i < argc ; i++ )
> >     {
> >         argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
> >         strcpy( argv_new[i], argv[i] );
> >     }
> >     argv_new[ argc   ] = new char[ 32 ];
> >     argv_new[ argc+1 ] = new char[ 32 ];
> >     strcpy( argv_new[argc],   "-device" );
> >     sprintf( argv_new[argc+1], "%d", device );
> >     argc += 2;
> >     argv = argv_new;
> >
> >     cudaSetDevice( device );
> >
> >     MPI_Init(&argc,&argv);
> >
> >     // do something...
> >
> >     MPI_Finalize();
> >
> >     cudaDeviceReset();
> >     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
> >     delete [] argv;
> > }
> >
> > When compiled using nvcc -ccbin mpic++, The error I got was:
> >
> > [jueying:16317] *** Process received signal ***
> > [jueying:16317] Signal: Segmentation fault (11)
> > [jueying:16317] Signal code: Address not mapped (1)
> > [jueying:16317] Failing at address: 0x21
> > [jueying:16317] [ 0] /usr/lib64/libpthread.so.0() [0x39e5e0f000]
> > [jueying:16317] [ 1] /usr/lib64/libc.so.6() [0x39e5760551]
> > [jueying:16317] [ 2]
> > /opt/openmpi/1.7.2/lib/libopen-pal.so.5(opal_argv_join+0x39)
> > [0x7f460b993079]
> > [jueying:16317] [ 3]
> /opt/openmpi/1.7.2/lib/libmpi.so.1(ompi_mpi_init+0x347)
> > [0x7f460c106a57]
> > [jueying:16317] [ 4] /opt/openmpi/1.7.2/lib/libmpi.so.1(MPI_Init+0x16b)
> > [0x7f460c12523b]
> > [jueying:16317] [ 5] ./lmp_jueying() [0x40c035]
> > [jueying:16317] [ 6] /usr/lib64/libc.so.6(__libc_start_main+0xf5)
> > [0x39e5621a05]
> > [jueying:16317] [ 7] ./lmp_jueying() [0x40dd21]
> > [jueying:16317] *** End of error message ***
> >
> > Thanks for the help.
> >
> > Best regards,
> > Yu-Hang Tang
> >
> > _______________________________________________
> > users mailing list
> > us...@open-mpi.org
> > http://www.open-mpi.org/mailman/listinfo.cgi/users
>
>
>
> --
> Information System Engineer, Ph.D.
> Blog: http://matt.eifelle.com
> LinkedIn: http://www.linkedin.com/in/matthieubrucher
> Music band: http://liliejay.com/
> _______________________________________________
> users mailing list
> us...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/users
>



-- 
Yu-Hang Tang
Room 105, 37 Manning St
Division of Applied Mathematics, Brown University
Providence, RI 02912

Reply via email to