I don't think that's true in the case of argv, as that is a pointer... but <shrug> either way, this isn't an OMPI problem.
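For anyone landing here from a search, the fix the thread converges on can be sketched as follows. This is illustrative rather than canonical: keep your own argc_new/argv_new (the names follow the code quoted below), make the vector NULL-terminated just as the C standard guarantees for the real argv, and free only memory you allocated yourself:

#include "mpi.h"
#include <cstring>
#include <cstdio>

int main( int argc, char **argv )
{
    // Keep our own counter and vector; leave the system argc/argv alone.
    int    argc_new = argc + 2;
    // argc_new + 1 slots so that argv_new[argc_new] == NULL, mirroring
    // the argv[argc] == NULL guarantee of the C standard.
    char **argv_new = new char*[ argc_new + 1 ];
    for( int i = 0 ; i < argc ; i++ )
    {
        argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
        strcpy( argv_new[i], argv[i] );
    }
    argv_new[ argc ]     = new char[ 32 ];
    argv_new[ argc + 1 ] = new char[ 32 ];
    strcpy( argv_new[ argc ], "-device" );
    sprintf( argv_new[ argc + 1 ], "%d", 0 );
    argv_new[ argc_new ] = NULL;   // the terminator the crash was missing

    MPI_Init( &argc_new, &argv_new );

    // do something...

    MPI_Finalize();

    // Free only our own copies, never the system argv entries. Walking to
    // the NULL terminator stays safe even if MPI_Init compacted the vector
    // (any shifted-out entries would leak, which is fine for a sketch).
    for( int i = 0 ; argv_new[i] != NULL ; i++ ) delete [] argv_new[i];
    delete [] argv_new;
    return 0;
}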
On Nov 12, 2013, at 9:09 AM, Matthieu Brucher <matthieu.bruc...@gmail.com> wrote:

> I understand why he did this; it's only the main argc/argv values that
> are changed, not the actual system values (my mistake as well, I
> overlooked his code and wasn't paying attention to the details!).
> Still, keeping different names would be best for code reviews and code
> understanding.
>
> The fact that the error is not caught is because opal_argv_join
> doesn't get argc as one of its parameters, so it can't check the
> value. It just assumes the standard was respected.
>
> Matthieu
>
> 2013/11/12 Ralph Castain <r...@open-mpi.org>:
>>
>> On Nov 12, 2013, at 8:56 AM, Matthieu Brucher <matthieu.bruc...@gmail.com> wrote:
>>
>> It seems that argv[argc] should always be NULL according to the
>> standard.
>>
>> That is definitely true.
>>
>> So the OMPI failure is not actually a bug!
>>
>> I think that is true as well, though I suppose we could try to catch it
>> (doubtful - what if it isn't NULL but garbage? After all, you are looking
>> past the end of the array).
>>
>> Something else is also wrong here. You are never allowed to release argv
>> entries, as those belong to the system, so the last loop in your program is
>> wrong. You also do something else that is wrong: you create a new argv
>> array (argv_new), but then you set argv to point to that array, which
>> messes up the system array again. On top of that, you changed the system
>> value of argc instead of setting your own variable.
>>
>> Cheers,
>>
>> 2013/11/12 Matthieu Brucher <matthieu.bruc...@gmail.com>:
>>
>> Interestingly enough, in ompi_mpi_init, opal_argv_join is called
>> without the array length, so I suppose that in the usual argc/argv
>> pair, argv has an additional entry which may be NULL. So try
>> allocating 3 additional values, the last being NULL, and it may work.
>>
>> Cheers,
>>
>> Matthieu
>>
>> 2013/11/12 Tang, Yu-Hang <yuhang_t...@brown.edu>:
>>
>> I tried the following code without CUDA; the error is still there:
>>
>> #include "mpi.h"
>>
>> #include <cstdlib>
>> #include <cstring>
>> #include <cmath>
>>
>> int main(int argc, char **argv)
>> {
>>     // override command line arguments to make sure cudaengine gets the correct one
>>     char **argv_new = new char*[ argc + 2 ];
>>     for( int i = 0 ; i < argc ; i++ )
>>     {
>>         argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
>>         strcpy( argv_new[i], argv[i] );
>>     }
>>     argv_new[ argc ]   = new char[ 32 ];
>>     argv_new[ argc+1 ] = new char[ 32 ];
>>     strcpy( argv_new[argc], "-device" );
>>     sprintf( argv_new[argc+1], "%d", 0 );
>>
>>     argc += 2;
>>     argv = argv_new;
>>
>>     MPI_Init(&argc,&argv);
>>
>>     // do something...
>>
>>     MPI_Finalize();
>>
>>     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>>     delete [] argv;
>> }
>>
>> At the end of the program the pointer stored in argv is exactly that of
>> argv_new, so this should not be a problem. Manually inserting printf tells me
>> that the fault occurred at MPI_Init. The code works fine if I use
>> MPI_Init(NULL,NULL) instead. The same code also compiles and runs without a
>> problem on my laptop with mpich2-1.4.
>>
>> Best,
>> Yu-Hang
>>
>> On Tue, Nov 12, 2013 at 11:18 AM, Matthieu Brucher
>> <matthieu.bruc...@gmail.com> wrote:
>>
>> Hi,
>>
>> Are you sure this is the correct code? This seems strange and not a good
>> idea:
>>
>> MPI_Init(&argc,&argv);
>>
>> // do something...
>>
>> for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>> delete [] argv;
>>
>> Did you mean argc_new and argv_new instead?
>> Do you have the same error without CUDA?
>>
>> Cheers,
>>
>> Matthieu
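To illustrate the failure mode Matthieu describes: a join over a NULL-terminated vector takes no length argument and simply walks until it hits a NULL pointer. The sketch below is not OMPI's actual opal_argv_join, only the general shape of such a function; if the caller's vector lacks the terminating NULL, both loops read past the end of the array, which matches the reported crash:

#include <cstring>

// Sketch only -- not the real opal_argv_join. It receives no argc; it
// trusts the argv[argc] == NULL guarantee of the C standard.
char *argv_join_sketch( char **argv, char delimiter )
{
    // First pass: size the result. Walks until a NULL entry is found;
    // with no terminator this reads past the end of the array.
    size_t len = 0;
    for( char **p = argv ; *p != NULL ; p++ )
        len += strlen( *p ) + 1;           // string + delimiter or NUL
    if( len == 0 ) len = 1;                // room for "" on an empty vector

    char *joined = new char[ len ];
    char *out    = joined;
    // Second pass: copy entries, delimiting all but the last.
    for( char **p = argv ; *p != NULL ; p++ )
    {
        size_t n = strlen( *p );
        memcpy( out, *p, n );
        out += n;
        *out++ = ( *(p + 1) != NULL ) ? delimiter : '\0';
    }
    if( out == joined ) *out = '\0';       // empty vector case
    return joined;                         // caller must delete [] it
}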
>> 2013/11/12 Tang, Yu-Hang <yuhang_t...@brown.edu>:
>>
>> Hi,
>>
>> I tried to augment the command-line argument list by allocating my own
>> list of strings and passing them to MPI_Init, yet I got a segmentation
>> fault for both OpenMPI 1.6.3 and 1.7.2, while the code works fine with
>> MPICH2. The code is:
>>
>> #include "mpi.h"
>> #include "cuda_runtime.h"
>> #include <cstdlib>
>> #include <cstring>
>> #include <cmath>
>>
>> int main(int argc, char **argv)
>> {
>>     int device = 0;
>>     int skip = 0;
>>     bool skipmode = false;
>>     bool specified = false;
>>     for( int i = 0 ; i < argc ; i++ )
>>     {
>>         if ( strcmp( argv[i], "-device" ) == 0 )
>>         {
>>             i++;
>>             if ( argv[i][0] == '-' )
>>             {
>>                 skipmode = true;
>>                 skip = fabs( atoi( argv[i] ) );
>>             }
>>             else
>>             {
>>                 skipmode = false;
>>                 device = atoi( argv[i] );
>>             }
>>             specified = true;
>>         }
>>     }
>>
>>     if ( !specified || skipmode )
>>     {
>>         char* var;
>>         int dev_count, local_rank = 0;
>>         if ( (var = getenv("SLURM_LOCALID")) != NULL) local_rank = atoi(var);
>>         else if( (var = getenv("MV2_COMM_WORLD_LOCAL_RANK")) != NULL) local_rank = atoi(var);
>>         else if( (var = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL) local_rank = atoi(var);
>>         cudaGetDeviceCount( &dev_count );
>>         if ( skipmode )
>>         {
>>             device = 0;
>>             if ( device == skip ) local_rank++;
>>             while( local_rank-- > 0 )
>>             {
>>                 device = (++device) % dev_count;
>>                 if ( device == skip ) local_rank++;
>>             }
>>         }
>>         else device = local_rank % dev_count;
>>     }
>>
>>     // override command line arguments to make sure cudaengine gets the correct one
>>     char **argv_new = new char*[ argc + 2 ];
>>     for( int i = 0 ; i < argc ; i++ )
>>     {
>>         argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
>>         strcpy( argv_new[i], argv[i] );
>>     }
>>     argv_new[ argc ]   = new char[ 32 ];
>>     argv_new[ argc+1 ] = new char[ 32 ];
>>     strcpy( argv_new[argc], "-device" );
>>     sprintf( argv_new[argc+1], "%d", device );
>>     argc += 2;
>>     argv = argv_new;
>>
>>     cudaSetDevice( device );
>>
>>     MPI_Init(&argc,&argv);
>>
>>     // do something...
>>
>>     MPI_Finalize();
>>
>>     cudaDeviceReset();
>>     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>>     delete [] argv;
>> }
>>
>> When compiled using nvcc -ccbin mpic++, the error I got was:
>>
>> [jueying:16317] *** Process received signal ***
>> [jueying:16317] Signal: Segmentation fault (11)
>> [jueying:16317] Signal code: Address not mapped (1)
>> [jueying:16317] Failing at address: 0x21
>> [jueying:16317] [ 0] /usr/lib64/libpthread.so.0() [0x39e5e0f000]
>> [jueying:16317] [ 1] /usr/lib64/libc.so.6() [0x39e5760551]
>> [jueying:16317] [ 2] /opt/openmpi/1.7.2/lib/libopen-pal.so.5(opal_argv_join+0x39) [0x7f460b993079]
>> [jueying:16317] [ 3] /opt/openmpi/1.7.2/lib/libmpi.so.1(ompi_mpi_init+0x347) [0x7f460c106a57]
>> [jueying:16317] [ 4] /opt/openmpi/1.7.2/lib/libmpi.so.1(MPI_Init+0x16b) [0x7f460c12523b]
>> [jueying:16317] [ 5] ./lmp_jueying() [0x40c035]
>> [jueying:16317] [ 6] /usr/lib64/libc.so.6(__libc_start_main+0xf5) [0x39e5621a05]
>> [jueying:16317] [ 7] ./lmp_jueying() [0x40dd21]
>> [jueying:16317] *** End of error message ***
>>
>> Thanks for the help.
>>
>> Best regards,
>> Yu-Hang Tang
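As for the side question of whether reassigning main's argc/argv can "mess up the system array": function parameters are local copies, so assigning to them changes only that frame's view of the arguments; the actual hazard is freeing entries the runtime owns. A tiny illustration (the helper name is hypothetical):

#include <cstdio>

// Hypothetical helper: reassigning parameters changes only local copies.
void reassign( int argc, char **argv )
{
    static char *fake[] = { (char *)"fake", NULL };
    argc = 1;
    argv = fake;                 // only this frame's pointer changes
    printf( "inside:  argv[0] = %s\n", argv[0] );
}

int main( int argc, char **argv )
{
    reassign( argc, argv );
    // The real argument vector is untouched by the reassignment above.
    printf( "outside: argv[0] = %s\n", argv[0] );
    return 0;
}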