Hello list, 


I have an application that uses MPI_Allgather with derived datatypes. It works
correctly with MPICH2 and MVAPICH2, but it crashes intermittently with Open MPI
(1.4.3, judging by the install paths in the backtrace below). After some
investigation I found that the crash occurs when I use derived datatypes with
MPI_Allgather and the number of ranks is greater than 8. I've written a small
reproducer that demonstrates the crash: it simply calls MPI_Allgather with a
derived datatype consisting of one shifted integer. The sample runs correctly
with 2-8 ranks, but with more than 8 ranks it crashes with a segmentation fault
inside MPI_Type_free, MPI_Allgather, or MPI_Type_create_struct. The same sample
also runs correctly under MVAPICH2 and MPICH2 with any number of ranks. Is this
a limitation of Open MPI's allgather?
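For reference, I launch the reproducer with something like the following (the
binary name is taken from the backtrace below; 9 is the smallest rank count
that crashes for me):

    mpirun -np 9 ./gather_openmpi_153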



Crashed output:

Press any key...
Press any key...
Press any key...
Press any key...
Press any key...
Press any key...
Press any key...
Press any key...
Press any key...

[amd1:24260] *** Process received signal ***
[amd1:24260] Signal: Segmentation fault (11)
[amd1:24260] Signal code: Address not mapped (1)
[amd1:24260] Failing at address: 0x18
[amd1:24262] *** Process received signal ***
[amd1:24262] Signal: Segmentation fault (11)
[amd1:24262] Signal code: Address not mapped (1)
[amd1:24262] Failing at address: 0x18
[amd1:24258] *** Process received signal ***
[amd1:24258] Signal: Segmentation fault (11)
[amd1:24258] Signal code: Address not mapped (1)
[amd1:24258] Failing at address: 0x18
[amd1:24260] [ 0] /lib64/libpthread.so.0 [0x3d6b20eb10]
[amd1:24260] [ 1] /lib64/libc.so.6 [0x3d6a671d80]
[amd1:24260] [ 2] /lib64/libc.so.6(cfree+0x4b) [0x3d6a67276b]
[amd1:24260] [ 3] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libopen-pal.so.0(opal_free+0x4e) [0x2ae52f5836bd]
[amd1:24260] [ 4] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1 [0x2ae52efd05aa]
[amd1:24260] [ 5] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1 [0x2ae52efd1e20]
[amd1:24260] [ 6] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1(ompi_ddt_destroy+0xe3) [0x2ae52efd1d7b]
[amd1:24260] [ 7] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1(MPI_Type_free+0xf0) [0x2ae52f0202ec]
[amd1:24260] [ 8] ./gather_openmpi_153(main+0xef) [0x400dc8]
[amd1:24260] [ 9] /lib64/libc.so.6(__libc_start_main+0xf4) [0x3d6a61d994]
[amd1:24260] [10] ./gather_openmpi_153 [0x400ba9]
[amd1:24260] *** End of error message ***
[amd1:24262] [ 0] /lib64/libpthread.so.0 [0x3d6b20eb10]
[amd1:24262] [ 1] /lib64/libc.so.6 [0x3d6a671d80]
[amd1:24262] [ 2] /lib64/libc.so.6(cfree+0x4b) [0x3d6a67276b]
[amd1:24262] [ 3] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libopen-pal.so.0(opal_free+0x4e) [0x2aedeea596bd]
[amd1:24262] [ 4] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1 [0x2aedee4a65aa]
[amd1:24262] [ 5] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1 [0x2aedee4a7e20]
[amd1:24262] [ 6] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1(ompi_ddt_destroy+0xe3) [0x2aedee4a7d7b]
[amd1:24262] [ 7] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1(MPI_Type_free+0xf0) [0x2aedee4f62ec]
[amd1:24262] [ 8] ./gather_openmpi_153(main+0xef) [0x400dc8]
[amd1:24262] [ 9] /lib64/libc.so.6(__libc_start_main+0xf4) [0x3d6a61d994]
[amd1:24262] [10] ./gather_openmpi_153 [0x400ba9]
[amd1:24262] *** End of error message ***
[amd1:24258] [ 0] /lib64/libpthread.so.0 [0x3d6b20eb10]
[amd1:24258] [ 1] /lib64/libc.so.6 [0x3d6a671d80]
[amd1:24258] [ 2] /lib64/libc.so.6(cfree+0x4b) [0x3d6a67276b]
[amd1:24258] [ 3] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libopen-pal.so.0(opal_free+0x4e) [0x2ab92cc786bd]
[amd1:24258] [ 4] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1 [0x2ab92c6c55aa]
[amd1:24258] [ 5] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1 [0x2ab92c6c6e20]
[amd1:24258] [ 6] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1(ompi_ddt_destroy+0xe3) [0x2ab92c6c6d7b]
[amd1:24258] [ 7] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1(MPI_Type_free+0xf0) [0x2ab92c7152ec]
[amd1:24258] [ 8] ./gather_openmpi_153(main+0xef) [0x400dc8]
[amd1:24258] [ 9] /lib64/libc.so.6(__libc_start_main+0xf4) [0x3d6a61d994]
[amd1:24258] [10] ./gather_openmpi_153 [0x400ba9]
[amd1:24258] *** End of error message ***
[amd1:24256] *** Process received signal ***
[amd1:24256] Signal: Segmentation fault (11)
[amd1:24256] Signal code: Address not mapped (1)
[amd1:24256] Failing at address: 0x18
[amd1:24256] [ 0] /lib64/libpthread.so.0 [0x3d6b20eb10]
[amd1:24256] [ 1] /lib64/libc.so.6 [0x3d6a671d80]
[amd1:24256] [ 2] /lib64/libc.so.6(cfree+0x4b) [0x3d6a67276b]
[amd1:24256] [ 3] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libopen-pal.so.0(opal_free+0x4e) [0x2ba5a8c866bd]
[amd1:24256] [ 4] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1 [0x2ba5a86d35aa]
[amd1:24256] [ 5] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1 [0x2ba5a86d4e20]
[amd1:24256] [ 6] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1(ompi_ddt_destroy+0xe3) [0x2ba5a86d4d7b]
[amd1:24256] [ 7] /hpc/home/USERS/senina/projects/openmpi-1.4.3/install/lib/libmpi.so.1(MPI_Type_free+0xf0) [0x2ba5a87232ec]
[amd1:24256] [ 8] ./gather_openmpi_153(main+0xef) [0x400dc8]
[amd1:24256] [ 9] /lib64/libc.so.6(__libc_start_main+0xf4) [0x3d6a61d994]
[amd1:24256] [10] ./gather_openmpi_153 [0x400ba9]
[amd1:24256] *** End of error message ***
--------------------------------------------------------------------------
mpirun noticed that process rank 7 with PID 24262 on node amd1 exited on
signal 11 (Segmentation fault).
--------------------------------------------------------------------------



The sample source code:

#include "mpi.h"

#include <string.h>

#include "stdio.h"

#include <stdlib.h>



#define MAX_ERROR_LENGTH 128



#define num_types 3

MPI_Aint disp[num_types] = {0, 10, 20};

MPI_Datatype type[num_types] = {MPI_LB, MPI_INT, MPI_UB};

int blocklen[num_types] = {1, 1, 1};



void generate_derived_type(MPI_Datatype *dtype)

{

    int rank;

    int status;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);





    status = MPI_Type_create_struct( num_types, blocklen, disp, type, dtype
);

    if (status != MPI_SUCCESS)

    {

        printf("status of create_struct = %i, rank = %i\n",

                               status, rank);

    }



    status = MPI_Type_commit( dtype );

    if (status != MPI_SUCCESS)

    {

        printf("status of type_commit = %i, rank = %i\n",

           status, rank);

    }

}



int main(int argc, char *argv[])

{

    int i;

    int rank, size;

    MPI_Datatype stype;

    int *send_buf = 0, *recv_buf = 0;



    send_buf = (int*)calloc(sizeof(int), 10000000);

    recv_buf = (int*)calloc(sizeof(int), 10000000);



    MPI_Init(&argc, &argv);



    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Comm_size(MPI_COMM_WORLD, &size);



    printf("Press any key...\n");

    getchar();



    for (i = 0; i < 20000; i++)

    {

       int status = 0;

        generate_derived_type(&stype);



        status = MPI_Allgather(send_buf, 1, stype, recv_buf, 1, stype,
MPI_COMM_WORLD);

        if (status != MPI_SUCCESS)

        {

            printf("status of all_gather = %i, rank = %i\n", 

                    status, rank);

        }



        status = MPI_Type_free( &stype);

        if (status != MPI_SUCCESS)

        {

            printf("status of type_free = %i, rank = %i\n",

                    status, rank);

        }

    }



    MPI_Finalize();



    free(send_buf);

    free(recv_buf);



    return 0;

}
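For completeness, here is how the same layout could be built without the
deprecated MPI_LB/MPI_UB markers, using MPI_Type_create_resized instead. This
is just a sketch (the function name generate_resized_type is mine); I have not
checked whether this variant avoids the crash:

/* Sketch: one MPI_INT at byte offset 10, with the extent forced to
 * [0, 20) via MPI_Type_create_resized instead of MPI_LB/MPI_UB.
 * Untested against this Open MPI build. */
void generate_resized_type(MPI_Datatype *dtype)
{
    MPI_Datatype tmp;
    MPI_Aint offset = 10;
    int one = 1;
    MPI_Datatype base = MPI_INT;

    /* a struct holding a single int at byte offset 10 */
    MPI_Type_create_struct(1, &one, &offset, &base, &tmp);

    /* lower bound 0, extent 20, matching the LB/UB version above */
    MPI_Type_create_resized(tmp, 0, 20, dtype);
    MPI_Type_free(&tmp);
    MPI_Type_commit(dtype);
}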





Thanks, 

Andrew.
