Dear list,

The attached program deadlocks in MPI_File_write_all when run with 16 processes on two 8-core nodes of an InfiniBand cluster. It runs fine when I

a) use tcp (see the command line below)
or
b) replace MPI_File_write_all with MPI_File_write
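
For (a) I force the tcp BTL with the standard MCA parameter, e.g.

mpirun --mca btl tcp,self -np 16 ./writetest

(./writetest is just a placeholder name for the compiled test program.)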

I'm using Open MPI 1.3.2 (but I checked that the problem also occurs with version 1.3.3). The OFED version is 1.4 (installed via Rocks). The operating system is CentOS 5.2.

I compile with gcc-4.1.2. The Open MPI configure flags are:

../../configure --prefix=/share/apps/openmpi/1.3.2/gcc-4.1.2/ --with-io-romio-flags=--with-file-system=nfs+ufs+pvfs2 --with-wrapper-ldflags=-L/share/apps/pvfs2/lib CPPFLAGS=-I/share/apps/pvfs2/include/ LDFLAGS=-L/share/apps/pvfs2/lib LIBS="-lpvfs2 -lpthread"

The user home directories are mounted via NFS.

Is this a problem with the user code, the system, or with Open MPI?

Thanks,
Dorian

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <assert.h>

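/* Per-rank block lengths and displacements (three blocks per rank,
   16 ranks); both are scaled by a factor of 96 below to give byte
   counts and byte offsets in the file. */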
int blocklength[16][3] = { { 2471, 2471, 2471 },
                           { 4392, 4392, 4392 },
                           { 4392, 4392, 4392 },
                           { 2470, 2470, 2470 },
                           { 2956, 2956, 2956 },
                           { 5256, 5256, 5256 },
                           { 5256, 5256, 5256 },
                           { 2957, 2957, 2957 },
                           { 2903, 2903, 2903 },
                           { 5160, 5160, 5160 },
                           { 5160, 5160, 5160 },
                           { 2902, 2902, 2902 },
                           { 2470, 2470, 2470 },
                           { 4392, 4392, 4392 },
                           { 4392, 4392, 4392 },
                           { 2471, 2471, 2471 }
                        };
int       displ[16][3] = { {    0,  60000,  120000 },
                           { 2471,  62471,  122471 },
                           { 6863,  66863,  126863 },
                           { 11255, 71255,  131255 },
                           { 13725, 73725,  133725 },
                           { 16681, 76681,  136681 },
                           { 21937, 81937,  141937 },
                           { 27193, 87193,  147193 },
                           { 30150, 90150,  150150 },
                           { 33053, 93053,  153053 },
                           { 38213, 98213,  158213 },
                           { 43373, 103373, 163373 },
                           { 46275, 106275, 166275 },
                           { 48745, 108745, 168745 },
                           { 53137, 113137, 173137 },
                           { 57529, 117529, 177529 }
                        };

int main(int argc, char **argv)
{
        int b[3], d[3], size;

        MPI_Init(&argc,&argv);

        int myrank;
        MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

        int numprocs;
        MPI_Comm_size(MPI_COMM_WORLD, &numprocs);

        MPI_File fh;
        MPI_File_open(MPI_COMM_WORLD, "write.data",
                      MPI_MODE_CREATE | MPI_MODE_WRONLY | MPI_MODE_UNIQUE_OPEN,
                      MPI_INFO_NULL, &fh);

        /* Sanity check: each rank's blocks must start exactly where
           the previous rank's blocks end. */
        if(0 < myrank) {
                for(int k = 0; k < 3; ++k) {
                        assert(displ[myrank][k] ==
                               displ[myrank-1][k] + blocklength[myrank-1][k]);
                }
        }

        size = 0;
        for(int k = 0; k < 3; ++k) {
                b[k]  = 96*blocklength[myrank][k];
                d[k]  = 96*displ[myrank][k];
                size += b[k];
        }

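        /* Build an indexed datatype describing this rank's three
           disjoint, contiguous regions of the file. */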
        MPI_Datatype block;
        MPI_Type_indexed(3, b, d, MPI_CHAR, &block);
        MPI_Type_commit(&block);

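        /* Displacement 0, etype MPI_CHAR; the view lets each rank see
           only its own three regions of the shared file. */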
        MPI_File_set_view(fh, 0, MPI_CHAR, block, "native", MPI_INFO_NULL);

        char *buf = new char[size]();

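        /* Collective write of exactly one filetype's worth of data;
           this is the call that hangs with 16 ranks over InfiniBand. */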
        MPI_File_write_all(fh, buf, size, MPI_CHAR, MPI_STATUS_IGNORE);

        MPI_File_close(&fh);
        MPI_Type_free(&block);

        delete[] buf;

        return MPI_Finalize();
}

