Hello, a colleague of mine has investigated a difficult problem which we traced to OpenMPI delivering incorrect data for some struct datatypes that use specific offsets (on the stack in our case, but the problem can also be reproduced by using specifically chosen slices of an array). Our library is used to aggregate several MPI communications in a generic and transparent manner, and therefore we need to be able to handle any combination of properly aligned offsets and component types.
The attached example program contains the necessary steps to reproduce the problem:
1. create the struct types in question
2. send/recv the data
3. compare to reference (said comparison works on several MPICH2 versions)

The code then prints any array indices/values not matching the reference. Our platform is Linux x86_64 with Debian squeeze; the tested versions of OpenMPI are the 1.4.2 version supplied with squeeze and 1.6.4, which we compiled ourselves. For 1.4.2 I also did a quick test in an i386 chroot and the code fails there too. gcc 4.6.1 was used for the x86_64 cases and gcc 4.3.5 for the i386 chroot.

Sorry if the test is not of minimal size, but we were happy once he got this down from several tens of thousands of lines of Fortran+C, and even that took more than a day once we understood the problem was unrelated to the Fortran program it originally occurred in.

When running the program with OpenMPI:

$ mpicc -std=gnu99 ./mpi_test.c && ./a.out
first tests:
second tests:
results_2[6] = 8
ref_results_2[6] = 12
results_2[7] = 9
ref_results_2[7] = 13

MPICH gives the expected result:

$ /sw/squeeze-x64/mpi/mpich2-1.4.1p1-gccsys/bin/mpicc -std=gnu99 ./mpi_test.c && ./a.out
first tests:
second tests:

Regards, Thomas
--
Thomas Jahns
DKRZ GmbH, Department: Application software
Deutsches Klimarechenzentrum
Bundesstraße 45a
D-20146 Hamburg
Phone: +49-40-460094-151
Fax: +49-40-460094-270
Email: Thomas Jahns <ja...@dkrz.de>
#include <mpi.h>
#include <stdio.h>

/**
 * Self-send test for MPI struct datatypes built from a vector type and an
 * indexed-block type placed at two different base addresses.
 *
 * expected output:
 *   first tests:
 *   second tests:
 *
 * actual output:
 *   first tests:
 *   second tests:
 *   results_2[6] = 8
 *   ref_results_2[6] = 12
 *   results_2[7] = 9
 *   ref_results_2[7] = 13
 */
void do_test(MPI_Datatype * recvs, MPI_Datatype * sends, int * inputs[2]);

/**
 * Byte displacement from @p base to @p target as an MPI address difference.
 *
 * Both addresses are obtained through MPI_Get_address, so no C pointer
 * subtraction is performed. That matters when the two pointers refer to
 * different array objects, where `target - base` is undefined behaviour.
 *
 * @param base    start of the communication buffer
 * @param target  location whose offset relative to @p base is wanted
 * @return        signed displacement in bytes
 */
static MPI_Aint byte_displacement(void *base, void *target)
{
    MPI_Aint base_addr, target_addr;

    MPI_Get_address(base, &base_addr);
    MPI_Get_address(target, &target_addr);
    return target_addr - base_addr;
}

/**
 * Builds the four component datatypes, runs the test on two input layouts
 * and frees the datatypes again. The tests only run when the communicator
 * has exactly one process; otherwise nothing is printed.
 */
int main(void)
{
    MPI_Init(NULL, NULL);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (size == 1) {
        MPI_Datatype sends[2], recvs[2];

        /* recvs[0]: two blocks of two ints, block starts four ints apart */
        {
            int count = 2, blocklen = 2, stride = 4;
            MPI_Type_vector(count, blocklen, stride, MPI_INT, &recvs[0]);
            MPI_Type_commit(&recvs[0]);
        }
        /* sends[0]: one block of four ints at element offset 4 */
        {
            int count = 1;
            int blocklength = 4;
            int array_of_displacements[] = {4};
            MPI_Type_create_indexed_block(count, blocklength,
                                          array_of_displacements, MPI_INT,
                                          &sends[0]);
            MPI_Type_commit(&sends[0]);
        }
        /* recvs[1]: same layout as sends[0] */
        {
            int count = 1;
            int blocklength = 4;
            int array_of_displacements[] = {4};
            MPI_Type_create_indexed_block(count, blocklength,
                                          array_of_displacements, MPI_INT,
                                          &recvs[1]);
            MPI_Type_commit(&recvs[1]);
        }
        /* sends[1]: same layout as recvs[0] */
        {
            int count = 2, blocklen = 2, stride = 4;
            MPI_Type_vector(count, blocklen, stride, MPI_INT, &sends[1]);
            MPI_Type_commit(&sends[1]);
        }

        /* First layout: the two input slices are separated by 8 filler ints. */
        {
            int raw_input[24] = {0,1,2,3,4,5,6,7,
                                 -2,-2,-2,-2,-2,-2,-2,-2,
                                 8,9,10,11,12,13,14,15};
            int * input_1 = &raw_input[0], * input_2 = &raw_input[16];
            int * inputs[2] = {input_1, input_2};
            puts("first tests:");
            do_test(recvs, sends, inputs);
        }
        /* Second layout: the two input slices are directly adjacent. */
        {
            int raw_input[16] = {0,1,2,3,4,5,6,7,
                                 8,9,10,11,12,13,14,15};
            int * input_1 = &raw_input[0], * input_2 = &raw_input[8];
            int * inputs[2] = {input_1, input_2};
            puts("second tests:");
            do_test(recvs, sends, inputs);
        }

        MPI_Type_free(&sends[1]);
        MPI_Type_free(&recvs[1]);
        MPI_Type_free(&sends[0]);
        MPI_Type_free(&recvs[0]);
    }

    MPI_Finalize();
    return 0;
}

/**
 * Sends inputs[0]/inputs[1] to this same process through a struct datatype
 * and compares what arrives in two local result arrays with the reference.
 *
 * @param recvs   two committed datatypes; recvs[0] describes the layout in
 *                the first result array, recvs[1] in the second
 * @param sends   two committed datatypes; sends[0] is applied at inputs[0],
 *                sends[1] at inputs[1]
 * @param inputs  the two send base pointers; inputs[0] is the send buffer
 *
 * Prints one pair of lines for every result element that differs from the
 * reference values; prints nothing when everything matches.
 */
void do_test(MPI_Datatype * recvs, MPI_Datatype * sends, int * inputs[2])
{
    int results_1[8] = {-1,-1,-1,-1,-1,-1,-1,-1},
        results_2[8] = {-1,-1,-1,-1,-1,-1,-1,-1};
    int * results[2] = {results_1, results_2};
    MPI_Datatype send_dt, recv_dt;

    {
        int count = 2;
        int array_of_blocklengths[] = {1, 1};
        /* results_1 and results_2 are separate arrays, so the offset is taken
         * from MPI addresses rather than from a pointer difference. */
        MPI_Aint array_of_displacements[] =
            {0, byte_displacement(results[0], results[1])};
        MPI_Type_create_struct(count, array_of_blocklengths,
                               array_of_displacements, recvs, &recv_dt);
        MPI_Type_commit(&recv_dt);
    }
    {
        int count = 2;
        int array_of_blocklengths[] = {1, 1};
        MPI_Aint array_of_displacements[] =
            {0, byte_displacement(inputs[0], inputs[1])};
        MPI_Type_create_struct(count, array_of_blocklengths,
                               array_of_displacements, sends, &send_dt);
        MPI_Type_commit(&send_dt);
    }

    /* Post the receive first so the blocking send to self can complete. */
    MPI_Request request;
    MPI_Irecv(results[0], 1, recv_dt, 0, 0, MPI_COMM_WORLD, &request);
    MPI_Send(inputs[0], 1, send_dt, 0, 0, MPI_COMM_WORLD);
    MPI_Waitall(1, &request, MPI_STATUSES_IGNORE);

    MPI_Type_free(&send_dt);
    MPI_Type_free(&recv_dt);

    int ref_results_1[8] = {4,5,-1,-1,6,7,-1,-1},
        ref_results_2[8] = {-1,-1,-1,-1,8,9,12,13};

    for (int i = 0; i < 8; ++i) {
        if (results_1[i] != ref_results_1[i]) {
            printf("results_1[%d] = %d\n"
                   "ref_results_1[%d] = %d\n",
                   i, results_1[i], i, ref_results_1[i]);
        }
        if (results_2[i] != ref_results_2[i]) {
            printf("results_2[%d] = %d\n"
                   "ref_results_2[%d] = %d\n",
                   i, results_2[i], i, ref_results_2[i]);
        }
    }
}
smime.p7s
Description: S/MIME Cryptographic Signature