George,

An improved patch is attached. The latter half is the same as your patch,
but again, I'm not sure this is a correct solution.

It works correctly for my attached put_dup_type_3.c. Run it as
"mpiexec -n 1 ./put_dup_type_3"; it will print seven OKs if everything
succeeds.
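For reference, the expected output of a successful run (the seven
check_value calls plus the final printf in the attached program) is:

  OK
  OK
  OK
  OK
  OK
  OK
  OK
  finished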
Regards,
KAWASHIMA Takahiro

> No. My patch doesn't work for a more simple case,
> just a duplicate of MPI_INT.
>
> Datatype is too complex for me ...
>
> Regards,
> KAWASHIMA Takahiro
>
> > George,
> >
> > Thanks. But no, your patch does not work correctly.
> >
> > The assertion failure disappears with your patch, but the value in
> > the target buffer of MPI_Put is not correct.
> >
> > In the rdma OSC (and pt2pt OSC), the following data are packed into
> > the send buffer by the ompi_osc_rdma_sendreq_send function on the
> > origin side:
> >
> >   - header
> >   - datatype description
> >   - user data
> >
> > User data are written at the offset
> > (sizeof(ompi_osc_rdma_send_header_t) + total_pack_size).
> >
> > In the case of the program attached to my previous mail, total_pack_size
> > is 32 because ompi_datatype_set_args sets 8 for MPI_COMBINER_DUP and
> > 24 for MPI_COMBINER_CONTIGUOUS. See the following code.
> >
> > ----------------------------------------------------------------
> > int32_t ompi_datatype_set_args(... snip ...)
> > {
> >     ... snip ...
> >     switch(type){
> >     ... snip ...
> >     case MPI_COMBINER_DUP:
> >         /* Recompute the data description packed size based on the optimization
> >          * for MPI_COMBINER_DUP.
> >          */
> >         pArgs->total_pack_size = 2 * sizeof(int);   #### total_pack_size = 8
> >         break;
> >     ... snip ...
> >     }
> >     ...
> >     for( pos = 0; pos < cd; pos++ ) {
> >         ... snip ...
> >         if( !(ompi_datatype_is_predefined(d[pos])) ) {
> >             ... snip ...
> >             pArgs->total_pack_size +=
> >                 ((ompi_datatype_args_t*)d[pos]->args)->total_pack_size;   #### total_pack_size += 24
> >             ... snip ...
> >         }
> >         ... snip ...
> >     }
> >     ... snip ...
> > }
> > ----------------------------------------------------------------
> >
> > But on the target side, user data are read at the offset
> > (sizeof(ompi_osc_rdma_send_header_t) + 24), because the
> > ompi_osc_base_datatype_create function, which is called by
> > ompi_osc_rdma_sendreq_recv_put, advances the offset by only 24 bytes,
> > not 32 bytes.
> >
> > So the wrong data are written to the target buffer.
> >
> > We need to take care of total_pack_size on the origin side.
> >
> > I modified the ompi_datatype_set_args function as a trial.
> >
> > Index: ompi/datatype/ompi_datatype_args.c
> > ===================================================================
> > --- ompi/datatype/ompi_datatype_args.c  (revision 28778)
> > +++ ompi/datatype/ompi_datatype_args.c  (working copy)
> > @@ -129,7 +129,7 @@
> >          /* Recompute the data description packed size based on the optimization
> >           * for MPI_COMBINER_DUP.
> >           */
> > -        pArgs->total_pack_size = 2 * sizeof(int);
> > +        pArgs->total_pack_size = 0;
> >          break;
> >
> >      case MPI_COMBINER_CONTIGUOUS:
> >
> > This patch, in addition to your patch, works correctly for my program.
> > But I'm not sure this is a correct solution.
> >
> > Regards,
> > KAWASHIMA Takahiro
> >
> > > Takahiro,
> > >
> > > Nice catch. That particular code was an over-optimization … that failed.
> > > Please try with the patch below.
> > >
> > > Let me know if it's working as expected, I will push it in the trunk once
> > > confirmed.
> > >
> > >   George.
> > >
> > >
> > > Index: ompi/datatype/ompi_datatype_args.c
> > > ===================================================================
> > > --- ompi/datatype/ompi_datatype_args.c  (revision 28787)
> > > +++ ompi/datatype/ompi_datatype_args.c  (working copy)
> > > @@ -449,9 +449,10 @@
> > >      }
> > >      /* For duplicated datatype we don't have to store all the information */
> > >      if( MPI_COMBINER_DUP == args->create_type ) {
> > > -        position[0] = args->create_type;
> > > -        position[1] = args->d[0]->id;  /* On the OMPI - layer, copy the ompi_datatype.id */
> > > -        return OMPI_SUCCESS;
> > > +        ompi_datatype_t* temp_data = args->d[0];
> > > +        return __ompi_datatype_pack_description(temp_data,
> > > +                                                packed_buffer,
> > > +                                                next_index );
> > >      }
> > >      position[0] = args->create_type;
> > >      position[1] = args->ci;
> > >
> > >
> > > On Jul 14, 2013, at 14:30, KAWASHIMA Takahiro
> > > <rivis.kawash...@nifty.com> wrote:
> > > > Hi,
> > > >
> > > > I encountered an assertion failure in Open MPI trunk and found a bug.
> > > >
> > > > See the attached program. It can be run with mpiexec -n 1.
> > > > It calls MPI_Put and writes one int value to the target side.
> > > > The target-side datatype is equivalent to MPI_INT, but is a derived
> > > > datatype created by MPI_Type_contiguous and MPI_Type_dup.
> > > >
> > > > The program aborts with the following output.
> > > >
> > > > ==========================================================================
> > > > #### dt1 (0x2626160) ####
> > > > type 2 count ints 1 count disp 0 count datatype 1
> > > > ints:   1
> > > > types:  MPI_INT
> > > > #### dt2 (0x2626340) ####
> > > > type 1 count ints 0 count disp 0 count datatype 1
> > > > types:  0x2626160
> > > > put_dup_type: ../../../ompi/datatype/ompi_datatype_args.c:565: __ompi_datatype_create_from_packed_description: Assertion `data_id < 45' failed.
> > > > [ppc:05244] *** Process received signal ***
> > > > [ppc:05244] Signal: Aborted (6)
> > > > [ppc:05244] Signal code: (-6)
> > > > [ppc:05244] [ 0] /lib/libpthread.so.0(+0xeff0) [0x7fe58a275ff0]
> > > > [ppc:05244] [ 1] /lib/libc.so.6(gsignal+0x35) [0x7fe589f371b5]
> > > > [ppc:05244] [ 2] /lib/libc.so.6(abort+0x180) [0x7fe589f39fc0]
> > > > [ppc:05244] [ 3] /lib/libc.so.6(__assert_fail+0xf1) [0x7fe589f30301]
> > > > [ppc:05244] [ 4] /ompi/lib/libmpi.so.0(+0x6504e) [0x7fe58a4e804e]
> > > > [ppc:05244] [ 5] /ompi/lib/libmpi.so.0(ompi_datatype_create_from_packed_description+0x23) [0x7fe58a4e8cf6]
> > > > [ppc:05244] [ 6] /ompi/lib/openmpi/mca_osc_rdma.so(+0xd04b) [0x7fe5839a104b]
> > > > [ppc:05244] [ 7] /ompi/lib/openmpi/mca_osc_rdma.so(ompi_osc_rdma_sendreq_recv_put+0xa8) [0x7fe5839a3ae5]
> > > > [ppc:05244] [ 8] /ompi/lib/openmpi/mca_osc_rdma.so(+0x86cc) [0x7fe58399c6cc]
> > > > [ppc:05244] [ 9] /ompi/lib/openmpi/mca_btl_self.so(mca_btl_self_send+0x87) [0x7fe58510bb04]
> > > > [ppc:05244] [10] /ompi/lib/openmpi/mca_osc_rdma.so(+0xc44b) [0x7fe5839a044b]
> > > > [ppc:05244] [11] /ompi/lib/openmpi/mca_osc_rdma.so(+0xd69d) [0x7fe5839a169d]
> > > > [ppc:05244] [12] /ompi/lib/openmpi/mca_osc_rdma.so(ompi_osc_rdma_flush+0x50) [0x7fe5839a1776]
> > > > [ppc:05244] [13] /ompi/lib/openmpi/mca_osc_rdma.so(ompi_osc_rdma_module_fence+0x8e6) [0x7fe5839a84ab]
> > > > [ppc:05244] [14] /ompi/lib/libmpi.so.0(MPI_Win_fence+0x16f) [0x7fe58a54127d]
> > > > [ppc:05244] [15] ompi-trunk/put_dup_type() [0x400d10]
> > > > [ppc:05244] [16] /lib/libc.so.6(__libc_start_main+0xfd) [0x7fe589f23c8d]
> > > > [ppc:05244] [17] put_dup_type() [0x400b09]
> > > > [ppc:05244] *** End of error message ***
> > > > --------------------------------------------------------------------------
> > > > mpiexec noticed that process rank 0 with PID 5244 on node ppc exited on
> > > > signal 6 (Aborted).
> > > > --------------------------------------------------------------------------
> > > > ==========================================================================
> > > >
> > > > The __ompi_datatype_create_from_packed_description function, in which
> > > > the assertion failure occurred, seems to expect the value of data_id
> > > > to be the ID of a predefined datatype. In my environment, the value of
> > > > data_id is 68, which is the ID of the datatype created by
> > > > MPI_Type_contiguous.
> > > >
> > > > In one-sided communication, the target-side datatype is encoded as a
> > > > 'description' on the origin side and then decoded on the target side.
> > > > I think there are problems in both the encoding stage and the decoding
> > > > stage.
> > > >
> > > > The __ompi_datatype_pack_description function in
> > > > ompi/datatype/ompi_datatype_args.c encodes the datatype.
> > > > For MPI_COMBINER_DUP on line 451, it encodes only create_type and id
> > > > and returns immediately. It doesn't encode the information of the base
> > > > datatype (in my case, the datatype created by MPI_Type_contiguous).
> > > >
> > > > The __ompi_datatype_create_from_packed_description function in
> > > > ompi/datatype/ompi_datatype_args.c decodes the description.
> > > > For MPI_COMBINER_DUP on line 557, it expects the value of data_id to
> > > > be the ID of a predefined datatype, which is not always true.
> > > >
> > > > I cannot fix this problem yet because I'm not familiar with the datatype
> > > > code in Open MPI. MPI_COMBINER_DUP is also used for predefined
> > > > datatypes, and the calculation of total_pack_size is also involved.
> > > > It seems not so simple.
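As a quick cross-check of the numbers above, here is a minimal standalone
illustration of that offset skew. Nothing below is OMPI code; the header
size is a made-up placeholder standing in for
sizeof(ompi_osc_rdma_send_header_t), and only the 8- and 24-byte figures
come from the thread.

----------------------------------------------------------------
#include <stdio.h>

int main(void)
{
    size_t header = 64;              /* placeholder for sizeof(ompi_osc_rdma_send_header_t) */

    /* Figures quoted above for dt = dup(contiguous(1, MPI_INT)):
     * ompi_datatype_set_args counts 8 bytes for MPI_COMBINER_DUP plus
     * 24 bytes for the MPI_COMBINER_CONTIGUOUS child. */
    size_t total_pack_size = 8 + 24; /* origin: offset used to place the user data */
    size_t packed_bytes    = 24;     /* target: bytes actually consumed by the unpacker
                                        once the pack_description fix is applied    */

    printf("origin writes user data at offset %zu\n", header + total_pack_size);
    printf("target reads user data at offset  %zu\n", header + packed_bytes);
    printf("skew of %zu bytes -> wrong value lands in the target buffer\n",
           total_pack_size - packed_bytes);
    return 0;
}
----------------------------------------------------------------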
Index: ompi/datatype/ompi_datatype_args.c
===================================================================
--- ompi/datatype/ompi_datatype_args.c  (revision 28778)
+++ ompi/datatype/ompi_datatype_args.c  (working copy)
@@ -129,7 +129,13 @@
         /* Recompute the data description packed size based on the optimization
          * for MPI_COMBINER_DUP.
          */
-        pArgs->total_pack_size = 2 * sizeof(int);
+        if (ompi_datatype_is_predefined(d[0])) {
+            /* MPI_COMBINER_DUP and d[0]->id */
+            pArgs->total_pack_size = 2 * sizeof(int);
+        } else {
+            /* don't pack the description of MPI_COMBINER_DUP itself */
+            pArgs->total_pack_size = 0;
+        }
         break;
 
     case MPI_COMBINER_CONTIGUOUS:
@@ -449,9 +455,10 @@
     }
     /* For duplicated datatype we don't have to store all the information */
     if( MPI_COMBINER_DUP == args->create_type ) {
-        position[0] = args->create_type;
-        position[1] = args->d[0]->id;  /* On the OMPI - layer, copy the ompi_datatype.id */
-        return OMPI_SUCCESS;
+        ompi_datatype_t* temp_data = args->d[0];
+        return __ompi_datatype_pack_description(temp_data,
+                                                packed_buffer,
+                                                next_index );
     }
     position[0] = args->create_type;
     position[1] = args->ci;
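To spell out why the first hunk distinguishes the two cases, here is another
small standalone sketch (again not OMPI code) of the resulting
total_pack_size accounting; the 24-byte contiguous figure is simply the
value quoted earlier in the thread.

----------------------------------------------------------------
#include <stdio.h>

int main(void)
{
    /* dup of a predefined type, e.g. dup(MPI_INT): the patch keeps
     * 2 * sizeof(int), i.e. MPI_COMBINER_DUP and d[0]->id as noted in
     * the patch comment. */
    size_t dup_of_predefined = 2 * sizeof(int);

    /* dup of a derived type, e.g. dup(contiguous(1, MPI_INT)): the patched
     * packer recurses into the base type and stores only its description,
     * so the DUP itself must add nothing. */
    size_t dup_of_derived = 0 + 24;

    printf("dup(MPI_INT):               total_pack_size = %zu\n", dup_of_predefined);
    printf("dup(contiguous(1,MPI_INT)): total_pack_size = %zu\n", dup_of_derived);
    return 0;
}
----------------------------------------------------------------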
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <mpi.h>

static MPI_Win win;
static int obuf[1], tbuf[100];

static void do_put(MPI_Datatype dt)
{
    memset(tbuf, 0, sizeof(tbuf));
    MPI_Win_fence(0, win);
    MPI_Put(obuf, 1, MPI_INT, 0, 0, 1, dt, win);
    MPI_Win_fence(0, win);
}

static void check_value(int expected, int actual)
{
    if (expected != actual) {
        printf("NG (expected: %d, actual: %d)\n", expected, actual);
    } else {
        printf("OK\n");
    }
}

int main(int argc, char *argv[])
{
    MPI_Datatype dt, dt1, dt2;

    obuf[0] = 77;

    MPI_Init(&argc, &argv);
    MPI_Win_create(tbuf, sizeof(int), 1, MPI_INFO_NULL, MPI_COMM_SELF, &win);

    /* case 1: predefined MPI_INT */
    do_put(MPI_INT);
    check_value(77, tbuf[0]);

    /* case 2: contiguous(1, MPI_INT) */
    MPI_Type_contiguous(1, MPI_INT, &dt);
    MPI_Type_commit(&dt);
    do_put(dt);
    MPI_Type_free(&dt);
    check_value(77, tbuf[0]);

    /* case 3: dup(MPI_INT) */
    MPI_Type_dup(MPI_INT, &dt);
    MPI_Type_commit(&dt);
    do_put(dt);
    MPI_Type_free(&dt);
    check_value(77, tbuf[0]);

    /* case 4: dup(contiguous(1, MPI_INT)) */
    MPI_Type_contiguous(1, MPI_INT, &dt1);
    MPI_Type_dup(dt1, &dt);
    MPI_Type_commit(&dt);
    do_put(dt);
    MPI_Type_free(&dt);
    MPI_Type_free(&dt1);
    check_value(77, tbuf[0]);

    /* case 5: contiguous(1, dup(MPI_INT)) */
    MPI_Type_dup(MPI_INT, &dt1);
    MPI_Type_contiguous(1, dt1, &dt);
    MPI_Type_commit(&dt);
    do_put(dt);
    MPI_Type_free(&dt);
    MPI_Type_free(&dt1);
    check_value(77, tbuf[0]);

    /* case 6: contiguous(1, dup(contiguous(1, MPI_INT))) */
    MPI_Type_contiguous(1, MPI_INT, &dt1);
    MPI_Type_dup(dt1, &dt2);
    MPI_Type_contiguous(1, dt2, &dt);
    MPI_Type_commit(&dt);
    do_put(dt);
    MPI_Type_free(&dt);
    MPI_Type_free(&dt2);
    MPI_Type_free(&dt1);
    check_value(77, tbuf[0]);

    /* case 7: dup(contiguous(1, dup(MPI_INT))) */
    MPI_Type_dup(MPI_INT, &dt1);
    MPI_Type_contiguous(1, dt1, &dt2);
    MPI_Type_dup(dt2, &dt);
    MPI_Type_commit(&dt);
    do_put(dt);
    MPI_Type_free(&dt);
    MPI_Type_free(&dt2);
    MPI_Type_free(&dt1);
    check_value(77, tbuf[0]);

    MPI_Win_free(&win);
    MPI_Finalize();

    printf("finished\n");

    return 0;
}
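For completeness, the program is built and run in the usual way (assuming
the trunk mpicc and mpiexec wrappers are in PATH):

  mpicc -o put_dup_type_3 put_dup_type_3.c
  mpiexec -n 1 ./put_dup_type_3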