Attaching the test for reproduction.

On Mon, Jan 19, 2015 at 11:48 AM, Alina Sklarevich <ali...@dev.mellanox.co.il> wrote:
> Dear OMPI community,
>
> We observe a segmentation fault in our regression testing. Our initial
> investigation shows that it happens with any 1.8.x release and with any
> PML/BTL/MTL combo on two processes, when running the MPICH one-sided
> accumulate-fence test, attached to this report, with the following
> command line:
>
> $mpirun -np 2 --bind-to core --display-map --map-by node -mca pml ob1 -mca btl self,openib ../test/mpi/rma/accfence1
>
> The initial trace is:
>
> Data for JOB [16088,1] offset 0
>
> ========================   JOB MAP   ========================
>
> Data for node: vegas15  Num slots: 16  Max slots: 0  Num procs: 1
>         Process OMPI jobid: [16088,1] App: 0 Process rank: 0
>
> Data for node: vegas16  Num slots: 16  Max slots: 0  Num procs: 1
>         Process OMPI jobid: [16088,1] App: 0 Process rank: 1
>
> =============================================================
>
> [vegas16:22098] *** Process received signal ***
> [vegas16:22098] Signal: Segmentation fault (11)
> [vegas16:22098] Signal code: Address not mapped (1)
> [vegas16:22098] Failing at address: 0x34
> [vegas16:22098] [ 0] /lib64/libpthread.so.0[0x3f6e80f710]
> [vegas16:22098] [ 1] /labhome/alinas/workspace/ompi/openmpi-1.8.4/install/lib/libopen-pal.so.6(opal_memory_ptmalloc2_int_free+0x188)[0x7ffff772baa2]
> [vegas16:22098] [ 2] /labhome/alinas/workspace/ompi/openmpi-1.8.4/install/lib/libopen-pal.so.6(opal_memory_ptmalloc2_free+0x98)[0x7ffff772a1f5]
> [vegas16:22098] [ 3] /labhome/alinas/workspace/ompi/openmpi-1.8.4/install/lib/libopen-pal.so.6(+0xd6f59)[0x7ffff7728f59]
> [vegas16:22098] [ 4] /labhome/alinas/workspace/ompi/openmpi-1.8.4/install/lib/libmpi.so.1(+0x2f884)[0x7ffff7c92884]
> [vegas16:22098] [ 5] /labhome/alinas/workspace/ompi/openmpi-1.8.4/install/lib/libmpi.so.1(ompi_attr_delete_all+0x2eb)[0x7ffff7c92dbe]
> [vegas16:22098] [ 6] /labhome/alinas/workspace/ompi/openmpi-1.8.4/install/lib/libmpi.so.1(ompi_comm_free+0x6a)[0x7ffff7c99336]
> [vegas16:22098] [ 7] /labhome/alinas/workspace/ompi/openmpi-1.8.4/install/lib/openmpi/mca_osc_rdma.so(ompi_osc_rdma_free+0x921)[0x7ffff32ab3bc]
> [vegas16:22098] [ 8] /labhome/alinas/workspace/ompi/openmpi-1.8.4/install/lib/libmpi.so.1(ompi_win_free+0x24)[0x7ffff7cc0c87]
> [vegas16:22098] [ 9] /labhome/alinas/workspace/ompi/openmpi-1.8.4/install/lib/libmpi.so.1(MPI_Win_free+0xb8)[0x7ffff7d2b702]
> [vegas16:22098] [10] /labhome/alinas/workspace/mpich/mpich-mellanox/test/mpi/rma/accfence1[0x402447]
> [vegas16:22098] [11] /lib64/libc.so.6(__libc_start_main+0xfd)[0x3f6e41ed1d]
> [vegas16:22098] [12] /labhome/alinas/workspace/mpich/mpich-mellanox/test/mpi/rma/accfence1[0x402119]
> [vegas16:22098] *** End of error message ***
>
> Subsequent investigation of the core file provides the following hints:
>
> (gdb) bt
> #0  0x00007ffff7722a96 in opal_memory_ptmalloc2_int_free (av=0x7ffff796b320, mem=0x7125a0) at malloc.c:4402
> #1  0x00007ffff77211f5 in opal_memory_ptmalloc2_free (mem=0x7125a0) at malloc.c:3511
> #2  0x00007ffff771ff59 in opal_memory_linux_free_hook (__ptr=0x7125a0, caller=0x7ffff769a8f6) at hooks.c:709
> #3  0x00007ffff769a8f6 in opal_datatype_destruct (datatype=0x7123b0) at opal_datatype_create.c:59
> #4  0x00007ffff3346ad0 in opal_obj_run_destructors (object=0x7123b0) at ../../../../opal/class/opal_object.h:448
> #5  0x00007ffff334af68 in process_acc (module=0x70e370, source=0, acc_header=0x70fef0) at osc_rdma_data_move.c:1184
> #6  0x00007ffff334c752 in process_frag (module=0x70e370, frag=0x70fee0) at osc_rdma_data_move.c:1576
> #7  0x00007ffff334cafb in ompi_osc_rdma_callback (request=0x700b80) at osc_rdma_data_move.c:1656
> #8  0x00007ffff3db3770 in ompi_request_complete (request=0x700b80, with_signal=true) at ../../../../ompi/request/request.h:402
> #9  0x00007ffff3db3f11 in recv_request_pml_complete (recvreq=0x700b80) at pml_ob1_recvreq.h:181
> #10 0x00007ffff3db5019 in mca_pml_ob1_recv_frag_callback_match (btl=0x7ffff41d9c20, tag=65 'A', des=0x7fffffffd210, cbdata=0x0) at pml_ob1_recvfrag.c:243
> #11 0x00007ffff3fd6c4b in mca_btl_sm_component_progress () at btl_sm_component.c:1087
> #12 0x00007ffff7678d66 in opal_progress () at runtime/opal_progress.c:187
> #13 0x00007ffff3dabb44 in opal_condition_wait (c=0x7ffff7ffa120, m=0x7ffff7ffa160) at ../../../../opal/threads/condition.h:78
> #14 0x00007ffff3dabcc6 in ompi_request_wait_completion (req=0x7fffffffd410) at ../../../../ompi/request/request.h:381
> #15 0x00007ffff3dac9da in mca_pml_ob1_recv (addr=0x7fffffffd9ec, count=1, datatype=0x7ffff7fe25c0, src=0, tag=-24, comm=0x70dac0, status=0x0) at pml_ob1_irecv.c:109
> #16 0x00007ffff2cd2868 in ompi_coll_tuned_scatter_intra_basic_linear (sbuf=0x0, scount=1, sdtype=0x7ffff7fe25c0, rbuf=0x7fffffffd9ec, rcount=1, rdtype=0x7ffff7fe25c0, root=0, comm=0x70dac0, module=0x70fa20) at coll_tuned_scatter.c:231
> #17 0x00007ffff2cbbd75 in ompi_coll_tuned_scatter_intra_dec_fixed (sbuf=0x0, scount=1, sdtype=0x7ffff7fe25c0, rbuf=0x7fffffffd9ec, rcount=1, rdtype=0x7ffff7fe25c0, root=0, comm=0x70dac0, module=0x70fa20) at coll_tuned_decision_fixed.c:769
> #18 0x00007ffff3b9c16e in mca_coll_basic_reduce_scatter_block_intra (sbuf=0x70e220, rbuf=0x7fffffffd9ec, rcount=1, dtype=0x7ffff7fe25c0, op=0x60d180, comm=0x70dac0, module=0x70f230) at coll_basic_reduce_scatter_block.c:102
> #19 0x00007ffff334eecc in ompi_osc_rdma_fence (assert=0, win=0x70e260) at osc_rdma_active_target.c:140
> #20 0x00007ffff7d2a1b5 in PMPI_Win_fence (assert=0, win=0x70e260) at pwin_fence.c:59
> #21 0x0000000000402405 in main ()
>
> It looks to be a clear case of memory corruption hidden somewhere in the
> OSC code. Nathan, can you please have a look?
>
> Thanks,
> Alina.
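For anyone who wants to poke at this without pulling in the MPICH test harness, the failing pattern boils down to a window create / accumulate with MPI_REPLACE / fence / window free cycle. Below is a minimal standalone sketch of that cycle; the buffer length and iteration count are arbitrary choices of mine, and since the core backtrace implicates a datatype destructor (opal_datatype_destruct in frame #3) while the original test sweeps derived datatypes, this plain-MPI_INT version may or may not be enough to trigger the crash.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#define COUNT 1024   /* arbitrary buffer length */
#define ITERS 64     /* arbitrary number of create/free cycles */

int main( int argc, char *argv[] )
{
    int rank, size, i, j;
    int *winbuf, *sendbuf;
    MPI_Win win;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    winbuf  = malloc( COUNT * sizeof(int) );
    sendbuf = malloc( COUNT * sizeof(int) );
    for (j = 0; j < COUNT; j++) sendbuf[j] = j;

    /* The two reported crash sites are under MPI_Win_free (signal trace)
       and in the progress loop under MPI_Win_fence (core backtrace), so
       exercise both by repeating the whole create/free cycle. */
    for (i = 0; i < ITERS; i++) {
        MPI_Win_create( winbuf, (MPI_Aint)(COUNT * sizeof(int)), sizeof(int),
                        MPI_INFO_NULL, MPI_COMM_WORLD, &win );
        MPI_Win_fence( 0, win );
        if (rank == 0) {
            /* Replace-accumulate into the last rank, as in accfence1 */
            MPI_Accumulate( sendbuf, COUNT, MPI_INT, size - 1, 0,
                            COUNT, MPI_INT, MPI_REPLACE, win );
        }
        MPI_Win_fence( 0, win );
        MPI_Win_free( &win );
    }

    free( winbuf );
    free( sendbuf );
    if (rank == 0) printf( "done\n" );
    MPI_Finalize();
    return 0;
}

Built with mpicc and launched with the same mpirun options quoted above (substituting the binary path), this exercises the same one-sided code paths on two processes.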
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *
 *  (C) 2003 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */
#include "mpi.h"
#include <stdio.h>
#include "mpitest.h"

/*
static char MTEST_Descrip[] = "Accumulate/Replace with Fence";
*/

int main( int argc, char *argv[] )
{
    int errs = 0, err;
    int rank, size, source, dest;
    int minsize = 2, count;
    MPI_Comm      comm;
    MPI_Win       win;
    MPI_Aint      extent;
    MTestDatatype sendtype, recvtype;

    MTest_Init( &argc, &argv );

    /* The following illustrates the use of the routines to
       run through a selection of communicators and datatypes.
       Use subsets of these for tests that do not involve combinations
       of communicators, datatypes, and counts of datatypes */
    while (MTestGetIntracommGeneral( &comm, minsize, 1 )) {
        if (comm == MPI_COMM_NULL) continue;
        /* Determine the sender and receiver */
        MPI_Comm_rank( comm, &rank );
        MPI_Comm_size( comm, &size );
        source = 0;
        dest   = size - 1;

        MTEST_DATATYPE_FOR_EACH_COUNT(count) {
            while (MTestGetDatatypes( &sendtype, &recvtype, count )) {

                /* Make sure that everyone has a recv buffer */
                recvtype.InitBuf( &recvtype );

                MPI_Type_extent( recvtype.datatype, &extent );
                MPI_Win_create( recvtype.buf, recvtype.count * extent,
                                (int)extent, MPI_INFO_NULL, comm, &win );
                MPI_Win_fence( 0, win );
                if (rank == source) {
                    sendtype.InitBuf( &sendtype );

                    /* To improve reporting of problems about operations, we
                       change the error handler to errors return */
                    MPI_Win_set_errhandler( win, MPI_ERRORS_RETURN );

                    /* MPI_REPLACE on accumulate is almost the same
                       as MPI_Put; the only difference is in the
                       handling of overlapping accumulate operations,
                       which are not tested here */
                    err = MPI_Accumulate( sendtype.buf, sendtype.count,
                                          sendtype.datatype, dest, 0,
                                          recvtype.count, recvtype.datatype,
                                          MPI_REPLACE, win );
                    if (err) {
                        errs++;
                        if (errs < 10) {
                            printf( "Accumulate types: send %s, recv %s\n",
                                    MTestGetDatatypeName( &sendtype ),
                                    MTestGetDatatypeName( &recvtype ) );
                            MTestPrintError( err );
                        }
                    }
                    err = MPI_Win_fence( 0, win );
                    if (err) {
                        errs++;
                        if (errs < 10) {
                            MTestPrintError( err );
                        }
                    }
                }
                else if (rank == dest) {
                    MPI_Win_fence( 0, win );
                    /* This should have the same effect, in terms of
                       transferring data, as a send/recv pair */
                    err = MTestCheckRecv( 0, &recvtype );
                    if (err) {
                        errs += err;
                    }
                }
                else {
                    MPI_Win_fence( 0, win );
                }
                MPI_Win_free( &win );
                MTestFreeDatatype( &sendtype );
                MTestFreeDatatype( &recvtype );
            }
        }
        MTestFreeComm(&comm);
    }

    MTest_Finalize( errs );
    MPI_Finalize();
    return 0;
}
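As the inline comment in the test notes, an accumulate with MPI_REPLACE transfers the same data as a plain put when targets do not overlap; for comparison, the equivalent call using the test's own variables would read roughly:

err = MPI_Put( sendtype.buf, sendtype.count, sendtype.datatype,
               dest, 0 /* target_disp */, recvtype.count,
               recvtype.datatype, win );

The semantic difference only shows up under concurrent overlapping updates to the same window locations, where MPI_Accumulate is defined (applied element-wise in some order) while overlapping puts are erroneous, which is why the test sticks with the accumulate form.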