Ok, I think I have found the problem. During std::vector::push_back or emplace_back a reallocation can happen, and thus the memory locations that I handed to MPI_Isend become invalid.
My loop now reads: std::vector<MPI_EventData> eventSendBuf(eventsSize); // Buffer to hold the MPI_EventData object for (int i = 0; i < eventsSize; ++i) { MPI_Request req; eventSendBuf.at(i).size = 5; cout << "Isending event " << i << endl; MPI_Isend(&eventSendBuf[i], 1, MPI_EVENTDATA, 0, 0, MPI_COMM_WORLD, &req); requests.push_back(req); } Best, Florian Am 19.02.2018 um 10:14 schrieb Florian Lindner: > Hello, > > I am having problems understanding an error valgrind gives me. I tried to boil > down the program as much as possible. The > original program as well as the test example both work fine, but when I link > the created library to another application > I get segfaults. I think that this piece of code is to blame. I ran valgrind > on it and got an invalid read. > > The code can be seen at > https://gist.github.com/floli/d62d16ce7cabb4522e2ae7e6b3cfda43 or below. > > It's about 60 lines of C/C++ code. > > I have also attached the valgrind report below the code. > > The code registers a custom MPI datatype and sends it using an isend. It > does not crash or produce invalid data, but > I fear that the invalid-read message from valgrind is a hint of an existing > memory corruption. > > But I have no idea where that could happen. > > OpenMPI 3.0.0 @ Arch > > I am very thankful for any hints whatsoever! 
> > Florian > > > > > > // Compile and test with: mpicxx -std=c++11 -g -O0 mpitest.cpp && > LD_PRELOAD=/usr/lib/valgrind/libmpiwrap-amd64-linux.so mpirun -n 1 valgrind > --read-var-info=yes --leak-check=full ./a.out > > #include <vector> > #include <iostream> > > #include <mpi.h> > > using namespace std; > > struct MPI_EventData > { > int size; > }; > > > void collect() > { > // Register MPI datatype > MPI_Datatype MPI_EVENTDATA; > int blocklengths[] = {1}; > MPI_Aint displacements[] = {offsetof(MPI_EventData, size) }; > MPI_Datatype types[] = {MPI_INT}; > MPI_Type_create_struct(1, blocklengths, displacements, types, > &MPI_EVENTDATA); > MPI_Type_commit(&MPI_EVENTDATA); > > int rank, MPIsize; > MPI_Comm_rank(MPI_COMM_WORLD, &rank); > MPI_Comm_size(MPI_COMM_WORLD, &MPIsize); > > std::vector<MPI_Request> requests; > std::vector<int> eventsPerRank(MPIsize); > size_t eventsSize = 3; // each rank sends three events, invalid read > happens only if eventsSize > 1 > MPI_Gather(&eventsSize, 1, MPI_INT, eventsPerRank.data(), 1, MPI_INT, 0, > MPI_COMM_WORLD); > > std::vector<MPI_EventData> eventSendBuf; // Buffer to hold the > MPI_EventData object > > for (int i = 0; i < eventsSize; ++i) { > MPI_EventData eventdata; > MPI_Request req; > > eventdata.size = 5; > eventSendBuf.push_back(eventdata); > > cout << "Isending event " << i << endl; > MPI_Isend(&eventSendBuf.back(), 1, MPI_EVENTDATA, 0, 0, MPI_COMM_WORLD, > &req); > requests.push_back(req); > } > > if (rank == 0) { > for (int i = 0; i < MPIsize; ++i) { > for (int j = 0; j < eventsPerRank[i]; ++j) { > MPI_EventData ev; > MPI_Recv(&ev, 1, MPI_EVENTDATA, i, MPI_ANY_TAG, MPI_COMM_WORLD, > MPI_STATUS_IGNORE); > > cout << "Received Size = " << ev.size << endl; > } > } > } > MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); > MPI_Type_free(&MPI_EVENTDATA); > } > > > int main(int argc, char *argv[]) > { > MPI_Init(&argc, &argv); > > collect(); > > MPI_Finalize(); > } > > > /* > > % mpicxx -std=c++11 -g -O0 
mpitest.cpp && > LD_PRELOAD=/usr/lib/valgrind/libmpiwrap-amd64-linux.so mpirun -n 1 valgrind > --read-var-info=yes --leak-check=full ./a.out > ==13584== Memcheck, a memory error detector > ==13584== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al. > ==13584== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info > ==13584== Command: ./a.out > ==13584== > valgrind MPI wrappers 13584: Active for pid 13584 > valgrind MPI wrappers 13584: Try MPIWRAP_DEBUG=help for possible options > ==13584== Thread 3: > ==13584== Syscall param epoll_pwait(sigmask) points to unaddressable byte(s) > ==13584== at 0x61A0FE6: epoll_pwait (in /usr/lib/libc-2.26.so) > ==13584== by 0x677CDDC: ??? (in /usr/lib/openmpi/libopen-pal.so.40.0.0) > ==13584== by 0x6780EDA: opal_libevent2022_event_base_loop (in > /usr/lib/openmpi/libopen-pal.so.40.0.0) > ==13584== by 0x93100CE: ??? (in > /usr/lib/openmpi/openmpi/mca_pmix_pmix2x.so) > ==13584== by 0x5E9408B: start_thread (in /usr/lib/libpthread-2.26.so) > ==13584== by 0x61A0E7E: clone (in /usr/lib/libc-2.26.so) > ==13584== Address 0x0 is not stack'd, malloc'd or (recently) free'd > ==13584== > Isending event 0 > ==13584== Thread 1: > ==13584== Invalid read of size 2 > ==13584== at 0x4C33B20: memmove (vg_replace_strmem.c:1258) > ==13584== by 0x11A7BB: MPI_EventData* std::__copy_move<true, true, > std::random_access_iterator_tag>::__copy_m<MPI_EventData>(MPI_EventData > const*, MPI_EventData const*, MPI_EventData*) > (stl_algobase.h:368) > ==13584== by 0x11A70B: MPI_EventData* std::__copy_move_a<true, > MPI_EventData*, MPI_EventData*>(MPI_EventData*, > MPI_EventData*, MPI_EventData*) (stl_algobase.h:386) > ==13584== by 0x11A62B: MPI_EventData* std::__copy_move_a2<true, > MPI_EventData*, MPI_EventData*>(MPI_EventData*, > MPI_EventData*, MPI_EventData*) (stl_algobase.h:424) > ==13584== by 0x11A567: MPI_EventData* > std::copy<std::move_iterator<MPI_EventData*>, > MPI_EventData*>(std::move_iterator<MPI_EventData*>, > 
std::move_iterator<MPI_EventData*>, MPI_EventData*) (stl_algobase.h:456) > ==13584== by 0x11A478: MPI_EventData* > std::__uninitialized_copy<true>::__uninit_copy<std::move_iterator<MPI_EventData*>, > MPI_EventData*>(std::move_iterator<MPI_EventData*>, > std::move_iterator<MPI_EventData*>, MPI_EventData*) > (stl_uninitialized.h:101) > ==13584== by 0x11A306: MPI_EventData* > std::uninitialized_copy<std::move_iterator<MPI_EventData*>, > MPI_EventData*>(std::move_iterator<MPI_EventData*>, > std::move_iterator<MPI_EventData*>, MPI_EventData*) > (stl_uninitialized.h:134) > ==13584== by 0x11A05B: MPI_EventData* > std::__uninitialized_copy_a<std::move_iterator<MPI_EventData*>, > MPI_EventData*, > MPI_EventData>(std::move_iterator<MPI_EventData*>, > std::move_iterator<MPI_EventData*>, MPI_EventData*, > std::allocator<MPI_EventData>&) (stl_uninitialized.h:289) > ==13584== by 0x119AEC: MPI_EventData* > std::__uninitialized_move_if_noexcept_a<MPI_EventData*, MPI_EventData*, > std::allocator<MPI_EventData> >(MPI_EventData*, MPI_EventData*, > MPI_EventData*, std::allocator<MPI_EventData>&) > (stl_uninitialized.h:312) > ==13584== by 0x1190D2: void std::vector<MPI_EventData, > std::allocator<MPI_EventData> >> ::_M_realloc_insert<MPI_EventData >> const&>(__gnu_cxx::__normal_iterator<MPI_EventData*, >> std::vector<MPI_EventData, > std::allocator<MPI_EventData> > >, MPI_EventData const&) (vector.tcc:424) > ==13584== by 0x118B17: std::vector<MPI_EventData, > std::allocator<MPI_EventData> >::push_back(MPI_EventData const&) > (stl_vector.h:948) > ==13584== by 0x113B6E: collect() (mpitest.cpp:42) > ==13584== Address 0xd315ca0 is 0 bytes inside a block of size 4 alloc'd > ==13584== at 0x4C2D54F: operator new(unsigned long) > (vg_replace_malloc.c:334) > ==13584== by 0x11A2BB: > __gnu_cxx::new_allocator<MPI_EventData>::allocate(unsigned long, void const*) > (new_allocator.h:111) > ==13584== by 0x119FE2: std::allocator_traits<std::allocator<MPI_EventData> >> 
::allocate(std::allocator<MPI_EventData>&, unsigned long) >> (alloc_traits.h:436) > ==13584== by 0x119A87: std::_Vector_base<MPI_EventData, > std::allocator<MPI_EventData> >::_M_allocate(unsigned long) > (stl_vector.h:172) > ==13584== by 0x119055: void std::vector<MPI_EventData, > std::allocator<MPI_EventData> >> ::_M_realloc_insert<MPI_EventData >> const&>(__gnu_cxx::__normal_iterator<MPI_EventData*, >> std::vector<MPI_EventData, > std::allocator<MPI_EventData> > >, MPI_EventData const&) (vector.tcc:406) > ==13584== by 0x118B17: std::vector<MPI_EventData, > std::allocator<MPI_EventData> >::push_back(MPI_EventData const&) > (stl_vector.h:948) > ==13584== by 0x113B6E: collect() (mpitest.cpp:42) > ==13584== by 0x113DEE: main (mpitest.cpp:68) > ==13584== > Isending event 1 > Isending event 2 > Received Size = 5 > Received Size = 5 > Received Size = 5 > ==13584== > ==13584== HEAP SUMMARY: > ==13584== in use at exit: 1,898 bytes in 44 blocks > ==13584== total heap usage: 18,037 allocs, 17,993 frees, 4,086,090 bytes > allocated > ==13584== > ==13584== 5 bytes in 1 blocks are definitely lost in loss record 1 of 44 > ==13584== at 0x4C2CEDF: malloc (vg_replace_malloc.c:299) > ==13584== by 0x6130B9A: strdup (in /usr/lib/libc-2.26.so) > ==13584== by 0x954B720: ??? > ==13584== by 0x9312913: ??? > ==13584== by 0x9313114: ??? > ==13584== by 0x930FE31: ??? > ==13584== by 0x92CA3AD: ??? > ==13584== by 0x92A3ED7: ??? > ==13584== by 0x867C876: ??? > ==13584== by 0x647BBEA: orte_init (in > /usr/lib/openmpi/libopen-rte.so.40.0.0) > ==13584== by 0x52E3F77: ompi_mpi_init (in > /usr/lib/openmpi/libmpi.so.40.0.0) > ==13584== by 0x530B78D: PMPI_Init (in /usr/lib/openmpi/libmpi.so.40.0.0) > ==13584== > ==13584== 12 bytes in 1 blocks are definitely lost in loss record 2 of 44 > ==13584== at 0x4C2CEDF: malloc (vg_replace_malloc.c:299) > ==13584== by 0x6130B9A: strdup (in /usr/lib/libc-2.26.so) > ==13584== by 0x954FA92: ??? > ==13584== by 0x9323720: ??? > ==13584== by 0x92CA651: ??? 
> ==13584== by 0x92A3ED7: ??? > ==13584== by 0x867C876: ??? > ==13584== by 0x647BBEA: orte_init (in > /usr/lib/openmpi/libopen-rte.so.40.0.0) > ==13584== by 0x52E3F77: ompi_mpi_init (in > /usr/lib/openmpi/libmpi.so.40.0.0) > ==13584== by 0x530B78D: PMPI_Init (in /usr/lib/openmpi/libmpi.so.40.0.0) > ==13584== by 0x4E4A2C0: PMPI_Init (libmpiwrap.c:2271) > ==13584== by 0x113DE9: main (mpitest.cpp:66) > ==13584== > ==13584== 35 bytes in 1 blocks are definitely lost in loss record 24 of 44 > ==13584== at 0x4C2CEDF: malloc (vg_replace_malloc.c:299) > ==13584== by 0x6130B9A: strdup (in /usr/lib/libc-2.26.so) > ==13584== by 0x954B6B8: ??? > ==13584== by 0x9312913: ??? > ==13584== by 0x9313114: ??? > ==13584== by 0x930FE31: ??? > ==13584== by 0x92CA3AD: ??? > ==13584== by 0x92A3ED7: ??? > ==13584== by 0x867C876: ??? > ==13584== by 0x647BBEA: orte_init (in > /usr/lib/openmpi/libopen-rte.so.40.0.0) > ==13584== by 0x52E3F77: ompi_mpi_init (in > /usr/lib/openmpi/libmpi.so.40.0.0) > ==13584== by 0x530B78D: PMPI_Init (in /usr/lib/openmpi/libmpi.so.40.0.0) > ==13584== > ==13584== 1,608 (320 direct, 1,288 indirect) bytes in 1 blocks are definitely > lost in loss record 44 of 44 > ==13584== at 0x4C2F0FF: realloc (vg_replace_malloc.c:785) > ==13584== by 0x92B1D7E: ??? > ==13584== by 0x92B9FFE: ??? > ==13584== by 0x92BB03E: ??? > ==13584== by 0x6781AD8: opal_libevent2022_event_base_loop (in > /usr/lib/openmpi/libopen-pal.so.40.0.0) > ==13584== by 0x93100CE: ??? > ==13584== by 0x5E9408B: start_thread (in /usr/lib/libpthread-2.26.so) > ==13584== by 0x61A0E7E: clone (in /usr/lib/libc-2.26.so) > ==13584== > ==13584== LEAK SUMMARY: > ==13584== definitely lost: 372 bytes in 4 blocks > ==13584== indirectly lost: 1,288 bytes in 34 blocks > ==13584== possibly lost: 0 bytes in 0 blocks > ==13584== still reachable: 238 bytes in 6 blocks > ==13584== suppressed: 0 bytes in 0 blocks > ==13584== Reachable blocks (those to which a pointer was found) are not shown. 
> ==13584== To see them, rerun with: --leak-check=full --show-leak-kinds=all > ==13584== > ==13584== For counts of detected and suppressed errors, rerun with: -v > ==13584== ERROR SUMMARY: 44 errors from 6 contexts (suppressed: 0 from 0) > > */ > _______________________________________________ > users mailing list > users@lists.open-mpi.org > https://lists.open-mpi.org/mailman/listinfo/users > _______________________________________________ users mailing list users@lists.open-mpi.org https://lists.open-mpi.org/mailman/listinfo/users