Here’s what happens using a debug build: [raijin7:22225] ompi_comm_peer_lookup: invalid peer index (2) [raijin7:22225:0:22225] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x8)
/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/mca/pml/ob1/../../../../../../../ompi/mca/pml/ob1/pml_ob1_comm.h: [ mca_pml_ob1_peer_lookup() ] ... 75 mca_pml_ob1_comm_proc_t* proc = OBJ_NEW(mca_pml_ob1_comm_proc_t); 76 proc->ompi_proc = ompi_comm_peer_lookup (comm, rank); 77 OBJ_RETAIN(proc->ompi_proc); ==> 78 opal_atomic_wmb (); 79 pml_comm->procs[rank] = proc; 80 } 81 OPAL_THREAD_UNLOCK(&pml_comm->proc_lock); ==== backtrace ==== 0 0x0000000000017505 mca_pml_ob1_peer_lookup() /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/mca/pml/ob1/../../../../../../../ompi/mca/pml/ob1/pml_ob1_comm.h:78 1 0x0000000000019119 mca_pml_ob1_recv_frag_callback_match() /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/mca/pml/ob1/../../../../../../../ompi/mca/pml/ob1/pml_ob1_recvfrag.c:361 2 0x00000000000052d7 mca_btl_vader_check_fboxes() /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/opal/mca/btl/vader/../../../../../../../opal/mca/btl/vader/btl_vader_fbox.h:208 3 0x00000000000077fd mca_btl_vader_component_progress() /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/opal/mca/btl/vader/../../../../../../../opal/mca/btl/vader/btl_vader_component.c:689 4 0x000000000002ff90 opal_progress() /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/opal/../../../../opal/runtime/opal_progress.c:228 5 0x000000000003b168 ompi_sync_wait_mt() /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/opal/../../../../opal/threads/wait_sync.c:85 6 0x000000000005cd64 ompi_request_wait_completion() /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/../../../../ompi/request/request.h:403 7 0x000000000005ce28 ompi_request_default_wait() /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/../../../../ompi/request/req_wait.c:42 8 0x00000000001142d9 ompi_coll_base_sendrecv_zero() 
/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/mca/coll/../../../../../../ompi/mca/coll/base/coll_base_barrier.c:64 9 0x0000000000114763 ompi_coll_base_barrier_intra_recursivedoubling() /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/mca/coll/../../../../../../ompi/mca/coll/base/coll_base_barrier.c:215 10 0x0000000000004cad ompi_coll_tuned_barrier_intra_dec_fixed() /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/mca/coll/tuned/../../../../../../../ompi/mca/coll/tuned/coll_tuned_decision_fixed.c:212 11 0x00000000000831ac PMPI_Barrier() /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/mpi/c/profile/pbarrier.c:63 12 0x0000000000044041 ompi_barrier_f() /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/intel/debug-1/ompi/mpi/fortran/mpif-h/profile/pbarrier_f.c:76 13 0x00000000005c79de comms_barrier() /short/z00/aab900/onetep/src/comms_mod.F90:1543 14 0x00000000005c79de comms_bcast_logical_0() /short/z00/aab900/onetep/src/comms_mod.F90:10756 15 0x0000000001c21509 utils_devel_code_logical() /short/z00/aab900/onetep/src/utils_mod.F90:2646 16 0x0000000001309ddb multigrid_bc_for_dlmg() /short/z00/aab900/onetep/src/multigrid_methods_mod.F90:260 17 0x0000000001309ddb multigrid_initialise() /short/z00/aab900/onetep/src/multigrid_methods_mod.F90:174 18 0x0000000000f0c885 hartree_via_multigrid() /short/z00/aab900/onetep/src/hartree_mod.F90:181 19 0x0000000000a0c62a electronic_init_pot() /short/z00/aab900/onetep/src/electronic_init_mod.F90:1123 20 0x0000000000a14d62 electronic_init_denskern() /short/z00/aab900/onetep/src/electronic_init_mod.F90:334 21 0x0000000000a50136 energy_and_force_calculate() /short/z00/aab900/onetep/src/energy_and_force_mod.F90:1702 22 0x00000000014f46e7 onetep() /short/z00/aab900/onetep/src/onetep.F90:277 23 0x000000000041465e main() ???:0 24 0x000000000001ed1d __libc_start_main() ???:0 25 0x0000000000414569 _start() ???:0 
=================== > On 12 Jul 2018, at 1:36 pm, Ben Menadue <ben.mena...@nci.org.au> wrote: > > Hi, > > Perhaps related — we’re seeing this one with 3.1.1. I’ll see if I can get the > application run against our --enable-debug build. > > Cheers, > Ben > > [raijin7:1943 :0:1943] Caught signal 11 (Segmentation fault: address not > mapped to object at address 0x45) > > /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mca/pml/ob1/../../../../../../../ompi/mca/pml/ob1/pml_ob1_recvfrag.c: > [ append_frag_to_ordered_list() ] > ... > 118 * account for this rollover or the matching will fail. > 119 * Extract the items from the list to order them safely */ > 120 if( hdr->hdr_seq < prior->hdr.hdr_match.hdr_seq ) { > ==> 121 uint16_t d1, d2 = prior->hdr.hdr_match.hdr_seq - > hdr->hdr_seq; > 122 do { > 123 d1 = d2; > 124 prior = > (mca_pml_ob1_recv_frag_t*)(prior->super.super.opal_list_prev); > > ==== backtrace ==== > 0 0x0000000000012d5f append_frag_to_ordered_list() > /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mca/pml/ob1/../../../../../../../ompi/mca/pml/ob1/pml_ob1_recvfrag.c:121 > 1 0x0000000000013a06 mca_pml_ob1_recv_frag_callback_match() > /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mca/pml/ob1/../../../../../../../ompi/mca/pml/ob1/pml_ob1_recvfrag.c:390 > 2 0x00000000000044ef mca_btl_vader_check_fboxes() > /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/opal/mca/btl/vader/../../../../../../../opal/mca/btl/vader/btl_vader_fbox.h:208 > 3 0x000000000000602f mca_btl_vader_component_progress() > /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/opal/mca/btl/vader/../../../../../../../opal/mca/btl/vader/btl_vader_component.c:689 > 4 0x000000000002b554 opal_progress() > /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/opal/../../../../opal/runtime/opal_progress.c:228 > 5 0x00000000000331cc 
ompi_sync_wait_mt() > /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/opal/../../../../opal/threads/wait_sync.c:85 > 6 0x000000000004a989 ompi_request_wait_completion() > /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/../../../../ompi/request/request.h:403 > 7 0x000000000004aa1d ompi_request_default_wait() > /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/../../../../ompi/request/req_wait.c:42 > 8 0x00000000000d3486 ompi_coll_base_sendrecv_actual() > /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mca/coll/../../../../../../ompi/mca/coll/base/coll_base_util.c:59 > 9 0x00000000000d0d2b ompi_coll_base_sendrecv() > /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mca/coll/../../../../../../ompi/mca/coll/base/coll_base_util.h:67 > 10 0x00000000000d14c7 ompi_coll_base_allgather_intra_recursivedoubling() > /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mca/coll/../../../../../../ompi/mca/coll/base/coll_base_allgather.c:329 > 11 0x00000000000056dc ompi_coll_tuned_allgather_intra_dec_fixed() > /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mca/coll/tuned/../../../../../../../ompi/mca/coll/tuned/coll_tuned_decision_fixed.c:551 > 12 0x000000000006185d PMPI_Allgather() > /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mpi/c/profile/pallgather.c:122 > 13 0x000000000004362c ompi_allgather_f() > /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/intel/debug-0/ompi/mpi/fortran/mpif-h/profile/pallgather_f.c:86 > 14 0x00000000005ed3cb comms_allgather_integer_0() > /short/z00/aab900/onetep/src/comms_mod.F90:14795 > 15 0x0000000001309fe1 multigrid_bc_for_dlmg() > /short/z00/aab900/onetep/src/multigrid_methods_mod.F90:270 > 16 0x0000000001309fe1 multigrid_initialise() > /short/z00/aab900/onetep/src/multigrid_methods_mod.F90:174 > 17 
0x0000000000f0c885 hartree_via_multigrid() > /short/z00/aab900/onetep/src/hartree_mod.F90:181 > 18 0x0000000000a0c62a electronic_init_pot() > /short/z00/aab900/onetep/src/electronic_init_mod.F90:1123 > 19 0x0000000000a14d62 electronic_init_denskern() > /short/z00/aab900/onetep/src/electronic_init_mod.F90:334 > 20 0x0000000000a50136 energy_and_force_calculate() > /short/z00/aab900/onetep/src/energy_and_force_mod.F90:1702 > 21 0x00000000014f46e7 onetep() /short/z00/aab900/onetep/src/onetep.F90:277 > 22 0x000000000041465e main() ???:0 > 23 0x000000000001ed1d __libc_start_main() ???:0 > 24 0x0000000000414569 _start() ???:0 > =================== > ------------------------------------------------------- > Primary job terminated normally, but 1 process returned > a non-zero exit code. Per user-direction, the job has been aborted. > ------------------------------------------------------- > forrtl: error (78): process killed (SIGTERM) > Image PC Routine Line Source > > onetep.nci 0000000001DCC6DE Unknown Unknown Unknown > libpthread-2.12.s 00002B6D46ED07E0 Unknown Unknown Unknown > libmlx4-rdmav2.so 00002B6D570E3B18 Unknown Unknown Unknown > -------------------------------------------------------------------------- > mpirun noticed that process rank 0 with PID 0 on node raijin7 exited on > signal 11 (Segmentation fault). > -------------------------------------------------------------------------- > > > > >> On 12 Jul 2018, at 8:16 am, Nathan Hjelm via users <users@lists.open-mpi.org >> <mailto:users@lists.open-mpi.org>> wrote: >> >> Might be also worth testing a master snapshot and see if that fixes the >> issue. There are a couple of fixes being backported from master to v3.0.x >> and v3.1.x now. 
>> >> -Nathan >> >> On Jul 11, 2018, at 03:16 PM, Noam Bernstein <noam.bernst...@nrl.navy.mil >> <mailto:noam.bernst...@nrl.navy.mil>> wrote: >> >>>> On Jul 11, 2018, at 11:29 AM, Jeff Squyres (jsquyres) via users >>>> <users@lists.open-mpi.org <mailto:users@lists.open-mpi.org>> wrote: >>>> Ok, that would be great -- thanks. >>>> >>>> Recompiling Open MPI with --enable-debug will turn on several >>>> debugging/sanity checks inside Open MPI, and it will also enable debugging >>>> symbols. Hence, if you can get a failure with a debug Open MPI build, it >>>> might give you a core file that can be used to get a more detailed stack >>>> trace, poke around and see if there's a NULL pointer somewhere, …etc. >>> >>> I haven’t tried to get a core file yet, but it’s not producing any more >>> info from the runtime stack trace, despite configure with --enable-debug: >>> >>> Image PC Routine Line Source >>> vasp.gamma_para.i 0000000002DCE8C1 Unknown Unknown Unknown >>> vasp.gamma_para.i 0000000002DCC9FB Unknown Unknown Unknown >>> vasp.gamma_para.i 0000000002D409E4 Unknown Unknown Unknown >>> vasp.gamma_para.i 0000000002D407F6 Unknown Unknown Unknown >>> vasp.gamma_para.i 0000000002CDCED9 Unknown Unknown Unknown >>> vasp.gamma_para.i 0000000002CE3DB6 Unknown Unknown Unknown >>> libpthread-2.12.s 0000003F8E60F7E0 Unknown Unknown Unknown >>> mca_btl_vader.so 00002B1AFA5FAC30 Unknown Unknown Unknown >>> mca_btl_vader.so 00002B1AFA5FD00D Unknown Unknown Unknown >>> libopen-pal.so.40 00002B1AE884327C opal_progress Unknown Unknown >>> mca_pml_ob1.so 00002B1AFB855DCE Unknown Unknown Unknown >>> mca_pml_ob1.so 00002B1AFB858305 mca_pml_ob1_send Unknown Unknown >>> libmpi.so.40.10.1 00002B1AE823A5DA ompi_coll_base_al Unknown Unknown >>> mca_coll_tuned.so 00002B1AFC6F0842 ompi_coll_tuned_a Unknown Unknown >>> libmpi.so.40.10.1 00002B1AE81B66F5 PMPI_Allreduce Unknown Unknown >>> libmpi_mpifh.so.4 00002B1AE7F2259B mpi_allreduce_ Unknown Unknown >>> vasp.gamma_para.i 000000000042D1ED 
m_sum_d_ 1300 mpi.F >>> vasp.gamma_para.i 000000000089947D nonl_mp_vnlacc_.R 1754 nonl.F >>> vasp.gamma_para.i 0000000000972C51 hamil_mp_hamiltmu 825 hamil.F >>> vasp.gamma_para.i 0000000001BD2608 david_mp_eddav_.R 419 >>> davidson.F >>> vasp.gamma_para.i 0000000001D2179E elmin_.R 424 >>> electron.F >>> vasp.gamma_para.i 0000000002B92452 vamp_IP_electroni 4783 main.F >>> vasp.gamma_para.i 0000000002B6E173 MAIN__ 2800 main.F >>> vasp.gamma_para.i 000000000041325E Unknown Unknown Unknown >>> libc-2.12.so 0000003F8E21ED1D __libc_start_main Unknown Unknown >>> vasp.gamma_para.i 0000000000413169 Unknown Unknown Unknown >>> >>> This is the configure line that was supposedly used to create the library: >>> ./configure >>> --prefix=/usr/local/openmpi/3.1.1_debug/x86_64/ib/intel/11.1.080 >>> --with-tm=/usr/local/torque --enable-mpirun-prefix-by-default >>> --with-verbs=/usr --with-verbs-libdir=/usr/lib64 --enable-debug >>> >>> Is there any way I can confirm that the version of the openmpi library I >>> think I’m using really was compiled with debugging? >>> >>> Noam >>> >>> >>> ____________ >>> >>> | >>> | >>> >>> |U.S. NAVAL| >>> >>> |_RESEARCH_| >>> >>> >>> LABORATORY >>> >>> >>> Noam Bernstein, Ph.D. >>> Center for Materials Physics and Technology >>> U.S. Naval Research Laboratory >>> T +1 202 404 8628 F +1 202 404 7546 >>> https://www.nrl.navy.mil <https://www.nrl.navy.mil/> >>> _______________________________________________ >>> users mailing list >>> users@lists.open-mpi.org <mailto:users@lists.open-mpi.org> >>> https://lists.open-mpi.org/mailman/listinfo/users >>> <https://lists.open-mpi.org/mailman/listinfo/users>_______________________________________________ >> users mailing list >> users@lists.open-mpi.org <mailto:users@lists.open-mpi.org> >> https://lists.open-mpi.org/mailman/listinfo/users > > _______________________________________________ > users mailing list > users@lists.open-mpi.org > https://lists.open-mpi.org/mailman/listinfo/users
_______________________________________________ users mailing list users@lists.open-mpi.org https://lists.open-mpi.org/mailman/listinfo/users