Hi Gilles,

> i think i found the alignment issue and fixed it (commit
> 8c556bbc66c06fb19c6e46c67624bac1d6719b12)
>
> here is attached the patch that fixes the issue.
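For context, the class of bug involved here is the classic SPARC alignment trap: loading a 64-bit value through a pointer that is not 8-byte aligned raises SIGBUS ("invalid address alignment"), while x86 tolerates the same access, which is why the problem only shows up on sparc64. A minimal sketch of the pattern and of the usual memcpy workaround (illustrative only, with made-up names; this is not the actual commit):

/* Purely illustrative -- not the actual Open MPI patch; all names here
 * are made up.  On SPARC a 64-bit load through a pointer that is not
 * 8-byte aligned raises SIGBUS; the same code usually goes unnoticed
 * on x86.  The portable workaround is to copy the bytes with memcpy
 * instead of casting and dereferencing. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    unsigned char buf[16] = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
    unsigned char *p = buf + 1;      /* deliberately misaligned address */
    uint64_t v;

    /* Problematic pattern (SIGBUS on SPARC, silently works on x86): */
    /*     v = *(uint64_t *)p;                                       */

    /* Portable pattern: memcpy copes with any alignment. */
    memcpy(&v, p, sizeof(v));
    printf("value = 0x%016llx\n", (unsigned long long)v);
    return 0;
}
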
Great, the problems are solved with the patch, as far as I can tell today!
Thank you very much for your help. Today I could only test my programs on a
single machine, because you don't support "heterogeneous" environments at
the moment and new program versions are mirrored overnight, so they are only
available on all machines the next day. Unfortunately I have no Internet
access for the next few days, so I can test with more than one machine only
next week. I detected a new Java error, which I will report in a separate
mail.

Are you interested in solving the heterogeneous problem as well? Shall I run
a small program and provide a backtrace?

Kind regards and thank you very much once more

Siegmar


> Cheers,
>
> Gilles
>
> On 2014/10/29 5:24, Siegmar Gross wrote:
> > Hi Gilles,
> >
> >> From the jvm logs, there is an alignment error in native_get_attr,
> >> but i could not find it by reading the source code.
> >>
> >> Could you please do
> >> ulimit -c unlimited
> >> mpiexec ...
> >> and then
> >> gdb <your path to java>/bin/java core
> >> And run bt on all threads until you get a line number in native_get_attr
> >
> > I found pmix_native.c:1131 in native_get_attr, attached gdb to the
> > Java process and set a breakpoint on this line. From there I single
> > stepped until I got SIGSEGV, so that you can see what happened.
> >
> >
> > (gdb) b pmix_native.c:1131
> > No source file named pmix_native.c.
> > Make breakpoint pending on future shared library load? (y or [n]) y
> >
> > Breakpoint 1 (pmix_native.c:1131) pending.
> > (gdb) thread 14
> > [Switching to thread 14 (Thread 2 (LWP 2))]
> > #0  0xffffffff7eadc6b0 in __pollsys () from /lib/sparcv9/libc.so.1
> > (gdb) f 3
> > #3  0xfffffffee5122230 in JNI_OnLoad (vm=0xffffffff7e57e9d8 <main_vm>, reserved=0x0)
> >     at ../../../../../openmpi-dev-178-ga16c1e4/ompi/mpi/java/c/mpi_MPI.c:128
> > 128         while (_dbg) poll(NULL, 0, 1);
> > (gdb) set _dbg=0
> > (gdb) c
> > Continuing.
> > [New LWP 13]
> >
> > Breakpoint 1, native_get_attr (attr=0xfffffffee2e05db0 "pmix.jobid", kv=0xffffffff7b4ff028)
> >     at ../../../../../openmpi-dev-178-ga16c1e4/opal/mca/pmix/native/pmix_native.c:1131
> > 1131        OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
> > (gdb) s
> > opal_proc_local_get () at ../../../openmpi-dev-178-ga16c1e4/opal/util/proc.c:80
> > 80          return opal_proc_my_name;
> > (gdb)
> > 81      }
> > (gdb)
> > _process_name_print_for_opal (procname=14259803799433510912)
> >     at ../../openmpi-dev-178-ga16c1e4/orte/runtime/orte_init.c:64
> > 64          orte_process_name_t* rte_name = (orte_process_name_t*)&procname;
> > (gdb)
> > 65          return ORTE_NAME_PRINT(rte_name);
> > (gdb)
> > orte_util_print_name_args (name=0xffffffff7b4feb90)
> >     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:122
> > 122         if (NULL == name) {
> > (gdb)
> > 142         job = orte_util_print_jobids(name->jobid);
> > (gdb)
> > orte_util_print_jobids (job=3320119297)
> >     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:170
> > 170         ptr = get_print_name_buffer();
> > (gdb)
> > get_print_name_buffer ()
> >     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:92
> > 92          if (!fns_init) {
> > (gdb)
> > 101         ret = opal_tsd_getspecific(print_args_tsd_key, (void**)&ptr);
> > (gdb)
> > opal_tsd_getspecific (key=4, valuep=0xffffffff7b4fe8a0)
> >     at ../../openmpi-dev-178-ga16c1e4/opal/threads/tsd.h:163
> > 163         *valuep = pthread_getspecific(key);
> > (gdb)
> > 164         return OPAL_SUCCESS;
> > (gdb)
> > 165     }
> > (gdb)
> > get_print_name_buffer ()
> >     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:102
> > 102         if (OPAL_SUCCESS != ret) return NULL;
> > (gdb)
> > 104         if (NULL == ptr) {
> > (gdb)
> > 113         return (orte_print_args_buffers_t*) ptr;
> > (gdb)
> > 114     }
> > (gdb)
> > orte_util_print_jobids (job=3320119297)
> >     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:172
> > 172         if (NULL == ptr) {
> > (gdb)
> > 178         if (ORTE_PRINT_NAME_ARG_NUM_BUFS == ptr->cntr) {
> > (gdb)
> > 179             ptr->cntr = 0;
> > (gdb)
> > 182         if (ORTE_JOBID_INVALID == job) {
> > (gdb)
> > 184         } else if (ORTE_JOBID_WILDCARD == job) {
> > (gdb)
> > 187             tmp1 = ORTE_JOB_FAMILY((unsigned long)job);
> > (gdb)
> > 188             tmp2 = ORTE_LOCAL_JOBID((unsigned long)job);
> > (gdb)
> > 189         snprintf(ptr->buffers[ptr->cntr++],
> > (gdb)
> > 193         return ptr->buffers[ptr->cntr-1];
> > (gdb)
> > 194     }
> > (gdb)
> > orte_util_print_name_args (name=0xffffffff7b4feb90)
> >     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:143
> > 143         vpid = orte_util_print_vpids(name->vpid);
> > (gdb)
> > orte_util_print_vpids (vpid=0)
> >     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:260
> > 260         ptr = get_print_name_buffer();
> > (gdb)
> > get_print_name_buffer ()
> >     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:92
> > 92          if (!fns_init) {
> > (gdb)
> > 101         ret = opal_tsd_getspecific(print_args_tsd_key, (void**)&ptr);
> > (gdb)
> > opal_tsd_getspecific (key=4, valuep=0xffffffff7b4fe8b0)
> >     at ../../openmpi-dev-178-ga16c1e4/opal/threads/tsd.h:163
> > 163         *valuep = pthread_getspecific(key);
> > (gdb)
> > 164         return OPAL_SUCCESS;
> > (gdb)
> > 165     }
> > (gdb)
> > get_print_name_buffer ()
> >     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:102
> > 102         if (OPAL_SUCCESS != ret) return NULL;
> > (gdb)
> > 104         if (NULL == ptr) {
> > (gdb)
> > 113         return (orte_print_args_buffers_t*) ptr;
> > (gdb)
> > 114     }
> > (gdb)
> > orte_util_print_vpids (vpid=0)
> >     at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:262
> > 262         if (NULL == ptr) {
> > (gdb)
> 268 if (ORTE_PRINT_NAME_ARG_NUM_BUFS == ptr->cntr) { > > (gdb) > > 272 if (ORTE_VPID_INVALID == vpid) { > > (gdb) > > 274 } else if (ORTE_VPID_WILDCARD == vpid) { > > (gdb) > > 277 snprintf(ptr->buffers[ptr->cntr++], > > (gdb) > > 281 return ptr->buffers[ptr->cntr-1]; > > (gdb) > > 282 } > > (gdb) > > orte_util_print_name_args (name=0xffffffff7b4feb90) > > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:146 > > 146 ptr = get_print_name_buffer(); > > (gdb) > > get_print_name_buffer () > > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:92 > > 92 if (!fns_init) { > > (gdb) > > 101 ret = opal_tsd_getspecific(print_args_tsd_key, (void**)&ptr); > > (gdb) > > opal_tsd_getspecific (key=4, valuep=0xffffffff7b4fe970) > > at ../../openmpi-dev-178-ga16c1e4/opal/threads/tsd.h:163 > > 163 *valuep = pthread_getspecific(key); > > (gdb) > > 164 return OPAL_SUCCESS; > > (gdb) > > 165 } > > (gdb) > > get_print_name_buffer () > > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:102 > > 102 if (OPAL_SUCCESS != ret) return NULL; > > (gdb) > > 104 if (NULL == ptr) { > > (gdb) > > 113 return (orte_print_args_buffers_t*) ptr; > > (gdb) > > 114 } > > (gdb) > > orte_util_print_name_args (name=0xffffffff7b4feb90) > > at ../../openmpi-dev-178-ga16c1e4/orte/util/name_fns.c:148 > > 148 if (NULL == ptr) { > > (gdb) > > 154 if (ORTE_PRINT_NAME_ARG_NUM_BUFS == ptr->cntr) { > > (gdb) > > 158 snprintf(ptr->buffers[ptr->cntr++], > > (gdb) > > 162 return ptr->buffers[ptr->cntr-1]; > > (gdb) > > 163 } > > (gdb) > > _process_name_print_for_opal (procname=14259803799433510912) > > at ../../openmpi-dev-178-ga16c1e4/orte/runtime/orte_init.c:66 > > 66 } > > (gdb) > > > > Program received signal SIGSEGV, Segmentation fault. > > 0xfffffffee3210bfc in native_get_attr (attr=0xfffffffee2e05db0 > > "pmix.jobid", > > kv=0xffffffff7b4ff028) > > at ../../../../../openmpi-dev-178-ga16c1e4/opal/mca/pmix/native/pmix_native.c:1131 > > 1131 OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output, > > (gdb) bt > > #0 0xfffffffee3210bfc in native_get_attr ( > > attr=0xfffffffee2e05db0 "pmix.jobid", kv=0xffffffff7b4ff028) > > at ../../../../../openmpi-dev-178-ga16c1e4/opal/mca/pmix/native/pmix_native.c:1131 > > #1 0xfffffffee2e033e4 in rte_init () > > at ../../../../../openmpi-dev-178-ga16c1e4/orte/mca/ess/pmi/ess_pmi_module.c:170 > > #2 0xfffffffee4a340c0 in orte_init (pargc=0x0, pargv=0x0, flags=32) > > at ../../openmpi-dev-178-ga16c1e4/orte/runtime/orte_init.c:239 > > #3 0xfffffffee4d9a164 in ompi_mpi_init (argc=0, argv=0x1003f5850, > > requested=0, provided=0xffffffff7b4ff44c) > > at ../../openmpi-dev-178-ga16c1e4/ompi/runtime/ompi_mpi_init.c:480 > > #4 0xfffffffee4dfbb30 in PMPI_Init (argc=0xffffffff7b4ff554, > > argv=0xffffffff7b4ff548) at pinit.c:84 > > #5 0xfffffffee5122f6c in Java_mpi_MPI_Init_1jni (env=0x10010e9e0, > > clazz=0xffffffff7b4ff760, argv=0xffffffff7b4ff858) > > at ../../../../../openmpi-dev-178-ga16c1e4/ompi/mpi/java/c/mpi_MPI.c:271 > > #6 0xffffffff6b810738 in ?? () > > #7 0xffffffff6b810738 in ?? () > > Backtrace stopped: previous frame identical to this frame (corrupt stack?) > > (gdb) > > > > > > > > Hopefully the above output is helpful. Please let me know if you > > need something else. > > > > Kind regards > > > > Siegmar > > > > > > > >> Siegmar Gross <siegmar.gr...@informatik.hs-fulda.de> wrote: > >>> Hi, > >>> > >>> today I installed openmpi-dev-178-ga16c1e4 on Solaris 10 Sparc > >>> with gcc-4.9.1 and Java 8. 
> >>> Now a very simple Java program works as expected, but other Java
> >>> programs still break. I removed the warnings about "shmem.jar" and
> >>> used the following configure command.
> >>>
> >>> tyr openmpi-dev-178-ga16c1e4-SunOS.sparc.64_gcc 406 head config.log \
> >>>   | grep openmpi
> >>> $ ../openmpi-dev-178-ga16c1e4/configure
> >>>   --prefix=/usr/local/openmpi-1.9.0_64_gcc
> >>>   --libdir=/usr/local/openmpi-1.9.0_64_gcc/lib64
> >>>   --with-jdk-bindir=/usr/local/jdk1.8.0/bin
> >>>   --with-jdk-headers=/usr/local/jdk1.8.0/include
> >>>   JAVA_HOME=/usr/local/jdk1.8.0
> >>>   LDFLAGS=-m64 CC=gcc CXX=g++ FC=gfortran CFLAGS=-m64 -D_REENTRANT
> >>>   CXXFLAGS=-m64 FCFLAGS=-m64 CPP=cpp CXXCPP=cpp
> >>>   CPPFLAGS= -D_REENTRANT CXXCPPFLAGS=
> >>>   --enable-mpi-cxx --enable-cxx-exceptions --enable-mpi-java
> >>>   --enable-mpi-thread-multiple --with-threads=posix
> >>>   --with-hwloc=internal --without-verbs
> >>>   --with-wrapper-cflags=-std=c11 -m64
> >>>   --with-wrapper-cxxflags=-m64 --enable-debug
> >>>
> >>>
> >>> tyr java 290 ompi_info | grep -e "Open MPI repo revision:" -e "C compiler version:"
> >>>   Open MPI repo revision: dev-178-ga16c1e4
> >>>   C compiler version: 4.9.1
> >>>
> >>>
> >>>>> regarding the BUS error reported by Siegmar, i also commited
> >>>>> 62bde1fcb554079143030bb305512c236672386f
> >>>>> in order to fix it (this is based on code review only, i have no
> >>>>> sparc64 hardware to test it is enough)
> >>>> I'll test it, when a new nightly snapshot is available for the trunk.
> >>>
> >>> tyr java 291 mpijavac InitFinalizeMain.java
> >>> tyr java 292 mpiexec -np 1 java InitFinalizeMain
> >>> Hello!
> >>>
> >>> tyr java 293 mpijavac BcastIntMain.java
> >>> tyr java 294 mpiexec -np 2 java BcastIntMain
> >>> #
> >>> # A fatal error has been detected by the Java Runtime Environment:
> >>> #
> >>> #  SIGBUS (0xa) at pc=0xfffffffee3210bfc, pid=24792, tid=2
> >>> ...
> >>>
> >>>
> >>> tyr java 296 /usr/local/gdb-7.6.1_64_gcc/bin/gdb mpiexec
> >>> ...
> >>> (gdb) run -np 2 java BcastIntMain
> >>> Starting program: /usr/local/openmpi-1.9.0_64_gcc/bin/mpiexec -np 2 java BcastIntMain
> >>> [Thread debugging using libthread_db enabled]
> >>> [New Thread 1 (LWP 1)]
> >>> [New LWP 2]
> >>> #
> >>> # A fatal error has been detected by the Java Runtime Environment:
> >>> #
> >>> #  SIGBUS (0xa) at pc=0xfffffffee3210bfc, pid=24814, tid=2
> >>> #
> >>> # JRE version: Java(TM) SE Runtime Environment (8.0-b132) (build 1.8.0-b132)
> >>> # Java VM: Java HotSpot(TM) 64-Bit Server VM (25.0-b70 mixed mode solaris-sparc compressed oops)
> >>> # Problematic frame:
> >>> # C  [mca_pmix_native.so+0x10bfc]  native_get_attr+0x3000
> >>> #
> >>> # Failed to write core dump. Core dumps have been disabled. To enable
> >>> # core dumping, try "ulimit -c unlimited" before starting Java again
> >>> #
> >>> # An error report file with more information is saved as:
> >>> # /home/fd1026/work/skripte/master/parallel/prog/mpi/java/hs_err_pid24814.log
> >>> #
> >>> # A fatal error has been detected by the Java Runtime Environment:
> >>> #
> >>> #  SIGBUS (0xa) at pc=0xfffffffee3210bfc, pid=24812, tid=2
> >>> #
> >>> # JRE version: Java(TM) SE Runtime Environment (8.0-b132) (build 1.8.0-b132)
> >>> # Java VM: Java HotSpot(TM) 64-Bit Server VM (25.0-b70 mixed mode solaris-sparc compressed oops)
> >>> # Problematic frame:
> >>> # C  [mca_pmix_native.so+0x10bfc]  native_get_attr+0x3000
> >>> #
> >>> # Failed to write core dump. Core dumps have been disabled.
> >>> # To enable core dumping, try "ulimit -c unlimited" before starting Java again
> >>> #
> >>> # An error report file with more information is saved as:
> >>> # /home/fd1026/work/skripte/master/parallel/prog/mpi/java/hs_err_pid24812.log
> >>> #
> >>> # If you would like to submit a bug report, please visit:
> >>> #   http://bugreport.sun.com/bugreport/crash.jsp
> >>> # The crash happened outside the Java Virtual Machine in native code.
> >>> # See problematic frame for where to report the bug.
> >>> #
> >>> [tyr:24814] *** Process received signal ***
> >>> [tyr:24814] Signal: Abort (6)
> >>> [tyr:24814] Signal code:  (-1)
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:opal_backtrace_print+0x2c
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:0xdc2d4
> >>> /lib/sparcv9/libc.so.1:0xd8b98
> >>> /lib/sparcv9/libc.so.1:0xcc70c
> >>> /lib/sparcv9/libc.so.1:0xcc918
> >>> /lib/sparcv9/libc.so.1:0xdd2d0 [ Signal 6 (ABRT)]
> >>> /lib/sparcv9/libc.so.1:_thr_sigsetmask+0x1c4
> >>> /lib/sparcv9/libc.so.1:sigprocmask+0x28
> >>> /lib/sparcv9/libc.so.1:_sigrelse+0x5c
> >>> /lib/sparcv9/libc.so.1:abort+0xc0
> >>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:0xb3cb90
> >>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:0xd97a04
> >>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:JVM_handle_solaris_signal+0xc0c
> >>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:0xb44e84
> >>> /lib/sparcv9/libc.so.1:0xd8b98
> >>> /lib/sparcv9/libc.so.1:0xcc70c
> >>> /lib/sparcv9/libc.so.1:0xcc918
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_pmix_native.so:0x10bfc [ Signal 10 (BUS)]
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_ess_pmi.so:0x33dc
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-rte.so.0.0.0:orte_init+0x67c
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:ompi_mpi_init+0x374
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:PMPI_Init+0x2a8
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi_java.so.0.0.0:Java_mpi_MPI_Init_1jni+0x1a0
> >>> 0xffffffff6b810730
> >>> 0xffffffff6b8106d4
> >>> 0xffffffff6b8078a8
> >>> 0xffffffff6b8078a8
> >>> 0xffffffff6b80024c
> >>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:0x6fd4e8
> >>> /export2/prog/SunOS_sparc/jdk1.8.0/jre/lib/sparcv9/server/libjvm.so:0x79331c
> >>> /export2/prog/SunOS_sparc/jdk1.8.0/lib/sparcv9/jli/libjli.so:0x7290
> >>> /lib/sparcv9/libc.so.1:0xd8a6c
> >>> [tyr:24814] *** End of error message ***
> >>> --------------------------------------------------------------------------
> >>> mpiexec noticed that process rank 1 with PID 0 on node tyr exited on signal 6 (Abort).
> >>> --------------------------------------------------------------------------
> >>> [LWP 2 exited]
> >>> [New Thread 2]
> >>> [Switching to Thread 1 (LWP 1)]
> >>> sol_thread_fetch_registers: td_ta_map_id2thr: no thread can be found to satisfy query
> >>> (gdb) bt
> >>> #0  0xffffffff7f6173d0 in rtld_db_dlactivity () from /usr/lib/sparcv9/ld.so.1
> >>> #1  0xffffffff7f6175a8 in rd_event () from /usr/lib/sparcv9/ld.so.1
> >>> #2  0xffffffff7f618950 in lm_delete () from /usr/lib/sparcv9/ld.so.1
> >>> #3  0xffffffff7f6226bc in remove_so () from /usr/lib/sparcv9/ld.so.1
> >>> #4  0xffffffff7f624574 in remove_hdl () from /usr/lib/sparcv9/ld.so.1
> >>> #5  0xffffffff7f61d97c in dlclose_core () from /usr/lib/sparcv9/ld.so.1
> >>> #6  0xffffffff7f61d9d4 in dlclose_intn () from /usr/lib/sparcv9/ld.so.1
> >>> #7  0xffffffff7f61db0c in dlclose () from /usr/lib/sparcv9/ld.so.1
> >>> #8  0xffffffff7ec87ca0 in vm_close ()
> >>>     from /usr/local/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0
> >>> #9  0xffffffff7ec85274 in lt_dlclose ()
> >>>     from /usr/local/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0
> >>> #10 0xffffffff7ecaa5dc in ri_destructor (obj=0x100187b70)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_component_repository.c:382
> >>> #11 0xffffffff7eca8fd8 in opal_obj_run_destructors (object=0x100187b70)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/class/opal_object.h:446
> >>> #12 0xffffffff7eca9eac in mca_base_component_repository_release (
> >>>     component=0xffffffff7b1236f0 <mca_oob_tcp_component>)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_component_repository.c:240
> >>> #13 0xffffffff7ecac17c in mca_base_component_unload (
> >>>     component=0xffffffff7b1236f0 <mca_oob_tcp_component>, output_id=-1)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:47
> >>> #14 0xffffffff7ecac210 in mca_base_component_close (
> >>>     component=0xffffffff7b1236f0 <mca_oob_tcp_component>, output_id=-1)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:60
> >>> #15 0xffffffff7ecac2e4 in mca_base_components_close (output_id=-1,
> >>>     components=0xffffffff7f14bc58 <orte_oob_base_framework+80>, skip=0x0)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:86
> >>> #16 0xffffffff7ecac24c in mca_base_framework_components_close (
> >>>     framework=0xffffffff7f14bc08 <orte_oob_base_framework>, skip=0x0)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:66
> >>> #17 0xffffffff7efcaf80 in orte_oob_base_close ()
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/orte/mca/oob/base/oob_base_frame.c:112
> >>> #18 0xffffffff7ecc0d74 in mca_base_framework_close (
> >>>     framework=0xffffffff7f14bc08 <orte_oob_base_framework>)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_framework.c:187
> >>> #19 0xffffffff7be07858 in rte_finalize ()
> >>>     at ../../../../../openmpi-dev-178-ga16c1e4/orte/mca/ess/hnp/ess_hnp_module.c:857
> >>> #20 0xffffffff7ef338bc in orte_finalize ()
> >>>     at ../../openmpi-dev-178-ga16c1e4/orte/runtime/orte_finalize.c:66
> >>> #21 0x000000010000723c in orterun (argc=5, argv=0xffffffff7fffe0d8)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/orte/tools/orterun/orterun.c:1103
> >>> #22 0x0000000100003e80 in main (argc=5, argv=0xffffffff7fffe0d8)
> >>> ---Type <return> to continue, or q <return> to quit---
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/orte/tools/orterun/main.c:13
> >>> (gdb)
> >>>
> >>>
> >>> I get the same error for C programs, if they use more than
> >>> MPI_Init and MPI_Finalize.
> >>>
> >>> tyr small_prog 301 mpicc init_finalize.c
> >>> tyr small_prog 302 mpiexec -np 1 a.out
> >>> Hello!
> >>> tyr small_prog 303 mpicc column_int.c
> >>> tyr small_prog 306 /usr/local/gdb-7.6.1_64_gcc/bin/gdb mpiexec
> >>> ...
> >>> (gdb) run -np 4 a.out
> >>> Starting program: /usr/local/openmpi-1.9.0_64_gcc/bin/mpiexec -np 4 a.out
> >>> [Thread debugging using libthread_db enabled]
> >>> [New Thread 1 (LWP 1)]
> >>> [New LWP 2]
> >>> [tyr:24880] *** Process received signal ***
> >>> [tyr:24880] Signal: Bus Error (10)
> >>> [tyr:24880] Signal code: Invalid address alignment (1)
> >>> [tyr:24880] Failing at address: ffffffff7bd1c10c
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:opal_backtrace_print+0x2c
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:0xdc2d4
> >>> /lib/sparcv9/libc.so.1:0xd8b98
> >>> /lib/sparcv9/libc.so.1:0xcc70c
> >>> /lib/sparcv9/libc.so.1:0xcc918
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_pmix_native.so:0x10684 [ Signal 10 (BUS)]
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_ess_pmi.so:0x33dc
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-rte.so.0.0.0:orte_init+0x67c
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:ompi_mpi_init+0x374
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:PMPI_Init+0x2a8
> >>> /home/fd1026/work/skripte/master/parallel/prog/mpi/small_prog/a.out:main+0x20
> >>> /home/fd1026/work/skripte/master/parallel/prog/mpi/small_prog/a.out:_start+0x7c
> >>> [tyr:24880] *** End of error message ***
> >>> [tyr:24876] *** Process received signal ***
> >>> [tyr:24876] Signal: Bus Error (10)
> >>> [tyr:24876] Signal code: Invalid address alignment (1)
> >>> [tyr:24876] Failing at address: ffffffff7bd1c10c
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:opal_backtrace_print+0x2c
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0.0.0:0xdc2d4
> >>> /lib/sparcv9/libc.so.1:0xd8b98
> >>> /lib/sparcv9/libc.so.1:0xcc70c
> >>> /lib/sparcv9/libc.so.1:0xcc918
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_pmix_native.so:0x10684 [ Signal 10 (BUS)]
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/openmpi/mca_ess_pmi.so:0x33dc
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libopen-rte.so.0.0.0:orte_init+0x67c
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:ompi_mpi_init+0x374
> >>> /export2/prog/SunOS_sparc/openmpi-1.9.0_64_gcc/lib64/libmpi.so.0.0.0:PMPI_Init+0x2a8
> >>> /home/fd1026/work/skripte/master/parallel/prog/mpi/small_prog/a.out:main+0x20
> >>> /home/fd1026/work/skripte/master/parallel/prog/mpi/small_prog/a.out:_start+0x7c
> >>> [tyr:24876] *** End of error message ***
> >>> --------------------------------------------------------------------------
> >>> mpiexec noticed that process rank 2 with PID 0 on node tyr exited on signal 10 (Bus Error).
> >>> --------------------------------------------------------------------------
> >>> [LWP 2 exited]
> >>> [New Thread 2]
> >>> [Switching to Thread 1 (LWP 1)]
> >>> sol_thread_fetch_registers: td_ta_map_id2thr: no thread can be found to satisfy query
> >>> (gdb) bt
> >>> #0  0xffffffff7f6173d0 in rtld_db_dlactivity () from /usr/lib/sparcv9/ld.so.1
> >>> #1  0xffffffff7f6175a8 in rd_event () from /usr/lib/sparcv9/ld.so.1
> >>> #2  0xffffffff7f618950 in lm_delete () from /usr/lib/sparcv9/ld.so.1
> >>> #3  0xffffffff7f6226bc in remove_so () from /usr/lib/sparcv9/ld.so.1
> >>> #4  0xffffffff7f624574 in remove_hdl () from /usr/lib/sparcv9/ld.so.1
> >>> #5  0xffffffff7f61d97c in dlclose_core () from /usr/lib/sparcv9/ld.so.1
> >>> #6  0xffffffff7f61d9d4 in dlclose_intn () from /usr/lib/sparcv9/ld.so.1
> >>> #7  0xffffffff7f61db0c in dlclose () from /usr/lib/sparcv9/ld.so.1
> >>> #8  0xffffffff7ec87ca0 in vm_close ()
> >>>     from /usr/local/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0
> >>> #9  0xffffffff7ec85274 in lt_dlclose ()
> >>>     from /usr/local/openmpi-1.9.0_64_gcc/lib64/libopen-pal.so.0
> >>> #10 0xffffffff7ecaa5dc in ri_destructor (obj=0x100187ae0)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_component_repository.c:382
> >>> #11 0xffffffff7eca8fd8 in opal_obj_run_destructors (object=0x100187ae0)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/class/opal_object.h:446
> >>> #12 0xffffffff7eca9eac in mca_base_component_repository_release (
> >>>     component=0xffffffff7b0236f0 <mca_oob_tcp_component>)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_component_repository.c:240
> >>> #13 0xffffffff7ecac17c in mca_base_component_unload (
> >>>     component=0xffffffff7b0236f0 <mca_oob_tcp_component>, output_id=-1)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:47
> >>> #14 0xffffffff7ecac210 in mca_base_component_close (
> >>>     component=0xffffffff7b0236f0 <mca_oob_tcp_component>, output_id=-1)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:60
> >>> #15 0xffffffff7ecac2e4 in mca_base_components_close (output_id=-1,
> >>>     components=0xffffffff7f14bc58 <orte_oob_base_framework+80>, skip=0x0)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:86
> >>> #16 0xffffffff7ecac24c in mca_base_framework_components_close (
> >>>     framework=0xffffffff7f14bc08 <orte_oob_base_framework>, skip=0x0)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_components_close.c:66
> >>> #17 0xffffffff7efcaf80 in orte_oob_base_close ()
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/orte/mca/oob/base/oob_base_frame.c:112
> >>> #18 0xffffffff7ecc0d74 in mca_base_framework_close (
> >>>     framework=0xffffffff7f14bc08 <orte_oob_base_framework>)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/opal/mca/base/mca_base_framework.c:187
> >>> #19 0xffffffff7bd07858 in rte_finalize ()
> >>>     at ../../../../../openmpi-dev-178-ga16c1e4/orte/mca/ess/hnp/ess_hnp_module.c:857
> >>> #20 0xffffffff7ef338bc in orte_finalize ()
> >>>     at ../../openmpi-dev-178-ga16c1e4/orte/runtime/orte_finalize.c:66
> >>> #21 0x000000010000723c in orterun (argc=4, argv=0xffffffff7fffe0e8)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/orte/tools/orterun/orterun.c:1103
> >>> #22 0x0000000100003e80 in main (argc=4, argv=0xffffffff7fffe0e8)
> >>>     at ../../../../openmpi-dev-178-ga16c1e4/orte/tools/orterun/main.c:13
> >>> (gdb)
> >>>
> >>>
> >>> Do you need any other information?
> >>>
> >>>
> >>> Kind regards
> >>>
> >>> Siegmar
> >
> > _______________________________________________
> > users mailing list
> > us...@open-mpi.org
> > Subscription: http://www.open-mpi.org/mailman/listinfo.cgi/users
> > Link to this post: http://www.open-mpi.org/community/lists/users/2014/10/25635.php