This works:

$ mpirun -x UCX_TLS=ib,rc -x UCX_NET_DEVICES=mlx4_0:1 -x MXM_RDMA_PORTS=mlx4_0:1 ./openmpi_hello_world_barrier

for both openmpi5-gnu14/ucx1.18.0 and openmpi5-intel/ucx1.18.0.
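
For reference, the same settings in a Slurm batch script would look roughly like this (an untested sketch; the module names are guesses based on this OpenHPC install, and the -x flags forward each variable to the remote ranks):

#!/bin/bash
#SBATCH -n 6 -N 3
module load gnu14 openmpi5 ucx   # assumed OpenHPC module names
mpirun -x UCX_TLS=ib,rc \
       -x UCX_NET_DEVICES=mlx4_0:1 \
       -x MXM_RDMA_PORTS=mlx4_0:1 \
       ./openmpi_hello_world_barrier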

Thanks for guiding me to dig into it.
Achilles.

On Wednesday, July 2, 2025 at 10:10:25 PM UTC-4 Achilles Vassilicos wrote:

> A lot to chew on. Thanks.
>
> Achilles
> Sent from my iPhone
>
> On Jul 2, 2025, at 5:40 PM, George Bosilca <bos...@icl.utk.edu> wrote:
>
> 
>
> OMPI 5.x has no support for the openib BTL; all IB traffic now goes
> through the UCX PML. This means that `-mca btl_openib_if_include XXX` is
> meaningless, but you can use UCX_NET_DEVICES to direct UCX to a specific
> device.
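>
> For example, something like this (device name taken from your mpirun
> command; untested):
>
>   mpirun -x UCX_NET_DEVICES=mlx4_0:1 ./mpi_hello_world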
>
> As the error happens for UD, you can switch to a different transport with
> the environment variable `UCX_TLS=dc_x,self,sm`. You could also build UCX
> in debug mode and set UCX_LOG_LEVEL to something very verbose (such as
> DEBUG or DIAG) to get more info about the cause of the failure.
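>
> Something along these lines should do it (a sketch; --enable-debug is
> the stock UCX configure flag):
>
>   ./configure --enable-debug && make && make install
>   mpirun -x UCX_TLS=dc_x,self,sm -x UCX_LOG_LEVEL=debug ./mpi_hello_world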
>
>   George.
>
>
> On Wed, Jul 2, 2025 at 3:40 PM Achilles Vassilicos <avas...@gmail.com> 
> wrote:
>
>> UCX1.18.0
>>
>> On Wednesday, July 2, 2025 at 3:38:57 PM UTC-4 Achilles Vassilicos wrote:
>>
>>> I checked further using a modification of mpi_hello_world.c (one that
>>> includes an MPI_Barrier) and a test code that checks connectivity
>>> between all processes.
>>> 1. In the mpi_hello_world_barrier.c case, openmpi5 failed the same way
>>> as before; mpich-ofi completed without error.
>>> 2. In the connectivity_c.c case, openmpi5 failed with the same error and
>>> did not pass connectivity; mpich-ofi completed and passed connectivity
>>> (see below).
>>>
>>> So it boils down to openmpi/ucx being unable to communicate between
>>> processes in my network setup?
>>>
>>>
>>> -------------------------------------------------------------------------------------
>>> [av@sms test]$ cat mpi_hello_world_barrier.c
>>>
>>> #include <mpi.h>
>>> #include <stdio.h>
>>>
>>> int main(int argc, char** argv) {
>>>     // Initialize the MPI environment
>>>     MPI_Init(NULL, NULL);
>>>
>>>     // Get the number of processes
>>>     int world_size;
>>>     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
>>>
>>>     // Get the rank of the process
>>>     int world_rank;
>>>     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
>>>
>>>     // Get the name of the processor
>>>     char processor_name[MPI_MAX_PROCESSOR_NAME];
>>>     int name_len;
>>>     MPI_Get_processor_name(processor_name, &name_len);
>>>
>>>     // Print off a hello world message
>>>     int i;
>>>     for(i=0; i<world_size; i++){
>>>
>>>         printf("Hello world from processor %s, rank %d out of %d processors\n",
>>>                processor_name, world_rank, world_size);
>>>         MPI_Barrier(MPI_COMM_WORLD);
>>>
>>>     }
>>>
>>>     // Finalize the MPI environment.
>>>     MPI_Finalize();
>>> }
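>>>
>>> Compiled and run the same way as the other tests, e.g.:
>>>
>>>   mpicc -o openmpi_hello_world_barrier mpi_hello_world_barrier.c
>>>   mpirun ./openmpi_hello_world_barrier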
>>>
>>>
>>> -------------------------------------------------------------------------------------
>>> [av@c11 ompi]$ cat connectivity_c.c
>>> /*
>>>  * Copyright (c) 2007      Sun Microsystems, Inc.  All rights reserved.
>>>  *
>>>  * Test the connectivity between all processes.
>>>  */
>>> #include <errno.h>
>>> #include <stdio.h>
>>> #include <stdlib.h>
>>> #include <string.h>
>>> #include <netdb.h>
>>> #include <unistd.h>
>>> #include <mpi.h>
>>> int
>>> main(int argc, char **argv)
>>> {
>>>     MPI_Status  status;
>>>     int         verbose = 0;
>>>     int         rank;
>>>     int         np;        /* number of processes in job */
>>>     int         peer;
>>>     int         i;
>>>     int         j;
>>>     int         length;
>>>     char        name[MPI_MAX_PROCESSOR_NAME+1];
>>>     MPI_Init(&argc, &argv);
>>>     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
>>>     MPI_Comm_size(MPI_COMM_WORLD, &np);
>>>     /*
>>>      * If we cannot get the name for whatever reason, just
>>>      * set it to unknown. */
>>>     if (MPI_SUCCESS != MPI_Get_processor_name(name, &length)) {
>>>         strcpy(name, "unknown");
>>>     }
>>>     if (argc>1 && strcmp(argv[1], "-v")==0)
>>>         verbose = 1;
>>>
>>>     for (i=0; i<np; i++) {
>>>         if (rank==i) {
>>>             /* rank i sends to and receives from each higher rank */
>>>             for(j=i+1; j<np; j++) {
>>>                 if (verbose)
>>>                     printf("checking connection between rank %d on %s and rank %-4d\n",
>>>                            i, name, j);
>>>                 MPI_Send(&rank, 1, MPI_INT, j, rank, MPI_COMM_WORLD);
>>>                 MPI_Recv(&peer, 1, MPI_INT, j, j, MPI_COMM_WORLD, &status);
>>>             }
>>>         } else if (rank>i) {
>>>             /* receive from and reply to rank i */
>>>             MPI_Recv(&peer, 1, MPI_INT, i, i, MPI_COMM_WORLD, &status);
>>>             MPI_Send(&rank, 1, MPI_INT, i, rank, MPI_COMM_WORLD);
>>>         }
>>>     }
>>>     MPI_Barrier(MPI_COMM_WORLD);
>>>     if (rank==0)
>>>         printf("Connectivity test on %d processes PASSED.\n", np);
>>>     MPI_Finalize();
>>>     return 0;
>>> }
>>> ------------------------------------------------------------
>>> [av@sms ompi]$ mpicc -o openmpi5-connectivity_c connectivity_c.c
>>> [av@sms ompi]$ which mpicc
>>> /opt/ohpc/pub/mpi/openmpi5-gnu14/5.0.7/bin/mpicc
>>> [av@sms ompi]$ salloc -n 6 -N 3
>>> salloc: Granted job allocation 72
>>> salloc: Nodes c[11-13] are ready for job
>>> [av@c11 ompi]$ mpirun openmpi5-connectivity_c
>>> [c11:1928 :0:1928]       ud_ep.c:278  Fatal: UD endpoint 0x12e1c70 to <no debug data>: unhandled timeout error
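>>>
>>> (Side note: connectivity_c.c also accepts a -v flag to print each pair
>>> as it is checked, e.g. `mpirun ./openmpi5-connectivity_c -v`, which can
>>> show how far the test gets before the timeout.)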
>>>
>>> ------------------------------------------------------------
>>> [av@sms ompi]$ mpicc -o mpich-ofi-connectivity_c connectivity_c.c
>>> [av@sms ompi]$ salloc -n 6 -N 3
>>> salloc: Granted job allocation 71
>>> salloc: Nodes c[11-13] are ready for job
>>> [av@c11 ompi]$ mpirun ./mpich-ofi-connectivity_c
>>> Connectivity test on 6 processes PASSED.
>>> ------------------------------------------------------------
>>>
>>> Achilles.
>>> On Wednesday, July 2, 2025 at 4:01:30 AM UTC-4 George Bosilca wrote:
>>>
>>>> UCX 1.8 or UCX 1.18?
>>>>
>>>> Your application does not exchange any data, so it is possible that
>>>> MPICH's behavior differs from OMPI's (i.e., not creating connections vs.
>>>> creating them during MPI_Init). That's why running a slightly different
>>>> version of the hello_world with a barrier would clarify the connections'
>>>> status.
>>>>
>>>>   George.
>>>>
>>>>
>>>> On Tue, Jul 1, 2025 at 10:30 PM Achilles Vassilicos <avas...@gmail.com> 
>>>> wrote:
>>>>
>>>>> When I use openmpi5, I get the same behavior even with a very small
>>>>> number of processes per node. However, when I use mpich-ofi it runs
>>>>> fine (see below). That gives me confidence that the network is set up
>>>>> correctly.
>>>>> The nodes are connected via InfiniBand ConnectX-3 adapters, and all ib
>>>>> tests show no problems.
>>>>> I found an older post about ucx1.18 having possible issues with
>>>>> openmpi5. I had assumed that ucx1.18 is now fully compatible with
>>>>> openmpi5. Could this be the cause? Does anyone use ucx1.8 with
>>>>> openmpi5? If not ucx1.18, what version is confirmed to work with
>>>>> openmpi5?
>>>>>
>>>>> My test code:
>>>>> ----------------------------------------------------------------------
>>>>> [av@c12 test]$ cat mpi_hello_world.c
>>>>> #include <mpi.h>
>>>>> #include <stdio.h>
>>>>>
>>>>> int main(int argc, char** argv) {
>>>>>     // Initialize the MPI environment
>>>>>     MPI_Init(NULL, NULL);
>>>>>
>>>>>     // Get the number of processes
>>>>>     int world_size;
>>>>>     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
>>>>>
>>>>>     // Get the rank of the process
>>>>>     int world_rank;
>>>>>     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
>>>>>
>>>>>     // Get the name of the processor
>>>>>     char processor_name[MPI_MAX_PROCESSOR_NAME];
>>>>>     int name_len;
>>>>>     MPI_Get_processor_name(processor_name, &name_len);
>>>>>
>>>>>     // Print off a hello world message
>>>>>     printf("Hello world from processor %s, rank %d out of %d processors\n",
>>>>>            processor_name, world_rank, world_size);
>>>>>
>>>>>     // Finalize the MPI environment.
>>>>>     MPI_Finalize();
>>>>> }
>>>>>
>>>>> -------------------------------------------------------------------------
>>>>> [av@c12 test]$ which mpirun
>>>>> /opt/ohpc/pub/mpi/openmpi5-gnu14/5.0.7/bin/mpirun
>>>>> [av@sms test]$ mpicc -o openmpi5_hello_world mpi_hello_world.c
>>>>> [av@sms test]$ salloc -n 4 -N 2
>>>>> salloc: Granted job allocation 63
>>>>> salloc: Nodes c[12-13] are ready for job
>>>>> [av@c12 test]$ mpirun  ./openmpi5_hello_world
>>>>> Hello world from processor c12, rank 0 out of 4 processors
>>>>> Hello world from processor c12, rank 1 out of 4 processors
>>>>> Hello world from processor c13, rank 3 out of 4 processors
>>>>> Hello world from processor c13, rank 2 out of 4 processors
>>>>> [c12:1709 :0:1709]       ud_ep.c:278  Fatal: UD endpoint 0x117ae80 to <no debug data>: unhandled timeout error
>>>>> ==== backtrace (tid:   1709) ====
>>>>>  0  /opt/ohpc/pub/mpi/ucx-ohpc/1.18.0/lib/libucs.so.0(ucs_handle_error+0x294) [0x7f200b4f3ee4]
>>>>> ................
>>>>> -----------------------------------------------------------------------
>>>>> [av@sms test]$ which mpicc
>>>>> /opt/ohpc/pub/mpi/mpich-ofi-gnu14-ohpc/3.4.3/bin/mpicc
>>>>> [av@sms test]$ which mpirun
>>>>> /opt/ohpc/pub/mpi/mpich-ofi-gnu14-ohpc/3.4.3/bin/mpirun
>>>>> [av@sms test]$ mpicc -o mpich-ofi_hello_world mpi_hello_world.c
>>>>> [av@sms test]$ salloc -n 4 -N 2
>>>>> salloc: Granted job allocation 66
>>>>> salloc: Nodes c[12-13] are ready for job
>>>>> [av@c12 test]$ mpirun ./mpich-ofi_hello_world
>>>>> Hello world from processor c13, rank 2 out of 4 processors
>>>>> Hello world from processor c13, rank 3 out of 4 processors
>>>>> Hello world from processor c12, rank 0 out of 4 processors
>>>>> Hello world from processor c12, rank 1 out of 4 processors
>>>>> [av@c12 test]$
>>>>>
>>>>> ------------------------------------------------------------------------
>>>>> Achilles
>>>>> On Tuesday, July 1, 2025 at 7:14:06 AM UTC-4 George Bosilca wrote:
>>>>>
>>>>>> This error message is usually due to a misconfiguration of the
>>>>>> network. However, I don't think this is the case here, because the
>>>>>> output contains messages from both odd and even ranks (which,
>>>>>> according to your binding policy, were placed on different nodes),
>>>>>> suggesting at least some of the processes were able to connect (and
>>>>>> thus the network configuration is correct).
>>>>>>
>>>>>> So I'm thinking about some timing issue during network setup, due to
>>>>>> the fact that you have many processes per node and an application that
>>>>>> does nothing except create and then shut down the network layer. Does
>>>>>> this happen if you have fewer processes per node? Does it happen if
>>>>>> you add anything else in the application (such as an
>>>>>> `MPI_Barrier(MPI_COMM_WORLD)`)?
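>>>>>>
>>>>>> For example, a one-line addition to your hello_world (just a sketch):
>>>>>>
>>>>>>     /* after the printf, before MPI_Finalize: */
>>>>>>     MPI_Barrier(MPI_COMM_WORLD);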
>>>>>>
>>>>>>    George.
>>>>>>
>>>>>>
>>>>>> On Mon, Jun 30, 2025 at 10:00 PM Achilles Vassilicos <
>>>>>> avas...@gmail.com> wrote:
>>>>>>
>>>>>>> Hello all, new to the list.
>>>>>>> While testing my openmpi5.0.7 installation using the simple
>>>>>>> mpi_hello_world.c code, I am experiencing an unexpected behavior
>>>>>>> where the execution on the last processor rank hangs with a "fatal
>>>>>>> unhandled timeout error", which leads to core dumps. I confirmed that
>>>>>>> it happens regardless of the compiler I use, i.e., gnu14 or
>>>>>>> intel2024.0. Moreover, it does not happen when I use mpich3.4.3-ofi.
>>>>>>> Below I am including the settings I am using and the runtime error.
>>>>>>> You will notice that the error happened on node c11, which may
>>>>>>> suggest that there is something wrong with this node. However, it
>>>>>>> turns out that any other node that happens to execute the last
>>>>>>> processor rank leads to the same error. I must be missing something.
>>>>>>> Any thoughts?
>>>>>>> Sorry about the length of the post.
>>>>>>>
>>>>>>> -----------------------------------------------------
>>>>>>> ]$ module list
>>>>>>> Currently Loaded Modules:
>>>>>>>   1) cmake/4.0.0        6) spack/0.23.1          11) mkl/2024.0         16) ifort/2024.0.0              21) EasyBuild/5.0.0
>>>>>>>   2) autotools          7) oclfpga/2024.0.0      12) intel/2024.0.0     17) inspector/2024.2            22) valgrind/3.24.0
>>>>>>>   3) hwloc/2.12.0       8) tbb/2021.11           13) debugger/2024.0.0  18) intel_ipp_intel64/2021.10   23) openmpi5/5.0.7
>>>>>>>   4) libfabric/1.18.0   9) compiler-rt/2024.0.0  14) dpl/2022.3         19) intel_ippcp_intel64/2021.9  24) ucx/1.18.0
>>>>>>>   5) prun/2.2          10) compiler/2024.0.0     15) icc/2023.2.1       20) vtune/2025.3
>>>>>>> ----------------------------------------------------------
>>>>>>> $ sinfo
>>>>>>> PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
>>>>>>> normal*      up   infinite     10  idle* c[2-10,12]
>>>>>>> normal*      up   infinite      3   idle c[1,11,13]
>>>>>>> [av@sms test]$ salloc -n 24 -N 2 
>>>>>>> salloc: Granted job allocation 61
>>>>>>> salloc: Nodes c[1,11] are ready for job
>>>>>>> [av@c1 test]$ mpirun --display-map --map-by node -x MXM_RDMA_PORTS=mlx4_0:1 -mca btl_openib_if_include mlx4_0:1 mpi_hello_world
>>>>>>>
>>>>>>> ========================   JOB MAP   ========================
>>>>>>> Data for JOB prterun-c1-1575@1 offset 0 Total slots allocated 24
>>>>>>>     Mapping policy: BYNODE:NOOVERSUBSCRIBE  Ranking policy: NODE  Binding policy: NUMA:IF-SUPPORTED
>>>>>>>     Cpu set: N/A  PPR: N/A  Cpus-per-rank: N/A  Cpu Type: CORE
>>>>>>>
>>>>>>>
>>>>>>> Data for node: c1 Num slots: 12 Max slots: 0 Num procs: 12
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 0 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 2 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 4 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 6 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 8 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 10 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 12 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 14 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 16 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 18 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 20 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 22 Bound: package[0][core:0-17]
>>>>>>>
>>>>>>> Data for node: c11 Num slots: 12 Max slots: 0 Num procs: 12
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 1 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 3 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 5 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 7 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 9 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 11 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 13 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 15 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 17 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 19 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 21 Bound: package[0][core:0-17]
>>>>>>>         Process jobid: prterun-c1-1575@1 App: 0 Process rank: 23 Bound: package[0][core:0-17]
>>>>>>>
>>>>>>> =============================================================
>>>>>>> Hello world from processor c1, rank 6 out of 24 processors
>>>>>>> Hello world from processor c1, rank 20 out of 24 processors
>>>>>>> Hello world from processor c1, rank 16 out of 24 processors
>>>>>>> Hello world from processor c1, rank 12 out of 24 processors
>>>>>>> Hello world from processor c1, rank 0 out of 24 processors
>>>>>>> Hello world from processor c1, rank 2 out of 24 processors
>>>>>>> Hello world from processor c1, rank 14 out of 24 processors
>>>>>>> Hello world from processor c1, rank 10 out of 24 processors
>>>>>>> Hello world from processor c1, rank 4 out of 24 processors
>>>>>>> Hello world from processor c1, rank 22 out of 24 processors
>>>>>>> Hello world from processor c1, rank 18 out of 24 processors
>>>>>>> Hello world from processor c1, rank 8 out of 24 processors
>>>>>>> Hello world from processor c11, rank 11 out of 24 processors
>>>>>>> Hello world from processor c11, rank 1 out of 24 processors
>>>>>>> Hello world from processor c11, rank 3 out of 24 processors
>>>>>>> Hello world from processor c11, rank 13 out of 24 processors
>>>>>>> Hello world from processor c11, rank 19 out of 24 processors
>>>>>>> Hello world from processor c11, rank 7 out of 24 processors
>>>>>>> Hello world from processor c11, rank 17 out of 24 processors
>>>>>>> Hello world from processor c11, rank 21 out of 24 processors
>>>>>>> Hello world from processor c11, rank 15 out of 24 processors
>>>>>>> Hello world from processor c11, rank 23 out of 24 processors
>>>>>>> Hello world from processor c11, rank 9 out of 24 processors
>>>>>>> Hello world from processor c11, rank 5 out of 24 processors
>>>>>>> [c11:2028 :0:2028]       ud_ep.c:278  Fatal: UD endpoint 0x1c8da90 to <no debug data>: unhandled timeout error
>>>>>>> [c11:2035 :0:2035]       ud_ep.c:278  Fatal: UD endpoint 0x722a90 to <no debug data>: unhandled timeout error
>>>>>>> [c11:2025 :0:2025]       ud_ep.c:278  Fatal: UD endpoint 0xc52a90 to <no debug data>: unhandled timeout error
>>>>>>> ==== backtrace (tid:   2028) ====
>>>>>>>  0  /opt/ohpc/pub/mpi/ucx-ohpc/1.18.0/lib/libucs.so.0(ucs_handle_error+0x294) [0x7fade4326ee4]
>>>>>>>  1  /opt/ohpc/pub/mpi/ucx-ohpc/1.18.0/lib/libucs.so.0(ucs_fatal_error_message+0xb2) [0x7fade4324292]
>>>>>>>  2  /opt/ohpc/pub/mpi/ucx-ohpc/1.18.0/lib/libucs.so.0(+0x2f369) [0x7fade4324369]
>>>>>>>  3  /opt/ohpc/pub/mpi/ucx-ohpc/1.18.0/lib/ucx/libuct_ib.so.0(+0x263f0) [0x7fade110d3f0]
>>>>>>>  4  /opt/ohpc/pub/mpi/ucx-ohpc/1.18.0/lib/libucs.so.0(+0x24987) [0x7fade4319987]
>>>>>>>  5  /opt/ohpc/pub/mpi/ucx-ohpc/1.18.0/lib/libucp.so.0(ucp_worker_progress+0x2a) [0x7fade43abc9a]
>>>>>>>  6  /opt/ohpc/pub/mpi/openmpi5-intel/5.0.7/lib/libopen-pal.so.80(+0xa09bc) [0x7fade471b9bc]
>>>>>>>  7  /opt/ohpc/pub/mpi/openmpi5-intel/5.0.7/lib/libopen-pal.so.80(opal_common_ucx_del_procs_nofence+0x6a) [0x7fade471b79a]
>>>>>>>  8  /opt/ohpc/pub/mpi/openmpi5-intel/5.0.7/lib/libopen-pal.so.80(opal_common_ucx_del_procs+0x20) [0x7fade471baf0]
>>>>>>>  9  /opt/ohpc/pub/mpi/openmpi5-intel/5.0.7/lib/libmpi.so.40(mca_pml_ucx_del_procs+0x140) [0x7fade4d1cd70]
>>>>>>> 10  /opt/ohpc/pub/mpi/openmpi5-intel/5.0.7/lib/libmpi.so.40(+0xac837) [0x7fade4b27837]
>>>>>>> 11  /opt/ohpc/pub/mpi/openmpi5-intel/5.0.7/lib/libopen-pal.so.80(opal_finalize_cleanup_domain+0x53) [0x7fade46aebd3]
>>>>>>> 12  /opt/ohpc/pub/mpi/openmpi5-intel/5.0.7/lib/libopen-pal.so.80(opal_finalize+0x2e) [0x7fade46a22be]
>>>>>>> 13  /opt/ohpc/pub/mpi/openmpi5-intel/5.0.7/lib/libmpi.so.40(ompi_rte_finalize+0x1f9) [0x7fade4b21909]
>>>>>>> 14  /opt/ohpc/pub/mpi/openmpi5-intel/5.0.7/lib/libmpi.so.40(+0xab304) [0x7fade4b26304]
>>>>>>> 15  /opt/ohpc/pub/mpi/openmpi5-intel/5.0.7/lib/libmpi.so.40(ompi_mpi_instance_finalize+0xe5) [0x7fade4b26935]
>>>>>>> 16  /opt/ohpc/pub/mpi/openmpi5-intel/5.0.7/lib/libmpi.so.40(ompi_mpi_finalize+0x3d1) [0x7fade4b1e091]
>>>>>>> 17  mpi_hello_world() [0x40258f]
>>>>>>> 18  /lib64/libc.so.6(+0x295d0) [0x7fade47b95d0]
>>>>>>> 19  /lib64/libc.so.6(__libc_start_main+0x80) [0x7fade47b9680]
>>>>>>> 20  mpi_hello_world() [0x402455]
>>>>>>> =================================
>>>>>>> [c11:02028] *** Process received signal ***
>>>>>>> [c11:02028] Signal: Aborted (6)
>>>>>>> [c11:02028] Signal code:  (-6)
>>>>>>> [c11:02028] [ 0] /lib64/libc.so.6(+0x3ebf0)[0x7fade47cebf0]
>>>>>>> [c11:02028] [ 1] /lib64/libc.so.6(+0x8bedc)[0x7fade481bedc]
>>>>>>> [c11:02028] [ 2] /lib64/libc.so.6(raise+0x16)[0x7fade47ceb46]
>>>>>>> [c11:02028] [ 3] /lib64/libc.so.6(abort+0xd3)[0x7fade47b8833]
>>>>>>> [c11:02028] [ 4] /opt/ohpc/pub/mpi/ucx-ohpc/1.18.0/lib/libucs.so.0(+0x2f297)[0x7fade4324297]
>>>>>>> [c11:02028] [ 5] /opt/ohpc/pub/mpi/ucx-ohpc/1.18.0/lib/libucs.so.0(+0x2f369)[0x7fade4324369]
>>>>>>> [c11:02028] [ 6] /opt/ohpc/pub/mpi/ucx-ohpc/1.18.0/lib/ucx/libuct_ib.so.0(+0x263f0)[0x7fade110d3f0]
>>>>>>> [c11:02028] [ 7] /opt/ohpc/pub/mpi/ucx-ohpc/1.18.0/lib/libucs.so.0(+0x24987)[0x7fade4319987]
>>>>>>> [c11:02028] [ 8] /opt/ohpc/pub/mpi/ucx-ohpc/1.18.0/lib/libucp.so.0(ucp_worker_progress+0x2a)[0x7fade43abc9a]
>>>>>>> [c11:02028] [ 9] /opt/ohpc/pub/mpi/openmpi5-intel/5.0.7/lib/libopen-pal.so.80(+0xa09bc)[0x7fade471b9bc]
>>>>>>> [c11:02028] [10] /opt/ohpc/pub/mpi/openmpi5-intel/5.0.7/lib/libopen-pal.so.80(opal_common_ucx_del_procs_nofence+0x6a)[0x7fade471b79a]
>>>>>>> [c11:02028] [11] /opt/ohpc/pub/mpi/openmpi5-intel/5.0.7/lib/libopen-pal.so.80(opal_common_ucx_del_procs+0x20)[0x7fade471baf0]
>>>>>>> [c11:02028] [12] ==== backtrace (tid:   2035) ====
>>>>>>> ..................
>>>>>>>
>>>>>>> --------------------------------------------------------------------------------
>>>>>>> Achilles
>>>>>>>
>>>>>>
