Hi, here is the gdb output for additional information (it might be inexact because I built openmpi-2.0.0 without the debug option):
Core was generated by `osu_bw'.
Program terminated with signal 11, Segmentation fault.
#0  0x00000031d9008806 in ?? () from /lib64/libgcc_s.so.1
(gdb) where
#0  0x00000031d9008806 in ?? () from /lib64/libgcc_s.so.1
#1  0x00000031d9008934 in _Unwind_Backtrace () from /lib64/libgcc_s.so.1
#2  0x00000037ab8e5ee8 in backtrace () from /lib64/libc.so.6
#3  0x00002ad882bd4345 in opal_backtrace_print () at ./backtrace_execinfo.c:47
#4  0x00002ad882bd1180 in show_stackframe () at ./stacktrace.c:331
#5  <signal handler called>
#6  mca_pml_ob1_recv_request_schedule_once () at ./pml_ob1_recvreq.c:983
#7  0x00002aaab412f47a in mca_pml_ob1_recv_request_progress_rndv () from /home/mishima/opt/mpi/openmpi-2.0.0-pgi16.5/lib/openmpi/mca_pml_ob1.so
#8  0x00002aaab412c645 in mca_pml_ob1_recv_frag_match () at ./pml_ob1_recvfrag.c:715
#9  0x00002aaab412bba6 in mca_pml_ob1_recv_frag_callback_rndv () at ./pml_ob1_recvfrag.c:267
#10 0x00002aaaaf2748d3 in mca_btl_vader_poll_handle_frag () at ./btl_vader_component.c:589
#11 0x00002aaaaf274b9a in mca_btl_vader_component_progress () at ./btl_vader_component.c:231
#12 0x00002ad882b916fc in opal_progress () at runtime/opal_progress.c:224
#13 0x00002ad8820a9aa5 in ompi_request_default_wait_all () at request/req_wait.c:77
#14 0x00002ad8820f10dd in PMPI_Waitall () at ./pwaitall.c:76
#15 0x0000000000401108 in main () at ./osu_bw.c:144

Tetsuya

On 2016/08/08 12:34:57, "devel" wrote in "Re: [OMPI devel] sm BTL performace of the openmpi-2.0.0":
> Hi, it caused a segfault as below:
>
> [manage.cluster:25436] MCW rank 0 bound to socket 0[core 0[hwt 0]], socket 0[core 1[hwt 0]], socket 0[core 2[hwt 0]], socket 0[core 3[hwt 0]], socket 0[core 4[hwt 0]], socket 0[core 5[hwt 0]]: [B/B/B/B/B/B][./././././.]
> [manage.cluster:25436] MCW rank 1 bound to socket 0[core 0[hwt 0]], socket 0[core 1[hwt 0]], socket 0[core 2[hwt 0]], socket 0[core 3[hwt 0]], socket 0[core 4[hwt 0]], socket 0[core 5[hwt 0]]: [B/B/B/B/B/B][./././././.]
> # OSU MPI Bandwidth Test v3.1.1
> # Size        Bandwidth (MB/s)
> 1             2.23
> 2             4.51
> 4             8.99
> 8             17.83
> 16            35.18
> 32            69.66
> 64            109.84
> 128           179.65
> 256           303.52
> 512           532.81
> 1024          911.74
> 2048          1605.29
> 4096          1598.73
> 8192          2135.94
> 16384         2468.98
> 32768         2818.37
> 65536         3658.83
> 131072        4200.50
> 262144        4545.01
> 524288        4757.84
> 1048576       4831.75
> [manage:25442] *** Process received signal ***
> [manage:25442] Signal: Segmentation fault (11)
> [manage:25442] Signal code: Address not mapped (1)
> [manage:25442] Failing at address: 0x8
> --------------------------------------------------------------------------
> mpirun noticed that process rank 1 with PID 0 on node manage exited on signal 11 (Segmentation fault).
> --------------------------------------------------------------------------
>
> Tetsuya Mishima
>
> On 2016/08/08 10:12:05, "devel" wrote in "Re: [OMPI devel] sm BTL performace of the openmpi-2.0.0":
> > This patch also modifies the put path.
> > Let me know if this works:
> >
> > diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.c b/ompi/mca/pml/ob1/pml_ob1_rdma.c
> > index 888e126..a3ec6f8 100644
> > --- a/ompi/mca/pml/ob1/pml_ob1_rdma.c
> > +++ b/ompi/mca/pml/ob1/pml_ob1_rdma.c
> > @@ -42,6 +42,7 @@ size_t mca_pml_ob1_rdma_btls(
> >                               mca_pml_ob1_com_btl_t* rdma_btls)
> >  {
> >      int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
> > +    int num_eager_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager);
> >      double weight_total = 0;
> >      int num_btls_used = 0;
> >
> > @@ -57,6 +58,21 @@ size_t mca_pml_ob1_rdma_btls(
> >              (bml_endpoint->btl_rdma_index + n) % num_btls);
> >          mca_btl_base_registration_handle_t *reg_handle = NULL;
> >          mca_btl_base_module_t *btl = bml_btl->btl;
> > +        bool ignore = true;
> > +
> > +        /* do not use rdma btls that are not in the eager list. this is necessary to avoid using
> > +         * btls that exist on the endpoint only to support RMA. */
> > +        for (int i = 0 ; i < num_eager_btls ; ++i) {
> > +            mca_bml_base_btl_t *eager_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_eager, i);
> > +            if (eager_btl->btl_endpoint == bml_btl->btl_endpoint) {
> > +                ignore = false;
> > +                break;
> > +            }
> > +        }
> > +
> > +        if (ignore) {
> > +            continue;
> > +        }
> >
> >          if (btl->btl_register_mem) {
> >              /* do not use the RDMA protocol with this btl if 1) leave pinned is disabled,
> > @@ -99,18 +115,34 @@ size_t mca_pml_ob1_rdma_pipeline_btls ( mca_bml_base_endpoint_t* bml_endpoint,
> >                                          size_t size,
> >                                          mca_pml_ob1_com_btl_t* rdma_btls )
> >  {
> > -    int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
> > +    int num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma);
> > +    int num_eager_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager);
> >      double weight_total = 0;
> > +    int rdma_count = 0;
> >
> > -    for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
> > -        rdma_btls[i].bml_btl =
> > -            mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
> > -        rdma_btls[i].btl_reg = NULL;
> > +    for(int i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
> > +        mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_next (&bml_endpoint->btl_rdma);
> > +        bool ignore = true;
> > +
> > +        for (int i = 0 ; i < num_eager_btls ; ++i) {
> > +            mca_bml_base_btl_t *eager_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_eager, i);
> > +            if (eager_btl->btl_endpoint == bml_btl->btl_endpoint) {
> > +                ignore = false;
> > +                break;
> > +            }
> > +        }
> >
> > -        weight_total += rdma_btls[i].bml_btl->btl_weight;
> > +        if (ignore) {
> > +            continue;
> > +        }
> > +
> > +        rdma_btls[rdma_count].bml_btl = bml_btl;
> > +        rdma_btls[rdma_count++].btl_reg = NULL;
> > +
> > +        weight_total += bml_btl->btl_weight;
> >      }
> >
> > -    mca_pml_ob1_calc_weighted_length(rdma_btls, i, size, weight_total);
> > +    mca_pml_ob1_calc_weighted_length (rdma_btls, rdma_count, size, weight_total);
> >
> > -    return i;
> > +    return rdma_count;
> >  }
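[The essence of both hunks in the quoted diff is a membership test - keep an RDMA-capable BTL only if the same endpoint also appears in the peer's eager list - and the pipeline hunk then counts weights over only the BTLs it keeps. Below is a standalone sketch of that logic with simplified, hypothetical types; it is not code from the Open MPI tree, and the proportional split only reflects an assumption about what mca_pml_ob1_calc_weighted_length() does, based on its name and arguments.]

#include <stdbool.h>
#include <stddef.h>

/* Simplified stand-ins for the bml/pml structures (hypothetical, illustration only). */
typedef struct { void *btl_endpoint; double btl_weight; } btl_entry_t;
typedef struct { btl_entry_t *bml_btl; size_t length; } com_btl_t;

/* The membership test both hunks add: keep an RDMA BTL only if the same
 * endpoint also shows up in the peer's eager list. */
static bool endpoint_is_eager(const btl_entry_t *candidate,
                              const btl_entry_t *eager, int num_eager)
{
    for (int i = 0; i < num_eager; ++i) {
        if (eager[i].btl_endpoint == candidate->btl_endpoint) {
            return true;
        }
    }
    return false;
}

/* Why the pipeline hunk switches from i/num_btls to rdma_count/weight_total over
 * only the kept BTLs: assuming the weighted-length step splits `size` in
 * proportion to btl_weight, counting a skipped BTL would strand part of the
 * message on a BTL that is never used. */
static void split_by_weight(com_btl_t *rdma, int rdma_count,
                            size_t size, double weight_total)
{
    size_t assigned = 0;
    for (int i = 0; i < rdma_count; ++i) {
        rdma[i].length = (size_t)((rdma[i].bml_btl->btl_weight / weight_total) * (double)size);
        assigned += rdma[i].length;
    }
    if (rdma_count > 0) {
        rdma[0].length += size - assigned;  /* hand any rounding remainder to the first BTL */
    }
}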
> > > On Aug 7, 2016, at 6:51 PM, Nathan Hjelm <hje...@me.com> wrote:
> > >
> > > Looks like the put path probably needs a similar patch. Will send another patch soon.
> > >
> > >> On Aug 7, 2016, at 6:01 PM, tmish...@jcity.maeda.co.jp wrote:
> > >>
> > >> Hi,
> > >>
> > >> I applied the patch to the file "pml_ob1_rdma.c" and ran osu_bw again.
> > >> Then, I still see the bad performance for larger sizes (>= 2097152).
> > >>
> > >> [mishima@manage OMB-3.1.1-openmpi2.0.0]$ mpirun -np 2 -report-bindings osu_bw
> > >> [manage.cluster:27444] MCW rank 0 bound to socket 0[core 0[hwt 0]], socket 0[core 1[hwt 0]], socket 0[core 2[hwt 0]], socket 0[core 3[hwt 0]], socket 0[core 4[hwt 0]], socket 0[core 5[hwt 0]]: [B/B/B/B/B/B][./././././.]
> > >> [manage.cluster:27444] MCW rank 1 bound to socket 0[core 0[hwt 0]], socket 0[core 1[hwt 0]], socket 0[core 2[hwt 0]], socket 0[core 3[hwt 0]], socket 0[core 4[hwt 0]], socket 0[core 5[hwt 0]]: [B/B/B/B/B/B][./././././.]
> > >> # OSU MPI Bandwidth Test v3.1.1
> > >> # Size        Bandwidth (MB/s)
> > >> 1             2.23
> > >> 2             4.52
> > >> 4             8.82
> > >> 8             17.83
> > >> 16            35.31
> > >> 32            69.49
> > >> 64            109.46
> > >> 128           178.51
> > >> 256           307.68
> > >> 512           532.64
> > >> 1024          909.34
> > >> 2048          1583.95
> > >> 4096          1554.74
> > >> 8192          2120.31
> > >> 16384         2489.79
> > >> 32768         2853.66
> > >> 65536         3692.82
> > >> 131072        4236.67
> > >> 262144        4575.63
> > >> 524288        4778.47
> > >> 1048576       4839.34
> > >> 2097152       2231.46
> > >> 4194304       1505.48
> > >>
> > >> Regards,
> > >>
> > >> Tetsuya Mishima
> > >>
> > >> On 2016/08/06 0:00:08, "devel" wrote in "Re: [OMPI devel] sm BTL performace of the openmpi-2.0.0":
> > >>> Making ob1 ignore RDMA btls that are not in use for eager messages might be sufficient. Please try the following patch and let me know if it works for you.
> > >>>
> > >>> diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.c b/ompi/mca/pml/ob1/pml_ob1_rdma.c
> > >>> index 888e126..0c99525 100644
> > >>> --- a/ompi/mca/pml/ob1/pml_ob1_rdma.c
> > >>> +++ b/ompi/mca/pml/ob1/pml_ob1_rdma.c
> > >>> @@ -42,6 +42,7 @@ size_t mca_pml_ob1_rdma_btls(
> > >>>                               mca_pml_ob1_com_btl_t* rdma_btls)
> > >>>  {
> > >>>      int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
> > >>> +    int num_eager_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager);
> > >>>      double weight_total = 0;
> > >>>      int num_btls_used = 0;
> > >>>
> > >>> @@ -57,6 +58,21 @@ size_t mca_pml_ob1_rdma_btls(
> > >>>              (bml_endpoint->btl_rdma_index + n) % num_btls);
> > >>>          mca_btl_base_registration_handle_t *reg_handle = NULL;
> > >>>          mca_btl_base_module_t *btl = bml_btl->btl;
> > >>> +        bool ignore = true;
> > >>> +
> > >>> +        /* do not use rdma btls that are not in the eager list. this is necessary to avoid using
> > >>> +         * btls that exist on the endpoint only to support RMA. */
> > >>> +        for (int i = 0 ; i < num_eager_btls ; ++i) {
> > >>> +            mca_bml_base_btl_t *eager_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_eager, i);
> > >>> +            if (eager_btl->btl_endpoint == bml_btl->btl_endpoint) {
> > >>> +                ignore = false;
> > >>> +                break;
> > >>> +            }
> > >>> +        }
> > >>> +
> > >>> +        if (ignore) {
> > >>> +            continue;
> > >>> +        }
> > >>>
> > >>>          if (btl->btl_register_mem) {
> > >>>              /* do not use the RDMA protocol with this btl if 1) leave pinned is disabled,
> > >>>
> > >>> -Nathan
> > >>>
> > >>>> On Aug 5, 2016, at 8:44 AM, Nathan Hjelm <hje...@me.com> wrote:
> > >>>>
> > >>>> Nope. We are not going to change the flags as this will disable the btl for one-sided. Not sure what is going on here as the openib btl should be 1) not used for pt2pt, and 2) polled infrequently.
> > >>>> The btl debug log suggests both of these are the case. Not sure what is going on yet.
> > >>>> -Nathan
> > >>>>
> > >>>>> On Aug 5, 2016, at 8:16 AM, r...@open-mpi.org wrote:
> > >>>>>
> > >>>>> Perhaps those flags need to be the default?
> > >>>>>
> > >>>>>> On Aug 5, 2016, at 7:14 AM, tmish...@jcity.maeda.co.jp wrote:
> > >>>>>>
> > >>>>>> Hi Christoph,
> > >>>>>>
> > >>>>>> I applied the commits - pull/#1250 as Nathan told me - and added "-mca btl_openib_flags 311" to the mpirun command line option; then it worked for me. I don't know the reason, but it looks like ATOMIC_FOP in the btl_openib_flags degrades the sm/vader performance.
> > >>>>>>
> > >>>>>> Regards,
> > >>>>>> Tetsuya Mishima
> > >>>>>>
> > >>>>>> On 2016/08/05 22:10:37, "devel" wrote in "Re: [OMPI devel] sm BTL performace of the openmpi-2.0.0":
> > >>>>>>> Hello,
> > >>>>>>>
> > >>>>>>> We see the same problem here on various machines with Open MPI 2.0.0.
> > >>>>>>> To us it seems that enabling the openib btl triggers bad performance for the sm AND vader btls!
> > >>>>>>> --mca btl_base_verbose 10 reports in both cases the correct use of sm and vader between MPI ranks - only performance differs?!
> > >>>>>>>
> > >>>>>>> One irritating thing I see in the log output is the following:
> > >>>>>>> openib BTL: rdmacm CPC unavailable for use on mlx4_0:1; skipped
> > >>>>>>> [rank=1] openib: using port mlx4_0:1
> > >>>>>>> select: init of component openib returned success
> > >>>>>>>
> > >>>>>>> Did not look into the "skipped" code part yet, ...
> > >>>>>>>
> > >>>>>>> Results see below.
> > >>>>>>>
> > >>>>>>> Best regards
> > >>>>>>> Christoph Niethammer
> > >>>>>>>
> > >>>>>>> --
> > >>>>>>>
> > >>>>>>> Christoph Niethammer
> > >>>>>>> High Performance Computing Center Stuttgart (HLRS)
> > >>>>>>> Nobelstrasse 19
> > >>>>>>> 70569 Stuttgart
> > >>>>>>>
> > >>>>>>> Tel: ++49(0)711-685-87203
> > >>>>>>> email: nietham...@hlrs.de
> > >>>>>>> http://www.hlrs.de/people/niethammer
> > >>>>>>>
> > >>>>>>> mpirun -np 2 --mca btl self,vader osu_bw
> > >>>>>>> # OSU MPI Bandwidth Test
> > >>>>>>> # Size        Bandwidth (MB/s)
> > >>>>>>> 1             4.83
> > >>>>>>> 2             10.30
> > >>>>>>> 4             24.68
> > >>>>>>> 8             49.27
> > >>>>>>> 16            95.80
> > >>>>>>> 32            187.52
> > >>>>>>> 64            270.82
> > >>>>>>> 128           405.00
> > >>>>>>> 256           659.26
> > >>>>>>> 512           1165.14
> > >>>>>>> 1024          2372.83
> > >>>>>>> 2048          3592.85
> > >>>>>>> 4096          4283.51
> > >>>>>>> 8192          5523.55
> > >>>>>>> 16384         7388.92
> > >>>>>>> 32768         7024.37
> > >>>>>>> 65536         7353.79
> > >>>>>>> 131072        7465.96
> > >>>>>>> 262144        8597.56
> > >>>>>>> 524288        9292.86
> > >>>>>>> 1048576       9168.01
> > >>>>>>> 2097152       9009.62
> > >>>>>>> 4194304       9013.02
> > >>>>>>>
> > >>>>>>> mpirun -np 2 --mca btl self,vader,openib osu_bw
> > >>>>>>> # OSU MPI Bandwidth Test
> > >>>>>>> # Size        Bandwidth (MB/s)
> > >>>>>>> 1             5.32
> > >>>>>>> 2             11.14
> > >>>>>>> 4             20.88
> > >>>>>>> 8             49.26
> > >>>>>>> 16            99.11
> > >>>>>>> 32            197.42
> > >>>>>>> 64            301.08
> > >>>>>>> 128           413.64
> > >>>>>>> 256           651.15
> > >>>>>>> 512           1161.12
> > >>>>>>> 1024          2460.99
> > >>>>>>> 2048          3627.36
> > >>>>>>> 4096          2191.06
> > >>>>>>> 8192          3118.36
> > >>>>>>> 16384         3428.45
> > >>>>>>> 32768         3676.96
> > >>>>>>> 65536         3709.65
> > >>>>>>> 131072        3748.64
> > >>>>>>> 262144        3764.88
> > >>>>>>> 524288        3764.61
> > >>>>>>> 1048576       3772.45
> > >>>>>>> 2097152       3757.37
> > >>>>>>> 4194304       3746.45
> > >>>>>>>
> > >>>>>>> mpirun -np 2 --mca btl self,sm osu_bw
> > >>>>>>> # OSU MPI Bandwidth Test
> > >>>>>>> # Size        Bandwidth (MB/s)
> > >>>>>>> 1             2.98
> > >>>>>>> 2             5.97
> > >>>>>>> 4             11.99
> > >>>>>>> 8             23.47
> > >>>>>>> 16            50.64
> > >>>>>>> 32            99.91
> > >>>>>>> 64            197.87
> > >>>>>>> 128           343.32
> > >>>>>>> 256           667.48
> > >>>>>>> 512           1200.86
> > >>>>>>> 1024          2050.05
> > >>>>>>> 2048          3578.52
> > >>>>>>> 4096          3966.92
> > >>>>>>> 8192          5687.96
> > >>>>>>> 16384         7395.88
> > >>>>>>> 32768         7101.41
> > >>>>>>> 65536         7619.49
> > >>>>>>> 131072        7978.09
> > >>>>>>> 262144        8648.87
> > >>>>>>> 524288        9129.18
> > >>>>>>> 1048576       10525.31
> > >>>>>>> 2097152       10511.63
> > >>>>>>> 4194304       10489.66
> > >>>>>>>
> > >>>>>>> mpirun -np 2 --mca btl self,sm,openib osu_bw
> > >>>>>>> # OSU MPI Bandwidth Test
> > >>>>>>> # Size        Bandwidth (MB/s)
> > >>>>>>> 1             2.02
> > >>>>>>> 2             3.00
> > >>>>>>> 4             9.99
> > >>>>>>> 8             19.96
> > >>>>>>> 16            40.10
> > >>>>>>> 32            70.63
> > >>>>>>> 64            144.08
> > >>>>>>> 128           282.21
> > >>>>>>> 256           543.55
> > >>>>>>> 512           1032.61
> > >>>>>>> 1024          1871.09
> > >>>>>>> 2048          3294.07
> > >>>>>>> 4096          2336.48
> > >>>>>>> 8192          3142.22
> > >>>>>>> 16384         3419.93
> > >>>>>>> 32768         3647.30
> > >>>>>>> 65536         3725.40
> > >>>>>>> 131072        3749.43
> > >>>>>>> 262144        3765.31
> > >>>>>>> 524288        3771.06
> > >>>>>>> 1048576       3772.54
> > >>>>>>> 2097152       3760.93
> > >>>>>>> 4194304       3745.37
> > >>>>>>>
> > >>>>>>> ----- Original Message -----
> > >>>>>>> From: tmish...@jcity.maeda.co.jp
> > >>>>>>> To: "Open MPI Developers" <de...@open-mpi.org>
> > >>>>>>> Sent: Wednesday, July 27, 2016 6:04:48 AM
> > >>>>>>> Subject: Re: [OMPI devel] sm BTL performace of the openmpi-2.0.0
> > >>>>>>>
> > >>>>>>> Hi Nathan,
> > >>>>>>>
> > >>>>>>> I applied those commits and ran again without any BTL specified.
> > >>>>>>>
> > >>>>>>> Then, although it says "mca: bml: Using vader btl for send to [[18993,1],1] on node manage", the osu_bw still shows it's very slow, as shown below:
> > >>>>>>>
> > >>>>>>> [mishima@manage OMB-3.1.1-openmpi2.0.0]$ mpirun -np 2 -mca btl_base_verbose 10 -bind-to core -report-bindings osu_bw
> > >>>>>>> [manage.cluster:17482] MCW rank 0 bound to socket 0[core 0[hwt 0]]: [B/././././.][./././././.]
> > >>>>>>> [manage.cluster:17482] MCW rank 1 bound to socket 0[core 1[hwt 0]]: [./B/./././.][./././././.]
> > >>>>>>> [manage.cluster:17487] mca: base: components_register: registering framework btl components
> > >>>>>>> [manage.cluster:17487] mca: base: components_register: found loaded component self
> > >>>>>>> [manage.cluster:17487] mca: base: components_register: component self register function successful
> > >>>>>>> [manage.cluster:17487] mca: base: components_register: found loaded component vader
> > >>>>>>> [manage.cluster:17488] mca: base: components_register: registering framework btl components
> > >>>>>>> [manage.cluster:17488] mca: base: components_register: found loaded component self
> > >>>>>>> [manage.cluster:17487] mca: base: components_register: component vader register function successful
> > >>>>>>> [manage.cluster:17488] mca: base: components_register: component self register function successful
> > >>>>>>> [manage.cluster:17488] mca: base: components_register: found loaded component vader
> > >>>>>>> [manage.cluster:17487] mca: base: components_register: found loaded component tcp
> > >>>>>>> [manage.cluster:17488] mca: base: components_register: component vader register function successful
> > >>>>>>> [manage.cluster:17488] mca: base: components_register: found loaded component tcp
> > >>>>>>> [manage.cluster:17487] mca: base: components_register: component tcp register function successful
> > >>>>>>> [manage.cluster:17487] mca: base: components_register: found loaded component sm
> > >>>>>>> [manage.cluster:17488] mca: base: components_register: component tcp register function successful
> > >>>>>>> [manage.cluster:17488] mca: base: components_register: found loaded component sm
> > >>>>>>> [manage.cluster:17487] mca: base: components_register: component sm register function successful
> > >>>>>>> [manage.cluster:17488] mca: base: components_register: component sm register function successful
> > >>>>>>> [manage.cluster:17488] mca: base: components_register: found loaded component openib
> > >>>>>>> [manage.cluster:17487] mca: base: components_register: found loaded component openib
> > >>>>>>> [manage.cluster:17488] mca: base: components_register: component openib register function successful
> > >>>>>>> [manage.cluster:17488] mca: base: components_open: opening btl components
> > >>>>>>> [manage.cluster:17488] mca: base: components_open: found loaded component self
> > >>>>>>> [manage.cluster:17488] mca: base: components_open: component self open function successful
> > >>>>>>> [manage.cluster:17488] mca: base: components_open: found loaded component vader
> > >>>>>>> [manage.cluster:17488] mca: base: components_open: component vader open function successful
> > >>>>>>> [manage.cluster:17488] mca: base: components_open: found loaded component tcp
> > >>>>>>> [manage.cluster:17488] mca: base: components_open: component tcp open function successful
> > >>>>>>> [manage.cluster:17488] mca: base: components_open: found loaded component sm
> > >>>>>>> [manage.cluster:17488] mca: base: components_open: component sm open function successful
> > >>>>>>> [manage.cluster:17488] mca: base: components_open: found loaded component openib
> > >>>>>>> [manage.cluster:17488] mca: base: components_open: component openib open function successful
> > >>>>>>> [manage.cluster:17488] select: initializing btl component self
> > >>>>>>> [manage.cluster:17488] select: init of component self returned success
> > >>>>>>> [manage.cluster:17488] select: initializing btl component vader
> > >>>>>>> [manage.cluster:17487] mca: base: components_register: component openib register function successful
> > >>>>>>> [manage.cluster:17487] mca: base: components_open: opening btl components
> > >>>>>>> [manage.cluster:17487] mca: base: components_open: found loaded component self
> > >>>>>>> [manage.cluster:17487] mca: base: components_open: component self open function successful
> > >>>>>>> [manage.cluster:17487] mca: base: components_open: found loaded component vader
> > >>>>>>> [manage.cluster:17487] mca: base: components_open: component vader open function successful
> > >>>>>>> [manage.cluster:17487] mca: base: components_open: found loaded component tcp
> > >>>>>>> [manage.cluster:17487] mca: base: components_open: component tcp open function successful
> > >>>>>>> [manage.cluster:17487] mca: base: components_open: found loaded component sm
> > >>>>>>> [manage.cluster:17487] mca: base: components_open: component sm open function successful
> > >>>>>>> [manage.cluster:17487] mca: base: components_open: found loaded component openib
> > >>>>>>> [manage.cluster:17488] select: init of component vader returned success
> > >>>>>>> [manage.cluster:17488] select: initializing btl component tcp
> > >>>>>>> [manage.cluster:17487] mca: base: components_open: component openib open function successful
> > >>>>>>> [manage.cluster:17487] select: initializing btl component self
> > >>>>>>> [manage.cluster:17487] select: init of component self returned success
> > >>>>>>> [manage.cluster:17487] select: initializing btl component vader
> > >>>>>>> [manage.cluster:17488] select: init of component tcp returned success
> > >>>>>>> [manage.cluster:17488] select: initializing btl component sm
> > >>>>>>> [manage.cluster:17488] select: init of component sm returned success
> > >>>>>>> [manage.cluster:17488] select: initializing btl component openib
> > >>>>>>> [manage.cluster:17487] select: init of component vader returned success
> > >>>>>>> [manage.cluster:17487] select: initializing btl component tcp
> > >>>>>>> [manage.cluster:17487] select: init of component tcp returned success
> > >>>>>>> [manage.cluster:17487] select: initializing btl component sm
> > >>>>>>> [manage.cluster:17488] Checking distance from this process to device=mthca0
> > >>>>>>> [manage.cluster:17488] hwloc_distances->nbobjs=2
> > >>>>>>> [manage.cluster:17488] hwloc_distances->latency[0]=1.000000
> > >>>>>>> [manage.cluster:17488] hwloc_distances->latency[1]=1.600000
> > >>>>>>> [manage.cluster:17488] hwloc_distances->latency[2]=1.600000
> > >>>>>>> [manage.cluster:17488] hwloc_distances->latency[3]=1.000000
> > >>>>>>> [manage.cluster:17488] ibv_obj->type set to NULL
> > >>>>>>> [manage.cluster:17488] Process is bound: distance to device is 0.000000
> > >>>>>>> [manage.cluster:17487] select: init of component sm returned success
> > >>>>>>> [manage.cluster:17487] select: initializing btl component openib
> > >>>>>>> [manage.cluster:17488] openib BTL: rdmacm CPC unavailable for use on mthca0:1; skipped
> > >>>>>>> [manage.cluster:17487] Checking distance from this process to device=mthca0
> > >>>>>>> [manage.cluster:17487] hwloc_distances->nbobjs=2
> > >>>>>>> [manage.cluster:17487] hwloc_distances->latency[0]=1.000000
> > >>>>>>> [manage.cluster:17487] hwloc_distances->latency[1]=1.600000
> > >>>>>>> [manage.cluster:17487] hwloc_distances->latency[2]=1.600000
> > >>>>>>> [manage.cluster:17487] hwloc_distances->latency[3]=1.000000
> > >>>>>>> [manage.cluster:17487] ibv_obj->type set to NULL
> > >>>>>>> [manage.cluster:17487] Process is bound: distance to device is 0.000000
> > >>>>>>> [manage.cluster:17488] [rank=1] openib: using port mthca0:1
> > >>>>>>> [manage.cluster:17488] select: init of component openib returned success
> > >>>>>>> [manage.cluster:17487] openib BTL: rdmacm CPC unavailable for use on mthca0:1; skipped
> > >>>>>>> [manage.cluster:17487] [rank=0] openib: using port mthca0:1
> > >>>>>>> [manage.cluster:17487] select: init of component openib returned success
> > >>>>>>> [manage.cluster:17488] mca: bml: Using self btl for send to [[18993,1],1] on node manage
> > >>>>>>> [manage.cluster:17487] mca: bml: Using self btl for send to [[18993,1],0] on node manage
> > >>>>>>> [manage.cluster:17488] mca: bml: Using vader btl for send to [[18993,1],0] on node manage
> > >>>>>>> [manage.cluster:17487] mca: bml: Using vader btl for send to [[18993,1],1] on node manage
> > >>>>>>> # OSU MPI Bandwidth Test v3.1.1
> > >>>>>>> # Size        Bandwidth (MB/s)
> > >>>>>>> 1             1.76
> > >>>>>>> 2             3.53
> > >>>>>>> 4             7.06
> > >>>>>>> 8             14.46
> > >>>>>>> 16            29.12
> > >>>>>>> 32            57.54
> > >>>>>>> 64            100.12
> > >>>>>>> 128           157.78
> > >>>>>>> 256           277.32
> > >>>>>>> 512           477.53
> > >>>>>>> 1024          894.81
> > >>>>>>> 2048          1330.68
> > >>>>>>> 4096          278.58
> > >>>>>>> 8192          516.00
> > >>>>>>> 16384         762.99
> > >>>>>>> 32768         1037.19
> > >>>>>>> 65536         1181.66
> > >>>>>>> 131072        1261.91
> > >>>>>>> 262144        1237.39
> > >>>>>>> 524288        1247.86
> > >>>>>>> 1048576       1252.04
> > >>>>>>> 2097152       1273.46
> > >>>>>>> 4194304       1281.21
> > >>>>>>> [manage.cluster:17488] mca: base: close: component self closed
> > >>>>>>> [manage.cluster:17488] mca: base: close: unloading component self
> > >>>>>>> [manage.cluster:17487] mca: base: close: component self closed
> > >>>>>>> [manage.cluster:17487] mca: base: close: unloading component self
> > >>>>>>> [manage.cluster:17488] mca: base: close: component vader closed
> > >>>>>>> [manage.cluster:17488] mca: base: close: unloading component vader
> > >>>>>>> [manage.cluster:17487] mca: base: close: component vader closed
> > >>>>>>> [manage.cluster:17487] mca: base: close: unloading component vader
> > >>>>>>> [manage.cluster:17488] mca: base: close: component tcp closed
> > >>>>>>> [manage.cluster:17488] mca: base: close: unloading component tcp
> > >>>>>>> [manage.cluster:17487] mca: base: close: component tcp closed
> > >>>>>>> [manage.cluster:17487] mca: base: close: unloading component tcp
> > >>>>>>> [manage.cluster:17488] mca: base: close: component sm closed
> > >>>>>>> [manage.cluster:17488] mca: base: close: unloading component sm
> > >>>>>>> [manage.cluster:17487] mca: base: close: component sm closed
> > >>>>>>> [manage.cluster:17487] mca: base: close: unloading component sm
> > >>>>>>> [manage.cluster:17488] mca: base: close: component openib closed
> > >>>>>>> [manage.cluster:17488] mca: base: close: unloading component openib
> > >>>>>>> [manage.cluster:17487] mca: base: close: component openib closed
> > >>>>>>> [manage.cluster:17487] mca: base: close: unloading component openib
> > >>>>>>>
> > >>>>>>> Tetsuya Mishima
> > >>>>>>>
> > >>>>>>> On 2016/07/27 9:20:28, "devel" wrote in "Re: [OMPI devel] sm BTL performace of the openmpi-2.0.0":
> > >>>>>>>> sm is deprecated in 2.0.0 and will likely be removed in favor of vader in 2.1.0.
> > >>>>>>>>
> > >>>>>>>> This issue is probably this known issue: https://github.com/open-mpi/ompi-release/pull/1250
> > >>>>>>>>
> > >>>>>>>> Please apply those commits and see if it fixes the issue for you.
> > >>>>>>>>
> > >>>>>>>> -Nathan
> > >>>>>>>>
> > >>>>>>>>> On Jul 26, 2016, at 6:17 PM, tmish...@jcity.maeda.co.jp wrote:
> > >>>>>>>>>
> > >>>>>>>>> Hi Gilles,
> > >>>>>>>>>
> > >>>>>>>>> Thanks. I ran again with --mca pml ob1 but I've got the same results as below:
> > >>>>>>>>>
> > >>>>>>>>> [mishima@manage OMB-3.1.1-openmpi2.0.0]$ mpirun -np 2 -mca pml ob1 -bind-to core -report-bindings osu_bw
> > >>>>>>>>> [manage.cluster:18142] MCW rank 0 bound to socket 0[core 0[hwt 0]]: [B/././././.][./././././.]
> > >>>>>>>>> [manage.cluster:18142] MCW rank 1 bound to socket 0[core 1[hwt 0]]: [./B/./././.][./././././.]
> > >>>>>>>>> # OSU MPI Bandwidth Test v3.1.1
> > >>>>>>>>> # Size        Bandwidth (MB/s)
> > >>>>>>>>> 1             1.48
> > >>>>>>>>> 2             3.07
> > >>>>>>>>> 4             6.26
> > >>>>>>>>> 8             12.53
> > >>>>>>>>> 16            24.33
_______________________________________________
devel mailing list
devel@lists.open-mpi.org
https://rfd.newmexicoconsortium.org/mailman/listinfo/devel
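[For context on the osu_bw numbers quoted throughout the thread: the benchmark times windows of nonblocking sends that are completed with a single MPI_Waitall, which is also the frame (PMPI_Waitall called from osu_bw.c) where the backtrace at the top ends up. The following is a minimal sketch of that measurement pattern with hypothetical window, message-size, and iteration parameters; it is an illustration of the pattern, not the OSU benchmark source.]

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#define WINDOW   64        /* requests in flight per iteration (hypothetical value) */
#define MSG_SIZE 1048576   /* message size in bytes (hypothetical value) */
#define ITERS    100

int main(int argc, char **argv)
{
    int rank;
    MPI_Request reqs[WINDOW];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    char *buf = malloc(MSG_SIZE);

    double start = MPI_Wtime();
    for (int iter = 0; iter < ITERS; ++iter) {
        if (0 == rank) {
            /* sender: post a window of nonblocking sends, then complete them all at once */
            for (int i = 0; i < WINDOW; ++i) {
                MPI_Isend(buf, MSG_SIZE, MPI_CHAR, 1, 0, MPI_COMM_WORLD, &reqs[i]);
            }
            MPI_Waitall(WINDOW, reqs, MPI_STATUSES_IGNORE);
            /* wait for the receiver's acknowledgement before starting the next window */
            MPI_Recv(NULL, 0, MPI_CHAR, 1, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        } else if (1 == rank) {
            /* receiver: post matching receives (reusing one buffer, as bandwidth tests typically do) */
            for (int i = 0; i < WINDOW; ++i) {
                MPI_Irecv(buf, MSG_SIZE, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &reqs[i]);
            }
            MPI_Waitall(WINDOW, reqs, MPI_STATUSES_IGNORE);
            MPI_Send(NULL, 0, MPI_CHAR, 0, 1, MPI_COMM_WORLD);
        }
    }
    double elapsed = MPI_Wtime() - start;

    if (0 == rank) {
        double mbytes = (double)MSG_SIZE * WINDOW * ITERS / 1.0e6;
        printf("%d bytes: %.2f MB/s\n", MSG_SIZE, mbytes / elapsed);
    }

    free(buf);
    MPI_Finalize();
    return 0;
}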