Hey Sasha,

The patch series seems to perform well.  As a reminder, mvapich 0.9.9 does
not know how to deal with multiple lids; openMPI 1.2.6 does.  The "preserve
lids" condition is labeled "PL".

With LMC > 0, the preserve-lids patch seems to do its job under mvapich
0.9.9, and it at least maintains (and sometimes increases) the performance
of openmpi 1.2.6 with LMC > 0.  Numbers are attached.

I'd give my thumbs up to the patch.  Let's commit it.

Al

On Tue, 2008-06-10 at 05:59 +0300, Sasha Khapyorsky wrote:
> Basically this addresses the problem described by Al Chu in:
> 
> http://lists.openfabrics.org/pipermail/general/2008-April/049132.html
> 
> When base lid paths become completely disbalanced on a fabrics with
> lmc > 0.
> 
> One feedback was from Yiftah Shahar:
> 
> "I think that our requirements should be that even when you are working
> with LMC>0 then the base LID routing should not be affected.
> One way to achieve this goal is to first run the base-LID routing (so
> all base LID improvement will be also in LMC>0) and then start with the
> other LIDs as round-robbing starting from the base-lid-port + 1
> according current routing algorithm rules (keeping min-hop, up/down...)."
> 
> We had some discussion with Al and Yiftah about this and considered that
> in addition to "pure" base lid paths preservation (which is good thing by
> itself) proposed method solves original lid disbalancing problem as well.
> 
> This patch is implementation of the idea above.
> 
> Signed-off-by: Sasha Khapyorsky <[EMAIL PROTECTED]>
> ---
>  opensm/include/opensm/osm_switch.h |    4 +
>  opensm/opensm/osm_dump.c           |    3 +-
>  opensm/opensm/osm_switch.c         |    8 ++-
>  opensm/opensm/osm_ucast_mgr.c      |  163 
> ++++++++++++++++--------------------
>  4 files changed, 86 insertions(+), 92 deletions(-)
> 
> diff --git a/opensm/include/opensm/osm_switch.h 
> b/opensm/include/opensm/osm_switch.h
> index 0e9c5fa..c1521a6 100644
> --- a/opensm/include/opensm/osm_switch.h
> +++ b/opensm/include/opensm/osm_switch.h
> @@ -981,6 +981,7 @@ uint8_t
>  osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
>                         IN osm_port_t * p_port,
>                         IN const uint16_t lid_ho,
> +                       IN unsigned start_from,
>                         IN const boolean_t ignore_existing,
>                         IN const boolean_t dor);
>  /*
> @@ -995,6 +996,9 @@ osm_switch_recommend_path(IN const osm_switch_t * const 
> p_sw,
>  *    lid_ho
>  *            [in] LID value (host order) for which to get a path advisory.
>  *
> +*    start_from
> +*            [in] Port number from where to start balance counting.
> +*
>  *    ignore_existing
>  *            [in] Set to cause the switch to choose the optimal route
>  *            regardless of existing paths.
> diff --git a/opensm/opensm/osm_dump.c b/opensm/opensm/osm_dump.c
> index b96984b..60c6d25 100644
> --- a/opensm/opensm/osm_dump.c
> +++ b/opensm/opensm/osm_dump.c
> @@ -218,7 +218,8 @@ static void dump_ucast_routes(cl_map_item_t *p_map_item, 
> FILE *file, void *cxt)
>               else {
>                       /* No LMC Optimization */
>                       best_port = osm_switch_recommend_path(p_sw, p_port,
> -                                                           lid_ho, TRUE, 
> dor);
> +                                                           lid_ho, 1, TRUE,
> +                                                           dor);
>                       fprintf(file, "No %u hop path possible via port %u!",
>                               best_hops, best_port);
>               }
> diff --git a/opensm/opensm/osm_switch.c b/opensm/opensm/osm_switch.c
> index 58936e3..a9d13c8 100644
> --- a/opensm/opensm/osm_switch.c
> +++ b/opensm/opensm/osm_switch.c
> @@ -274,6 +274,7 @@ uint8_t
>  osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
>                         IN osm_port_t * p_port,
>                         IN const uint16_t lid_ho,
> +                       IN unsigned start_from,
>                         IN const boolean_t ignore_existing,
>                         IN const boolean_t dor)
>  {
> @@ -294,6 +295,7 @@ osm_switch_recommend_path(IN const osm_switch_t * const 
> p_sw,
>       uint8_t port_num;
>       uint8_t num_ports;
>       uint32_t least_paths = 0xFFFFFFFF;
> +     unsigned i;
>       /*
>          The follwing will track the least paths if the
>          route should go through a new system/node
> @@ -397,8 +399,10 @@ osm_switch_recommend_path(IN const osm_switch_t * const 
> p_sw,
>        */
>  
>       /* port number starts with one and num_ports is 1 + num phys ports */
> -     for (port_num = 1; port_num < num_ports; port_num++) {
> -             if (osm_switch_get_hop_count(p_sw, base_lid, port_num) !=
> +     for (i = start_from; i < start_from + num_ports; i++) {
> +             port_num = i%num_ports;
> +             if (!port_num ||
> +                 osm_switch_get_hop_count(p_sw, base_lid, port_num) !=
>                   least_hops)
>                       continue;
>  
> diff --git a/opensm/opensm/osm_ucast_mgr.c b/opensm/opensm/osm_ucast_mgr.c
> index c073037..2aae6d5 100644
> --- a/opensm/opensm/osm_ucast_mgr.c
> +++ b/opensm/opensm/osm_ucast_mgr.c
> @@ -208,7 +208,8 @@ find_and_add_remote_sys(osm_switch_t *sw, uint8_t port,
>  static void
>  __osm_ucast_mgr_process_port(IN osm_ucast_mgr_t * const p_mgr,
>                            IN osm_switch_t * const p_sw,
> -                          IN osm_port_t * const p_port)
> +                          IN osm_port_t * const p_port,
> +                          IN unsigned lid_offset)
>  {
>       uint16_t min_lid_ho;
>       uint16_t max_lid_ho;
> @@ -217,19 +218,14 @@ __osm_ucast_mgr_process_port(IN osm_ucast_mgr_t * const 
> p_mgr,
>       boolean_t is_ignored_by_port_prof;
>       ib_net64_t node_guid;
>       struct osm_routing_engine *p_routing_eng;
> -     /*
> -        The following are temporary structures that will aid
> -        in providing better routing in LMC > 0 situations
> -      */
> -     uint16_t lids_per_port = 1 << p_mgr->p_subn->opt.lmc;
> -     struct osm_remote_node *p_remote_guid_used = NULL;
> +     unsigned start_from = 1;
>  
>       OSM_LOG_ENTER(p_mgr->p_log);
>  
>       osm_port_get_lid_range_ho(p_port, &min_lid_ho, &max_lid_ho);
>  
> -     /* If the lids are zero - then there was some problem with the 
> initialization.
> -        Don't handle this port. */
> +     /* If the lids are zero - then there was some problem with
> +      * the initialization. Don't handle this port. */
>       if (min_lid_ho == 0 || max_lid_ho == 0) {
>               OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A04: "
>                       "Port 0x%" PRIx64 " has LID 0. An initialization "
> @@ -238,16 +234,22 @@ __osm_ucast_mgr_process_port(IN osm_ucast_mgr_t * const 
> p_mgr,
>               goto Exit;
>       }
>  
> -     if (osm_log_is_active(p_mgr->p_log, OSM_LOG_DEBUG)) {
> +     lid_ho = min_lid_ho + lid_offset;
> +
> +     if (lid_ho > max_lid_ho)
> +             goto Exit;
> +
> +     if (lid_offset)
> +             /* ignore potential overflow - it is handled in osm_switch.c */
> +             start_from = osm_switch_get_port_by_lid(p_sw, lid_ho - 1) + 1;
> +
> +     if (osm_log_is_active(p_mgr->p_log, OSM_LOG_DEBUG))
>               OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
> -                     "Processing port 0x%" PRIx64 ", LIDs [0x%X,0x%X]\n",
> -                     cl_ntoh64(osm_port_get_guid(p_port)),
> +                     "Processing port 0x%" PRIx64 ", LID %u [0x%X,0x%X]\n",
> +                     cl_ntoh64(osm_port_get_guid(p_port)), lid_ho,
>                       min_lid_ho, max_lid_ho);
> -     }
>  
> -     /*
> -        TO DO - This should be runtime error, not a CL_ASSERT()
> -      */
> +     /* TODO - This should be runtime error, not a CL_ASSERT() */
>       CL_ASSERT(max_lid_ho < osm_switch_get_fwd_tbl_size(p_sw));
>  
>       node_guid = osm_node_get_node_guid(p_sw->p_node);
> @@ -260,80 +262,62 @@ __osm_ucast_mgr_process_port(IN osm_ucast_mgr_t * const 
> p_mgr,
>          how best to distribute the LID range across the ports
>          that can reach those LIDs.
>        */
> -     for (lid_ho = min_lid_ho; lid_ho <= max_lid_ho; lid_ho++) {
> -             /* Use the enhanced algorithm only for LMC > 0 */
> -             if (lids_per_port > 1) {
> -                     port = osm_switch_recommend_path(p_sw, p_port, lid_ho,
> -                                                      p_mgr->p_subn->
> -                                                      ignore_existing_lfts,
> -                                                      p_mgr->is_dor);
> -                     if (port > 0 && port != OSM_NO_PATH && p_port->priv)
> -                             p_remote_guid_used =
> -                                 find_and_add_remote_sys(p_sw, port,
> -                                                         p_port->priv);
> +     port = osm_switch_recommend_path(p_sw, p_port, lid_ho, start_from,
> +                                      p_mgr->p_subn->ignore_existing_lfts,
> +                                      p_mgr->is_dor);
> +
> +     if (port == OSM_NO_PATH) {
> +             /* do not try to overwrite the ppro of non existing port ... */
> +             is_ignored_by_port_prof = TRUE;
> +
> +             /* Up/Down routing can cause unreachable routes between some
> +                switches so we do not report that as an error in that case */
> +             if (!p_routing_eng->build_lid_matrices) {
> +                     OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A08: "
> +                             "No path to get to LID 0x%X from switch 0x%"
> +                             PRIx64 "\n", lid_ho, cl_ntoh64(node_guid));
> +                     /* trigger a new sweep - try again ... */
> +                     p_mgr->p_subn->subnet_initialization_error = TRUE;
>               } else
> -                     port = osm_switch_recommend_path(p_sw, p_port, lid_ho,
> -                                                      p_mgr->p_subn->
> -                                                      ignore_existing_lfts,
> -                                                      p_mgr->is_dor);
> +                     OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
> +                             "No path to get to LID 0x%X from switch 0x%"
> +                             PRIx64 "\n", lid_ho, cl_ntoh64(node_guid));
> +     } else {
> +             OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
> +                     "Routing LID 0x%X to port 0x%X"
> +                     " for switch 0x%" PRIx64 "\n",
> +                     lid_ho, port, cl_ntoh64(node_guid));
>  
>               /*
> -                There might be no path to the target
> +                we would like to optionally ignore this port in equalization
> +                as in the case of the Mellanox Anafa Internal PCI TCA port
>                */
> -             if (port == OSM_NO_PATH) {
> -                     /* do not try to overwrite the ppro of non existing 
> port ... */
> -                     is_ignored_by_port_prof = TRUE;
> -
> -                     /* Up/Down routing can cause unreachable routes between 
> some
> -                        switches so we do not report that as an error in 
> that case */
> -                     if (!p_routing_eng->build_lid_matrices) {
> -                             OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A08: 
> "
> -                                     "No path to get to LID 0x%X from switch 
> 0x%"
> -                                     PRIx64 "\n", lid_ho,
> -                                     cl_ntoh64(node_guid));
> -                             /* trigger a new sweep - try again ... */
> -                             p_mgr->p_subn->subnet_initialization_error =
> -                                 TRUE;
> -                     } else
> -                             OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
> -                                     "No path to get to LID 0x%X from switch 
> 0x%"
> -                                     PRIx64 "\n", lid_ho,
> -                                     cl_ntoh64(node_guid));
> -             } else {
> -                     OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
> -                             "Routing LID 0x%X to port 0x%X"
> -                             " for switch 0x%" PRIx64 "\n",
> -                             lid_ho, port, cl_ntoh64(node_guid));
> -
> -                     /*
> -                        we would like to optionally ignore this port in 
> equalization
> -                        as in the case of the Mellanox Anafa Internal PCI 
> TCA port
> -                      */
> -                     is_ignored_by_port_prof =
> -                         osm_port_prof_is_ignored_port(p_mgr->p_subn,
> -                                                       node_guid, port);
> -
> -                     /*
> -                        We also would ignore this route if the target lid is 
> of a switch
> -                        and the port_profile_switch_node is not TRUE
> -                      */
> -                     if (!p_mgr->p_subn->opt.port_profile_switch_nodes) {
> -                             is_ignored_by_port_prof |=
> -                                 (osm_node_get_type(p_port->p_node) ==
> -                                  IB_NODE_TYPE_SWITCH);
> -                     }
> -             }
> +             is_ignored_by_port_prof =
> +                 osm_port_prof_is_ignored_port(p_mgr->p_subn,
> +                                               node_guid, port);
>  
>               /*
> -                We have selected the port for this LID.
> -                Write it to the forwarding tables.
> +                We also would ignore this route if the target lid is of
> +                a switch and the port_profile_switch_node is not TRUE
>                */
> -             p_mgr->lft_buf[lid_ho] = port;
> -             if (!is_ignored_by_port_prof) {
> -                     osm_switch_count_path(p_sw, port);
> -                     if (p_remote_guid_used)
> -                             p_remote_guid_used->forwarded_to++;
> -             }
> +             if (!p_mgr->p_subn->opt.port_profile_switch_nodes)
> +                     is_ignored_by_port_prof |=
> +                         (osm_node_get_type(p_port->p_node) ==
> +                          IB_NODE_TYPE_SWITCH);
> +     }
> +
> +     /*
> +        We have selected the port for this LID.
> +        Write it to the forwarding tables.
> +      */
> +     p_mgr->lft_buf[lid_ho] = port;
> +     if (!is_ignored_by_port_prof) {
> +             struct osm_remote_node *rem_node_used;
> +             osm_switch_count_path(p_sw, port);
> +             if (port > 0 && p_port->priv &&
> +                 (rem_node_used = find_and_add_remote_sys(p_sw, port,
> +                                                          p_port->priv)))
> +                     rem_node_used->forwarded_to++;
>       }
>  
>  Exit:
> @@ -512,6 +496,7 @@ __osm_ucast_mgr_process_tbl(IN cl_map_item_t * const 
> p_map_item,
>       osm_node_t *p_node;
>       osm_port_t *p_port;
>       const cl_qmap_t *p_port_tbl;
> +     unsigned i, lids_per_port;
>  
>       OSM_LOG_ENTER(p_mgr->p_log);
>  
> @@ -538,12 +523,12 @@ __osm_ucast_mgr_process_tbl(IN cl_map_item_t * const 
> p_map_item,
>          Iterate through every port setting LID routes for each
>          port based on base LID and LMC value.
>        */
> -
> -     for (p_port = (osm_port_t *) cl_qmap_head(p_port_tbl);
> -          p_port != (osm_port_t *) cl_qmap_end(p_port_tbl);
> -          p_port = (osm_port_t *) cl_qmap_next(&p_port->map_item)) {
> -             __osm_ucast_mgr_process_port(p_mgr, p_sw, p_port);
> -     }
> +     lids_per_port = 1 << p_mgr->p_subn->opt.lmc;
> +     for (i = 0; i < lids_per_port; i++)
> +             for (p_port = (osm_port_t *) cl_qmap_head(p_port_tbl);
> +                  p_port != (osm_port_t *) cl_qmap_end(p_port_tbl);
> +                  p_port = (osm_port_t *) cl_qmap_next(&p_port->map_item))
> +                     __osm_ucast_mgr_process_port(p_mgr, p_sw, p_port, i);
>  
>       osm_ucast_mgr_set_fwd_table(p_mgr, p_sw);
>  
-- 
Albert Chu
[EMAIL PROTECTED]
925-422-5311
Computer Scientist
High Performance Systems Division
Lawrence Livermore National Laboratory

Attachment: mpi_preserve_base_lids.xls
Description: MS-Excel spreadsheet

_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to