Currently, MADs are pipelined to a single switch at a time, which effectively serializes these requests due to processing at the SMA. This patch pipelines (stripes) them across the switches before proceeding with successive blocks. As a result of this striping, multiple switches can process the set and respond concurrently, which improves the subnet initialization time.
All unicast routing protocols are updated for this. A similar subsequent change will do this for MFTs. Yevgeny Kliteynik <[email protected]> wrote: The test used a small cluster of 17 IS4 switches and 11 HCAs; to artificially increase the cluster size, an LMC of 7 was used, including EnhancedSwitchPort 0 LMC. With the new code, LFT configuration is more than twice as fast as with the old code :) Current ucast manager ran on average for ~250 msec, with the new code - 110-120 msec. Routing calculation phase of the ucast manager took ~1200 usec, the rest was sending the blocks and waiting for no more pending transactions. Here are some detailed results of different executions (the number on the left is timer value in usec): Current ucast manager (w/o the optimization): 000000 [LFT]: osm_ucast_mgr_process() - START 001131 [LFT]: ucast_mgr_process_tbl() - START 032251 [LFT]: ucast_mgr_process_tbl() - END 032263 [LFT]: osm_ucast_mgr_process() - END 253416 [LFT]: Done wait_for_pending_transactions() New algorithm: 001417 [LFT]: osm_ucast_mgr_process() - START 002690 [LFT]: ucast_mgr_process_tbl() - START 032946 [LFT]: ucast_mgr_process_tbl() - END 032948 [LFT]: osm_ucast_pipeline_tbl() - START 033846 [LFT]: osm_ucast_pipeline_tbl() - END 033858 [LFT]: osm_ucast_mgr_process() - END 108203 [LFT]: Done wait_for_pending_transactions() With IS3 based Qlogic switches, which do not handle DR packet forwarding in HW, with a fabric of ~1100 HCAs, ~280 switches: Current OSM configures LFTs in ~2 seconds. New algorithm does the same job in 1.4-1.6 seconds (30%-20% speed up). 
Signed-off-by: Hal Rosenstock <[email protected]> --- Changes since v2: Eliminated max_smps_per_node Moved LFTs pushing up to ucast_mgr_route level from the individual routing engines Changes since v1: Added Yevgeny's performance data No change to actual patch diff --git a/opensm/include/opensm/osm_ucast_mgr.h b/opensm/include/opensm/osm_ucast_mgr.h index a040476..4ef045c 100644 --- a/opensm/include/opensm/osm_ucast_mgr.h +++ b/opensm/include/opensm/osm_ucast_mgr.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved. - * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved. * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two @@ -242,16 +242,12 @@ osm_ucast_mgr_init(IN osm_ucast_mgr_t * const p_mgr, IN struct osm_sm * sm); * * SYNOPSIS */ -int osm_ucast_mgr_set_fwd_table(IN osm_ucast_mgr_t * const p_mgr, - IN osm_switch_t * const p_sw); +void osm_ucast_mgr_set_fwd_table(IN osm_ucast_mgr_t * const p_mgr); /* * PARAMETERS * p_mgr * [in] Pointer to an osm_ucast_mgr_t object. * -* p_mgr -* [in] Pointer to an osm_switch_t object. -* * SEE ALSO * Unicast Manager *********/ diff --git a/opensm/opensm/osm_ucast_cache.c b/opensm/opensm/osm_ucast_cache.c index 216b496..30a3c1d 100644 --- a/opensm/opensm/osm_ucast_cache.c +++ b/opensm/opensm/osm_ucast_cache.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 2008,2009 Mellanox Technologies LTD. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -1085,9 +1085,10 @@ int osm_ucast_cache_process(osm_ucast_mgr_t * p_mgr) memset(p_sw->lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1); } - osm_ucast_mgr_set_fwd_table(p_mgr, p_sw); } + osm_ucast_mgr_set_fwd_table(p_mgr); + return 0; } diff --git a/opensm/opensm/osm_ucast_file.c b/opensm/opensm/osm_ucast_file.c index 2505c46..5b73ca5 100644 --- a/opensm/opensm/osm_ucast_file.c +++ b/opensm/opensm/osm_ucast_file.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2006,2007 Voltaire, Inc. All rights reserved. - * Copyright (c) 2008 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 2008,2009 Mellanox Technologies LTD. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -167,9 +167,6 @@ static int do_ucast_file_load(void *context) "skipping parsing. Using default " "routing algorithm\n"); } else if (!strncmp(p, "Unicast lids", 12)) { - if (p_sw) - osm_ucast_mgr_set_fwd_table(&p_osm->sm. - ucast_mgr, p_sw); q = strstr(p, " guid 0x"); if (!q) { OSM_LOG(&p_osm->log, OSM_LOG_ERROR, @@ -220,7 +217,7 @@ static int do_ucast_file_load(void *context) return -1; } p = q; - /* additionally try to exract guid */ + /* additionally try to extract guid */ q = strstr(p, " portguid 0x"); if (!q) { OSM_LOG(&p_osm->log, OSM_LOG_VERBOSE, @@ -246,9 +243,6 @@ static int do_ucast_file_load(void *context) } } - if (p_sw) - osm_ucast_mgr_set_fwd_table(&p_osm->sm.ucast_mgr, p_sw); - fclose(file); return 0; } diff --git a/opensm/opensm/osm_ucast_ftree.c b/opensm/opensm/osm_ucast_ftree.c index bde6dbd..6ec6bc7 100644 --- a/opensm/opensm/osm_ucast_ftree.c +++ b/opensm/opensm/osm_ucast_ftree.c @@ -2,7 +2,7 @@ * Copyright (c) 2009 Simula Research Laboratory. All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved. 
- * Copyright (c) 2002-2007 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved. * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two @@ -1905,8 +1905,6 @@ static void set_sw_fwd_table(IN cl_map_item_t * const p_map_item, ftree_fabric_t *p_ftree = (ftree_fabric_t *) context; p_sw->p_osm_sw->max_lid_ho = p_ftree->lft_max_lid; - osm_ucast_mgr_set_fwd_table(&p_ftree->p_osm->sm.ucast_mgr, - p_sw->p_osm_sw); } /*************************************************** diff --git a/opensm/opensm/osm_ucast_lash.c b/opensm/opensm/osm_ucast_lash.c index b3107f0..0a567b3 100644 --- a/opensm/opensm/osm_ucast_lash.c +++ b/opensm/opensm/osm_ucast_lash.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved. - * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved. * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. * Copyright (c) 2007 Simula Research Laboratory. All rights reserved. * Copyright (c) 2007 Silicon Graphics Inc. All rights reserved. @@ -990,7 +990,6 @@ static void populate_fwd_tbls(lash_t * p_lash) { osm_log_t *p_log = &p_lash->p_osm->log; osm_subn_t *p_subn = &p_lash->p_osm->subn; - osm_opensm_t *p_osm = p_lash->p_osm; osm_switch_t *p_sw, *p_next_sw, *p_dst_sw; osm_port_t *port; uint16_t max_lid_ho, lid; @@ -1054,7 +1053,6 @@ static void populate_fwd_tbls(lash_t * p_lash) physical_egress_port); } } /* for */ - osm_ucast_mgr_set_fwd_table(&p_osm->sm.ucast_mgr, p_sw); } OSM_LOG_EXIT(p_log); } diff --git a/opensm/opensm/osm_ucast_mgr.c b/opensm/opensm/osm_ucast_mgr.c index 78a7031..e28752a 100644 --- a/opensm/opensm/osm_ucast_mgr.c +++ b/opensm/opensm/osm_ucast_mgr.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved. 
- * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved. * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two @@ -315,16 +315,13 @@ Exit: /********************************************************************** **********************************************************************/ -int osm_ucast_mgr_set_fwd_table(IN osm_ucast_mgr_t * p_mgr, - IN osm_switch_t * p_sw) +static int set_fwd_tbl_top(IN osm_ucast_mgr_t * p_mgr, IN osm_switch_t * p_sw) { osm_node_t *p_node; osm_dr_path_t *p_path; osm_madw_context_t context; ib_api_status_t status; ib_switch_info_t si; - uint16_t block_id_ho = 0; - uint8_t block[IB_SMP_DATA_SIZE]; boolean_t set_swinfo_require = FALSE; uint16_t lin_top; uint8_t life_state; @@ -382,48 +379,6 @@ int osm_ucast_mgr_set_fwd_table(IN osm_ucast_mgr_t * p_mgr, ib_get_err_str(status)); } - /* - Send linear forwarding table blocks to the switch - as long as the switch indicates it has blocks needing - configuration. 
- */ - - context.lft_context.node_guid = osm_node_get_node_guid(p_node); - context.lft_context.set_method = TRUE; - - if (!p_sw->new_lft) { - /* any routing should provide the new_lft */ - CL_ASSERT(p_mgr->p_subn->opt.use_ucast_cache && - p_mgr->cache_valid && !p_sw->need_update); - goto Exit; - } - - for (block_id_ho = 0; - osm_switch_get_lft_block(p_sw, block_id_ho, block); - block_id_ho++) { - if (!p_sw->need_update && !p_mgr->p_subn->need_update && - !memcmp(block, - p_sw->new_lft + block_id_ho * IB_SMP_DATA_SIZE, - IB_SMP_DATA_SIZE)) - continue; - - OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, - "Writing FT block %u\n", block_id_ho); - - status = osm_req_set(p_mgr->sm, p_path, - p_sw->new_lft + - block_id_ho * IB_SMP_DATA_SIZE, - sizeof(block), IB_MAD_ATTR_LIN_FWD_TBL, - cl_hton32(block_id_ho), CL_DISP_MSGID_NONE, - &context); - - if (status != IB_SUCCESS) - OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A05: " - "Sending linear fwd. tbl. block failed (%s)\n", - ib_get_err_str(status)); - } - -Exit: OSM_LOG_EXIT(p_mgr->p_log); return 0; } @@ -508,7 +463,7 @@ static void ucast_mgr_process_tbl(IN cl_map_item_t * p_map_item, } } - osm_ucast_mgr_set_fwd_table(p_mgr, p_sw); + set_fwd_tbl_top(p_mgr, p_sw); if (p_mgr->p_subn->opt.lmc) free_ports_priv(p_mgr); @@ -516,6 +471,101 @@ static void ucast_mgr_process_tbl(IN cl_map_item_t * p_map_item, OSM_LOG_EXIT(p_mgr->p_log); } +static void ucast_mgr_process_top(IN cl_map_item_t * p_map_item, + IN void *context) +{ + osm_ucast_mgr_t *p_mgr = context; + osm_switch_t *const p_sw = (osm_switch_t *) p_map_item; + + set_fwd_tbl_top(p_mgr, p_sw); +} + +static boolean_t set_next_lft_block(IN osm_switch_t * p_sw, IN osm_sm_t * p_sm, + IN uint8_t * p_block, + IN osm_dr_path_t * p_path, + IN uint16_t block_id_ho, + IN osm_madw_context_t * p_context) +{ + ib_api_status_t status; + boolean_t sts; + + OSM_LOG_ENTER(p_sm->p_log); + + for (; + (sts = osm_switch_get_lft_block(p_sw, block_id_ho, p_block)); + block_id_ho++) { + if (!p_sw->need_update 
&& !p_sm->p_subn->need_update && + !memcmp(p_block, + p_sw->new_lft + block_id_ho * IB_SMP_DATA_SIZE, + IB_SMP_DATA_SIZE)) + continue; + + OSM_LOG(p_sm->p_log, OSM_LOG_DEBUG, + "Writing FT block %u to switch 0x%" PRIx64 "\n", + block_id_ho, + cl_ntoh64(p_context->lft_context.node_guid)); + + status = osm_req_set(p_sm, p_path, + p_sw->new_lft + + block_id_ho * IB_SMP_DATA_SIZE, + IB_SMP_DATA_SIZE, IB_MAD_ATTR_LIN_FWD_TBL, + cl_hton32(block_id_ho), + CL_DISP_MSGID_NONE, p_context); + + if (status != IB_SUCCESS) + OSM_LOG(p_sm->p_log, OSM_LOG_ERROR, "ERR 3A05: " + "Sending linear fwd. tbl. block failed (%s)\n", + ib_get_err_str(status)); + break; + } + + OSM_LOG_EXIT(p_sm->p_log); + return sts; +} + +static boolean_t pipeline_next_lft_block(IN osm_switch_t *p_sw, + IN osm_ucast_mgr_t *p_mgr, + IN uint16_t block_id_ho) +{ + osm_dr_path_t *p_path; + osm_madw_context_t context; + uint8_t block[IB_SMP_DATA_SIZE]; + boolean_t status; + + OSM_LOG_ENTER(p_mgr->p_log); + + CL_ASSERT(p_sw && p_sw->p_node); + + OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG, + "Processing switch 0x%" PRIx64 "\n", + cl_ntoh64(osm_node_get_node_guid(p_sw->p_node))); + + /* + Send linear forwarding table blocks to the switch + as long as the switch indicates it has blocks needing + configuration. 
+ */ + if (!p_sw->new_lft) { + /* any routing should provide the new_lft */ + CL_ASSERT(p_mgr->p_subn->opt.use_ucast_cache && + p_mgr->cache_valid && !p_sw->need_update); + status = FALSE; + goto Exit; + } + + p_path = osm_physp_get_dr_path_ptr(osm_node_get_physp_ptr(p_sw->p_node, 0)); + + context.lft_context.node_guid = osm_node_get_node_guid(p_sw->p_node); + context.lft_context.set_method = TRUE; + + status = set_next_lft_block(p_sw, p_mgr->sm, &block[0], p_path, + block_id_ho, &context); + +Exit: + OSM_LOG_EXIT(p_mgr->p_log); + return status; +} + /********************************************************************** **********************************************************************/ static void ucast_mgr_process_neighbors(IN cl_map_item_t * p_map_item, @@ -731,7 +781,6 @@ static int ucast_mgr_setup_all_switches(osm_subn_t * p_subn) /********************************************************************** **********************************************************************/ - static int add_guid_to_order_list(void *ctx, uint64_t guid, char *p) { osm_ucast_mgr_t *m = ctx; @@ -870,6 +919,30 @@ static void sort_ports_by_switch_load(osm_ucast_mgr_t * m) add_sw_endports_to_order_list(s[i], m); } +static void ucast_mgr_pipeline_fwd_tbl(osm_ucast_mgr_t * p_mgr) +{ + cl_qmap_t *p_sw_tbl; + osm_switch_t *p_sw; + uint16_t block_id_ho = 0; + int sws_notdone; + boolean_t sts; + + p_sw_tbl = &p_mgr->p_subn->sw_guid_tbl; + while (1) { + p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl); + sws_notdone = 0; + while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) { + sts = pipeline_next_lft_block(p_sw, p_mgr, block_id_ho); + if (sts) + sws_notdone++; + p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item); + } + if (!sws_notdone) + break; + block_id_ho++; + } +} + static int ucast_mgr_build_lfts(osm_ucast_mgr_t * p_mgr) { cl_qlist_init(&p_mgr->port_order_list); @@ -904,6 +977,8 @@ static int ucast_mgr_build_lfts(osm_ucast_mgr_t * p_mgr) 
cl_qmap_apply_func(&p_mgr->p_subn->sw_guid_tbl, ucast_mgr_process_tbl, p_mgr); + ucast_mgr_pipeline_fwd_tbl(p_mgr); + cl_qlist_remove_all(&p_mgr->port_order_list); return 0; @@ -911,6 +986,16 @@ static int ucast_mgr_build_lfts(osm_ucast_mgr_t * p_mgr) /********************************************************************** **********************************************************************/ +void osm_ucast_mgr_set_fwd_table(osm_ucast_mgr_t * p_mgr) +{ + cl_qmap_apply_func(&p_mgr->p_subn->sw_guid_tbl, + ucast_mgr_process_top, p_mgr); + + ucast_mgr_pipeline_fwd_tbl(p_mgr); +} + +/********************************************************************** + **********************************************************************/ static int ucast_mgr_route(struct osm_routing_engine *r, osm_opensm_t * osm) { int ret; @@ -940,6 +1025,9 @@ static int ucast_mgr_route(struct osm_routing_engine *r, osm_opensm_t * osm) osm->routing_engine_used = osm_routing_engine_type(r->name); + if (r->ucast_build_fwd_tables) + osm_ucast_mgr_set_fwd_table(&osm->sm.ucast_mgr); + return 0; } _______________________________________________ general mailing list [email protected] http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
