On Wed, 2010-03-10 at 11:06 -0700, Jim Schutt wrote:
> If unicast routing fails, there is no point in continuing with fabric
> bring-up.  Just start a new heavy sweep instead.
>
> Signed-off-by: Jim Schutt <[email protected]>
> ---
> opensm/opensm/osm_state_mgr.c | 12 +++++++++---
> opensm/opensm/osm_ucast_mgr.c | 14 +++++++++-----
> 2 files changed, 18 insertions(+), 8 deletions(-)
>
> diff --git a/opensm/opensm/osm_state_mgr.c b/opensm/opensm/osm_state_mgr.c
> index 96ad348..e666034 100644
> --- a/opensm/opensm/osm_state_mgr.c
> +++ b/opensm/opensm/osm_state_mgr.c
> @@ -1140,7 +1140,11 @@ static void do_sweep(osm_sm_t * sm)
> /* Re-program the switches fully */
> sm->p_subn->ignore_existing_lfts = TRUE;
>
> - osm_ucast_mgr_process(&sm->ucast_mgr);
> + if (osm_ucast_mgr_process(&sm->ucast_mgr)) {
> + OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE,
> + "REROUTE FAILED");
> + return;
> + }
> osm_qos_setup(sm->p_subn->p_osm);
>
> /* Reset flag */
> @@ -1299,12 +1303,14 @@ repeat_discovery:
> "LID ASSIGNMENT COMPLETE - STARTING SWITCH TABLE CONFIG");
>
> /*
> - * Proceed with unicast forwarding table configuration.
> + * Proceed with unicast forwarding table configuration; repeat
> + * if unicast routing fails.
> */
>
> if (!sm->ucast_mgr.cache_valid ||
> osm_ucast_cache_process(&sm->ucast_mgr))
> - osm_ucast_mgr_process(&sm->ucast_mgr);
> + if (osm_ucast_mgr_process(&sm->ucast_mgr))
> + goto repeat_discovery;
>
> osm_qos_setup(sm->p_subn->p_osm);
>
Sorry, I missed this: on unicast routing failure, do_sweep() should just
return early here rather than jump back to repeat_discovery.
If osm_ucast_mgr_process() fails, none of the configured routing engines
was able to route the fabric. In that case do_sweep() should just return;
a new sweep will then be triggered either by a trap caused by a fabric
change, or by the configured sweep_interval.
I think this hunk should instead be:
@@ -1299,12 +1303,14 @@ repeat_discovery:
"LID ASSIGNMENT COMPLETE - STARTING SWITCH TABLE CONFIG");
/*
- * Proceed with unicast forwarding table configuration.
+ * Proceed with unicast forwarding table configuration; if it fails
+ * return early to wait for a trap or the next sweep interval.
*/
if (!sm->ucast_mgr.cache_valid ||
osm_ucast_cache_process(&sm->ucast_mgr))
- osm_ucast_mgr_process(&sm->ucast_mgr);
+ if (osm_ucast_mgr_process(&sm->ucast_mgr))
+ return;
osm_qos_setup(sm->p_subn->p_osm);
> diff --git a/opensm/opensm/osm_ucast_mgr.c b/opensm/opensm/osm_ucast_mgr.c
> index fbc9244..8ea2e52 100644
> --- a/opensm/opensm/osm_ucast_mgr.c
> +++ b/opensm/opensm/osm_ucast_mgr.c
> @@ -955,6 +955,7 @@ int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr)
> osm_opensm_t *p_osm;
> struct osm_routing_engine *p_routing_eng;
> cl_qmap_t *p_sw_guid_tbl;
> + int failed = 0;
>
> OSM_LOG_ENTER(p_mgr->p_log);
>
> @@ -973,7 +974,8 @@ int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr)
>
> p_osm->routing_engine_used = NULL;
> while (p_routing_eng) {
> - if (!ucast_mgr_route(p_routing_eng, p_osm))
> + failed = ucast_mgr_route(p_routing_eng, p_osm);
> + if (!failed)
> break;
> p_routing_eng = p_routing_eng->next;
> }
> @@ -984,9 +986,11 @@ int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr)
> struct osm_routing_engine *r = p_osm->default_routing_engine;
>
> r->build_lid_matrices(r->context);
> - r->ucast_build_fwd_tables(r->context);
> - p_osm->routing_engine_used = r;
> - osm_ucast_mgr_set_fwd_tables(p_mgr);
> + failed = r->ucast_build_fwd_tables(r->context);
> + if (!failed) {
> + p_osm->routing_engine_used = r;
> + osm_ucast_mgr_set_fwd_tables(p_mgr);
> + }
> }
>
> if (p_osm->routing_engine_used) {
> @@ -1006,7 +1010,7 @@ int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr)
> Exit:
> CL_PLOCK_RELEASE(p_mgr->p_lock);
> OSM_LOG_EXIT(p_mgr->p_log);
> - return 0;
> + return failed;
> }
>
> static int ucast_build_lid_matrices(void *context)
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html