Hi Hal, We've noticed that currently, if we have 2 HCAs with duplicated GUIDs connected back-to-back, opensm gets stuck. The reason is that in the osm_vendor_set_sm() function, the second call that tries to open the /dev/infiniband/issm%d file gets stuck, since this file is already open. The following patch fixes 2 things: 1. In osm_node_info_rcv.c - we had previously added handling that exits when duplicated GUIDs are found (unless a flag is set otherwise). This patch adds the same exiting code to the case where the nodes are connected back-to-back. 2. In osm_vendor_ibumad.c - add a static variable to avoid trying to open the /dev/infiniband/issm%d file twice during the run of opensm.
Thanks, Yael Signed-off-by: Yael Kalka <[EMAIL PROTECTED]> Index: libvendor/osm_vendor_ibumad.c =================================================================== --- libvendor/osm_vendor_ibumad.c (revision 4951) +++ libvendor/osm_vendor_ibumad.c (working copy) @@ -1142,8 +1142,11 @@ osm_vendor_set_sm( osm_umad_bind_info_t *p_bind = (osm_umad_bind_info_t *)h_bind; osm_vendor_t *p_vend = p_bind->p_vend; char issmstring[24]; + static boolean_t osm_vendor_set_sm_indicator = FALSE; OSM_LOG_ENTER( p_vend->p_log, osm_vendor_set_sm ); + if (is_sm_val == FALSE || osm_vendor_set_sm_indicator == FALSE) + { sprintf(issmstring, "/dev/infiniband/issm%d", p_vend->umad_port_id); if (TRUE == is_sm_val) { p_vend->issmfd = open(issmstring, 0); @@ -1162,6 +1165,15 @@ osm_vendor_set_sm( " mask failed: errno %d\n", errno); p_vend->issmfd = -1; } + if ( osm_vendor_set_sm_indicator == FALSE ) + osm_vendor_set_sm_indicator = TRUE; + } + else + { + osm_log(p_vend->p_log, OSM_LOG_ERROR, + "osm_vendor_set_sm: ERR 5436: " + "Trying to set IS_SM capability mask again\n"); + } OSM_LOG_EXIT( p_vend->p_log ); } Index: opensm/osm_node_info_rcv.c =================================================================== --- opensm/osm_node_info_rcv.c (revision 4951) +++ opensm/osm_node_info_rcv.c (working copy) @@ -229,6 +229,14 @@ __osm_ni_rcv_set_links( osm_dump_dr_path(p_rcv->p_log, osm_physp_get_dr_path_ptr(p_physp), OSM_LOG_ERROR); + + osm_log( p_rcv->p_log, OSM_LOG_SYS, + "Errors on subnet. Duplicate GUID found " + "by link from a port to itself. " + "See osm log for more details\n"); + + if ( p_rcv->p_subn->opt.exit_on_fatal == TRUE ) + exit( 1 ); } else { _______________________________________________ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general