Hi Jim, On 13:08 Fri 28 Aug , Jim Schutt wrote: > When LASH is run against ibsim, valgrind reports the following > (on x86_64) after a switch is removed from the fabric: > > ==15699== Invalid write of size 8 > ==15699== at 0x45FD8A: switch_delete (osm_ucast_lash.c:648) > ==15699== by 0x461483: lash_cleanup (osm_ucast_lash.c:1123) > ==15699== by 0x461848: lash_process (osm_ucast_lash.c:1230) > ==15699== by 0x45C043: ucast_mgr_route (osm_ucast_mgr.c:1016) > ==15699== by 0x45C1A0: osm_ucast_mgr_process (osm_ucast_mgr.c:1057) > ==15699== by 0x44F11B: do_sweep (osm_state_mgr.c:1283) > ==15699== by 0x44F539: osm_state_mgr_process (osm_state_mgr.c:1398) > ==15699== by 0x447296: sm_process (osm_sm.c:90) > ==15699== by 0x4473FE: sm_sweeper (osm_sm.c:130) > ==15699== by 0x5023505: __cl_thread_wrapper (cl_thread.c:57) > ==15699== by 0x37AC006366: start_thread (in /lib64/libpthread-2.5.so) > ==15699== by 0x37AB4D30AC: clone (in /lib64/libc-2.5.so) > ==15699== Address 0x9B28198 is 152 bytes inside a block of size 160 free'd > ==15699== at 0x4A0541E: free (vg_replace_malloc.c:233) > ==15699== by 0x453866: osm_switch_delete (osm_switch.c:97) > ==15699== by 0x4116AA: drop_mgr_remove_switch (osm_drop_mgr.c:290) > ==15699== by 0x411820: drop_mgr_process_node (osm_drop_mgr.c:339) > ==15699== by 0x411D0C: osm_drop_mgr_process (osm_drop_mgr.c:465) > ==15699== by 0x44EF97: do_sweep (osm_state_mgr.c:1231) > ==15699== by 0x44F539: osm_state_mgr_process (osm_state_mgr.c:1398) > ==15699== by 0x447296: sm_process (osm_sm.c:90) > ==15699== by 0x4473FE: sm_sweeper (osm_sm.c:130) > ==15699== by 0x5023505: __cl_thread_wrapper (cl_thread.c:57) > ==15699== by 0x37AC006366: start_thread (in /lib64/libpthread-2.5.so) > ==15699== by 0x37AB4D30AC: clone (in /lib64/libc-2.5.so) > > The root cause is that in order to perform SL lookup for path record > queries, LASH needs to keep persistent data between calls to the > routing engine. > > LASH uses the osm_switch_t:priv member to speed lookup of the LASH > switch_t objects it needs to perform SL lookup, and has a corresponding > switch_t:p_sw member to point to the corresponding osm_switch_t object. > > When a switch is deleted from the fabric, the switch_t:p_sw value becomes > invalid, but LASH's switch_delete() uses it to clear the corresponding > osm_switch_t:priv value.
Ok. I see the issue. This 'p_sw->priv = NULL' line was not in the original "priv" introduction code, but was added by mistake (AFAIR for "for sure" reason :)) by some subsequent patch. Why to not fix this by just removing this not actually needed statement? Sasha > > Solve this problem by adding a priv_release function pointer that > is set when osm_switch_t:priv is set. This allows the opensm core to > clean up after any routing engine that is using priv to access > persistent data (LASH seems to be the only one so far), without > knowing the details of how to do so. > > When multiple routing engines are configured, it also allows a routing > engine using osm_switch_t:priv to clean up if some other routing engine > using priv fails in an unexpected way. > > With this addition, the rules for using osm_switch_t:priv become: > 1) Never assign to priv without also assigning to priv_release. > 2) Always use priv_release() before assigning to priv; this > prevents memory issues due to unexpected errors in a > routing engine using priv. > 3) Always use priv_release() to clean up after a use of priv. > > Since updn uses osm_switch_t:priv, fix it up to follow the above > rules as well, for consistency. > > Signed-off-by: Jim Schutt <jasc...@sandia.gov> > --- > opensm/include/opensm/osm_switch.h | 1 + > opensm/opensm/osm_switch.c | 2 ++ > opensm/opensm/osm_ucast_lash.c | 24 ++++++++++++++++++++---- > opensm/opensm/osm_ucast_updn.c | 15 +++++++++++---- > 4 files changed, 34 insertions(+), 8 deletions(-) > > diff --git a/opensm/include/opensm/osm_switch.h > b/opensm/include/opensm/osm_switch.h > index 7ce28c5..d48f8c6 100644 > --- a/opensm/include/opensm/osm_switch.h > +++ b/opensm/include/opensm/osm_switch.h > @@ -106,6 +106,7 @@ typedef struct osm_switch { > unsigned endport_links; > unsigned need_update; > void *priv; > + void (*priv_release)(struct osm_switch *p_sw); > } osm_switch_t; > /* > * FIELDS > diff --git a/opensm/opensm/osm_switch.c b/opensm/opensm/osm_switch.c > index ce1ca63..fbf3973 100644 > --- a/opensm/opensm/osm_switch.c > +++ b/opensm/opensm/osm_switch.c > @@ -94,6 +94,8 @@ void osm_switch_delete(IN OUT osm_switch_t ** const pp_sw) > free(p_sw->hops[i]); > free(p_sw->hops); > } > + if (p_sw->priv_release) > + p_sw->priv_release(p_sw); > free(*pp_sw); > *pp_sw = NULL; > } > diff --git a/opensm/opensm/osm_ucast_lash.c b/opensm/opensm/osm_ucast_lash.c > index 0a567b3..ceae7d8 100644 > --- a/opensm/opensm/osm_ucast_lash.c > +++ b/opensm/opensm/osm_ucast_lash.c > @@ -603,6 +603,17 @@ static int balance_virtual_lanes(lash_t * p_lash, > unsigned lanes_needed) > return 0; > } > > +static void lash_switch_priv_release(osm_switch_t *osm_sw) > +{ > + switch_t *sw = osm_sw->priv; > + > + osm_sw->priv_release = NULL; > + osm_sw->priv = NULL; > + > + if (sw && sw->p_sw == osm_sw) > + sw->p_sw = NULL; > +} > + > static switch_t *switch_create(lash_t * p_lash, unsigned id, osm_switch_t * > p_sw) > { > unsigned num_switches = p_lash->num_switches; > @@ -628,8 +639,12 @@ static switch_t *switch_create(lash_t * p_lash, unsigned > id, osm_switch_t * p_sw > } > > sw->p_sw = p_sw; > - if (p_sw) > + if (p_sw) { > + if (p_sw->priv_release) > + p_sw->priv_release(p_sw); > p_sw->priv = sw; > + p_sw->priv_release = lash_switch_priv_release; > + } > > if (osm_mesh_node_create(p_lash, sw)) { > free(sw->dij_channels); > @@ -644,8 +659,8 @@ static void switch_delete(lash_t *p_lash, switch_t * sw) > { > if (sw->dij_channels) > free(sw->dij_channels); > - if (sw->p_sw) > - sw->p_sw->priv = NULL; > + if (sw->p_sw && sw->p_sw->priv_release) > + sw->p_sw->priv_release(sw->p_sw); > free(sw); > } > > @@ -1113,7 +1128,8 @@ static void lash_cleanup(lash_t * p_lash) > while (p_next_sw != (osm_switch_t *) cl_qmap_end(&p_subn->sw_guid_tbl)) > { > p_sw = p_next_sw; > p_next_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item); > - p_sw->priv = NULL; > + if (p_sw->priv_release) > + p_sw->priv_release(p_sw); > } > > if (p_lash->switches) { > diff --git a/opensm/opensm/osm_ucast_updn.c b/opensm/opensm/osm_ucast_updn.c > index bb9ccda..dc5f459 100644 > --- a/opensm/opensm/osm_ucast_updn.c > +++ b/opensm/opensm/osm_ucast_updn.c > @@ -404,10 +404,13 @@ static struct updn_node *create_updn_node(osm_switch_t > * sw) > return u; > } > > -static void delete_updn_node(struct updn_node *u) > +static void updn_sw_priv_release(osm_switch_t *sw) > { > - u->sw->priv = NULL; > - free(u); > + if (sw->priv) > + free(sw->priv); > + > + sw->priv_release = NULL; > + sw->priv = NULL; > } > > /********************************************************************** > @@ -589,6 +592,8 @@ static int updn_lid_matrices(void *ctx) > item != cl_qmap_end(&p_updn->p_osm->subn.sw_guid_tbl); > item = cl_qmap_next(item)) { > p_sw = (osm_switch_t *)item; > + if (p_sw->priv_release) > + p_sw->priv_release(p_sw); > p_sw->priv = create_updn_node(p_sw); > if (!p_sw->priv) { > OSM_LOG(&(p_updn->p_osm->log), OSM_LOG_ERROR, "ERR > AA0C: " > @@ -596,6 +601,7 @@ static int updn_lid_matrices(void *ctx) > OSM_LOG_EXIT(&p_updn->p_osm->log); > return -1; > } > + p_sw->priv_release = updn_sw_priv_release; > } > > /* First setup root nodes */ > @@ -653,7 +659,8 @@ static int updn_lid_matrices(void *ctx) > item != cl_qmap_end(&p_updn->p_osm->subn.sw_guid_tbl); > item = cl_qmap_next(item)) { > p_sw = (osm_switch_t *) item; > - delete_updn_node(p_sw->priv); > + if (p_sw->priv_release) > + p_sw->priv_release(p_sw); > } > > OSM_LOG_EXIT(&p_updn->p_osm->log); > -- > 1.5.6.GIT > > _______________________________________________ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general