Hi Yevgeny,

On 13:44 Wed 09 Jan, Yevgeny Kliteynik wrote:
>  Hi Sasha,
> 
>  Please see below.
> 
>  Sasha Khapyorsky wrote:
> > This improves handling of mcast join/leave request storms. Mcast
> > routing is now recalculated for all mcast groups where changes occurred,
> > rather than one by one. To do this it queues mcast groups instead of mcast
> > rerouting requests, which also makes the state_mgr idle queue obsolete.
> > Signed-off-by: Sasha Khapyorsky <[EMAIL PROTECTED]>
> > ---
> > Hi Yevgeny,
> > It looks to me like this should solve the original problem (the mcast group
> > list is purged in osm_mcast_mgr_process()). Could you review and ideally
> > test it? Thanks.
> > Sasha
> > diff --git a/opensm/opensm/osm_mcast_mgr.c b/opensm/opensm/osm_mcast_mgr.c
> > index 50b95fd..f51a45a 100644
> > --- a/opensm/opensm/osm_mcast_mgr.c
> > +++ b/opensm/opensm/osm_mcast_mgr.c
> > @@ -1254,63 +1254,55 @@ osm_mcast_mgr_process_tree(IN osm_mcast_mgr_t * const p_mgr,
> >                                                        port_guid);
> >     }
> >
> > -      Exit:
> > +Exit:
> >     OSM_LOG_EXIT(p_mgr->p_log);
> >     return (status);
> >  }
> >   /**********************************************************************
> >   Process the entire group.
> > -
> >   NOTE : The lock should be held externally!
> >   **********************************************************************/
> > -static osm_signal_t
> > -osm_mcast_mgr_process_mgrp(IN osm_mcast_mgr_t * const p_mgr,
> > -                      IN osm_mgrp_t * const p_mgrp,
> > -                      IN osm_mcast_req_type_t req_type,
> > -                      IN ib_net64_t port_guid)
> > +static ib_api_status_t
> > +mcast_mgr_process_mgrp(IN osm_mcast_mgr_t * const p_mgr,
> > +                  IN osm_mgrp_t * const p_mgrp,
> > +                  IN osm_mcast_req_type_t req_type,
> > +                  IN ib_net64_t port_guid)
> >  {
> > -   osm_signal_t signal = OSM_SIGNAL_DONE;
> >     ib_api_status_t status;
> > -   osm_switch_t *p_sw;
> > -   cl_qmap_t *p_sw_tbl;
> > -   boolean_t pending_transactions = FALSE;
> >     OSM_LOG_ENTER(p_mgr->p_log, osm_mcast_mgr_process_mgrp);
> >
> > -   p_sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
> > -
> >     status = osm_mcast_mgr_process_tree(p_mgr, p_mgrp, req_type, port_guid);
> >     if (status != IB_SUCCESS) {
> >             osm_log(p_mgr->p_log, OSM_LOG_ERROR,
> > -                   "osm_mcast_mgr_process_mgrp: ERR 0A19: "
> > +                   "mcast_mgr_process_mgrp: ERR 0A19: "
> >                     "Unable to create spanning tree (%s)\n",
> >                     ib_get_err_str(status));
> > -
> >             goto Exit;
> >     }
> > +   p_mgrp->last_tree_id = p_mgrp->last_change_id;
> >
> > -   /*
> > -      Walk the switches and download the tables for each.
> > +   /* Remove MGRP only if osm_mcm_port_t count is 0 and
> > +    * Not a well known group
> >      */
> > -   p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
> > -   while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
> > -           signal = __osm_mcast_mgr_set_tbl(p_mgr, p_sw);
> > -           if (signal == OSM_SIGNAL_DONE_PENDING)
> > -                   pending_transactions = TRUE;
> > -
> > -           p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
> > +   if (cl_qmap_count(&p_mgrp->mcm_port_tbl) == 0 && !p_mgrp->well_known) {
> > +           osm_log(p_mgr->p_log, OSM_LOG_DEBUG,
> > +                   "mcast_mgr_process_mgrp: "
> > +                   "Destroying mgrp with lid:0x%X\n",
> > +                   cl_ntoh16(p_mgrp->mlid));
> > +           /* Send a Report to any InformInfo registered for
> > +              Trap 67 : MCGroup delete */
> > +           osm_mgrp_send_delete_notice(p_mgr->p_subn, p_mgr->p_log,
> > +                                       p_mgrp);
> > +           cl_qmap_remove_item(&p_mgr->p_subn->mgrp_mlid_tbl,
> > +                               (cl_map_item_t *) p_mgrp);
> > +           osm_mgrp_delete(p_mgrp);
> 
>  If the group is empty, p_mgrp is deleted
> 
> > @@ -1321,14 +1313,13 @@ osm_signal_t osm_mcast_mgr_process(IN osm_mcast_mgr_t * const p_mgr)
> >     osm_switch_t *p_sw;
> >     cl_qmap_t *p_sw_tbl;
> >     cl_qmap_t *p_mcast_tbl;
> > +   cl_qlist_t *p_list = &p_mgr->p_subn->p_osm->sm.mgrp_list;
> >     osm_mgrp_t *p_mgrp;
> > -   ib_api_status_t status;
> >     boolean_t pending_transactions = FALSE;
> >     OSM_LOG_ENTER(p_mgr->p_log, osm_mcast_mgr_process);
> >     p_sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
> > -
> >     p_mcast_tbl = &p_mgr->p_subn->mgrp_mlid_tbl;
> >     /*
> >        While holding the lock, iterate over all the established
> > @@ -1343,16 +1334,8 @@ osm_signal_t osm_mcast_mgr_process(IN osm_mcast_mgr_t * const p_mgr)
> >             /* We reached here due to some change that caused a heavy sweep
> >                of the subnet. Not due to a specific multicast request.
> >                So the request type is subnet_change and the port guid is 0. */
> > -           status = osm_mcast_mgr_process_tree(p_mgr, p_mgrp,
> > -                                               OSM_MCAST_REQ_TYPE_SUBNET_CHANGE,
> > -                                               0);
> > -           if (status != IB_SUCCESS) {
> > -                   osm_log(p_mgr->p_log, OSM_LOG_ERROR,
> > -                           "osm_mcast_mgr_process: ERR 0A20: "
> > -                           "Unable to create spanning tree (%s)\n",
> > -                           ib_get_err_str(status));
> > -           }
> > -
> > +           mcast_mgr_process_mgrp(p_mgr, p_mgrp,
> > +                                  OSM_MCAST_REQ_TYPE_SUBNET_CHANGE, 0);
> >             p_mgrp = (osm_mgrp_t *) cl_qmap_next(&p_mgrp->map_item);
> 
>  And here there's a call to 'next' on a p_mgrp that was already freed,
>  which eventually causes osm to crash with a segfault or a failed assert.

Nice catch! Thanks for the fix!
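
For the archive: the usual way to avoid this kind of use-after-free when
walking a cl_qmap is to fetch the next item before calling anything that
may remove and free the current one. A minimal sketch of that idiom for
the loop in osm_mcast_mgr_process(), reusing the names from the patch
above (not necessarily the exact fix that will be committed):

	osm_mgrp_t *p_next_mgrp;

	p_mgrp = (osm_mgrp_t *) cl_qmap_head(p_mcast_tbl);
	while (p_mgrp != (osm_mgrp_t *) cl_qmap_end(p_mcast_tbl)) {
		/* Fetch the next group first: p_mgrp may be removed from
		   mgrp_mlid_tbl and freed inside mcast_mgr_process_mgrp(). */
		p_next_mgrp = (osm_mgrp_t *) cl_qmap_next(&p_mgrp->map_item);

		mcast_mgr_process_mgrp(p_mgr, p_mgrp,
				       OSM_MCAST_REQ_TYPE_SUBNET_CHANGE, 0);

		p_mgrp = p_next_mgrp;
	}

That way cl_qmap_next() is never called on a group that
mcast_mgr_process_mgrp() has already destroyed.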

Sasha
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general