Hi Sasha,

Please see below.

Sasha Khapyorsky wrote:
This improves handling of mcast join/leave requests storming. Now mcast
routing will be recalculated for all mcast groups where changes occurred
and not one by one. For this it queues mcast groups instead of mcast
rerouting requests, this also makes state_mgr idle queue obsolete.

Signed-off-by: Sasha Khapyorsky <[EMAIL PROTECTED]>
---

Hi Yevgeny,

For me it looks that it should solve the original problem (mcast group
list is purged in osm_mcast_mgr_process()). Could you review and ideally
test it? Thanks.

Sasha

diff --git a/opensm/opensm/osm_mcast_mgr.c b/opensm/opensm/osm_mcast_mgr.c
index 50b95fd..f51a45a 100644
--- a/opensm/opensm/osm_mcast_mgr.c
+++ b/opensm/opensm/osm_mcast_mgr.c
@@ -1254,63 +1254,55 @@ osm_mcast_mgr_process_tree(IN osm_mcast_mgr_t * const 
p_mgr,
                                                           port_guid);
        }
- Exit:
+Exit:
        OSM_LOG_EXIT(p_mgr->p_log);
        return (status);
 }
/**********************************************************************
  Process the entire group.
-
  NOTE : The lock should be held externally!
  **********************************************************************/
-static osm_signal_t
-osm_mcast_mgr_process_mgrp(IN osm_mcast_mgr_t * const p_mgr,
-                          IN osm_mgrp_t * const p_mgrp,
-                          IN osm_mcast_req_type_t req_type,
-                          IN ib_net64_t port_guid)
+static ib_api_status_t
+mcast_mgr_process_mgrp(IN osm_mcast_mgr_t * const p_mgr,
+                      IN osm_mgrp_t * const p_mgrp,
+                      IN osm_mcast_req_type_t req_type,
+                      IN ib_net64_t port_guid)
 {
-       osm_signal_t signal = OSM_SIGNAL_DONE;
        ib_api_status_t status;
-       osm_switch_t *p_sw;
-       cl_qmap_t *p_sw_tbl;
-       boolean_t pending_transactions = FALSE;
OSM_LOG_ENTER(p_mgr->p_log, osm_mcast_mgr_process_mgrp); - p_sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
-
        status = osm_mcast_mgr_process_tree(p_mgr, p_mgrp, req_type, port_guid);
        if (status != IB_SUCCESS) {
                osm_log(p_mgr->p_log, OSM_LOG_ERROR,
-                       "osm_mcast_mgr_process_mgrp: ERR 0A19: "
+                       "mcast_mgr_process_mgrp: ERR 0A19: "
                        "Unable to create spanning tree (%s)\n",
                        ib_get_err_str(status));
-
                goto Exit;
        }
+       p_mgrp->last_tree_id = p_mgrp->last_change_id;
- /*
-          Walk the switches and download the tables for each.
+       /* Remove MGRP only if osm_mcm_port_t count is 0 and
+        * Not a well known group
         */
-       p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
-       while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
-               signal = __osm_mcast_mgr_set_tbl(p_mgr, p_sw);
-               if (signal == OSM_SIGNAL_DONE_PENDING)
-                       pending_transactions = TRUE;
-
-               p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
+       if (cl_qmap_count(&p_mgrp->mcm_port_tbl) == 0 && !p_mgrp->well_known) {
+               osm_log(p_mgr->p_log, OSM_LOG_DEBUG,
+                       "mcast_mgr_process_mgrp: "
+                       "Destroying mgrp with lid:0x%X\n",
+                       cl_ntoh16(p_mgrp->mlid));
+               /* Send a Report to any InformInfo registered for
+                  Trap 67 : MCGroup delete */
+               osm_mgrp_send_delete_notice(p_mgr->p_subn, p_mgr->p_log,
+                                           p_mgrp);
+               cl_qmap_remove_item(&p_mgr->p_subn->mgrp_mlid_tbl,
+                                   (cl_map_item_t *) p_mgrp);
+               osm_mgrp_delete(p_mgrp);

If the group is empty, p_mgrp is deleted

@@ -1321,14 +1313,13 @@ osm_signal_t osm_mcast_mgr_process(IN osm_mcast_mgr_t * 
const p_mgr)
        osm_switch_t *p_sw;
        cl_qmap_t *p_sw_tbl;
        cl_qmap_t *p_mcast_tbl;
+       cl_qlist_t *p_list = &p_mgr->p_subn->p_osm->sm.mgrp_list;
        osm_mgrp_t *p_mgrp;
-       ib_api_status_t status;
        boolean_t pending_transactions = FALSE;
OSM_LOG_ENTER(p_mgr->p_log, osm_mcast_mgr_process); p_sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
-
        p_mcast_tbl = &p_mgr->p_subn->mgrp_mlid_tbl;
        /*
           While holding the lock, iterate over all the established
@@ -1343,16 +1334,8 @@ osm_signal_t osm_mcast_mgr_process(IN osm_mcast_mgr_t * 
const p_mgr)
                /* We reached here due to some change that caused a heavy sweep
                   of the subnet. Not due to a specific multicast request.
                   So the request type is subnet_change and the port guid is 0. 
*/
-               status = osm_mcast_mgr_process_tree(p_mgr, p_mgrp,
-                                                   
OSM_MCAST_REQ_TYPE_SUBNET_CHANGE,
-                                                   0);
-               if (status != IB_SUCCESS) {
-                       osm_log(p_mgr->p_log, OSM_LOG_ERROR,
-                               "osm_mcast_mgr_process: ERR 0A20: "
-                               "Unable to create spanning tree (%s)\n",
-                               ib_get_err_str(status));
-               }
-
+               mcast_mgr_process_mgrp(p_mgr, p_mgrp,
+                                      OSM_MCAST_REQ_TYPE_SUBNET_CHANGE, 0);
                p_mgrp = (osm_mgrp_t *) cl_qmap_next(&p_mgrp->map_item);

And here there's a call to 'next' on a p_mgrp that was freed,
which eventually causes osm to crash on some segfault or assert
at some point.

-- Yevgeny
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to