On 19:48 Fri 23 Oct     , Hal Rosenstock wrote:
> 
> Heap memory consumption by the unicast and multicast routing tables can be
> reduced.
> 
> This patch is analogous to the previous patch doing this for the unicast
> routing tables (LFTs).
> 
> Using valgrind --tool=massif (for heap profiling), there are a couple of places:
> ->38.75% (11,206,656B) 0x43267E: osm_switch_new (osm_switch.c:134)
> ->12.89% (3,728,256B) 0x40F8C9: osm_mcast_tbl_init (osm_mcast_tbl.c:96)
> 
> osm_mcast_tbl_init (osm_mcast_tbl.c:96):
>         p_tbl->p_mask_tbl = malloc(p_tbl->num_entries *
>                                    (IB_MCAST_POSITION_MAX +
>                                     1) * IB_MCAST_MASK_SIZE / 8);
> 
> num_entries above is set based on the switch's MulticastFDBCap
> (indicated in its SM class SwitchInfo attribute).
> 
> MFTs are only increased in size and are never reduced in size. If a realloc
> for MFT fails, it is treated as a fatal error and OpenSM is exited.
> 
> Signed-off-by: Hal Rosenstock <[email protected]>

Applied. Thanks.

I have some comments (see below) and will send the fixes as subsequent
patch(es). Please reply to the comments and/or patches if needed.

> ---
> Changes since v4:
> Incorporated osm_mcast_tbl change to make max_mlid_ho be maximum MLID
> configured rather than max table size
> 
> Changes since v3:
> Renamed mft_size to mft_depth and added description in osm_mcast_tbl.h
> Removed vestigial realloc mask tbl call in osm_dump.c
> Simplified max_mlid determination in alloc_mfts
> Added return value to osm_mcast_tbl_realloc_mask_tbl
> Added return value to alloc_mfts 
> Handle alloc_mfts failure in osm_mcast_mgr_process/process_mgroups
> Renamed osm_mcast_tbl_realloc_mask_tbl to osm_mcast_tbl_realloc
> In osm_mcast_tbl_realloc, simplified mft_depth calculation
> 
> Changes since v2:
> MFT allocation during routing preparation rather than on table access
> 
> Changes since v1:
> MFT allocation based on actual MLID requests
> 
> diff --git a/opensm/include/opensm/osm_mcast_tbl.h 
> b/opensm/include/opensm/osm_mcast_tbl.h
> index 276b7f7..5c36f2a 100644
> --- a/opensm/include/opensm/osm_mcast_tbl.h
> +++ b/opensm/include/opensm/osm_mcast_tbl.h
> @@ -1,6 +1,6 @@
>  /*
>   * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
> - * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved.
> + * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
>   * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
>   * Copyright (c) 2009 HNR Consulting. All rights reserved.
>   *
> @@ -75,6 +75,7 @@ typedef struct osm_mcast_fwdbl {
>       int16_t max_block_in_use;
>       uint16_t num_entries;
>       uint16_t max_mlid_ho;
> +     uint16_t mft_depth;
>       uint16_t(*p_mask_tbl)[][IB_MCAST_POSITION_MAX];
>  } osm_mcast_tbl_t;
>  /*
> @@ -96,10 +97,14 @@ typedef struct osm_mcast_fwdbl {
>  *            Number of entries in the table (aka number of MLIDs supported).
>  *
>  *    max_mlid_ho
> -*            Maximum MLID (host order) configured in the multicast port mask
> +*            Maximum MLID (host order) for the currently allocated multicast
> +*            port mask table.
> +*
> +*    mft_depth
> +*            Number of MLIDs in the currently allocated multicast port mask
>  *            table.
>  *
> -*    pp_mask_tbl
> +*    p_mask_tbl
>  *            Pointer to a two dimensional array of port_masks for this 
> switch.
>  *            The first dimension is MLID, the second dimension is mask 
> position.
>  *            This pointer is null for switches that do not support multicast.
> @@ -116,8 +121,8 @@ typedef struct osm_mcast_fwdbl {
>  *
>  * SYNOPSIS
>  */
> -ib_api_status_t osm_mcast_tbl_init(IN osm_mcast_tbl_t * p_tbl,
> -                                IN uint8_t num_ports, IN uint16_t capacity);
> +void osm_mcast_tbl_init(IN osm_mcast_tbl_t * p_tbl, IN uint8_t num_ports,
> +                     IN uint16_t capacity);
>  /*
>  * PARAMETERS
>  *    num_ports
> @@ -128,7 +133,7 @@ ib_api_status_t osm_mcast_tbl_init(IN osm_mcast_tbl_t * 
> p_tbl,
>  *            by this switch.
>  *
>  * RETURN VALUE
> -*    IB_SUCCESS on success.
> +*    None.
>  *
>  * NOTES
>  *
> @@ -160,6 +165,34 @@ void osm_mcast_tbl_delete(IN osm_mcast_tbl_t ** pp_tbl);
>  * SEE ALSO
>  *********/
>  
> +/****f* OpenSM: Forwarding Table/osm_mcast_tbl_realloc
> +* NAME
> +*    osm_mcast_tbl_realloc
> +*
> +* DESCRIPTION
> +*    This function reallocates the multicast port mask table if necessary.
> +*
> +* SYNOPSIS
> +*/
> +int
> +osm_mcast_tbl_realloc(IN osm_mcast_tbl_t * p_tbl, IN uintn_t mlid_offset);
> +/*
> +* PARAMETERS
> +*
> +*    p_tbl
> +*            [in] Pointer to the Multicast Forwarding Table object.
> +*
> +*    mlid_offset
> +*            [in] Offset of MLID being accessed.
> +*
> +* RETURN VALUE
> +*    Returns 0 on success and non-zero value otherwise.
> +*
> +* NOTES
> +*
> +* SEE ALSO
> +*/
> +
>  /****f* OpenSM: Forwarding Table/osm_mcast_tbl_destroy
>  * NAME
>  *    osm_mcast_tbl_destroy
> diff --git a/opensm/opensm/osm_mcast_mgr.c b/opensm/opensm/osm_mcast_mgr.c
> index 0ee689c..a5e9758 100644
> --- a/opensm/opensm/osm_mcast_mgr.c
> +++ b/opensm/opensm/osm_mcast_mgr.c
> @@ -1043,6 +1043,36 @@ static int mcast_mgr_set_mftables(osm_sm_t * sm)
>       return ret;
>  }
>  
> +static int alloc_mfts(osm_sm_t * sm)
> +{
> +     int i;
> +     cl_map_item_t *item;
> +     osm_switch_t *p_sw;
> +     int max_mlid = 0;
> +
> +     for (i = sm->p_subn->max_mcast_lid_ho - IB_LID_MCAST_START_HO; i >= 0;
> +          i--) {
> +             if (sm->p_subn->mgroups[i]) {
> +                     max_mlid = i + IB_LID_MCAST_START_HO;
> +                     break;
> +             }
> +     }
> +
> +     if (max_mlid == 0)
> +             return 0;
> +
> +     /* Now, walk switches and (re)allocate multicast tables */
> +     for (item = cl_qmap_head(&sm->p_subn->sw_guid_tbl);
> +          item != cl_qmap_end(&sm->p_subn->sw_guid_tbl);
> +          item = cl_qmap_next(item)) {
> +             p_sw = (osm_switch_t *)item;
> +             if (osm_mcast_tbl_realloc(&p_sw->mcast_tbl,
> +                                       max_mlid - IB_LID_MCAST_START_HO))
> +                     return -1;
> +     }

The variable 'max_mlid' is not actually needed in this function - you are
initializing it as 'max_mlid = i + IB_LID_MCAST_START_HO' and then
using it as 'max_mlid - IB_LID_MCAST_START_HO'. Instead you could just use
'i' as it is.

> +     return 0;
> +}
> +
>  /**********************************************************************
>   **********************************************************************/
>  int osm_mcast_mgr_process(osm_sm_t * sm)
> @@ -1063,6 +1093,12 @@ int osm_mcast_mgr_process(osm_sm_t * sm)
>               goto exit;
>       }
>  
> +     if (alloc_mfts(sm)) {
> +             OSM_LOG(sm->p_log, OSM_LOG_ERROR,
> +                     "ERR 0A07: alloc_mfts failed\n");
> +             goto exit;
> +     }
> +
>       for (i = 0; i <= sm->p_subn->max_mcast_lid_ho - IB_LID_MCAST_START_HO;
>            i++)
>               if (sm->p_subn->mgroups[i] || sm->mlids_req[i])
> @@ -1101,6 +1137,12 @@ int osm_mcast_mgr_process_mgroups(osm_sm_t * sm)
>               goto exit;
>       }
>  
> +     if (alloc_mfts(sm)) {
> +             OSM_LOG(sm->p_log, OSM_LOG_ERROR,
> +                     "ERR 0A09: alloc_mfts failed\n");
> +             goto exit;
> +     }
> +
>       for (i = 0; i <= sm->mlids_req_max; i++) {
>               if (!sm->mlids_req[i])
>                       continue;
> diff --git a/opensm/opensm/osm_mcast_tbl.c b/opensm/opensm/osm_mcast_tbl.c
> index bdea416..be2181d 100644
> --- a/opensm/opensm/osm_mcast_tbl.c
> +++ b/opensm/opensm/osm_mcast_tbl.c
> @@ -1,6 +1,6 @@
>  /*
>   * Copyright (c) 2004-2006 Voltaire, Inc. All rights reserved.
> - * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved.
> + * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
>   * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
>   * Copyright (c) 2009 HNR Consulting. All rights reserved.
>   *
> @@ -53,8 +53,8 @@
>  
>  /**********************************************************************
>   **********************************************************************/
> -ib_api_status_t osm_mcast_tbl_init(IN osm_mcast_tbl_t * p_tbl,
> -                                IN uint8_t num_ports, IN uint16_t capacity)
> +void osm_mcast_tbl_init(IN osm_mcast_tbl_t * p_tbl, IN uint8_t num_ports,
> +                     IN uint16_t capacity)
>  {
>       CL_ASSERT(p_tbl);
>       CL_ASSERT(num_ports);
> @@ -68,7 +68,7 @@ ib_api_status_t osm_mcast_tbl_init(IN osm_mcast_tbl_t * 
> p_tbl,
>                  This switch apparently doesn't support multicast.
>                  Everything is initialized to zero already, so return.
>                */
> -             return IB_SUCCESS;
> +             return;
>       }
>  
>       p_tbl->num_entries = capacity;
> @@ -80,25 +80,6 @@ ib_api_status_t osm_mcast_tbl_init(IN osm_mcast_tbl_t * 
> p_tbl,
>       p_tbl->max_block = (uint16_t) ((ROUNDUP(p_tbl->num_entries,
>                                               IB_MCAST_BLOCK_SIZE) /
>                                       IB_MCAST_BLOCK_SIZE) - 1);
> -
> -     /*
> -        The number of bytes needed in the mask table is:
> -        The (maximum bit mask 'position' + 1) times the
> -        number of bytes in each bit mask times the
> -        number of MLIDs supported by the table.
> -
> -        We must always allocate the array with the maximum position
> -        since it is (and must be) defined that way the table structure
> -        in order to create a pointer to a two dimensional array.
> -      */
> -     p_tbl->p_mask_tbl = calloc(p_tbl->num_entries,
> -                                (IB_MCAST_POSITION_MAX +
> -                                 1) * IB_MCAST_MASK_SIZE / 8);
> -
> -     if (p_tbl->p_mask_tbl == NULL)
> -             return IB_INSUFFICIENT_MEMORY;
> -
> -     return IB_SUCCESS;
>  }
>  
>  /**********************************************************************
> @@ -120,8 +101,8 @@ void osm_mcast_tbl_set(IN osm_mcast_tbl_t * p_tbl, IN 
> uint16_t mlid_ho,
>  
>       CL_ASSERT(p_tbl);
>       CL_ASSERT(mlid_ho >= IB_LID_MCAST_START_HO);
> -     CL_ASSERT(mlid_ho <= (uint16_t) (IB_LID_MCAST_START_HO +
> -                                      p_tbl->num_entries - 1));
> +     CL_ASSERT(mlid_ho <= p_tbl->max_mlid_ho);
> +     CL_ASSERT(mlid_ho - IB_LID_MCAST_START_HO < p_tbl->mft_depth);


Isn't it the same check since
'p_tbl->max_mlid_ho = p_tbl->mft_depth + IB_LID_MCAST_START_HO - 1'?

>       CL_ASSERT(p_tbl->p_mask_tbl);
>  
>       mlid_offset = mlid_ho - IB_LID_MCAST_START_HO;
> @@ -133,8 +114,41 @@ void osm_mcast_tbl_set(IN osm_mcast_tbl_t * p_tbl, IN 
> uint16_t mlid_ho,
>  
>       if (block_num > p_tbl->max_block_in_use)
>               p_tbl->max_block_in_use = (uint16_t) block_num;
> -     if (mlid_ho > p_tbl->max_mlid_ho)
> -             p_tbl->max_mlid_ho = mlid_ho;
> +}
> +
> +/**********************************************************************
> + **********************************************************************/
> +int
> +osm_mcast_tbl_realloc(IN osm_mcast_tbl_t * p_tbl, IN uintn_t mlid_offset)
> +{
> +     size_t mft_depth, size;
> +     uint16_t (*p_mask_tbl)[][IB_MCAST_POSITION_MAX];
> +
> +     if (mlid_offset < p_tbl->mft_depth)
> +             return 0;
> +
> +     /*
> +        The number of bytes needed in the mask table is:
> +        The (maximum bit mask 'position' + 1) times the
> +        number of bytes in each bit mask times the
> +        number of MLIDs supported by the table.
> +
> +        We must always allocate the array with the maximum position
> +        since it is (and must be) defined that way the table structure
> +        in order to create a pointer to a two dimensional array.
> +      */
> +     mft_depth = (mlid_offset / IB_MCAST_BLOCK_SIZE + 1) * 
> IB_MCAST_BLOCK_SIZE;
> +     size = mft_depth * (IB_MCAST_POSITION_MAX + 1) * IB_MCAST_MASK_SIZE / 8;
> +     p_mask_tbl = realloc(p_tbl->p_mask_tbl, size);
> +     if (!p_mask_tbl)
> +             return -1;
> +     memset((uint8_t *)p_mask_tbl + p_tbl->mft_depth * 
> (IB_MCAST_POSITION_MAX + 1) * IB_MCAST_MASK_SIZE / 8,
> +            0,
> +            size - p_tbl->mft_depth * (IB_MCAST_POSITION_MAX + 1) * 
> IB_MCAST_MASK_SIZE / 8);
> +     p_tbl->p_mask_tbl = p_mask_tbl;
> +     p_tbl->mft_depth = mft_depth;
> +     p_tbl->max_mlid_ho = mft_depth + IB_LID_MCAST_START_HO - 1;

Wouldn't it be more accurate/efficient to set max_mlid_ho as
'mlid_offset + IB_LID_MCAST_START_HO - 1'?

> +     return 0;
>  }
>  
>  /**********************************************************************
> @@ -152,10 +166,11 @@ boolean_t osm_mcast_tbl_is_port(IN const 
> osm_mcast_tbl_t * p_tbl,
>               CL_ASSERT(port_num <=
>                         (p_tbl->max_position + 1) * IB_MCAST_MASK_SIZE);
>               CL_ASSERT(mlid_ho >= IB_LID_MCAST_START_HO);
> -             CL_ASSERT(mlid_ho <= (uint16_t) (IB_LID_MCAST_START_HO +
> -                                              p_tbl->num_entries - 1));
> +             CL_ASSERT(mlid_ho <= p_tbl->max_mlid_ho);
>  
>               mlid_offset = mlid_ho - IB_LID_MCAST_START_HO;
> +             if (mlid_offset >= p_tbl->mft_depth)
> +                     return FALSE;

This duplicates the CL_ASSERT() above. Looking at how this function is used,
I don't see why this check should be introduced. Do you?

>               mask_offset = port_num / IB_MCAST_MASK_SIZE;
>               bit_mask = cl_ntoh16((uint16_t)
>                                    (1 << (port_num % IB_MCAST_MASK_SIZE)));
> @@ -180,10 +195,11 @@ boolean_t osm_mcast_tbl_is_any_port(IN const 
> osm_mcast_tbl_t * p_tbl,
>  
>       if (p_tbl->p_mask_tbl) {
>               CL_ASSERT(mlid_ho >= IB_LID_MCAST_START_HO);
> -             CL_ASSERT(mlid_ho <= (uint16_t) (IB_LID_MCAST_START_HO +
> -                                              p_tbl->num_entries - 1));
> +             CL_ASSERT(mlid_ho <= p_tbl->max_mlid_ho);
>  
>               mlid_offset = mlid_ho - IB_LID_MCAST_START_HO;
> +             if (mlid_offset >= p_tbl->mft_depth)
> +                     return FALSE;

Ditto.

>  
>               for (position = 0; position <= p_tbl->max_position; position++)
>                       result |= (*p_tbl->p_mask_tbl)[mlid_offset][position];
> @@ -213,8 +229,7 @@ ib_api_status_t osm_mcast_tbl_set_block(IN 
> osm_mcast_tbl_t * p_tbl,
>  
>       mlid_start_ho = (uint16_t) (block_num * IB_MCAST_BLOCK_SIZE);
>  
> -     if (mlid_start_ho + IB_MCAST_BLOCK_SIZE - 1 >
> -         p_tbl->num_entries + IB_LID_MCAST_START_HO - 1)
> +     if (mlid_start_ho + IB_MCAST_BLOCK_SIZE - 1 > p_tbl->mft_depth)
>               return IB_INVALID_PARAMETER;

I see that 'mlid_start_ho' is actually an MLID offset. Was the previous
check wrong?

>  
>       for (i = 0; i < IB_MCAST_BLOCK_SIZE; i++)
> @@ -223,9 +238,6 @@ ib_api_status_t osm_mcast_tbl_set_block(IN 
> osm_mcast_tbl_t * p_tbl,
>       if (block_num > p_tbl->max_block_in_use)
>               p_tbl->max_block_in_use = (uint16_t) block_num;
>  
> -     if (mlid_start_ho + IB_MCAST_BLOCK_SIZE - 1 > p_tbl->max_mlid_ho)
> -             p_tbl->max_mlid_ho = mlid_start_ho + IB_MCAST_BLOCK_SIZE - 1;
> -
>       return IB_SUCCESS;
>  }
>  
> @@ -241,6 +253,8 @@ void osm_mcast_tbl_clear_mlid(IN osm_mcast_tbl_t * p_tbl, 
> IN uint16_t mlid_ho)
>  
>       if (p_tbl->p_mask_tbl && (mlid_ho <= p_tbl->max_mlid_ho)) {
>               mlid_offset = mlid_ho - IB_LID_MCAST_START_HO;
> +             if (mlid_offset >= p_tbl->mft_depth)
> +                     return;

This seems redundant to me after the 'mlid_ho <= p_tbl->max_mlid_ho' check
above.

Sasha

>               for (i = 0; i <= p_tbl->max_position; i++)
>                       (*p_tbl->p_mask_tbl)[mlid_offset][i] = 0;
>       }
> @@ -257,6 +271,7 @@ boolean_t osm_mcast_tbl_get_block(IN osm_mcast_tbl_t * 
> p_tbl,
>  
>       CL_ASSERT(p_tbl);
>       CL_ASSERT(p_block);
> +     CL_ASSERT(block_num * IB_MCAST_BLOCK_SIZE <= p_tbl->mft_depth);
>  
>       if (block_num > p_tbl->max_block_in_use)
>               return FALSE;
> diff --git a/opensm/opensm/osm_switch.c b/opensm/opensm/osm_switch.c
> index d2123a4..c912b03 100644
> --- a/opensm/opensm/osm_switch.c
> +++ b/opensm/opensm/osm_switch.c
> @@ -136,9 +136,8 @@ osm_switch_t *osm_switch_new(IN osm_node_t * p_node,
>  
>       memset(p_sw->p_prof, 0, sizeof(*p_sw->p_prof) * num_ports);
>  
> -     if (osm_mcast_tbl_init(&p_sw->mcast_tbl, osm_node_get_num_physp(p_node),
> -                            cl_ntoh16(p_si->mcast_cap)))
> -             goto err;
> +     osm_mcast_tbl_init(&p_sw->mcast_tbl, osm_node_get_num_physp(p_node),
> +                        cl_ntoh16(p_si->mcast_cap));
>  
>       for (port_num = 0; port_num < num_ports; port_num++)
>               osm_port_prof_construct(&p_sw->p_prof[port_num]);
> @@ -507,7 +506,6 @@ static int alloc_lft(IN osm_switch_t * p_sw, uint16_t 
> lids)
>               p_sw->lft = new_lft;
>               p_sw->lft_size = lft_size;
>       }
> -
>       return 0;
>  }
>  
> @@ -548,7 +546,6 @@ int osm_switch_prepare_path_rebuild(IN osm_switch_t * 
> p_sw, IN uint16_t max_lids
>               p_sw->num_hops = max_lids + 1;
>       }
>       p_sw->max_lid_ho = max_lids;
> -
>       return 0;
>  }
>  
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to