On Thu, Oct 29, 2009 at 3:17 PM, Sasha Khapyorsky <[email protected]> wrote:
> On 19:48 Fri 23 Oct     , Hal Rosenstock wrote:
>>
>> Heap memory consumption by the unicast and multicast routing tables can be
>> reduced.
>>
>> This patch is analagous to the previous patch doing this for the unicast
>> routing tables (LFTs).
>>
>> Using valgrind --tool=massif (for heap profiling), there are couple of place
>> ->38.75% (11,206,656B) 0x43267E: osm_switch_new (osm_switch.c:134)
>> ->12.89% (3,728,256B) 0x40F8C9: osm_mcast_tbl_init (osm_mcast_tbl.c:96)
>>
>> osm_mcast_tbl_init (osm_mcast_tbl.c:96):
>>         p_tbl->p_mask_tbl = malloc(p_tbl->num_entries *
>>                                    (IB_MCAST_POSITION_MAX +
>>                                     1) * IB_MCAST_MASK_SIZE / 8);
>>
>> num_entries above is set based on the switch's MulticastFDBCap
>> (indicated in it's SM class SwitchInfo attribute).
>>
>> MFTs are only be increased in size and are never reduced in size. If a 
>> realloc
>> for MFT fails, it is treated as a fatal error and OpenSM is exited.
>>
>> Signed-off-by: Hal Rosenstock <[email protected]>
>
> Applied. Thanks.
>
> I have some comments (see below) and will send the fixes as subsequent
> patch(es). Please reply over comments and/or patches if needed.
>
>> ---
>> Changes since v4:
>> Incorporated osm_mcast_tbl change to make max_mlid_ho be maximum MLID
>> configured rather than max table size
>>
>> Changes since v3:
>> Renamed mft_size to mft_depth and added description in osm_mcast_tbl.h
>> Removed vestigial realloc mask tbl call in osm_dump.c
>> Simplified max_mlid determination in alloc_mfts
>> Added return value to osm_mcast_tbl_realloc_mask_tbl
>> Added return value to alloc_mfts
>> Handle alloc_mfts failure in osm_mcast_mgr_process/process_mgroups
>> Renamed osm_mcast_tbl_realloc_mask_tbl to osm_mcast_tbl_realloc
>> In osm_mcast_tbl_realloc, simplified mft_depth calculation
>>
>> Changes since v2:
>> MFT allocation during routing preparation rather than on table access
>>
>> Changes since v1:
>> MFT allocation based on actual MLID requests
>>
>> diff --git a/opensm/include/opensm/osm_mcast_tbl.h 
>> b/opensm/include/opensm/osm_mcast_tbl.h
>> index 276b7f7..5c36f2a 100644
>> --- a/opensm/include/opensm/osm_mcast_tbl.h
>> +++ b/opensm/include/opensm/osm_mcast_tbl.h
>> @@ -1,6 +1,6 @@
>>  /*
>>   * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
>> - * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved.
>> + * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
>>   * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
>>   * Copyright (c) 2009 HNR Consulting. All rights reserved.
>>   *
>> @@ -75,6 +75,7 @@ typedef struct osm_mcast_fwdbl {
>>       int16_t max_block_in_use;
>>       uint16_t num_entries;
>>       uint16_t max_mlid_ho;
>> +     uint16_t mft_depth;
>>       uint16_t(*p_mask_tbl)[][IB_MCAST_POSITION_MAX];
>>  } osm_mcast_tbl_t;
>>  /*
>> @@ -96,10 +97,14 @@ typedef struct osm_mcast_fwdbl {
>>  *            Number of entries in the table (aka number of MLIDs supported).
>>  *
>>  *    max_mlid_ho
>> -*            Maximum MLID (host order) configured in the multicast port mask
>> +*            Maximum MLID (host order) for the currently allocated multicast
>> +*            port mask table.
>> +*
>> +*    mft_depth
>> +*            Number of MLIDs in the currently allocated multicast port mask
>>  *            table.
>>  *
>> -*    pp_mask_tbl
>> +*    p_mask_tbl
>>  *            Pointer to a two dimensional array of port_masks for this 
>> switch.
>>  *            The first dimension is MLID, the second dimension is mask 
>> position.
>>  *            This pointer is null for switches that do not support 
>> multicast.
>> @@ -116,8 +121,8 @@ typedef struct osm_mcast_fwdbl {
>>  *
>>  * SYNOPSIS
>>  */
>> -ib_api_status_t osm_mcast_tbl_init(IN osm_mcast_tbl_t * p_tbl,
>> -                                IN uint8_t num_ports, IN uint16_t capacity);
>> +void osm_mcast_tbl_init(IN osm_mcast_tbl_t * p_tbl, IN uint8_t num_ports,
>> +                     IN uint16_t capacity);
>>  /*
>>  * PARAMETERS
>>  *    num_ports
>> @@ -128,7 +133,7 @@ ib_api_status_t osm_mcast_tbl_init(IN osm_mcast_tbl_t * 
>> p_tbl,
>>  *            by this switch.
>>  *
>>  * RETURN VALUE
>> -*    IB_SUCCESS on success.
>> +*    None.
>>  *
>>  * NOTES
>>  *
>> @@ -160,6 +165,34 @@ void osm_mcast_tbl_delete(IN osm_mcast_tbl_t ** pp_tbl);
>>  * SEE ALSO
>>  *********/
>>
>> +/****f* OpenSM: Forwarding Table/osm_mcast_tbl_realloc
>> +* NAME
>> +*    osm_mcast_tbl_realloc
>> +*
>> +* DESCRIPTION
>> +*    This function reallocates the multicast port mask table if necessary.
>> +*
>> +* SYNOPSIS
>> +*/
>> +int
>> +osm_mcast_tbl_realloc(IN osm_mcast_tbl_t * p_tbl, IN uintn_t mlid_offset);
>> +/*
>> +* PARAMETERS
>> +*
>> +*    p_tbl
>> +*            [in] Pointer to the Multicast Forwarding Table object.
>> +*
>> +*    mlid_offset
>> +*            [in] Offset of MLID being accessed.
>> +*
>> +* RETURN VALUE
>> +*    Returns 0 on success and non-zero value otherwise.
>> +*
>> +* NOTES
>> +*
>> +* SEE ALSO
>> +*/
>> +
>>  /****f* OpenSM: Forwarding Table/osm_mcast_tbl_destroy
>>  * NAME
>>  *    osm_mcast_tbl_destroy
>> diff --git a/opensm/opensm/osm_mcast_mgr.c b/opensm/opensm/osm_mcast_mgr.c
>> index 0ee689c..a5e9758 100644
>> --- a/opensm/opensm/osm_mcast_mgr.c
>> +++ b/opensm/opensm/osm_mcast_mgr.c
>> @@ -1043,6 +1043,36 @@ static int mcast_mgr_set_mftables(osm_sm_t * sm)
>>       return ret;
>>  }
>>
>> +static int alloc_mfts(osm_sm_t * sm)
>> +{
>> +     int i;
>> +     cl_map_item_t *item;
>> +     osm_switch_t *p_sw;
>> +     int max_mlid = 0;
>> +
>> +     for (i = sm->p_subn->max_mcast_lid_ho - IB_LID_MCAST_START_HO; i >= 0;
>> +          i--) {
>> +             if (sm->p_subn->mgroups[i]) {
>> +                     max_mlid = i + IB_LID_MCAST_START_HO;
>> +                     break;
>> +             }
>> +     }
>> +
>> +     if (max_mlid == 0)
>> +             return 0;
>> +
>> +     /* Now, walk switches and (re)allocate multicast tables */
>> +     for (item = cl_qmap_head(&sm->p_subn->sw_guid_tbl);
>> +          item != cl_qmap_end(&sm->p_subn->sw_guid_tbl);
>> +          item = cl_qmap_next(item)) {
>> +             p_sw = (osm_switch_t *)item;
>> +             if (osm_mcast_tbl_realloc(&p_sw->mcast_tbl,
>> +                                       max_mlid - IB_LID_MCAST_START_HO))
>> +                     return -1;
>> +     }
>
> A variable 'max_mlid' is not actually needed in this function - you are
> initializing this as 'mlid_max = i + IB_LID_MCAST_START_NO' and then
> using as 'mlid_max - IB_LID_MCAST_START_HO'. Instead you could just use
> 'i' as it is.
>
>> +     return 0;
>> +}
>> +
>>  /**********************************************************************
>>   **********************************************************************/
>>  int osm_mcast_mgr_process(osm_sm_t * sm)
>> @@ -1063,6 +1093,12 @@ int osm_mcast_mgr_process(osm_sm_t * sm)
>>               goto exit;
>>       }
>>
>> +     if (alloc_mfts(sm)) {
>> +             OSM_LOG(sm->p_log, OSM_LOG_ERROR,
>> +                     "ERR 0A07: alloc_mfts failed\n");
>> +             goto exit;
>> +     }
>> +
>>       for (i = 0; i <= sm->p_subn->max_mcast_lid_ho - IB_LID_MCAST_START_HO;
>>            i++)
>>               if (sm->p_subn->mgroups[i] || sm->mlids_req[i])
>> @@ -1101,6 +1137,12 @@ int osm_mcast_mgr_process_mgroups(osm_sm_t * sm)
>>               goto exit;
>>       }
>>
>> +     if (alloc_mfts(sm)) {
>> +             OSM_LOG(sm->p_log, OSM_LOG_ERROR,
>> +                     "ERR 0A09: alloc_mfts failed\n");
>> +             goto exit;
>> +     }
>> +
>>       for (i = 0; i <= sm->mlids_req_max; i++) {
>>               if (!sm->mlids_req[i])
>>                       continue;
>> diff --git a/opensm/opensm/osm_mcast_tbl.c b/opensm/opensm/osm_mcast_tbl.c
>> index bdea416..be2181d 100644
>> --- a/opensm/opensm/osm_mcast_tbl.c
>> +++ b/opensm/opensm/osm_mcast_tbl.c
>> @@ -1,6 +1,6 @@
>>  /*
>>   * Copyright (c) 2004-2006 Voltaire, Inc. All rights reserved.
>> - * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved.
>> + * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
>>   * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
>>   * Copyright (c) 2009 HNR Consulting. All rights reserved.
>>   *
>> @@ -53,8 +53,8 @@
>>
>>  /**********************************************************************
>>   **********************************************************************/
>> -ib_api_status_t osm_mcast_tbl_init(IN osm_mcast_tbl_t * p_tbl,
>> -                                IN uint8_t num_ports, IN uint16_t capacity)
>> +void osm_mcast_tbl_init(IN osm_mcast_tbl_t * p_tbl, IN uint8_t num_ports,
>> +                     IN uint16_t capacity)
>>  {
>>       CL_ASSERT(p_tbl);
>>       CL_ASSERT(num_ports);
>> @@ -68,7 +68,7 @@ ib_api_status_t osm_mcast_tbl_init(IN osm_mcast_tbl_t * 
>> p_tbl,
>>                  This switch apparently doesn't support multicast.
>>                  Everything is initialized to zero already, so return.
>>                */
>> -             return IB_SUCCESS;
>> +             return;
>>       }
>>
>>       p_tbl->num_entries = capacity;
>> @@ -80,25 +80,6 @@ ib_api_status_t osm_mcast_tbl_init(IN osm_mcast_tbl_t * 
>> p_tbl,
>>       p_tbl->max_block = (uint16_t) ((ROUNDUP(p_tbl->num_entries,
>>                                               IB_MCAST_BLOCK_SIZE) /
>>                                       IB_MCAST_BLOCK_SIZE) - 1);
>> -
>> -     /*
>> -        The number of bytes needed in the mask table is:
>> -        The (maximum bit mask 'position' + 1) times the
>> -        number of bytes in each bit mask times the
>> -        number of MLIDs supported by the table.
>> -
>> -        We must always allocate the array with the maximum position
>> -        since it is (and must be) defined that way the table structure
>> -        in order to create a pointer to a two dimensional array.
>> -      */
>> -     p_tbl->p_mask_tbl = calloc(p_tbl->num_entries,
>> -                                (IB_MCAST_POSITION_MAX +
>> -                                 1) * IB_MCAST_MASK_SIZE / 8);
>> -
>> -     if (p_tbl->p_mask_tbl == NULL)
>> -             return IB_INSUFFICIENT_MEMORY;
>> -
>> -     return IB_SUCCESS;
>>  }
>>
>>  /**********************************************************************
>> @@ -120,8 +101,8 @@ void osm_mcast_tbl_set(IN osm_mcast_tbl_t * p_tbl, IN 
>> uint16_t mlid_ho,
>>
>>       CL_ASSERT(p_tbl);
>>       CL_ASSERT(mlid_ho >= IB_LID_MCAST_START_HO);
>> -     CL_ASSERT(mlid_ho <= (uint16_t) (IB_LID_MCAST_START_HO +
>> -                                      p_tbl->num_entries - 1));
>> +     CL_ASSERT(mlid_ho <= p_tbl->max_mlid_ho);
>> +     CL_ASSERT(mlid_ho - IB_LID_MCAST_START_HO < p_tbl->mft_depth);
>
>
> Isn't it the same check since
> 'p_tbl->max_mlid_ho = p_tbl->mft_depth + IB_LID_MCAST_START_HO - 1'?
>
>>       CL_ASSERT(p_tbl->p_mask_tbl);
>>
>>       mlid_offset = mlid_ho - IB_LID_MCAST_START_HO;
>> @@ -133,8 +114,41 @@ void osm_mcast_tbl_set(IN osm_mcast_tbl_t * p_tbl, IN 
>> uint16_t mlid_ho,
>>
>>       if (block_num > p_tbl->max_block_in_use)
>>               p_tbl->max_block_in_use = (uint16_t) block_num;
>> -     if (mlid_ho > p_tbl->max_mlid_ho)
>> -             p_tbl->max_mlid_ho = mlid_ho;
>> +}
>> +
>> +/**********************************************************************
>> + **********************************************************************/
>> +int
>> +osm_mcast_tbl_realloc(IN osm_mcast_tbl_t * p_tbl, IN uintn_t mlid_offset)
>> +{
>> +     size_t mft_depth, size;
>> +     uint16_t (*p_mask_tbl)[][IB_MCAST_POSITION_MAX];
>> +
>> +     if (mlid_offset < p_tbl->mft_depth)
>> +             return 0;
>> +
>> +     /*
>> +        The number of bytes needed in the mask table is:
>> +        The (maximum bit mask 'position' + 1) times the
>> +        number of bytes in each bit mask times the
>> +        number of MLIDs supported by the table.
>> +
>> +        We must always allocate the array with the maximum position
>> +        since it is (and must be) defined that way the table structure
>> +        in order to create a pointer to a two dimensional array.
>> +      */
>> +     mft_depth = (mlid_offset / IB_MCAST_BLOCK_SIZE + 1) * 
>> IB_MCAST_BLOCK_SIZE;
>> +     size = mft_depth * (IB_MCAST_POSITION_MAX + 1) * IB_MCAST_MASK_SIZE / 
>> 8;
>> +     p_mask_tbl = realloc(p_tbl->p_mask_tbl, size);
>> +     if (!p_mask_tbl)
>> +             return -1;
>> +     memset((uint8_t *)p_mask_tbl + p_tbl->mft_depth * 
>> (IB_MCAST_POSITION_MAX + 1) * IB_MCAST_MASK_SIZE / 8,
>> +            0,
>> +            size - p_tbl->mft_depth * (IB_MCAST_POSITION_MAX + 1) * 
>> IB_MCAST_MASK_SIZE / 8);
>> +     p_tbl->p_mask_tbl = p_mask_tbl;
>> +     p_tbl->mft_depth = mft_depth;
>> +     p_tbl->max_mlid_ho = mft_depth + IB_LID_MCAST_START_HO - 1;
>
> Wouldn't it be more accurate/efficient to set max_mlid_ho as
> 'mlid_offset + IB_LID_MCAST_START_HO - 1'?
>
>> +     return 0;
>>  }
>>
>>  /**********************************************************************
>> @@ -152,10 +166,11 @@ boolean_t osm_mcast_tbl_is_port(IN const 
>> osm_mcast_tbl_t * p_tbl,
>>               CL_ASSERT(port_num <=
>>                         (p_tbl->max_position + 1) * IB_MCAST_MASK_SIZE);
>>               CL_ASSERT(mlid_ho >= IB_LID_MCAST_START_HO);
>> -             CL_ASSERT(mlid_ho <= (uint16_t) (IB_LID_MCAST_START_HO +
>> -                                              p_tbl->num_entries - 1));
>> +             CL_ASSERT(mlid_ho <= p_tbl->max_mlid_ho);
>>
>>               mlid_offset = mlid_ho - IB_LID_MCAST_START_HO;
>> +             if (mlid_offset >= p_tbl->mft_depth)
>> +                     return FALSE;
>
> This duplicates CL_ASSERT() above.

Not quite. There's a difference between max_mlid_ho and mft_depth.

> Looking on how this function is used
> I don't see why we this check should be introduced. Do you?

It's needed for group removal to work properly.


>>               mask_offset = port_num / IB_MCAST_MASK_SIZE;
>>               bit_mask = cl_ntoh16((uint16_t)
>>                                    (1 << (port_num % IB_MCAST_MASK_SIZE)));
>> @@ -180,10 +195,11 @@ boolean_t osm_mcast_tbl_is_any_port(IN const 
>> osm_mcast_tbl_t * p_tbl,
>>
>>       if (p_tbl->p_mask_tbl) {
>>               CL_ASSERT(mlid_ho >= IB_LID_MCAST_START_HO);
>> -             CL_ASSERT(mlid_ho <= (uint16_t) (IB_LID_MCAST_START_HO +
>> -                                              p_tbl->num_entries - 1));
>> +             CL_ASSERT(mlid_ho <= p_tbl->max_mlid_ho);
>>
>>               mlid_offset = mlid_ho - IB_LID_MCAST_START_HO;
>> +             if (mlid_offset >= p_tbl->mft_depth)
>> +                     return FALSE;
>
> Ditto.

See above.

>>
>>               for (position = 0; position <= p_tbl->max_position; position++)
>>                       result |= (*p_tbl->p_mask_tbl)[mlid_offset][position];
>> @@ -213,8 +229,7 @@ ib_api_status_t osm_mcast_tbl_set_block(IN 
>> osm_mcast_tbl_t * p_tbl,
>>
>>       mlid_start_ho = (uint16_t) (block_num * IB_MCAST_BLOCK_SIZE);
>>
>> -     if (mlid_start_ho + IB_MCAST_BLOCK_SIZE - 1 >
>> -         p_tbl->num_entries + IB_LID_MCAST_START_HO - 1)
>> +     if (mlid_start_ho + IB_MCAST_BLOCK_SIZE - 1 > p_tbl->mft_depth)
>>               return IB_INVALID_PARAMETER;
>
> I see that 'mlid_start_ho' is actually a mlid offset. Was a previous
> check wrong?

This check ensures the whole MFT block (in terms of SM attribute use)
is allocated.
It isn't needed as the table allocation is done in these units. Prior
to this, I think there was a potential boundary condition so it was
needed.

>>
>>       for (i = 0; i < IB_MCAST_BLOCK_SIZE; i++)
>> @@ -223,9 +238,6 @@ ib_api_status_t osm_mcast_tbl_set_block(IN 
>> osm_mcast_tbl_t * p_tbl,
>>       if (block_num > p_tbl->max_block_in_use)
>>               p_tbl->max_block_in_use = (uint16_t) block_num;
>>
>> -     if (mlid_start_ho + IB_MCAST_BLOCK_SIZE - 1 > p_tbl->max_mlid_ho)
>> -             p_tbl->max_mlid_ho = mlid_start_ho + IB_MCAST_BLOCK_SIZE - 1;
>> -
>>       return IB_SUCCESS;
>>  }
>>
>> @@ -241,6 +253,8 @@ void osm_mcast_tbl_clear_mlid(IN osm_mcast_tbl_t * 
>> p_tbl, IN uint16_t mlid_ho)
>>
>>       if (p_tbl->p_mask_tbl && (mlid_ho <= p_tbl->max_mlid_ho)) {
>>               mlid_offset = mlid_ho - IB_LID_MCAST_START_HO;
>> +             if (mlid_offset >= p_tbl->mft_depth)
>> +                     return;
>
> This seems redundant for me after 'mlid_ho <= p_tbl->max_mlid_ho' check
> above.

See above.

-- Hal

> Sasha
>
>>               for (i = 0; i <= p_tbl->max_position; i++)
>>                       (*p_tbl->p_mask_tbl)[mlid_offset][i] = 0;
>>       }
>> @@ -257,6 +271,7 @@ boolean_t osm_mcast_tbl_get_block(IN osm_mcast_tbl_t * 
>> p_tbl,
>>
>>       CL_ASSERT(p_tbl);
>>       CL_ASSERT(p_block);
>> +     CL_ASSERT(block_num * IB_MCAST_BLOCK_SIZE <= p_tbl->mft_depth);
>>
>>       if (block_num > p_tbl->max_block_in_use)
>>               return FALSE;
>> diff --git a/opensm/opensm/osm_switch.c b/opensm/opensm/osm_switch.c
>> index d2123a4..c912b03 100644
>> --- a/opensm/opensm/osm_switch.c
>> +++ b/opensm/opensm/osm_switch.c
>> @@ -136,9 +136,8 @@ osm_switch_t *osm_switch_new(IN osm_node_t * p_node,
>>
>>       memset(p_sw->p_prof, 0, sizeof(*p_sw->p_prof) * num_ports);
>>
>> -     if (osm_mcast_tbl_init(&p_sw->mcast_tbl, 
>> osm_node_get_num_physp(p_node),
>> -                            cl_ntoh16(p_si->mcast_cap)))
>> -             goto err;
>> +     osm_mcast_tbl_init(&p_sw->mcast_tbl, osm_node_get_num_physp(p_node),
>> +                        cl_ntoh16(p_si->mcast_cap));
>>
>>       for (port_num = 0; port_num < num_ports; port_num++)
>>               osm_port_prof_construct(&p_sw->p_prof[port_num]);
>> @@ -507,7 +506,6 @@ static int alloc_lft(IN osm_switch_t * p_sw, uint16_t 
>> lids)
>>               p_sw->lft = new_lft;
>>               p_sw->lft_size = lft_size;
>>       }
>> -
>>       return 0;
>>  }
>>
>> @@ -548,7 +546,6 @@ int osm_switch_prepare_path_rebuild(IN osm_switch_t * 
>> p_sw, IN uint16_t max_lids
>>               p_sw->num_hops = max_lids + 1;
>>       }
>>       p_sw->max_lid_ho = max_lids;
>> -
>>       return 0;
>>  }
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>> the body of a message to [email protected]
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to