On 02/13/2018 09:07 AM, Stokes, Ian wrote:
> Hi all,
> 
> Just pinging for feedback for this patch. If we are to include this for 
> branch 2.9 as a bug fix I think we need feedback quite soon.
> 
> Have people had time to validate and review?
> 

Hi Ian,

As per the other thread I have reviewed and tested multiple ports with
MTU reconfigs - creating new mempools, freeing unused mempools, ports
sharing mempools, ports having separate mempools, numa based reconfig
from vhost etc. All working fine, so LGTM.

Are you going to send a non-rfc version of this patch?

thanks,
Kevin.

> Thanks
> Ian
> 
>> -----Original Message-----
>> From: Stokes, Ian
>> Sent: Thursday, February 8, 2018 11:39 AM
>> To: d...@openvswitch.org
>> Cc: Stokes, Ian <ian.sto...@intel.com>; antonio.fische...@gmail.com; Ilya
>> Maximets <i.maxim...@samsung.com>; Kevin Traynor <ktray...@redhat.com>;
>> Jan Scheurich <jan.scheur...@ericsson.com>
>> Subject: [RFC Patch v1] netdev-dpdk: Reintroduce shared mempools.
>>
>> This commit manually reverts the current per port mempool model to the
>> previous shared mempool model for DPDK ports.
>>
>> OVS previously used a shared mempool model for ports with the same MTU
>> configuration. This was replaced by a per port mempool model to address
>> issues flagged by users such as:
>>
>> https://mail.openvswitch.org/pipermail/ovs-discuss/2016-
>> September/042560.html
>>
>> However the per port model has a number of issues including:
>>
>> 1. Requires an increase in memory resource requirements to support the
>> same number of ports as the shared port model.
>> 2. Incorrect algorithm for mbuf provisioning for each mempool.
>>
>> These are considered blocking factors for current deployments of OVS when
>> upgrading to OVS 2.9 as a memory redimensioning will be required. This may
>> not be possible for users.
>>
>> For clarity, the commits whose changes are removed include the
>> following:
>>
>> netdev-dpdk: Create separate memory pool for each port: d555d9b
>> netdev-dpdk: fix management of pre-existing mempools: b6b26021d Fix
>> mempool names to reflect socket id: f06546a
>> netdev-dpdk: skip init for existing mempools: 837c176
>> netdev-dpdk: manage failure in mempool name creation: 65056fd
>> netdev-dpdk: Reword mp_size as n_mbufs: ad9b5b9
>> netdev-dpdk: Rename dpdk_mp_put as dpdk_mp_free: a08a115
>> netdev-dpdk: Fix mp_name leak on snprintf failure: ec6edc8
>> netdev-dpdk: Fix dpdk_mp leak in case of EEXIST: 173ef76
>> netdev-dpdk: Factor out struct dpdk_mp: 24e78f9
>> netdev-dpdk: Remove unused MAX_NB_MBUF: bc57ed9
>> netdev-dpdk: Fix mempool creation with large MTU: af5b0da
>>
>> Due to the number of commits and period of time they were introduced over,
>> a simple revert was not possible. All code from the commits above is
>> removed and the shared mempool code reintroduced as it was before its
>> replacement.
>>
>> Code introduced by commit
>>
>> netdev-dpdk: Add debug appctl to get mempool information: be48173
>>
>> has been modified to work with the shared mempool model.
>>
>> Cc: antonio.fische...@gmail.com
>> Cc: Ilya Maximets <i.maxim...@samsung.com>
>> Cc: Kevin Traynor <ktray...@redhat.com>
>> Cc: Jan Scheurich <jan.scheur...@ericsson.com>
>> Signed-off-by: Ian Stokes <ian.sto...@intel.com>
>> ---
>>  lib/netdev-dpdk.c | 246 ++++++++++++++++++++++++++++++-------------------
>> -----
>>  1 file changed, 138 insertions(+), 108 deletions(-)
>>
>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 94fb163..6f3378b
>> 100644
>> --- a/lib/netdev-dpdk.c
>> +++ b/lib/netdev-dpdk.c
>> @@ -91,13 +91,24 @@ static struct vlog_rate_limit rl =
>> VLOG_RATE_LIMIT_INIT(5, 20);
>>  #define NETDEV_DPDK_MBUF_ALIGN      1024
>>  #define NETDEV_DPDK_MAX_PKT_LEN     9728
>>
>> -/* Min number of packets in the mempool.  OVS tries to allocate a mempool
>> with
>> - * roughly estimated number of mbufs: if this fails (because the system
>> doesn't
>> - * have enough hugepages) we keep halving the number until the allocation
>> - * succeeds or we reach MIN_NB_MBUF */
>> +/* Max and min number of packets in the mempool.  OVS tries to allocate
>> +a
>> + * mempool with MAX_NB_MBUF: if this fails (because the system doesn't
>> +have
>> + * enough hugepages) we keep halving the number until the allocation
>> +succeeds
>> + * or we reach MIN_NB_MBUF */
>> +
>> +#define MAX_NB_MBUF          (4096 * 64)
>>  #define MIN_NB_MBUF          (4096 * 4)
>>  #define MP_CACHE_SZ          RTE_MEMPOOL_CACHE_MAX_SIZE
>>
>> +/* MAX_NB_MBUF can be divided by 2 many times, until MIN_NB_MBUF */
>> +BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF /
>> MIN_NB_MBUF)
>> +                  == 0);
>> +
>> +/* The smallest possible NB_MBUF that we're going to try should be a
>> +multiple
>> + * of MP_CACHE_SZ. This is advised by DPDK documentation. */
>> +BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF /
>> MIN_NB_MBUF))
>> +                  % MP_CACHE_SZ == 0);
>> +
>>  /*
>>   * DPDK XSTATS Counter names definition
>>   */
>> @@ -295,6 +306,19 @@ static struct ovs_list dpdk_list
>> OVS_GUARDED_BY(dpdk_mutex)  static struct ovs_mutex dpdk_mp_mutex
>> OVS_ACQ_AFTER(dpdk_mutex)
>>      = OVS_MUTEX_INITIALIZER;
>>
>> +static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mp_mutex)
>> +    = OVS_LIST_INITIALIZER(&dpdk_mp_list);
>> +
>> +
>> +struct dpdk_mp {
>> +     struct rte_mempool *mp;
>> +     int mtu;
>> +     int socket_id;
>> +     int refcount;
>> +     struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex);  };
>> +
>> +
>>  /* There should be one 'struct dpdk_tx_queue' created for
>>   * each cpu core. */
>>  struct dpdk_tx_queue {
>> @@ -371,7 +395,7 @@ struct netdev_dpdk {
>>
>>      PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
>>          struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
>> -        struct rte_mempool *mp;
>> +        struct dpdk_mp *dpdk_mp;
>>
>>          /* virtio identifier for vhost devices */
>>          ovsrcu_index vid;
>> @@ -510,133 +534,132 @@ ovs_rte_pktmbuf_init(struct rte_mempool *mp
>> OVS_UNUSED,
>>      dp_packet_init_dpdk((struct dp_packet *) pkt, pkt->buf_len);  }
>>
>> -/* Returns a valid pointer when either of the following is true:
>> - *  - a new mempool was just created;
>> - *  - a matching mempool already exists. */ -static struct rte_mempool *
>> -dpdk_mp_create(struct netdev_dpdk *dev, int mtu) -{
>> -    char mp_name[RTE_MEMPOOL_NAMESIZE];
>> -    const char *netdev_name = netdev_get_name(&dev->up);
>> -    int socket_id = dev->requested_socket_id;
>> -    uint32_t n_mbufs;
>> -    uint32_t hash = hash_string(netdev_name, 0);
>> -    struct rte_mempool *mp = NULL;
>> -
>> -    /*
>> -     * XXX: rough estimation of number of mbufs required for this port:
>> -     * <packets required to fill the device rxqs>
>> -     * + <packets that could be stuck on other ports txqs>
>> -     * + <packets in the pmd threads>
>> -     * + <additional memory for corner cases>
>> +static struct dpdk_mp *
>> +dpdk_mp_create(int socket_id, int mtu)
>> +{
>> +    struct dpdk_mp *dmp;
>> +    unsigned mp_size;
>> +    char *mp_name;
>> +
>> +    dmp = dpdk_rte_mzalloc(sizeof *dmp);
>> +    if (!dmp) {
>> +        return NULL;
>> +    }
>> +    dmp->socket_id = socket_id;
>> +    dmp->mtu = mtu;
>> +    dmp->refcount = 1;
>> +    /* XXX: this is a really rough method of provisioning memory.
>> +     * It's impossible to determine what the exact memory requirements
>> are
>> +     * when the number of ports and rxqs that utilize a particular
>> mempool can
>> +     * change dynamically at runtime. For now, use this rough heurisitic.
>>       */
>> -    n_mbufs = dev->requested_n_rxq * dev->requested_rxq_size
>> -              + dev->requested_n_txq * dev->requested_txq_size
>> -              + MIN(RTE_MAX_LCORE, dev->requested_n_rxq) *
>> NETDEV_MAX_BURST
>> -              + MIN_NB_MBUF;
>> +    if (mtu >= ETHER_MTU) {
>> +        mp_size = MAX_NB_MBUF;
>> +    } else {
>> +        mp_size = MIN_NB_MBUF;
>> +    }
>>
>> -    ovs_mutex_lock(&dpdk_mp_mutex);
>>      do {
>> -        /* Full DPDK memory pool name must be unique and cannot be
>> -         * longer than RTE_MEMPOOL_NAMESIZE. */
>> -        int ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE,
>> -                           "ovs%08x%02d%05d%07u",
>> -                           hash, socket_id, mtu, n_mbufs);
>> -        if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) {
>> -            VLOG_DBG("snprintf returned %d. "
>> -                     "Failed to generate a mempool name for \"%s\". "
>> -                     "Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.",
>> -                     ret, netdev_name, hash, socket_id, mtu, n_mbufs);
>> -            break;
>> +        mp_name = xasprintf("ovs_mp_%d_%d_%u", dmp->mtu, dmp->socket_id,
>> +                            mp_size);
>> +
>> +        dmp->mp = rte_pktmbuf_pool_create(mp_name, mp_size,
>> +                                          MP_CACHE_SZ,
>> +                                          sizeof (struct dp_packet)
>> +                                          - sizeof (struct rte_mbuf),
>> +                                          MBUF_SIZE(mtu)
>> +                                          - sizeof(struct dp_packet),
>> +                                          socket_id);
>> +        if (dmp->mp) {
>> +            VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
>> +                     mp_name, mp_size);
>>          }
>> +        free(mp_name);
>> +        if (dmp->mp) {
>> +            /* rte_pktmbuf_pool_create has done some initialization of
>> the
>> +             * rte_mbuf part of each dp_packet, while
>> ovs_rte_pktmbuf_init
>> +             * initializes some OVS specific fields of dp_packet.
>> +             */
>> +            rte_mempool_obj_iter(dmp->mp, ovs_rte_pktmbuf_init, NULL);
>> +            return dmp;
>> +        }
>> +    } while (rte_errno == ENOMEM && (mp_size /= 2) >= MIN_NB_MBUF);
>>
>> -        VLOG_DBG("Port %s: Requesting a mempool of %u mbufs "
>> -                  "on socket %d for %d Rx and %d Tx queues.",
>> -                  netdev_name, n_mbufs, socket_id,
>> -                  dev->requested_n_rxq, dev->requested_n_txq);
>> +    rte_free(dmp);
>> +    return NULL;
>> +}
>>
>> -        mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ,
>> -                 sizeof (struct dp_packet) - sizeof (struct rte_mbuf),
>> -                 MBUF_SIZE(mtu) - sizeof(struct dp_packet), socket_id);
>> +static struct dpdk_mp *
>> +dpdk_mp_get(int socket_id, int mtu)
>> +{
>> +    struct dpdk_mp *dmp;
>>
>> -        if (mp) {
>> -            VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
>> -                     mp_name, n_mbufs);
>> -            /* rte_pktmbuf_pool_create has done some initialization of
>> the
>> -             * rte_mbuf part of each dp_packet. Some OvS specific fields
>> -             * of the packet still need to be initialized by
>> -             * ovs_rte_pktmbuf_init. */
>> -            rte_mempool_obj_iter(mp, ovs_rte_pktmbuf_init, NULL);
>> -        } else if (rte_errno == EEXIST) {
>> -            /* A mempool with the same name already exists.  We just
>> -             * retrieve its pointer to be returned to the caller. */
>> -            mp = rte_mempool_lookup(mp_name);
>> -            /* As the mempool create returned EEXIST we can expect the
>> -             * lookup has returned a valid pointer.  If for some reason
>> -             * that's not the case we keep track of it. */
>> -            VLOG_DBG("A mempool with name \"%s\" already exists at %p.",
>> -                     mp_name, mp);
>> -        } else {
>> -            VLOG_ERR("Failed mempool \"%s\" create request of %u mbufs",
>> -                     mp_name, n_mbufs);
>> +    ovs_mutex_lock(&dpdk_mp_mutex);
>> +    LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
>> +        if (dmp->socket_id == socket_id && dmp->mtu == mtu) {
>> +            dmp->refcount++;
>> +            goto out;
>>          }
>> -    } while (!mp && rte_errno == ENOMEM && (n_mbufs /= 2) >=
>> MIN_NB_MBUF);
>> +    }
>> +
>> +    dmp = dpdk_mp_create(socket_id, mtu);
>> +    if (dmp) {
>> +        ovs_list_push_back(&dpdk_mp_list, &dmp->list_node);
>> +    }
>> +
>> +out:
>>
>>      ovs_mutex_unlock(&dpdk_mp_mutex);
>> -    return mp;
>> +
>> +    return dmp;
>>  }
>>
>>  /* Release an existing mempool. */
>>  static void
>> -dpdk_mp_free(struct rte_mempool *mp)
>> +dpdk_mp_put(struct dpdk_mp *dmp)
>>  {
>> -    if (!mp) {
>> +    if (!dmp) {
>>          return;
>>      }
>>
>>      ovs_mutex_lock(&dpdk_mp_mutex);
>> -    VLOG_DBG("Releasing \"%s\" mempool", mp->name);
>> -    rte_mempool_free(mp);
>> +    ovs_assert(dmp->refcount);
>> +
>> +    if (! --dmp->refcount) {
>> +        ovs_list_remove(&dmp->list_node);
>> +        rte_mempool_free(dmp->mp);
>> +        rte_free(dmp);
>> +     }
>>      ovs_mutex_unlock(&dpdk_mp_mutex);
>>  }
>>
>> -/* Tries to allocate a new mempool - or re-use an existing one where
>> - * appropriate - on requested_socket_id with a size determined by
>> - * requested_mtu and requested Rx/Tx queues.
>> - * On success - or when re-using an existing mempool - the new
>> configuration
>> - * will be applied.
>> +/* Tries to allocate new mempool on requested_socket_id with
>> + * mbuf size corresponding to requested_mtu.
>> + * On success new configuration will be applied.
>>   * On error, device will be left unchanged. */  static int
>> netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
>>      OVS_REQUIRES(dev->mutex)
>>  {
>>      uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
>> -    struct rte_mempool *mp;
>> -    int ret = 0;
>> +    struct dpdk_mp *mp;
>>
>> -    mp = dpdk_mp_create(dev, FRAME_LEN_TO_MTU(buf_size));
>> +    mp = dpdk_mp_get(dev->requested_socket_id,
>> + FRAME_LEN_TO_MTU(buf_size));
>>      if (!mp) {
>>          VLOG_ERR("Failed to create memory pool for netdev "
>>                   "%s, with MTU %d on socket %d: %s\n",
>>                   dev->up.name, dev->requested_mtu, dev-
>>> requested_socket_id,
>> -                 rte_strerror(rte_errno));
>> -        ret = rte_errno;
>> +        rte_strerror(rte_errno));
>> +        return rte_errno;
>>      } else {
>> -        /* If a new MTU was requested and its rounded value equals the
>> one
>> -         * that is currently used, then the existing mempool is returned.
>> */
>> -        if (dev->mp != mp) {
>> -            /* A new mempool was created, release the previous one. */
>> -            dpdk_mp_free(dev->mp);
>> -        } else {
>> -            ret = EEXIST;
>> -        }
>> -        dev->mp = mp;
>> +        dpdk_mp_put(dev->dpdk_mp);
>> +        dev->dpdk_mp = mp;
>>          dev->mtu = dev->requested_mtu;
>>          dev->socket_id = dev->requested_socket_id;
>>          dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
>>      }
>>
>> -    return ret;
>> +    return 0;
>>  }
>>
>>  static void
>> @@ -742,7 +765,8 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int
>> n_rxq, int n_txq)
>>
>>          for (i = 0; i < n_rxq; i++) {
>>              diag = rte_eth_rx_queue_setup(dev->port_id, i, dev->rxq_size,
>> -                                          dev->socket_id, NULL, dev->mp);
>> +                                          dev->socket_id, NULL,
>> +                                          dev->dpdk_mp->mp);
>>              if (diag) {
>>                  VLOG_INFO("Interface %s rxq(%d) setup error: %s",
>>                            dev->up.name, i, rte_strerror(-diag)); @@ -
>> 826,7 +850,7 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>>      memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
>>      rte_eth_link_get_nowait(dev->port_id, &dev->link);
>>
>> -    mbp_priv = rte_mempool_get_priv(dev->mp);
>> +    mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
>>      dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;
>>
>>      /* Get the Flow control configuration for DPDK-ETH */ @@ -1079,7
>> +1103,7 @@ common_destruct(struct netdev_dpdk *dev)
>>      OVS_EXCLUDED(dev->mutex)
>>  {
>>      rte_free(dev->tx_q);
>> -    dpdk_mp_free(dev->mp);
>> +    dpdk_mp_put(dev->dpdk_mp);
>>
>>      ovs_list_remove(&dev->list_node);
>>      free(ovsrcu_get_protected(struct ingress_policer *, @@ -1823,7
>> +1847,7 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
>>      }
>>
>>      nb_rx = rte_vhost_dequeue_burst(vid, qid * VIRTIO_QNUM + VIRTIO_TXQ,
>> -                                    dev->mp,
>> +                                    dev->dpdk_mp->mp,
>>                                      (struct rte_mbuf **) batch->packets,
>>                                      NETDEV_MAX_BURST);
>>      if (!nb_rx) {
>> @@ -2043,7 +2067,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid,
>> struct dp_packet_batch *batch)
>>              continue;
>>          }
>>
>> -        pkts[txcnt] = rte_pktmbuf_alloc(dev->mp);
>> +        pkts[txcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
>>          if (OVS_UNLIKELY(!pkts[txcnt])) {
>>              dropped += cnt - i;
>>              break;
>> @@ -2919,7 +2943,7 @@ netdev_dpdk_get_mempool_info(struct unixctl_conn
>> *conn,
>>          ovs_mutex_lock(&dev->mutex);
>>          ovs_mutex_lock(&dpdk_mp_mutex);
>>
>> -        rte_mempool_dump(stream, dev->mp);
>> +        rte_mempool_dump(stream, dev->dpdk_mp->mp);
>>
>>          ovs_mutex_unlock(&dpdk_mp_mutex);
>>          ovs_mutex_unlock(&dev->mutex);
>> @@ -3556,9 +3580,12 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
>>
>>      rte_eth_dev_stop(dev->port_id);
>>
>> -    err = netdev_dpdk_mempool_configure(dev);
>> -    if (err && err != EEXIST) {
>> -        goto out;
>> +    if (dev->mtu != dev->requested_mtu
>> +        || dev->socket_id != dev->requested_socket_id) {
>> +        err = netdev_dpdk_mempool_configure(dev);
>> +        if (err) {
>> +            goto out;
>> +        }
>>      }
>>
>>      netdev->n_txq = dev->requested_n_txq; @@ -3596,13 +3623,16 @@
>> dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
>>
>>      netdev_dpdk_remap_txqs(dev);
>>
>> -    err = netdev_dpdk_mempool_configure(dev);
>> -    if (!err) {
>> -        /* A new mempool was created. */
>> -        netdev_change_seq_changed(&dev->up);
>> -    } else if (err != EEXIST){
>> -        return err;
>> +    if (dev->requested_socket_id != dev->socket_id
>> +        || dev->requested_mtu != dev->mtu) {
>> +        err = netdev_dpdk_mempool_configure(dev);
>> +        if (err) {
>> +            return err;
>> +        } else {
>> +            netdev_change_seq_changed(&dev->up);
>> +        }
>>      }
>> +
>>      if (netdev_dpdk_get_vid(dev) >= 0) {
>>          if (dev->vhost_reconfigured == false) {
>>              dev->vhost_reconfigured = true;
>> --
>> 2.7.5
> 

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to