> -----Original Message-----
> From: Stokes, Ian
> Sent: Wednesday, March 7, 2018 4:59 PM
> To: Yuanhan Liu <y...@fridaylinux.org>; d...@openvswitch.org
> Cc: Finn Christensen <f...@napatech.com>; Darrell Ball <db...@vmware.com>;
> Chandran, Sugesh <sugesh.chand...@intel.com>; Simon Horman
> <simon.hor...@netronome.com>
> Subject: RE: [PATCH v7 1/6] dpif-netdev: associate flow with a mark id
> 

Adding Shahaf Shuler

> > -----Original Message-----
> > From: Yuanhan Liu [mailto:y...@fridaylinux.org]
> > Sent: Monday, January 29, 2018 7:00 AM
> > To: d...@openvswitch.org
> > Cc: Stokes, Ian <ian.sto...@intel.com>; Finn Christensen
> > <f...@napatech.com>; Darrell Ball <db...@vmware.com>; Chandran, Sugesh
> > <sugesh.chand...@intel.com>; Simon Horman
> > <simon.hor...@netronome.com>; Yuanhan Liu <y...@fridaylinux.org>
> > Subject: [PATCH v7 1/6] dpif-netdev: associate flow with a mark id
> >
> > Most modern NICs have the ability to bind a flow with a mark, so that
> > every packet matches such flow will have that mark present in its
> > descriptor.
> >
> > The basic idea of doing that is, when we receives packets later, we
> > could directly get the flow from the mark. That could avoid some very
> > costly CPU operations, including (but not limiting to)
> > miniflow_extract, emc lookup, dpcls lookup, etc. Thus, performance could
> be greatly improved.
> >
> > Thus, the major work of this patch is to associate a flow with a mark
> > id (an uint32_t number). The association in netdev datapath is done by
> > CMAP, while in hardware it's done by the rte_flow MARK action.
> >
> > One tricky thing in OVS-DPDK is, the flow tables is per-PMD. For the
> > case there is only one phys port but with 2 queues, there could be 2
> > PMDs. In other words, even for a single mega flow (i.e.
> > udp,tp_src=1000), there could be 2 different dp_netdev flows, one for
> > each PMD. That could results to the same mega flow being offloaded
> > twice in the hardware, worse, we may get 2 different marks and only the
> last one will work.
> >
> > To avoid that, a megaflow_to_mark CMAP is created. An entry will be
> > added for the first PMD that wants to offload a flow. For later PMDs,
> > it will see such megaflow is already offloaded, then the flow will not
> > be offloaded to HW twice.
> >
> > Meanwhile, the mark to flow mapping becomes to 1:N mapping. That is
> > what the mark_to_flow CMAP is for. When the first PMD wants to offload
> > a flow, it allocates a new mark and performs the flow offload by
> > reusing the -
> > >flow_put method. When it succeeds, a "mark to flow" entry will be
> added.
> > For later PMDs, it will get the corresponding mark by above
> > megaflow_to_mark CMAP. Then, another "mark to flow" entry will be added.
> >
> > Co-authored-by: Finn Christensen <f...@napatech.com>
> > Signed-off-by: Yuanhan Liu <y...@fridaylinux.org>
> > Signed-off-by: Finn Christensen <f...@napatech.com>
> > ---
> >
> > v6: - fixed typos in commit log
> >     - fixed a sparse warning
> >     - used hash_int to compute the hash for mark_to_flow CMAP
> >     - added more comments
> >     - lock before port lookup
> >
> > v5: - fixed check of flow_mark_has_no_ref (renamed from
> >       is_last_flow_mark_reference).
> >       This fixed an issue that it took too long to finish
> >       flow add/removal if we do that repeatdly.
> >
> >     - do mark_to_flow disassociation if flow modification failed
> > ---
> >  lib/dpif-netdev.c | 283
> > ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  lib/netdev.h      |   6 ++
> >  2 files changed, 289 insertions(+)
> >
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index
> > ba62128..a514de8
> > 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -75,6 +75,7 @@
> >  #include "tnl-ports.h"
> >  #include "unixctl.h"
> >  #include "util.h"
> > +#include "uuid.h"
> >
> >  VLOG_DEFINE_THIS_MODULE(dpif_netdev);
> >
> > @@ -430,7 +431,9 @@ struct dp_netdev_flow {
> >      /* Hash table index by unmasked flow. */
> >      const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
> >                                   /* 'flow_table'. */
> > +    const struct cmap_node mark_node; /* In owning flow_mark's
> > + mark_to_flow */
> >      const ovs_u128 ufid;         /* Unique flow identifier. */
> > +    const ovs_u128 mega_ufid;    /* Unique mega flow identifier. */
> >      const unsigned pmd_id;       /* The 'core_id' of pmd thread owning
> > this */
> >                                   /* flow. */
> >
> > @@ -441,6 +444,7 @@ struct dp_netdev_flow {
> >      struct ovs_refcount ref_cnt;
> >
> >      bool dead;
> > +    uint32_t mark;               /* Unique flow mark assigned to a flow
> > */
> >
> >      /* Statistics. */
> >      struct dp_netdev_flow_stats stats; @@ -1837,6 +1841,178 @@
> > dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
> >      return cls;
> >  }
> >
> > +#define MAX_FLOW_MARK       (UINT32_MAX - 1)
> > +#define INVALID_FLOW_MARK   (UINT32_MAX)
> > +
> > +struct megaflow_to_mark_data {
> > +    const struct cmap_node node;
> > +    ovs_u128 mega_ufid;
> > +    uint32_t mark;
> > +};
> > +
> > +struct flow_mark {
> > +    struct cmap megaflow_to_mark;
> > +    struct cmap mark_to_flow;
> > +    struct id_pool *pool;
> > +    struct ovs_mutex mutex;
> > +};
> > +
> > +static struct flow_mark flow_mark = {
> > +    .megaflow_to_mark = CMAP_INITIALIZER,
> > +    .mark_to_flow = CMAP_INITIALIZER,
> > +    .mutex = OVS_MUTEX_INITIALIZER,
> > +};
> > +
> > +static uint32_t
> > +flow_mark_alloc(void)
> > +{
> > +    uint32_t mark;
> > +
> > +    if (!flow_mark.pool) {
> > +        /* Haven't initiated yet, do it here */
> > +        flow_mark.pool = id_pool_create(0, MAX_FLOW_MARK);
> > +    }
> > +
> > +    if (id_pool_alloc_id(flow_mark.pool, &mark)) {
> > +        return mark;
> > +    }
> > +
> > +    return INVALID_FLOW_MARK;
> > +}
> > +
> > +static void
> > +flow_mark_free(uint32_t mark)
> > +{
> > +    id_pool_free_id(flow_mark.pool, mark); }
> > +
> > +/* associate megaflow with a mark, which is a 1:1 mapping */ static
> > +void megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t
> > +mark) {
> > +    size_t hash = dp_netdev_flow_hash(mega_ufid);
> > +    struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
> > +
> > +    data->mega_ufid = *mega_ufid;
> > +    data->mark = mark;
> > +
> > +    cmap_insert(&flow_mark.megaflow_to_mark,
> > +                CONST_CAST(struct cmap_node *, &data->node), hash); }
> > +
> > +/* disassociate meagaflow with a mark */ static void
> > +megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid) {
> > +    size_t hash = dp_netdev_flow_hash(mega_ufid);
> > +    struct megaflow_to_mark_data *data;
> > +
> > +    CMAP_FOR_EACH_WITH_HASH (data, node, hash,
> > &flow_mark.megaflow_to_mark) {
> > +        if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
> > +            cmap_remove(&flow_mark.megaflow_to_mark,
> > +                        CONST_CAST(struct cmap_node *, &data->node),
> > hash);
> > +            free(data);
> > +            return;
> > +        }
> > +    }
> > +
> > +    VLOG_WARN("masked ufid "UUID_FMT" is not associated with a
> mark?\n",
> > +              UUID_ARGS((struct uuid *)mega_ufid)); }
> 
> Minor nit above, Capitalize the first letter the warning message 'Masked'
> to keep consistent with existing code standard.
> There are multiple instances of this for info, debug & warning messages
> throughout the patch series but I will just flag this once here for
> brevity.
> 
> > +
> > +static inline uint32_t
> > +megaflow_to_mark_find(const ovs_u128 *mega_ufid) {
> > +    size_t hash = dp_netdev_flow_hash(mega_ufid);
> > +    struct megaflow_to_mark_data *data;
> > +
> > +    CMAP_FOR_EACH_WITH_HASH (data, node, hash,
> > &flow_mark.megaflow_to_mark) {
> > +        if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
> > +            return data->mark;
> > +        }
> > +    }
> 
> In keeping with other helper methods for the megaflow to mark, it would be
> useful here to add a debug message to report that a mark for this megaflow
> was not found.
> 
> > +
> > +    return INVALID_FLOW_MARK;
> > +}
> > +
> > +/* associate mark with a flow, which is 1:N mapping */ static void
> > +mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow
> > +*flow) {
> > +    dp_netdev_flow_ref(flow);
> > +
> > +    cmap_insert(&flow_mark.mark_to_flow,
> > +                CONST_CAST(struct cmap_node *, &flow->mark_node),
> > +                hash_int(mark, 0));
> > +    flow->mark = mark;
> > +
> > +    VLOG_DBG("associated dp_netdev flow %p with mark %u\n", flow,
> > +mark); }
> > +
> > +static bool
> > +flow_mark_has_no_ref(uint32_t mark)
> > +{
> > +    struct dp_netdev_flow *flow;
> > +
> > +    CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
> > +                             &flow_mark.mark_to_flow) {
> > +        if (flow->mark == mark) {
> > +            return false;
> > +        }
> > +    }
> > +
> > +    return true;
> > +}
> > +
> > +static int
> > +mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
> > +                          struct dp_netdev_flow *flow) {
> > +    int ret = 0;
> > +    uint32_t mark = flow->mark;
> > +    struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
> > +                                             &flow->mark_node);
> > +
> > +    cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
> > +    flow->mark = INVALID_FLOW_MARK;
> > +
> > +    /*
> > +     * no flow is referencing the mark any more? If so, let's
> > +     * remove the flow from hardare and free the mark.
> > +     */
> 
> Minor typo, 'hardware' instead of 'hardare' above.
> 
> > +    if (flow_mark_has_no_ref(mark)) {
> > +        struct dp_netdev_port *port;
> > +        odp_port_t in_port = flow->flow.in_port.odp_port;
> > +
> > +        ovs_mutex_lock(&pmd->dp->port_mutex);
> > +        port = dp_netdev_lookup_port(pmd->dp, in_port);
> > +        if (port) {
> > +            ret = netdev_flow_del(port->netdev, &flow->mega_ufid,
> NULL);
> > +        }
> > +        ovs_mutex_unlock(&pmd->dp->port_mutex);
> > +
> > +        flow_mark_free(mark);
> > +        VLOG_DBG("freed flow mark %u\n", mark);
> > +
> > +        megaflow_to_mark_disassociate(&flow->mega_ufid);
> > +    }
> > +    dp_netdev_flow_unref(flow);
> > +
> > +    return ret;
> > +}
> > +
> > +static void
> > +flow_mark_flush(struct dp_netdev_pmd_thread *pmd) {
> > +    struct dp_netdev_flow *flow;
> > +
> > +    CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
> > +        if (flow->pmd_id == pmd->core_id) {
> > +            mark_to_flow_disassociate(pmd, flow);
> > +        }
> > +    }
> > +}
> > +
> >  static void
> >  dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
> >                            struct dp_netdev_flow *flow) @@ -1850,6
> > +2026,9 @@ dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
> >      ovs_assert(cls != NULL);
> >      dpcls_remove(cls, &flow->cr);
> >      cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow-
> > >ufid));
> > +    if (flow->mark != INVALID_FLOW_MARK) {
> > +        mark_to_flow_disassociate(pmd, flow);
> > +    }
> >      flow->dead = true;
> >
> >      dp_netdev_flow_unref(flow);
> > @@ -2429,6 +2608,101 @@ out:
> >      return error;
> >  }
> >
> > +/*
> > + * There are two flow offload operations here: addition and
> modification.
> > + *
> > + * For flow addition, this function does:
> > + * - allocate a new flow mark id
> > + * - perform hardware flow offload
> > + * - associate the flow mark with flow and mega flow
> > + *
> > + * For flow modification, both flow mark and the associations are
> > +still
> > + * valid, thus only item 2 needed.
> > + */
> > +static void
> > +try_netdev_flow_put(struct dp_netdev_pmd_thread *pmd, odp_port_t
> in_port,
> > +                    struct dp_netdev_flow *flow, struct match *match,
> > +                    const struct nlattr *actions, size_t actions_len) {
> > +    struct offload_info info;
> > +    struct dp_netdev_port *port;
> > +    bool modification = flow->mark != INVALID_FLOW_MARK;
> > +    const char *op = modification ? "modify" : "add";
> > +    uint32_t mark;
> > +    int ret;
> > +
> > +    ovs_mutex_lock(&flow_mark.mutex);
> > +
> > +    if (modification) {
> > +        mark = flow->mark;
> > +    } else {
> > +        if (!netdev_is_flow_api_enabled()) {
> > +            goto out;
> > +        }
> > +
> > +        /*
> > +         * If a mega flow has already been offloaded (from other PMD
> > +         * instances), do not offload it again.
> > +         */
> > +        mark = megaflow_to_mark_find(&flow->mega_ufid);
> > +        if (mark != INVALID_FLOW_MARK) {
> > +            VLOG_DBG("flow has already been offloaded with mark
> > + %u\n",
> > mark);
> > +            mark_to_flow_associate(mark, flow);
> > +            goto out;
> > +        }
> > +
> > +        mark = flow_mark_alloc();
> > +        if (mark == INVALID_FLOW_MARK) {
> > +            VLOG_ERR("failed to allocate flow mark!\n");
> > +            goto out;
> > +        }
> > +    }
> > +    info.flow_mark = mark;
> > +
> > +    ovs_mutex_lock(&pmd->dp->port_mutex);
> > +    port = dp_netdev_lookup_port(pmd->dp, in_port);
> > +    if (!port) {
> > +        ovs_mutex_unlock(&pmd->dp->port_mutex);
> > +        goto out;
> > +    }
> > +    ret = netdev_flow_put(port->netdev, match,
> > +                          CONST_CAST(struct nlattr *, actions),
> > +                          actions_len, &flow->mega_ufid, &info, NULL);
> > +    ovs_mutex_unlock(&pmd->dp->port_mutex);
> > +
> > +    if (ret) {
> > +        VLOG_ERR("failed to %s netdev flow with mark %u\n", op, mark);
> > +        if (!modification) {
> > +            flow_mark_free(mark);
> > +        } else {
> > +            mark_to_flow_disassociate(pmd, flow);
> 
>                In the case here would it not be a call to flush? If the
> flow could not be modified wouldn't you have to remove all references for
> it not just on this PMD?
> 
> > +        }
> > +        goto out;
> > +    }
> > +
> > +    if (!modification) {
> > +        megaflow_to_mark_associate(&flow->mega_ufid, mark);
> > +        mark_to_flow_associate(mark, flow);
> > +    }
> > +    VLOG_DBG("succeed to %s netdev flow with mark %u\n", op, mark);
> > +
> > +out:
> > +    ovs_mutex_unlock(&flow_mark.mutex);
> > +}
> > +
> > +static void
> > +dp_netdev_get_mega_ufid(const struct match *match, ovs_u128
> > +*mega_ufid) {
> > +    struct flow masked_flow;
> > +    size_t i;
> > +
> > +    for (i = 0; i < sizeof(struct flow); i++) {
> > +        ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
> > +                                       ((uint8_t *)&match->wc)[i];
> > +    }
> > +    dpif_flow_hash(NULL, &masked_flow, sizeof(struct flow),
> > +mega_ufid); }
> > +
> >  static struct dp_netdev_flow *
> >  dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
> >                     struct match *match, const ovs_u128 *ufid, @@
> > -2464,12
> > +2738,14 @@ dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
> >      memset(&flow->stats, 0, sizeof flow->stats);
> >      flow->dead = false;
> >      flow->batch = NULL;
> > +    flow->mark = INVALID_FLOW_MARK;
> >      *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
> >      *CONST_CAST(struct flow *, &flow->flow) = match->flow;
> >      *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
> >      ovs_refcount_init(&flow->ref_cnt);
> >      ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions,
> > actions_len));
> >
> > +    dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *,
> > + &flow->mega_ufid));
> >      netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
> >
> >      /* Select dpcls for in_port. Relies on in_port to be exact match.
> > */ @@ -2479,6 +2755,8 @@ dp_netdev_flow_add(struct dp_netdev_pmd_thread
> *pmd,
> >      cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *,
> > &flow-
> > >node),
> >                  dp_netdev_flow_hash(&flow->ufid));
> >
> > +    try_netdev_flow_put(pmd, in_port, flow, match, actions,
> > + actions_len);
> > +
> >      if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
> >          struct ds ds = DS_EMPTY_INITIALIZER;
> >          struct ofpbuf key_buf, mask_buf; @@ -2559,6 +2837,7 @@
> > flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
> >          if (put->flags & DPIF_FP_MODIFY) {
> >              struct dp_netdev_actions *new_actions;
> >              struct dp_netdev_actions *old_actions;
> > +            odp_port_t in_port = netdev_flow->flow.in_port.odp_port;
> >
> >              new_actions = dp_netdev_actions_create(put->actions,
> >                                                     put->actions_len);
> > @@
> > -2566,6 +2845,9 @@ flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
> >              old_actions = dp_netdev_flow_get_actions(netdev_flow);
> >              ovsrcu_set(&netdev_flow->actions, new_actions);
> >
> > +            try_netdev_flow_put(pmd, in_port, netdev_flow, match,
> > +                                put->actions, put->actions_len);
> > +
> >              if (stats) {
> >                  get_dpif_flow_stats(netdev_flow, stats);
> >              }
> > @@ -3635,6 +3917,7 @@ reload_affected_pmds(struct dp_netdev *dp)
> >
> >      CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> >          if (pmd->need_reload) {
> > +            flow_mark_flush(pmd);
> >              dp_netdev_reload_pmd__(pmd);
> >              pmd->need_reload = false;
> >          }
> > diff --git a/lib/netdev.h b/lib/netdev.h index ff1b604..9ee3092 100644
> > --- a/lib/netdev.h
> > +++ b/lib/netdev.h
> > @@ -188,6 +188,12 @@ void netdev_send_wait(struct netdev *, int qid);
> > struct offload_info {
> >      const struct dpif_class *dpif_class;
> >      ovs_be16 tp_dst_port; /* Destination port for tunnel in SET
> > action */
> > +
> > +    /*
> > +     * The flow mark id assigened to the flow. If any pkts hit the
> flow,
> > +     * it will be in the pkt meta data.
> > +     */
> > +    uint32_t flow_mark;
> >  };
> >  struct dpif_class;
> >  struct netdev_flow_dump;
> > --
> > 2.7.4

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to