> -----Original Message----- > From: Stokes, Ian > Sent: Wednesday, March 7, 2018 4:59 PM > To: Yuanhan Liu <y...@fridaylinux.org>; d...@openvswitch.org > Cc: Finn Christensen <f...@napatech.com>; Darrell Ball <db...@vmware.com>; > Chandran, Sugesh <sugesh.chand...@intel.com>; Simon Horman > <simon.hor...@netronome.com> > Subject: RE: [PATCH v7 1/6] dpif-netdev: associate flow with a mark id >
Adding Shahaf Shuler > > -----Original Message----- > > From: Yuanhan Liu [mailto:y...@fridaylinux.org] > > Sent: Monday, January 29, 2018 7:00 AM > > To: d...@openvswitch.org > > Cc: Stokes, Ian <ian.sto...@intel.com>; Finn Christensen > > <f...@napatech.com>; Darrell Ball <db...@vmware.com>; Chandran, Sugesh > > <sugesh.chand...@intel.com>; Simon Horman > > <simon.hor...@netronome.com>; Yuanhan Liu <y...@fridaylinux.org> > > Subject: [PATCH v7 1/6] dpif-netdev: associate flow with a mark id > > > > Most modern NICs have the ability to bind a flow with a mark, so that > > every packet matches such flow will have that mark present in its > > descriptor. > > > > The basic idea of doing that is, when we receives packets later, we > > could directly get the flow from the mark. That could avoid some very > > costly CPU operations, including (but not limiting to) > > miniflow_extract, emc lookup, dpcls lookup, etc. Thus, performance could > be greatly improved. > > > > Thus, the major work of this patch is to associate a flow with a mark > > id (an uint32_t number). The association in netdev datapath is done by > > CMAP, while in hardware it's done by the rte_flow MARK action. > > > > One tricky thing in OVS-DPDK is, the flow tables is per-PMD. For the > > case there is only one phys port but with 2 queues, there could be 2 > > PMDs. In other words, even for a single mega flow (i.e. > > udp,tp_src=1000), there could be 2 different dp_netdev flows, one for > > each PMD. That could results to the same mega flow being offloaded > > twice in the hardware, worse, we may get 2 different marks and only the > last one will work. > > > > To avoid that, a megaflow_to_mark CMAP is created. An entry will be > > added for the first PMD that wants to offload a flow. For later PMDs, > > it will see such megaflow is already offloaded, then the flow will not > > be offloaded to HW twice. > > > > Meanwhile, the mark to flow mapping becomes to 1:N mapping. That is > > what the mark_to_flow CMAP is for. When the first PMD wants to offload > > a flow, it allocates a new mark and performs the flow offload by > > reusing the - > > >flow_put method. When it succeeds, a "mark to flow" entry will be > added. > > For later PMDs, it will get the corresponding mark by above > > megaflow_to_mark CMAP. Then, another "mark to flow" entry will be added. > > > > Co-authored-by: Finn Christensen <f...@napatech.com> > > Signed-off-by: Yuanhan Liu <y...@fridaylinux.org> > > Signed-off-by: Finn Christensen <f...@napatech.com> > > --- > > > > v6: - fixed typos in commit log > > - fixed a sparse warning > > - used hash_int to compute the hash for mark_to_flow CMAP > > - added more comments > > - lock before port lookup > > > > v5: - fixed check of flow_mark_has_no_ref (renamed from > > is_last_flow_mark_reference). > > This fixed an issue that it took too long to finish > > flow add/removal if we do that repeatdly. > > > > - do mark_to_flow disassociation if flow modification failed > > --- > > lib/dpif-netdev.c | 283 > > ++++++++++++++++++++++++++++++++++++++++++++++++++++++ > > lib/netdev.h | 6 ++ > > 2 files changed, 289 insertions(+) > > > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index > > ba62128..a514de8 > > 100644 > > --- a/lib/dpif-netdev.c > > +++ b/lib/dpif-netdev.c > > @@ -75,6 +75,7 @@ > > #include "tnl-ports.h" > > #include "unixctl.h" > > #include "util.h" > > +#include "uuid.h" > > > > VLOG_DEFINE_THIS_MODULE(dpif_netdev); > > > > @@ -430,7 +431,9 @@ struct dp_netdev_flow { > > /* Hash table index by unmasked flow. */ > > const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */ > > /* 'flow_table'. */ > > + const struct cmap_node mark_node; /* In owning flow_mark's > > + mark_to_flow */ > > const ovs_u128 ufid; /* Unique flow identifier. */ > > + const ovs_u128 mega_ufid; /* Unique mega flow identifier. */ > > const unsigned pmd_id; /* The 'core_id' of pmd thread owning > > this */ > > /* flow. */ > > > > @@ -441,6 +444,7 @@ struct dp_netdev_flow { > > struct ovs_refcount ref_cnt; > > > > bool dead; > > + uint32_t mark; /* Unique flow mark assigned to a flow > > */ > > > > /* Statistics. */ > > struct dp_netdev_flow_stats stats; @@ -1837,6 +1841,178 @@ > > dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd, > > return cls; > > } > > > > +#define MAX_FLOW_MARK (UINT32_MAX - 1) > > +#define INVALID_FLOW_MARK (UINT32_MAX) > > + > > +struct megaflow_to_mark_data { > > + const struct cmap_node node; > > + ovs_u128 mega_ufid; > > + uint32_t mark; > > +}; > > + > > +struct flow_mark { > > + struct cmap megaflow_to_mark; > > + struct cmap mark_to_flow; > > + struct id_pool *pool; > > + struct ovs_mutex mutex; > > +}; > > + > > +static struct flow_mark flow_mark = { > > + .megaflow_to_mark = CMAP_INITIALIZER, > > + .mark_to_flow = CMAP_INITIALIZER, > > + .mutex = OVS_MUTEX_INITIALIZER, > > +}; > > + > > +static uint32_t > > +flow_mark_alloc(void) > > +{ > > + uint32_t mark; > > + > > + if (!flow_mark.pool) { > > + /* Haven't initiated yet, do it here */ > > + flow_mark.pool = id_pool_create(0, MAX_FLOW_MARK); > > + } > > + > > + if (id_pool_alloc_id(flow_mark.pool, &mark)) { > > + return mark; > > + } > > + > > + return INVALID_FLOW_MARK; > > +} > > + > > +static void > > +flow_mark_free(uint32_t mark) > > +{ > > + id_pool_free_id(flow_mark.pool, mark); } > > + > > +/* associate megaflow with a mark, which is a 1:1 mapping */ static > > +void megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t > > +mark) { > > + size_t hash = dp_netdev_flow_hash(mega_ufid); > > + struct megaflow_to_mark_data *data = xzalloc(sizeof(*data)); > > + > > + data->mega_ufid = *mega_ufid; > > + data->mark = mark; > > + > > + cmap_insert(&flow_mark.megaflow_to_mark, > > + CONST_CAST(struct cmap_node *, &data->node), hash); } > > + > > +/* disassociate meagaflow with a mark */ static void > > +megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid) { > > + size_t hash = dp_netdev_flow_hash(mega_ufid); > > + struct megaflow_to_mark_data *data; > > + > > + CMAP_FOR_EACH_WITH_HASH (data, node, hash, > > &flow_mark.megaflow_to_mark) { > > + if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) { > > + cmap_remove(&flow_mark.megaflow_to_mark, > > + CONST_CAST(struct cmap_node *, &data->node), > > hash); > > + free(data); > > + return; > > + } > > + } > > + > > + VLOG_WARN("masked ufid "UUID_FMT" is not associated with a > mark?\n", > > + UUID_ARGS((struct uuid *)mega_ufid)); } > > Minor nit above, Capitalize the first letter the warning message 'Masked' > to keep consistent with existing code standard. > There are multiple instances of this for info, debug & warning messages > throughout the patch series but I will just flag this once here for > brevity. > > > + > > +static inline uint32_t > > +megaflow_to_mark_find(const ovs_u128 *mega_ufid) { > > + size_t hash = dp_netdev_flow_hash(mega_ufid); > > + struct megaflow_to_mark_data *data; > > + > > + CMAP_FOR_EACH_WITH_HASH (data, node, hash, > > &flow_mark.megaflow_to_mark) { > > + if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) { > > + return data->mark; > > + } > > + } > > In keeping with other helper methods for the megaflow to mark, it would be > useful here to add a debug message to report that a mark for this megaflow > was not found. > > > + > > + return INVALID_FLOW_MARK; > > +} > > + > > +/* associate mark with a flow, which is 1:N mapping */ static void > > +mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow > > +*flow) { > > + dp_netdev_flow_ref(flow); > > + > > + cmap_insert(&flow_mark.mark_to_flow, > > + CONST_CAST(struct cmap_node *, &flow->mark_node), > > + hash_int(mark, 0)); > > + flow->mark = mark; > > + > > + VLOG_DBG("associated dp_netdev flow %p with mark %u\n", flow, > > +mark); } > > + > > +static bool > > +flow_mark_has_no_ref(uint32_t mark) > > +{ > > + struct dp_netdev_flow *flow; > > + > > + CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0), > > + &flow_mark.mark_to_flow) { > > + if (flow->mark == mark) { > > + return false; > > + } > > + } > > + > > + return true; > > +} > > + > > +static int > > +mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd, > > + struct dp_netdev_flow *flow) { > > + int ret = 0; > > + uint32_t mark = flow->mark; > > + struct cmap_node *mark_node = CONST_CAST(struct cmap_node *, > > + &flow->mark_node); > > + > > + cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0)); > > + flow->mark = INVALID_FLOW_MARK; > > + > > + /* > > + * no flow is referencing the mark any more? If so, let's > > + * remove the flow from hardare and free the mark. > > + */ > > Minor typo, 'hardware' instead of 'hardare' above. > > > + if (flow_mark_has_no_ref(mark)) { > > + struct dp_netdev_port *port; > > + odp_port_t in_port = flow->flow.in_port.odp_port; > > + > > + ovs_mutex_lock(&pmd->dp->port_mutex); > > + port = dp_netdev_lookup_port(pmd->dp, in_port); > > + if (port) { > > + ret = netdev_flow_del(port->netdev, &flow->mega_ufid, > NULL); > > + } > > + ovs_mutex_unlock(&pmd->dp->port_mutex); > > + > > + flow_mark_free(mark); > > + VLOG_DBG("freed flow mark %u\n", mark); > > + > > + megaflow_to_mark_disassociate(&flow->mega_ufid); > > + } > > + dp_netdev_flow_unref(flow); > > + > > + return ret; > > +} > > + > > +static void > > +flow_mark_flush(struct dp_netdev_pmd_thread *pmd) { > > + struct dp_netdev_flow *flow; > > + > > + CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) { > > + if (flow->pmd_id == pmd->core_id) { > > + mark_to_flow_disassociate(pmd, flow); > > + } > > + } > > +} > > + > > static void > > dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd, > > struct dp_netdev_flow *flow) @@ -1850,6 > > +2026,9 @@ dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd, > > ovs_assert(cls != NULL); > > dpcls_remove(cls, &flow->cr); > > cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow- > > >ufid)); > > + if (flow->mark != INVALID_FLOW_MARK) { > > + mark_to_flow_disassociate(pmd, flow); > > + } > > flow->dead = true; > > > > dp_netdev_flow_unref(flow); > > @@ -2429,6 +2608,101 @@ out: > > return error; > > } > > > > +/* > > + * There are two flow offload operations here: addition and > modification. > > + * > > + * For flow addition, this function does: > > + * - allocate a new flow mark id > > + * - perform hardware flow offload > > + * - associate the flow mark with flow and mega flow > > + * > > + * For flow modification, both flow mark and the associations are > > +still > > + * valid, thus only item 2 needed. > > + */ > > +static void > > +try_netdev_flow_put(struct dp_netdev_pmd_thread *pmd, odp_port_t > in_port, > > + struct dp_netdev_flow *flow, struct match *match, > > + const struct nlattr *actions, size_t actions_len) { > > + struct offload_info info; > > + struct dp_netdev_port *port; > > + bool modification = flow->mark != INVALID_FLOW_MARK; > > + const char *op = modification ? "modify" : "add"; > > + uint32_t mark; > > + int ret; > > + > > + ovs_mutex_lock(&flow_mark.mutex); > > + > > + if (modification) { > > + mark = flow->mark; > > + } else { > > + if (!netdev_is_flow_api_enabled()) { > > + goto out; > > + } > > + > > + /* > > + * If a mega flow has already been offloaded (from other PMD > > + * instances), do not offload it again. > > + */ > > + mark = megaflow_to_mark_find(&flow->mega_ufid); > > + if (mark != INVALID_FLOW_MARK) { > > + VLOG_DBG("flow has already been offloaded with mark > > + %u\n", > > mark); > > + mark_to_flow_associate(mark, flow); > > + goto out; > > + } > > + > > + mark = flow_mark_alloc(); > > + if (mark == INVALID_FLOW_MARK) { > > + VLOG_ERR("failed to allocate flow mark!\n"); > > + goto out; > > + } > > + } > > + info.flow_mark = mark; > > + > > + ovs_mutex_lock(&pmd->dp->port_mutex); > > + port = dp_netdev_lookup_port(pmd->dp, in_port); > > + if (!port) { > > + ovs_mutex_unlock(&pmd->dp->port_mutex); > > + goto out; > > + } > > + ret = netdev_flow_put(port->netdev, match, > > + CONST_CAST(struct nlattr *, actions), > > + actions_len, &flow->mega_ufid, &info, NULL); > > + ovs_mutex_unlock(&pmd->dp->port_mutex); > > + > > + if (ret) { > > + VLOG_ERR("failed to %s netdev flow with mark %u\n", op, mark); > > + if (!modification) { > > + flow_mark_free(mark); > > + } else { > > + mark_to_flow_disassociate(pmd, flow); > > In the case here would it not be a call to flush? If the > flow could not be modified wouldn't you have to remove all references for > it not just on this PMD? > > > + } > > + goto out; > > + } > > + > > + if (!modification) { > > + megaflow_to_mark_associate(&flow->mega_ufid, mark); > > + mark_to_flow_associate(mark, flow); > > + } > > + VLOG_DBG("succeed to %s netdev flow with mark %u\n", op, mark); > > + > > +out: > > + ovs_mutex_unlock(&flow_mark.mutex); > > +} > > + > > +static void > > +dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 > > +*mega_ufid) { > > + struct flow masked_flow; > > + size_t i; > > + > > + for (i = 0; i < sizeof(struct flow); i++) { > > + ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] & > > + ((uint8_t *)&match->wc)[i]; > > + } > > + dpif_flow_hash(NULL, &masked_flow, sizeof(struct flow), > > +mega_ufid); } > > + > > static struct dp_netdev_flow * > > dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd, > > struct match *match, const ovs_u128 *ufid, @@ > > -2464,12 > > +2738,14 @@ dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd, > > memset(&flow->stats, 0, sizeof flow->stats); > > flow->dead = false; > > flow->batch = NULL; > > + flow->mark = INVALID_FLOW_MARK; > > *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id; > > *CONST_CAST(struct flow *, &flow->flow) = match->flow; > > *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid; > > ovs_refcount_init(&flow->ref_cnt); > > ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, > > actions_len)); > > > > + dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, > > + &flow->mega_ufid)); > > netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask); > > > > /* Select dpcls for in_port. Relies on in_port to be exact match. > > */ @@ -2479,6 +2755,8 @@ dp_netdev_flow_add(struct dp_netdev_pmd_thread > *pmd, > > cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, > > &flow- > > >node), > > dp_netdev_flow_hash(&flow->ufid)); > > > > + try_netdev_flow_put(pmd, in_port, flow, match, actions, > > + actions_len); > > + > > if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) { > > struct ds ds = DS_EMPTY_INITIALIZER; > > struct ofpbuf key_buf, mask_buf; @@ -2559,6 +2837,7 @@ > > flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd, > > if (put->flags & DPIF_FP_MODIFY) { > > struct dp_netdev_actions *new_actions; > > struct dp_netdev_actions *old_actions; > > + odp_port_t in_port = netdev_flow->flow.in_port.odp_port; > > > > new_actions = dp_netdev_actions_create(put->actions, > > put->actions_len); > > @@ > > -2566,6 +2845,9 @@ flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd, > > old_actions = dp_netdev_flow_get_actions(netdev_flow); > > ovsrcu_set(&netdev_flow->actions, new_actions); > > > > + try_netdev_flow_put(pmd, in_port, netdev_flow, match, > > + put->actions, put->actions_len); > > + > > if (stats) { > > get_dpif_flow_stats(netdev_flow, stats); > > } > > @@ -3635,6 +3917,7 @@ reload_affected_pmds(struct dp_netdev *dp) > > > > CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { > > if (pmd->need_reload) { > > + flow_mark_flush(pmd); > > dp_netdev_reload_pmd__(pmd); > > pmd->need_reload = false; > > } > > diff --git a/lib/netdev.h b/lib/netdev.h index ff1b604..9ee3092 100644 > > --- a/lib/netdev.h > > +++ b/lib/netdev.h > > @@ -188,6 +188,12 @@ void netdev_send_wait(struct netdev *, int qid); > > struct offload_info { > > const struct dpif_class *dpif_class; > > ovs_be16 tp_dst_port; /* Destination port for tunnel in SET > > action */ > > + > > + /* > > + * The flow mark id assigened to the flow. If any pkts hit the > flow, > > + * it will be in the pkt meta data. > > + */ > > + uint32_t flow_mark; > > }; > > struct dpif_class; > > struct netdev_flow_dump; > > -- > > 2.7.4 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev