> -----Original Message-----
> From: Yuanhan Liu [mailto:[email protected]]
> Sent: Monday, January 29, 2018 7:00 AM
> To: [email protected]
> Cc: Stokes, Ian <[email protected]>; Finn Christensen
> <[email protected]>; Darrell Ball <[email protected]>; Chandran, Sugesh
> <[email protected]>; Simon Horman <[email protected]>;
> Yuanhan Liu <[email protected]>
> Subject: [PATCH v7 1/6] dpif-netdev: associate flow with a mark id
>
> Most modern NICs have the ability to bind a flow with a mark, so that
> every packet matches such flow will have that mark present in its
> descriptor.
>
> The basic idea of doing that is, when we receives packets later, we could
> directly get the flow from the mark. That could avoid some very costly CPU
> operations, including (but not limiting to) miniflow_extract, emc lookup,
> dpcls lookup, etc. Thus, performance could be greatly improved.
>
> Thus, the major work of this patch is to associate a flow with a mark id
> (an uint32_t number). The association in netdev datapath is done by CMAP,
> while in hardware it's done by the rte_flow MARK action.
>
> One tricky thing in OVS-DPDK is, the flow tables is per-PMD. For the case
> there is only one phys port but with 2 queues, there could be 2 PMDs. In
> other words, even for a single mega flow (i.e. udp,tp_src=1000), there
> could be 2 different dp_netdev flows, one for each PMD. That could results
> to the same mega flow being offloaded twice in the hardware, worse, we may
> get 2 different marks and only the last one will work.
>
> To avoid that, a megaflow_to_mark CMAP is created. An entry will be added
> for the first PMD that wants to offload a flow. For later PMDs, it will
> see such megaflow is already offloaded, then the flow will not be
> offloaded to HW twice.
>
> Meanwhile, the mark to flow mapping becomes to 1:N mapping. That is what
> the mark_to_flow CMAP is for. When the first PMD wants to offload a flow,
> it allocates a new mark and performs the flow offload by reusing the -
> >flow_put method. When it succeeds, a "mark to flow" entry will be added.
> For later PMDs, it will get the corresponding mark by above
> megaflow_to_mark CMAP. Then, another "mark to flow" entry will be added.
>
> Co-authored-by: Finn Christensen <[email protected]>
> Signed-off-by: Yuanhan Liu <[email protected]>
> Signed-off-by: Finn Christensen <[email protected]>
> ---
>
> v6: - fixed typos in commit log
> - fixed a sparse warning
> - used hash_int to compute the hash for mark_to_flow CMAP
> - added more comments
> - lock before port lookup
>
> v5: - fixed check of flow_mark_has_no_ref (renamed from
> is_last_flow_mark_reference).
> This fixed an issue that it took too long to finish
> flow add/removal if we do that repeatdly.
>
> - do mark_to_flow disassociation if flow modification failed
> ---
> lib/dpif-netdev.c | 283
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> lib/netdev.h | 6 ++
> 2 files changed, 289 insertions(+)
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index ba62128..a514de8
> 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -75,6 +75,7 @@
> #include "tnl-ports.h"
> #include "unixctl.h"
> #include "util.h"
> +#include "uuid.h"
>
> VLOG_DEFINE_THIS_MODULE(dpif_netdev);
>
> @@ -430,7 +431,9 @@ struct dp_netdev_flow {
> /* Hash table index by unmasked flow. */
> const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
> /* 'flow_table'. */
> + const struct cmap_node mark_node; /* In owning flow_mark's
> + mark_to_flow */
> const ovs_u128 ufid; /* Unique flow identifier. */
> + const ovs_u128 mega_ufid; /* Unique mega flow identifier. */
> const unsigned pmd_id; /* The 'core_id' of pmd thread owning
> this */
> /* flow. */
>
> @@ -441,6 +444,7 @@ struct dp_netdev_flow {
> struct ovs_refcount ref_cnt;
>
> bool dead;
> + uint32_t mark; /* Unique flow mark assigned to a flow
> */
>
> /* Statistics. */
> struct dp_netdev_flow_stats stats;
> @@ -1837,6 +1841,178 @@ dp_netdev_pmd_find_dpcls(struct
> dp_netdev_pmd_thread *pmd,
> return cls;
> }
>
> +#define MAX_FLOW_MARK (UINT32_MAX - 1)
> +#define INVALID_FLOW_MARK (UINT32_MAX)
> +
> +struct megaflow_to_mark_data {
> + const struct cmap_node node;
> + ovs_u128 mega_ufid;
> + uint32_t mark;
> +};
> +
> +struct flow_mark {
> + struct cmap megaflow_to_mark;
> + struct cmap mark_to_flow;
> + struct id_pool *pool;
> + struct ovs_mutex mutex;
> +};
> +
> +static struct flow_mark flow_mark = {
> + .megaflow_to_mark = CMAP_INITIALIZER,
> + .mark_to_flow = CMAP_INITIALIZER,
> + .mutex = OVS_MUTEX_INITIALIZER,
> +};
> +
> +static uint32_t
> +flow_mark_alloc(void)
> +{
> + uint32_t mark;
> +
> + if (!flow_mark.pool) {
> + /* Haven't initiated yet, do it here */
> + flow_mark.pool = id_pool_create(0, MAX_FLOW_MARK);
> + }
> +
> + if (id_pool_alloc_id(flow_mark.pool, &mark)) {
> + return mark;
> + }
> +
> + return INVALID_FLOW_MARK;
> +}
> +
> +static void
> +flow_mark_free(uint32_t mark)
> +{
> + id_pool_free_id(flow_mark.pool, mark); }
> +
> +/* associate megaflow with a mark, which is a 1:1 mapping */ static
> +void megaflow_to_mark_associate(const ovs_u128 *mega_ufid, uint32_t
> +mark) {
> + size_t hash = dp_netdev_flow_hash(mega_ufid);
> + struct megaflow_to_mark_data *data = xzalloc(sizeof(*data));
> +
> + data->mega_ufid = *mega_ufid;
> + data->mark = mark;
> +
> + cmap_insert(&flow_mark.megaflow_to_mark,
> + CONST_CAST(struct cmap_node *, &data->node), hash); }
> +
> +/* disassociate meagaflow with a mark */ static void
> +megaflow_to_mark_disassociate(const ovs_u128 *mega_ufid) {
> + size_t hash = dp_netdev_flow_hash(mega_ufid);
> + struct megaflow_to_mark_data *data;
> +
> + CMAP_FOR_EACH_WITH_HASH (data, node, hash,
> &flow_mark.megaflow_to_mark) {
> + if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
> + cmap_remove(&flow_mark.megaflow_to_mark,
> + CONST_CAST(struct cmap_node *, &data->node),
> hash);
> + free(data);
> + return;
> + }
> + }
> +
> + VLOG_WARN("masked ufid "UUID_FMT" is not associated with a mark?\n",
> + UUID_ARGS((struct uuid *)mega_ufid)); }
Minor nit above, Capitalize the first letter the warning message 'Masked' to
keep consistent with existing code standard.
There are multiple instances of this for info, debug & warning messages
throughout the patch series but I will just flag this once here for brevity.
> +
> +static inline uint32_t
> +megaflow_to_mark_find(const ovs_u128 *mega_ufid) {
> + size_t hash = dp_netdev_flow_hash(mega_ufid);
> + struct megaflow_to_mark_data *data;
> +
> + CMAP_FOR_EACH_WITH_HASH (data, node, hash,
> &flow_mark.megaflow_to_mark) {
> + if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
> + return data->mark;
> + }
> + }
In keeping with other helper methods for the megaflow to mark, it would be
useful here to add a debug message to report that a mark for this megaflow was
not found.
> +
> + return INVALID_FLOW_MARK;
> +}
> +
> +/* associate mark with a flow, which is 1:N mapping */ static void
> +mark_to_flow_associate(const uint32_t mark, struct dp_netdev_flow
> +*flow) {
> + dp_netdev_flow_ref(flow);
> +
> + cmap_insert(&flow_mark.mark_to_flow,
> + CONST_CAST(struct cmap_node *, &flow->mark_node),
> + hash_int(mark, 0));
> + flow->mark = mark;
> +
> + VLOG_DBG("associated dp_netdev flow %p with mark %u\n", flow,
> +mark); }
> +
> +static bool
> +flow_mark_has_no_ref(uint32_t mark)
> +{
> + struct dp_netdev_flow *flow;
> +
> + CMAP_FOR_EACH_WITH_HASH (flow, mark_node, hash_int(mark, 0),
> + &flow_mark.mark_to_flow) {
> + if (flow->mark == mark) {
> + return false;
> + }
> + }
> +
> + return true;
> +}
> +
> +static int
> +mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd,
> + struct dp_netdev_flow *flow) {
> + int ret = 0;
> + uint32_t mark = flow->mark;
> + struct cmap_node *mark_node = CONST_CAST(struct cmap_node *,
> + &flow->mark_node);
> +
> + cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0));
> + flow->mark = INVALID_FLOW_MARK;
> +
> + /*
> + * no flow is referencing the mark any more? If so, let's
> + * remove the flow from hardare and free the mark.
> + */
Minor typo, 'hardware' instead of 'hardare' above.
> + if (flow_mark_has_no_ref(mark)) {
> + struct dp_netdev_port *port;
> + odp_port_t in_port = flow->flow.in_port.odp_port;
> +
> + ovs_mutex_lock(&pmd->dp->port_mutex);
> + port = dp_netdev_lookup_port(pmd->dp, in_port);
> + if (port) {
> + ret = netdev_flow_del(port->netdev, &flow->mega_ufid, NULL);
> + }
> + ovs_mutex_unlock(&pmd->dp->port_mutex);
> +
> + flow_mark_free(mark);
> + VLOG_DBG("freed flow mark %u\n", mark);
> +
> + megaflow_to_mark_disassociate(&flow->mega_ufid);
> + }
> + dp_netdev_flow_unref(flow);
> +
> + return ret;
> +}
> +
> +static void
> +flow_mark_flush(struct dp_netdev_pmd_thread *pmd) {
> + struct dp_netdev_flow *flow;
> +
> + CMAP_FOR_EACH (flow, mark_node, &flow_mark.mark_to_flow) {
> + if (flow->pmd_id == pmd->core_id) {
> + mark_to_flow_disassociate(pmd, flow);
> + }
> + }
> +}
> +
> static void
> dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
> struct dp_netdev_flow *flow) @@ -1850,6 +2026,9
> @@ dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
> ovs_assert(cls != NULL);
> dpcls_remove(cls, &flow->cr);
> cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow-
> >ufid));
> + if (flow->mark != INVALID_FLOW_MARK) {
> + mark_to_flow_disassociate(pmd, flow);
> + }
> flow->dead = true;
>
> dp_netdev_flow_unref(flow);
> @@ -2429,6 +2608,101 @@ out:
> return error;
> }
>
> +/*
> + * There are two flow offload operations here: addition and modification.
> + *
> + * For flow addition, this function does:
> + * - allocate a new flow mark id
> + * - perform hardware flow offload
> + * - associate the flow mark with flow and mega flow
> + *
> + * For flow modification, both flow mark and the associations are still
> + * valid, thus only item 2 needed.
> + */
> +static void
> +try_netdev_flow_put(struct dp_netdev_pmd_thread *pmd, odp_port_t in_port,
> + struct dp_netdev_flow *flow, struct match *match,
> + const struct nlattr *actions, size_t actions_len) {
> + struct offload_info info;
> + struct dp_netdev_port *port;
> + bool modification = flow->mark != INVALID_FLOW_MARK;
> + const char *op = modification ? "modify" : "add";
> + uint32_t mark;
> + int ret;
> +
> + ovs_mutex_lock(&flow_mark.mutex);
> +
> + if (modification) {
> + mark = flow->mark;
> + } else {
> + if (!netdev_is_flow_api_enabled()) {
> + goto out;
> + }
> +
> + /*
> + * If a mega flow has already been offloaded (from other PMD
> + * instances), do not offload it again.
> + */
> + mark = megaflow_to_mark_find(&flow->mega_ufid);
> + if (mark != INVALID_FLOW_MARK) {
> + VLOG_DBG("flow has already been offloaded with mark %u\n",
> mark);
> + mark_to_flow_associate(mark, flow);
> + goto out;
> + }
> +
> + mark = flow_mark_alloc();
> + if (mark == INVALID_FLOW_MARK) {
> + VLOG_ERR("failed to allocate flow mark!\n");
> + goto out;
> + }
> + }
> + info.flow_mark = mark;
> +
> + ovs_mutex_lock(&pmd->dp->port_mutex);
> + port = dp_netdev_lookup_port(pmd->dp, in_port);
> + if (!port) {
> + ovs_mutex_unlock(&pmd->dp->port_mutex);
> + goto out;
> + }
> + ret = netdev_flow_put(port->netdev, match,
> + CONST_CAST(struct nlattr *, actions),
> + actions_len, &flow->mega_ufid, &info, NULL);
> + ovs_mutex_unlock(&pmd->dp->port_mutex);
> +
> + if (ret) {
> + VLOG_ERR("failed to %s netdev flow with mark %u\n", op, mark);
> + if (!modification) {
> + flow_mark_free(mark);
> + } else {
> + mark_to_flow_disassociate(pmd, flow);
In the case here would it not be a call to flush? If the flow
could not be modified wouldn't you have to remove all references for it not
just on this PMD?
> + }
> + goto out;
> + }
> +
> + if (!modification) {
> + megaflow_to_mark_associate(&flow->mega_ufid, mark);
> + mark_to_flow_associate(mark, flow);
> + }
> + VLOG_DBG("succeed to %s netdev flow with mark %u\n", op, mark);
> +
> +out:
> + ovs_mutex_unlock(&flow_mark.mutex);
> +}
> +
> +static void
> +dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
> +{
> + struct flow masked_flow;
> + size_t i;
> +
> + for (i = 0; i < sizeof(struct flow); i++) {
> + ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
> + ((uint8_t *)&match->wc)[i];
> + }
> + dpif_flow_hash(NULL, &masked_flow, sizeof(struct flow), mega_ufid);
> +}
> +
> static struct dp_netdev_flow *
> dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
> struct match *match, const ovs_u128 *ufid, @@ -2464,12
> +2738,14 @@ dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
> memset(&flow->stats, 0, sizeof flow->stats);
> flow->dead = false;
> flow->batch = NULL;
> + flow->mark = INVALID_FLOW_MARK;
> *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
> *CONST_CAST(struct flow *, &flow->flow) = match->flow;
> *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
> ovs_refcount_init(&flow->ref_cnt);
> ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions,
> actions_len));
>
> + dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *,
> + &flow->mega_ufid));
> netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
>
> /* Select dpcls for in_port. Relies on in_port to be exact match. */
> @@ -2479,6 +2755,8 @@ dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
> cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow-
> >node),
> dp_netdev_flow_hash(&flow->ufid));
>
> + try_netdev_flow_put(pmd, in_port, flow, match, actions,
> + actions_len);
> +
> if (OVS_UNLIKELY(!VLOG_DROP_DBG((&upcall_rl)))) {
> struct ds ds = DS_EMPTY_INITIALIZER;
> struct ofpbuf key_buf, mask_buf; @@ -2559,6 +2837,7 @@
> flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
> if (put->flags & DPIF_FP_MODIFY) {
> struct dp_netdev_actions *new_actions;
> struct dp_netdev_actions *old_actions;
> + odp_port_t in_port = netdev_flow->flow.in_port.odp_port;
>
> new_actions = dp_netdev_actions_create(put->actions,
> put->actions_len); @@
> -2566,6 +2845,9 @@ flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
> old_actions = dp_netdev_flow_get_actions(netdev_flow);
> ovsrcu_set(&netdev_flow->actions, new_actions);
>
> + try_netdev_flow_put(pmd, in_port, netdev_flow, match,
> + put->actions, put->actions_len);
> +
> if (stats) {
> get_dpif_flow_stats(netdev_flow, stats);
> }
> @@ -3635,6 +3917,7 @@ reload_affected_pmds(struct dp_netdev *dp)
>
> CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> if (pmd->need_reload) {
> + flow_mark_flush(pmd);
> dp_netdev_reload_pmd__(pmd);
> pmd->need_reload = false;
> }
> diff --git a/lib/netdev.h b/lib/netdev.h index ff1b604..9ee3092 100644
> --- a/lib/netdev.h
> +++ b/lib/netdev.h
> @@ -188,6 +188,12 @@ void netdev_send_wait(struct netdev *, int qid);
> struct offload_info {
> const struct dpif_class *dpif_class;
> ovs_be16 tp_dst_port; /* Destination port for tunnel in SET action */
> +
> + /*
> + * The flow mark id assigened to the flow. If any pkts hit the flow,
> + * it will be in the pkt meta data.
> + */
> + uint32_t flow_mark;
> };
> struct dpif_class;
> struct netdev_flow_dump;
> --
> 2.7.4
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev