git sha: 2a063b239d7295d23f89a306406e720c375c297d
Author: Eelco Chaudron <[email protected]>
Subject: dpif-offload-dpdk: Abstract rte_flow implementation from dpif-netdev.

This commit abstracts the rte_flow-specific logic from dpif-netdev into 
the dpif-offload-dpdk provider, making the generic datapath implementation 
independent of specific offload technologies.

> diff --git a/lib/automake.mk b/lib/automake.mk
> index f7586bb0a..c0edef8e9 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -508,6 +508,7 @@ if DPDK_NETDEV
>  lib_libopenvswitch_la_SOURCES += \
>       lib/dpdk.c \
>       lib/dpif-offload-dpdk.c \
> +     lib/dpif-offload-dpdk-private.h \
>       lib/netdev-dpdk.c \
>       lib/netdev-offload-dpdk.c \
>       lib/netdev-offload-dpdk.h
> diff --git a/lib/dpif-netdev-private-flow.h b/lib/dpif-netdev-private-flow.h
> index 7425dd44e..753358925 100644
> --- a/lib/dpif-netdev-private-flow.h
> +++ b/lib/dpif-netdev-private-flow.h
> @@ -89,7 +89,7 @@ struct dp_netdev_flow {
>                                   /* 'flow_table'. */
>      const struct cmap_node simple_match_node; /* In dp_netdev_pmd_thread's
>                                                   'simple_match_table'. */
> -    const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
> +    const struct cmap_node mufid_node; /* In dp_netdev's 'mufid_to_flow'. */
>      const ovs_u128 ufid;         /* Unique flow identifier. */
>      const ovs_u128 mega_ufid;    /* Unique mega flow identifier. */
>      const unsigned pmd_id;       /* The 'core_id' of pmd thread owning this */

Does this rename from mark_node to mufid_node actually get used
anywhere in the code? I don't see any references to mufid_node being
added elsewhere in the patch, nor a mufid_to_flow map being created in
dp_netdev.

> @@ -102,18 +102,14 @@ struct dp_netdev_flow {
>      struct ovs_refcount ref_cnt;
>  
>      bool dead;
> -    uint32_t mark;               /* Unique flow mark for netdev offloading. */
> +    bool offloaded;
> +    atomic_int offload_queue_depth;
>      uint64_t simple_match_mark;  /* Unique flow mark for the simple match. */
>      odp_port_t orig_in_port;
>  
>      /* Statistics. */
>      struct dp_netdev_flow_stats stats;
>  
> -    /* Statistics and attributes received from the netdev offload provider. */
> -    atomic_int netdev_flow_get_result;
> -    struct dp_netdev_flow_stats last_stats;
> -    struct dp_netdev_flow_attrs last_attrs;
> -
>      /* Actions. */
>      OVSRCU_TYPE(struct dp_netdev_actions *) actions;
>  
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c

[ ... ]

> @@ -2463,137 +2463,171 @@ log_netdev_flow_change(const struct dp_netdev_flow *flow,
>      ds_destroy(&ds);
>  }
>  
> -static void
> -queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
> -                      struct dp_netdev_flow *flow, struct match *match,
> -                      const struct nlattr *actions, size_t actions_len,
> -                      int op)
> +/* Offloaded flows can be handled asynchronously, so we do not always know
> + * whether a specific flow is offloaded or not.  It might still be pending;
> + * in fact, multiple modifications can be pending, and the actual offload
> + * state depends on the completion of each modification.
> + *
> + * To correctly determine whether a flow is offloaded when it is being
> + * destroyed (and therefore requires cleanup), we must ensure that all
> + * operations have completed.  To achieve this, we track the number of
> + * outstanding offloaded flow modifications. */
> +static bool
> +offload_queue_inc(struct dp_netdev_flow *flow)
>  {
> -    struct dp_offload_thread_item *item;
> -    struct dp_offload_flow_item *flow_offload;
> +    int current;
>  
> -    if (!dpif_offload_is_offload_enabled()) {
> -        return;
> +    while (true) {
> +        atomic_read(&flow->offload_queue_depth, &current);
> +        if (current < 0) {
> +            /* We are cleaning up, so no longer enqueue operations. */
> +            return false;
> +        }
> +
> +        /* Here we try to atomically increase the value.  If we do not succeed,
> +         * someone else has modified it, and we need to check again for a
> +         * current negative value. */
> +        if (atomic_compare_exchange_strong(&flow->offload_queue_depth,
> +                                           &current, current + 1)) {
> +            return true;
> +        }
>      }

The retry loop in offload_queue_inc() is unbounded and could spin
under high contention. When multiple threads continuously modify
offload_queue_depth, is system-wide progress (every failed CAS implies
some other thread succeeded) the only guarantee intended here?
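
For reference, here is the same increment-unless-negative pattern as a
minimal standalone sketch using plain C11 atomics rather than the OVS
wrappers (names are made up). A failed compare-exchange always means
some other thread completed an update, so the scheme is lock-free even
though an individual thread may retry indefinitely:

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Increment 'depth' unless it is negative (cleanup in progress).
     * Returns false, leaving 'depth' untouched, once cleanup began. */
    static bool
    queue_depth_inc(atomic_int *depth)
    {
        int cur = atomic_load(depth);

        while (cur >= 0) {
            /* On failure, 'cur' is reloaded with the current value, so
             * the negative-value check is naturally re-evaluated. */
            if (atomic_compare_exchange_weak(depth, &cur, cur + 1)) {
                return true;
            }
        }
        return false;
    }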

> +}
> +
> +static bool
> +offload_queue_dec(struct dp_netdev_flow *flow)
> +{
> +    int old;
> +
> +    atomic_sub(&flow->offload_queue_depth, 1, &old);
> +    ovs_assert(old >= 1);
> +
> +    if (old == 1) {
> +        /* Note that this only indicates that the queue might be empty. */
> +        return true;
> +    }
> +    return false;
> +}
> +
> +static bool
> +offload_queue_complete(struct dp_netdev_flow *flow)
> +{
> +    /* This function returns false if the queue is still in use.
> +     * If the queue is empty, it will attempt to atomically mark it as
> +     * 'not in use' by making the queue depth negative.  This prevents
> +     * other flow operations from being added.  If successful, it returns
> +     * true. */
> +     int expected_val = 0;
> +
> +    return atomic_compare_exchange_strong(&flow->offload_queue_depth,
> +                                          &expected_val, -1);
> +}

Is there a race between offload_queue_dec() returning true and
offload_queue_complete() being called, where another thread increments
the queue depth in between? As far as I can tell the CAS from 0 to -1
simply fails in that case, but then whoever performs the final
decrement must retry the completion.
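
In other words, I would expect callers to pair the two along these
lines (hypothetical sketch, not from the patch; flow_offload_cleanup()
is a made-up placeholder for the actual teardown):

    /* Hypothetical completion path for one queued offload operation. */
    static void
    offload_op_done(struct dp_netdev_flow *flow)
    {
        if (offload_queue_dec(flow)) {
            /* The queue *might* be empty; only the CAS 0 -> -1 in
             * offload_queue_complete() confirms it and claims
             * exclusive ownership. */
            if (offload_queue_complete(flow)) {
                flow_offload_cleanup(flow);  /* Hypothetical teardown. */
            }
            /* If the CAS failed, another thread raced in a new
             * operation; its final decrement retries completion. */
        }
    }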

> +static void
> +offload_flow_reference_unreference_cb(unsigned pmd_id OVS_UNUSED,
> +                                      void *flow_reference_)
> +{
> +    struct dp_netdev_flow *flow_reference = flow_reference_;
> +
> +    if (flow_reference) {
> +        flow_reference->offloaded = false;
> +        dp_netdev_flow_unref(flow_reference);
> +    }
> +}

[ ... ]

> @@ -4185,12 +3596,10 @@ dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
>      /* Do not allocate extra space. */
>      flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
>      memset(&flow->stats, 0, sizeof flow->stats);
> -    atomic_init(&flow->netdev_flow_get_result, 0);
> -    memset(&flow->last_stats, 0, sizeof flow->last_stats);
> -    memset(&flow->last_attrs, 0, sizeof flow->last_attrs);
>      flow->dead = false;
> +    flow->offloaded = false;
> +    atomic_init(&flow->offload_queue_depth, 0);
>      flow->batch = NULL;
> -    flow->mark = INVALID_FLOW_MARK;
>      flow->orig_in_port = orig_in_port;
>      *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
>      *CONST_CAST(struct flow *, &flow->flow) = match->flow;

[ ... ]

> diff --git a/lib/dpif-offload-dpdk.c b/lib/dpif-offload-dpdk.c
> index 5c3e7f041..547bb68ef 100644
> --- a/lib/dpif-offload-dpdk.c
> +++ b/lib/dpif-offload-dpdk.c

[ ... ]

> +uint32_t
> +dpif_offload_dpdk_allocate_flow_mark(struct dpif_offload_dpdk *offload)
> +{
> +    static struct ovsthread_once init_once = OVSTHREAD_ONCE_INITIALIZER;
> +    unsigned int uid = dpdk_offload_thread_id() \
> +                       % offload->offload_thread_count;
> +    uint32_t flow_mark;
> +
> +    if (ovsthread_once_start(&init_once)) {
> +        /* Haven't initiated yet, do it here. */
> +        offload->flow_mark_pool = id_fpool_create(
> +            offload->offload_thread_count, 1, UINT32_MAX - 1);
> +        ovsthread_once_done(&init_once);
> +    }

Doesn't this static ovsthread_once cause issues when multiple
dpif_offload_dpdk instances exist? The once-guard is global, so only
the first instance to reach this point gets a pool assigned; a later
instance's offload->flow_mark_pool would remain uninitialized.
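
If each instance needs its own pool, creating it at instance setup
time would avoid the once-guard entirely. A sketch, assuming this runs
before any offload thread can call
dpif_offload_dpdk_allocate_flow_mark():

    /* Hypothetical: create the per-instance pool during instance
     * construction, before the offload threads are spawned, so no
     * synchronization is needed and every instance owns a pool. */
    static void
    dpif_offload_dpdk_flow_mark_pool_init(struct dpif_offload_dpdk *offload)
    {
        offload->flow_mark_pool = id_fpool_create(
            offload->offload_thread_count, 1, UINT32_MAX - 1);
    }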

> +
> +    if (id_fpool_new_id(offload->flow_mark_pool, uid, &flow_mark)) {
> +        return flow_mark;
> +    }
> +
> +    return INVALID_FLOW_MARK;
> +}

[ ... ]

> +static void
> +dpif_offload_dpdk_offload_threads_init(struct dpif_offload_dpdk *offload)
> +{
> +    offload->offload_threads = xcalloc(offload->offload_thread_count,
> +                                       sizeof(struct dpdk_offload_thread));
> +
> +    for (unsigned int tid = 0; tid < offload->offload_thread_count; tid++) {
> +        struct dpdk_offload_thread *thread;
> +
> +        thread = &offload->offload_threads[tid];
> +        mpsc_queue_init(&thread->queue);
> +        atomic_init(&thread->enqueued_item, 0);
> +        mov_avg_cma_init(&thread->cma);
> +        mov_avg_ema_init(&thread->ema, 100);
> +        thread->offload = offload;
> +        thread->thread = ovs_thread_create(
> +            "dpdk_offload", dpif_offload_dpdk_offload_thread_main, thread);
> +    }
> +}

Is offload_thread_count validated anywhere? If a user can configure an
extremely large value, this xcalloc() plus one ovs_thread_create() per
entry could exhaust memory or hit system thread limits.
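
If it isn't, a clamp at configuration time might be worth adding; a
sketch, where the upper bound and warning text are made up:

    /* Hypothetical upper bound on the number of offload threads. */
    #define MAX_OFFLOAD_THREADS 16

    static unsigned int
    offload_thread_count_clamp(unsigned int requested)
    {
        if (requested < 1) {
            return 1;
        } else if (requested > MAX_OFFLOAD_THREADS) {
            VLOG_WARN("Requested %u offload threads, capping at %d.",
                      requested, MAX_OFFLOAD_THREADS);
            return MAX_OFFLOAD_THREADS;
        }
        return requested;
    }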

[ ... ]

> diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c

[ ... ]

> +static struct pmd_data *
> +netdev_offload_dpdk_pmd_data_init(void)
> +{
> +    struct pmd_data *mapping = xmalloc(sizeof *mapping);
> +
> +    ovs_mutex_init(&mapping->map_lock);
> +    cmap_init(&mapping->pmd_id_to_flow_ref);
> +    return mapping;
> +}

Minor note: no allocation-failure check is needed here, since OVS's
xmalloc() aborts on failure instead of returning NULL.

> +static void
> +netdev_offload_dpdk_pmd_data_cleanup_mappings(
> +    struct dpif_offload_dpdk *offload, struct pmd_data *mapping)
> +{
> +    struct pmd_id_to_flow_ref_data *data;
> +
> +    if (!mapping) {
> +        return;
> +    }
> +
> +    ovs_mutex_lock(&mapping->map_lock);
> +
> +    CMAP_FOR_EACH (data, node, &mapping->pmd_id_to_flow_ref) {
> +        cmap_remove(&mapping->pmd_id_to_flow_ref, &data->node,
> +                    hash_int(data->pmd_id, 0));
> +
> +        dpif_offload_dpdk_flow_unreference(offload, data->pmd_id,
> +                                           data->flow_reference);
> +        ovsrcu_postpone(free, data);
> +    }
> +
> +    ovs_mutex_unlock(&mapping->map_lock);
> +}

Could this cleanup cause problems if a
dpif_offload_dpdk_flow_unreference() callback tries to access the
mapping being torn down? Note that map_lock is held across the
callback invocation, so a callback that takes the same lock would
deadlock.
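
If that is a concern, one option is to detach the entries under the
lock and run the callbacks after dropping it. A sketch only, assuming
the entries need no further protection once removed from the cmap:

    /* Hypothetical variant: unlink all entries under map_lock, then
     * invoke the unreference callbacks with the lock released. */
    static void
    pmd_data_cleanup_mappings_deferred(struct dpif_offload_dpdk *offload,
                                       struct pmd_data *mapping)
    {
        struct pmd_id_to_flow_ref_data **detached, *data;
        size_t n = 0;

        ovs_mutex_lock(&mapping->map_lock);
        detached = xmalloc(cmap_count(&mapping->pmd_id_to_flow_ref)
                           * sizeof *detached);
        CMAP_FOR_EACH (data, node, &mapping->pmd_id_to_flow_ref) {
            cmap_remove(&mapping->pmd_id_to_flow_ref, &data->node,
                        hash_int(data->pmd_id, 0));
            detached[n++] = data;
        }
        ovs_mutex_unlock(&mapping->map_lock);

        for (size_t i = 0; i < n; i++) {
            /* The lock is no longer held, so a callback may take it. */
            dpif_offload_dpdk_flow_unreference(offload, detached[i]->pmd_id,
                                               detached[i]->flow_reference);
            ovsrcu_postpone(free, detached[i]);
        }
        free(detached);
    }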

[ ... ]

> @@ -2506,13 +2554,15 @@ out:
>  }
>  
>  static struct ufid_to_rte_flow_data *
> -netdev_offload_dpdk_add_flow(struct netdev *netdev,
> +netdev_offload_dpdk_add_flow(struct dpif_offload_dpdk *offload,
> +                             struct pmd_data *pmd_mapping,
> +                             struct netdev *netdev,
>                               struct match *match,
>                               struct nlattr *nl_actions,
>                               size_t actions_len,
>                               const ovs_u128 *ufid,
> -                             struct dpif_netdev_offload_info *info)
> +                             uint32_t flow_mark,
> +                             odp_port_t orig_in_port)
>  {
>      struct flow_patterns patterns = {
>          .items = NULL,
> @@ -2523,20 +2573,20 @@ netdev_offload_dpdk_add_flow(struct netdev *netdev,
>      bool actions_offloaded = true;
>      struct rte_flow *flow;
>  
> -    if (parse_flow_match(netdev, info->orig_in_port, &patterns, match)) {
> +    if (parse_flow_match(offload, netdev, orig_in_port, &patterns, match)) {
>          VLOG_DBG_RL(&rl, "%s: matches of ufid "UUID_FMT" are not supported",
>                     netdev_get_name(netdev), UUID_ARGS((struct uuid *) ufid));
>          goto out;
>      }
>  
> -    flow = netdev_offload_dpdk_actions(patterns.physdev, &patterns, nl_actions,
> -                                       actions_len);
> +    flow = netdev_offload_dpdk_actions(offload, patterns.physdev, &patterns,
> +                                       nl_actions, actions_len);
>      if (!flow && !netdev_vport_is_vport_class(netdev->netdev_class)) {
>          /* If we failed to offload the rule actions fallback to MARK+RSS
>           * actions.
>           */
>          flow = netdev_offload_dpdk_mark_rss(&patterns, netdev,
> -                                            info->flow_mark);
> +                                            flow_mark);
>          actions_offloaded = false;
>      }
>  
> @@ -2544,7 +2594,8 @@ netdev_offload_dpdk_add_flow(struct netdev *netdev,
>          goto out;
>      }
>      flows_data = ufid_to_rte_flow_associate(ufid, netdev, patterns.physdev,
> -                                            flow, actions_offloaded);
> +                                            flow, actions_offloaded,
> +                                            flow_mark, pmd_mapping);
>      VLOG_DBG("%s/%s: installed flow %p by ufid "UUID_FMT,
>               netdev_get_name(netdev), netdev_get_name(patterns.physdev), flow,
>               UUID_ARGS((struct uuid *) ufid));

[ ... ]

> @@ -2734,21 +2784,54 @@ netdev_offload_dpdk_flow_put(struct dpif_offload_dpdk *offload,
>          dpif_netdev_offload_ports_traverse(get_netdev_odp_cb, &aux);
>          orig_in_port = aux.odp_port;
>          old_stats = rte_flow_data->stats;
>          modification = true;
> -        ret = netdev_offload_dpdk_flow_destroy(rte_flow_data);
> +        pmd_mapping = ovsrcu_get(struct pmd_data *,
> +                                 &rte_flow_data->pmd_mapping);
> +        ovsrcu_set(&rte_flow_data->pmd_mapping, NULL);
> +        flow_mark = rte_flow_data->flow_mark;
> +
> +        ret = netdev_offload_dpdk_flow_destroy(offload, rte_flow_data,
> +                                               false, true);
>          if (ret < 0) {
>              return ret;
>          }
> +    } else if (!rte_flow_data) {
> +        pmd_mapping = netdev_offload_dpdk_pmd_data_init();
> +        netdev_offload_dpdk_pmd_data_associate(pmd_mapping, pmd_id,
> +                                               flow_reference);
> +        *previous_flow_reference = NULL;
> +        flow_mark = dpif_offload_dpdk_allocate_flow_mark(offload);
> +    } else /* if (rte_flow_data) */ {
> +        pmd_mapping = ovsrcu_get(struct pmd_data *,
> +                                 &rte_flow_data->pmd_mapping);
> +
> +        netdev_offload_dpdk_pmd_data_associate(pmd_mapping, pmd_id,
> +                                               flow_reference);
> +        *previous_flow_reference = NULL;
> +    }

In the middle 'else if (!rte_flow_data)' branch, a fresh flow_mark is
allocated; what happens if the subsequent
netdev_offload_dpdk_add_flow() fails? Does the allocated flow_mark get
properly freed?

> +
> +    if (modification || !rte_flow_data) {
> +        rte_flow_data = netdev_offload_dpdk_add_flow(offload, pmd_mapping,
> +                                                     netdev, match,
> +                                                     actions, actions_len,
> +                                                     ufid, flow_mark,
> +                                                     orig_in_port);
> +        if (!rte_flow_data) {
> +            /* Clean up existing mappings, except for the current pmd_id one,
> +             * as this is handled through the callback. */
> +            netdev_offload_dpdk_pmd_data_disassociate(pmd_mapping, pmd_id);
> +            netdev_offload_dpdk_pmd_data_cleanup_mappings(offload,
> +                                                          pmd_mapping);
> +            ovsrcu_postpone(netdev_offload_dpdk_pmd_data_cleanup_mapping,
> +                            pmd_mapping);
> +            dpif_offload_dpdk_free_flow_mark(offload, flow_mark);
> +            return -1;
> +        }
> +    }

Good, the flow_mark cleanup is handled in the error path.

[ ... ]

> diff --git a/lib/netdev-offload-dpdk.h b/lib/netdev-offload-dpdk.h

[ ... ]

> -int netdev_offload_dpdk_init(struct netdev *);
> +int netdev_offload_dpdk_init(struct netdev *,
> +                             unsigned int offload_thread_count);
>  void netdev_offload_dpdk_uninit(struct netdev *);
> -int netdev_offload_dpdk_flow_flush(struct netdev *);
> -uint64_t netdev_offload_dpdk_flow_get_n_offloaded(struct netdev *);
> -int netdev_offload_dpdk_hw_miss_packet_recover(struct netdev *,
> -                                               struct dp_packet *);
> -#ifdef DPDK_NETDEV
> -int netdev_offload_dpdk_flow_put(struct netdev *, struct match *,
> +int netdev_offload_dpdk_flow_flush(struct dpif_offload_dpdk *,
> +                                   struct netdev *);
> +uint64_t netdev_offload_dpdk_flow_get_n_offloaded(
> +    struct netdev *, unsigned int offload_thread_count);
> +uint64_t netdev_offload_dpdk_flow_get_n_offloaded_by_thread(
> +    struct netdev *, unsigned int tid);
> +int netdev_offload_dpdk_hw_miss_packet_recover(struct dpif_offload_dpdk *,
> +                                               struct netdev *,
> +                                               unsigned pmd_id,
> +                                               struct dp_packet *,
> +                                               void **flow_reference);
> +int netdev_offload_dpdk_flow_put(struct dpif_offload_dpdk *,
> +                                 unsigned pmd_id, void *flow_reference,
> +                                 struct netdev *, struct match *,
>                                   struct nlattr *actions, size_t actions_len,
>                                   const ovs_u128 *ufid,
> -                                 struct dpif_netdev_offload_info *,
> +                                 odp_port_t orig_in_port,
> +                                 void **previous_flow_reference,
>                                   struct dpif_flow_stats *);
> -int netdev_offload_dpdk_flow_del(struct netdev *, const ovs_u128 *ufid,
> +int netdev_offload_dpdk_flow_del(struct dpif_offload_dpdk *, struct netdev *,
> +                                 unsigned pmd_id, const ovs_u128 *ufid,
> +                                 void *flow_reference,
>                                   struct dpif_flow_stats *);
>  int netdev_offload_dpdk_flow_get(struct netdev *, struct match *,
>                                  struct nlattr **actions, const ovs_u128 *ufid,
>                                   struct dpif_flow_stats *,
>                                  struct dpif_flow_attrs *, struct ofpbuf *buf);

The API changes look comprehensive. Since these are internal headers,
the main question is whether every existing caller has been updated to
the new signatures in this patch; are any users of the old prototypes
left behind?

> -#else

The removal of the #ifdef DPDK_NETDEV block and associated stub functions 
seems correct for this abstraction.

Overall, this is a substantial refactoring that appears to properly
separate concerns, though there are several potential concurrency and
error-handling issues to consider, as noted above.