This patch adds support for provider-specific PMD thread initialization,
deinitialization, and a work callback that is executed as part of the
PMD thread loop. This allows hardware offload providers to handle any
provider-specific asynchronous or batching work.
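To illustrate the intended use of the new hook, the sketch below shows
how a provider could hand back a per-thread callback and context. It is
hypothetical: only the dpif_offload_pmd_thread_work_cb typedef and the
pmd_thread_lifecycle signature come from this patch; the 'foo' provider
and its queue helpers are made up for the example.

    static void
    foo_pmd_thread_work_cb(unsigned core_id OVS_UNUSED,
                           int numa_id OVS_UNUSED, void *ctx)
    {
        /* Per-iteration work, e.g., drain offload requests queued for
         * this PMD thread. */
        foo_flush_pending_offloads(ctx);
    }

    static void
    foo_pmd_thread_lifecycle(const struct dpif_offload *offload OVS_UNUSED,
                             bool exit, unsigned core_id OVS_UNUSED,
                             int numa_id,
                             dpif_offload_pmd_thread_work_cb **callback,
                             void **ctx)
    {
        if (exit) {
            /* PMD thread is terminating; release our resources. */
            foo_pmd_queue_destroy(*ctx);
        } else {
            /* First call or reload; reuse the context if it exists. */
            if (!*ctx) {
                *ctx = foo_pmd_queue_create(numa_id);
            }
            *callback = foo_pmd_thread_work_cb;
        }
    }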
This patch also adds cycle statistics for the provider-specific
callbacks to the 'ovs-appctl dpif-netdev/pmd-perf-show' command.

Signed-off-by: Eelco Chaudron <[email protected]>
---
Note that this patch builds on top of the hardware offload rework
patch set, which can be found here:

  https://patchwork.ozlabs.org/project/openvswitch/list/?series=484144

v2: - Added offload cycle statistics to the
      'ovs-appctl dpif-netdev/pmd-perf-show' command.
---
 lib/dpif-netdev-perf.c      |  19 ++++-
 lib/dpif-netdev-perf.h      |   3 +-
 lib/dpif-netdev.c           |  22 ++++--
 lib/dpif-offload-dummy.c    |  40 +++++++++++
 lib/dpif-offload-provider.h |  26 +++++++
 lib/dpif-offload.c          | 134 ++++++++++++++++++++++++++++++++++++
 lib/dpif-offload.h          |  11 +++
 tests/pmd.at                |  32 +++++++++
 8 files changed, 279 insertions(+), 8 deletions(-)

diff --git a/lib/dpif-netdev-perf.c b/lib/dpif-netdev-perf.c
index 1cd4ee084..39465ba81 100644
--- a/lib/dpif-netdev-perf.c
+++ b/lib/dpif-netdev-perf.c
@@ -232,6 +232,7 @@ pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s,
     uint64_t busy_iter = tot_iter >= idle_iter ? tot_iter - idle_iter : 0;
     uint64_t sleep_iter = stats[PMD_SLEEP_ITER];
     uint64_t tot_sleep_cycles = stats[PMD_CYCLES_SLEEP];
+    uint64_t offload_cycles = stats[PMD_CYCLES_OFFLOAD];
 
     ds_put_format(str,
             "  Iterations:        %12"PRIu64"  (%.2f us/it)\n"
@@ -242,7 +243,8 @@ pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s,
             "  Sleep time (us):   %12.0f  (%3.0f us/iteration avg.)\n",
             tot_iter, tot_iter
-            ? (tot_cycles + tot_sleep_cycles) * us_per_cycle / tot_iter
+            ? (tot_cycles + tot_sleep_cycles + offload_cycles)
+              * us_per_cycle / tot_iter
             : 0,
             tot_cycles, 100.0 * (tot_cycles / duration) / tsc_hz,
             idle_iter,
@@ -252,6 +254,13 @@ pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s,
             sleep_iter, tot_iter ? 100.0 * sleep_iter / tot_iter : 0,
             tot_sleep_cycles * us_per_cycle,
             sleep_iter ? (tot_sleep_cycles * us_per_cycle) / sleep_iter : 0);
+    if (offload_cycles > 0) {
+        ds_put_format(str,
+            "  Offload cycles:    %12" PRIu64 "  (%5.1f %% of used cycles)\n",
+            offload_cycles,
+            100.0 * offload_cycles / (tot_cycles + tot_sleep_cycles
+                                      + offload_cycles));
+    }
     if (rx_packets > 0) {
         ds_put_format(str,
             "  Rx packets:        %12"PRIu64"  (%.0f Kpps, %.0f cycles/pkt)\n"
@@ -532,14 +541,14 @@ OVS_REQUIRES(s->stats_mutex)
 void
 pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets,
                        int tx_packets, uint64_t sleep_cycles,
-                       bool full_metrics)
+                       uint64_t offload_cycles, bool full_metrics)
 {
     uint64_t now_tsc = cycles_counter_update(s);
     struct iter_stats *cum_ms;
     uint64_t cycles, cycles_per_pkt = 0;
     char *reason = NULL;
 
-    cycles = now_tsc - s->start_tsc - sleep_cycles;
+    cycles = now_tsc - s->start_tsc - sleep_cycles - offload_cycles;
     s->current.timestamp = s->iteration_cnt;
     s->current.cycles = cycles;
     s->current.pkts = rx_packets;
@@ -558,6 +567,10 @@ pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets,
         pmd_perf_update_counter(s, PMD_CYCLES_SLEEP, sleep_cycles);
     }
 
+    if (offload_cycles) {
+        pmd_perf_update_counter(s, PMD_CYCLES_OFFLOAD, offload_cycles);
+    }
+
     if (!full_metrics) {
         return;
     }
diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h
index 84beced15..2a055dacd 100644
--- a/lib/dpif-netdev-perf.h
+++ b/lib/dpif-netdev-perf.h
@@ -82,6 +82,7 @@ enum pmd_stat_type {
     PMD_CYCLES_UPCALL,      /* Cycles spent processing upcalls. */
     PMD_SLEEP_ITER,         /* Iterations where a sleep has taken place. */
     PMD_CYCLES_SLEEP,       /* Total cycles slept to save power. */
+    PMD_CYCLES_OFFLOAD,     /* Total cycles spent handling offload. */
     PMD_N_STATS
 };
 
@@ -411,7 +412,7 @@ pmd_perf_start_iteration(struct pmd_perf_stats *s);
 void
 pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets,
                        int tx_packets, uint64_t sleep_cycles,
-                       bool full_metrics);
+                       uint64_t offload_cycles, bool full_metrics);
 
 /* Formatting the output of commands. */
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 3a8802cf3..0038f93d4 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -6513,6 +6513,7 @@ pmd_thread_main(void *f_)
 {
     struct dp_netdev_pmd_thread *pmd = f_;
     struct pmd_perf_stats *s = &pmd->perf_stats;
+    struct dpif_offload_pmd_ctx *offload_ctx = NULL;
     unsigned int lc = 0;
     struct polled_queue *poll_list;
     bool wait_for_reload = false;
@@ -6546,6 +6547,9 @@ reload:
         dpdk_attached = dpdk_attach_thread(pmd->core_id);
     }
 
+    dpif_offload_pmd_thread_reload(pmd->dp->full_name, pmd->core_id,
+                                   pmd->numa_id, &offload_ctx);
+
     /* List port/core affinity */
     for (i = 0; i < poll_cnt; i++) {
        VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
@@ -6585,7 +6589,7 @@ reload:
     ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
     for (;;) {
         uint64_t rx_packets = 0, tx_packets = 0;
-        uint64_t time_slept = 0;
+        uint64_t time_slept = 0, offload_cycles = 0;
         uint64_t max_sleep;
 
         pmd_perf_start_iteration(s);
@@ -6625,6 +6629,10 @@ reload:
                                        ? true : false);
         }
 
+        /* Do work required by any of the hardware offload providers. */
+        offload_cycles = dpif_offload_pmd_thread_do_work(offload_ctx,
+                                                         &pmd->perf_stats);
+
         if (max_sleep) {
             /* Check if a sleep should happen on this iteration. */
             if (sleep_time) {
@@ -6684,7 +6692,7 @@ reload:
         }
 
         pmd_perf_end_iteration(s, rx_packets, tx_packets, time_slept,
-                               pmd_perf_metrics_enabled(pmd));
+                               offload_cycles, pmd_perf_metrics_enabled(pmd));
     }
     ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
 
@@ -6705,6 +6713,7 @@ reload:
         goto reload;
     }
 
+    dpif_offload_pmd_thread_exit(offload_ctx);
     pmd_free_static_tx_qid(pmd);
     dfc_cache_uninit(&pmd->flow_cache);
     free(poll_list);
@@ -9629,7 +9638,7 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
                            struct polled_queue *poll_list, int poll_cnt)
 {
     struct dpcls *cls;
-    uint64_t tot_idle = 0, tot_proc = 0, tot_sleep = 0;
+    uint64_t tot_idle = 0, tot_proc = 0, tot_sleep = 0, tot_offload = 0;
     unsigned int pmd_load = 0;
 
     if (pmd->ctx.now > pmd->next_cycle_store) {
@@ -9648,11 +9657,14 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
                    pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
         tot_sleep = pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP] -
                     pmd->prev_stats[PMD_CYCLES_SLEEP];
+        tot_offload = pmd->perf_stats.counters.n[PMD_CYCLES_OFFLOAD] -
+                      pmd->prev_stats[PMD_CYCLES_OFFLOAD];
 
         if (pmd_alb->is_enabled && !pmd->isolated) {
             if (tot_proc) {
                 pmd_load = ((tot_proc * 100) /
-                            (tot_idle + tot_proc + tot_sleep));
+                            (tot_idle + tot_proc + tot_sleep +
+                             tot_offload));
             }
 
             atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
@@ -9671,6 +9683,8 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
                     pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
         pmd->prev_stats[PMD_CYCLES_SLEEP] =
                     pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP];
+        pmd->prev_stats[PMD_CYCLES_OFFLOAD] =
+                    pmd->perf_stats.counters.n[PMD_CYCLES_OFFLOAD];
 
         /* Get the cycles that were used to process each queue and store. */
         for (unsigned i = 0; i < poll_cnt; i++) {
diff --git a/lib/dpif-offload-dummy.c b/lib/dpif-offload-dummy.c
index d7be35c5e..58a579206 100644
--- a/lib/dpif-offload-dummy.c
+++ b/lib/dpif-offload-dummy.c
@@ -17,6 +17,7 @@
 #include <config.h>
 #include <errno.h>
 
+#include "coverage.h"
 #include "dpif.h"
 #include "dpif-offload.h"
 #include "dpif-offload-provider.h"
@@ -33,6 +34,8 @@
 
 VLOG_DEFINE_THIS_MODULE(dpif_offload_dummy);
 
+COVERAGE_DEFINE(dummy_offload_do_work);
+
 struct pmd_id_data {
     struct hmap_node node;
     void *flow_reference;
@@ -849,6 +852,41 @@ dpif_offload_dummy_netdev_simulate_offload(struct netdev *netdev,
     ovs_mutex_unlock(&port->port_mutex);
 }
 
+static void
+dpif_offload_dummy_pmd_thread_work_cb(unsigned core_id OVS_UNUSED,
+                                      int numa_id OVS_UNUSED,
+                                      void *ctx OVS_UNUSED)
+{
+    COVERAGE_INC(dummy_offload_do_work);
+}
+
+static void
+dpif_offload_dummy_pmd_thread_lifecycle(
+    const struct dpif_offload *dpif_offload, bool exit, unsigned core_id,
+    int numa_id, dpif_offload_pmd_thread_work_cb **callback, void **ctx)
+{
+    /* Only do this for the 'dummy' class, not for 'dummy_x'. */
+    if (strcmp(dpif_offload_class_type(dpif_offload), "dummy")) {
+        *callback = NULL;
+        *ctx = NULL;
+        return;
+    }
+
+    VLOG_DBG(
+        "pmd_thread_lifecycle; exit=%s, core=%u, numa=%d, cb=%p, ctx=%p",
+        exit ? "true" : "false", core_id, numa_id, *callback, *ctx);
+
+    ovs_assert(!*callback
+               || *callback == dpif_offload_dummy_pmd_thread_work_cb);
+
+    if (exit) {
+        free(*ctx);
+    } else {
+        *ctx = *ctx ? *ctx : xstrdup("DUMMY_OFFLOAD_WORK");
+        *callback = dpif_offload_dummy_pmd_thread_work_cb;
+    }
+}
+
 #define DEFINE_DPIF_DUMMY_CLASS(NAME, TYPE_STR)                      \
     struct dpif_offload_class NAME = {                               \
         .type = TYPE_STR,                                            \
@@ -873,6 +911,8 @@ dpif_offload_dummy_netdev_simulate_offload(struct netdev *netdev,
         .netdev_flow_stats = dpif_offload_dummy_netdev_flow_stats,   \
         .register_flow_unreference_cb =                              \
             dpif_offload_dummy_register_flow_unreference_cb,         \
+        .pmd_thread_lifecycle =                                      \
+            dpif_offload_dummy_pmd_thread_lifecycle,                 \
     }
 
 DEFINE_DPIF_DUMMY_CLASS(dpif_offload_dummy_class, "dummy");
diff --git a/lib/dpif-offload-provider.h b/lib/dpif-offload-provider.h
index 46fbd68ac..b0a0ede00 100644
--- a/lib/dpif-offload-provider.h
+++ b/lib/dpif-offload-provider.h
@@ -87,6 +87,10 @@ dpif_offload_flow_dump_thread_init(
 }
 
 
+/* Offload Provider specific PMD thread work callback definition. */
+typedef void dpif_offload_pmd_thread_work_cb(unsigned core_id, int numa_id,
+                                             void *ctx);
+
 struct dpif_offload_class {
     /* Type of DPIF offload provider in this class, e.g., "tc", "dpdk",
      * "dummy", etc. */
@@ -333,6 +337,28 @@ struct dpif_offload_class {
      * to netdev_flow_put() is no longer held by the offload provider. */
     void (*register_flow_unreference_cb)(const struct dpif_offload *,
                                          dpif_offload_flow_unreference_cb *);
+
+
+    /* The API below is specific to PMD (userspace) thread lifecycle handling.
+     *
+     * This API allows a provider to supply a callback function
+     * (via `*callback`) and an optional context pointer (via `*ctx`) for a
+     * PMD thread.
+     *
+     * The lifecycle hook may be invoked multiple times for the same PMD
+     * thread. For example, when the thread is reinitialized, this function
+     * will be called again and the previous `callback` and `ctx` values will
+     * be passed back in. It is the provider's responsibility to decide
+     * whether those should be reused, replaced, or cleaned up before storing
+     * new values.
+     *
+     * When the PMD thread is terminating, this API is called with
+     * `exit == true`. At that point, the provider must release any resources
+     * associated with the previously returned `callback` and `ctx`. */
+    void (*pmd_thread_lifecycle)(const struct dpif_offload *, bool exit,
+                                 unsigned core_id, int numa_id,
+                                 dpif_offload_pmd_thread_work_cb **callback,
+                                 void **ctx);
 };
 
 extern struct dpif_offload_class dpif_offload_dummy_class;
diff --git a/lib/dpif-offload.c b/lib/dpif-offload.c
index 68fc1c56e..d2c07b41c 100644
--- a/lib/dpif-offload.c
+++ b/lib/dpif-offload.c
@@ -17,6 +17,7 @@
 #include <config.h>
 #include <errno.h>
 
+#include "dpif-netdev-perf.h"
 #include "dpif-offload.h"
 #include "dpif-offload-provider.h"
 #include "dpif-provider.h"
@@ -54,6 +55,7 @@ static const struct dpif_offload_class *base_dpif_offload_classes[] = {
     &dpif_offload_dummy_x_class,
 };
 
+#define TOTAL_PROVIDERS ARRAY_SIZE(base_dpif_offload_classes)
 #define DEFAULT_PROVIDER_PRIORITY_LIST "tc,dpdk,dummy,dummy_x"
 
 static char *dpif_offload_provider_priority_list = NULL;
@@ -1788,3 +1790,135 @@ dpif_offload_port_mgr_port_dump_done(
     free(state);
     return 0;
 }
+
+struct dpif_offload_pmd_ctx_node {
+    const struct dpif_offload *offload;
+    dpif_offload_pmd_thread_work_cb *callback;
+    void *provider_ctx;
+};
+
+struct dpif_offload_pmd_ctx {
+    unsigned core_id;
+    int numa_id;
+    size_t n_nodes;
+    struct dpif_offload_pmd_ctx_node nodes[TOTAL_PROVIDERS];
+    struct dpif_offload_pmd_ctx_node old_nodes[TOTAL_PROVIDERS];
+};
+
+void
+dpif_offload_pmd_thread_reload(const char *dpif_name, unsigned core_id,
+                               int numa_id, struct dpif_offload_pmd_ctx **ctx_)
+{
+    struct dpif_offload_pmd_ctx *ctx;
+    struct dp_offload *dp_offload;
+    struct dpif_offload *offload;
+    size_t old_n_nodes = 0;
+
+    if (!dpif_offload_is_offload_enabled()) {
+        ovs_assert(!*ctx_);
+        return;
+    }
+
+    ovs_mutex_lock(&dpif_offload_mutex);
+    dp_offload = shash_find_data(&dpif_offload_providers, dpif_name);
+    ovs_mutex_unlock(&dpif_offload_mutex);
+
+    if (OVS_UNLIKELY(!dp_offload)) {
+        ovs_assert(!*ctx_);
+        return;
+    }
+
+    if (!*ctx_) {
+        /* Would be nice if we had a NUMA-specific xzalloc(). */
+        ctx = xzalloc(sizeof *ctx);
+        ctx->core_id = core_id;
+        ctx->numa_id = numa_id;
+        *ctx_ = ctx;
+    } else {
+        ctx = *ctx_;
+        old_n_nodes = ctx->n_nodes;
+
+        if (old_n_nodes) {
+            memcpy(ctx->old_nodes, ctx->nodes,
+                   old_n_nodes * sizeof ctx->old_nodes[0]);
+        }
+
+        /* Reset active nodes array. */
+        memset(ctx->nodes, 0, sizeof ctx->nodes);
+        ctx->n_nodes = 0;
+    }
+
+    LIST_FOR_EACH (offload, dpif_list_node, &dp_offload->offload_providers) {
+
+        ovs_assert(ctx->n_nodes < TOTAL_PROVIDERS);
+
+        if (!offload->class->pmd_thread_lifecycle) {
+            continue;
+        }
+
+        if (old_n_nodes) {
+            /* If this is a reload, try to find previous callback and ctx. */
+            for (size_t i = 0; i < old_n_nodes; i++) {
+                struct dpif_offload_pmd_ctx_node *node = &ctx->old_nodes[i];
+
+                if (offload == node->offload) {
+                    ctx->nodes[ctx->n_nodes].callback = node->callback;
+                    ctx->nodes[ctx->n_nodes].provider_ctx = node->provider_ctx;
+                    break;
+                }
+            }
+        }
+
+        offload->class->pmd_thread_lifecycle(
+            offload, false, core_id, numa_id,
+            &ctx->nodes[ctx->n_nodes].callback,
+            &ctx->nodes[ctx->n_nodes].provider_ctx);
+
+        if (ctx->nodes[ctx->n_nodes].callback) {
+            ctx->nodes[ctx->n_nodes].offload = offload;
+            ctx->n_nodes++;
+        } else {
+            memset(&ctx->nodes[ctx->n_nodes], 0,
+                   sizeof ctx->nodes[ctx->n_nodes]);
+        }
+    }
+}
+
+uint64_t
+dpif_offload_pmd_thread_do_work(struct dpif_offload_pmd_ctx *ctx,
+                                struct pmd_perf_stats *stats)
+{
+    struct cycle_timer offload_work_timer;
+
+    if (!ctx || !ctx->n_nodes) {
+        return 0;
+    }
+
+    cycle_timer_start(stats, &offload_work_timer);
+
+    for (size_t i = 0; i < ctx->n_nodes; i++) {
+        ctx->nodes[i].callback(ctx->core_id, ctx->numa_id,
+                               ctx->nodes[i].provider_ctx);
+    }
+
+    return cycle_timer_stop(stats, &offload_work_timer);
+}
+
+void
+dpif_offload_pmd_thread_exit(struct dpif_offload_pmd_ctx *ctx)
+{
+    if (!ctx) {
+        return;
+    }
+
+    for (size_t i = 0; i < ctx->n_nodes; i++) {
+        struct dpif_offload_pmd_ctx_node *node = &ctx->nodes[i];
+
+        node->offload->class->pmd_thread_lifecycle(node->offload, true,
+                                                   ctx->core_id, ctx->numa_id,
+                                                   &node->callback,
+                                                   &node->provider_ctx);
+    }
+
+    free(ctx);
+}
diff --git a/lib/dpif-offload.h b/lib/dpif-offload.h
index 0b52577f7..e2eecdf72 100644
--- a/lib/dpif-offload.h
+++ b/lib/dpif-offload.h
@@ -22,6 +22,7 @@
 /* Forward declarations of private structures. */
 struct dpif_offload_class;
 struct dpif_offload;
+struct pmd_perf_stats;
 
 /* Structure used by the dpif_offload_dump_* functions. */
 struct dpif_offload_dump {
@@ -218,4 +219,14 @@ static inline void dpif_offload_datapath_flow_op_continue(
     }
 }
 
+/* PMD Thread helper functions. */
+struct dpif_offload_pmd_ctx;
+
+void dpif_offload_pmd_thread_reload(const char *dpif_name,
+                                    unsigned core_id, int numa_id,
+                                    struct dpif_offload_pmd_ctx **ctx);
+uint64_t dpif_offload_pmd_thread_do_work(struct dpif_offload_pmd_ctx *ctx,
+                                         struct pmd_perf_stats *stats);
+void dpif_offload_pmd_thread_exit(struct dpif_offload_pmd_ctx *ctx);
+
 #endif /* DPIF_OFFLOAD_H */
diff --git a/tests/pmd.at b/tests/pmd.at
index 8254ac3b0..54184d8c9 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -1689,3 +1689,35 @@ recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.2.
 
 OVS_VSWITCHD_STOP
 AT_CLEANUP
+
+AT_SETUP([PMD - offload work])
+OVS_VSWITCHD_START([], [], [], [DUMMY_NUMA],
+                   [-- set Open_vSwitch . other_config:hw-offload=true])
+
+AT_CHECK([ovs-appctl vlog/set dpif_offload_dummy:dbg])
+AT_CHECK([ovs-vsctl add-port br0 p0 -- set Interface p0 type=dummy-pmd])
+
+CHECK_CPU_DISCOVERED()
+CHECK_PMD_THREADS_CREATED()
+
+OVS_WAIT_UNTIL(
+    [test $(ovs-appctl coverage/read-counter dummy_offload_do_work) -gt 0])
+
+AT_CHECK([ovs-appctl dpif-netdev/pmd-perf-show \
+    | grep -Eq 'Offload cycles: +[[0-9]]+ \( *[[0-9.]]+ % of used cycles\)'])
+
+OVS_VSWITCHD_STOP
+
+LOG="$(sed -n 's/.*\(pmd_thread_lifecycle.*\)/\1/p' ovs-vswitchd.log)"
+CB=$(echo "$LOG" | sed -n '2p' | sed -n 's/.*cb=\([[^,]]*\).*/\1/p')
+CTX=$(echo "$LOG" | sed -n '2p' | sed -n 's/.*ctx=\(.*\)$/\1/p')
+
+AT_CHECK([echo "$LOG" | sed -n '1p' | sed 's/(nil)/0x0/g'], [0], [dnl
+pmd_thread_lifecycle; exit=false, core=0, numa=0, cb=0x0, ctx=0x0
+])
+AT_CHECK([echo "$LOG" | sed -n '2p' \
+    | grep -q "exit=false, core=0, numa=0, cb=$CB, ctx=$CTX"])
+AT_CHECK([echo "$LOG" | sed -n '$p' \
+    | grep -q "exit=true, core=0, numa=0, cb=$CB, ctx=$CTX"])
+
+AT_CLEANUP
-- 
2.50.1
