Most DPDK components assume that rte_lcore_id() returns a valid
lcore_id in the [0, RTE_MAX_LCORE) range (with the exception of the
LCORE_ID_ANY special value).
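For example (an illustrative snippet with a hypothetical 'pkt_stats'
array, not an excerpt from DPDK), per-lcore data is commonly indexed
directly with the lcore id:

    static struct pkt_stats stats[RTE_MAX_LCORE];

    stats[rte_lcore_id()].rx_packets++;  /* Out-of-bounds access if the
                                          * per-thread _lcore_id holds
                                          * an arbitrary CPU number >=
                                          * RTE_MAX_LCORE. */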
OVS does not currently check which value is set in
RTE_PER_LCORE(_lcore_id), which exposes us to potential crashes on the
DPDK side.

Introduce an lcore allocator in OVS for PMD threads and map them to
unused lcores from DPDK, as the --lcores EAL option does.

The physical cores on which the PMD threads are running are still
important information when debugging, so keep them in the PMD thread
names, but add a new debug log when starting the threads.

The mapping between OVS threads and DPDK lcores can be dumped with a
new dpdk/dump-lcores command.

Synchronize DPDK internals on NUMA and cpuset for the PMD threads by
registering them via the rte_thread_set_affinity() helper.

Signed-off-by: David Marchand <david.march...@redhat.com>
---
Changelog since v1:
- rewired the existing 'dpdk-lcore-mask' configuration to use --lcores,
- switched to a bitmap to track lcores,
- added a command to dump the current mapping (Flavio); used an
  experimental API to get the DPDK lcores' cpuset, since it is the most
  reliable/portable information,
- used the same code for the logs when starting DPDK/PMD threads,
- addressed Ilya's comments,

---
 lib/dpdk-stub.c   |   8 ++-
 lib/dpdk.c        | 163 ++++++++++++++++++++++++++++++++++++++++++++--
 lib/dpdk.h        |   5 +-
 lib/dpif-netdev.c |   3 +-
 4 files changed, 170 insertions(+), 9 deletions(-)
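As an illustration of the rewired 'dpdk-lcore-mask' handling below
(hypothetical mask value; the enumeration order of the cores may vary),
a mask selecting physical cores 4 and 7 is no longer passed through as
'-c 0x90' but is remapped onto consecutive lcore ids:

    --lcores 0@4,1@7

The mapping can then be inspected at runtime with the new unixctl
command. The output below is only a sketch: the line format is the one
produced by dpdk_dump_lcore(), but the lcore/core numbers depend on the
actual setup, and a PMD thread is reported as "OVS" since it is not an
EAL thread:

    $ ovs-appctl dpdk/dump-lcores
    lcore0 (DPDK) is running on core 4
    lcore1 (DPDK) is running on core 7
    lcore2 (OVS) is running on core 8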
diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
index c332c217c..90473bc8e 100644
--- a/lib/dpdk-stub.c
+++ b/lib/dpdk-stub.c
@@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config)
 }
 
 void
-dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
+dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
+{
+    /* Nothing */
+}
+
+void
+dpdk_uninit_thread_context(void)
 {
     /* Nothing */
 }
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 37ea2973c..0173366a0 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -30,6 +30,7 @@
 #include <rte_pdump.h>
 #endif
 
+#include "bitmap.h"
 #include "dirs.h"
 #include "fatal-signal.h"
 #include "netdev-dpdk.h"
@@ -39,6 +40,7 @@
 #include "ovs-numa.h"
 #include "smap.h"
 #include "svec.h"
+#include "unixctl.h"
 #include "util.h"
 #include "vswitch-idl.h"
 
@@ -54,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization
                                        * of DPDK. */
 static bool per_port_memory = false; /* Status of per port memory support */
 
+static struct ovs_mutex lcore_bitmap_mutex = OVS_MUTEX_INITIALIZER;
+static unsigned long *lcore_bitmap OVS_GUARDED_BY(lcore_bitmap_mutex);
+
 static int
 process_vhost_flags(char *flag, const char *default_val, int size,
                     const struct smap *ovs_other_config,
@@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value)
     return false;
 }
 
+static void
+construct_dpdk_lcore_option(const struct smap *ovs_other_config,
+                            struct svec *args)
+{
+    const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask");
+    struct svec lcores = SVEC_EMPTY_INITIALIZER;
+    struct ovs_numa_info_core *core;
+    struct ovs_numa_dump *cores;
+    int index = 0;
+
+    if (!cmask) {
+        return;
+    }
+    if (args_contains(args, "-c") || args_contains(args, "-l") ||
+        args_contains(args, "--lcores")) {
+        VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' "
+                  "due to dpdk-extra config");
+        return;
+    }
+
+    cores = ovs_numa_dump_cores_with_cmask(cmask);
+    FOR_EACH_CORE_ON_DUMP(core, cores) {
+        svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id));
+        index++;
+    }
+    svec_terminate(&lcores);
+    ovs_numa_dump_destroy(cores);
+    svec_add(args, "--lcores");
+    svec_add_nocopy(args, svec_join(&lcores, ",", ""));
+    svec_destroy(&lcores);
+}
+
 static void
 construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
 {
@@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
         bool default_enabled;
         const char *default_value;
     } opts[] = {
-        {"dpdk-lcore-mask",   "-c",             false, NULL},
         {"dpdk-hugepage-dir", "--huge-dir",     false, NULL},
         {"dpdk-socket-limit", "--socket-limit", false, NULL},
     };
@@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args)
         svec_parse_words(args, extra_configuration);
     }
 
+    construct_dpdk_lcore_option(ovs_other_config, args);
     construct_dpdk_options(ovs_other_config, args);
     construct_dpdk_mutex_options(ovs_other_config, args);
 }
@@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = {
     .write = dpdk_log_write,
 };
 
+static void
+dpdk_dump_lcore(struct ds *ds, unsigned lcore)
+{
+    struct svec cores = SVEC_EMPTY_INITIALIZER;
+    rte_cpuset_t cpuset;
+    unsigned cpu;
+
+    cpuset = rte_lcore_cpuset(lcore);
+    for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+        if (!CPU_ISSET(cpu, &cpuset)) {
+            continue;
+        }
+        svec_add_nocopy(&cores, xasprintf("%u", cpu));
+    }
+    svec_terminate(&cores);
+    ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore,
+                  rte_eal_lcore_role(lcore) != ROLE_OFF ? "DPDK" : "OVS",
+                  svec_join(&cores, ",", ""));
+    svec_destroy(&cores);
+}
+
+static void
+dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[],
+                 void *aux OVS_UNUSED)
+{
+    struct ds ds = DS_EMPTY_INITIALIZER;
+    unsigned lcore;
+
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    if (lcore_bitmap == NULL) {
+        unixctl_command_reply_error(conn, "DPDK has not been initialised");
+        goto out;
+    }
+    if (argc > 1) {
+        if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE ||
+            !bitmap_is_set(lcore_bitmap, lcore)) {
+            unixctl_command_reply_error(conn, "incorrect lcoreid");
+            goto out;
+        }
+        dpdk_dump_lcore(&ds, lcore);
+    } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
+        if (!bitmap_is_set(lcore_bitmap, lcore)) {
+            continue;
+        }
+        dpdk_dump_lcore(&ds, lcore);
+    }
+    unixctl_command_reply(conn, ds_cstr(&ds));
+    ds_destroy(&ds);
+out:
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
+}
+
 static bool
 dpdk_init__(const struct smap *ovs_other_config)
 {
@@ -345,7 +434,8 @@ dpdk_init__(const struct smap *ovs_other_config)
         }
     }
 
-    if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
+    if (args_contains(&args, "-c") || args_contains(&args, "-l") ||
+        args_contains(&args, "--lcores")) {
         auto_determine = false;
     }
 
@@ -371,8 +461,8 @@
              * thread affintity - default to core #0 */
             VLOG_ERR("Thread getaffinity failed. Using core #0");
         }
-        svec_add(&args, "-l");
-        svec_add_nocopy(&args, xasprintf("%d", cpu));
+        svec_add(&args, "--lcores");
+        svec_add_nocopy(&args, xasprintf("0@%d", cpu));
     }
     svec_terminate(&args);
 
@@ -428,6 +518,24 @@
         }
     }
 
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE);
+    /* Mark DPDK threads. */
+    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
+        struct ds ds = DS_EMPTY_INITIALIZER;
+
+        if (rte_eal_lcore_role(lcore) == ROLE_OFF) {
+            continue;
+        }
+        bitmap_set1(lcore_bitmap, lcore);
+        dpdk_dump_lcore(&ds, lcore);
+        VLOG_INFO("%s", ds_cstr(&ds));
+        ds_destroy(&ds);
+    }
+    unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1,
+                             dpdk_dump_lcores, NULL);
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
+
     /* We are called from the main thread here */
     RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
 
@@ -512,11 +620,54 @@ dpdk_available(void)
 }
 
 void
-dpdk_set_lcore_id(unsigned cpu)
+dpdk_init_thread_context(unsigned cpu)
 {
+    struct ds ds = DS_EMPTY_INITIALIZER;
+    rte_cpuset_t cpuset;
+    unsigned lcore;
+
     /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
     ovs_assert(cpu != NON_PMD_CORE_ID);
-    RTE_PER_LCORE(_lcore_id) = cpu;
+
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    if (lcore_bitmap == NULL) {
+        lcore = NON_PMD_CORE_ID;
+    } else {
+        lcore = bitmap_scan(lcore_bitmap, 0, 0, RTE_MAX_LCORE);
+        if (lcore == RTE_MAX_LCORE) {
+            VLOG_WARN("Reached maximum number of DPDK lcores, core %u will "
+                      "have lower performance", cpu);
+            lcore = NON_PMD_CORE_ID;
+        } else {
+            bitmap_set1(lcore_bitmap, lcore);
+        }
+    }
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
+
+    RTE_PER_LCORE(_lcore_id) = lcore;
+
+    if (lcore == NON_PMD_CORE_ID) {
+        return;
+    }
+
+    CPU_ZERO(&cpuset);
+    CPU_SET(cpu, &cpuset);
+    rte_thread_set_affinity(&cpuset);
+    dpdk_dump_lcore(&ds, lcore);
+    VLOG_INFO("%s", ds_cstr(&ds));
+    ds_destroy(&ds);
+}
+
+void
+dpdk_uninit_thread_context(void)
+{
+    if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) {
+        return;
+    }
+
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    bitmap_set0(lcore_bitmap, RTE_PER_LCORE(_lcore_id));
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
 }
 
 void
diff --git a/lib/dpdk.h b/lib/dpdk.h
index 736a64279..eab7a926b 100644
--- a/lib/dpdk.h
+++ b/lib/dpdk.h
@@ -22,7 +22,9 @@
 
 #ifdef DPDK_NETDEV
 #include <rte_config.h>
+#define ALLOW_EXPERIMENTAL_API
 #include <rte_lcore.h>
+#undef ALLOW_EXPERIMENTAL_API
 
 #define NON_PMD_CORE_ID LCORE_ID_ANY
 
@@ -36,7 +38,8 @@ struct smap;
 struct ovsrec_open_vswitch;
 
 void dpdk_init(const struct smap *ovs_other_config);
-void dpdk_set_lcore_id(unsigned cpu);
+void dpdk_init_thread_context(unsigned cpu);
+void dpdk_uninit_thread_context(void);
 const char *dpdk_get_vhost_sock_dir(void);
 bool dpdk_vhost_iommu_enabled(void);
 bool dpdk_vhost_postcopy_enabled(void);
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 7ebf4ef87..91e98919c 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -5470,7 +5470,7 @@ pmd_thread_main(void *f_)
     /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
     ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
     ovs_numa_thread_setaffinity_core(pmd->core_id);
-    dpdk_set_lcore_id(pmd->core_id);
+    dpdk_init_thread_context(pmd->core_id);
     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
     dfc_cache_init(&pmd->flow_cache);
     pmd_alloc_static_tx_qid(pmd);
@@ -5590,6 +5590,7 @@ reload:
     dfc_cache_uninit(&pmd->flow_cache);
     free(poll_list);
     pmd_free_cached_ports(pmd);
+    dpdk_uninit_thread_context();
 
     return NULL;
 }
-- 
2.23.0