On 22/06/2021 19:53, David Wilder wrote: > This change removes the assumption that numa nodes and cores are numbered > contiguously in linux. This change is required to support some Power > systems. > > A check has been added to verify that cores are online, > offline cores result in non-contiguously numbered cores. > > Dpdk EAL option generation is updated to work with non-contiguous numa nodes. > These options can be seen in the ovs-vswitchd.log. For example: > a system containing only numa nodes 0 and 8 will generate the following: > > EAL ARGS: ovs-vswitchd --socket-mem 1024,0,0,0,0,0,0,0,1024 \ > --socket-limit 1024,0,0,0,0,0,0,0,1024 -l 0 > > Tests for pmd and dpif-netdev have been updated to validate non-contiguous > numbered nodes. > > Signed-off-by: David Wilder <[email protected]> > --- > lib/dpdk.c | 57 +++++++++++++++++++++++++++++++++++------ > lib/ovs-numa.c | 51 ++++++++++++++++++++++++------------ > lib/ovs-numa.h | 2 ++ > tests/dpif-netdev.at | 2 +- > tests/pmd.at | 61 ++++++++++++++++++++++++++++++++++++++++---- > 5 files changed, 142 insertions(+), 31 deletions(-) > > diff --git a/lib/dpdk.c b/lib/dpdk.c > index 2eaaa569c..238d0fffb 100644 > --- a/lib/dpdk.c > +++ b/lib/dpdk.c > @@ -130,22 +130,63 @@ construct_dpdk_options(const struct smap > *ovs_other_config, struct svec *args) > } > } > > +static int > +compare_numa_node_list(const void *a_, const void *b_) > +{ > + int a = *(const int *) a_; > + int b = *(const int *) b_; > + > + if (a < b) { > + return -1; > + } > + if (a > b) { > + return 1; > + } > + return 0; > +} > + > static char * > construct_dpdk_socket_mem(void) > { > const char *def_value = "1024"; > - int numa, numa_nodes = ovs_numa_get_n_numas(); > + struct ovs_numa_dump *dump; > + const struct ovs_numa_info_numa *node; > + int k = 0, last_node = 0, n_numa_nodes, *numa_node_list;
I probably would have kept k and n_numa_nodes as size_t as they are used for handling around the hmap_count() return, but realistically it's not going to make any difference. i'm not sure it's worth another re-spin unless you have to do it anyway for other comments. Acked-by: Kevin Traynor <[email protected]> > struct ds dpdk_socket_mem = DS_EMPTY_INITIALIZER; > > - if (numa_nodes == 0 || numa_nodes == OVS_NUMA_UNSPEC) { > - numa_nodes = 1; > - } > + /* Build a list of all numa nodes with at least one core */ > + dump = ovs_numa_dump_n_cores_per_numa(1); > + n_numa_nodes = hmap_count(&dump->numas); > + numa_node_list = xcalloc(n_numa_nodes, sizeof *numa_node_list); > > - ds_put_cstr(&dpdk_socket_mem, def_value); > - for (numa = 1; numa < numa_nodes; ++numa) { > - ds_put_format(&dpdk_socket_mem, ",%s", def_value); > + FOR_EACH_NUMA_ON_DUMP(node, dump) { > + if (k >= n_numa_nodes) { > + break; > + } > + numa_node_list[k++] = node->numa_id; > } > - > + qsort(numa_node_list, k, sizeof *numa_node_list, compare_numa_node_list); > + > + for (int i = 0; i < n_numa_nodes; i++) { > + while (numa_node_list[i] > last_node && > + numa_node_list[i] != OVS_NUMA_UNSPEC && > + numa_node_list[i] <= MAX_NUMA_NODES){ > + if (last_node == 0) { > + ds_put_format(&dpdk_socket_mem, "%s", "0"); > + } else { > + ds_put_format(&dpdk_socket_mem, ",%s", "0"); > + } > + last_node++; > + } > + if (numa_node_list[i] == 0) { > + ds_put_format(&dpdk_socket_mem, "%s", def_value); > + } else { > + ds_put_format(&dpdk_socket_mem, ",%s", def_value); > + } > + last_node++; > + } > + free(numa_node_list); > + ovs_numa_dump_destroy(dump); > return ds_cstr(&dpdk_socket_mem); > } > > diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c > index 6d0a68522..b825ecbdd 100644 > --- a/lib/ovs-numa.c > +++ b/lib/ovs-numa.c > @@ -42,21 +42,22 @@ VLOG_DEFINE_THIS_MODULE(ovs_numa); > * This module stores the affinity information of numa nodes and cpu cores. > * It also provides functions to bookkeep the pin of threads on cpu cores. > * > - * It is assumed that the numa node ids and cpu core ids all start from 0 and > - * range continuously. So, for example, if 'ovs_numa_get_n_cores()' returns > N, > - * user can assume core ids from 0 to N-1 are all valid and there is a > - * 'struct cpu_core' for each id. > + * It is assumed that the numa node ids and cpu core ids all start from 0. > + * There is no guarantee that node and cpu ids are numbered consecutively > + * (this is a change from earlier version of the code). So, for example, > + * if two nodes exist with ids 0 and 8, 'ovs_numa_get_n_nodes()' will > + * return 2, no assumption of node numbering should be made. > * > * NOTE, this module should only be used by the main thread. > * > - * NOTE, the assumption above will fail when cpu hotplug is used. In that > - * case ovs-numa will not function correctly. For now, add a TODO entry > - * for addressing it in the future. > + * NOTE, if cpu hotplug is used 'all_numa_nodes' and 'all_cpu_cores' must be > + * invalidated when ever the system topology changes. Support for detecting > + * topology changes has not been included. For now, add a TODO entry for > + * addressing it in the future. > * > * TODO: Fix ovs-numa when cpu hotplug is used. > */ > > -#define MAX_NUMA_NODES 128 > > /* numa node. */ > struct numa_node { > @@ -130,15 +131,14 @@ insert_new_cpu_core(struct numa_node *n, unsigned > core_id) > * - "0,0,0,0": four cores on numa socket 0. > * - "0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1": 16 cores on two numa sockets. > * - "0,0,0,0,1,1,1,1": 8 cores on two numa sockets. > - * > - * The different numa ids must be consecutives or the function will abort. */ > + * - "0,0,0,0,8,8,8,8": 8 cores on two numa sockets, non-contiguous. > + */ > static void > discover_numa_and_core_dummy(void) > { > char *conf = xstrdup(dummy_config); > char *id, *saveptr = NULL; > unsigned i = 0; > - long max_numa_id = 0; > > for (id = strtok_r(conf, ",", &saveptr); id; > id = strtok_r(NULL, ",", &saveptr)) { > @@ -152,8 +152,6 @@ discover_numa_and_core_dummy(void) > continue; > } > > - max_numa_id = MAX(max_numa_id, numa_id); > - > hnode = hmap_first_with_hash(&all_numa_nodes, hash_int(numa_id, 0)); > > if (hnode) { > @@ -169,10 +167,27 @@ discover_numa_and_core_dummy(void) > > free(conf); > > - if (max_numa_id + 1 != hmap_count(&all_numa_nodes)) { > - ovs_fatal(0, "dummy numa contains non consecutive numa ids"); > +} > + > +#ifdef __linux__ > +/* Check if a cpu is detected and online */ > +static int > +cpu_detected(unsigned int core_id) > +{ > + char path[PATH_MAX]; > + int len = snprintf(path, sizeof(path), > + "/sys/devices/system/cpu/cpu%d/topology/core_id", > + core_id); > + if (len <= 0 || (unsigned) len >= sizeof(path)) { > + return 0; > + } > + if (access(path, F_OK) != 0) { > + return 0; > } > + > + return 1; > } > +#endif /* __linux__ */ > > /* Discovers all numa nodes and the corresponding cpu cores. > * Constructs the 'struct numa_node' and 'struct cpu_core'. */ > @@ -219,7 +234,9 @@ discover_numa_and_core(void) > unsigned core_id; > > core_id = strtoul(subdir->d_name + 3, NULL, 10); > - insert_new_cpu_core(n, core_id); > + if (cpu_detected(core_id)) { > + insert_new_cpu_core(n, core_id); > + } > } > } > closedir(dir); > @@ -229,7 +246,7 @@ discover_numa_and_core(void) > } > > free(path); > - if (!dir || !numa_supported) { > + if (!numa_supported) { > break; > } > } > diff --git a/lib/ovs-numa.h b/lib/ovs-numa.h > index 8f2ea3430..ecc251a7f 100644 > --- a/lib/ovs-numa.h > +++ b/lib/ovs-numa.h > @@ -26,6 +26,8 @@ > #define OVS_CORE_UNSPEC INT_MAX > #define OVS_NUMA_UNSPEC INT_MAX > > +#define MAX_NUMA_NODES 128 > + > /* Dump of a list of 'struct ovs_numa_info'. */ > struct ovs_numa_dump { > struct hmap cores; > diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at > index 16402ebae..53eee185a 100644 > --- a/tests/dpif-netdev.at > +++ b/tests/dpif-netdev.at > @@ -98,7 +98,7 @@ m4_define([DPIF_NETDEV_DUMMY_IFACE], > fail-mode=secure -- \ > add-port br1 p2 -- set interface p2 type=$1 > options:stream=unix:$OVS_RUNDIR/p0.sock ofport_request=2 -- \ > add-port br1 p8 -- set interface p8 ofport_request=8 type=$1 --], [], > [], > - [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])]) > + [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,8,8,8,8"], [])]) > AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) > > AT_CHECK([ovs-ofctl add-flow br0 action=normal]) > diff --git a/tests/pmd.at b/tests/pmd.at > index cc5371d5a..faf02f158 100644 > --- a/tests/pmd.at > +++ b/tests/pmd.at > @@ -361,8 +361,8 @@ AT_SETUP([PMD - change numa node]) > OVS_VSWITCHD_START( > [add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 > options:n_rxq=2 -- \ > add-port br0 p2 -- set Interface p2 type=dummy-pmd ofport_request=2 > options:n_rxq=2 -- \ > - set Open_vSwitch . other_config:pmd-cpu-mask=3 > -], [], [], [--dummy-numa 0,1]) > + set Open_vSwitch . other_config:pmd-cpu-mask=7 > +], [], [], [--dummy-numa 0,1,8]) > AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) > > AT_CHECK([ovs-ofctl add-flow br0 action=controller]) > @@ -432,6 +432,40 @@ NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 > in_port=2 (via action) data_l > > icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 > icmp_csum:13fc > ]) > > +AT_CHECK([ovs-vsctl set Interface p1 options:numa_id=8]) > + > +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], > [dnl > +p1 0 8 2 > +p1 1 8 2 > +p2 0 1 1 > +p2 1 1 1 > +]) > + > +AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir > --pidfile 2> ofctl_monitor.log]) > + > +AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 1 > 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) > + > +OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2]) > +OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit]) > + > +AT_CHECK([cat ofctl_monitor.log], [0], [dnl > +NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) > data_len=106 (unbuffered) > +icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 > icmp_csum:13fc > +]) > + > +AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir > --pidfile 2> ofctl_monitor.log]) > + > +AT_CHECK([ovs-appctl netdev-dummy/receive p2 --qid 0 > 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) > + > +OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2]) > +OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit]) > + > +AT_CHECK([cat ofctl_monitor.log], [0], [dnl > +NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) > data_len=106 (unbuffered) > +icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 > icmp_csum:13fc > +]) > + > + > OVS_VSWITCHD_STOP > AT_CLEANUP > > @@ -584,7 +618,7 @@ AT_CLEANUP > > AT_SETUP([PMD - rxq affinity - NUMA]) > OVS_VSWITCHD_START( > - [], [], [], [--dummy-numa 0,0,0,1,1]) > + [], [], [], [--dummy-numa 0,0,0,1,1,8,8]) > AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) > > AT_CHECK([ovs-ofctl add-flow br0 actions=controller]) > @@ -601,21 +635,38 @@ p1 1 0 2 > > AT_CHECK([ovs-vsctl set Interface p1 > other_config:pmd-rxq-affinity="0:3,1:4"]) > > -dnl We moved the queues to different numa node. Expecting threads on > +dnl We moved the queues to different contiguous numa node. Expecting threads > on > dnl NUMA node 1 to be created. > AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], > [dnl > p1 0 1 3 > p1 1 1 4 > ]) > > +AT_CHECK([ovs-vsctl set Interface p1 > other_config:pmd-rxq-affinity="0:5,1:6"]) > + > +dnl We moved the queues to different non-contiguous numa node. Expecting > threads on > +dnl NUMA node 8 to be created. > +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], > [dnl > +p1 0 8 5 > +p1 1 8 6 > +]) > + > AT_CHECK([ovs-vsctl set Interface p1 > other_config:pmd-rxq-affinity="0:3,1:1"]) > > -dnl Queues splitted between NUMA nodes. > +dnl Queues splitted between contiguous NUMA nodes. > AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], > [dnl > p1 0 1 3 > p1 1 0 1 > ]) > > +AT_CHECK([ovs-vsctl set Interface p1 > other_config:pmd-rxq-affinity="0:5,1:1"]) > + > +dnl Queues splitted between non-contiguous NUMA nodes. > +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], > [dnl > +p1 0 8 5 > +p1 1 0 1 > +]) > + > AT_CHECK([ovs-vsctl remove Interface p1 other_config pmd-rxq-affinity]) > > dnl We removed the rxq-affinity request. dpif-netdev should assign queues > _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
