On 09/09/2020 14:49, Flavio Leitner wrote:
>
>
> Hi Mark,
>
> Found one issue, and since another spin is needed anyway, I am
> also suggesting a style improvement.
>
>
> On Tue, Sep 08, 2020 at 06:27:48PM +0100, Mark Gray wrote:
>> From: Aaron Conole <[email protected]>
>>
>> Currently, the channel handlers are polled globally. On some
>> systems, this causes a thundering herd issue where multiple
>> handler threads become active, only to do no work and immediately
>> sleep.
>>
>> The approach here is to push the netlink socket channels to discrete
>> handler threads to process, rather than polling on every thread.
>> This will eliminate the need to wake multiple threads.
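
To make the mechanism concrete for anyone following the thread:
before this patch, every handler thread registered every channel
socket in its own epoll set, and EPOLLEXCLUSIVE only promises that
at least one of the attached epoll instances is woken, so a single
upcall could still wake several handlers. A minimal sketch of the
two registration patterns (plain epoll, not OVS code, names made up):

    #include <stddef.h>
    #include <stdio.h>
    #include <sys/epoll.h>

    /* Before: the same socket fd is added to every handler's epoll
     * set.  With several epoll instances attached, EPOLLEXCLUSIVE
     * still allows more than one of them to wake for one event. */
    static void
    register_with_all_handlers(const int *epoll_fds, size_t n_handlers,
                               int sock_fd)
    {
        for (size_t i = 0; i < n_handlers; i++) {
            struct epoll_event event = {
                .events = EPOLLIN | EPOLLEXCLUSIVE,
                .data.fd = sock_fd,
            };
            if (epoll_ctl(epoll_fds[i], EPOLL_CTL_ADD, sock_fd,
                          &event) < 0) {
                perror("epoll_ctl");
            }
        }
    }

    /* After: each channel socket is owned by exactly one handler,
     * so an upcall wakes at most that one thread. */
    static void
    register_with_owner(int owner_epoll_fd, int sock_fd)
    {
        struct epoll_event event = {
            .events = EPOLLIN | EPOLLEXCLUSIVE,
            .data.fd = sock_fd,
        };
        if (epoll_ctl(owner_epoll_fd, EPOLL_CTL_ADD, sock_fd,
                      &event) < 0) {
            perror("epoll_ctl");
        }
    }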
>>
>> To check:
>>
>> ip netns add left
>> ip netns add right
>> ip link add center-left type veth peer name left0 netns left
>> ip link add center-right type veth peer name right0 netns right
>> ip link set center-left up
>> ip link set center-right up
>> ip -n left link set left0 up
>> ip -n left ip addr add 172.31.110.10/24 dev left0
>> ip -n right link set right0 up
>> ip -n right ip addr add 172.31.110.11/24 dev right0
>>
>> ovs-vsctl add-br br0
>> ovs-vsctl add-port br0 center-right
>> ovs-vsctl add-port br0 center-left
>>
>> # in one terminal
>> perf record -e sched:sched_wakeup,irq:softirq_entry -ag
>>
>> # in a separate terminal
>> ip netns exec left arping -I left0 -c 1 172.31.110.11
>>
>> # in the perf terminal after exiting
>> perf script
>>
>> Look for the number of 'handler' threads which were made active.
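
(A quick way to tally them instead of eyeballing the perf script
output, a throwaway filter which assumes the default perf script
text format, where the tracepoint name and the woken comm appear
on the same line:)

    #include <stdio.h>
    #include <string.h>

    /* Usage: perf script | ./count-handler-wakeups */
    int
    main(void)
    {
        char line[4096];
        unsigned long wakeups = 0;

        while (fgets(line, sizeof line, stdin)) {
            /* Count sched_wakeup events involving a 'handler' thread. */
            if (strstr(line, "sched_wakeup") && strstr(line, "handler")) {
                wakeups++;
            }
        }
        printf("handler wakeups: %lu\n", wakeups);
        return 0;
    }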
>>
>> Suggested-by: Ben Pfaff <[email protected]>
>> Co-authored-by: Mark Gray <[email protected]>
>> Reported-by: David Ahern <[email protected]>
>> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-December/365857.html
>> Cc: Matteo Croce <[email protected]>
>> Cc: Flavio Leitner <[email protected]>
>> Fixes: 69c51582f ("dpif-netlink: don't allocate per thread netlink sockets")
>> Signed-off-by: Aaron Conole <[email protected]>
>> Signed-off-by: Mark Gray <[email protected]>
>> ---
>>
>>
>> v2: Oops - forgot to commit my whitespace cleanups.
>> v3: one port latency results as per Matteo's comments
>>
>> Stock:
>> min/avg/max/mdev = 21.5/36.5/96.5/1.0 us
>> With Patch:
>> min/avg/max/mdev = 5.3/9.7/98.4/0.5 us
>> v4: Oops - first email did not send
>> v5: min/avg/max/mdev = 9.3/10.4/33.6/2.2 us
>> v6: Split out the AUTHORS patch and added a cover letter as
>> per Ilya's suggestion.
>> Fixed 0-day issues.
>> v7: Merged patches as per Flavio's suggestion. This is
>> no longer a series. Fixed some other small issues.
>>
>> lib/dpif-netlink.c | 325 +++++++++++++++++++++++++--------------------
>> lib/id-pool.c | 10 ++
>> lib/id-pool.h | 1 +
>> 3 files changed, 195 insertions(+), 141 deletions(-)
>>
>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
>> index 7da4fb54d..4ecf0c51c 100644
>> --- a/lib/dpif-netlink.c
>> +++ b/lib/dpif-netlink.c
>> @@ -37,6 +37,7 @@
>> #include "dpif-provider.h"
>> #include "fat-rwlock.h"
>> #include "flow.h"
>> +#include "id-pool.h"
>> #include "netdev-linux.h"
>> #include "netdev-offload.h"
>> #include "netdev-provider.h"
>> @@ -160,6 +161,9 @@ static void dpif_netlink_flow_to_dpif_flow(struct dpif_flow *,
>>
>> /* One of the dpif channels between the kernel and userspace. */
>> struct dpif_channel {
>> + struct hmap_node hmap_node;
>> + struct dpif_handler *handler;
>> + uint32_t port_idx;
>> struct nl_sock *sock; /* Netlink socket. */
>> long long int last_poll; /* Last time this channel was polled. */
>> };
>> @@ -183,6 +187,8 @@ struct dpif_handler {
>> int n_events; /* Num events returned by epoll_wait(). */
>> int event_offset; /* Offset into 'epoll_events'. */
>>
>> + struct hmap channels; /* Map of channels for each port. */
>> +
>> #ifdef _WIN32
>> /* Pool of sockets. */
>> struct dpif_windows_vport_sock *vport_sock_pool;
>> @@ -201,9 +207,8 @@ struct dpif_netlink {
>> struct fat_rwlock upcall_lock;
>> struct dpif_handler *handlers;
>> uint32_t n_handlers; /* Num of upcall handlers. */
>> - struct dpif_channel *channels; /* Array of channels for each port. */
>> - int uc_array_size; /* Size of 'handler->channels' and */
>> - /* 'handler->epoll_events'. */
>> +
>> + struct id_pool *port_ids; /* Set of added port ids. */
>>
>> /* Change notification. */
>> struct nl_sock *port_notifier; /* vport multicast group subscriber. */
>> @@ -287,6 +292,44 @@ close_nl_sock(struct nl_sock *sock)
>> #endif
>> }
>>
>> +static size_t
>> +dpif_handler_channels_count(const struct dpif_handler *handler)
>> +{
>> + return hmap_count(&handler->channels);
>> +}
>> +
>> +static struct dpif_channel *
>> +dpif_handler_channel_find(struct dpif_handler *handler, uint32_t idx)
>> +{
>> + struct dpif_channel *channel;
>> +
>> + HMAP_FOR_EACH_WITH_HASH (channel, hmap_node, hash_int(idx, 0),
>> + &handler->channels) {
>> + if (channel->port_idx == idx) {
>> + return channel;
>> + }
>> + }
>> +
>> + return NULL;
>> +}
>> +
>> +static void
>> +dpif_channel_del(struct dpif_netlink *dpif,
>> + struct dpif_channel *channel)
>> +{
>> + struct dpif_handler *handler = channel->handler;
>> +
>> +#ifndef _WIN32
>> + epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
>> + nl_sock_fd(channel->sock), NULL);
>> + nl_sock_destroy(channel->sock);
>> +#endif
>> + hmap_remove(&handler->channels, &channel->hmap_node);
>> + id_pool_free_id(dpif->port_ids, channel->port_idx);
>> + handler->event_offset = handler->n_events = 0;
>> + free(channel);
>> +}
>> +
>> static struct dpif_netlink *
>> dpif_netlink_cast(const struct dpif *dpif)
>> {
>> @@ -385,6 +428,7 @@ open_dpif(const struct dpif_netlink_dp *dp, struct dpif **dpifp)
>>
>> dpif->dp_ifindex = dp->dp_ifindex;
>> dpif->user_features = dp->user_features;
>> + dpif->port_ids = id_pool_create(0, MAX_PORTS);
>> *dpifp = &dpif->dpif;
>>
>> return 0;
>> @@ -452,161 +496,145 @@ static bool
>> vport_get_pid(struct dpif_netlink *dpif, uint32_t port_idx,
>> uint32_t *upcall_pid)
>> {
>> - /* Since the nl_sock can only be assigned in either all
>> - * or none "dpif" channels, the following check
>> - * would suffice. */
>> - if (!dpif->channels[port_idx].sock) {
>> + size_t i;
>> +
>> + ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
>> +
>> + if (!id_pool_has_id(dpif->port_ids, port_idx)) {
>> return false;
>> }
>> - ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
>>
>> - *upcall_pid = nl_sock_pid(dpif->channels[port_idx].sock);
>> + /* The 'port_idx' should only be valid for a single handler. */
>> + for (i = 0; i < dpif->n_handlers; i++) {
>> + struct dpif_channel *channel;
>> +
>> + channel = dpif_handler_channel_find(&dpif->handlers[i], port_idx);
>> + if (channel) {
>> + *upcall_pid = nl_sock_pid(channel->sock);
>> + return true;
>> + }
>> + }
>>
>> - return true;
>> + return false;
>> }
>>
>> static int
>> vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no,
>> struct nl_sock *sock)
>> {
>> - struct epoll_event event;
>> uint32_t port_idx = odp_to_u32(port_no);
>> + struct dpif_channel *channel;
>> + struct dpif_handler *handler;
>> + struct epoll_event event;
>> size_t i;
>> - int error;
>>
>> - if (dpif->handlers == NULL) {
>> + if (dpif->handlers == NULL ||
>> + id_pool_has_id(dpif->port_ids, port_idx)) {
>> close_nl_sock(sock);
>> - return 0;
>> + return EINVAL;
>> }
>>
>> - /* We assume that the datapath densely chooses port numbers, which can
>> - * therefore be used as an index into 'channels' and 'epoll_events' of
>> - * 'dpif'. */
>> - if (port_idx >= dpif->uc_array_size) {
>> - uint32_t new_size = port_idx + 1;
>> -
>> - if (new_size > MAX_PORTS) {
>> - VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big",
>> - dpif_name(&dpif->dpif), port_no);
>> - return EFBIG;
>> - }
>> + if (port_idx >= MAX_PORTS) {
>> + VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big",
>> + dpif_name(&dpif->dpif), port_no);
>> + return EFBIG;
>> + }
>>
>> - dpif->channels = xrealloc(dpif->channels,
>> - new_size * sizeof *dpif->channels);
>> + handler = &dpif->handlers[0];
>>
>> - for (i = dpif->uc_array_size; i < new_size; i++) {
>> - dpif->channels[i].sock = NULL;
>> + for (i = 0; i < dpif->n_handlers; i++) {
>> + if (dpif_handler_channels_count(&dpif->handlers[i]) <
>> + dpif_handler_channels_count(handler)) {
>> + handler = &dpif->handlers[i];
>> }
>> + }
>>
>> - for (i = 0; i < dpif->n_handlers; i++) {
>> - struct dpif_handler *handler = &dpif->handlers[i];
>> + channel = xmalloc(sizeof *channel);
>> + channel->port_idx = port_idx;
>> + channel->sock = sock;
>> + channel->last_poll = LLONG_MIN;
>> + channel->handler = handler;
>> + hmap_insert(&handler->channels, &channel->hmap_node,
>> + hash_int(port_idx, 0));
>>
>> - handler->epoll_events = xrealloc(handler->epoll_events,
>> - new_size * sizeof *handler->epoll_events);
>> -
>> - }
>> - dpif->uc_array_size = new_size;
>> - }
>> + handler->epoll_events = xrealloc(handler->epoll_events,
>> + dpif_handler_channels_count(handler) *
>> + sizeof *handler->epoll_events);
>>
>> memset(&event, 0, sizeof event);
>> event.events = EPOLLIN | EPOLLEXCLUSIVE;
>> event.data.u32 = port_idx;
>>
>> - for (i = 0; i < dpif->n_handlers; i++) {
>> - struct dpif_handler *handler = &dpif->handlers[i];
>> -
>> #ifndef _WIN32
>> - if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock),
>> - &event) < 0) {
>> - error = errno;
>> - goto error;
>> - }
>> -#endif
>> - }
>> - dpif->channels[port_idx].sock = sock;
>> - dpif->channels[port_idx].last_poll = LLONG_MIN;
>> -
>> - return 0;
>> -
>> -error:
>> -#ifndef _WIN32
>> - while (i--) {
>> - epoll_ctl(dpif->handlers[i].epoll_fd, EPOLL_CTL_DEL,
>> - nl_sock_fd(sock), NULL);
>> + if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock),
>> + &event) < 0) {
>> + hmap_remove(&handler->channels, &channel->hmap_node);
>> + free(channel);
>> + id_pool_free_id(dpif->port_ids, port_idx);
>> + return errno;
>> }
>> #endif
>> - dpif->channels[port_idx].sock = NULL;
>>
>> - return error;
>> + id_pool_add(dpif->port_ids, port_idx);
>> + return 0;
>> }
>>
>> -static void
>> -vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no)
>> +static bool
>> +vport_del_channel(struct dpif_netlink *dpif, odp_port_t port_no)
>> {
>> - uint32_t port_idx = odp_to_u32(port_no);
>> + struct dpif_channel *channel = NULL;
>> + bool found_channel = false;
>> size_t i;
>>
>> - if (!dpif->handlers || port_idx >= dpif->uc_array_size
>> - || !dpif->channels[port_idx].sock) {
>> - return;
>> - }
>> -
>> for (i = 0; i < dpif->n_handlers; i++) {
>> - struct dpif_handler *handler = &dpif->handlers[i];
>> -#ifndef _WIN32
>> - epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
>> - nl_sock_fd(dpif->channels[port_idx].sock), NULL);
>> -#endif
>> - handler->event_offset = handler->n_events = 0;
>> + channel = dpif_handler_channel_find(&dpif->handlers[i],
>> + odp_to_u32(port_no));
>> + if (channel) {
>> + dpif_channel_del(dpif, channel);
>> + found_channel = true;
>> + break;
>> + }
>
> My suggestion is to move channel declaration there to inside the loop
> without initialization and take out the found_channel like it is
> done at vport_get_pid():
>
> for (i = 0; i < dpif->n_handlers; i++) {
> struct dpif_channel *channel;
>
> channel = dpif_handler_channel_find(&dpif->handlers[i],
> odp_to_u32(port_no));
> if (channel) {
> dpif_channel_del(dpif, channel);
> return true;
> }
> }
>
> return false;
Makes sense. I don't like returning success mid-function, but it
probably makes the code a bit easier to read.
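
For v8, the whole function would then look something like this
(sketch, same logic as your snippet):

    static bool
    vport_del_channel(struct dpif_netlink *dpif, odp_port_t port_no)
    {
        size_t i;

        for (i = 0; i < dpif->n_handlers; i++) {
            struct dpif_channel *channel;

            /* Each port index lives in at most one handler's hmap,
             * so stop at the first match. */
            channel = dpif_handler_channel_find(&dpif->handlers[i],
                                                odp_to_u32(port_no));
            if (channel) {
                dpif_channel_del(dpif, channel);
                return true;
            }
        }

        return false;
    }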
>
>
>> }
>> -#ifndef _WIN32
>> - nl_sock_destroy(dpif->channels[port_idx].sock);
>> -#endif
>> - dpif->channels[port_idx].sock = NULL;
>> +
>> + return found_channel;
>> }
>>
>> static void
>> destroy_all_channels(struct dpif_netlink *dpif)
>> OVS_REQ_WRLOCK(dpif->upcall_lock)
>> {
> [...]
>> @@ -2475,25 +2520,15 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
>> }
>> }
>>
>> - if (port_no < keep_channels_nbits) {
>> - bitmap_set1(keep_channels, port_no);
>> - }
>> continue;
>>
>> error:
>> - vport_del_channels(dpif, vport.port_no);
>> + vport_del_channel(dpif, port_no);
>
> The port_no here is uint32_t but vport_del_channel() receives odp_port_t.
> That breaks the build:
> https://travis-ci.org/github/fleitner/ovs/jobs/725370855
I see you set this up on your own GitHub account. Good idea.
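
Good catch, I'll fix that in the next version. Presumably it is just
a conversion at the call site, e.g. (assuming u32_to_odp() from
include/openvswitch/types.h):

    error:
        vport_del_channel(dpif, u32_to_odp(port_no));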
>
> Thanks!
>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev