On Fri, Aug 28, 2020 at 03:21:16PM -0300, Flavio Leitner wrote:
> 
> Mark,
> 
> This patch is v4 and 2/3. Before it was only one patch, so I wonder
> if v4 is only one patch, or if this has multiple patches (2/3).
Scratch that, I got the other emails now.

Thanks,
fbl

> 
> BTW, differently from the DPDK community, we start new threads for new
> patch versions.
> 
> fbl
> 
> On Fri, Aug 28, 2020 at 06:03:04PM +0100, Mark Gray wrote:
> > From: Aaron Conole <[email protected]>
> > 
> > Currently, the channel handlers are polled globally. On some
> > systems, this causes a thundering herd issue where multiple
> > handler threads become active, only to do no work and immediately
> > sleep.
> > 
> > The approach here is to push the netlink socket channels to discrete
> > handler threads to process, rather than polling on every thread.
> > This will eliminate the need to wake multiple threads.
> > 
> > To check:
> > 
> > ip netns add left
> > ip netns add right
> > ip link add center-left type veth peer name left0 netns left
> > ip link add center-right type veth peer name right0 netns right
> > ip link set center-left up
> > ip link set center-right up
> > ip -n left link set left0 up
> > ip -n left addr add 172.31.110.10/24 dev left0
> > ip -n right link set right0 up
> > ip -n right addr add 172.31.110.11/24 dev right0
> > 
> > ovs-vsctl add-br br0
> > ovs-vsctl add-port br0 center-right
> > ovs-vsctl add-port br0 center-left
> > 
> > # in one terminal
> > perf record -e sched:sched_wakeup,irq:softirq_entry -ag
> > 
> > # in a separate terminal
> > ip netns exec left arping -I left0 -c 1 172.31.110.11
> > 
> > # in the perf terminal after exiting
> > perf script
> > 
> > Look for the number of 'handler' threads which were made active.
> > 
> > Suggested-by: Ben Pfaff <[email protected]>
> > Co-authored-by: Mark Gray <[email protected]>
> > Reported-by: David Ahern <[email protected]>
> > Reported-at:
> > https://mail.openvswitch.org/pipermail/ovs-dev/2019-December/365857.html
> > Cc: Matteo Croce <[email protected]>
> > Cc: Flavio Leitner <[email protected]>
> > Fixes: 69c51582f ("dpif-netlink: don't allocate per thread netlink sockets")
> > Signed-off-by: Aaron Conole <[email protected]>
> > Signed-off-by: Mark Gray <[email protected]>
> > ---
> > v2: Oops - forgot to commit my whitespace cleanups.
> > v3: one port latency results as per Matteo's comments
> > 
> >     Stock:
> >     min/avg/max/mdev = 21.5/36.5/96.5/1.0 us
> >     With Patch:
> >     min/avg/max/mdev = 5.3/9.7/98.4/0.5 us
> > v4: Oops - first email did not send
> > 
> >  lib/dpif-netlink.c | 311 ++++++++++++++++++++++++++-------------------
> >  1 file changed, 182 insertions(+), 129 deletions(-)
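
[A note on the mechanism before the diff: the single wakeup comes from
registering each channel with EPOLLIN | EPOLLEXCLUSIVE on exactly one
handler's epoll fd (see vport_add_channel below). The following is a
minimal standalone sketch of the EPOLLEXCLUSIVE semantics, not part of
the patch; it assumes Linux >= 4.5, and the eventfd and the N_HANDLERS
threads stand in for a netlink socket and the dpif handler threads.

/* epollexclusive.c: build with "gcc -pthread epollexclusive.c". */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <unistd.h>

#define N_HANDLERS 4

static int event_source;     /* Stand-in for one channel's netlink socket. */
static atomic_int n_woken;

static void *
handler_main(void *arg)
{
    struct epoll_event ev = { .events = EPOLLIN | EPOLLEXCLUSIVE };
    struct epoll_event out;
    int epfd = epoll_create1(0);   /* Each thread owns its own epoll fd,
                                    * just as each dpif handler does. */
    (void) arg;
    epoll_ctl(epfd, EPOLL_CTL_ADD, event_source, &ev);

    /* Threads that are never woken give up after the 2 s timeout. */
    if (epoll_wait(epfd, &out, 1, 2000) > 0) {
        uint64_t value;

        atomic_fetch_add(&n_woken, 1);
        (void) !read(event_source, &value, sizeof value);  /* Consume it. */
    }
    close(epfd);
    return NULL;
}

int
main(void)
{
    pthread_t threads[N_HANDLERS];
    uint64_t one = 1;
    int i;

    event_source = eventfd(0, EFD_NONBLOCK);
    for (i = 0; i < N_HANDLERS; i++) {
        pthread_create(&threads[i], NULL, handler_main, NULL);
    }
    sleep(1);                   /* Let every thread block in epoll_wait(). */

    (void) !write(event_source, &one, sizeof one);    /* One "upcall". */

    for (i = 0; i < N_HANDLERS; i++) {
        pthread_join(threads[i], NULL);
    }
    /* The kernel wakes at least one exclusive waiter, in practice exactly
     * one, so this prints 1. */
    printf("handlers woken: %d\n", atomic_load(&n_woken));
    return 0;
}

Without EPOLLEXCLUSIVE every blocked thread wakes for the single event,
which is the thundering herd the commit message describes.]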
> > 
> > diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> > index 7da4fb54d..d928d7aa1 100644
> > --- a/lib/dpif-netlink.c
> > +++ b/lib/dpif-netlink.c
> > @@ -37,6 +37,7 @@
> >  #include "dpif-provider.h"
> >  #include "fat-rwlock.h"
> >  #include "flow.h"
> > +#include "id-pool.h"
> >  #include "netdev-linux.h"
> >  #include "netdev-offload.h"
> >  #include "netdev-provider.h"
> > @@ -160,6 +161,8 @@ static void dpif_netlink_flow_to_dpif_flow(struct dpif_flow *,
> >  
> >  /* One of the dpif channels between the kernel and userspace. */
> >  struct dpif_channel {
> > +    struct hmap_node hmap_node;
> > +    uint32_t port_idx;
> >      struct nl_sock *sock;       /* Netlink socket. */
> >      long long int last_poll;    /* Last time this channel was polled. */
> >  };
> > @@ -183,6 +186,8 @@ struct dpif_handler {
> >      int n_events;               /* Num events returned by epoll_wait(). */
> >      int event_offset;           /* Offset into 'epoll_events'. */
> >  
> > +    struct hmap channels;       /* map of channels for each port. */
> > +
> >  #ifdef _WIN32
> >      /* Pool of sockets. */
> >      struct dpif_windows_vport_sock *vport_sock_pool;
> > @@ -201,9 +206,8 @@ struct dpif_netlink {
> >      struct fat_rwlock upcall_lock;
> >      struct dpif_handler *handlers;
> >      uint32_t n_handlers;           /* Num of upcall handlers. */
> > -    struct dpif_channel *channels; /* Array of channels for each port. */
> > -    int uc_array_size;             /* Size of 'handler->channels' and */
> > -                                   /* 'handler->epoll_events'. */
> > +
> > +    struct id_pool *port_ids;      /* Set of added port ids */
> >  
> >      /* Change notification. */
> >      struct nl_sock *port_notifier; /* vport multicast group subscriber. */
> > @@ -287,6 +291,53 @@ close_nl_sock(struct nl_sock *sock)
> >  #endif
> >  }
> >  
> > +static size_t
> > +dpif_handler_channels_count(const struct dpif_handler *handler)
> > +{
> > +    return hmap_count(&handler->channels);
> > +}
> > +
> > +static struct dpif_channel *
> > +dpif_handler_channel_get(struct dpif_handler *handler, uint32_t idx)
> > +{
> > +    struct dpif_channel *channel;
> > +
> > +    HMAP_FOR_EACH_WITH_HASH (channel, hmap_node, hash_int(idx, 0),
> > +                             &handler->channels) {
> > +        if (channel->port_idx == idx) {
> > +            return channel;
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> > +static void
> > +dpif_handler_channel_add(struct dpif_handler *handler, uint32_t idx,
> > +                         struct nl_sock *sock)
> > +{
> > +    struct dpif_channel *channel = xmalloc(sizeof *channel);
> > +
> > +    channel->port_idx = idx;
> > +    channel->sock = sock;
> > +    channel->last_poll = LLONG_MIN;
> > +
> > +    hmap_insert(&handler->channels, &channel->hmap_node, hash_int(idx,0));
> > +}
> > +
> > +static void
> > +dpif_handler_channel_del(struct dpif_handler *handler, uint32_t port_idx)
> > +{
> > +    struct dpif_channel *channel = dpif_handler_channel_get(handler,
> > +                                                            port_idx);
> > +
> > +    hmap_remove(&(handler->channels), &(channel->hmap_node));
> > +
> > +    free(channel);
> > +}
> > +
> > +
> > +
> > +
> >  static struct dpif_netlink *
> >  dpif_netlink_cast(const struct dpif *dpif)
> >  {
> > @@ -385,6 +436,11 @@ open_dpif(const struct dpif_netlink_dp *dp, struct dpif **dpifp)
> >  
> >      dpif->dp_ifindex = dp->dp_ifindex;
> >      dpif->user_features = dp->user_features;
> > +    dpif->port_ids = id_pool_create(0, MAX_PORTS);
> > +    if (!dpif->port_ids) {
> > +        return ENOMEM;
> > +    }
> > +
> >      *dpifp = &dpif->dpif;
> >  
> >      return 0;
> > @@ -452,161 +508,144 @@ static bool
> >  vport_get_pid(struct dpif_netlink *dpif, uint32_t port_idx,
> >                uint32_t *upcall_pid)
> >  {
> > -    /* Since the nl_sock can only be assigned in either all
> > -     * or none "dpif" channels, the following check
> > -     * would suffice. */
> > -    if (!dpif->channels[port_idx].sock) {
> > +    size_t i;
> > +    ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
> > +
> > +    if (!id_pool_check(dpif->port_ids, port_idx)) {
> >          return false;
> >      }
> > -    ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
> >  
> > -    *upcall_pid = nl_sock_pid(dpif->channels[port_idx].sock);
> > +    /* The 'port_idx' should only be valid for a single handler. */
> > +    for (i = 0; i < dpif->n_handlers; i++) {
> > +        struct dpif_channel *channel =
> > +            dpif_handler_channel_get(&dpif->handlers[i], port_idx);
> > +
> > +        if (channel) {
> > +            *upcall_pid = nl_sock_pid(channel->sock);
> > +            return true;
> > +        }
> > +    }
> >  
> > -    return true;
> > +    return false;
> >  }
> >  
> >  static int
> >  vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no,
> >                    struct nl_sock *sock)
> >  {
> > -    struct epoll_event event;
> >      uint32_t port_idx = odp_to_u32(port_no);
> > +    struct dpif_handler *handler;
> > +    struct epoll_event event;
> >      size_t i;
> > -    int error;
> >  
> > -    if (dpif->handlers == NULL) {
> > +    if (dpif->handlers == NULL ||
> > +        id_pool_check(dpif->port_ids, port_idx)) {
> >          close_nl_sock(sock);
> > -        return 0;
> > +        return EINVAL;
> >      }
> >  
> > -    /* We assume that the datapath densely chooses port numbers, which can
> > -     * therefore be used as an index into 'channels' and 'epoll_events' of
> > -     * 'dpif'. */
> > -    if (port_idx >= dpif->uc_array_size) {
> > -        uint32_t new_size = port_idx + 1;
> > -
> > -        if (new_size > MAX_PORTS) {
> > +    if (port_idx >= MAX_PORTS) {
> >              VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big",
> >                           dpif_name(&dpif->dpif), port_no);
> >              return EFBIG;
> > -        }
> > -
> > -        dpif->channels = xrealloc(dpif->channels,
> > -                                  new_size * sizeof *dpif->channels);
> > +    }
> >  
> > -        for (i = dpif->uc_array_size; i < new_size; i++) {
> > -            dpif->channels[i].sock = NULL;
> > -        }
> > +    handler = &dpif->handlers[0];
> >  
> > -        for (i = 0; i < dpif->n_handlers; i++) {
> > -            struct dpif_handler *handler = &dpif->handlers[i];
> > +    for (i = 0; i < dpif->n_handlers; i++) {
> > +        if (dpif_handler_channels_count(&dpif->handlers[i]) <
> > +            dpif_handler_channels_count(handler))
> > +            handler = &dpif->handlers[i];
> > +    }
> >  
> > -            handler->epoll_events = xrealloc(handler->epoll_events,
> > -                new_size * sizeof *handler->epoll_events);
> > +    dpif_handler_channel_add(handler, port_idx, sock);
> >  
> > -        }
> > -        dpif->uc_array_size = new_size;
> > -    }
> > +    handler->epoll_events = xrealloc(handler->epoll_events,
> > +                                     dpif_handler_channels_count(handler) *
> > +                                     sizeof *handler->epoll_events);
> >  
> >      memset(&event, 0, sizeof event);
> >      event.events = EPOLLIN | EPOLLEXCLUSIVE;
> >      event.data.u32 = port_idx;
> >  
> > -    for (i = 0; i < dpif->n_handlers; i++) {
> > -        struct dpif_handler *handler = &dpif->handlers[i];
> > -
> >  #ifndef _WIN32
> > -        if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock),
> > -                      &event) < 0) {
> > -            error = errno;
> > -            goto error;
> > -        }
> > -#endif
> > -    }
> > -    dpif->channels[port_idx].sock = sock;
> > -    dpif->channels[port_idx].last_poll = LLONG_MIN;
> > -
> > -    return 0;
> > -
> > -error:
> > -#ifndef _WIN32
> > -    while (i--) {
> > -        epoll_ctl(dpif->handlers[i].epoll_fd, EPOLL_CTL_DEL,
> > -                  nl_sock_fd(sock), NULL);
> > +    if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock),
> > +                  &event) < 0) {
> > +        dpif_handler_channel_del(handler, port_idx);
> > +        id_pool_free_id(dpif->port_ids, port_idx);
> > +        return errno;
> >      }
> >  #endif
> > -    dpif->channels[port_idx].sock = NULL;
> >  
> > -    return error;
> > +    id_pool_add(dpif->port_ids, port_idx);
> > +    return 0;
> >  }
> >  
> >  static void
> >  vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no)
> >  {
> >      uint32_t port_idx = odp_to_u32(port_no);
> > +    struct dpif_handler *handler = NULL;
> > +    struct dpif_channel *channel = NULL;
> >      size_t i;
> >  
> > -    if (!dpif->handlers || port_idx >= dpif->uc_array_size
> > -        || !dpif->channels[port_idx].sock) {
> > +    if (!dpif->handlers) {
> >          return;
> >      }
> >  
> >      for (i = 0; i < dpif->n_handlers; i++) {
> > -        struct dpif_handler *handler = &dpif->handlers[i];
> > +        handler = &dpif->handlers[i];
> > +        channel = dpif_handler_channel_get(handler, port_idx);
> > +
> > +        if (channel) {
> >  #ifndef _WIN32
> > -        epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
> > -                  nl_sock_fd(dpif->channels[port_idx].sock), NULL);
> > +            epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
> > +                      nl_sock_fd(channel->sock), NULL);
> > +            nl_sock_destroy(channel->sock);
> >  #endif
> > -        handler->event_offset = handler->n_events = 0;
> > +            dpif_handler_channel_del(handler, port_idx);
> > +            id_pool_free_id(dpif->port_ids, port_idx);
> > +            handler->event_offset = handler->n_events = 0;
> > +            break;
> > +        }
> >      }
> > -#ifndef _WIN32
> > -    nl_sock_destroy(dpif->channels[port_idx].sock);
> > -#endif
> > -    dpif->channels[port_idx].sock = NULL;
> >  }
> >  
> >  static void
> >  destroy_all_channels(struct dpif_netlink *dpif)
> >      OVS_REQ_WRLOCK(dpif->upcall_lock)
> >  {
> > -    unsigned int i;
> > +    size_t i;
> >  
> >      if (!dpif->handlers) {
> >          return;
> >      }
> >  
> > -    for (i = 0; i < dpif->uc_array_size; i++ ) {
> > -        struct dpif_netlink_vport vport_request;
> > -        uint32_t upcall_pids = 0;
> > -
> > -        if (!dpif->channels[i].sock) {
> > -            continue;
> > -        }
> > -
> > -        /* Turn off upcalls. */
> > -        dpif_netlink_vport_init(&vport_request);
> > -        vport_request.cmd = OVS_VPORT_CMD_SET;
> > -        vport_request.dp_ifindex = dpif->dp_ifindex;
> > -        vport_request.port_no = u32_to_odp(i);
> > -        vport_request.n_upcall_pids = 1;
> > -        vport_request.upcall_pids = &upcall_pids;
> > -        dpif_netlink_vport_transact(&vport_request, NULL, NULL);
> > -
> > -        vport_del_channels(dpif, u32_to_odp(i));
> > -    }
> > -
> >      for (i = 0; i < dpif->n_handlers; i++) {
> >          struct dpif_handler *handler = &dpif->handlers[i];
> > +        struct dpif_channel *channel, *next;
> > +
> > +        HMAP_FOR_EACH_SAFE (channel, next, hmap_node, &handler->channels) {
> > +            struct dpif_netlink_vport vport_request;
> > +            uint32_t upcall_pids = 0;
> > +            /* Turn off upcalls. */
> > +            dpif_netlink_vport_init(&vport_request);
> > +            vport_request.cmd = OVS_VPORT_CMD_SET;
> > +            vport_request.dp_ifindex = dpif->dp_ifindex;
> > +            vport_request.port_no = u32_to_odp(channel->port_idx);
> > +            vport_request.n_upcall_pids = 1;
> > +            vport_request.upcall_pids = &upcall_pids;
> > +            dpif_netlink_vport_transact(&vport_request, NULL, NULL);
> > +
> > +            vport_del_channels(dpif, u32_to_odp(channel->port_idx));
> > +        }
> >  
> >          dpif_netlink_handler_uninit(handler);
> >          free(handler->epoll_events);
> >      }
> > -    free(dpif->channels);
> > -    free(dpif->handlers);
> > +
> >      dpif->handlers = NULL;
> > -    dpif->channels = NULL;
> >      dpif->n_handlers = 0;
> > -    dpif->uc_array_size = 0;
> >  }
> >  
> >  static void
> > @@ -618,9 +657,11 @@ dpif_netlink_close(struct dpif *dpif_)
> >  
> >      fat_rwlock_wrlock(&dpif->upcall_lock);
> >      destroy_all_channels(dpif);
> > +    id_pool_destroy(dpif->port_ids);
> >      fat_rwlock_unlock(&dpif->upcall_lock);
> >  
> >      fat_rwlock_destroy(&dpif->upcall_lock);
> > +
> >      free(dpif);
> >  }
> >  
> > @@ -1084,23 +1125,40 @@ dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif,
> >                              odp_port_t port_no)
> >      OVS_REQ_RDLOCK(dpif->upcall_lock)
> >  {
> > +    struct dpif_channel *min_channel = NULL, *found_channel = NULL;
> >      uint32_t port_idx = odp_to_u32(port_no);
> >      uint32_t pid = 0;
> > +    size_t i;
> > +
> > +    /* The ODPP_NONE "reserved" port number uses the "ovs-system"'s
> > +     * channel, since it is not heavily loaded. */
> > +    uint32_t idx = port_no == ODPP_NONE ? 0 : port_idx;
> > +
> > +    if (!id_pool_check(dpif->port_ids, port_idx)) {
> > +        return 0;
> > +    }
> > +
> > +    if (dpif->handlers) {
> > +        for (i = 0; i < dpif->n_handlers; i++) {
> > +            if (dpif_handler_channel_get(&dpif->handlers[i], idx)) {
> > +                found_channel = dpif_handler_channel_get(&dpif->handlers[i],
> > +                                                         idx);
> > +                break;
> > +            }
> >  
> > -    if (dpif->handlers && dpif->uc_array_size > 0) {
> > -        /* The ODPP_NONE "reserved" port number uses the "ovs-system"'s
> > -         * channel, since it is not heavily loaded. */
> > -        uint32_t idx = port_idx >= dpif->uc_array_size ? 0 : port_idx;
> > -
> > -        /* Needs to check in case the socket pointer is changed in between
> > -         * the holding of upcall_lock. A known case happens when the main
> > -         * thread deletes the vport while the handler thread is handling
> > -         * the upcall from that port. */
> > -        if (dpif->channels[idx].sock) {
> > -            pid = nl_sock_pid(dpif->channels[idx].sock);
> > +            if (dpif_handler_channel_get(&dpif->handlers[i], 0)) {
> > +                min_channel = dpif_handler_channel_get(&dpif->handlers[i],
> > +                                                       0);
> > +            }
> >          }
> >      }
> >  
> > +    if (found_channel) {
> > +        pid = nl_sock_pid(found_channel->sock);
> > +    } else if (min_channel) {
> > +        pid = nl_sock_pid(min_channel->sock);
> > +    }
> > +
> >      return pid;
> >  }
> >  
> > @@ -2342,12 +2400,14 @@ dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t n_ops,
> >  static void
> >  dpif_netlink_handler_uninit(struct dpif_handler *handler)
> >  {
> > +    hmap_destroy(&handler->channels);
> >      vport_delete_sock_pool(handler);
> >  }
> >  
> >  static int
> >  dpif_netlink_handler_init(struct dpif_handler *handler)
> >  {
> > +    hmap_init(&handler->channels);
> >      return vport_create_sock_pool(handler);
> >  }
> >  #else
> > @@ -2355,6 +2415,7 @@ dpif_netlink_handler_init(struct dpif_handler *handler)
> >  static int
> >  dpif_netlink_handler_init(struct dpif_handler *handler)
> >  {
> > +    hmap_init(&handler->channels);
> >      handler->epoll_fd = epoll_create(10);
> >      return handler->epoll_fd < 0 ? errno : 0;
> >  }
> > @@ -2362,6 +2423,7 @@ dpif_netlink_handler_init(struct dpif_handler *handler)
> >  static void
> >  dpif_netlink_handler_uninit(struct dpif_handler *handler)
> >  {
> > +    hmap_destroy(&handler->channels);
> >      close(handler->epoll_fd);
> >  }
> >  #endif
> > @@ -2374,9 +2436,7 @@ static int
> >  dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
> >      OVS_REQ_WRLOCK(dpif->upcall_lock)
> >  {
> > -    unsigned long int *keep_channels;
> >      struct dpif_netlink_vport vport;
> > -    size_t keep_channels_nbits;
> >      struct nl_dump dump;
> >      uint64_t reply_stub[NL_DUMP_BUFSIZE / 8];
> >      struct ofpbuf buf;
> > @@ -2416,9 +2476,6 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
> >          handler->event_offset = handler->n_events = 0;
> >      }
> >  
> > -    keep_channels_nbits = dpif->uc_array_size;
> > -    keep_channels = bitmap_allocate(keep_channels_nbits);
> > -
> >      ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub);
> >      dpif_netlink_port_dump_start__(dpif, &dump);
> >      while (!dpif_netlink_port_dump_next__(dpif, &dump, &vport, &buf)) {
> > @@ -2426,8 +2483,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
> >          uint32_t upcall_pid;
> >          int error;
> >  
> > -        if (port_no >= dpif->uc_array_size
> > -            || !vport_get_pid(dpif, port_no, &upcall_pid)) {
> > +        if (!vport_get_pid(dpif, port_no, &upcall_pid)) {
> >              struct nl_sock *sock;
> >              error = create_nl_sock(dpif, &sock);
> >  
> > @@ -2475,9 +2531,6 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
> >              }
> >          }
> >  
> > -        if (port_no < keep_channels_nbits) {
> > -            bitmap_set1(keep_channels, port_no);
> > -        }
> >          continue;
> >  
> >      error:
> > @@ -2486,14 +2539,6 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
> >      nl_dump_done(&dump);
> >      ofpbuf_uninit(&buf);
> >  
> > -    /* Discard any saved channels that we didn't reuse. */
> > -    for (i = 0; i < keep_channels_nbits; i++) {
> > -        if (!bitmap_is_set(keep_channels, i)) {
> > -            vport_del_channels(dpif, u32_to_odp(i));
> > -        }
> > -    }
> > -    free(keep_channels);
> > -
> >      return retval;
> >  }
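
[With the ownership model above in mind, the receive path in the next
hunks is easier to read: each handler epoll-waits only on the channels
it owns, and the u32 stored in epoll_event.data at registration time is
the port index used to find the channel again. The following is a
standalone sketch of that loop, not part of the patch; the flat array,
the socketpair, and port index 5 are stand-ins for the per-handler hmap
and the netlink sockets.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <unistd.h>

#define MAX_PORTS 16

struct channel {
    int sock;               /* A struct nl_sock * in the real code. */
    int in_use;
};

static struct channel channels[MAX_PORTS];   /* One handler's channels. */

static void
channel_add(int epfd, uint32_t port_idx, int sock)
{
    struct epoll_event event;

    memset(&event, 0, sizeof event);
    event.events = EPOLLIN | EPOLLEXCLUSIVE;
    event.data.u32 = port_idx;      /* Recovered in the event loop below. */

    channels[port_idx].sock = sock;
    channels[port_idx].in_use = 1;
    epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &event);
}

int
main(void)
{
    struct epoll_event events[MAX_PORTS];
    int epfd = epoll_create1(0);
    int sv[2];
    int i, n;

    socketpair(AF_UNIX, SOCK_DGRAM, 0, sv);
    channel_add(epfd, 5, sv[0]);        /* Pretend datapath port 5 is ours. */

    (void) !write(sv[1], "upcall", 6);  /* "Kernel" sends one upcall. */

    /* One pass of the handler's event loop. */
    n = epoll_wait(epfd, events, MAX_PORTS, 1000);
    for (i = 0; i < n; i++) {
        uint32_t idx = events[i].data.u32;

        if (idx < MAX_PORTS && channels[idx].in_use) {
            char buf[64];
            ssize_t len = recv(channels[idx].sock, buf, sizeof buf, 0);

            printf("upcall on port %"PRIu32": %zd bytes\n", idx, len);
        }
    }
    return 0;
}

dpif_netlink_recv__() below does the same lookup with
dpif_handler_channel_get(), returning EAGAIN when the channel has
disappeared in the meantime.]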
> >  
> > @@ -2714,6 +2759,8 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
> >      OVS_REQ_RDLOCK(dpif->upcall_lock)
> >  {
> >      struct dpif_handler *handler;
> > +    size_t n_channels;
> > +
> >      int read_tries = 0;
> >  
> >      if (!dpif->handlers || handler_id >= dpif->n_handlers) {
> > @@ -2721,14 +2768,15 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
> >      }
> >  
> >      handler = &dpif->handlers[handler_id];
> > -    if (handler->event_offset >= handler->n_events) {
> > +    n_channels = dpif_handler_channels_count(handler);
> > +    if (handler->event_offset >= handler->n_events && n_channels) {
> >          int retval;
> >  
> >          handler->event_offset = handler->n_events = 0;
> >  
> >          do {
> >              retval = epoll_wait(handler->epoll_fd, handler->epoll_events,
> > -                                dpif->uc_array_size, 0);
> > +                                n_channels, 0);
> >          } while (retval < 0 && errno == EINTR);
> >  
> >          if (retval < 0) {
> > @@ -2741,7 +2789,10 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
> >  
> >      while (handler->event_offset < handler->n_events) {
> >          int idx = handler->epoll_events[handler->event_offset].data.u32;
> > -        struct dpif_channel *ch = &dpif->channels[idx];
> > +        struct dpif_channel *ch = dpif_handler_channel_get(handler, idx);
> > +        if (!ch) {
> > +            return EAGAIN;
> > +        }
> >  
> >          handler->event_offset++;
> >  
> > @@ -2845,12 +2896,14 @@ dpif_netlink_recv_purge__(struct dpif_netlink *dpif)
> >      if (dpif->handlers) {
> >          size_t i;
> >  
> > -        if (!dpif->channels[0].sock) {
> > -            return;
> > -        }
> > -        for (i = 0; i < dpif->uc_array_size; i++ ) {
> > +        for (i = 0; i < dpif->n_handlers; i++) {
> > +            struct dpif_handler *handler = &dpif->handlers[i];
> > +            struct dpif_channel *channel;
> > +
> > +            HMAP_FOR_EACH (channel, hmap_node, &handler->channels) {
> > +                nl_sock_drain(channel->sock);
> > +            }
> >  
> > -            nl_sock_drain(dpif->channels[i].sock);
> >          }
> >      }
> >  }
> > -- 
> > 2.26.2
> > 
> 
> -- 
> fbl

-- 
fbl
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev
