Currently, the channel handlers are polled globally. On some systems, this causes a thundering herd issue where multiple handler threads become active, only to do no work and immediately sleep.
The approach here is to push the netlink socket channels to discreet handler threads to process, rather than polling on every thread. This will eliminate the need to wake multiple threads. To check: ip netns add left ip netns add right ip link add center-left type veth peer name left0 ip link add center-right type veth peer name right0 ip link set left0 netns left ip link set right0 netns right ip link set center-left up ip link set center-right up ip -n left ip link set left0 up ip -n left ip addr add 172.31.110.10/24 dev left0 ip -n right ip link set right0 up ip -n right ip addr add 172.31.110.11/24 dev right0 ovs-vsctl add-br br0 ovs-vsctl add-port br0 center-right ovs-vsctl add-port br0 center-left # in one terminal perf record -e sched:sched_wakeup,irq:softirq_entry -ag # in a separate terminal ip netns exec left arping -I left0 -c 1 172.31.110.11 # in the perf terminal after exiting perf script Look for the number of 'handler' threads which were made active. Suggested-by: Ben Pfaff <[email protected]> Reported-by: David Ahern <[email protected]> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-December/365857.html Cc: Matteo Croce <[email protected]> Cc: Flavio Leitner <[email protected]> Fixes: 69c51582f ("dpif-netlink: don't allocate per thread netlink sockets") Signed-off-by: Aaron Conole <[email protected]> --- lib/dpif-netlink.c | 290 ++++++++++++++++++++++++++++----------------- 1 file changed, 180 insertions(+), 110 deletions(-) diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index 7da4fb54d9..e1468ee5fd 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -183,6 +183,10 @@ struct dpif_handler { int n_events; /* Num events returned by epoll_wait(). */ int event_offset; /* Offset into 'epoll_events'. */ + struct dpif_channel *channels; /* Array of channels for each port. */ + uint32_t *port_idx_map; /* Port idx to channel idx. */ + size_t n_channels; + #ifdef _WIN32 /* Pool of sockets. */ struct dpif_windows_vport_sock *vport_sock_pool; @@ -202,8 +206,6 @@ struct dpif_netlink { struct dpif_handler *handlers; uint32_t n_handlers; /* Num of upcall handlers. */ struct dpif_channel *channels; /* Array of channels for each port. */ - int uc_array_size; /* Size of 'handler->channels' and */ - /* 'handler->epoll_events'. */ /* Change notification. */ struct nl_sock *port_notifier; /* vport multicast group subscriber. */ @@ -287,6 +289,55 @@ close_nl_sock(struct nl_sock *sock) #endif } +static uint32_t +dpif_handler_port_idx_max(const struct dpif_handler *handler, + struct dpif_channel **out_chn) +{ + uint32_t max = 0; + size_t i; + + for (i = 0; i < handler->n_channels; ++i) { + if (handler->channels[i].sock && + handler->port_idx_map[i] > max) { + max = handler->port_idx_map[i]; + if (out_chn) { + *out_chn = &handler->channels[i]; + } + } + } + + return max; +} + +static size_t +dpif_handler_active_channels(const struct dpif_handler *handler) +{ + size_t i, ret = 0; + + for (i = 0; i < handler->n_channels; ++i) { + if (handler->channels[i].sock) { + ret++; + } + } + + return ret; +} + +static struct dpif_channel * +dpif_handler_has_port_idx(const struct dpif_handler *handler, uint32_t idx) +{ + size_t i; + + for (i = 0; i < handler->n_channels; ++i) { + if (handler->channels[i].sock && + handler->port_idx_map[i] == idx) { + return &handler->channels[i]; + } + } + + return NULL; +} + static struct dpif_netlink * dpif_netlink_cast(const struct dpif *dpif) { @@ -452,90 +503,86 @@ static bool vport_get_pid(struct dpif_netlink *dpif, uint32_t port_idx, uint32_t *upcall_pid) { - /* Since the nl_sock can only be assigned in either all - * or none "dpif" channels, the following check - * would suffice. */ - if (!dpif->channels[port_idx].sock) { - return false; - } + size_t i; + ovs_assert(!WINDOWS || dpif->n_handlers <= 1); - *upcall_pid = nl_sock_pid(dpif->channels[port_idx].sock); + /* The 'port_idx' should only be valid for a single handler. */ + for (i = 0; i < dpif->n_handlers; ++i) { + struct dpif_channel *channel = + dpif_handler_has_port_idx(&dpif->handlers[i], port_idx); + + if (channel) { + *upcall_pid = nl_sock_pid(channel->sock); + return true; + } + } - return true; + return false; } static int vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no, struct nl_sock *sock) { - struct epoll_event event; uint32_t port_idx = odp_to_u32(port_no); + struct dpif_handler *handler; + struct dpif_channel *channel; + struct epoll_event event; + int error = 0; size_t i; - int error; if (dpif->handlers == NULL) { close_nl_sock(sock); - return 0; + return error; } - /* We assume that the datapath densely chooses port numbers, which can - * therefore be used as an index into 'channels' and 'epoll_events' of - * 'dpif'. */ - if (port_idx >= dpif->uc_array_size) { - uint32_t new_size = port_idx + 1; - - if (new_size > MAX_PORTS) { - VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big", - dpif_name(&dpif->dpif), port_no); - return EFBIG; - } + handler = &dpif->handlers[0]; - dpif->channels = xrealloc(dpif->channels, - new_size * sizeof *dpif->channels); + for (i = 0; i < dpif->n_handlers; ++i) { + if (dpif_handler_active_channels(&dpif->handlers[i]) < + dpif_handler_active_channels(handler)) + handler = &dpif->handlers[i]; + } - for (i = dpif->uc_array_size; i < new_size; i++) { - dpif->channels[i].sock = NULL; + channel = NULL; + for (i = 0; i < handler->n_channels; ++i) { + if (!handler->channels[i].sock) { + channel = &handler->channels[i]; + handler->port_idx_map[i] = port_idx; } + } - for (i = 0; i < dpif->n_handlers; i++) { - struct dpif_handler *handler = &dpif->handlers[i]; - - handler->epoll_events = xrealloc(handler->epoll_events, - new_size * sizeof *handler->epoll_events); - - } - dpif->uc_array_size = new_size; + if (channel == NULL) { + size_t new_size = handler->n_channels + 1; + handler->channels = xrealloc(handler->channels, + new_size * sizeof *handler->channels); + handler->epoll_events = xrealloc(handler->epoll_events, + new_size * + sizeof *handler->epoll_events); + handler->port_idx_map = xrealloc(handler->port_idx_map, + new_size * + sizeof *handler->port_idx_map); + channel = &handler->channels[handler->n_channels]; + handler->port_idx_map[handler->n_channels] = port_idx; + i = handler->n_channels; + handler->n_channels += 1; } + channel->sock = sock; + channel->last_poll = LLONG_MIN; + memset(&event, 0, sizeof event); event.events = EPOLLIN | EPOLLEXCLUSIVE; - event.data.u32 = port_idx; - - for (i = 0; i < dpif->n_handlers; i++) { - struct dpif_handler *handler = &dpif->handlers[i]; + event.data.u32 = i; #ifndef _WIN32 - if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock), - &event) < 0) { - error = errno; - goto error; - } -#endif - } - dpif->channels[port_idx].sock = sock; - dpif->channels[port_idx].last_poll = LLONG_MIN; - - return 0; - -error: -#ifndef _WIN32 - while (i--) { - epoll_ctl(dpif->handlers[i].epoll_fd, EPOLL_CTL_DEL, - nl_sock_fd(sock), NULL); + if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock), + &event) < 0) { + error = errno; + channel->sock = NULL; } #endif - dpif->channels[port_idx].sock = NULL; return error; } @@ -544,69 +591,82 @@ static void vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no) { uint32_t port_idx = odp_to_u32(port_no); + struct dpif_handler *handler = NULL; + struct dpif_channel *channel = NULL; size_t i; - if (!dpif->handlers || port_idx >= dpif->uc_array_size - || !dpif->channels[port_idx].sock) { + if (!dpif->handlers) { return; } - for (i = 0; i < dpif->n_handlers; i++) { - struct dpif_handler *handler = &dpif->handlers[i]; + for (i = 0; i < dpif->n_handlers && channel == NULL; i++) { + handler = &dpif->handlers[i]; + channel = dpif_handler_has_port_idx(handler, port_idx); + + if (channel == NULL) { + continue; + } + #ifndef _WIN32 epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL, - nl_sock_fd(dpif->channels[port_idx].sock), NULL); + nl_sock_fd(channel->sock), NULL); #endif handler->event_offset = handler->n_events = 0; } + + if (channel) { #ifndef _WIN32 - nl_sock_destroy(dpif->channels[port_idx].sock); + nl_sock_destroy(channel->sock); #endif - dpif->channels[port_idx].sock = NULL; + channel->sock = NULL; + } } static void destroy_all_channels(struct dpif_netlink *dpif) OVS_REQ_WRLOCK(dpif->upcall_lock) { - unsigned int i; + size_t i, j; if (!dpif->handlers) { return; } - for (i = 0; i < dpif->uc_array_size; i++ ) { - struct dpif_netlink_vport vport_request; - uint32_t upcall_pids = 0; + for (i = 0; i < dpif->n_handlers; ++i) { + struct dpif_handler *handler = &dpif->handlers[i]; - if (!dpif->channels[i].sock) { - continue; - } + for (j = 0; j < handler->n_channels; ++j) { + + struct dpif_netlink_vport vport_request; + uint32_t upcall_pids = 0; - /* Turn off upcalls. */ - dpif_netlink_vport_init(&vport_request); - vport_request.cmd = OVS_VPORT_CMD_SET; - vport_request.dp_ifindex = dpif->dp_ifindex; - vport_request.port_no = u32_to_odp(i); - vport_request.n_upcall_pids = 1; - vport_request.upcall_pids = &upcall_pids; - dpif_netlink_vport_transact(&vport_request, NULL, NULL); + if (!handler->channels[j].sock) { + continue; + } - vport_del_channels(dpif, u32_to_odp(i)); - } + /* Turn off upcalls. */ + dpif_netlink_vport_init(&vport_request); + vport_request.cmd = OVS_VPORT_CMD_SET; + vport_request.dp_ifindex = dpif->dp_ifindex; + vport_request.port_no = u32_to_odp(handler->port_idx_map[j]); + vport_request.n_upcall_pids = 1; + vport_request.upcall_pids = &upcall_pids; + dpif_netlink_vport_transact(&vport_request, NULL, NULL); - for (i = 0; i < dpif->n_handlers; i++) { - struct dpif_handler *handler = &dpif->handlers[i]; + vport_del_channels(dpif, u32_to_odp(handler->port_idx_map[j])); + } dpif_netlink_handler_uninit(handler); + handler->n_channels = 0; + free(handler->epoll_events); + free(handler->port_idx_map); + free(handler->channels); } - free(dpif->channels); + free(dpif->handlers); dpif->handlers = NULL; - dpif->channels = NULL; dpif->n_handlers = 0; - dpif->uc_array_size = 0; } static void @@ -1084,23 +1144,31 @@ dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif, odp_port_t port_no) OVS_REQ_RDLOCK(dpif->upcall_lock) { + struct dpif_channel *min_channel = NULL, *found_channel = NULL; uint32_t port_idx = odp_to_u32(port_no); uint32_t pid = 0; - if (dpif->handlers && dpif->uc_array_size > 0) { - /* The ODPP_NONE "reserved" port number uses the "ovs-system"'s - * channel, since it is not heavily loaded. */ - uint32_t idx = port_idx >= dpif->uc_array_size ? 0 : port_idx; - - /* Needs to check in case the socket pointer is changed in between - * the holding of upcall_lock. A known case happens when the main - * thread deletes the vport while the handler thread is handling - * the upcall from that port. */ - if (dpif->channels[idx].sock) { - pid = nl_sock_pid(dpif->channels[idx].sock); + if (dpif->handlers) { + size_t i; + for (i = 0; i < dpif->n_handlers; ++i) { + if (dpif_handler_has_port_idx(&dpif->handlers[i], port_idx)) { + found_channel = dpif_handler_has_port_idx(&dpif->handlers[i], + port_idx); + } + + if (dpif_handler_has_port_idx(&dpif->handlers[i], 0)) { + min_channel = dpif_handler_has_port_idx(&dpif->handlers[i], + 0); + } } } + if (found_channel) { + pid = nl_sock_pid(found_channel->sock); + } else if (min_channel) { + pid = nl_sock_pid(min_channel->sock); + } + return pid; } @@ -2376,7 +2444,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers) { unsigned long int *keep_channels; struct dpif_netlink_vport vport; - size_t keep_channels_nbits; + size_t keep_channels_nbits = 0; struct nl_dump dump; uint64_t reply_stub[NL_DUMP_BUFSIZE / 8]; struct ofpbuf buf; @@ -2414,9 +2482,11 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers) struct dpif_handler *handler = &dpif->handlers[i]; handler->event_offset = handler->n_events = 0; + if (keep_channels_nbits < dpif_handler_port_idx_max(handler, NULL)) { + keep_channels_nbits = dpif_handler_port_idx_max(handler, NULL); + } } - keep_channels_nbits = dpif->uc_array_size; keep_channels = bitmap_allocate(keep_channels_nbits); ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub); @@ -2426,8 +2496,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers) uint32_t upcall_pid; int error; - if (port_no >= dpif->uc_array_size - || !vport_get_pid(dpif, port_no, &upcall_pid)) { + if (!vport_get_pid(dpif, port_no, &upcall_pid)) { struct nl_sock *sock; error = create_nl_sock(dpif, &sock); @@ -2721,14 +2790,14 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id, } handler = &dpif->handlers[handler_id]; - if (handler->event_offset >= handler->n_events) { + if (handler->event_offset >= handler->n_events && handler->n_channels) { int retval; handler->event_offset = handler->n_events = 0; do { retval = epoll_wait(handler->epoll_fd, handler->epoll_events, - dpif->uc_array_size, 0); + handler->n_channels, 0); } while (retval < 0 && errno == EINTR); if (retval < 0) { @@ -2741,7 +2810,7 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id, while (handler->event_offset < handler->n_events) { int idx = handler->epoll_events[handler->event_offset].data.u32; - struct dpif_channel *ch = &dpif->channels[idx]; + struct dpif_channel *ch = &handler->channels[idx]; handler->event_offset++; @@ -2845,12 +2914,13 @@ dpif_netlink_recv_purge__(struct dpif_netlink *dpif) if (dpif->handlers) { size_t i; - if (!dpif->channels[0].sock) { - return; - } - for (i = 0; i < dpif->uc_array_size; i++ ) { + for (i = 0; i < dpif->n_handlers; i++ ) { + struct dpif_handler *handler = &dpif->handlers[i]; + size_t j; - nl_sock_drain(dpif->channels[i].sock); + for (j = 0; j < handler->n_channels; ++j) { + nl_sock_drain(handler->channels[j].sock); + } } } } -- 2.25.4 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
