Flavio Leitner <[email protected]> writes:
> Hi Aaron,
>
> Thanks for the patch. I ran some basic tests here
> and they passed. I could see only one handler thread
> becoming active with a single upcall.
>
> See my comment below.
>
> On Tue, Jul 21, 2020 at 07:27:41PM -0400, Aaron Conole wrote:
>> Currently, the channel handlers are polled globally. On some
>> systems, this causes a thundering herd issue where multiple
>> handler threads become active, only to do no work and immediately
>> sleep.
>>
>> The approach here is to push the netlink socket channels to discreet
>> handler threads to process, rather than polling on every thread.
>> This will eliminate the need to wake multiple threads.
>>
>> To check:
>>
>> ip netns add left
>> ip netns add right
>> ip link add center-left type veth peer name left0
>> ip link add center-right type veth peer name right0
>> ip link set left0 netns left
>> ip link set right0 netns right
>> ip link set center-left up
>> ip link set center-right up
>> ip -n left ip link set left0 up
>> ip -n left ip addr add 172.31.110.10/24 dev left0
>> ip -n right ip link set right0 up
>> ip -n right ip addr add 172.31.110.11/24 dev right0
>>
>> ovs-vsctl add-br br0
>> ovs-vsctl add-port br0 center-right
>> ovs-vsctl add-port br0 center-left
>>
>> # in one terminal
>> perf record -e sched:sched_wakeup,irq:softirq_entry -ag
>>
>> # in a separate terminal
>> ip netns exec left arping -I left0 -c 1 172.31.110.11
>>
>> # in the perf terminal after exiting
>> perf script
>>
>> Look for the number of 'handler' threads which were made active.
>>
>> Suggested-by: Ben Pfaff <[email protected]>
>> Reported-by: David Ahern <[email protected]>
>> Reported-at:
>> https://mail.openvswitch.org/pipermail/ovs-dev/2019-December/365857.html
>> Cc: Matteo Croce <[email protected]>
>> Cc: Flavio Leitner <[email protected]>
>> Fixes: 69c51582f ("dpif-netlink: don't allocate per thread netlink sockets")
>> Signed-off-by: Aaron Conole <[email protected]>
>> ---
>> v2: Oops - forgot to commit my whitespace cleanups.
>>
>> lib/dpif-netlink.c | 289 ++++++++++++++++++++++++++++-----------------
>> 1 file changed, 179 insertions(+), 110 deletions(-)
>>
>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
>> index 7da4fb54d9..71d2805427 100644
>> --- a/lib/dpif-netlink.c
>> +++ b/lib/dpif-netlink.c
>> @@ -183,6 +183,10 @@ struct dpif_handler {
>> int n_events; /* Num events returned by epoll_wait(). */
>> int event_offset; /* Offset into 'epoll_events'. */
>>
>> + struct dpif_channel *channels; /* Array of channels for each port. */
>> + uint32_t *port_idx_map; /* Port idx to channel idx. */
>> + size_t n_channels;
>> +
>> #ifdef _WIN32
>> /* Pool of sockets. */
>> struct dpif_windows_vport_sock *vport_sock_pool;
>> @@ -202,8 +206,6 @@ struct dpif_netlink {
>> struct dpif_handler *handlers;
>> uint32_t n_handlers; /* Num of upcall handlers. */
>> struct dpif_channel *channels; /* Array of channels for each port. */
>
>
> Shouldn't the above channels be removed?
Yes.
>> - int uc_array_size; /* Size of 'handler->channels' and */
>> - /* 'handler->epoll_events'. */
>>
>> /* Change notification. */
>> struct nl_sock *port_notifier; /* vport multicast group subscriber. */
>> @@ -287,6 +289,55 @@ close_nl_sock(struct nl_sock *sock)
>> #endif
>> }
>>
>> +static uint32_t
>> +dpif_handler_port_idx_max(const struct dpif_handler *handler,
>> + struct dpif_channel **out_chn)
>
> Looks like there is no caller passing out_chn, so why
> that is needed?
It was from an earlier rework of the patch. I dropped it.
>> +{
>> + uint32_t max = 0;
>> + size_t i;
>> +
>> + for (i = 0; i < handler->n_channels; ++i) {
>
> OVS does have some code using '++i', but the majority of
> it uses 'i++', can you please swap them?
Sure.
>
>> + if (handler->channels[i].sock &&
>> + handler->port_idx_map[i] > max) {
>> + max = handler->port_idx_map[i];
>> + if (out_chn) {
>> + *out_chn = &handler->channels[i];
>> + }
>> + }
>> + }
>> +
>> + return max;
>> +}
>> +
>> +static size_t
>> +dpif_handler_active_channels(const struct dpif_handler *handler)
>> +{
>> + size_t i, ret = 0;
>
> Perhaps 'count' or 'num' would be a better name.
'chans'
>> +
>> + for (i = 0; i < handler->n_channels; ++i) {
>> + if (handler->channels[i].sock) {
>> + ret++;
>> + }
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +static struct dpif_channel *
>> +dpif_handler_has_port_idx(const struct dpif_handler *handler, uint32_t idx)
>> +{
>> + size_t i;
>> +
>> + for (i = 0; i < handler->n_channels; ++i) {
>> + if (handler->channels[i].sock &&
>> + handler->port_idx_map[i] == idx) {
>> + return &handler->channels[i];
>> + }
>> + }
>> +
>> + return NULL;
>> +}
>> +
>> static struct dpif_netlink *
>> dpif_netlink_cast(const struct dpif *dpif)
>> {
>> @@ -452,90 +503,86 @@ static bool
>> vport_get_pid(struct dpif_netlink *dpif, uint32_t port_idx,
>> uint32_t *upcall_pid)
>> {
>> - /* Since the nl_sock can only be assigned in either all
>> - * or none "dpif" channels, the following check
>> - * would suffice. */
>> - if (!dpif->channels[port_idx].sock) {
>> - return false;
>> - }
>> + size_t i;
>> +
>> ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
>>
>> - *upcall_pid = nl_sock_pid(dpif->channels[port_idx].sock);
>> + /* The 'port_idx' should only be valid for a single handler. */
>> + for (i = 0; i < dpif->n_handlers; ++i) {
>> + struct dpif_channel *channel =
>> + dpif_handler_has_port_idx(&dpif->handlers[i], port_idx);
>> +
>> + if (channel) {
>> + *upcall_pid = nl_sock_pid(channel->sock);
>> + return true;
>> + }
>> + }
>>
>> - return true;
>> + return false;
>> }
>>
>> static int
>> vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no,
>> struct nl_sock *sock)
>> {
>> - struct epoll_event event;
>> uint32_t port_idx = odp_to_u32(port_no);
>> + struct dpif_handler *handler;
>> + struct dpif_channel *channel;
>> + struct epoll_event event;
>> + int error = 0;
>> size_t i;
>> - int error;
>>
>> if (dpif->handlers == NULL) {
>> close_nl_sock(sock);
>> - return 0;
>> + return error;
>> }
>>
>> - /* We assume that the datapath densely chooses port numbers, which can
>> - * therefore be used as an index into 'channels' and 'epoll_events' of
>> - * 'dpif'. */
>> - if (port_idx >= dpif->uc_array_size) {
>> - uint32_t new_size = port_idx + 1;
>> + handler = &dpif->handlers[0];
>>
>> - if (new_size > MAX_PORTS) {
>> - VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big",
>> - dpif_name(&dpif->dpif), port_no);
>> - return EFBIG;
>> - }
>
>
> Could you please explain why checking MAX_PORTS is not
> important anymore? If it isn't, then this should remove
> MAX_PORT declaration as that was the only use.
I think it is, and was dropped by mistake. Fixed.
>
>> -
>> - dpif->channels = xrealloc(dpif->channels,
>> - new_size * sizeof *dpif->channels);
>> + for (i = 0; i < dpif->n_handlers; ++i) {
>> + if (dpif_handler_active_channels(&dpif->handlers[i]) <
>> + dpif_handler_active_channels(handler))
>> + handler = &dpif->handlers[i];
>> + }
>>
>> - for (i = dpif->uc_array_size; i < new_size; i++) {
>> - dpif->channels[i].sock = NULL;
>> + channel = NULL;
>> + for (i = 0; i < handler->n_channels; ++i) {
>> + if (!handler->channels[i].sock) {
>> + channel = &handler->channels[i];
>> + handler->port_idx_map[i] = port_idx;
>> }
>> + }
>>
>> - for (i = 0; i < dpif->n_handlers; i++) {
>> - struct dpif_handler *handler = &dpif->handlers[i];
>> -
>> - handler->epoll_events = xrealloc(handler->epoll_events,
>> - new_size * sizeof *handler->epoll_events);
>> -
>> - }
>> - dpif->uc_array_size = new_size;
>> + if (channel == NULL) {
>> + size_t new_size = handler->n_channels + 1;
>> + handler->channels = xrealloc(handler->channels,
>> + new_size * sizeof *handler->channels);
>> + handler->epoll_events = xrealloc(handler->epoll_events,
>> + new_size *
>> + sizeof *handler->epoll_events);
>> + handler->port_idx_map = xrealloc(handler->port_idx_map,
>> + new_size *
>> + sizeof *handler->port_idx_map);
>> + channel = &handler->channels[handler->n_channels];
>> + handler->port_idx_map[handler->n_channels] = port_idx;
>> + i = handler->n_channels;
>> + handler->n_channels += 1;
>
> perhaps handler->n_channels = new_size to remove the magic number?
Sure.
>> }
>>
>> + channel->sock = sock;
>> + channel->last_poll = LLONG_MIN;
>> +
>> memset(&event, 0, sizeof event);
>> event.events = EPOLLIN | EPOLLEXCLUSIVE;
>> - event.data.u32 = port_idx;
>> -
>> - for (i = 0; i < dpif->n_handlers; i++) {
>> - struct dpif_handler *handler = &dpif->handlers[i];
>> + event.data.u32 = i;
>>
>> #ifndef _WIN32
>> - if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock),
>> - &event) < 0) {
>> - error = errno;
>> - goto error;
>> - }
>> -#endif
>> - }
>> - dpif->channels[port_idx].sock = sock;
>> - dpif->channels[port_idx].last_poll = LLONG_MIN;
>> -
>> - return 0;
>> -
>> -error:
>> -#ifndef _WIN32
>> - while (i--) {
>> - epoll_ctl(dpif->handlers[i].epoll_fd, EPOLL_CTL_DEL,
>> - nl_sock_fd(sock), NULL);
>> + if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock),
>> + &event) < 0) {
>> + error = errno;
>> + channel->sock = NULL;
>> }
>> #endif
>> - dpif->channels[port_idx].sock = NULL;
>>
>> return error;
>> }
>> @@ -544,69 +591,81 @@ static void
>> vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no)
>> {
>> uint32_t port_idx = odp_to_u32(port_no);
>> + struct dpif_handler *handler = NULL;
>> + struct dpif_channel *channel = NULL;
>> size_t i;
>>
>> - if (!dpif->handlers || port_idx >= dpif->uc_array_size
>> - || !dpif->channels[port_idx].sock) {
>> + if (!dpif->handlers) {
>> return;
>> }
>>
>> - for (i = 0; i < dpif->n_handlers; i++) {
>> - struct dpif_handler *handler = &dpif->handlers[i];
>> + for (i = 0; i < dpif->n_handlers && channel == NULL; i++) {
>> + handler = &dpif->handlers[i];
>> + channel = dpif_handler_has_port_idx(handler, port_idx);
>> +
>> + if (channel == NULL) {
>> + continue;
>> + }
>> +
>> #ifndef _WIN32
>> epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
>> - nl_sock_fd(dpif->channels[port_idx].sock), NULL);
>> + nl_sock_fd(channel->sock), NULL);
>> #endif
>> handler->event_offset = handler->n_events = 0;
>> }
>> +
>> + if (channel) {
>> #ifndef _WIN32
>> - nl_sock_destroy(dpif->channels[port_idx].sock);
>> + nl_sock_destroy(channel->sock);
>> #endif
>> - dpif->channels[port_idx].sock = NULL;
>> + channel->sock = NULL;
>
> Ok, so it doesn't resize the memory when ports are removed.
>
> I was wondering if it wouldn't scale better if we use hmaps and
> id_pools instead:
>
> --- a/lib/dpif-netlink.c
> +++ b/lib/dpif-netlink.c
> @@ -183,9 +183,7 @@ struct dpif_handler {
> int n_events; /* Num events returned by epoll_wait(). */
> int event_offset; /* Offset into 'epoll_events'. */
>
> - struct dpif_channel *channels; /* Array of channels for each port. */
> - uint32_t *port_idx_map; /* Port idx to channel idx. */
> - size_t n_channels;
> + struct hmap channels; /* Channels for each port. */
>
> #ifdef _WIN32
> /* Pool of sockets. */
> @@ -205,7 +203,7 @@ struct dpif_netlink {
> struct fat_rwlock upcall_lock;
> struct dpif_handler *handlers;
> uint32_t n_handlers; /* Num of upcall handlers. */
> - struct dpif_channel *channels; /* Array of channels for each port. */
> + struct id_pool *port_ids; /* Pool of port ids. */
>
> /* Change notification. */
> struct nl_sock *port_notifier; /* vport multicast group subscriber. */
>
> For instance, we can find the least loaded with just hmap_count() instead
> of iterating on all handlers and ports. Also when we add/remove a port, we
> could use HMAP_FOR_EACH_WITH_HASH() using port_id as hash to find the
> channel.
>
> What do you think?
I need to think about it a bit more. Seems reasonable.
>> + }
>> }
>>
>> static void
>> destroy_all_channels(struct dpif_netlink *dpif)
>> OVS_REQ_WRLOCK(dpif->upcall_lock)
>> {
>> - unsigned int i;
>> + size_t i, j;
>>
>> if (!dpif->handlers) {
>> return;
>> }
>>
>> - for (i = 0; i < dpif->uc_array_size; i++ ) {
>> - struct dpif_netlink_vport vport_request;
>> - uint32_t upcall_pids = 0;
>> + for (i = 0; i < dpif->n_handlers; ++i) {
>
> nitpick: Same comment as before to change to 'i++'
Done.
>> + struct dpif_handler *handler = &dpif->handlers[i];
>>
>> - if (!dpif->channels[i].sock) {
>> - continue;
>> - }
>> + for (j = 0; j < handler->n_channels; ++j) {
> and here
>
>> + struct dpif_netlink_vport vport_request;
>> + uint32_t upcall_pids = 0;
>>
>> - /* Turn off upcalls. */
>> - dpif_netlink_vport_init(&vport_request);
>> - vport_request.cmd = OVS_VPORT_CMD_SET;
>> - vport_request.dp_ifindex = dpif->dp_ifindex;
>> - vport_request.port_no = u32_to_odp(i);
>> - vport_request.n_upcall_pids = 1;
>> - vport_request.upcall_pids = &upcall_pids;
>> - dpif_netlink_vport_transact(&vport_request, NULL, NULL);
>> + if (!handler->channels[j].sock) {
>> + continue;
>> + }
>>
>> - vport_del_channels(dpif, u32_to_odp(i));
>> - }
>> + /* Turn off upcalls. */
>> + dpif_netlink_vport_init(&vport_request);
>> + vport_request.cmd = OVS_VPORT_CMD_SET;
>> + vport_request.dp_ifindex = dpif->dp_ifindex;
>> + vport_request.port_no = u32_to_odp(handler->port_idx_map[j]);
>> + vport_request.n_upcall_pids = 1;
>> + vport_request.upcall_pids = &upcall_pids;
>> + dpif_netlink_vport_transact(&vport_request, NULL, NULL);
>>
>> - for (i = 0; i < dpif->n_handlers; i++) {
>> - struct dpif_handler *handler = &dpif->handlers[i];
>> + vport_del_channels(dpif, u32_to_odp(handler->port_idx_map[j]));
>> + }
>>
>> dpif_netlink_handler_uninit(handler);
>> + handler->n_channels = 0;
>> +
>> free(handler->epoll_events);
>> + free(handler->port_idx_map);
>> + free(handler->channels);
>> }
>> - free(dpif->channels);
>> +
>> free(dpif->handlers);
>> dpif->handlers = NULL;
>> - dpif->channels = NULL;
>> dpif->n_handlers = 0;
>> - dpif->uc_array_size = 0;
>> }
>>
>> static void
>> @@ -1084,23 +1143,31 @@ dpif_netlink_port_get_pid__(const struct
>> dpif_netlink *dpif,
>> odp_port_t port_no)
>> OVS_REQ_RDLOCK(dpif->upcall_lock)
>> {
>> + struct dpif_channel *min_channel = NULL, *found_channel = NULL;
>> uint32_t port_idx = odp_to_u32(port_no);
>> uint32_t pid = 0;
>>
>> - if (dpif->handlers && dpif->uc_array_size > 0) {
>> - /* The ODPP_NONE "reserved" port number uses the "ovs-system"'s
>> - * channel, since it is not heavily loaded. */
>> - uint32_t idx = port_idx >= dpif->uc_array_size ? 0 : port_idx;
>> -
>> - /* Needs to check in case the socket pointer is changed in between
>> - * the holding of upcall_lock. A known case happens when the main
>> - * thread deletes the vport while the handler thread is handling
>> - * the upcall from that port. */
>> - if (dpif->channels[idx].sock) {
>> - pid = nl_sock_pid(dpif->channels[idx].sock);
>> + if (dpif->handlers) {
>> + size_t i;
>> + for (i = 0; i < dpif->n_handlers; ++i) {
>> + if (dpif_handler_has_port_idx(&dpif->handlers[i], port_idx)) {
>> + found_channel =
>> dpif_handler_has_port_idx(&dpif->handlers[i],
>> + port_idx);
>> + }
>> +
>> + if (dpif_handler_has_port_idx(&dpif->handlers[i], 0)) {
>> + min_channel = dpif_handler_has_port_idx(&dpif->handlers[i],
>> + 0);
>> + }
>> }
>
> The comment about the ODPP_NONE reserved port seems still relevant.
> Also, if it finds a channel, couldn't it just break the loop?
Yes on both.
>> }
>>
>> + if (found_channel) {
>> + pid = nl_sock_pid(found_channel->sock);
>> + } else if (min_channel) {
>> + pid = nl_sock_pid(min_channel->sock);
>> + }
>> +
>> return pid;
>> }
>>
>> @@ -2376,7 +2443,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink
>> *dpif, uint32_t n_handlers)
>> {
>> unsigned long int *keep_channels;
>> struct dpif_netlink_vport vport;
>> - size_t keep_channels_nbits;
>> + size_t keep_channels_nbits = 0;
>> struct nl_dump dump;
>> uint64_t reply_stub[NL_DUMP_BUFSIZE / 8];
>> struct ofpbuf buf;
>> @@ -2414,9 +2481,11 @@ dpif_netlink_refresh_channels(struct dpif_netlink
>> *dpif, uint32_t n_handlers)
>> struct dpif_handler *handler = &dpif->handlers[i];
>>
>> handler->event_offset = handler->n_events = 0;
>> + if (keep_channels_nbits < dpif_handler_port_idx_max(handler, NULL))
>> {
>> + keep_channels_nbits = dpif_handler_port_idx_max(handler, NULL);
>> + }
>> }
>>
>> - keep_channels_nbits = dpif->uc_array_size;
>> keep_channels = bitmap_allocate(keep_channels_nbits);
>>
>> ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub);
>> @@ -2426,8 +2495,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink
>> *dpif, uint32_t n_handlers)
>> uint32_t upcall_pid;
>> int error;
>>
>> - if (port_no >= dpif->uc_array_size
>> - || !vport_get_pid(dpif, port_no, &upcall_pid)) {
>> + if (!vport_get_pid(dpif, port_no, &upcall_pid)) {
>> struct nl_sock *sock;
>> error = create_nl_sock(dpif, &sock);
>>
>> @@ -2721,14 +2789,14 @@ dpif_netlink_recv__(struct dpif_netlink *dpif,
>> uint32_t handler_id,
>> }
>>
>> handler = &dpif->handlers[handler_id];
>> - if (handler->event_offset >= handler->n_events) {
>> + if (handler->event_offset >= handler->n_events && handler->n_channels) {
>> int retval;
>>
>> handler->event_offset = handler->n_events = 0;
>>
>> do {
>> retval = epoll_wait(handler->epoll_fd, handler->epoll_events,
>> - dpif->uc_array_size, 0);
>> + handler->n_channels, 0);
>> } while (retval < 0 && errno == EINTR);
>>
>> if (retval < 0) {
>> @@ -2741,7 +2809,7 @@ dpif_netlink_recv__(struct dpif_netlink *dpif,
>> uint32_t handler_id,
>>
>> while (handler->event_offset < handler->n_events) {
>> int idx = handler->epoll_events[handler->event_offset].data.u32;
>> - struct dpif_channel *ch = &dpif->channels[idx];
>> + struct dpif_channel *ch = &handler->channels[idx];
>>
>> handler->event_offset++;
>>
>> @@ -2845,12 +2913,13 @@ dpif_netlink_recv_purge__(struct dpif_netlink *dpif)
>> if (dpif->handlers) {
>> size_t i;
>>
>> - if (!dpif->channels[0].sock) {
>> - return;
>> - }
>> - for (i = 0; i < dpif->uc_array_size; i++ ) {
>> + for (i = 0; i < dpif->n_handlers; i++) {
>> + struct dpif_handler *handler = &dpif->handlers[i];
>> + size_t j;
>>
>> - nl_sock_drain(dpif->channels[i].sock);
>> + for (j = 0; j < handler->n_channels; ++j) {
>> + nl_sock_drain(handler->channels[j].sock);
>> + }
>> }
>> }
>> }
>> --
>> 2.25.4
>>
>> _______________________________________________
>> dev mailing list
>> [email protected]
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev