WireGuard currently uses round-robin to dispatch packet handling across all online CPUs, including isolated ones (isolcpus).
This is unfortunate because it causes significant latency on isolated CPUs - see e.g. below over 240 usec: kworker/47:1-2373323 [047] 243644.756405: funcgraph_entry: | process_one_work() { kworker/47:1-2373323 [047] 243644.756406: funcgraph_entry: | wg_packet_decrypt_worker() { [...] kworker/47:1-2373323 [047] 243644.756647: funcgraph_exit: 0.591 us | } kworker/47:1-2373323 [047] 243644.756647: funcgraph_exit: ! 242.655 us | } Instead, restrict to non-isolated CPUs. Example: ~# cat /sys/devices/system/cpu/isolated 3 ~# /usr/share/doc/wireguard-tools/examples/ncat-client-server/client.sh ~# ping 192.168.4.1 Before - corresponding workqueues are executed on all CPUs: ~# trace-cmd record -p function -l wg_packet_decrypt_worker -- sleep 10 plugin 'function' CPU0 data recorded at offset=0x7d6000 4096 bytes in size CPU1 data recorded at offset=0x7d7000 4096 bytes in size CPU2 data recorded at offset=0x7d8000 4096 bytes in size CPU3 data recorded at offset=0x7d9000 4096 bytes in size ~# trace-cmd report cpus=4 kworker/3:1-52 [003] 49.784353: function: wg_packet_decrypt_worker kworker/0:1-17 [000] 50.782879: function: wg_packet_decrypt_worker kworker/1:3-162 [001] 51.783044: function: wg_packet_decrypt_worker kworker/2:1-56 [002] 52.782159: function: wg_packet_decrypt_worker kworker/3:1-52 [003] 53.780919: function: wg_packet_decrypt_worker kworker/0:0-6 [000] 54.781755: function: wg_packet_decrypt_worker kworker/1:3-162 [001] 55.781273: function: wg_packet_decrypt_worker kworker/2:1-56 [002] 56.781946: function: wg_packet_decrypt_worker kworker/3:1-52 [003] 57.781010: function: wg_packet_decrypt_worker kworker/0:0-6 [000] 58.782097: function: wg_packet_decrypt_worker ~# After - isolated CPU 3 is excluded: ~# trace-cmd record -p function -l wg_packet_decrypt_worker -- sleep 10 plugin 'function' CPU0 data recorded at offset=0x7d7000 4096 bytes in size CPU1 data recorded at offset=0x7d8000 4096 bytes in size CPU2 data recorded at offset=0x7d9000 4096 bytes in size CPU3 data 
recorded at offset=0x7da000 0 bytes in size ~# trace-cmd report CPU 3 is empty cpus=4 kworker/1:2-66 [001] 291.800063: function: wg_packet_decrypt_worker kworker/2:2-143 [002] 292.800266: function: wg_packet_decrypt_worker kworker/0:2-145 [000] 293.801778: function: wg_packet_decrypt_worker kworker/1:4-261 [001] 294.803411: function: wg_packet_decrypt_worker kworker/2:2-143 [002] 295.804068: function: wg_packet_decrypt_worker kworker/0:2-145 [000] 296.806057: function: wg_packet_decrypt_worker kworker/1:2-66 [001] 297.810686: function: wg_packet_decrypt_worker kworker/2:2-143 [002] 298.811602: function: wg_packet_decrypt_worker kworker/0:2-145 [000] 299.812790: function: wg_packet_decrypt_worker kworker/1:4-261 [001] 300.813076: function: wg_packet_decrypt_worker ~# Signed-off-by: Charles-Francois Natali <cf.nat...@gmail.com> --- drivers/net/wireguard/queueing.h | 59 +++++++++++++++++++++++++------- drivers/net/wireguard/receive.c | 2 +- 2 files changed, 48 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h index 583adb37e..106a2686c 100644 --- a/drivers/net/wireguard/queueing.h +++ b/drivers/net/wireguard/queueing.h @@ -11,6 +11,7 @@ #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/sched/isolation.h> #include <net/ip_tunnels.h> struct wg_device; @@ -102,16 +103,50 @@ static inline void wg_reset_packet(struct sk_buff *skb, bool encapsulating) skb_reset_inner_headers(skb); } -static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id) +/* We only want to dispatch work to housekeeping CPUs, ignoring isolated ones. 
+ */ +static inline const struct cpumask *wg_cpumask_housekeeping(void) +{ + return housekeeping_cpumask(HK_FLAG_DOMAIN); +} + +static inline int wg_cpumask_test_cpu(int cpu) +{ + return cpumask_test_cpu(cpu, cpu_online_mask) && + cpumask_test_cpu(cpu, wg_cpumask_housekeeping()); +} + +static inline unsigned int wg_cpumask_first(void) +{ + return cpumask_first_and(cpu_online_mask, wg_cpumask_housekeeping()); +} + +static inline unsigned int wg_cpumask_next(int n) +{ + return cpumask_next_and(n, cpu_online_mask, wg_cpumask_housekeeping()); +} + +static inline unsigned int wg_cpumask_weight(void) +{ + int cpu; + int weight = 0; + + for_each_cpu_and(cpu, cpu_online_mask, wg_cpumask_housekeeping()) { + ++weight; + } + + return weight; +} + +static inline int wg_cpumask_choose_eligible(int *stored_cpu, unsigned int id) { unsigned int cpu = *stored_cpu, cpu_index, i; - if (unlikely(cpu == nr_cpumask_bits || - !cpumask_test_cpu(cpu, cpu_online_mask))) { - cpu_index = id % cpumask_weight(cpu_online_mask); - cpu = cpumask_first(cpu_online_mask); + if (unlikely(cpu == nr_cpumask_bits || !wg_cpumask_test_cpu(cpu))) { + cpu_index = id % wg_cpumask_weight(); + cpu = wg_cpumask_first(); for (i = 0; i < cpu_index; ++i) - cpu = cpumask_next(cpu, cpu_online_mask); + cpu = wg_cpumask_next(cpu); *stored_cpu = cpu; } return cpu; @@ -124,13 +159,13 @@ static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id) * a bit slower, and it doesn't seem like this potential race actually * introduces any performance loss, so we live with it. 
*/ -static inline int wg_cpumask_next_online(int *next) +static inline int wg_cpumask_next_eligible(int *next) { int cpu = *next; - while (unlikely(!cpumask_test_cpu(cpu, cpu_online_mask))) - cpu = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits; - *next = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits; + while (unlikely(!wg_cpumask_test_cpu(cpu))) + cpu = wg_cpumask_next(cpu) % nr_cpumask_bits; + *next = wg_cpumask_next(cpu) % nr_cpumask_bits; return cpu; } @@ -173,7 +208,7 @@ static inline int wg_queue_enqueue_per_device_and_peer( /* Then we queue it up in the device queue, which consumes the * packet as soon as it can. */ - cpu = wg_cpumask_next_online(next_cpu); + cpu = wg_cpumask_next_eligible(next_cpu); if (unlikely(ptr_ring_produce_bh(&device_queue->ring, skb))) return -EPIPE; queue_work_on(cpu, wq, &per_cpu_ptr(device_queue->worker, cpu)->work); @@ -188,7 +223,7 @@ static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb)); atomic_set_release(&PACKET_CB(skb)->state, state); - queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id), + queue_work_on(wg_cpumask_choose_eligible(&peer->serial_work_cpu, peer->internal_id), peer->device->packet_crypt_wq, &peer->transmit_packet_work); wg_peer_put(peer); } diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c index 7b8df406c..2d5d903d0 100644 --- a/drivers/net/wireguard/receive.c +++ b/drivers/net/wireguard/receive.c @@ -572,7 +572,7 @@ void wg_packet_receive(struct wg_device *wg, struct sk_buff *skb) goto err; } atomic_inc(&wg->handshake_queue_len); - cpu = wg_cpumask_next_online(&wg->handshake_queue.last_cpu); + cpu = wg_cpumask_next_eligible(&wg->handshake_queue.last_cpu); /* Queues up a call to packet_process_queued_handshake_packets(skb): */ queue_work_on(cpu, wg->handshake_receive_wq, &per_cpu_ptr(wg->handshake_queue.worker, cpu)->work); -- 2.30.2