WireGuard currently dispatches packet handling in a round-robin fashion
across all online CPUs, including isolated ones (isolcpus).

This is unfortunate because it causes significant latency on isolated
CPUs; see, for example, the trace below, where a single work item takes
over 240 usec:

    kworker/47:1-2373323 [047] 243644.756405: funcgraph_entry: | process_one_work() {
    kworker/47:1-2373323 [047] 243644.756406: funcgraph_entry: | wg_packet_decrypt_worker() {
    [...]
    kworker/47:1-2373323 [047] 243644.756647: funcgraph_exit: 0.591 us | }
    kworker/47:1-2373323 [047] 243644.756647: funcgraph_exit: ! 242.655 us | }

Instead, restrict dispatching to online, non-isolated (housekeeping) CPUs.

Example:

~# cat /sys/devices/system/cpu/isolated
3
~# /usr/share/doc/wireguard-tools/examples/ncat-client-server/client.sh
~# ping 192.168.4.1

Before - the corresponding work items are executed on all CPUs, including
isolated CPU 3:

~# trace-cmd record -p function -l wg_packet_decrypt_worker -- sleep 10
  plugin 'function'
CPU0 data recorded at offset=0x7d6000
    4096 bytes in size
CPU1 data recorded at offset=0x7d7000
    4096 bytes in size
CPU2 data recorded at offset=0x7d8000
    4096 bytes in size
CPU3 data recorded at offset=0x7d9000
    4096 bytes in size
~# trace-cmd report
cpus=4
     kworker/3:1-52    [003]    49.784353: function: wg_packet_decrypt_worker
     kworker/0:1-17    [000]    50.782879: function: wg_packet_decrypt_worker
     kworker/1:3-162   [001]    51.783044: function: wg_packet_decrypt_worker
     kworker/2:1-56    [002]    52.782159: function: wg_packet_decrypt_worker
     kworker/3:1-52    [003]    53.780919: function: wg_packet_decrypt_worker
     kworker/0:0-6     [000]    54.781755: function: wg_packet_decrypt_worker
     kworker/1:3-162   [001]    55.781273: function: wg_packet_decrypt_worker
     kworker/2:1-56    [002]    56.781946: function: wg_packet_decrypt_worker
     kworker/3:1-52    [003]    57.781010: function: wg_packet_decrypt_worker
     kworker/0:0-6     [000]    58.782097: function: wg_packet_decrypt_worker
~#

After - isolated CPU 3 is excluded:

~# trace-cmd record -p function -l wg_packet_decrypt_worker -- sleep 10
  plugin 'function'
CPU0 data recorded at offset=0x7d7000
    4096 bytes in size
CPU1 data recorded at offset=0x7d8000
    4096 bytes in size
CPU2 data recorded at offset=0x7d9000
    4096 bytes in size
CPU3 data recorded at offset=0x7da000
    0 bytes in size
~# trace-cmd report
CPU 3 is empty
cpus=4
     kworker/1:2-66    [001]   291.800063: function: wg_packet_decrypt_worker
     kworker/2:2-143   [002]   292.800266: function: wg_packet_decrypt_worker
     kworker/0:2-145   [000]   293.801778: function: wg_packet_decrypt_worker
     kworker/1:4-261   [001]   294.803411: function: wg_packet_decrypt_worker
     kworker/2:2-143   [002]   295.804068: function: wg_packet_decrypt_worker
     kworker/0:2-145   [000]   296.806057: function: wg_packet_decrypt_worker
     kworker/1:2-66    [001]   297.810686: function: wg_packet_decrypt_worker
     kworker/2:2-143   [002]   298.811602: function: wg_packet_decrypt_worker
     kworker/0:2-145   [000]   299.812790: function: wg_packet_decrypt_worker
     kworker/1:4-261   [001]   300.813076: function: wg_packet_decrypt_worker
~#

Signed-off-by: Charles-Francois Natali <cf.nat...@gmail.com>
---
 drivers/net/wireguard/queueing.h | 59 +++++++++++++++++++++++++-------
 drivers/net/wireguard/receive.c  |  2 +-
 2 files changed, 48 insertions(+), 13 deletions(-)

diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h
index 583adb37e..106a2686c 100644
--- a/drivers/net/wireguard/queueing.h
+++ b/drivers/net/wireguard/queueing.h
@@ -11,6 +11,7 @@
 #include <linux/skbuff.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
+#include <linux/sched/isolation.h>
 #include <net/ip_tunnels.h>
 
 struct wg_device;
@@ -102,16 +103,50 @@ static inline void wg_reset_packet(struct sk_buff *skb, 
bool encapsulating)
        skb_reset_inner_headers(skb);
 }
 
-static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id)
+/* We only want to dispatch work to housekeeping CPUs, ignoring isolated ones.
+ */
+static inline const struct cpumask *wg_cpumask_housekeeping(void)
+{
+       return housekeeping_cpumask(HK_FLAG_DOMAIN);
+}
+
+static inline int wg_cpumask_test_cpu(int cpu)
+{
+       return cpumask_test_cpu(cpu, cpu_online_mask) &&
+               cpumask_test_cpu(cpu, wg_cpumask_housekeeping());
+}
+
+static inline unsigned int wg_cpumask_first(void)
+{
+       return cpumask_first_and(cpu_online_mask, wg_cpumask_housekeeping());
+}
+
+static inline unsigned int wg_cpumask_next(int n)
+{
+       return cpumask_next_and(n, cpu_online_mask, wg_cpumask_housekeeping());
+}
+
+static inline unsigned int wg_cpumask_weight(void)
+{
+       int cpu;
+       int weight = 0;
+
+       for_each_cpu_and(cpu, cpu_online_mask, wg_cpumask_housekeeping()) {
+               ++weight;
+       }
+
+       return weight;
+}
+
+static inline int wg_cpumask_choose_eligible(int *stored_cpu, unsigned int id)
 {
        unsigned int cpu = *stored_cpu, cpu_index, i;
 
-       if (unlikely(cpu == nr_cpumask_bits ||
-                    !cpumask_test_cpu(cpu, cpu_online_mask))) {
-               cpu_index = id % cpumask_weight(cpu_online_mask);
-               cpu = cpumask_first(cpu_online_mask);
+       if (unlikely(cpu == nr_cpumask_bits || !wg_cpumask_test_cpu(cpu))) {
+               cpu_index = id % wg_cpumask_weight();
+               cpu = wg_cpumask_first();
                for (i = 0; i < cpu_index; ++i)
-                       cpu = cpumask_next(cpu, cpu_online_mask);
+                       cpu = wg_cpumask_next(cpu);
                *stored_cpu = cpu;
        }
        return cpu;
@@ -124,13 +159,13 @@ static inline int wg_cpumask_choose_online(int 
*stored_cpu, unsigned int id)
  * a bit slower, and it doesn't seem like this potential race actually
  * introduces any performance loss, so we live with it.
  */
-static inline int wg_cpumask_next_online(int *next)
+static inline int wg_cpumask_next_eligible(int *next)
 {
        int cpu = *next;
 
-       while (unlikely(!cpumask_test_cpu(cpu, cpu_online_mask)))
-               cpu = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits;
-       *next = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits;
+       while (unlikely(!wg_cpumask_test_cpu(cpu)))
+               cpu = wg_cpumask_next(cpu) % nr_cpumask_bits;
+       *next = wg_cpumask_next(cpu) % nr_cpumask_bits;
        return cpu;
 }
 
@@ -173,7 +208,7 @@ static inline int wg_queue_enqueue_per_device_and_peer(
        /* Then we queue it up in the device queue, which consumes the
         * packet as soon as it can.
         */
-       cpu = wg_cpumask_next_online(next_cpu);
+       cpu = wg_cpumask_next_eligible(next_cpu);
        if (unlikely(ptr_ring_produce_bh(&device_queue->ring, skb)))
                return -EPIPE;
        queue_work_on(cpu, wq, &per_cpu_ptr(device_queue->worker, cpu)->work);
@@ -188,7 +223,7 @@ static inline void wg_queue_enqueue_per_peer_tx(struct 
sk_buff *skb, enum packet
        struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb));
 
        atomic_set_release(&PACKET_CB(skb)->state, state);
-       queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, 
peer->internal_id),
+       queue_work_on(wg_cpumask_choose_eligible(&peer->serial_work_cpu, 
peer->internal_id),
                      peer->device->packet_crypt_wq, 
&peer->transmit_packet_work);
        wg_peer_put(peer);
 }
diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c
index 7b8df406c..2d5d903d0 100644
--- a/drivers/net/wireguard/receive.c
+++ b/drivers/net/wireguard/receive.c
@@ -572,7 +572,7 @@ void wg_packet_receive(struct wg_device *wg, struct sk_buff 
*skb)
                        goto err;
                }
                atomic_inc(&wg->handshake_queue_len);
-               cpu = wg_cpumask_next_online(&wg->handshake_queue.last_cpu);
+               cpu = wg_cpumask_next_eligible(&wg->handshake_queue.last_cpu);
                /* Queues up a call to 
packet_process_queued_handshake_packets(skb): */
                queue_work_on(cpu, wg->handshake_receive_wq,
                              &per_cpu_ptr(wg->handshake_queue.worker, 
cpu)->work);
-- 
2.30.2

Reply via email to