On 2/12/26 09:14, Jason Wang wrote:
> On Mon, Feb 9, 2026 at 2:18 AM Simon Schippers
> <[email protected]> wrote:
>>
>> On 2/6/26 04:21, Jason Wang wrote:
>>> On Fri, Feb 6, 2026 at 6:28 AM Simon Schippers
>>> <[email protected]> wrote:
>>>>
>>>> On 2/5/26 04:59, Jason Wang wrote:
>>>>> On Wed, Feb 4, 2026 at 11:44 PM Simon Schippers
>>>>> <[email protected]> wrote:
>>>>>>
>>>>>> On 2/3/26 04:48, Jason Wang wrote:
>>>>>>> On Mon, Feb 2, 2026 at 4:19 AM Simon Schippers
>>>>>>> <[email protected]> wrote:
>>>>>>>>
>>>>>>>> On 1/30/26 02:51, Jason Wang wrote:
>>>>>>>>> On Thu, Jan 29, 2026 at 5:25 PM Simon Schippers
>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>>
>>>>>>>>>> On 1/29/26 02:14, Jason Wang wrote:
>>>>>>>>>>> On Wed, Jan 28, 2026 at 3:54 PM Simon Schippers
>>>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> On 1/28/26 08:03, Jason Wang wrote:
>>>>>>>>>>>>> On Wed, Jan 28, 2026 at 12:48 AM Simon Schippers
>>>>>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> On 1/23/26 10:54, Simon Schippers wrote:
>>>>>>>>>>>>>>> On 1/23/26 04:05, Jason Wang wrote:
>>>>>>>>>>>>>>>> On Thu, Jan 22, 2026 at 1:35 PM Jason Wang
>>>>>>>>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> On Wed, Jan 21, 2026 at 5:33 PM Simon Schippers
>>>>>>>>>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> On 1/9/26 07:02, Jason Wang wrote:
>>>>>>>>>>>>>>>>>>> On Thu, Jan 8, 2026 at 3:41 PM Simon Schippers
>>>>>>>>>>>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> On 1/8/26 04:38, Jason Wang wrote:
>>>>>>>>>>>>>>>>>>>>> On Thu, Jan 8, 2026 at 5:06 AM Simon Schippers
>>>>>>>>>>>>>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Introduce {tun,tap}_ring_consume() helpers that wrap __ptr_ring_consume()
>>>>>>>>>>>>>>>>>>>>>> and wake the corresponding netdev subqueue when consuming an entry frees
>>>>>>>>>>>>>>>>>>>>>> space in the underlying ptr_ring.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Stopping of the netdev queue when the ptr_ring is full will be
>>>>>>>>>>>>>>>>>>>>>> introduced in an upcoming commit.
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> Co-developed-by: Tim Gebauer <[email protected]>
>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Tim Gebauer <[email protected]>
>>>>>>>>>>>>>>>>>>>>>> Signed-off-by: Simon Schippers <[email protected]>
>>>>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>>>> drivers/net/tap.c | 23 ++++++++++++++++++++++-
>>>>>>>>>>>>>>>>>>>>>> drivers/net/tun.c | 25 +++++++++++++++++++++++--
>>>>>>>>>>>>>>>>>>>>>> 2 files changed, 45 insertions(+), 3 deletions(-)
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
>>>>>>>>>>>>>>>>>>>>>> index 1197f245e873..2442cf7ac385 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/net/tap.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/net/tap.c
>>>>>>>>>>>>>>>>>>>>>> @@ -753,6 +753,27 @@ static ssize_t tap_put_user(struct tap_queue *q,
>>>>>>>>>>>>>>>>>>>>>> return ret ? ret : total;
>>>>>>>>>>>>>>>>>>>>>> }
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> +static void *tap_ring_consume(struct tap_queue *q)
>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>> + struct ptr_ring *ring = &q->ring;
>>>>>>>>>>>>>>>>>>>>>> + struct net_device *dev;
>>>>>>>>>>>>>>>>>>>>>> + void *ptr;
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + spin_lock(&ring->consumer_lock);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + ptr = __ptr_ring_consume(ring);
>>>>>>>>>>>>>>>>>>>>>> + if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
>>>>>>>>>>>>>>>>>>>>>> + rcu_read_lock();
>>>>>>>>>>>>>>>>>>>>>> + dev = rcu_dereference(q->tap)->dev;
>>>>>>>>>>>>>>>>>>>>>> + netif_wake_subqueue(dev, q->queue_index);
>>>>>>>>>>>>>>>>>>>>>> + rcu_read_unlock();
>>>>>>>>>>>>>>>>>>>>>> + }
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + spin_unlock(&ring->consumer_lock);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + return ptr;
>>>>>>>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> static ssize_t tap_do_read(struct tap_queue *q,
>>>>>>>>>>>>>>>>>>>>>>                            struct iov_iter *to,
>>>>>>>>>>>>>>>>>>>>>>                            int noblock, struct sk_buff *skb)
>>>>>>>>>>>>>>>>>>>>>> @@ -774,7 +795,7 @@ static ssize_t tap_do_read(struct tap_queue *q,
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> TASK_INTERRUPTIBLE);
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> /* Read frames from the queue */
>>>>>>>>>>>>>>>>>>>>>> - skb = ptr_ring_consume(&q->ring);
>>>>>>>>>>>>>>>>>>>>>> + skb = tap_ring_consume(q);
>>>>>>>>>>>>>>>>>>>>>> if (skb)
>>>>>>>>>>>>>>>>>>>>>> break;
>>>>>>>>>>>>>>>>>>>>>> if (noblock) {
>>>>>>>>>>>>>>>>>>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>>>>>>>>>>>>>>>>>>>> index 8192740357a0..7148f9a844a4 100644
>>>>>>>>>>>>>>>>>>>>>> --- a/drivers/net/tun.c
>>>>>>>>>>>>>>>>>>>>>> +++ b/drivers/net/tun.c
>>>>>>>>>>>>>>>>>>>>>> @@ -2113,13 +2113,34 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>>>>>>>>>>>>>>>>>>>>>> return total;
>>>>>>>>>>>>>>>>>>>>>> }
>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> +static void *tun_ring_consume(struct tun_file *tfile)
>>>>>>>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>>>>>>>> + struct ptr_ring *ring = &tfile->tx_ring;
>>>>>>>>>>>>>>>>>>>>>> + struct net_device *dev;
>>>>>>>>>>>>>>>>>>>>>> + void *ptr;
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + spin_lock(&ring->consumer_lock);
>>>>>>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>>>>>>> + ptr = __ptr_ring_consume(ring);
>>>>>>>>>>>>>>>>>>>>>> + if (unlikely(ptr && __ptr_ring_consume_created_space(ring, 1))) {
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> I guess it's the "bug" I mentioned in the previous patch that leads
>>>>>>>>>>>>>>>>>>>>> to the check of __ptr_ring_consume_created_space() here. If it's
>>>>>>>>>>>>>>>>>>>>> true, another call to tweak the current API.
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>> + rcu_read_lock();
>>>>>>>>>>>>>>>>>>>>>> + dev = rcu_dereference(tfile->tun)->dev;
>>>>>>>>>>>>>>>>>>>>>> + netif_wake_subqueue(dev, tfile->queue_index);
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> This would cause the producer TX_SOFTIRQ to run on the same cpu,
>>>>>>>>>>>>>>>>>>>>> which I'm not sure is what we want.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> What else would you suggest calling to wake the queue?
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> I don't have a good method in my mind, just want to point
>>>>>>>>>>>>>>>>>>> out its implications.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> I have to admit I'm a bit stuck at this point, particularly with this
>>>>>>>>>>>>>>>>>> aspect.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> What is the correct way to pass the producer CPU ID to the consumer?
>>>>>>>>>>>>>>>>>> Would it make sense to store smp_processor_id() in the tfile inside
>>>>>>>>>>>>>>>>>> tun_net_xmit(), or should it instead be stored in the skb (similar to
>>>>>>>>>>>>>>>>>> the XDP bit)? In the latter case, my concern is that this information
>>>>>>>>>>>>>>>>>> may already be significantly outdated by the time it is used.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Based on that, my idea would be for the consumer to wake the producer
>>>>>>>>>>>>>>>>>> by invoking a new function (e.g., tun_wake_queue()) on the producer
>>>>>>>>>>>>>>>>>> CPU via smp_call_function_single().
>>>>>>>>>>>>>>>>>> Is this a reasonable approach?
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> I'm not sure but it would introduce costs like IPI.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> More generally, would triggering TX_SOFTIRQ on the consumer CPU be
>>>>>>>>>>>>>>>>>> considered a deal-breaker for the patch set?
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> It depends on whether or not it has effects on the performance,
>>>>>>>>>>>>>>>>> especially when vhost is pinned.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> I meant we can benchmark to see the impact. For example, pin vhost to
>>>>>>>>>>>>>>>> a specific CPU and then try to see the impact of the TX_SOFTIRQ.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Thanks
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I ran benchmarks with vhost pinned to CPU 0 using taskset -p -c 0 ...
>>>>>>>>>>>>>>> for both the stock and patched versions. The benchmarks were run with
>>>>>>>>>>>>>>> the full patch series applied, since testing only patches 1-3 would not
>>>>>>>>>>>>>>> be meaningful - the queue is never stopped in that case, so no
>>>>>>>>>>>>>>> TX_SOFTIRQ is triggered.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Compared to the non-pinned CPU benchmarks in the cover letter,
>>>>>>>>>>>>>>> performance is lower for pktgen with a single thread but higher with
>>>>>>>>>>>>>>> four threads. The results show no regression for the patched version,
>>>>>>>>>>>>>>> with even slight performance improvements observed:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> +-------------------------+-----------+----------------+
>>>>>>>>>>>>>>> | pktgen benchmarks to | Stock | Patched with |
>>>>>>>>>>>>>>> | Debian VM, i5 6300HQ, | | fq_codel qdisc |
>>>>>>>>>>>>>>> | 100M packets | | |
>>>>>>>>>>>>>>> | vhost pinned to core 0 | | |
>>>>>>>>>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>>>>>>>>>>> | TAP | Transmitted | 452 Kpps | 454 Kpps |
>>>>>>>>>>>>>>> | + +-------------+-----------+----------------+
>>>>>>>>>>>>>>> | vhost-net | Lost | 1154 Kpps | 0 |
>>>>>>>>>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> +-------------------------+-----------+----------------+
>>>>>>>>>>>>>>> | pktgen benchmarks to | Stock | Patched with |
>>>>>>>>>>>>>>> | Debian VM, i5 6300HQ, | | fq_codel qdisc |
>>>>>>>>>>>>>>> | 100M packets | | |
>>>>>>>>>>>>>>> | vhost pinned to core 0 | | |
>>>>>>>>>>>>>>> | *4 threads* | | |
>>>>>>>>>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>>>>>>>>>>> | TAP | Transmitted | 71 Kpps | 79 Kpps |
>>>>>>>>>>>>>>> | + +-------------+-----------+----------------+
>>>>>>>>>>>>>>> | vhost-net | Lost | 1527 Kpps | 0 |
>>>>>>>>>>>>>>> +-----------+-------------+-----------+----------------+
>>>>>>>>>>>>>
>>>>>>>>>>>>> The PPS seems to be low. I'd suggest using testpmd (rxonly) mode in
>>>>>>>>>>>>> the guest or an xdp program that does XDP_DROP in the guest.
>>>>>>>>>>>>
>>>>>>>>>>>> I forgot to mention that these PPS values are per thread.
>>>>>>>>>>>> So overall we have 71 Kpps * 4 = 284 Kpps and 79 Kpps * 4 = 316 Kpps,
>>>>>>>>>>>> respectively. For packet loss, that comes out to 1527 Kpps * 4 =
>>>>>>>>>>>> 6108 Kpps and 0, respectively.
>>>>>>>>>>>>
>>>>>>>>>>>> Sorry about that!
>>>>>>>>>>>>
>>>>>>>>>>>> The pktgen benchmarks with a single thread look fine, right?
>>>>>>>>>>>
>>>>>>>>>>> Still looks very low. E.g. I just did a run of pktgen (using
>>>>>>>>>>> pktgen_sample03_burst_single_flow.sh) without XDP_DROP in the guest,
>>>>>>>>>>> and I can get 1 Mpps.
>>>>>>>>>>
>>>>>>>>>> Keep in mind that I am using an older CPU (i5-6300HQ). For the
>>>>>>>>>> single-threaded tests I always used pktgen_sample01_simple.sh, and for
>>>>>>>>>> the multi-threaded tests I always used pktgen_sample02_multiqueue.sh.
>>>>>>>>>>
>>>>>>>>>> Using pktgen_sample03_burst_single_flow.sh as you did fails for me (even
>>>>>>>>>> though the same parameters work fine for sample01 and sample02):
>>>>>>>>>>
>>>>>>>>>> samples/pktgen/pktgen_sample03_burst_single_flow.sh -i tap0 -m
>>>>>>>>>> 52:54:00:12:34:56 -d 10.0.0.2 -n 100000000
>>>>>>>>>> /samples/pktgen/functions.sh: line 79: echo: write error: Operation not
>>>>>>>>>> supported
>>>>>>>>>> ERROR: Write error(1) occurred
>>>>>>>>>> cmd: "burst 32 > /proc/net/pktgen/tap0@0"
>>>>>>>>>>
>>>>>>>>>> ...and I do not know what I am doing wrong, even after looking at
>>>>>>>>>> Documentation/networking/pktgen.rst. Every burst size except 1 fails.
>>>>>>>>>> Any clues?
>>>>>>>>>
>>>>>>>>> Please use -b 0. And I'm on an Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz.
>>>>>>>>
>>>>>>>> I tried using "-b 0", and while it worked, there was no noticeable
>>>>>>>> performance improvement.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Another thing I can think of is to disable
>>>>>>>>>
>>>>>>>>> 1) mitigations in both guest and host
>>>>>>>>> 2) any kernel debug features in both host and guest
>>>>>>>>
>>>>>>>> I also rebuilt the kernel with everything disabled under
>>>>>>>> "Kernel hacking", but that didn’t make any difference either.
>>>>>>>>
>>>>>>>> Because of this, I ran "pktgen_sample01_simple.sh" and
>>>>>>>> "pktgen_sample02_multiqueue.sh" on my AMD Ryzen 5 5600X system. The
>>>>>>>> results were about 374 Kpps with TAP and 1192 Kpps with TAP+vhost_net,
>>>>>>>> with very similar performance between the stock and patched kernels.
>>>>>>>>
>>>>>>>> Personally, I think the hardware is to blame for the low performance.
>>>>>>>
>>>>>>> Let's double confirm this by:
>>>>>>>
>>>>>>> 1) make sure pktgen is using 100% CPU
>>>>>>> 2) Perf doesn't show anything strange for pktgen thread
>>>>>>>
>>>>>>> Thanks
>>>>>>>
>>>>>>
>>>>>> I ran pktgen using pktgen_sample01_simple.sh and, in parallel, started a
>>>>>> 100-second perf stat measurement covering all kpktgend threads.
>>>>>>
>>>>>> Across all configurations, a single CPU was fully utilized.
>>>>>>
>>>>>> Apart from that, the patched variants show a higher branch frequency and
>>>>>> a slightly increased number of context switches.
>>>>>>
>>>>>>
>>>>>> The detailed results are provided below:
>>>>>>
>>>>>> Processor: Ryzen 5 5600X
>>>>>>
>>>>>> pktgen command:
>>>>>> sudo perf stat samples/pktgen/pktgen_sample01_simple.sh -i tap0 -m
>>>>>> 52:54:00:12:34:56 -d 10.0.0.2 -n 10000000000
>>>>>>
>>>>>> perf stat command:
>>>>>> sudo perf stat --timeout 100000 -p $(pgrep kpktgend | tr '\n' ,) -o X.txt
>>>>>>
>>>>>>
>>>>>> Results:
>>>>>> Stock TAP:
>>>>>> 46.997 context-switches # 467,2 cs/sec cs_per_second
>>>>>> 0 cpu-migrations # 0,0 migrations/sec migrations_per_second
>>>>>> 0 page-faults # 0,0 faults/sec page_faults_per_second
>>>>>> 100.587,69 msec task-clock # 1,0 CPUs CPUs_utilized
>>>>>> 8.491.586.483 branch-misses # 10,9 % branch_miss_rate (50,24%)
>>>>>> 77.734.761.406 branches # 772,8 M/sec branch_frequency (66,85%)
>>>>>> 382.420.291.585 cpu-cycles # 3,8 GHz cycles_frequency (66,85%)
>>>>>> 377.612.185.141 instructions # 1,0 instructions insn_per_cycle (66,85%)
>>>>>> 84.012.185.936 stalled-cycles-frontend # 0,22 frontend_cycles_idle (66,35%)
>>>>>>
>>>>>> 100,100414494 seconds time elapsed
>>>>>>
>>>>>>
>>>>>> Stock TAP+vhost-net:
>>>>>> 47.087 context-switches # 468,1 cs/sec cs_per_second
>>>>>> 0 cpu-migrations # 0,0 migrations/sec migrations_per_second
>>>>>> 0 page-faults # 0,0 faults/sec page_faults_per_second
>>>>>> 100.594,09 msec task-clock # 1,0 CPUs CPUs_utilized
>>>>>> 8.034.703.613 branch-misses # 11,1 % branch_miss_rate (50,24%)
>>>>>> 72.477.989.922 branches # 720,5 M/sec branch_frequency (66,86%)
>>>>>> 382.218.276.832 cpu-cycles # 3,8 GHz cycles_frequency (66,85%)
>>>>>> 349.555.577.281 instructions # 0,9 instructions insn_per_cycle (66,85%)
>>>>>> 83.917.644.262 stalled-cycles-frontend # 0,22 frontend_cycles_idle (66,35%)
>>>>>>
>>>>>> 100,100520402 seconds time elapsed
>>>>>>
>>>>>>
>>>>>> Patched TAP:
>>>>>> 47.862 context-switches # 475,8 cs/sec cs_per_second
>>>>>> 0 cpu-migrations # 0,0 migrations/sec migrations_per_second
>>>>>> 0 page-faults # 0,0 faults/sec page_faults_per_second
>>>>>> 100.589,30 msec task-clock # 1,0 CPUs CPUs_utilized
>>>>>> 9.337.258.794 branch-misses # 9,4 % branch_miss_rate (50,19%)
>>>>>> 99.518.421.676 branches # 989,4 M/sec branch_frequency (66,85%)
>>>>>> 382.508.244.894 cpu-cycles # 3,8 GHz cycles_frequency (66,85%)
>>>>>> 312.582.270.975 instructions # 0,8 instructions insn_per_cycle (66,85%)
>>>>>> 76.338.503.984 stalled-cycles-frontend # 0,20 frontend_cycles_idle (66,39%)
>>>>>>
>>>>>> 100,101262454 seconds time elapsed
>>>>>>
>>>>>>
>>>>>> Patched TAP+vhost-net:
>>>>>> 47.892 context-switches # 476,1 cs/sec cs_per_second
>>>>>> 0 cpu-migrations # 0,0 migrations/sec migrations_per_second
>>>>>> 0 page-faults # 0,0 faults/sec page_faults_per_second
>>>>>> 100.581,95 msec task-clock # 1,0 CPUs CPUs_utilized
>>>>>> 9.083.588.313 branch-misses # 10,1 % branch_miss_rate (50,28%)
>>>>>> 90.300.124.712 branches # 897,8 M/sec branch_frequency (66,85%)
>>>>>> 382.374.510.376 cpu-cycles # 3,8 GHz cycles_frequency (66,85%)
>>>>>> 340.089.181.199 instructions # 0,9 instructions insn_per_cycle (66,85%)
>>>>>> 78.151.408.955 stalled-cycles-frontend # 0,20 frontend_cycles_idle (66,31%)
>>>>>>
>>>>>> 100,101212911 seconds time elapsed
>>>>>
>>>>> Thanks for sharing. I have more questions:
>>>>>
>>>>> 1) The number of CPUs and vCPUs
>>>>
>>>> QEMU runs with a single core, and my host system is now a Ryzen 5 5600X
>>>> with 6 cores and 12 threads.
>>>> This is my command for TAP+vhost-net:
>>>>
>>>> sudo qemu-system-x86_64 -hda debian.qcow2
>>>> -netdev tap,id=mynet0,ifname=tap0,script=no,downscript=no,vhost=on
>>>> -device virtio-net-pci,netdev=mynet0 -m 1024 -enable-kvm
>>>>
>>>> For TAP only it is the same but without vhost=on.
>>>>
>>>>> 2) If you pin vhost or vCPU threads
>>>>
>>>> Not in the previously shown benchmark. I pinned vhost in other
>>>> benchmarks, but since there is only a minor PPS difference I omitted
>>>> them for the sake of simplicity.
>>>>
>>>>> 3) What does perf top look like, or perf top -p $pid_of_vhost?
>>>>
>>>> The perf reports for the pid_of_vhost from pktgen_sample01_simple.sh
>>>> with TAP+vhost-net (not pinned, pktgen single queue, fq_codel) are shown
>>>> below. I cannot see a huge difference between stock and patched.
>>>>
>>>> Also I included perf reports from the pktgen_pids. I find them more
>>>> interesting because tun_net_xmit shows less overhead for the patched
>>>> kernel. I assume that is due to the stopped netdev queue.
>>>>
>>>> I have now benchmarked pretty much all possible combinations (with a
>>>> script) of TAP/TAP+vhost-net, single/multi-queue pktgen, vhost
>>>> pinned/not pinned, with/without -b 0, fq_codel/noqueue... all of that
>>>> with perf records.
>>>> I could share them if you want, but I feel this is getting out of hand.
>>>>
>>>>
>>>> Stock:
>>>> sudo perf record -p "$vhost_pid"
>>>> ...
>>>> # Overhead Command Shared Object Symbol
>>>> # ........ ............... .......................... ..........................................
>>>> #
>>>> 5.97% vhost-4874 [kernel.kallsyms] [k] _copy_to_iter
>>>> 2.68% vhost-4874 [kernel.kallsyms] [k] tun_do_read
>>>> 2.23% vhost-4874 [kernel.kallsyms] [k] native_write_msr
>>>> 1.93% vhost-4874 [kernel.kallsyms] [k] __check_object_size
>>>
>>> Let's disable CONFIG_HARDENED_USERCOPY and retry.
>>>
>>>> 1.61% vhost-4874 [kernel.kallsyms] [k] __slab_free.isra.0
>>>> 1.56% vhost-4874 [kernel.kallsyms] [k] __get_user_nocheck_2
>>>> 1.54% vhost-4874 [kernel.kallsyms] [k] iov_iter_zero
>>>> 1.45% vhost-4874 [kernel.kallsyms] [k] kmem_cache_free
>>>> 1.43% vhost-4874 [kernel.kallsyms] [k] tun_recvmsg
>>>> 1.24% vhost-4874 [kernel.kallsyms] [k] sk_skb_reason_drop
>>>> 1.12% vhost-4874 [kernel.kallsyms] [k] srso_alias_safe_ret
>>>> 1.07% vhost-4874 [kernel.kallsyms] [k] native_read_msr
>>>> 0.76% vhost-4874 [kernel.kallsyms] [k] simple_copy_to_iter
>>>> 0.75% vhost-4874 [kernel.kallsyms] [k] srso_alias_return_thunk
>>>> 0.69% vhost-4874 [vhost] [k] 0x0000000000002e70
>>>> 0.59% vhost-4874 [kernel.kallsyms] [k] skb_release_data
>>>> 0.59% vhost-4874 [kernel.kallsyms] [k] __skb_datagram_iter
>>>> 0.53% vhost-4874 [vhost] [k] 0x0000000000002e5f
>>>> 0.51% vhost-4874 [kernel.kallsyms] [k] slab_update_freelist.isra.0
>>>> 0.46% vhost-4874 [kernel.kallsyms] [k] kfree_skbmem
>>>> 0.44% vhost-4874 [kernel.kallsyms] [k] skb_copy_datagram_iter
>>>> 0.43% vhost-4874 [kernel.kallsyms] [k] skb_free_head
>>>> 0.37% qemu-system-x86 [unknown] [k] 0xffffffffba898b1b
>>>> 0.35% vhost-4874 [vhost] [k] 0x0000000000002e6b
>>>> 0.33% vhost-4874 [vhost_net] [k] 0x000000000000357d
>>>> 0.28% vhost-4874 [kernel.kallsyms] [k] __check_heap_object
>>>> 0.27% vhost-4874 [vhost_net] [k] 0x00000000000035f3
>>>> 0.26% vhost-4874 [vhost_net] [k] 0x00000000000030f6
>>>> 0.26% vhost-4874 [kernel.kallsyms] [k] __virt_addr_valid
>>>> 0.24% vhost-4874 [kernel.kallsyms] [k] iov_iter_advance
>>>> 0.22% vhost-4874 [kernel.kallsyms] [k] perf_event_update_userpage
>>>> 0.22% vhost-4874 [kernel.kallsyms] [k] check_stack_object
>>>> 0.19% qemu-system-x86 [unknown] [k] 0xffffffffba2a68cd
>>>> 0.19% vhost-4874 [kernel.kallsyms] [k] dequeue_entities
>>>> 0.19% vhost-4874 [vhost_net] [k] 0x0000000000003237
>>>> 0.18% vhost-4874 [vhost_net] [k] 0x0000000000003550
>>>> 0.18% vhost-4874 [kernel.kallsyms] [k] x86_pmu_del
>>>> 0.18% vhost-4874 [vhost_net] [k] 0x00000000000034a0
>>>> 0.17% vhost-4874 [kernel.kallsyms] [k] x86_pmu_disable_all
>>>> 0.16% vhost-4874 [vhost_net] [k] 0x0000000000003523
>>>> 0.16% vhost-4874 [kernel.kallsyms] [k] amd_pmu_addr_offset
>>>> ...
>>>>
>>>>
>>>> sudo perf record -p "$kpktgend_pids":
>>>> ...
>>>> # Overhead Command Shared Object Symbol
>>>> # ........ ........... ................. ...............................................
>>>> #
>>>> 10.98% kpktgend_0 [kernel.kallsyms] [k] tun_net_xmit
>>>> 10.45% kpktgend_0 [kernel.kallsyms] [k] memset
>>>> 8.40% kpktgend_0 [kernel.kallsyms] [k] __alloc_skb
>>>> 6.31% kpktgend_0 [kernel.kallsyms] [k] kmem_cache_alloc_node_noprof
>>>> 3.13% kpktgend_0 [kernel.kallsyms] [k] srso_alias_safe_ret
>>>> 2.40% kpktgend_0 [kernel.kallsyms] [k] sk_skb_reason_drop
>>>> 2.11% kpktgend_0 [kernel.kallsyms] [k] srso_alias_return_thunk
>>>
>>> This is a hint that the SRSO mitigation is enabled.
>>>
>>> Have you disabled CPU_MITIGATIONS via either Kconfig or the kernel
>>> command line (mitigations=off) for both host and guest?
>>>
>>> Thanks
>>>
>>
>> Both of the changes you suggested really boosted the performance,
>> especially for TAP.
>
> Good to know that.
>
>>
>> I disabled SRSO mitigation with spec_rstack_overflow=off and went from
>> "Mitigation: Safe RET" to "Vulnerable" on the host. The VM showed "Not
>> affected" but I applied spec_rstack_overflow=off anyway.
>
> I think we need to find the root cause of the regression.
>
>>
>> Here are some new benchmarks for pktgen_sample01_simple.sh:
>> (I also have others available and can share them if you want.)
>>
>
> It's a little hard to compare the diff, maybe you can do perf diff.
I ran perf diff on the pktgen perf records for TAP and TAP+vhost-net
(both single queue, no CPU pinning).

With the help of that perf diff (results below) I found that functions
related to waking the queue (e.g. __local_bh_enable_ip) cost quite a
bit of performance. Because my patch wakes as soon as
__ptr_ring_consume_created_space() reports freed space, I suspected
that this leads to very frequent stop -> wake -> stop -> wake cycles.

Therefore, I also compiled a variant that wakes on __ptr_ring_empty()
instead, so that netif_tx_wake_queue() is invoked less frequently.
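
For reference, the wake-on-empty variant is tun_ring_consume() from
this patch with only the wake condition swapped (untested sketch, the
tap side is analogous):

static void *tun_ring_consume(struct tun_file *tfile)
{
        struct ptr_ring *ring = &tfile->tx_ring;
        struct net_device *dev;
        void *ptr;

        spin_lock(&ring->consumer_lock);

        ptr = __ptr_ring_consume(ring);
        /* Wake only once the ring has fully drained instead of on the
         * first freed slot.
         */
        if (unlikely(ptr && __ptr_ring_empty(ring))) {
                rcu_read_lock();
                dev = rcu_dereference(tfile->tun)->dev;
                netif_wake_subqueue(dev, tfile->queue_index);
                rcu_read_unlock();
        }

        spin_unlock(&ring->consumer_lock);

        return ptr;
}
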
The pktgen results:
+-------------------------+-----------+-----------+---------------+
| pktgen benchmarks to | Stock | Patched | Wake on |
| Debian VM, R5 5600X, | | | empty Variant |
| 100M packets | | | |
| CPU not pinned | | | |
+-----------+-------------+-----------+-----------+---------------+
| TAP | Transmitted | 1293 Kpps | 989 Kpps | 1248 Kpps |
| +-------------+-----------+-----------+---------------+
| | Lost | 3918 Kpps | 0 | 0 |
+-----------+-------------+-----------+-----------+---------------+
| TAP | Transmitted | 1411 Kpps | 1410 Kpps | 1379 Kpps |
| + +-------------+-----------+-----------+---------------+
| vhost-net | Lost | 3659 Kpps | 0 | 0 |
+-----------+-------------+-----------+-----------+---------------+
My conclusions are:

Patched: Waking on __ptr_ring_consume_created_space() is too early. The
         stop/wake cycle occurs too frequently, which slows down
         performance, as can be seen for TAP.

Wake on empty variant: Waking on __ptr_ring_empty() is (slightly) too
                       late. The consumer starves because the producer
                       first has to produce packets again. This slows
                       down performance as well, as can be seen for TAP
                       and TAP+vhost-net (both down ~30-40 Kpps).

I think something in between should be used: the wake should be done as
late as possible to keep the number of NET_TX_SOFTIRQs low, but early
enough that there are still consumable packets remaining so the
consumer does not starve.

However, I cannot think of a proper way to implement this right now;
the best I can offer is the rough sketch below.
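
This is untested and only meant to illustrate the idea: it reuses
__ptr_ring_consume_created_space() from the earlier patch in this
series and adds a hypothetical per-queue counter (freed_slots); the
half-ring threshold is arbitrary and would need benchmarking:

static void *tun_ring_consume(struct tun_file *tfile)
{
        struct ptr_ring *ring = &tfile->tx_ring;
        struct net_device *dev;
        void *ptr;

        spin_lock(&ring->consumer_lock);

        ptr = __ptr_ring_consume(ring);
        if (ptr && __ptr_ring_consume_created_space(ring, 1))
                tfile->freed_slots++; /* hypothetical per-queue counter */

        /* Wake once enough slots have been freed, or at the latest when
         * the ring has drained, so the producer is woken neither on
         * every freed slot nor only after the consumer has run dry.
         */
        if (unlikely(ptr && (tfile->freed_slots >= ring->size / 2 ||
                             __ptr_ring_empty(ring)))) {
                tfile->freed_slots = 0;
                rcu_read_lock();
                dev = rcu_dereference(tfile->tun)->dev;
                netif_wake_subqueue(dev, tfile->queue_index);
                rcu_read_unlock();
        }

        spin_unlock(&ring->consumer_lock);

        return ptr;
}

The wake-on-empty fallback still pays the softirq cost whenever the
ring fully drains, so I am not sure about the trade-off.
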
Thanks!
=========================================================================
TAP:
# Event 'cpu/cycles/P'
#
# Data files:
# [0] stock_pktgen.data (Baseline)
# [1] patched_pktgen.data
# [2] wake_on_empty_variant_pktgen.data
#
# Baseline/0 Delta Abs/1 Delta Abs/2 Shared Object Symbol
# .......... ........... ........... ................. ........................................................
#
24.49% +43.46% +47.09% [pktgen] [k] 0x0000000000000e30
22.27% -17.03% -16.76% [kernel.kallsyms] [k] memset
10.59% -7.72% -8.06% [kernel.kallsyms] [k] __alloc_skb
7.50% -5.34% -6.00% [kernel.kallsyms] [k] kmem_cache_alloc_node_noprof
1.20% +4.08% +2.82% [kernel.kallsyms] [k] __local_bh_enable_ip
5.76% -4.04% -4.07% [kernel.kallsyms] [k] tun_net_xmit
3.15% -2.23% -2.37% [kernel.kallsyms] [k] chacha_permute
0.22% +1.87% +1.41% [kernel.kallsyms] [k] kthread_should_stop
2.36% -1.55% -1.57% [kernel.kallsyms] [k] get_random_u32
2.19% -1.51% -1.74% [kernel.kallsyms] [k] skb_put
0.18% +1.33% +1.04% [kernel.kallsyms] [k] __cond_resched
0.68% +1.32% +0.87% [kernel.kallsyms] [k] __rcu_read_unlock
0.49% +1.17% +0.85% [kernel.kallsyms] [k] __rcu_read_lock
1.40% -1.15% -1.24% [kernel.kallsyms] [k] sk_skb_reason_drop
1.34% -1.12% -1.16% [kernel.kallsyms] [k] ip_send_check
1.10% -0.80% -0.85% [kernel.kallsyms] [k] _raw_spin_lock
1.04% -0.71% -0.81% [kernel.kallsyms] [k] kmalloc_reserve
0.86% -0.51% -0.66% [kernel.kallsyms] [k] __netdev_alloc_skb
0.62% -0.41% -0.46% [kernel.kallsyms] [k] __get_random_u32_below
0.50% -0.34% -0.38% [kernel.kallsyms] [k] _get_random_bytes
0.37% -0.26% -0.28% [kernel.kallsyms] [k] crng_fast_key_erasure
0.33% -0.19% -0.25% [kernel.kallsyms] [k] chacha_block_generic
0.24% -0.15% -0.18% [kernel.kallsyms] [k] skb_clone_tx_timestamp
0.31% -0.13% -0.23% [kernel.kallsyms] [k] skb_push
0.30% -0.11% -0.22% [kernel.kallsyms] [k] _raw_spin_unlock
0.25% -0.10% -0.14% [kernel.kallsyms] [k] __x86_indirect_thunk_array
0.56% +0.08% +0.15% [kernel.kallsyms] [k] sock_def_readable
0.12% -0.08% -0.09% [kernel.kallsyms] [k] memcpy
0.25% +0.06% +0.01% [kernel.kallsyms] [k] ___slab_alloc
0.31% +0.05% -0.03% [kernel.kallsyms] [k] native_write_msr
0.01% +0.05% +0.96% [kernel.kallsyms] [k] clear_page_erms
0.07% +0.03% -0.03% [kernel.kallsyms] [k] get_partial_node
0.08% -0.03% -0.05% [kernel.kallsyms] [k] crng_make_state
0.02% +0.02% -0.01% [kernel.kallsyms] [k] _raw_spin_lock_irqsave
0.02% -0.02% -0.01% [kernel.kallsyms] [k] read_tsc
0.02% +0.02% +0.00% [kernel.kallsyms] [k] __slab_alloc.isra.0
0.00% +0.02% +0.10% [kernel.kallsyms] [k] get_page_from_freelist
0.08% +0.01% -0.06% [kernel.kallsyms] [k] put_cpu_partial
0.00% +0.01% +0.25% [kernel.kallsyms] [k] allocate_slab
0.01% +0.01% -0.00% [kernel.kallsyms] [k] x86_schedule_events
0.01% +0.01% -0.01% [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore
0.00% +0.01% +0.00% [kernel.kallsyms] [k] perf_assign_events
0.00% +0.01% -0.00% [amdgpu] [k] 0x00000000000662f4
0.02% +0.01% +0.00% [kernel.kallsyms] [k] amd_pmu_addr_offset
=========================================================================
TAP+vhost-net:
# Event 'cpu/cycles/P'
#
# Data files:
# [0] stock_pktgen.data (Baseline)
# [1] patched_pktgen.data
# [2] wake_on_empty_variant_pktgen.data
#
# Baseline/0 Delta Abs/1 Delta Abs/2 Shared Object Symbol
# .......... ........... ........... ................. ..............................................
#
24.35% +47.04% +45.59% [pktgen] [k] 0x0000000000000e30
22.06% -16.02% -16.19% [kernel.kallsyms] [k] memset
10.72% -7.84% -7.84% [kernel.kallsyms] [k] __alloc_skb
7.59% -5.82% -5.79% [kernel.kallsyms] [k] kmem_cache_alloc_node_noprof
5.69% -3.98% -4.08% [kernel.kallsyms] [k] tun_net_xmit
1.22% +2.74% +2.65% [kernel.kallsyms] [k] __local_bh_enable_ip
3.18% -2.33% -2.30% [kernel.kallsyms] [k] chacha_permute
2.47% -1.78% -1.49% [kernel.kallsyms] [k] get_random_u32
2.16% -1.58% -1.66% [kernel.kallsyms] [k] skb_put
0.24% +1.36% +1.33% [kernel.kallsyms] [k] kthread_should_stop
1.47% -1.28% -1.29% [kernel.kallsyms] [k] sk_skb_reason_drop
0.18% +1.05% +1.01% [kernel.kallsyms] [k] __cond_resched
0.69% +0.88% +0.84% [kernel.kallsyms] [k] __rcu_read_unlock
1.23% -0.87% -1.04% [kernel.kallsyms] [k] ip_send_check
0.52% +0.83% +0.84% [kernel.kallsyms] [k] __rcu_read_lock
1.09% -0.80% -0.78% [kernel.kallsyms] [k] _raw_spin_lock
1.03% -0.73% -0.75% [kernel.kallsyms] [k] kmalloc_reserve
0.83% -0.61% -0.58% [kernel.kallsyms] [k] __netdev_alloc_skb
0.63% -0.47% -0.45% [kernel.kallsyms] [k] __get_random_u32_below
0.47% -0.34% -0.33% [kernel.kallsyms] [k] _get_random_bytes
0.36% -0.26% -0.25% [kernel.kallsyms] [k] crng_fast_key_erasure
0.34% -0.25% -0.24% [kernel.kallsyms] [k] chacha_block_generic
0.32% -0.22% -0.23% [kernel.kallsyms] [k] skb_push
0.31% -0.21% -0.22% [kernel.kallsyms] [k] _raw_spin_unlock
0.25% -0.19% -0.18% [kernel.kallsyms] [k] skb_clone_tx_timestamp
0.28% -0.15% -0.16% [kernel.kallsyms] [k] __x86_indirect_thunk_array
0.11% -0.09% -0.08% [kernel.kallsyms] [k] memcpy
0.10% -0.08% -0.08% [kernel.kallsyms] [k] crng_make_state
0.29% -0.03% -0.02% [kernel.kallsyms] [k] native_write_msr
0.13% -0.03% -0.01% [kernel.kallsyms] [k] native_read_msr
0.66% +0.02% +0.09% [kernel.kallsyms] [k] sock_def_readable
0.27% +0.01% +0.03% [kernel.kallsyms] [k] ___slab_alloc
0.03% -0.01% -0.01% [kernel.kallsyms] [k] __slab_alloc.isra.0
0.03% -0.01% -0.00% [kernel.kallsyms] [k] amd_pmu_addr_offset
0.08% -0.01% -0.03% [kernel.kallsyms] [k] get_partial_node
0.09% -0.01% +1.02% [kernel.kallsyms] [k] clear_page_erms
0.00% +0.00% +0.00% [kernel.kallsyms] [k] x86_schedule_events
0.01% +0.00% +0.01% [kernel.kallsyms] [k] x86_pmu_add
0.01% +0.00% +0.10% [kernel.kallsyms] [k] get_page_from_freelist
0.01% -0.00% +0.00% [kernel.kallsyms] [k] x86_pmu_del
0.01% -0.00% -0.00% [kernel.kallsyms] [k] x86_pmu_disable_all
0.00% +0.00% -0.00% [kernel.kallsyms] [k] perf_assign_events
0.01% -0.00% -0.00% [kernel.kallsyms] [k] group_sched_out
0.01% -0.00% -0.00% [kernel.kallsyms] [k] read_tsc
>
> (Btw, it's close to the Spring Festival public holiday in China, so
> the replies might be slow.)
Good to know, I won't ping you then.
>
> Thanks
>