On Thu, May 07, 2026 at 06:48:47PM -0400, Michael S. Tsirkin wrote:
> On Thu, May 07, 2026 at 02:59:13PM +0200, Stefano Garzarella wrote:
> > On Thu, May 07, 2026 at 07:45:10AM -0400, Michael S. Tsirkin wrote:
> > > On Thu, May 07, 2026 at 11:09:47AM +0200, Stefano Garzarella wrote:

[...]

> > > > For now, we're already doing something:
> > > > merging the skbuffs if they don't have EOM set.
> > >
> > > Right, that's good. You could go further and merge with EOM too
> > > if you stick the info about message boundaries somewhere else.

> > This adds a lot of complexity IMO, but we can try.
> >
> > Do you have something in mind?

> BER is clearly overkill, but here's a POC that Claude made for me,
> just to give you an idea. It clearly has a ton of issues;
> for example, I dislike how GFP_ATOMIC is handled.

Okay, I somewhat understand, but clearly this isn't net material, so for
now I think the best thing to do is to merge the fixup I sent (or
something similar):
https://lore.kernel.org/netdev/[email protected]/

This is a major change that should be merged with more caution.
Could it have too much of an impact on performance?

Thanks,
Stefano

> Yet it seems to work fine in light testing.
>
> -->


vsock/virtio: use DWARF ULEB128 to record EOM boundaries, enable cross-EOM skb coalescing

virtio_transport_recv_enqueue() currently refuses to coalesce an
incoming skb with the previous one when the previous skb carries
VIRTIO_VSOCK_SEQ_EOM.  This forces one skb per seqpacket message.
For workloads with many small or zero-byte messages, the per-skb
overhead (~960 bytes) dominates, causing unbounded memory growth.

Decouple message boundary tracking from the skb structure: store
boundary offsets in a compact side buffer using DWARF ULEB128
encoding with the EOR flag folded into the low bit, then allow
the data of multiple complete messages to be coalesced into a single
skb.
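
For example (with the (msg_len << 1) | eor layout above), a 5-byte
message with EOR encodes as the value 11, i.e. the single byte 0x0b,
while a 300-byte message with EOR encodes as 601, i.e. the two bytes
0xd9 0x04.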

Cross-EOM coalescing fires only when:
- both the tail skb and the incoming packet carry EOM (complete msgs)
- the incoming packet fits in the tail skb's tailroom
- no BPF psock is attached (read_skb expects one msg per skb)

On allocation failure the code falls back to separate skbs (existing
behaviour).  Credit accounting is unchanged; the boundary buffer is
capped at PAGE_SIZE.

Signed-off-by: Michael S. Tsirkin <[email protected]>
Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]>
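
Not part of the patch: a minimal userspace sketch of the same encoding,
in case anyone wants to sanity-check the round-trip. The helpers below
are standalone reimplementations that mirror the kernel ones in the
patch; the names and test values are made up for illustration.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Encode (msg_len << 1) | eor as ULEB128; returns bytes written (max 5). */
static int uleb_encode(uint8_t *buf, uint32_t msg_len, bool eor)
{
	uint64_t val = ((uint64_t)msg_len << 1) | eor;
	int n = 0;

	do {
		buf[n] = val & 0x7f;
		val >>= 7;
		if (val)
			buf[n] |= 0x80;
		n++;
	} while (val);

	return n;
}

/* Decode one boundary entry; returns bytes consumed or -1 on bad input. */
static int uleb_decode(const uint8_t *buf, uint32_t avail,
		       uint32_t *msg_len, bool *eor)
{
	uint64_t val = 0;
	int shift = 0, n = 0;

	do {
		if ((uint32_t)n >= avail || shift >= 35)
			return -1;
		val |= (uint64_t)(buf[n] & 0x7f) << shift;
		shift += 7;
	} while (buf[n++] & 0x80);

	*eor = val & 1;
	*msg_len = val >> 1;
	return n;
}

int main(void)
{
	uint32_t lens[] = { 0, 5, 300, 0xffffffffu };
	uint8_t buf[8];

	for (int i = 0; i < 4; i++) {
		for (int e = 0; e < 2; e++) {
			uint32_t out;
			bool eor;
			int n = uleb_encode(buf, lens[i], e);

			assert(uleb_decode(buf, n, &out, &eor) == n);
			assert(out == lens[i] && eor == (bool)e);
			printf("len %u eor %d -> %d byte(s)\n",
			       (unsigned)lens[i], e, n);
		}
	}
	return 0;
}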

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index f91704731057..e36b9ab28372 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -12,6 +12,7 @@
struct virtio_vsock_skb_cb {
        bool reply;
        bool tap_delivered;
+       bool has_boundary_entries;
        u32 offset;
};

@@ -167,6 +168,12 @@ struct virtio_vsock_sock {
        u32 buf_used;
        struct sk_buff_head rx_queue;
        u32 msg_count;
+
+       /* ULEB128-encoded seqpacket message boundary buffer */
+       u8 *boundary_buf;
+       u32 boundary_len;
+       u32 boundary_alloc;
+       u32 boundary_off;
};

struct virtio_vsock_pkt_info {
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 416d533f493d..81654f70f72c 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -11,6 +11,7 @@
#include <linux/sched/signal.h>
#include <linux/ctype.h>
#include <linux/list.h>
+#include <linux/skmsg.h>
#include <linux/virtio_vsock.h>
#include <uapi/linux/vsockmon.h>

@@ -26,6 +27,91 @@
/* Threshold for detecting small packets to copy */
#define GOOD_COPY_LEN  128

+#define VSOCK_BOUNDARY_BUF_INIT        64
+#define VSOCK_BOUNDARY_BUF_MAX PAGE_SIZE
+
+/* ULEB128 boundary encoding: value = (msg_len << 1) | eor.
+ * Each byte carries 7 data bits; bit 7 is set on all but the last byte.
+ * Max 5 bytes for a u32 msg_len (33 bits with eor shift).
+ */
+static int vsock_uleb_encode_boundary(u8 *buf, u32 msg_len, bool eor)
+{
+       u64 val = ((u64)msg_len << 1) | eor;
+       int n = 0;
+
+       do {
+               buf[n] = val & 0x7f;
+               val >>= 7;
+               if (val)
+                       buf[n] |= 0x80;
+               n++;
+       } while (val);
+
+       return n;
+}
+
+static int vsock_uleb_decode_boundary(const u8 *buf, u32 avail,
+                                     u32 *msg_len, bool *eor)
+{
+       u64 val = 0;
+       int shift = 0;
+       int n = 0;
+
+       do {
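+       /* One byte per iteration; bail on truncated input or overflow. */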
+               if (n >= avail || shift >= 35)
+                       return -EINVAL;
+               val |= (u64)(buf[n] & 0x7f) << shift;
+               shift += 7;
+       } while (buf[n++] & 0x80);
+
+       *eor = val & 1;
+       *msg_len = val >> 1;
+       return n;
+}
+
+static void vsock_boundary_buf_compact(struct virtio_vsock_sock *vvs)
+{
+       if (vvs->boundary_off == 0)
+               return;
+
+       vvs->boundary_len -= vvs->boundary_off;
+       memmove(vvs->boundary_buf, vvs->boundary_buf + vvs->boundary_off,
+               vvs->boundary_len);
+       vvs->boundary_off = 0;
+}
+
+static int vsock_boundary_buf_ensure(struct virtio_vsock_sock *vvs, u32 needed)
+{
+       u32 new_alloc;
+       u8 *new_buf;
+
+       if (vvs->boundary_alloc >= needed)
+               return 0;
+
+       /* Reclaim consumed space before growing */
+       if (vvs->boundary_off) {
+               needed -= vvs->boundary_off;
+               vsock_boundary_buf_compact(vvs);
+               if (vvs->boundary_alloc >= needed)
+                       return 0;
+       }
+
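+       /* Grow geometrically, capped at VSOCK_BOUNDARY_BUF_MAX. */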
+       new_alloc = max(needed, vvs->boundary_alloc ? vvs->boundary_alloc * 2
+                                                   : VSOCK_BOUNDARY_BUF_INIT);
+       if (new_alloc > VSOCK_BOUNDARY_BUF_MAX)
+               new_alloc = VSOCK_BOUNDARY_BUF_MAX;
+       if (new_alloc < needed)
+               return -ENOMEM;
+
+       new_buf = krealloc(vvs->boundary_buf, new_alloc, GFP_ATOMIC);
+       if (!new_buf)
+               return -ENOMEM;
+
+       vvs->boundary_buf = new_buf;
+       vvs->boundary_alloc = new_alloc;
+       return 0;
+}
+
static void virtio_transport_cancel_close_work(struct vsock_sock *vsk,
                                               bool cancel_timeout);
static s64 virtio_transport_has_space(struct virtio_vsock_sock *vvs);
@@ -682,41 +768,74 @@ virtio_transport_seqpacket_do_peek(struct vsock_sock *vsk,
        total = 0;
        len = msg_data_left(msg);

-       skb_queue_walk(&vvs->rx_queue, skb) {
-               struct virtio_vsock_hdr *hdr;
+       skb = skb_peek(&vvs->rx_queue);
+       if (skb && VIRTIO_VSOCK_SKB_CB(skb)->has_boundary_entries) {
+               u32 msg_len, offset;
+               size_t bytes;
+               bool eor;
+               int ret;

-               if (total < len) {
-                       size_t bytes;
+               ret = vsock_uleb_decode_boundary(
+                       vvs->boundary_buf + vvs->boundary_off,
+                       vvs->boundary_len - vvs->boundary_off,
+                       &msg_len, &eor);
+               if (ret < 0)
+                       goto unlock;
+
+               offset = VIRTIO_VSOCK_SKB_CB(skb)->offset;
+               bytes = min(len, (size_t)msg_len);
+
+               if (bytes) {
                        int err;

-                       bytes = len - total;
-                       if (bytes > skb->len)
-                               bytes = skb->len;
-
                        spin_unlock_bh(&vvs->rx_lock);
-
-                       /* sk_lock is held by caller so no one else can dequeue.
-                        * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
-                        */
-                       err = skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset,
+                       err = skb_copy_datagram_iter(skb, offset,
                                                     &msg->msg_iter, bytes);
                        if (err)
                                return err;
-
                        spin_lock_bh(&vvs->rx_lock);
                }

-               total += skb->len;
-               hdr = virtio_vsock_hdr(skb);
+               total = msg_len;
+               if (eor)
+                       msg->msg_flags |= MSG_EOR;
+       } else {
+               skb_queue_walk(&vvs->rx_queue, skb) {
+                       struct virtio_vsock_hdr *hdr;

-               if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
-                       if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR)
-                               msg->msg_flags |= MSG_EOR;
+                       if (total < len) {
+                               size_t bytes;
+                               int err;

-                       break;
+                               bytes = len - total;
+                               if (bytes > skb->len)
+                                       bytes = skb->len;
+
+                               spin_unlock_bh(&vvs->rx_lock);
+
+                               err = skb_copy_datagram_iter(
+                                       skb,
+                                       VIRTIO_VSOCK_SKB_CB(skb)->offset,
+                                       &msg->msg_iter, bytes);
+                               if (err)
+                                       return err;
+
+                               spin_lock_bh(&vvs->rx_lock);
+                       }
+
+                       total += skb->len;
+                       hdr = virtio_vsock_hdr(skb);
+
+                       if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
+                               if (le32_to_cpu(hdr->flags) &
+                                   VIRTIO_VSOCK_SEQ_EOR)
+                                       msg->msg_flags |= MSG_EOR;
+                               break;
+                       }
                }
        }

+unlock:
        spin_unlock_bh(&vvs->rx_lock);

        return total;
@@ -740,57 +859,105 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
        }

        while (!msg_ready) {
-               struct virtio_vsock_hdr *hdr;
-               size_t pkt_len;
-
-               skb = __skb_dequeue(&vvs->rx_queue);
+               skb = skb_peek(&vvs->rx_queue);
                if (!skb)
                        break;
-               hdr = virtio_vsock_hdr(skb);
-               pkt_len = (size_t)le32_to_cpu(hdr->len);

-               if (dequeued_len >= 0) {
+               if (VIRTIO_VSOCK_SKB_CB(skb)->has_boundary_entries) {
                        size_t bytes_to_copy;
+                       u32 msg_len, offset;
+                       bool eor;
+                       int ret;

-                       bytes_to_copy = min(user_buf_len, pkt_len);
+                       ret = vsock_uleb_decode_boundary(
+                               vvs->boundary_buf + vvs->boundary_off,
+                               vvs->boundary_len - vvs->boundary_off,
+                               &msg_len, &eor);
+                       if (ret < 0)
+                               break;
+                       vvs->boundary_off += ret;

-                       if (bytes_to_copy) {
+                       offset = VIRTIO_VSOCK_SKB_CB(skb)->offset;
+                       bytes_to_copy = min(user_buf_len, (size_t)msg_len);
+
+                       if (bytes_to_copy && dequeued_len >= 0) {
                                int err;

-                               /* sk_lock is held by caller so no one else can dequeue.
-                                * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
-                                */
                                spin_unlock_bh(&vvs->rx_lock);
-
-                               err = skb_copy_datagram_iter(skb, 0,
+                               err = skb_copy_datagram_iter(skb, offset,
                                                             &msg->msg_iter,
                                                             bytes_to_copy);
-                               if (err) {
-                                       /* Copy of message failed. Rest of
-                                        * fragments will be freed without copy.
-                                        */
-                                       dequeued_len = err;
-                               } else {
-                                       user_buf_len -= bytes_to_copy;
-                               }
-
                                spin_lock_bh(&vvs->rx_lock);
+                               if (err)
+                                       dequeued_len = err;
+                               else
+                                       user_buf_len -= bytes_to_copy;
                        }

                        if (dequeued_len >= 0)
-                               dequeued_len += pkt_len;
-               }
+                               dequeued_len += msg_len;

-               if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
+                       VIRTIO_VSOCK_SKB_CB(skb)->offset += msg_len;
                        msg_ready = true;
                        vvs->msg_count--;

-                       if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR)
+                       if (eor)
                                msg->msg_flags |= MSG_EOR;
-               }

-               virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len);
-               kfree_skb(skb);
+                       virtio_transport_dec_rx_pkt(vvs, msg_len, msg_len);
+
+                       if (VIRTIO_VSOCK_SKB_CB(skb)->offset >= skb->len) {
+                               __skb_unlink(skb, &vvs->rx_queue);
+                               kfree_skb(skb);
+                       }
+
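+                       /* Reclaim space once half the boundary buffer is consumed */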
+                       if (vvs->boundary_off >= vvs->boundary_len / 2)
+                               vsock_boundary_buf_compact(vvs);
+               } else {
+                       struct virtio_vsock_hdr *hdr;
+                       size_t pkt_len;
+
+                       skb = __skb_dequeue(&vvs->rx_queue);
+                       if (!skb)
+                               break;
+                       hdr = virtio_vsock_hdr(skb);
+                       pkt_len = (size_t)le32_to_cpu(hdr->len);
+
+                       if (dequeued_len >= 0) {
+                               size_t bytes_to_copy;
+
+                               bytes_to_copy = min(user_buf_len, pkt_len);
+
+                               if (bytes_to_copy) {
+                                       int err;
+
+                                       spin_unlock_bh(&vvs->rx_lock);
+                                       err = skb_copy_datagram_iter(
+                                               skb, 0, &msg->msg_iter,
+                                               bytes_to_copy);
+                                       if (err)
+                                               dequeued_len = err;
+                                       else
+                                               user_buf_len -= bytes_to_copy;
+                                       spin_lock_bh(&vvs->rx_lock);
+                               }
+
+                               if (dequeued_len >= 0)
+                                       dequeued_len += pkt_len;
+                       }
+
+                       if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
+                               msg_ready = true;
+                               vvs->msg_count--;
+
+                               if (le32_to_cpu(hdr->flags) &
+                                   VIRTIO_VSOCK_SEQ_EOR)
+                                       msg->msg_flags |= MSG_EOR;
+                       }
+
+                       virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len);
+                       kfree_skb(skb);
+               }
        }

        spin_unlock_bh(&vvs->rx_lock);
@@ -1132,6 +1299,7 @@ void virtio_transport_destruct(struct vsock_sock *vsk)

        virtio_transport_cancel_close_work(vsk, true);

+       kfree(vvs->boundary_buf);
        kfree(vvs);
        vsk->trans = NULL;
}
@@ -1224,6 +1392,11 @@ static void virtio_transport_remove_sock(struct vsock_sock *vsk)
         * removing it.
         */
        __skb_queue_purge(&vvs->rx_queue);
+       kfree(vvs->boundary_buf);
+       vvs->boundary_buf = NULL;
+       vvs->boundary_len = 0;
+       vvs->boundary_alloc = 0;
+       vvs->boundary_off = 0;
        vsock_remove_sock(vsk);
}

@@ -1395,23 +1568,62 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk,
            !skb_is_nonlinear(skb)) {
                struct virtio_vsock_hdr *last_hdr;
                struct sk_buff *last_skb;
+               bool last_has_eom;
+               bool has_eom;

                last_skb = skb_peek_tail(&vvs->rx_queue);
                last_hdr = virtio_vsock_hdr(last_skb);
+               last_has_eom = le32_to_cpu(last_hdr->flags) & VIRTIO_VSOCK_SEQ_EOM;
+               has_eom = le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM;

-               /* If there is space in the last packet queued, we copy the
-                * new packet in its buffer. We avoid this if the last packet
-                * queued has VIRTIO_VSOCK_SEQ_EOM set, because this is
-                * delimiter of SEQPACKET message, so 'pkt' is the first packet
-                * of a new message.
-                */
-               if (skb->len < skb_tailroom(last_skb) &&
-                   !(le32_to_cpu(last_hdr->flags) & VIRTIO_VSOCK_SEQ_EOM)) {
-                       memcpy(skb_put(last_skb, skb->len), skb->data, skb->len);
-                       free_pkt = true;
-                       last_hdr->flags |= hdr->flags;
-                       le32_add_cpu(&last_hdr->len, len);
-                       goto out;
+               if (skb->len < skb_tailroom(last_skb)) {
+                       if (!last_has_eom) {
+                               /* Same-message coalescing (existing path) */
+                               memcpy(skb_put(last_skb, skb->len),
+                                      skb->data, skb->len);
+                               free_pkt = true;
+                               last_hdr->flags |= hdr->flags;
+                               le32_add_cpu(&last_hdr->len, len);
+                               goto out;
+                       }
+
+                       /* Cross-EOM: coalesce complete messages into one skb,
+                        * recording message boundaries in a compact ULEB128
+                        * buffer. Only when the incoming packet also has EOM
+                        * (i.e. it is a complete message).
+                        */
+                       if (has_eom && !sk_psock(sk_vsock(vsk))) {
+                               bool prev_eor, cur_eor;
+                               u8 tmp[12];
+                               int n = 0;
+
+                               cur_eor = le32_to_cpu(hdr->flags) &
+                                         VIRTIO_VSOCK_SEQ_EOR;
+
+                               if (!VIRTIO_VSOCK_SKB_CB(last_skb)->has_boundary_entries) {
+                                       u32 prev_len = le32_to_cpu(last_hdr->len);
+
+                                       prev_eor = le32_to_cpu(last_hdr->flags) &
+                                                  VIRTIO_VSOCK_SEQ_EOR;
+                                       n += vsock_uleb_encode_boundary(
+                                               tmp + n, prev_len, prev_eor);
+                               }
+                               n += vsock_uleb_encode_boundary(
+                                       tmp + n, len, cur_eor);
+
+                               if (!vsock_boundary_buf_ensure(
+                                           vvs, vvs->boundary_len + n)) {
+                                       memcpy(vvs->boundary_buf +
+                                              vvs->boundary_len, tmp, n);
+                                       vvs->boundary_len += n;
+                                       VIRTIO_VSOCK_SKB_CB(last_skb)->has_boundary_entries = true;
+                                       memcpy(skb_put(last_skb, skb->len),
+                                              skb->data, skb->len);
+                                       free_pkt = true;
+                                       last_hdr->flags |= hdr->flags;
+                                       le32_add_cpu(&last_hdr->len, len);
+                                       goto out;
+                               }
+                       }
                }
        }



