On Thu, 14 May 2026 11:29:48 +0200
Stefano Garzarella <[email protected]> wrote:

> From: Stefano Garzarella <[email protected]>
> 
> When a large message is fragmented into multiple skbs, the zerocopy
> uarg is only allocated and attached to the last skb in the loop.
> Non-final skbs carry pinned user pages with no completion tracking,
> so the kernel has no way to notify userspace when those pages are safe
> to reuse. If the loop breaks early the uarg is never allocated at all,
> leaking pinned pages with no completion notification.
> 
> Fix this by following the approach used by TCP: allocate the zerocopy
> uarg (if not provided by the caller) before the send loop and attach
> it to every skb via skb_zcopy_set(), which takes a reference per skb.
> Each skb's completion properly decrements the refcount, and the
> notification only fires after the last skb is freed.
> On failure, if no data was sent, the uarg is cleanly aborted via
> net_zcopy_put_abort().
> 
> This issue was initially discovered by sashiko while reviewing commit
> 1cb36e252211 ("vsock/virtio: fix MSG_ZEROCOPY pinned-pages accounting")
> but was pre-existing.
> 
> Fixes: 581512a6dc93 ("vsock/virtio: MSG_ZEROCOPY flag support")
> Cc: Arseniy Krasnov <[email protected]>
> Closes: 
> https://sashiko.dev/#/patchset/20260420132051.217589-1-sgarzare%40redhat.com
> Reported-by: Maher Azzouzi <[email protected]>
> Signed-off-by: Stefano Garzarella <[email protected]>
> ---
>  net/vmw_vsock/virtio_transport_common.c | 83 ++++++++++---------------
>  1 file changed, 34 insertions(+), 49 deletions(-)
> 
> diff --git a/net/vmw_vsock/virtio_transport_common.c 
> b/net/vmw_vsock/virtio_transport_common.c
> index 989cc252d3d3..1e3409d28164 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -70,34 +70,6 @@ static bool virtio_transport_can_zcopy(const struct 
> virtio_transport *t_ops,
>       return true;
>  }
>  
> -static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
> -                                        struct sk_buff *skb,
> -                                        struct msghdr *msg,
> -                                        size_t pkt_len,
> -                                        bool zerocopy)
> -{
> -     struct ubuf_info *uarg;
> -
> -     if (msg->msg_ubuf) {
> -             uarg = msg->msg_ubuf;
> -             net_zcopy_get(uarg);
> -     } else {
> -             struct ubuf_info_msgzc *uarg_zc;
> -
> -             uarg = msg_zerocopy_realloc(sk_vsock(vsk),
> -                                         pkt_len, NULL, false);
> -             if (!uarg)
> -                     return -1;
> -
> -             uarg_zc = uarg_to_msgzc(uarg);
> -             uarg_zc->zerocopy = zerocopy ? 1 : 0;
> -     }
> -
> -     skb_zcopy_init(skb, uarg);
> -
> -     return 0;
> -}
> -
>  static int virtio_transport_fill_skb(struct sk_buff *skb,
>                                    struct virtio_vsock_pkt_info *info,
>                                    size_t len,
> @@ -317,8 +289,10 @@ static int virtio_transport_send_pkt_info(struct 
> vsock_sock *vsk,
>       u32 src_cid, src_port, dst_cid, dst_port;
>       const struct virtio_transport *t_ops;
>       struct virtio_vsock_sock *vvs;
> +     struct ubuf_info *uarg = NULL;
>       u32 pkt_len = info->pkt_len;
>       bool can_zcopy = false;
> +     bool have_uref = false;
>       u32 rest_len;
>       int ret;
>  
> @@ -360,6 +334,25 @@ static int virtio_transport_send_pkt_info(struct 
> vsock_sock *vsk,
>               if (can_zcopy)
>                       max_skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
>                                           (MAX_SKB_FRAGS * PAGE_SIZE));
> +
> +             if (info->msg->msg_flags & MSG_ZEROCOPY &&
> +                 info->op == VIRTIO_VSOCK_OP_RW) {
> +                     uarg = info->msg->msg_ubuf;
> +
> +                     if (!uarg) {
> +                             uarg = msg_zerocopy_realloc(sk_vsock(vsk),
> +                                                         pkt_len, NULL, 
> false);
> +                             if (!uarg) {
> +                                     virtio_transport_put_credit(vvs, 
> pkt_len);
> +                                     return -ENOMEM;
> +                             }
> +
> +                             if (!can_zcopy)
> +                                     uarg_to_msgzc(uarg)->zerocopy = 0;
> +
> +                             have_uref = true;
> +                     }
> +             }

Surely that block should only be done if can_zcopy is true?
And shouldn't something unset it if info->op != VIRTIO_VSOCK_OP_RW ?
If the msg_zerocopy_realloc() fails then can't you just set can_zcopy to false?

If info->msg->msg_ubuf is already set then I think you have to disable zero-copy.
The caller has already requested a callback - and you can't add another.

In any case by the end of this can_zcopy and have_uref are really the same flag.

>       }
>  
>       rest_len = pkt_len;
> @@ -378,27 +371,7 @@ static int virtio_transport_send_pkt_info(struct 
> vsock_sock *vsk,
>                       break;
>               }
>  
> -             /* We process buffer part by part, allocating skb on
> -              * each iteration. If this is last skb for this buffer
> -              * and MSG_ZEROCOPY mode is in use - we must allocate
> -              * completion for the current syscall.
> -              *
> -              * Pass pkt_len because msg iter is already consumed
> -              * by virtio_transport_fill_skb(), so iter->count
> -              * can not be used for RLIMIT_MEMLOCK pinned-pages
> -              * accounting done by msg_zerocopy_realloc().
> -              */
> -             if (info->msg && info->msg->msg_flags & MSG_ZEROCOPY &&
> -                 skb_len == rest_len && info->op == VIRTIO_VSOCK_OP_RW) {
> -                     if (virtio_transport_init_zcopy_skb(vsk, skb,
> -                                                         info->msg,
> -                                                         pkt_len,
> -                                                         can_zcopy)) {
> -                             kfree_skb(skb);
> -                             ret = -ENOMEM;
> -                             break;
> -                     }
> -             }
> +             skb_zcopy_set(skb, uarg, NULL);
>  
>               virtio_transport_inc_tx_pkt(vvs, skb);
>  
> @@ -422,6 +395,18 @@ static int virtio_transport_send_pkt_info(struct 
> vsock_sock *vsk,
>  
>       virtio_transport_put_credit(vvs, rest_len);
>  
> +     /* msg_zerocopy_realloc() initializes the ubuf_info refcnt to 1.
> +      * skb_zcopy_set() increases it for each skb, so we can drop that
                                                            ^ must

> +      * initial reference to keep it balanced.
> +      */
> +     if (have_uref) {
> +             if (rest_len == pkt_len)
> +                     /* No data sent, abort the notification. */
> +                     net_zcopy_put_abort(uarg, true);

Is it worth optimising for the 'nothing sent' case ?

-- David

> +             else
> +                     net_zcopy_put(uarg);
> +     }
> +
>       /* Return number of bytes, if any data has been sent. */
>       if (rest_len != pkt_len)
>               ret = pkt_len - rest_len;


Reply via email to