On Mon, 18 May 2026 11:54:19 +0200
Stefano Garzarella <[email protected]> wrote:

> On Mon, May 18, 2026 at 05:33:08AM -0400, Michael S. Tsirkin wrote:
> >On Mon, May 18, 2026 at 11:18:24AM +0200, Stefano Garzarella wrote:  
> >> On Sat, May 16, 2026 at 12:53:29PM +0100, David Laight wrote:  
> >> > On Thu, 14 May 2026 11:29:48 +0200
> >> > Stefano Garzarella <[email protected]> wrote:
> >> >  
> >> > > From: Stefano Garzarella <[email protected]>
> >> > >
> >> > > When a large message is fragmented into multiple skbs, the zerocopy
> >> > > uarg is only allocated and attached to the last skb in the loop.
> >> > > Non-final skbs carry pinned user pages with no completion tracking,
> >> > > so the kernel has no way to notify userspace when those pages are safe
> >> > > to reuse. If the loop breaks early the uarg is never allocated at all,
> >> > > leaking pinned pages with no completion notification.
> >> > >
> >> > > Fix this by following the approach used by TCP: allocate the zerocopy
> >> > > uarg (if not provided by the caller) before the send loop and attach
> >> > > it to every skb via skb_zcopy_set(), which takes a reference per skb.
> >> > > Each skb's completion properly decrements the refcount, and the
> >> > > notification only fires after the last skb is freed.
> >> > > On failure, if no data was sent, the uarg is cleanly aborted via
> >> > > net_zcopy_put_abort().
> >> > >
> >> > > This issue was initially discovered by sashiko while reviewing commit
> >> > > 1cb36e252211 ("vsock/virtio: fix MSG_ZEROCOPY pinned-pages accounting")
> >> > > but was pre-existing.
> >> > >
> >> > > Fixes: 581512a6dc93 ("vsock/virtio: MSG_ZEROCOPY flag support")
> >> > > Cc: Arseniy Krasnov <[email protected]>
> >> > > Closes: 
> >> > > https://sashiko.dev/#/patchset/20260420132051.217589-1-sgarzare%40redhat.com
> >> > > Reported-by: Maher Azzouzi <[email protected]>
> >> > > Signed-off-by: Stefano Garzarella <[email protected]>
> >> > > ---
> >> > >  net/vmw_vsock/virtio_transport_common.c | 83 ++++++++++---------------
> >> > >  1 file changed, 34 insertions(+), 49 deletions(-)
> >> > >
> >> > > diff --git a/net/vmw_vsock/virtio_transport_common.c 
> >> > > b/net/vmw_vsock/virtio_transport_common.c
> >> > > index 989cc252d3d3..1e3409d28164 100644
> >> > > --- a/net/vmw_vsock/virtio_transport_common.c
> >> > > +++ b/net/vmw_vsock/virtio_transport_common.c
> >> > > @@ -70,34 +70,6 @@ static bool virtio_transport_can_zcopy(const struct 
> >> > > virtio_transport *t_ops,
> >> > >        return true;
> >> > >  }
> >> > >
> >> > > -static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
> >> > > -                                         struct sk_buff *skb,
> >> > > -                                         struct msghdr *msg,
> >> > > -                                         size_t pkt_len,
> >> > > -                                         bool zerocopy)
> >> > > -{
> >> > > -      struct ubuf_info *uarg;
> >> > > -
> >> > > -      if (msg->msg_ubuf) {
> >> > > -              uarg = msg->msg_ubuf;
> >> > > -              net_zcopy_get(uarg);
> >> > > -      } else {
> >> > > -              struct ubuf_info_msgzc *uarg_zc;
> >> > > -
> >> > > -              uarg = msg_zerocopy_realloc(sk_vsock(vsk),
> >> > > -                                          pkt_len, NULL, false);
> >> > > -              if (!uarg)
> >> > > -                      return -1;
> >> > > -
> >> > > -              uarg_zc = uarg_to_msgzc(uarg);
> >> > > -              uarg_zc->zerocopy = zerocopy ? 1 : 0;
> >> > > -      }
> >> > > -
> >> > > -      skb_zcopy_init(skb, uarg);
> >> > > -
> >> > > -      return 0;
> >> > > -}
> >> > > -
> >> > >  static int virtio_transport_fill_skb(struct sk_buff *skb,
> >> > >                                     struct virtio_vsock_pkt_info *info,
> >> > >                                     size_t len,
> >> > > @@ -317,8 +289,10 @@ static int virtio_transport_send_pkt_info(struct 
> >> > > vsock_sock *vsk,
> >> > >        u32 src_cid, src_port, dst_cid, dst_port;
> >> > >        const struct virtio_transport *t_ops;
> >> > >        struct virtio_vsock_sock *vvs;
> >> > > +      struct ubuf_info *uarg = NULL;
> >> > >        u32 pkt_len = info->pkt_len;
> >> > >        bool can_zcopy = false;
> >> > > +      bool have_uref = false;
> >> > >        u32 rest_len;
> >> > >        int ret;
> >> > >
> >> > > @@ -360,6 +334,25 @@ static int virtio_transport_send_pkt_info(struct 
> >> > > vsock_sock *vsk,
> >> > >                if (can_zcopy)
> >> > >                        max_skb_len = min_t(u32, 
> >> > > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
> >> > >                                            (MAX_SKB_FRAGS * 
> >> > > PAGE_SIZE));
> >> > > +
> >> > > +              if (info->msg->msg_flags & MSG_ZEROCOPY &&
> >> > > +                  info->op == VIRTIO_VSOCK_OP_RW) {
> >> > > +                      uarg = info->msg->msg_ubuf;
> >> > > +
> >> > > +                      if (!uarg) {
> >> > > +                              uarg = 
> >> > > msg_zerocopy_realloc(sk_vsock(vsk),
> >> > > +                                                          pkt_len, 
> >> > > NULL, false);
> >> > > +                              if (!uarg) {
> >> > > +                                      
> >> > > virtio_transport_put_credit(vvs, pkt_len);
> >> > > +                                      return -ENOMEM;
> >> > > +                              }
> >> > > +
> >> > > +                              if (!can_zcopy)
> >> > > +                                      uarg_to_msgzc(uarg)->zerocopy = 
> >> > > 0;
> >> > > +
> >> > > +                              have_uref = true;
> >> > > +                      }
> >> > > +              }  
> >> >
> >> > Surely that block should only be done if can_zcopy is true?
> >> > And shouldn't something unset it if info->op != VIRTIO_VSOCK_OP_RW ?
> >> > If the msg_zerocopy_realloc() fails then can't you just set can_zcopy to 
> >> > false.
> >> >
> >> > It info->msg->msg_buf is already set then I think you have to disable 
> >> > zero-copy.
> >> > The caller has already requested a callback - and you can't add another.
> >> >
> >> > In any case by the end of this can_zcopy and have_uref are really the 
> >> > same flag.  
> >>
> >> I kept the same approach we had before, trying to make as few changes as
> >> possible.
> >>
> >> All these potential issues seem to be pre-existing and should be eventually
> >> addressed in other patches IMHO. This patch one only resolves the main 
> >> issue
> >> of calling `skb_zcopy_set()` for every skb to avoid leaking pages, etc.  
> >
> >the patch is upstream now, right? So pretty much have to be patches on
> >top.  
> 
> If those are actual issues, then yes. TBH, I didn’t look into that 
> aspect and left it as it was before. We should take a closer look at how 
> MSG_ZEROCOPY is handled.
> 
> David, if you think it needs fixing and you have time, feel free to send 
> patches on top.

I'm not fully sure how it all works.
Especially the paths where msg->msg_ubuf is non-NULL, I suspect it should
be added to all the skb even if the ZEROCOPY flag isn't set.
I was just reading the one function.
But there did look like some very dodgy conditionals.

-- David

> 
> Thanks,
> Stefano
> 
> >  
> >> @Arseniy can you help on this?
> >>  
> >> >  
> >> > >        }
> >> > >
> >> > >        rest_len = pkt_len;
> >> > > @@ -378,27 +371,7 @@ static int virtio_transport_send_pkt_info(struct 
> >> > > vsock_sock *vsk,
> >> > >                        break;
> >> > >                }
> >> > >
> >> > > -              /* We process buffer part by part, allocating skb on
> >> > > -               * each iteration. If this is last skb for this buffer
> >> > > -               * and MSG_ZEROCOPY mode is in use - we must allocate
> >> > > -               * completion for the current syscall.
> >> > > -               *
> >> > > -               * Pass pkt_len because msg iter is already consumed
> >> > > -               * by virtio_transport_fill_skb(), so iter->count
> >> > > -               * can not be used for RLIMIT_MEMLOCK pinned-pages
> >> > > -               * accounting done by msg_zerocopy_realloc().
> >> > > -               */
> >> > > -              if (info->msg && info->msg->msg_flags & MSG_ZEROCOPY &&
> >> > > -                  skb_len == rest_len && info->op == 
> >> > > VIRTIO_VSOCK_OP_RW) {
> >> > > -                      if (virtio_transport_init_zcopy_skb(vsk, skb,
> >> > > -                                                          info->msg,
> >> > > -                                                          pkt_len,
> >> > > -                                                          can_zcopy)) 
> >> > > {
> >> > > -                              kfree_skb(skb);
> >> > > -                              ret = -ENOMEM;
> >> > > -                              break;
> >> > > -                      }
> >> > > -              }
> >> > > +              skb_zcopy_set(skb, uarg, NULL);
> >> > >
> >> > >                virtio_transport_inc_tx_pkt(vvs, skb);
> >> > >
> >> > > @@ -422,6 +395,18 @@ static int virtio_transport_send_pkt_info(struct 
> >> > > vsock_sock *vsk,
> >> > >
> >> > >        virtio_transport_put_credit(vvs, rest_len);
> >> > >
> >> > > +      /* msg_zerocopy_realloc() initializes the ubuf_info refcnt to 1.
> >> > > +       * skb_zcopy_set() increases it for each skb, so we can drop 
> >> > > that  
> >> >                                                            ^ must
> >> >  
> >> > > +       * initial reference to keep it balanced.
> >> > > +       */
> >> > > +      if (have_uref) {
> >> > > +              if (rest_len == pkt_len)
> >> > > +                      /* No data sent, abort the notification. */
> >> > > +                      net_zcopy_put_abort(uarg, true);  
> >> >
> >> > Is it worth optimising for the 'nothing sent' case ?  
> >>
> >> What do you suggest doing?
> >>
> >> I followed what TCP does.
> >>
> >> Thanks,
> >> Stefano
> >>  
> >> >
> >> > -- David
> >> >  
> >> > > +              else
> >> > > +                      net_zcopy_put(uarg);
> >> > > +      }
> >> > > +
> >> > >        /* Return number of bytes, if any data has been sent. */
> >> > >        if (rest_len != pkt_len)
> >> > >                ret = pkt_len - rest_len;  
> >> >  
> >  
> 


Reply via email to