Hi Stefano, Michael,
Thanks for the suggestions and guidance.
I’ve drafted a 4-part series based on the recap and included the four
diffs below for discussion. I’ll wait for comments, iterate, and then
send the patch series in a few days.
---
Patch 1/4 — vsock/virtio: make get_credit() s64-safe and clamp negatives
virtio_transport_get_credit() currently uses unsigned arithmetic; if the
peer shrinks its window, the subtraction can underflow and look like a
huge amount of credit. Compute the available space in s64 and clamp
negative values to 0, matching virtio_transport_has_space(). A small
standalone example of the wraparound follows the diff.
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -494,16 +494,23 @@ EXPORT_SYMBOL_GPL(virtio_transport_consume_skb_sent);
u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
{
+ s64 bytes;
u32 ret;
if (!credit)
return 0;
spin_lock_bh(&vvs->tx_lock);
- ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
- if (ret > credit)
- ret = credit;
+ bytes = (s64)vvs->peer_buf_alloc -
+ ((s64)vvs->tx_cnt - (s64)vvs->peer_fwd_cnt);
+ if (bytes < 0)
+ bytes = 0;
+
+ ret = min_t(u32, credit, (u32)bytes);
vvs->tx_cnt += ret;
vvs->bytes_unsent += ret;
spin_unlock_bh(&vvs->tx_lock);
return ret;
}
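
For reference, here is a small standalone program showing the wraparound
the s64 clamp avoids. The numbers are made up (8 KiB already in flight
while the peer shrinks its window to 4 KiB); it uses int64_t instead of
the kernel's s64:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Made-up values: 8 KiB in flight, peer shrinks its advertised
         * buffer to 4 KiB.
         */
        uint32_t peer_buf_alloc = 4096;
        uint32_t tx_cnt = 262144, peer_fwd_cnt = 253952;

        /* Old computation: the u32 subtraction wraps around. */
        uint32_t old = peer_buf_alloc - (tx_cnt - peer_fwd_cnt);

        /* New computation: 64-bit signed math, clamped at zero. */
        int64_t bytes = (int64_t)peer_buf_alloc -
                        ((int64_t)tx_cnt - (int64_t)peer_fwd_cnt);
        if (bytes < 0)
                bytes = 0;

        printf("u32 math : %u bytes of \"credit\"\n", old);
        printf("s64+clamp: %lld bytes\n", (long long)bytes);
        return 0;
}

The first line prints roughly 4.29 billion bytes of bogus credit; the
second prints 0.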
---
Patch 2/4 — vsock/virtio: cap TX window by the local buffer (add a helper
and use it throughout the TX path)
Cap the effective advertised window to min(peer_buf_alloc, buf_alloc) and
use it consistently in the TX paths (get_credit, has_space,
seqpacket_enqueue). A small illustration of the capping follows the diff.
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -491,6 +491,16 @@ void virtio_transport_consume_skb_sent(struct sk_buff *skb, bool consume)
}
EXPORT_SYMBOL_GPL(virtio_transport_consume_skb_sent);
+/* Return the effective peer buffer size for TX credit computation.
+ *
+ * The peer advertises its receive buffer via peer_buf_alloc, but we cap it
+ * to our local buf_alloc (derived from SO_VM_SOCKETS_BUFFER_SIZE and
+ * already clamped to buffer_max_size).
+ */
+static u32 virtio_transport_tx_buf_alloc(struct virtio_vsock_sock *vvs)
+{
+ return min(vvs->peer_buf_alloc, vvs->buf_alloc);
+}
u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
{
s64 bytes;
@@ -502,7 +512,8 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
return 0;
spin_lock_bh(&vvs->tx_lock);
- bytes = (s64)vvs->peer_buf_alloc -
+ bytes = (s64)virtio_transport_tx_buf_alloc(vvs) -
((s64)vvs->tx_cnt - (s64)vvs->peer_fwd_cnt);
if (bytes < 0)
bytes = 0;
@@ -834,7 +845,7 @@ virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk,
spin_lock_bh(&vvs->tx_lock);
- if (len > vvs->peer_buf_alloc) {
+ if (len > virtio_transport_tx_buf_alloc(vvs)) {
spin_unlock_bh(&vvs->tx_lock);
return -EMSGSIZE;
}
@@ -884,7 +895,8 @@ static s64 virtio_transport_has_space(struct vsock_sock *vsk)
struct virtio_vsock_sock *vvs = vsk->trans;
s64 bytes;
- bytes = (s64)vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
+ bytes = (s64)virtio_transport_tx_buf_alloc(vvs) -
+ ((s64)vvs->tx_cnt - (s64)vvs->peer_fwd_cnt);
if (bytes < 0)
bytes = 0;
return bytes;
}
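
To make the capping concrete, here is a minimal userspace illustration
using the numbers from the PoC (a guest advertising 2 GiB against the
256 KiB default local buffer); virtio_transport_tx_buf_alloc() does the
same min() on the vvs fields:

#include <stdio.h>
#include <stdint.h>

/* Same idea as virtio_transport_tx_buf_alloc(): intersect the peer's
 * advertised window with our local buffer configuration.
 */
static uint32_t tx_window(uint32_t peer_buf_alloc, uint32_t buf_alloc)
{
        return peer_buf_alloc < buf_alloc ? peer_buf_alloc : buf_alloc;
}

int main(void)
{
        uint32_t peer_advertised = 2U * 1024 * 1024 * 1024; /* 2 GiB from the guest */
        uint32_t local_policy = 256 * 1024;                 /* default local buf_alloc */

        printf("window before the cap: %u bytes\n", peer_advertised);
        printf("window after the cap : %u bytes\n",
               tx_window(peer_advertised, local_policy));
        return 0;
}

So even if the guest advertises 2 GiB per connection, the host never
queues more than its own configured buffer size per socket.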
---
Patch 3/4 — vsock/test: fix seqpacket msg bounds test (set client buf too)
With TX credit now capped by the local buffer, the client can fill its
TX window and block before it wakes the server. Setting the buffer size
on the client as well makes the test deterministic again.
diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -353,6 +353,7 @@ static void test_stream_msg_peek_server(const struct test_opts *opts)
static void test_seqpacket_msg_bounds_client(const struct test_opts *opts)
{
+ unsigned long long sock_buf_size;
unsigned long curr_hash;
size_t max_msg_size;
int page_size;
@@ -366,6 +367,18 @@ static void test_seqpacket_msg_bounds_client(const struct test_opts *opts)
exit(EXIT_FAILURE);
}
+ sock_buf_size = SOCK_BUF_SIZE;
+
+ setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE,
+ sock_buf_size,
+ "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)");
+
+ setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+ sock_buf_size,
+ "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+
/* Wait, until receiver sets buffer size. */
control_expectln("SRVREADY");
---
Patch 4/4 — vsock/test: add stream TX credit bounds regression test
This directly guards against the original failure mode for stream
sockets: if the peer advertises a large window but the sender’s local
policy is small, the sender must stall quickly (hitting EAGAIN in
nonblocking mode) rather than queueing megabytes. A rough sketch of the
client side follows the diff.
diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -349,6 +349,7 @@
#define SOCK_BUF_SIZE (2 * 1024 * 1024)
+#define SMALL_SOCK_BUF_SIZE (64 * 1024ULL)
#define MAX_MSG_PAGES 4
/* Insert new test functions after test_stream_msg_peek_server, before
* test_seqpacket_msg_bounds_client (around line 352) */
+static void test_stream_tx_credit_bounds_client(const struct test_opts *opts)
+{
+ ... /* full function as provided */
+}
+
+static void test_stream_tx_credit_bounds_server(const struct test_opts *opts)
+{
+ ... /* full function as provided */
+}
@@ -2224,6 +2305,10 @@
.run_client = test_stream_msg_peek_client,
.run_server = test_stream_msg_peek_server,
},
+ {
+ .name = "SOCK_STREAM TX credit bounds",
+ .run_client = test_stream_tx_credit_bounds_client,
+ .run_server = test_stream_tx_credit_bounds_server,
+ },
{
.name = "SOCK_SEQPACKET msg bounds",
On Thu, Dec 11, 2025 at 2:52 PM Stefano Garzarella <[email protected]> wrote:
>
> On Thu, 11 Dec 2025 at 15:44, Melbin Mathew Antony <[email protected]> wrote:
> >
> > Hi Stefano, Michael,
> >
> > Thanks for the feedback and for pointing out the s64 issue in
> > virtio_transport_get_credit() and the vsock_test regression.
> >
> > I can take this up and send a small series:
> >
> > 1/2 – vsock/virtio: cap TX credit to local buffer size
> > - use a helper to bound peer_buf_alloc by buf_alloc
> > - compute available credit in s64 like has_space(), and clamp
> > negative values to zero before applying the caller’s credit
>
> IMO they should be fixed in 2 different patches.
>
> I think we can easily reuse virtio_transport_has_space() in
> virtio_transport_get_credit().
>
> >
> > 2/2 – vsock/test: fix seqpacket message bounds test
> > - include your vsock_test.c change so the seqpacket bounds test
> > keeps working with the corrected TX credit handling
> >
> > I’ll roll these into a [PATCH net v4 0/2] series and send it out shortly.
>
> Please, wait a bit also for other maintainers comments.
> See
> https://www.kernel.org/doc/html/latest/process/submitting-patches.html#don-t-get-discouraged-or-impatient
>
> So, to recap I'd do the following:
> Patch 1: fix virtio_transport_get_credit() maybe using
> virtio_transport_has_space() to calculate the space
> Patch 2: (this one) cap TX credit to local buffer size
> Patch 3: vsock/test: fix seqpacket message bounds test
> Patch 4 (if you have time): add a new test for TX credit on stream socket
>
> Stefano
>
> >
> > Thanks again for all the guidance,
> > Melbin
> >
> >
> > On Thu, Dec 11, 2025 at 1:57 PM Stefano Garzarella <[email protected]>
> > wrote:
> > >
> > > On Thu, Dec 11, 2025 at 08:05:11AM -0500, Michael S. Tsirkin wrote:
> > > >On Thu, Dec 11, 2025 at 01:51:04PM +0100, Melbin K Mathew wrote:
> > > >> The virtio vsock transport currently derives its TX credit directly
> > > >> from
> > > >> peer_buf_alloc, which is populated from the remote endpoint's
> > > >> SO_VM_SOCKETS_BUFFER_SIZE value.
> > > >>
> > > >> On the host side, this means the amount of data we are willing to queue
> > > >> for a given connection is scaled purely by a peer-chosen value, rather
> > > >> than by the host's own vsock buffer configuration. A guest that
> > > >> advertises a very large buffer and reads slowly can cause the host to
> > > >> allocate a correspondingly large amount of sk_buff memory for that
> > > >> connection.
> > > >>
> > > >> In practice, a malicious guest can:
> > > >>
> > > >> - set a large AF_VSOCK buffer size (e.g. 2 GiB) with
> > > >> SO_VM_SOCKETS_BUFFER_MAX_SIZE / SO_VM_SOCKETS_BUFFER_SIZE, and
> > > >>
> > > >> - open multiple connections to a host vsock service that sends data
> > > >> while the guest drains slowly.
> > > >>
> > > >> On an unconstrained host this can drive Slab/SUnreclaim into the tens
> > > >> of
> > > >> GiB range, causing allocation failures and OOM kills in unrelated host
> > > >> processes while the offending VM remains running.
> > > >>
> > > >> On non-virtio transports and compatibility:
> > > >>
> > > >> - VMCI uses the AF_VSOCK buffer knobs to size its queue pairs per
> > > >> socket based on the local vsk->buffer_* values; the remote side
> > > >> can’t enlarge those queues beyond what the local endpoint
> > > >> configured.
> > > >>
> > > >> - Hyper-V’s vsock transport uses fixed-size VMBus ring buffers and
> > > >> an MTU bound; there is no peer-controlled credit field comparable
> > > >> to peer_buf_alloc, and the remote endpoint can’t drive in-flight
> > > >> kernel memory above those ring sizes.
> > > >>
> > > >> - The loopback path reuses virtio_transport_common.c, so it
> > > >> naturally follows the same semantics as the virtio transport.
> > > >>
> > > >> Make virtio-vsock consistent with that model by intersecting the peer’s
> > > >> advertised receive window with the local vsock buffer size when
> > > >> computing TX credit. We introduce a small helper and use it in
> > > >> virtio_transport_get_credit(), virtio_transport_has_space() and
> > > >> virtio_transport_seqpacket_enqueue(), so that:
> > > >>
> > > >> effective_tx_window = min(peer_buf_alloc, buf_alloc)
> > > >>
> > > >> This prevents a remote endpoint from forcing us to queue more data than
> > > >> our own configuration allows, while preserving the existing credit
> > > >> semantics and keeping virtio-vsock compatible with the other
> > > >> transports.
> > > >>
> > > >> On an unpatched Ubuntu 22.04 host (~64 GiB RAM), running a PoC with
> > > >> 32 guest vsock connections advertising 2 GiB each and reading slowly
> > > >> drove Slab/SUnreclaim from ~0.5 GiB to ~57 GiB and the system only
> > > >> recovered after killing the QEMU process.
> > > >>
> > > >> With this patch applied, rerunning the same PoC yields:
> > > >>
> > > >> Before:
> > > >> MemFree: ~61.6 GiB
> > > >> MemAvailable: ~62.3 GiB
> > > >> Slab: ~142 MiB
> > > >> SUnreclaim: ~117 MiB
> > > >>
> > > >> After 32 high-credit connections:
> > > >> MemFree: ~61.5 GiB
> > > >> MemAvailable: ~62.3 GiB
> > > >> Slab: ~178 MiB
> > > >> SUnreclaim: ~152 MiB
> > > >>
> > > >> i.e. only ~35 MiB increase in Slab/SUnreclaim, no host OOM, and the
> > > >> guest remains responsive.
> > > >>
> > > >> Fixes: 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko")
> > > >> Suggested-by: Stefano Garzarella <[email protected]>
> > > >> Signed-off-by: Melbin K Mathew <[email protected]>
> > > >> ---
> > > >> net/vmw_vsock/virtio_transport_common.c | 27 ++++++++++++++++++++++---
> > > >> 1 file changed, 24 insertions(+), 3 deletions(-)
> > > >>
> > > >> diff --git a/net/vmw_vsock/virtio_transport_common.c
> > > >> b/net/vmw_vsock/virtio_transport_common.c
> > > >> index dcc8a1d58..02eeb96dd 100644
> > > >> --- a/net/vmw_vsock/virtio_transport_common.c
> > > >> +++ b/net/vmw_vsock/virtio_transport_common.c
> > > >> @@ -491,6 +491,25 @@ void virtio_transport_consume_skb_sent(struct
> > > >> sk_buff *skb, bool consume)
> > > >> }
> > > >> EXPORT_SYMBOL_GPL(virtio_transport_consume_skb_sent);
> > > >>
> > > >> +/* Return the effective peer buffer size for TX credit computation.
> > > >> + *
> > > >> + * The peer advertises its receive buffer via peer_buf_alloc, but we
> > > >> + * cap that to our local buf_alloc (derived from
> > > >> + * SO_VM_SOCKETS_BUFFER_SIZE and already clamped to buffer_max_size)
> > > >> + * so that a remote endpoint cannot force us to queue more data than
> > > >> + * our own configuration allows.
> > > >> + */
> > > >> +static u32 virtio_transport_tx_buf_alloc(struct virtio_vsock_sock
> > > >> *vvs)
> > > >> +{
> > > >> + return min(vvs->peer_buf_alloc, vvs->buf_alloc);
> > > >> +}
> > > >> +
> > > >> u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32
> > > >> credit)
> > > >> {
> > > >> u32 ret;
> > > >> @@ -499,7 +518,8 @@ u32 virtio_transport_get_credit(struct
> > > >> virtio_vsock_sock *vvs, u32 credit)
> > > >> return 0;
> > > >>
> > > >> spin_lock_bh(&vvs->tx_lock);
> > > >> - ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
> > > >> + ret = virtio_transport_tx_buf_alloc(vvs) -
> > > >> + (vvs->tx_cnt - vvs->peer_fwd_cnt);
> > > >> if (ret > credit)
> > > >> ret = credit;
> > > >> vvs->tx_cnt += ret;
> > > >> @@ -831,7 +851,7 @@ virtio_transport_seqpacket_enqueue(struct
> > > >> vsock_sock *vsk,
> > > >>
> > > >> spin_lock_bh(&vvs->tx_lock);
> > > >>
> > > >> - if (len > vvs->peer_buf_alloc) {
> > > >> + if (len > virtio_transport_tx_buf_alloc(vvs)) {
> > > >> spin_unlock_bh(&vvs->tx_lock);
> > > >> return -EMSGSIZE;
> > > >> }
> > > >> @@ -882,7 +902,8 @@ static s64 virtio_transport_has_space(struct
> > > >> vsock_sock *vsk)
> > > >> struct virtio_vsock_sock *vvs = vsk->trans;
> > > >> s64 bytes;
> > > >>
> > > >> - bytes = (s64)vvs->peer_buf_alloc - (vvs->tx_cnt -
> > > >> vvs->peer_fwd_cnt);
> > > >> + bytes = (s64)virtio_transport_tx_buf_alloc(vvs) -
> > > >> + (vvs->tx_cnt - vvs->peer_fwd_cnt);
> > > >> if (bytes < 0)
> > > >> bytes = 0;
> > > >>
> > > >
> > > >Acked-by: Michael S. Tsirkin <[email protected]>
> > > >
> > > >
> > > >Looking at this, why is one place casting to s64 the other is not?
> > >
> > > Yeah, I pointed out that too in previous interactions. IMO we should fix
> > > virtio_transport_get_credit() since the peer can reduce `peer_buf_alloc`
> > > so it will overflow. Fortunately, we are limited by the credit requested
> > > by the caller, but we are still sending stuff when we shouldn't be.
> > >
> > > @Melbin let me know if you will fix it, otherwise I can do that, but I'd
> > > like to do in a single series (multiple patches), since they depends on
> > > each other.
> > >
> > > So if you prefer, I can pickup this patch and post a series with this +
> > > the other fix + the fix on the test I posted on the v2.
> > >
> > > Stefano
> > >
> >
>