From: Wesley Atwell <[email protected]>

Teach TCP to grow sk_rcvbuf when window-scale rounding would otherwise
advertise more sender-visible window than the current hard
receive-memory limit can back.

The new helper keeps backlog and memory-pressure accounting in the same
units as the rest of the receive path, while __tcp_select_window()
backs any rounding slack with real receive-buffer space before
advertising it.

Signed-off-by: Wesley Atwell <[email protected]>
---
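Notes for reviewers (git am drops everything below the '---'): the
rounding slack in question is the gap between free_space and its
ALIGN()-rounded advertisement.  A minimal userspace sketch of that
arithmetic, with made-up values and ALIGN() mirroring the kernel macro:

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        int rcv_wscale = 7;            /* advertisement is in 1 << 7 byte units */
        int scale = 1 << rcv_wscale;
        int free_space = 65000;        /* bytes we can actually back */
        int window = ALIGN(free_space, scale);

        printf("advertised %d, backed %d, slack %d\n",
               window, free_space, window - free_space);
        /* prints: advertised 65024, backed 65000, slack 24 */
        return 0;
}

Today those slack bytes are sender-visible without sk_rcvbuf backing;
with this patch tcp_try_grow_rcvbuf() grows the buffer to cover them,
and __tcp_select_window() rounds free_space down instead when growing
is not possible.
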
 include/net/tcp.h     | 12 ++++++++++++
 net/ipv4/tcp_input.c  | 36 ++++++++++++++++++++++++++++++++++--
 net/ipv4/tcp_output.c | 15 +++++++++++++--
 3 files changed, 59 insertions(+), 4 deletions(-)
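
The helper's decision reduces to target = used + backlog + needed,
checked against sk_rcvbuf and tcp_rmem[2].  A userspace sketch of that
arithmetic (kernel state flattened into plain ints; the memory-pressure
and protocol-memory checks collapsed into a single 'pressure' flag;
numbers illustrative):

#include <stdio.h>

static int try_grow(int *rcvbuf, int rmem_used, int backlog, int needed,
                    int rmem2, int locked, int pressure)
{
        int target = rmem_used + backlog + (needed > 0 ? needed : 0);
        int grown;

        if (target <= *rcvbuf)
                return 1;                       /* already covered */
        if (*rcvbuf >= rmem2 || locked || pressure)
                return 0;                       /* growth not allowed */

        grown = target < rmem2 ? target : rmem2; /* min(target, rmem2) */
        if (grown > *rcvbuf)
                *rcvbuf = grown;
        return target <= *rcvbuf;               /* false if capped short */
}

int main(void)
{
        int rcvbuf = 131072;                    /* 128K */

        /* 96K allocated + 16K backlog + 32K slack exceeds 128K, so the
         * buffer grows to the 147456-byte target.
         */
        printf("grew %d, rcvbuf now %d\n",
               try_grow(&rcvbuf, 98304, 16384, 32768, 262144, 0, 0),
               rcvbuf);
        return 0;
}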

diff --git a/include/net/tcp.h b/include/net/tcp.h
index fc22ab6b80d5..5b479ad44f89 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -397,6 +397,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg);
 enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
 void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
 void tcp_rcvbuf_grow(struct sock *sk, u32 newval);
+bool tcp_try_grow_rcvbuf(struct sock *sk, int needed);
 void tcp_rcv_space_adjust(struct sock *sk);
 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
 void tcp_twsk_destructor(struct sock *sk);
@@ -1844,6 +1845,17 @@ static inline int tcp_rwnd_avail(const struct sock *sk)
        return tcp_rmem_avail(sk) - READ_ONCE(sk->sk_backlog.len);
 }
 
+/* Passive children clone the listener's sk_socket until accept() grafts
+ * their own struct socket, so only sockets that point back to themselves
+ * should autotune receive-buffer backing.
+ */
+static inline bool tcp_rcvbuf_grow_allowed(const struct sock *sk)
+{
+       struct socket *sock = READ_ONCE(sk->sk_socket);
+
+       return sock && READ_ONCE(sock->sk) == sk;
+}
+
 /* Note: caller must be prepared to deal with negative returns */
 static inline int tcp_space(const struct sock *sk)
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 352f814a4ff6..32256519a085 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -774,6 +774,38 @@ static void tcp_init_buffer_space(struct sock *sk)
                                    (u32)TCP_INIT_CWND * tp->advmss);
 }
 
+/* Try to grow sk_rcvbuf until the hard receive-memory limit covers
+ * @needed bytes on top of what sk_rmem_alloc and sk_backlog.len
+ * already consume.
+ */
+bool tcp_try_grow_rcvbuf(struct sock *sk, int needed)
+{
+       struct net *net = sock_net(sk);
+       int backlog;
+       int rmem2;
+       int target;
+
+       needed = max(needed, 0);
+       backlog = READ_ONCE(sk->sk_backlog.len);
+       target = tcp_rmem_used(sk) + backlog + needed;
+
+       if (target <= READ_ONCE(sk->sk_rcvbuf))
+               return true;
+
+       rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+       if (READ_ONCE(sk->sk_rcvbuf) >= rmem2 ||
+           (sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
+           tcp_under_memory_pressure(sk) ||
+           sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
+               return false;
+
+       WRITE_ONCE(sk->sk_rcvbuf,
+                  min_t(int, rmem2,
+                        max_t(int, READ_ONCE(sk->sk_rcvbuf), target)));
+
+       return target <= READ_ONCE(sk->sk_rcvbuf);
+}
+
 /* 4. Recalculate window clamp after socket hit its memory bounds. */
 static void tcp_clamp_window(struct sock *sk)
 {
@@ -785,14 +817,14 @@ static void tcp_clamp_window(struct sock *sk)
        icsk->icsk_ack.quick = 0;
        rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
 
-       if (sk->sk_rcvbuf < rmem2 &&
+       if (READ_ONCE(sk->sk_rcvbuf) < rmem2 &&
            !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
            !tcp_under_memory_pressure(sk) &&
            sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
                WRITE_ONCE(sk->sk_rcvbuf,
                           min(atomic_read(&sk->sk_rmem_alloc), rmem2));
        }
-       if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+       if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
                tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
 }
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 57a2a6daaad3..53781cf591d2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3375,13 +3375,24 @@ u32 __tcp_select_window(struct sock *sk)
         * scaled window will not line up with the MSS boundary anyway.
         */
        if (tp->rx_opt.rcv_wscale) {
+               int scale = 1 << tp->rx_opt.rcv_wscale;
+
                window = free_space;
 
                /* Advertise enough space so that it won't get scaled away.
-                * Import case: prevent zero window announcement if
+                * Important case: prevent zero-window announcement if
                 * 1<<rcv_wscale > mss.
                 */
-               window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
+               window = ALIGN(window, scale);
+
+               /* Back any scale-quantization slack before we expose it.
+                * Otherwise tcp_can_ingest() can reject data which is still
+                * within the sender-visible window.
+                */
+               if (window > free_space &&
+                   (!tcp_rcvbuf_grow_allowed(sk) ||
+                    !tcp_try_grow_rcvbuf(sk, tcp_space_from_win(sk, window))))
+                       window = round_down(free_space, scale);
        } else {
                window = tp->rcv_wnd;
                /* Get the largest window that is a nice multiple of mss.
-- 
2.43.0

