From: Wesley Atwell <[email protected]>

Teach TCP to grow sk_rcvbuf when scale rounding would otherwise expose more sender-visible window than the current hard receive-memory backing can cover.
The new helper keeps backlog and memory-pressure limits in the same units as the rest of the receive path, while __tcp_select_window() backs any rounding slack before advertising it. Signed-off-by: Wesley Atwell <[email protected]> --- include/net/tcp.h | 12 ++++++++++++ net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++++-- net/ipv4/tcp_output.c | 15 +++++++++++++-- 3 files changed, 59 insertions(+), 4 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index fc22ab6b80d5..5b479ad44f89 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -397,6 +397,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg); enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcvbuf_grow(struct sock *sk, u32 newval); +bool tcp_try_grow_rcvbuf(struct sock *sk, int needed); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); @@ -1844,6 +1845,17 @@ static inline int tcp_rwnd_avail(const struct sock *sk) return tcp_rmem_avail(sk) - READ_ONCE(sk->sk_backlog.len); } +/* Passive children clone the listener's sk_socket until accept() grafts + * their own struct socket, so only sockets that point back to themselves + * should autotune receive-buffer backing. 
+ */ +static inline bool tcp_rcvbuf_grow_allowed(const struct sock *sk) +{ + struct socket *sock = READ_ONCE(sk->sk_socket); + + return sock && READ_ONCE(sock->sk) == sk; +} + /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 352f814a4ff6..32256519a085 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -774,6 +774,38 @@ static void tcp_init_buffer_space(struct sock *sk) (u32)TCP_INIT_CWND * tp->advmss); } +/* Try to grow sk_rcvbuf so the hard receive-memory limit covers @needed + * bytes beyond sk_rmem_alloc while preserving sender-visible headroom + * already consumed by sk_backlog.len. + */ +bool tcp_try_grow_rcvbuf(struct sock *sk, int needed) +{ + struct net *net = sock_net(sk); + int backlog; + int rmem2; + int target; + + needed = max(needed, 0); + backlog = READ_ONCE(sk->sk_backlog.len); + target = tcp_rmem_used(sk) + backlog + needed; + + if (target <= READ_ONCE(sk->sk_rcvbuf)) + return true; + + rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); + if (READ_ONCE(sk->sk_rcvbuf) >= rmem2 || + (sk->sk_userlocks & SOCK_RCVBUF_LOCK) || + tcp_under_memory_pressure(sk) || + sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) + return false; + + WRITE_ONCE(sk->sk_rcvbuf, + min_t(int, rmem2, + max_t(int, READ_ONCE(sk->sk_rcvbuf), target))); + + return target <= READ_ONCE(sk->sk_rcvbuf); +} + /* 4. Recalculate window clamp after socket hit its memory bounds. 
*/ static void tcp_clamp_window(struct sock *sk) { @@ -785,14 +817,14 @@ static void tcp_clamp_window(struct sock *sk) icsk->icsk_ack.quick = 0; rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); - if (sk->sk_rcvbuf < rmem2 && + if (READ_ONCE(sk->sk_rcvbuf) < rmem2 && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { WRITE_ONCE(sk->sk_rcvbuf, min(atomic_read(&sk->sk_rmem_alloc), rmem2)); } - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) + if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 57a2a6daaad3..53781cf591d2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3375,13 +3375,24 @@ u32 __tcp_select_window(struct sock *sk) * scaled window will not line up with the MSS boundary anyway. */ if (tp->rx_opt.rcv_wscale) { + int rcv_wscale = 1 << tp->rx_opt.rcv_wscale; + window = free_space; /* Advertise enough space so that it won't get scaled away. - * Import case: prevent zero window announcement if + * Important case: prevent zero-window announcement if * 1<<rcv_wscale > mss. */ - window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale)); + window = ALIGN(window, rcv_wscale); + + /* Back any scale-quantization slack before we expose it. + * Otherwise tcp_can_ingest() can reject data which is still + * within the sender-visible window. + */ + if (window > free_space && + (!tcp_rcvbuf_grow_allowed(sk) || + !tcp_try_grow_rcvbuf(sk, tcp_space_from_win(sk, window)))) + window = round_down(free_space, rcv_wscale); } else { window = tp->rcv_wnd; /* Get the largest window that is a nice multiple of mss. -- 2.43.0
