From: Wesley Atwell <[email protected]>

If receive-side accounting retracts the live rwnd below a larger sender-visible window that was already advertised, allow one in-order skb within that historical bound to repair its backing and reach the normal receive path.
Hard receive-memory admission is still enforced through the existing prune and collapse path. The rescue only changes how data already inside sender-visible sequence space is classified and backed. Signed-off-by: Wesley Atwell <[email protected]> --- net/ipv4/tcp_input.c | 92 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 86 insertions(+), 6 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d76e4e4c0e57..4b9309c37e99 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5376,24 +5376,86 @@ static void tcp_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb); static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb); +/* Sequence checks run against the sender-visible receive window before this + * point. If later receive-side accounting retracts the live receive window + * below the maximum right edge we already advertised, allow one in-order skb + * which still fits inside that sender-visible bound to reach the normal + * receive queue path. + * + * Keep receive-memory admission itself on the legacy hard-cap path so prune + * and collapse behavior stay aligned with the established retracted-window + * handling. + */ +static bool tcp_skb_in_retracted_window(const struct tcp_sock *tp, + const struct sk_buff *skb) +{ + u32 live_end = tp->rcv_nxt + tcp_receive_window(tp); + u32 max_end = tp->rcv_nxt + tcp_max_receive_window(tp); + + return after(max_end, live_end) && + after(TCP_SKB_CB(skb)->end_seq, live_end) && + !after(TCP_SKB_CB(skb)->end_seq, max_end); +} + static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb) { - unsigned int rmem = atomic_read(&sk->sk_rmem_alloc); + return tcp_rmem_used(sk) <= READ_ONCE(sk->sk_rcvbuf); +} + +/* Caller already established that @skb extends into the retracted-but-still- + * valid sender-visible window. 
For in-order progress, regrow sk_rcvbuf before + * falling into prune/forced-mem handling. + * + * This path intentionally repairs backing for one in-order skb that is already + * within sender-visible sequence space, rather than treating it like ordinary + * receive-buffer autotuning. + * + * Keep this rescue bounded to the span accepted by this skb instead of the + * full historical tp->rcv_mwnd_seq. However, never grow below skb->truesize, + * because sk_rmem_schedule() still charges hard memory, not sender-visible + * window bytes. + */ +static void tcp_try_grow_retracted_skb(struct sock *sk, + const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + int needed = skb->truesize; + int span_space; + u32 span_win; + + if (TCP_SKB_CB(skb)->seq != tp->rcv_nxt) + return; + + span_win = TCP_SKB_CB(skb)->end_seq - tp->rcv_nxt; + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + span_win--; + + if (tcp_space_from_rcv_mwnd(tp, span_win, &span_space)) + needed = max_t(int, needed, span_space); - return rmem <= sk->sk_rcvbuf; + tcp_try_grow_rcvbuf(sk, needed); } +/* Sender-visible window rescue does not relax hard receive-memory admission. + * If growth did not make room, fall back to the established prune/collapse + * path. 
+ */ static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb, unsigned int size) { - if (!tcp_can_ingest(sk, skb) || - !sk_rmem_schedule(sk, skb, size)) { + bool can_ingest = tcp_can_ingest(sk, skb); + bool scheduled = can_ingest && sk_rmem_schedule(sk, skb, size); + + if (!scheduled) { + int pruned = tcp_prune_queue(sk, skb); - if (tcp_prune_queue(sk, skb) < 0) + if (pruned < 0) return -1; while (!sk_rmem_schedule(sk, skb, size)) { - if (!tcp_prune_ofo_queue(sk, skb)) + bool pruned_ofo = tcp_prune_ofo_queue(sk, skb); + + if (!pruned_ofo) return -1; } } @@ -5629,6 +5691,7 @@ void tcp_data_ready(struct sock *sk) static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + bool retracted; enum skb_drop_reason reason; bool fragstolen; int eaten; @@ -5647,6 +5710,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } tcp_cleanup_skb(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); + retracted = skb->len && tcp_skb_in_retracted_window(tp, skb); reason = SKB_DROP_REASON_NOT_SPECIFIED; tp->rx_opt.dsack = 0; @@ -5667,6 +5731,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) goto queue_and_out; + if (retracted) + goto queue_and_out; + reason = SKB_DROP_REASON_TCP_ZEROWINDOW; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; @@ -5674,7 +5741,20 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) /* Ok. In sequence. In window. */ queue_and_out: + if (unlikely(retracted)) + tcp_try_grow_retracted_skb(sk, skb); + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + /* If the live rwnd collapsed to zero while rescuing an + * skb that still fit in sender-visible sequence space, + * report zero-window rather than generic proto-mem. 
+ */ + if (unlikely(!tcp_receive_window(tp) && retracted)) { + reason = SKB_DROP_REASON_TCP_ZEROWINDOW; + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPZEROWINDOWDROP); + goto out_of_window; + } /* TODO: maybe ratelimit these WIN 0 ACK ? */ inet_csk(sk)->icsk_ack.pending |= (ICSK_ACK_NOMEM | ICSK_ACK_NOW); -- 2.43.0
