From: Wesley Atwell <[email protected]>

If receive-side accounting retracts the live rwnd below a larger
sender-visible window that was already advertised, allow one in-order
skb within that historical bound to repair its backing and reach the
normal receive path.

Hard receive-memory admission is still enforced through the existing
prune and collapse path. The rescue only changes how data already
inside sender-visible sequence space is classified and backed.

Signed-off-by: Wesley Atwell <[email protected]>
---
 net/ipv4/tcp_input.c | 92 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 86 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d76e4e4c0e57..4b9309c37e99 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5376,24 +5376,86 @@ static void tcp_ofo_queue(struct sock *sk)
 static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
 static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
 
+/* Sequence checks run against the sender-visible receive window before this
+ * point. If later receive-side accounting retracts the live receive window
+ * below the maximum right edge we already advertised, allow one in-order skb
+ * which still fits inside that sender-visible bound to reach the normal
+ * receive queue path.
+ *
+ * Keep receive-memory admission itself on the legacy hard-cap path so prune
+ * and collapse behavior stay aligned with the established retracted-window
+ * handling.
+ */
+static bool tcp_skb_in_retracted_window(const struct tcp_sock *tp,
+                                       const struct sk_buff *skb)
+{
+       u32 live_end = tp->rcv_nxt + tcp_receive_window(tp);
+       u32 max_end = tp->rcv_nxt + tcp_max_receive_window(tp);
+
+       return after(max_end, live_end) &&
+              after(TCP_SKB_CB(skb)->end_seq, live_end) &&
+              !after(TCP_SKB_CB(skb)->end_seq, max_end);
+}
+
 static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb)
 {
-       unsigned int rmem = atomic_read(&sk->sk_rmem_alloc);
+       return tcp_rmem_used(sk) <= READ_ONCE(sk->sk_rcvbuf);
+}
+
+/* Caller already established that @skb extends into the retracted-but-still-
+ * valid sender-visible window. For in-order progress, regrow sk_rcvbuf before
+ * falling into prune/forced-mem handling.
+ *
+ * This path intentionally repairs backing for one in-order skb that is already
+ * within sender-visible sequence space, rather than treating it like ordinary
+ * receive-buffer autotuning.
+ *
+ * Keep this rescue bounded to the span accepted by this skb instead of the
+ * full historical tp->rcv_mwnd_seq. However, never grow below skb->truesize,
+ * because sk_rmem_schedule() still charges hard memory, not sender-visible
+ * window bytes.
+ */
+static void tcp_try_grow_retracted_skb(struct sock *sk,
+                                      const struct sk_buff *skb)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       int needed = skb->truesize;
+       int span_space;
+       u32 span_win;
+
+       if (TCP_SKB_CB(skb)->seq != tp->rcv_nxt)
+               return;
+
+       span_win = TCP_SKB_CB(skb)->end_seq - tp->rcv_nxt;
+       if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+               span_win--;
+
+       if (tcp_space_from_rcv_mwnd(tp, span_win, &span_space))
+               needed = max_t(int, needed, span_space);
 
-       return rmem <= sk->sk_rcvbuf;
+       tcp_try_grow_rcvbuf(sk, needed);
 }
 
+/* Sender-visible window rescue does not relax hard receive-memory admission.
+ * If growth did not make room, fall back to the established prune/collapse
+ * path.
+ */
 static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb,
                                 unsigned int size)
 {
-       if (!tcp_can_ingest(sk, skb) ||
-           !sk_rmem_schedule(sk, skb, size)) {
+       bool can_ingest = tcp_can_ingest(sk, skb);
+       bool scheduled = can_ingest && sk_rmem_schedule(sk, skb, size);
+
+       if (!scheduled) {
+               int pruned = tcp_prune_queue(sk, skb);
 
-               if (tcp_prune_queue(sk, skb) < 0)
+               if (pruned < 0)
                        return -1;
 
                while (!sk_rmem_schedule(sk, skb, size)) {
-                       if (!tcp_prune_ofo_queue(sk, skb))
+                       bool pruned_ofo = tcp_prune_ofo_queue(sk, skb);
+
+                       if (!pruned_ofo)
                                return -1;
                }
        }
@@ -5629,6 +5691,7 @@ void tcp_data_ready(struct sock *sk)
 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 {
        struct tcp_sock *tp = tcp_sk(sk);
+       bool retracted;
        enum skb_drop_reason reason;
        bool fragstolen;
        int eaten;
@@ -5647,6 +5710,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
        }
        tcp_cleanup_skb(skb);
        __skb_pull(skb, tcp_hdr(skb)->doff * 4);
+       retracted = skb->len && tcp_skb_in_retracted_window(tp, skb);
 
        reason = SKB_DROP_REASON_NOT_SPECIFIED;
        tp->rx_opt.dsack = 0;
@@ -5667,6 +5731,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
                            (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
                                goto queue_and_out;
 
+                       if (retracted)
+                               goto queue_and_out;
+
                        reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
                        goto out_of_window;
@@ -5674,7 +5741,20 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 
                /* Ok. In sequence. In window. */
 queue_and_out:
+               if (unlikely(retracted))
+                       tcp_try_grow_retracted_skb(sk, skb);
+
                if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+                       /* If the live rwnd collapsed to zero while rescuing an
+                        * skb that still fit in sender-visible sequence space,
+                        * report zero-window rather than generic proto-mem.
+                        */
+                       if (unlikely(!tcp_receive_window(tp) && retracted)) {
+                               reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
+                               NET_INC_STATS(sock_net(sk),
+                                             LINUX_MIB_TCPZEROWINDOWDROP);
+                               goto out_of_window;
+                       }
                        /* TODO: maybe ratelimit these WIN 0 ACK ? */
                        inet_csk(sk)->icsk_ack.pending |=
                                        (ICSK_ACK_NOMEM | ICSK_ACK_NOW);
-- 
2.43.0


Reply via email to