tp->rcv_wnd is an advertised window, but later receive-side accounting
needs to recover the hard memory budget that window represented when it
was exposed.

Prepare for that by storing the scaling basis alongside tp->rcv_wnd and
centralizing the helper API around the paired state. While here, make the
existing receive-memory arithmetic use the shared helper names so later
behavioral changes can build on one explicit accounting model.

This patch is groundwork only. Later patches will refresh the snapshot at
window write sites and consume it in the receive-memory paths.

Signed-off-by: Wesley Atwell <[email protected]>
---
 .../networking/net_cachelines/tcp_sock.rst    |  1 +
 include/linux/tcp.h                           |  1 +
 include/net/tcp.h                             | 79 +++++++++++++++++--
 net/ipv4/tcp.c                                |  1 +
 4 files changed, 76 insertions(+), 6 deletions(-)

diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst
index 563daea10d6c..1415981b9d8a 100644
--- a/Documentation/networking/net_cachelines/tcp_sock.rst
+++ b/Documentation/networking/net_cachelines/tcp_sock.rst
@@ -12,6 +12,7 @@ struct inet_connection_sock   inet_conn
 u16                           tcp_header_len          read_mostly         read_mostly         tcp_bound_to_half_wnd,tcp_current_mss(tx);tcp_rcv_established(rx)
 u16                           gso_segs                read_mostly                             tcp_xmit_size_goal
 __be32                        pred_flags              read_write          read_mostly         tcp_select_window(tx);tcp_rcv_established(rx)
+u8                            rcv_wnd_scaling_ratio   read_write          read_mostly         tcp_set_rcv_wnd,tcp_can_ingest,tcp_clamp_window
 u64                           bytes_received                              read_write          tcp_rcv_nxt_update(rx)
 u32                           segs_in                                     read_write          tcp_v6_rcv(rx)
 u32                           data_segs_in                                read_write          tcp_v6_rcv(rx)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index f72eef31fa23..ec6b70c1174b 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -297,6 +297,7 @@ struct tcp_sock {
                est_ecnfield:2,/* ECN field for AccECN delivered estimates */
                accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */
                prev_ecnfield:2; /* ECN bits from the previous segment */
+       u8      rcv_wnd_scaling_ratio; /* 0 if unknown, else tp->rcv_wnd basis */
        __be32  pred_flags;
        u64     tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
        u64     tcp_mstamp;     /* most recent packet received/sent */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 978eea2d5df0..187e6d660f62 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1702,6 +1702,26 @@ static inline int tcp_space_from_win(const struct sock *sk, int win)
        return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win);
 }
 
+static inline bool tcp_rcv_wnd_snapshot_valid(const struct tcp_sock *tp)
+{
+       return tp->rcv_wnd_scaling_ratio != 0;
+}
+
+/* Rebuild hard receive-memory units for data already covered by tp->rcv_wnd if
+ * the advertise-time basis is known. Legacy TCP_REPAIR restores can only
+ * recover tp->rcv_wnd itself; callers must fall back when the snapshot is
+ * unknown.
+ */
+static inline bool tcp_space_from_rcv_wnd(const struct tcp_sock *tp, int win,
+                                         int *space)
+{
+       if (!tcp_rcv_wnd_snapshot_valid(tp))
+               return false;
+
+       *space = __tcp_space_from_win(tp->rcv_wnd_scaling_ratio, win);
+       return true;
+}
+
 /* Assume a 50% default for skb->len/skb->truesize ratio.
  * This may be adjusted later in tcp_measure_rcv_mss().
  */
@@ -1709,15 +1729,62 @@ static inline int tcp_space_from_win(const struct sock *sk, int win)
 
 static inline void tcp_scaling_ratio_init(struct sock *sk)
 {
-       tcp_sk(sk)->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       tp->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
+       tp->rcv_wnd_scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
+}
+
+/* tp->rcv_wnd is paired with the scaling_ratio that was in force when that
+ * window was last advertised. Legacy TCP_REPAIR restores can only recover the
+ * window value itself and use a zero snapshot until a fresh local window
+ * advertisement refreshes the pair.
+ */
+static inline void tcp_set_rcv_wnd_snapshot(struct tcp_sock *tp, u32 win,
+                                           u8 scaling_ratio)
+{
+       tp->rcv_wnd = win;
+       tp->rcv_wnd_scaling_ratio = scaling_ratio;
+}
+
+static inline void tcp_set_rcv_wnd(struct tcp_sock *tp, u32 win)
+{
+       tcp_set_rcv_wnd_snapshot(tp, win, tp->scaling_ratio);
+}
+
+static inline void tcp_set_rcv_wnd_unknown(struct tcp_sock *tp, u32 win)
+{
+       tcp_set_rcv_wnd_snapshot(tp, win, 0);
+}
+
+/* TCP receive-side accounting reuses sk_rcvbuf as both a hard memory limit
+ * and as the source material for the advertised receive window after
+ * scaling_ratio conversion. Keep the byte accounting explicit so admission,
+ * pruning, and rwnd selection all start from the same quantities.
+ */
+static inline int tcp_rmem_used(const struct sock *sk)
+{
+       return atomic_read(&sk->sk_rmem_alloc);
+}
+
+static inline int tcp_rmem_avail(const struct sock *sk)
+{
+       return READ_ONCE(sk->sk_rcvbuf) - tcp_rmem_used(sk);
+}
+
+/* Sender-visible rwnd headroom also reserves bytes already queued on backlog.
+ * Those bytes are not free to advertise again until __release_sock() drains
+ * backlog and clears sk_backlog.len.
+ */
+static inline int tcp_rwnd_avail(const struct sock *sk)
+{
+       return tcp_rmem_avail(sk) - READ_ONCE(sk->sk_backlog.len);
 }
 
 /* Note: caller must be prepared to deal with negative returns */
 static inline int tcp_space(const struct sock *sk)
 {
-       return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) -
-                                 READ_ONCE(sk->sk_backlog.len) -
-                                 atomic_read(&sk->sk_rmem_alloc));
+       return tcp_win_from_space(sk, tcp_rwnd_avail(sk));
 }
 
 static inline int tcp_full_space(const struct sock *sk)
@@ -1760,7 +1827,7 @@ static inline bool tcp_rmem_pressure(const struct sock *sk)
        rcvbuf = READ_ONCE(sk->sk_rcvbuf);
        threshold = rcvbuf - (rcvbuf >> 3);
 
-       return atomic_read(&sk->sk_rmem_alloc) > threshold;
+       return tcp_rmem_used(sk) > threshold;
 }
 
 static inline bool tcp_epollin_ready(const struct sock *sk, int target)
@@ -1910,7 +1977,7 @@ static inline void tcp_fast_path_check(struct sock *sk)
 
        if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
            tp->rcv_wnd &&
-           atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
+           tcp_rmem_avail(sk) > 0 &&
            !tp->urg_data)
                tcp_fast_path_on(tp);
 }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 202a4e57a218..cec9ae1bf875 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5238,6 +5238,7 @@ static void __init tcp_struct_check(void)
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
+       CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd_scaling_ratio);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);
-- 
2.34.1


Reply via email to