From: Wesley Atwell <[email protected]> Track the maximum sender-visible receive-window right edge separately from the live rwnd, along with the scaling basis that was in force when that larger window was advertised.
This gives later admission and restore paths enough information to reason about retracted windows without losing the original sender-visible bound. Signed-off-by: Wesley Atwell <[email protected]> --- .../networking/net_cachelines/tcp_sock.rst | 1 + include/linux/tcp.h | 1 + include/net/tcp.h | 21 ++++++++++++++++++- net/ipv4/tcp.c | 1 + net/ipv4/tcp_fastopen.c | 2 +- net/ipv4/tcp_input.c | 4 ++-- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 2 +- 8 files changed, 28 insertions(+), 6 deletions(-) diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 09ece1c59c2d..d58a3b1eb55d 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -11,6 +11,7 @@ Type Name fastpath_tx_access fastpa struct inet_connection_sock inet_conn u16 tcp_header_len read_mostly read_mostly tcp_bound_to_half_wnd,tcp_current_mss(tx);tcp_rcv_established(rx) u16 gso_segs read_mostly tcp_xmit_size_goal +u8 rcv_mwnd_scaling_ratio read_write read_mostly tcp_init_max_rcv_wnd_seq,tcp_update_max_rcv_wnd_seq,tcp_repair_set_window,do_tcp_getsockopt u8 rcv_wnd_scaling_ratio read_write read_mostly tcp_set_rcv_wnd,tcp_can_ingest,tcp_repair_set_window,do_tcp_getsockopt __be32 pred_flags read_write read_mostly tcp_select_window(tx);tcp_rcv_established(rx) u64 bytes_received read_write tcp_rcv_nxt_update(rx) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2ace563d59d6..e5d7a65ac439 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -297,6 +297,7 @@ struct tcp_sock { est_ecnfield:2,/* ECN field for AccECN delivered estimates */ accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */ prev_ecnfield:2; /* ECN bits from the previous segment */ + u8 rcv_mwnd_scaling_ratio; /* 0 if unknown, else tp->rcv_mwnd_seq basis */ u8 rcv_wnd_scaling_ratio; /* 0 if unknown, else tp->rcv_wnd basis */ __be32 pred_flags; u64 tcp_clock_cache; /* 
cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 6fa7cdb0979e..fc22ab6b80d5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -947,13 +947,21 @@ static inline u32 tcp_max_receive_window(const struct tcp_sock *tp) return (u32) win; } +static inline void tcp_init_max_rcv_wnd_seq(struct tcp_sock *tp) +{ + tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd; + tp->rcv_mwnd_scaling_ratio = tp->rcv_wnd_scaling_ratio; +} + /* Check if we need to update the maximum receive window sequence number */ static inline void tcp_update_max_rcv_wnd_seq(struct tcp_sock *tp) { u32 wre = tp->rcv_wup + tp->rcv_wnd; - if (after(wre, tp->rcv_mwnd_seq)) + if (after(wre, tp->rcv_mwnd_seq)) { tp->rcv_mwnd_seq = wre; + tp->rcv_mwnd_scaling_ratio = tp->rcv_wnd_scaling_ratio; + } } /* Choose a new window, without checks for shrinking, and without @@ -1766,6 +1774,16 @@ static inline bool tcp_space_from_rcv_wnd(const struct tcp_sock *tp, int win, space); } +/* Same as tcp_space_from_rcv_wnd(), but for the remembered maximum + * sender-visible receive window. + */ +static inline bool tcp_space_from_rcv_mwnd(const struct tcp_sock *tp, int win, + int *space) +{ + return tcp_space_from_wnd_snapshot(tp->rcv_mwnd_scaling_ratio, win, + space); +} + /* Assume a 50% default for skb->len/skb->truesize ratio. * This may be adjusted later in tcp_measure_rcv_mss(). 
*/ @@ -1776,6 +1794,7 @@ static inline void tcp_scaling_ratio_init(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); tp->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; + tp->rcv_mwnd_scaling_ratio = TCP_DEFAULT_SCALING_RATIO; tp->rcv_wnd_scaling_ratio = TCP_DEFAULT_SCALING_RATIO; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0383ee8d3b78..66706dbb90f5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5275,6 +5275,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_mwnd_scaling_ratio); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd_scaling_ratio); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_mwnd_seq); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 4e389d609f91..56113cf2a165 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -377,7 +377,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; tp->rcv_wup = tp->rcv_nxt; - tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd; + tcp_init_max_rcv_wnd_seq(tp); /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. 
*/ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b8e65e31255e..352f814a4ff6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6902,7 +6902,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, */ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; - tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd; + tcp_init_max_rcv_wnd_seq(tp); /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. @@ -7015,7 +7015,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; - tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd; + tcp_init_max_rcv_wnd_seq(tp); /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1c02c9cd13fe..85bd9580caf9 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -604,7 +604,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; tcp_set_rcv_wnd(newtp, req->rsk_rcv_wnd); - newtp->rcv_mwnd_seq = newtp->rcv_wup + req->rsk_rcv_wnd; + tcp_init_max_rcv_wnd_seq(newtp); newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0b082726d7c4..57a2a6daaad3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4171,7 +4171,7 @@ static void tcp_connect_init(struct sock *sk) else tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; - tp->rcv_mwnd_seq = tp->rcv_nxt + tp->rcv_wnd; + tcp_init_max_rcv_wnd_seq(tp); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); -- 2.43.0
