This patch adds support for setting the initial advertized window from
within a BPF_SOCKET_OPS program. This can be used to support larger
initial cwnd values in environments where it is known to be safe.

Signed-off-by: Lawrence Brakmo <bra...@fb.com>
---
 include/net/tcp.h        | 10 ++++++++++
 include/uapi/linux/bpf.h |  4 ++++
 net/ipv4/tcp_minisocks.c |  9 ++++++++-
 net/ipv4/tcp_output.c    |  7 ++++++-
 4 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index a726486..29c27dc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2029,4 +2029,14 @@ static inline u32 tcp_timeout_init(struct sock *sk, bool is_req_sock)
        return timeout;
 }
 
+static inline u32 tcp_rwnd_init_bpf(struct sock *sk, bool is_req_sock)
+{
+       int rwnd;
+
+       rwnd = tcp_call_bpf(sk, is_req_sock, BPF_SOCKET_OPS_RWND_INIT);
+
+       if (rwnd < 0)
+               rwnd = 0;
+       return rwnd;
+}
 #endif /* _TCP_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 039f327..d945336 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -749,6 +749,10 @@ enum {
        BPF_SOCKET_OPS_TIMEOUT_INIT,    /* Should return SYN-RTO value to use or
                                         * -1 if default value should be used
                                         */
+       BPF_SOCKET_OPS_RWND_INIT,       /* Should return initial advertized
+                                        * window (in packets) or -1 if default
+                                        * value should be used
+                                        */
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index d30ee31..bbaf3c6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -351,6 +351,7 @@ void tcp_openreq_init_rwin(struct request_sock *req,
        int full_space = tcp_full_space(sk_listener);
        u32 window_clamp;
        __u8 rcv_wscale;
+       u32 rcv_wnd;
        int mss;
 
        mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
@@ -363,6 +364,12 @@ void tcp_openreq_init_rwin(struct request_sock *req,
            (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
                req->rsk_window_clamp = full_space;
 
+       rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req, true);
+       if (rcv_wnd == 0)
+               rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+       else if (full_space < rcv_wnd * mss)
+               full_space = rcv_wnd * mss;
+
        /* tcp_full_space because it is guaranteed to be the first packet */
        tcp_select_initial_window(full_space,
                mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -370,7 +377,7 @@ void tcp_openreq_init_rwin(struct request_sock *req,
                &req->rsk_window_clamp,
                ireq->wscale_ok,
                &rcv_wscale,
-               dst_metric(dst, RTAX_INITRWND));
+               rcv_wnd);
        ireq->rcv_wscale = rcv_wscale;
 }
 EXPORT_SYMBOL(tcp_openreq_init_rwin);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5e478a1..e5f623f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3267,6 +3267,7 @@ static void tcp_connect_init(struct sock *sk)
        const struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __u8 rcv_wscale;
+       u32 rcv_wnd;
 
        /* We'll fix this up when we get a response from the other end.
         * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
@@ -3300,13 +3301,17 @@ static void tcp_connect_init(struct sock *sk)
            (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
                tp->window_clamp = tcp_full_space(sk);
 
+       rcv_wnd = tcp_rwnd_init_bpf(sk, false);
+       if (rcv_wnd == 0)
+               rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+
        tcp_select_initial_window(tcp_full_space(sk),
                                  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
                                  &tp->rcv_wnd,
                                  &tp->window_clamp,
                                  sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
                                  &rcv_wscale,
-                                 dst_metric(dst, RTAX_INITRWND));
+                                 rcv_wnd);
 
        tp->rx_opt.rcv_wscale = rcv_wscale;
        tp->rcv_ssthresh = tp->rcv_wnd;
-- 
2.9.3

Reply via email to