In Linux 3.17 and earlier, tcp_init_congestion_ops (i.e. tcp_reno) is
used as the ca_ops during the 3WHS; after the 3WHS, ca_ops is assigned
to the default congestion control set by sysctl, and its parameters
stored in icsk_ca_priv[] are initialized immediately afterwards.
Commit 55d8694fa82c ("net: tcp: assign tcp cong_ops when tcp sk is
created") splits assignment and initialization into two steps:
assignment is done before the SYN or SYN-ACK is sent out;
initialization is done after the 3WHS (assuming no fastopen). But this
can cause out-of-order invocation of ca_ops functions other than
.init() during the 3WHS, as they may be called before their parameters
are initialized. This may cause unexpected behavior in congestion
controls, and causes trouble for those that need dynamic object
allocation, such as tcp_cdg.
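
The window can be reproduced outside the kernel; below is a minimal
userspace C sketch (all names are invented for illustration, this is
not kernel code) in which the ops pointer is assigned before the
"handshake" but ->init() only runs after it, so an early callback reads
zeroed private state:

  #include <stdio.h>
  #include <string.h>

  /* Stand-ins for icsk_ca_priv[] and tcp_congestion_ops. */
  struct ca_priv { unsigned int prior_snd_una, prior_rcv_nxt, dctcp_alpha; };

  struct ca_ops {
      void (*init)(struct ca_priv *ca);
      unsigned int (*ssthresh)(struct ca_priv *ca);
  };

  static void demo_init(struct ca_priv *ca)
  {
      ca->dctcp_alpha = 1024;        /* stands in for DCTCP_MAX_ALPHA */
  }

  static unsigned int demo_ssthresh(struct ca_priv *ca)
  {
      return ca->dctcp_alpha;        /* relies on ->init() having run */
  }

  int main(void)
  {
      struct ca_ops ops = { demo_init, demo_ssthresh };
      struct ca_priv ca;

      memset(&ca, 0, sizeof(ca));    /* icsk_ca_priv[] starts zeroed */
      /* ops already assigned, but ->init() deferred "until after 3WHS" */
      printf("before init: %u\n", ops.ssthresh(&ca));   /* prints 0 */
      ops.init(&ca);
      printf("after init:  %u\n", ops.ssthresh(&ca));   /* prints 1024 */
      return 0;
  }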

We used tcp_dctcp as an example to demonstrate the problem, setting it
as the default congestion control via sysctl. Three parameters
(ca->prior_snd_una, ca->prior_rcv_nxt, ca->dctcp_alpha) were monitored
when functions such as dctcp_update_alpha() and dctcp_ssthresh() were
called during the 3WHS. All three were found to be zero, which would be
impossible had dctcp_init(), where those three parameters are
initialized, already been called. Several other congestion controls
were examined as well, and the same problem was reproduced.
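
For reference, the assignments in question look roughly like this
(paraphrased from dctcp_init() in net/ipv4/tcp_dctcp.c, not a verbatim
excerpt); until they run, all three fields keep the zeroes left by
socket allocation:

  const struct tcp_sock *tp = tcp_sk(sk);
  struct dctcp *ca = inet_csk_ca(sk);

  ca->prior_snd_una = tp->snd_una;
  ca->prior_rcv_nxt = tp->rcv_nxt;
  ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);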

This patch checks the icsk_ca_initialized flag before a ca_ops callback
is invoked, and allows the call only if initialization has been done.
.ssthresh() is a special case: tcp_reno_ssthresh() is called instead
while the congestion control is uninitialized. .get_info() is still
always allowed, as that is an expected case mentioned in commit
55d8694fa82c ("net: tcp: assign tcp cong_ops when tcp sk is created").
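
.get_info() can run on a not-yet-initialized congestion control
because userspace may query it at any point in a connection's life,
e.g. via inet_diag or the TCP_CC_INFO socket option. A minimal
userspace sketch of such a query, assuming fd is an already-created
TCP socket and a glibc recent enough to define TCP_CC_INFO:

  #include <stdio.h>
  #include <sys/socket.h>
  #include <netinet/in.h>
  #include <netinet/tcp.h>        /* TCP_CC_INFO, on recent glibc */
  #include <linux/inet_diag.h>    /* union tcp_cc_info */

  static void dump_cc_info(int fd)
  {
      union tcp_cc_info info;
      socklen_t len = sizeof(info);

      /* ends up in ca_ops->get_info(), possibly before ->init() has run */
      if (getsockopt(fd, IPPROTO_TCP, TCP_CC_INFO, &info, &len) == 0)
          printf("got %u bytes of CC info\n", (unsigned int)len);
      else
          perror("TCP_CC_INFO");
  }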

Fixes: 55d8694fa82c ("net: tcp: assign tcp cong_ops when tcp sk is created")
Signed-off-by: Ji Li <j...@akamai.com>
---
 include/net/inet_connection_sock.h |  3 ++-
 include/net/tcp.h                  |  4 ++--
 net/ipv4/tcp_cong.c                |  4 +++-
 net/ipv4/tcp_input.c               | 21 +++++++++++++++------
 4 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 49dcad4..933d217 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -100,7 +100,8 @@ struct inet_connection_sock {
        const struct tcp_congestion_ops *icsk_ca_ops;
        const struct inet_connection_sock_af_ops *icsk_af_ops;
        unsigned int              (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
-       __u8                      icsk_ca_state:6,
+       __u8                      icsk_ca_state:5,
+                                 icsk_ca_initialized:1,
                                  icsk_ca_setsockopt:1,
                                  icsk_ca_dst_locked:1;
        __u8                      icsk_retransmits;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0bcc70f..4c26e1c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -934,7 +934,7 @@ static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
 {
        struct inet_connection_sock *icsk = inet_csk(sk);

-       if (icsk->icsk_ca_ops->set_state)
+       if (icsk->icsk_ca_ops->set_state && icsk->icsk_ca_initialized)
                icsk->icsk_ca_ops->set_state(sk, ca_state);
        icsk->icsk_ca_state = ca_state;
 }
@@ -943,7 +943,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
 {
        const struct inet_connection_sock *icsk = inet_csk(sk);

-       if (icsk->icsk_ca_ops->cwnd_event)
+       if (icsk->icsk_ca_ops->cwnd_event && icsk->icsk_ca_initialized)
                icsk->icsk_ca_ops->cwnd_event(sk, event);
 }

diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 882caa4..dd1de39 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -185,6 +185,7 @@ void tcp_init_congestion_control(struct sock *sk)

        if (icsk->icsk_ca_ops->init)
                icsk->icsk_ca_ops->init(sk);
+       icsk->icsk_ca_initialized = 1;
        if (tcp_ca_needs_ecn(sk))
                INET_ECN_xmit(sk);
        else
@@ -209,8 +210,9 @@ void tcp_cleanup_congestion_control(struct sock *sk)
 {
        struct inet_connection_sock *icsk = inet_csk(sk);

-       if (icsk->icsk_ca_ops->release)
+       if (icsk->icsk_ca_ops->release && icsk->icsk_ca_initialized)
                icsk->icsk_ca_ops->release(sk);
+       icsk->icsk_ca_initialized = 0;
        module_put(icsk->icsk_ca_ops->owner);
 }

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 42bf89a..62f65dc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1878,6 +1878,14 @@ static inline void tcp_init_undo(struct tcp_sock *tp)
        tp->undo_retrans = tp->retrans_out ? : -1;
 }

+static inline u32 tcp_ca_ssthresh(struct sock *sk)
+{
+       if (inet_csk(sk)->icsk_ca_initialized)
+               return inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+       else
+               return tcp_reno_ssthresh(sk);
+}
+
 /* Enter Loss state. If we detect SACK reneging, forget all SACK information
  * and reset tags completely, otherwise preserve SACKs. If receiver
  * dropped its ofo queue, we will know this due to reneging detection.
@@ -1896,7 +1904,7 @@ void tcp_enter_loss(struct sock *sk)
            !after(tp->high_seq, tp->snd_una) ||
            (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
                tp->prior_ssthresh = tcp_current_ssthresh(sk);
-               tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+               tp->snd_ssthresh = tcp_ca_ssthresh(sk);
                tcp_ca_event(sk, CA_EVENT_LOSS);
                tcp_init_undo(tp);
        }
@@ -2362,7 +2370,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
        if (tp->prior_ssthresh) {
                const struct inet_connection_sock *icsk = inet_csk(sk);

-               if (icsk->icsk_ca_ops->undo_cwnd)
+               if (icsk->icsk_ca_ops->undo_cwnd && icsk->icsk_ca_initialized)
                        tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
                else
                        tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
@@ -2467,7 +2475,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
        tp->prior_cwnd = tp->snd_cwnd;
        tp->prr_delivered = 0;
        tp->prr_out = 0;
-       tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+       tp->snd_ssthresh = tcp_ca_ssthresh(sk);
        tcp_ecn_queue_cwr(tp);
 }

@@ -3248,7 +3256,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                tcp_rearm_rto(sk);
        }

-       if (icsk->icsk_ca_ops->pkts_acked) {
+       if (icsk->icsk_ca_ops->pkts_acked && icsk->icsk_ca_initialized) {
                struct ack_sample sample = { .pkts_acked = pkts_acked,
                                             .rtt_us = ca_rtt_us };

@@ -3335,7 +3343,8 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
        if (tcp_in_cwnd_reduction(sk)) {
                /* Reduce cwnd if state mandates */
                tcp_cwnd_reduction(sk, acked_sacked, flag);
-       } else if (tcp_may_raise_cwnd(sk, flag)) {
+       } else if (tcp_may_raise_cwnd(sk, flag) &&
+                  inet_csk(sk)->icsk_ca_initialized) {
                /* Advance cwnd if state allows */
                tcp_cong_avoid(sk, ack, acked_sacked);
        }
@@ -3545,7 +3554,7 @@ static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
 {
        const struct inet_connection_sock *icsk = inet_csk(sk);

-       if (icsk->icsk_ca_ops->in_ack_event)
+       if (icsk->icsk_ca_ops->in_ack_event && icsk->icsk_ca_initialized)
                icsk->icsk_ca_ops->in_ack_event(sk, flags);
 }


--
2.8.2
