The branch main has been updated by rrs:

URL: https://cgit.FreeBSD.org/src/commit/?id=26cbd0028c508787e1d88361f345ac707acabce5

commit 26cbd0028c508787e1d88361f345ac707acabce5
Author:     Randall Stewart <r...@freebsd.org>
AuthorDate: 2021-11-11 11:35:51 +0000
Commit:     Randall Stewart <r...@freebsd.org>
CommitDate: 2021-11-11 11:35:51 +0000

    tcp: Rack may still calculate long RTT on persists probes.
    
    When a persist probe is lost, we end up calculating a long RTT that
    runs from the send of the initial probe to the response elicited by
    the second (or third, etc.) probe, and that incorrect sample is
    applied with a full confidence level of 3. This commit changes the
    behavior so that we have one of two options:
    a) Just not count the RTT of probes where we had a loss
    <or>
    b) Still count them, but degrade the confidence to 0.
    
    The default here is to not measure them at all, but I am open to
    having the default be otherwise.
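    
    For illustration only, a minimal standalone sketch of the two options
    (not the committed code): rack_apply_rtt_with_reduced_conf and
    probe_not_answered mirror the sysctl and flag added in the diff below,
    while probe_rtt_sample() is a made-up stand-in for the real
    rack_handle_probe_response()/tcp_rack_xmit_timer() path; the
    confidence values (3 vs. 0) match the ones passed there:
    
        #include <stdint.h>
        #include <stdio.h>
        
        /* Mirrors the new sysctl; the default is to skip lossy probe samples. */
        static int rack_apply_rtt_with_reduced_conf = 0;
        
        static void
        probe_rtt_sample(int probe_not_answered, uint32_t us_rtt)
        {
            if (probe_not_answered == 0) {
                /* Every probe was answered: use the sample at full confidence. */
                printf("apply %u us RTT, confidence 3\n", (unsigned)us_rtt);
            } else if (rack_apply_rtt_with_reduced_conf) {
                /* Option b: a probe was lost, apply the sample at confidence 0. */
                printf("apply %u us RTT, confidence 0\n", (unsigned)us_rtt);
            } else {
                /* Option a (the default): discard the inflated measurement. */
                printf("skip RTT sample\n");
            }
        }
        
        int
        main(void)
        {
            probe_rtt_sample(0, 40000);   /* clean probe/response exchange */
            probe_rtt_sample(1, 250000);  /* a lost probe inflated the RTT */
            return (0);
        }
    
    With the patch applied, the knob should appear as apply_rtt_with_low_conf
    under the rack stack's misc sysctl node (the exact OID path depends on
    where rack_misc is rooted on the system).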
    
    Reviewed by: Michael Tuexen
    Sponsored by: Netflix Inc.
    Differential Revision: https://reviews.freebsd.org/D32897
---
 sys/netinet/tcp_stacks/rack.c     | 163 ++++++++++++++++++++++++++++++++------
 sys/netinet/tcp_stacks/tcp_rack.h |   4 +-
 sys/netinet/tcp_subr.c            |  25 +++++-
 3 files changed, 163 insertions(+), 29 deletions(-)

diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index b1927f0a17e4..7769aa1272c0 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -205,6 +205,7 @@ static int32_t rack_hw_up_only = 1;
 static int32_t rack_stats_gets_ms_rtt = 1;
 static int32_t rack_prr_addbackmax = 2;
 static int32_t rack_do_hystart = 0;
+static int32_t rack_apply_rtt_with_reduced_conf = 0;
 
 static int32_t rack_pkt_delay = 1000;
 static int32_t rack_send_a_lot_in_prr = 1;
@@ -343,6 +344,10 @@ counter_u64_t rack_saw_enetunreach;
 counter_u64_t rack_per_timer_hole;
 counter_u64_t rack_large_ackcmp;
 counter_u64_t rack_small_ackcmp;
+counter_u64_t rack_persists_sends;
+counter_u64_t rack_persists_acks;
+counter_u64_t rack_persists_loss;
+counter_u64_t rack_persists_lost_ends;
 #ifdef INVARIANTS
 counter_u64_t rack_adjust_map_bw;
 #endif
@@ -772,6 +777,10 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
                counter_u64_zero(rack_per_timer_hole);
                counter_u64_zero(rack_large_ackcmp);
                counter_u64_zero(rack_small_ackcmp);
+               counter_u64_zero(rack_persists_sends);
+               counter_u64_zero(rack_persists_acks);
+               counter_u64_zero(rack_persists_loss);
+               counter_u64_zero(rack_persists_lost_ends);
 #ifdef INVARIANTS
                counter_u64_zero(rack_adjust_map_bw);
 #endif
@@ -1412,6 +1421,11 @@ rack_init_sysctls(void)
            &rack_tcp_accounting, 0,
            "Should we turn on TCP accounting for all rack sessions?");
 #endif
+       SYSCTL_ADD_S32(&rack_sysctl_ctx,
+           SYSCTL_CHILDREN(rack_misc),
+           OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW,
+           &rack_apply_rtt_with_reduced_conf, 0,
+           "When a persist or keep-alive probe is not answered do we calculate 
rtt on subsequent answers?");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_misc),
            OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW,
@@ -1774,6 +1788,30 @@ rack_init_sysctls(void)
            OID_AUTO, "cmp_large_mbufs", CTLFLAG_RD,
            &rack_large_ackcmp,
            "Number of TCP connections with large mbuf's for compressed acks");
+       rack_persists_sends = counter_u64_alloc(M_WAITOK);
+       SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+           SYSCTL_CHILDREN(rack_counters),
+           OID_AUTO, "persist_sends", CTLFLAG_RD,
+           &rack_persists_sends,
+           "Number of times we sent a persist probe");
+       rack_persists_acks = counter_u64_alloc(M_WAITOK);
+       SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+           SYSCTL_CHILDREN(rack_counters),
+           OID_AUTO, "persist_acks", CTLFLAG_RD,
+           &rack_persists_acks,
+           "Number of times a persist probe was acked");
+       rack_persists_loss = counter_u64_alloc(M_WAITOK);
+       SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+           SYSCTL_CHILDREN(rack_counters),
+           OID_AUTO, "persist_loss", CTLFLAG_RD,
+           &rack_persists_loss,
+           "Number of times we detected a lost persist probe (no ack)");
+       rack_persists_lost_ends = counter_u64_alloc(M_WAITOK);
+       SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+           SYSCTL_CHILDREN(rack_counters),
+           OID_AUTO, "persist_loss_ends", CTLFLAG_RD,
+           &rack_persists_lost_ends,
+           "Number of lost persist probe (no ack) that the run ended with a 
PERSIST abort");
        rack_small_ackcmp = counter_u64_alloc(M_WAITOK);
        SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_counters),
@@ -2938,6 +2976,10 @@ rack_counter_destroy(void)
        counter_u64_free(rack_per_timer_hole);
        counter_u64_free(rack_large_ackcmp);
        counter_u64_free(rack_small_ackcmp);
+       counter_u64_free(rack_persists_sends);
+       counter_u64_free(rack_persists_acks);
+       counter_u64_free(rack_persists_loss);
+       counter_u64_free(rack_persists_lost_ends);
 #ifdef INVARIANTS
        counter_u64_free(rack_adjust_map_bw);
 #endif
@@ -5623,6 +5665,9 @@ rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
                if (rack->r_ctl.rc_went_idle_time == 0)
                        rack->r_ctl.rc_went_idle_time = 1;
                rack_timer_cancel(tp, rack, cts, __LINE__);
+               rack->r_ctl.persist_lost_ends = 0;
+               rack->probe_not_answered = 0;
+               rack->forced_ack = 0;
                tp->t_rxtshift = 0;
                RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
                              rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
@@ -6494,6 +6539,7 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
                tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
                rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
                tcp_set_inp_to_drop(inp, ETIMEDOUT);
+               counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
                return (1);
        }
        KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
@@ -6515,6 +6561,7 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
                retval = 1;
                tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
                tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
+               counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
                goto out;
        }
        if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
@@ -6531,6 +6578,7 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
                KMOD_TCPSTAT_INC(tcps_persistdrop);
                tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
                tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
+               counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
                goto out;
        }
        t_template = tcpip_maketemplate(rack->rc_inp);
@@ -6539,7 +6587,12 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
                if (rack->forced_ack == 0) {
                        rack->forced_ack = 1;
                        rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
+               } else {
+                       rack->probe_not_answered = 1;
+                       counter_u64_add(rack_persists_loss, 1);
+                       rack->r_ctl.persist_lost_ends++;
                }
+               counter_u64_add(rack_persists_sends, 1);
                tcp_respond(tp, t_template->tt_ipgen,
                            &t_template->tt_t, (struct mbuf *)NULL,
                            tp->rcv_nxt, tp->snd_una - 1, 0);
@@ -6602,6 +6655,8 @@ rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
                        if (rack->forced_ack == 0) {
                                rack->forced_ack = 1;
                                rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
+                       } else {
+                               rack->probe_not_answered = 1;
                        }
                        tcp_respond(tp, t_template->tt_ipgen,
                            &t_template->tt_t, (struct mbuf *)NULL,
@@ -10301,6 +10356,14 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
        INP_WLOCK_ASSERT(tp->t_inpcb);
 
        acked = BYTES_THIS_ACK(tp, th);
+       if (acked) {
+               /* 
+                * Any time we move the cum-ack forward clear
+                * keep-alive tied probe-not-answered. The
+                * persists clears its own on entry.
+                */
+               rack->probe_not_answered = 0;
+       }
        KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
        KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
        /*
@@ -13374,6 +13437,61 @@ rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent
 
 }
 
+static void
+rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts)
+{
+       uint32_t us_rtt;
+       /*
+        * A persist or keep-alive was forced out, update our
+        * min rtt time. Note now worry about lost responses.
+        * When a subsequent keep-alive or persist times out
+        * and forced_ack is still on, then the last probe
+        * was not responded to. In such cases we have a
+        * sysctl that controls the behavior. Either we apply
+        * the rtt but with reduced confidence (0). Or we just
+        * plain don't apply the rtt estimate. Having data flow
+        * will clear the probe_not_answered flag i.e. cum-ack
+        * move forward <or> exiting and reentering persists.
+        */
+
+       rack->forced_ack = 0;
+       rack->rc_tp->t_rxtshift = 0;
+       if ((rack->rc_in_persist &&
+            (tiwin == rack->rc_tp->snd_wnd)) ||
+           (rack->rc_in_persist == 0)) {
+               /*
+                * In persists only apply the RTT update if this is
+                * a response to our window probe. And that
+                * means the rwnd sent must match the current
+                * snd_wnd. If it does not, then we got a
+                * window update ack instead. For keepalive
+                * we allow the answer no matter what the window.
+                *
+                * Note that if the probe_not_answered is set then
+                * the forced_ack_ts is the oldest one i.e. the first
+                * probe sent that might have been lost. This assures
+                * us that if we do calculate an RTT it is longer not
+                * some short thing.
+                */
+               if (rack->rc_in_persist)
+                       counter_u64_add(rack_persists_acks, 1);
+               us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
+               if (us_rtt == 0)
+                       us_rtt = 1;
+               if (rack->probe_not_answered == 0) {
+                       rack_apply_updated_usrtt(rack, us_rtt, us_cts);
+                       tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
+               } else {
+                       /* We have a retransmitted probe here too */
+                       if (rack_apply_rtt_with_reduced_conf) {
+                               rack_apply_updated_usrtt(rack, us_rtt, us_cts);
+                               tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1);
+                       }
+               }
+       }
+}
+
+
 static int
 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
 {
@@ -13483,7 +13601,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
                } else if (SEQ_GT(ae->ack, high_seq)) {
                        /* Case A */
                        ae->ack_val_set = ACK_CUMACK;
-               } else if (tiwin == the_win) {
+               } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){
                        /* Case D */
                        ae->ack_val_set = ACK_DUPACK;
                } else {
@@ -13596,6 +13714,18 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
                        rack_strike_dupack(rack);
                } else if (ae->ack_val_set == ACK_RWND) {
                        /* Case C */
+                       if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
+                               ts.tv_sec = ae->timestamp / 1000000000;
+                               ts.tv_nsec = ae->timestamp % 1000000000;
+                               rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
+                               rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
+                       } else {
+                               rack->r_ctl.act_rcv_time = *tv;
+                       }
+                       if (rack->forced_ack) {
+                               rack_handle_probe_response(rack, tiwin,
+                                                          tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
+                       }
                        win_up_req = 1;
                        win_upd_ack = ae->ack;
                        win_seq = ae->seq;
@@ -13677,6 +13807,11 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
 #endif
        acked_amount = acked = (high_seq - tp->snd_una);
        if (acked) {
+               /* 
+                * Clear the probe not answered flag
+                * since cum-ack moved forward.
+                */
+               rack->probe_not_answered = 0;
                if (rack->sack_attack_disable == 0)
                        rack_do_decay(rack);
                if (acked >= segsiz) {
@@ -14432,31 +14567,7 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
        }
        rack_clear_rate_sample(rack);
        if (rack->forced_ack) {
-               uint32_t us_rtt;
-
-               /*
-                * A persist or keep-alive was forced out, update our
-                * min rtt time. Note we do not worry about lost
-                * retransmissions since KEEP-ALIVES and persists
-                * are usually way long on times of sending (though
-                * if we were really paranoid or worried we could
-                * at least use timestamps if available to validate).
-                */
-               rack->forced_ack = 0;
-               if (tiwin == tp->snd_wnd) {
-                       /*
-                        * Only apply the RTT update if this is
-                        * a response to our window probe. And that
-                        * means the rwnd sent must match the current
-                        * snd_wnd. If it does not, then we got a
-                        * window update ack instead.
-                        */
-                       us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
-                       if (us_rtt == 0)
-                               us_rtt = 1;
-                       rack_apply_updated_usrtt(rack, us_rtt, us_cts);
-               tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
-               }
+               rack_handle_probe_response(rack, tiwin, us_cts);
        }
        /*
         * This is the one exception case where we set the rack state
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
index 0893237e92f9..4b1f8513055d 100644
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -496,6 +496,7 @@ struct rack_control {
        uint32_t challenge_ack_cnt;
        uint32_t rc_min_to;     /* Socket option value Lock(a) */
        uint32_t rc_pkt_delay;  /* Socket option value Lock(a) */
+       uint32_t persist_lost_ends;
        struct newreno rc_saved_beta;   /*
                                         * For newreno cc:
                                         * rc_saved_cc are the values we have had
@@ -567,7 +568,8 @@ struct tcp_rack {
                rc_last_tlp_past_cumack: 1,
                rc_last_sent_tlp_seq_valid: 1,
                rc_last_sent_tlp_past_cumack: 1,
-               avail_bytes : 3;
+               probe_not_answered: 1,
+               avail_bytes : 2;
        uint32_t rc_rack_rtt;   /* RACK-RTT Lock(a) */
        uint16_t r_mbuf_queue : 1,      /* Do we do mbuf queue for non-paced */
                 rtt_limit_mul : 4,     /* muliply this by low rtt */
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index fab225fbc804..4805d6c80327 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1748,6 +1748,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
        struct mbuf *optm;
        struct udphdr *uh = NULL;
        struct tcphdr *nth;
+       struct tcp_log_buffer *lgb;
        u_char *optp;
 #ifdef INET6
        struct ip6_hdr *ip6;
@@ -1756,6 +1757,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
        int optlen, tlen, win, ulen;
        bool incl_opts;
        uint16_t port;
+       int output_ret;
 
        KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
        NET_EPOCH_ASSERT();
@@ -2086,11 +2088,26 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
        TCP_PROBE3(debug__output, tp, th, m);
        if (flags & TH_RST)
                TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth);
+       if ((tp != NULL) && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+               union tcp_log_stackspecific log;
+               struct timeval tv;
+
+               memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+               log.u_bbr.inhpts = tp->t_inpcb->inp_in_hpts;
+               log.u_bbr.ininput = tp->t_inpcb->inp_in_input;
+               log.u_bbr.flex8 = 4;
+               log.u_bbr.pkts_out = tp->t_maxseg;
+               log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+               log.u_bbr.delivered = 0;
+               lgb = tcp_log_event_(tp, nth, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
+                                    0, &log, false, NULL, NULL, 0, &tv);
+                                    0, &log, false, NULL, NULL, 0, &tv);
+       } else
+               lgb = NULL;
 
 #ifdef INET6
        if (isipv6) {
                TCP_PROBE5(send, NULL, tp, ip6, tp, nth);
-               (void)ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
+               output_ret = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
        }
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
@@ -2099,9 +2116,13 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
 #ifdef INET
        {
                TCP_PROBE5(send, NULL, tp, ip, tp, nth);
-               (void)ip_output(m, NULL, NULL, 0, NULL, inp);
+               output_ret = ip_output(m, NULL, NULL, 0, NULL, inp);
        }
 #endif
+       if (lgb) {
+               lgb->tlb_errno = output_ret;
+               lgb = NULL;
+       }
 }
 
 /*
