The branch main has been updated by rrs:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=62ce18fc9a8e46ea72ce3a106e7b0cd1ad6a712b

commit 62ce18fc9a8e46ea72ce3a106e7b0cd1ad6a712b
Author:     Randall Stewart <[email protected]>
AuthorDate: 2022-08-23 13:17:05 +0000
Commit:     Randall Stewart <[email protected]>
CommitDate: 2022-08-23 13:17:05 +0000

    tcp: Rack rwnd collapse.
    
    Currently, when the peer collapses its rwnd, we mark packets to be
    retransmitted and use the must_retran flag, as we do when the PMTU
    shrinks, to retransmit the collapsed packets. However, this causes a
    problem with some middle boxes that play with the rwnd to control flow.
    As soon as the rwnd increases we start resending, possibly less than an
    RTT later, and in fact the peer may already have received the packets,
    which means we gratuitously retransmit packets we should not.
    
    The fix here is to make sure that a RACK timer period (the reorder
    threshold) has passed before retransmitting the packets. This makes sure
    that the rwnd collapse was real and the packets actually do need
    retransmission.
    
    Reviewed by: tuexen
    Sponsored by: Netflix Inc
    Differential Revision: https://reviews.freebsd.org/D35166
---
 sys/netinet/tcp_log_buf.h         |   4 +-
 sys/netinet/tcp_stacks/rack.c     | 403 ++++++++++++++++++++++++++------------
 sys/netinet/tcp_stacks/tcp_rack.h |  19 +-
 3 files changed, 300 insertions(+), 126 deletions(-)

diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
index 1290a8ce6b29..c11757099c5d 100644
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -236,7 +236,9 @@ enum tcp_log_events {
        TCP_LOG_FSB,            /* FSB information 63 */
        RACK_DSACK_HANDLING,    /* Handling of DSACK in rack for reordering 
window 64 */
        TCP_HYSTART,            /* TCP Hystart logging 65 */
-       TCP_LOG_END             /* End (keep at end)                66 */
+       TCP_CHG_QUERY,          /* Change query during fnc_init() 66 */
+       TCP_RACK_LOG_COLLAPSE,  /* Window collapse by peer 67 */
+       TCP_LOG_END             /* End (keep at end)                68 */
 };
 
 enum tcp_log_states {
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 0c91c9c6703f..ea370fe9247c 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -385,6 +385,9 @@ counter_u64_t rack_move_some;
 
 counter_u64_t rack_input_idle_reduces;
 counter_u64_t rack_collapsed_win;
+counter_u64_t rack_collapsed_win_seen;
+counter_u64_t rack_collapsed_win_rxt;
+counter_u64_t rack_collapsed_win_rxt_bytes;
 counter_u64_t rack_try_scwnd;
 counter_u64_t rack_hw_pace_init_fail;
 counter_u64_t rack_hw_pace_lost;
@@ -790,6 +793,9 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
                counter_u64_zero(rack_move_some);
                counter_u64_zero(rack_try_scwnd);
                counter_u64_zero(rack_collapsed_win);
+               counter_u64_zero(rack_collapsed_win_rxt);
+               counter_u64_zero(rack_collapsed_win_seen);
+               counter_u64_zero(rack_collapsed_win_rxt_bytes);
        }
        rack_clear_counter = 0;
        return (0);
@@ -1757,12 +1763,31 @@ rack_init_sysctls(void)
            OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
            &rack_input_idle_reduces,
            "Total number of idle reductions on input");
+       rack_collapsed_win_seen = counter_u64_alloc(M_WAITOK);
+       SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+           SYSCTL_CHILDREN(rack_counters),
+           OID_AUTO, "collapsed_win_seen", CTLFLAG_RD,
+           &rack_collapsed_win_seen,
+           "Total number of collapsed window events seen (where our window 
shrinks)");
+
        rack_collapsed_win = counter_u64_alloc(M_WAITOK);
        SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_counters),
            OID_AUTO, "collapsed_win", CTLFLAG_RD,
            &rack_collapsed_win,
-           "Total number of collapsed windows");
+           "Total number of collapsed window events where we mark packets");
+       rack_collapsed_win_rxt = counter_u64_alloc(M_WAITOK);
+       SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+           SYSCTL_CHILDREN(rack_counters),
+           OID_AUTO, "collapsed_win_rxt", CTLFLAG_RD,
+           &rack_collapsed_win_rxt,
+           "Total number of packets that were retransmitted");
+       rack_collapsed_win_rxt_bytes = counter_u64_alloc(M_WAITOK);
+       SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+           SYSCTL_CHILDREN(rack_counters),
+           OID_AUTO, "collapsed_win_bytes", CTLFLAG_RD,
+           &rack_collapsed_win_rxt_bytes,
+           "Total number of bytes that were retransmitted");
        rack_try_scwnd = counter_u64_alloc(M_WAITOK);
        SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_counters),
@@ -2772,6 +2797,9 @@ rack_counter_destroy(void)
        counter_u64_free(rack_sack_splits);
        counter_u64_free(rack_input_idle_reduces);
        counter_u64_free(rack_collapsed_win);
+       counter_u64_free(rack_collapsed_win_rxt);
+       counter_u64_free(rack_collapsed_win_rxt_bytes);
+       counter_u64_free(rack_collapsed_win_seen);
        counter_u64_free(rack_try_scwnd);
        counter_u64_free(rack_persists_sends);
        counter_u64_free(rack_persists_acks);
@@ -5295,7 +5323,9 @@ activate_rxt:
                goto activate_rxt;
        }
        /* Convert from ms to usecs */
-       if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= 
DUP_ACK_THRESHOLD)) {
+       if ((rsm->r_flags & RACK_SACK_PASSED) ||
+           (rsm->r_flags & RACK_RWND_COLLAPSED) ||
+           (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
                if ((tp->t_flags & TF_SENTFIN) &&
                    ((tp->snd_max - tp->snd_una) == 1) &&
                    (rsm->r_flags & RACK_HAS_FIN)) {
@@ -5757,7 +5787,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb 
*tp, uint32_t cts,
                 * real pacing. And the tlp or rxt is smaller
                 * than the pacing calculation. Lets not
                 * pace that long since we know the calculation
-                * so far is not accurate. 
+                * so far is not accurate.
                 */
                slot = hpts_timeout;
        }
@@ -6501,7 +6531,7 @@ rack_remxt_tmr(struct tcpcb *tp)
                trsm = rsm;
                if (rsm->r_flags & RACK_ACKED)
                        rsm->r_flags |= RACK_WAS_ACKED;
-               rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | 
RACK_WAS_SACKPASS);
+               rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | 
RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED);
                rsm->r_flags |= RACK_MUST_RXT;
        }
        /* Clear the count (we just un-acked them) */
@@ -8040,6 +8070,13 @@ rack_log_sack_passed(struct tcpcb *tp,
                         */
                        continue;
                }
+               if (nrsm->r_flags & RACK_RWND_COLLAPSED) {
+                       /*
+                        * If the peer dropped the rwnd on
+                        * these then we don't worry about them.
+                        */
+                       continue;
+               }
                if (nrsm->r_flags & RACK_SACK_PASSED) {
                        /*
                         * We found one that is already marked
@@ -9797,7 +9834,7 @@ rack_strike_dupack(struct tcp_rack *rack)
                        /* Sendmap entries that are marked to
                         * be retransmitted do not need dupack's
                         * struck. We get these marks for a number
-                        * of reasons (rxt timeout with no sack, 
+                        * of reasons (rxt timeout with no sack,
                         * mtu change, or rwnd collapses). When
                         * these events occur, we know we must retransmit
                         * them and mark the sendmap entries. Dupack counting
@@ -10308,47 +10345,83 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, 
struct socket *so,
        return (0);
 }
 
+
 static void
-rack_collapsed_window(struct tcp_rack *rack)
+rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, 
uint32_t out, int line,
+                 int dir, uint32_t flags, struct rack_sendmap *rsm)
+{
+       if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+               union tcp_log_stackspecific log;
+               struct timeval tv;
+
+               memset(&log, 0, sizeof(log));
+               log.u_bbr.flex1 = cnt;
+               log.u_bbr.flex2 = split;
+               log.u_bbr.flex3 = out;
+               log.u_bbr.flex4 = line;
+               log.u_bbr.flex5 = rack->r_must_retran;
+               log.u_bbr.flex6 = flags;
+               log.u_bbr.flex7 = rack->rc_has_collapsed;
+               log.u_bbr.flex8 = dir;  /*
+                                        * 1 is collapsed, 0 is uncollapsed,
+                                        * 2 is log of a rsm being marked, 3 is 
a split.
+                                        */
+               if (rsm == NULL)
+                       log.u_bbr.rttProp = 0;
+               else
+                       log.u_bbr.rttProp = (uint64_t)rsm;
+               log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+               log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, 
rack->r_ctl.rc_sacked);
+               TCP_LOG_EVENTP(rack->rc_tp, NULL,
+                   &rack->rc_inp->inp_socket->so_rcv,
+                   &rack->rc_inp->inp_socket->so_snd,
+                   TCP_RACK_LOG_COLLAPSE, 0,
+                   0, &log, false, &tv);
+       }
+}
+
+static void
+rack_collapsed_window(struct tcp_rack *rack, uint32_t out, int line)
 {
        /*
-        * Now we must walk the
-        * send map and divide the
-        * ones left stranded. These
-        * guys can't cause us to abort
-        * the connection and are really
-        * "unsent". However if a buggy
-        * client actually did keep some
-        * of the data i.e. collapsed the win
-        * and refused to ack and then opened
-        * the win and acked that data. We would
-        * get into an ack war, the simplier
-        * method then of just pretending we
-        * did not send those segments something
-        * won't work.
+        * Here all we do is mark the collapsed point and set the flag.
+        * This may happen again and again, but there is no
+        * sense splitting our map until we know where the
+        * peer finally lands in the collapse.
         */
-       struct rack_sendmap *rsm, *nrsm, fe;
+       rack_trace_point(rack, RACK_TP_COLLAPSED_WND);
+       if ((rack->rc_has_collapsed == 0) ||
+           (rack->r_ctl.last_collapse_point != (rack->rc_tp->snd_una + 
rack->rc_tp->snd_wnd)))
+               counter_u64_add(rack_collapsed_win_seen, 1);
+       rack->r_ctl.last_collapse_point = rack->rc_tp->snd_una + 
rack->rc_tp->snd_wnd;
+       rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max;
+       rack->rc_has_collapsed = 1;
+       rack->r_collapse_point_valid = 1;
+       rack_log_collapse(rack, 0, 0, rack->r_ctl.last_collapse_point, line, 1, 
0, NULL);
+}
+
+static void
+rack_un_collapse_window(struct tcp_rack *rack, int line)
+{
+       struct rack_sendmap *nrsm, *rsm, fe;
+       int cnt = 0, split = 0;
 #ifdef INVARIANTS
        struct rack_sendmap *insret;
 #endif
-       tcp_seq max_seq;
 
-       rack_trace_point(rack, RACK_TP_COLLAPSED_WND);
-       max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
        memset(&fe, 0, sizeof(fe));
-       fe.r_start = max_seq;
-       /* Find the first seq past or at maxseq */
+       rack->rc_has_collapsed = 0;
+       fe.r_start = rack->r_ctl.last_collapse_point;
        rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
        if (rsm == NULL) {
-               /* Nothing to do strange */
-               rack->rc_has_collapsed = 0;
+               /* Nothing to do maybe the peer ack'ed it all */
+               rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), 
line, 0, 0, NULL);
                return;
        }
-       /*
-        * Now do we need to split at
-        * the collapse point?
-        */
-       if (SEQ_GT(max_seq, rsm->r_start)) {
+       /* Now do we need to split this one? */
+       if (SEQ_GT(rack->r_ctl.last_collapse_point, rsm->r_start)) {
+               rack_log_collapse(rack, rsm->r_start, rsm->r_end,
+                                 rack->r_ctl.last_collapse_point, line, 3, 
rsm->r_flags, rsm);
                nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
                if (nrsm == NULL) {
                        /* We can't get a rsm, mark all? */
@@ -10356,7 +10429,8 @@ rack_collapsed_window(struct tcp_rack *rack)
                        goto no_split;
                }
                /* Clone it */
-               rack_clone_rsm(rack, nrsm, rsm, max_seq);
+               split = 1;
+               rack_clone_rsm(rack, nrsm, rsm, 
rack->r_ctl.last_collapse_point);
 #ifndef INVARIANTS
                (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
 #else
@@ -10366,7 +10440,8 @@ rack_collapsed_window(struct tcp_rack *rack)
                              nrsm, insret, rack, rsm);
                }
 #endif
-               rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 
max_seq, __LINE__);
+               rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT,
+                                rack->r_ctl.last_collapse_point, __LINE__);
                if (rsm->r_in_tmap) {
                        TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, 
r_tnext);
                        nrsm->r_in_tmap = 1;
@@ -10378,38 +10453,15 @@ rack_collapsed_window(struct tcp_rack *rack)
                rsm = nrsm;
        }
 no_split:
-       counter_u64_add(rack_collapsed_win, 1);
        RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) {
                nrsm->r_flags |= RACK_RWND_COLLAPSED;
+               rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, 
nrsm->r_flags, nrsm);
+               cnt++;
        }
-       rack->rc_has_collapsed = 1;
-}
-
-static void
-rack_un_collapse_window(struct tcp_rack *rack)
-{
-       struct rack_sendmap *rsm;
-       int cnt = 0;;
-
-       rack->r_ctl.rc_out_at_rto = 0;
-       rack->r_ctl.rc_snd_max_at_rto = rack->rc_tp->snd_una;
-       RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
-               if (rsm->r_flags & RACK_RWND_COLLAPSED) {
-                       rsm->r_flags &= ~RACK_RWND_COLLAPSED;
-                       rsm->r_flags |= RACK_MUST_RXT;
-                       if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) 
{
-                               rack->r_ctl.rc_snd_max_at_rto = rsm->r_end;
-                               rack->r_ctl.rc_out_at_rto += (rsm->r_end - 
rsm->r_start);
-                       }
-                       cnt++;
-               }
-               else
-                       break;
-       }
-       rack->rc_has_collapsed = 0;
        if (cnt) {
-               rack->r_must_retran = 1;
+               counter_u64_add(rack_collapsed_win, 1);
        }
+       rack_log_collapse(rack, cnt, split, ctf_outstanding(rack->rc_tp), line, 
0, 0, NULL);
 }
 
 static void
@@ -10518,9 +10570,12 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, 
struct socket *so,
        }
        if (tp->snd_wnd < ctf_outstanding(tp))
                /* The peer collapsed the window */
-               rack_collapsed_window(rack);
+               rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__);
        else if (rack->rc_has_collapsed)
-               rack_un_collapse_window(rack);
+               rack_un_collapse_window(rack, __LINE__);
+       if ((rack->r_collapse_point_valid) &&
+           (SEQ_GT(th->th_ack, rack->r_ctl.high_collapse_point)))
+               rack->r_collapse_point_valid = 0;
        /* Was persist timer active and now we have window space? */
        if ((rack->rc_in_persist != 0) &&
            (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
@@ -11076,10 +11131,12 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, 
struct socket *so,
         */
        if (tp->snd_wnd < ctf_outstanding(tp)) {
                /* The peer collapsed the window */
-               rack_collapsed_window(rack);
+               rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__);
        } else if (rack->rc_has_collapsed)
-               rack_un_collapse_window(rack);
-
+               rack_un_collapse_window(rack, __LINE__);
+       if ((rack->r_collapse_point_valid) &&
+           (SEQ_GT(tp->snd_una, rack->r_ctl.high_collapse_point)))
+               rack->r_collapse_point_valid = 0;
        /*
         * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
         */
@@ -13066,13 +13123,6 @@ rack_do_win_updates(struct tcpcb *tp, struct tcp_rack 
*rack, uint32_t tiwin, uin
                /* Not a valid win update */
                return;
        }
-       if (tp->snd_wnd > tp->max_sndwnd)
-               tp->max_sndwnd = tp->snd_wnd;
-       if (tp->snd_wnd < (tp->snd_max - high_seq)) {
-               /* The peer collapsed the window */
-               rack_collapsed_window(rack);
-       } else if (rack->rc_has_collapsed)
-               rack_un_collapse_window(rack);
        /* Do we exit persists? */
        if ((rack->rc_in_persist != 0) &&
            (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
@@ -13609,6 +13659,15 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, 
struct socket *so, struct mb
 #ifdef TCP_ACCOUNTING
        ts_val = get_cyclecount();
 #endif
+       /* Tend to any collapsed window */
+       if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - 
high_seq))) {
+               /* The peer collapsed the window */
+               rack_collapsed_window(rack, (tp->snd_max - high_seq), __LINE__);
+       } else if (rack->rc_has_collapsed)
+               rack_un_collapse_window(rack, __LINE__);
+       if ((rack->r_collapse_point_valid) &&
+           (SEQ_GT(high_seq, rack->r_ctl.high_collapse_point)))
+               rack->r_collapse_point_valid = 0;
        acked_amount = acked = (high_seq - tp->snd_una);
        if (acked) {
                /*
@@ -15930,6 +15989,11 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack 
*rack, struct rack_sendma
        if (tp->t_logstate != TCP_LOG_STATE_OFF) {
                union tcp_log_stackspecific log;
 
+               if (rsm->r_flags & RACK_RWND_COLLAPSED) {
+                       rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, 
__LINE__, 5, rsm->r_flags, rsm);
+                       counter_u64_add(rack_collapsed_win_rxt, 1);
+                       counter_u64_add(rack_collapsed_win_rxt_bytes, 
(rsm->r_end - rsm->r_start));
+               }
                memset(&log.u_bbr, 0, sizeof(log.u_bbr));
                log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
                if (rack->rack_no_prr)
@@ -16538,6 +16602,58 @@ failed:
        return (-1);
 }
 
+static struct rack_sendmap *
+rack_check_collapsed(struct tcp_rack *rack, uint32_t cts)
+{
+       struct rack_sendmap *rsm = NULL;
+       struct rack_sendmap fe;
+       int thresh;
+
+restart:
+       fe.r_start = rack->r_ctl.last_collapse_point;
+       rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
+       if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) {
+               /* Nothing, strange turn off validity  */
+               rack->r_collapse_point_valid = 0;
+               return (NULL);
+       }
+       /* Can we send it yet? */
+       if (rsm->r_end > (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)) {
+               /*
+                * Receiver window has not grown enough for
+                * the segment to be put on the wire.
+                */
+               return (NULL);
+       }
+       if (rsm->r_flags & RACK_ACKED) {
+               /*
+                * It has been sacked, lets move to the
+                * next one if possible.
+                */
+               rack->r_ctl.last_collapse_point = rsm->r_end;
+               /* Are we done? */
+               if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
+                           rack->r_ctl.high_collapse_point)) {
+                       rack->r_collapse_point_valid = 0;
+                       return (NULL);
+               }
+               goto restart;
+       }
+       /* Now has it been long enough ? */
+       thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), 
cts);
+       if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > 
thresh) {
+               rack_log_collapse(rack, rsm->r_start,
+                                 (cts - 
((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
+                                 thresh, __LINE__, 6, rsm->r_flags, rsm);
+               return (rsm);
+       }
+       /* Not enough time */
+       rack_log_collapse(rack, rsm->r_start,
+                         (cts - 
((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
+                         thresh, __LINE__, 7, rsm->r_flags, rsm);
+       return (NULL);
+}
+
 static int
 rack_output(struct tcpcb *tp)
 {
@@ -16598,7 +16714,6 @@ rack_output(struct tcpcb *tp)
        struct ip6_hdr *ip6 = NULL;
        int32_t isipv6;
 #endif
-       uint8_t filled_all = 0;
        bool hw_tls = false;
 
        /* setup and take the cache hits here */
@@ -16863,6 +16978,29 @@ again:
                sb_offset = rsm->r_start - tp->snd_una;
                if (len >= segsiz)
                        len = segsiz;
+       } else if (rack->r_collapse_point_valid &&
+                  ((rsm = rack_check_collapsed(rack, cts)) != NULL)) {
+               /*
+                * If an RSM is returned then enough time has passed
+                * for us to retransmit it. Move up the collapse point,
+                * since this rsm has its chance to retransmit now.
+                */
+               rack_trace_point(rack, RACK_TP_COLLAPSED_RXT);
+               rack->r_ctl.last_collapse_point = rsm->r_end;
+               /* Are we done? */
+               if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
+                           rack->r_ctl.high_collapse_point))
+                       rack->r_collapse_point_valid = 0;
+               sack_rxmit = 1;
+               /* We are not doing a TLP */
+               doing_tlp = 0;
+               len = rsm->r_end - rsm->r_start;
+               sb_offset = rsm->r_start - tp->snd_una;
+               sendalot = 0;
+               if ((rack->full_size_rxt == 0) &&
+                   (rack->shape_rxt_to_pacing_min == 0) &&
+                   (len >= segsiz))
+                       len = segsiz;
        } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) {
                /* We have a retransmit that takes precedence */
                if ((!IN_FASTRECOVERY(tp->t_flags)) &&
@@ -16921,53 +17059,72 @@ again:
        }
        if (rack->r_must_retran &&
            (doing_tlp == 0) &&
+           (SEQ_GT(tp->snd_max, tp->snd_una)) &&
            (rsm == NULL)) {
                /*
-                * Non-Sack and we had a RTO or Sack/non-Sack and a
-                * MTU change, we need to retransmit until we reach
-                * the former snd_max (rack->r_ctl.rc_snd_max_at_rto).
+                * There are two different ways that we
+                * can get into this block:
+                * a) This is a non-sack connection, we had a time-out
+                *    and thus r_must_retran was set and everything
+                *    left outstanding as been marked for retransmit.
+                * b) The MTU of the path shrank, so that everything
+                *    was marked to be retransmitted with the smaller
+                *    mtu and r_must_retran was set.
+                *
+                * This means that we expect the sendmap (outstanding)
+                * to all be marked must. We can use the tmap to
+                * look at them.
+                *
                 */
-               if (SEQ_GT(tp->snd_max, tp->snd_una)) {
-                       int sendwin, flight;
-
-                       sendwin = min(tp->snd_wnd, tp->snd_cwnd);
-                       flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
-                       if (flight >= sendwin) {
-                               so = inp->inp_socket;
-                               sb = &so->so_snd;
-                               goto just_return_nolock;
-                       }
-                       rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
-                       if (rsm == NULL) {
-                               /* TSNH */
-                               rack->r_must_retran = 0;
-                               rack->r_ctl.rc_out_at_rto = 0;
-                               so = inp->inp_socket;
-                               sb = &so->so_snd;
-                               goto just_return_nolock;
-                       }
-                       if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
-                               /* It does not have the flag, we are done */
-                               rack->r_must_retran = 0;
-                               rack->r_ctl.rc_out_at_rto = 0;
-                       } else {
-                               sack_rxmit = 1;
-                               len = rsm->r_end - rsm->r_start;
-                               sendalot = 0;
-                               sb_offset = rsm->r_start - tp->snd_una;
-                               if (len >= segsiz)
-                                       len = segsiz;
-                               /*
-                                * Delay removing the flag RACK_MUST_RXT so
-                                * that the fastpath for retransmit will
-                                * work with this rsm.
-                                */
+               int sendwin, flight;
 
-                       }
-               } else {
-                       /* We must be done if there is nothing outstanding */
+               sendwin = min(tp->snd_wnd, tp->snd_cwnd);
+               flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
+               if (flight >= sendwin) {
+                       /*
+                        * We can't send yet.
+                        */
+                       so = inp->inp_socket;
+                       sb = &so->so_snd;
+                       goto just_return_nolock;
+               }
+               /*
+                * This is the case a/b mentioned above. All
+                * outstanding/not-acked should be marked.
+                * We can use the tmap to find them.
+                */
+               rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+               if (rsm == NULL) {
+                       /* TSNH */
+                       rack->r_must_retran = 0;
+                       rack->r_ctl.rc_out_at_rto = 0;
+                       so = inp->inp_socket;
+                       sb = &so->so_snd;
+                       goto just_return_nolock;
+               }
+               if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
+                       /*
+                        * The first one does not have the flag, did we collapse
+                        * further up in our list?
+                        */
                        rack->r_must_retran = 0;
                        rack->r_ctl.rc_out_at_rto = 0;
+                       rsm = NULL;
+                       sack_rxmit = 0;
+               } else {
+                       sack_rxmit = 1;
+                       len = rsm->r_end - rsm->r_start;
+                       sb_offset = rsm->r_start - tp->snd_una;
+                       sendalot = 0;
+                       if ((rack->full_size_rxt == 0) &&
+                           (rack->shape_rxt_to_pacing_min == 0) &&
+                           (len >= segsiz))
+                               len = segsiz;
+                       /*
+                        * Delay removing the flag RACK_MUST_RXT so
+                        * that the fastpath for retransmit will
+                        * work with this rsm.
+                        */
                }
        }
        /*
@@ -18177,7 +18334,7 @@ send:
                                if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
                                ((rsm == NULL) ? hw_tls : 0)
 #ifdef NETFLIX_COPY_ARGS
-                               , &filled_all
+                               , &s_mb, &s_moff
 #endif
                                );
                        if (len <= (tp->t_maxseg - optlen)) {
@@ -18548,15 +18705,17 @@ send:
                log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
                log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
                log.u_bbr.flex4 = orig_len;
-               if (filled_all)
-                       log.u_bbr.flex5 = 0x80000000;
-               else
-                       log.u_bbr.flex5 = 0;
                /* Save off the early/late values */
                log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
                log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
                log.u_bbr.bw_inuse = rack_get_bw(rack);
-               if (rsm || sack_rxmit) {
+               log.u_bbr.flex8 = 0;
+               if (rsm) {
+                       if (rsm->r_flags & RACK_RWND_COLLAPSED) {
+                               rack_log_collapse(rack, rsm->r_start, 
rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
+                               counter_u64_add(rack_collapsed_win_rxt, 1);
+                               counter_u64_add(rack_collapsed_win_rxt_bytes, 
(rsm->r_end - rsm->r_start));
+                       }
                        if (doing_tlp)
                                log.u_bbr.flex8 = 2;
                        else
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h 
b/sys/netinet/tcp_stacks/tcp_rack.h
index e8560446b798..c747ceac7628 100644
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -275,7 +275,7 @@ struct rack_opts_stats {
  * non-zero, the default is 4 for continuous tracing.
  * You also set in the number of connections you want
  * have get BB logs in net.inet.tcp.<stack>.tp.count.
- * 
+ *
  * Count will decrement every time BB logging is assigned
  * to a connection that hit your tracepoint.
  *
@@ -291,6 +291,7 @@ struct rack_opts_stats {
 #define RACK_TP_HWENOBUF       0x00000001      /* When we are doing hardware 
pacing and hit enobufs */
 #define RACK_TP_ENOBUF         0x00000002      /* When we hit enobufs with 
software pacing */
 #define RACK_TP_COLLAPSED_WND  0x00000003      /* When a peer to collapses its 
rwnd on us */
+#define RACK_TP_COLLAPSED_RXT  0x00000004      /* When we actually retransmit 
a collapsed window rsm */
 
 #define MIN_GP_WIN 6   /* We need at least 6 MSS in a GP measurement */
 #ifdef _KERNEL
@@ -472,6 +473,8 @@ struct rack_control {
        uint32_t roundends;             /* acked value above which round ends */
        uint32_t num_dsack;             /* Count of dsack's seen  (1 per 
window)*/
        uint32_t forced_ack_ts;
+       uint32_t last_collapse_point;   /* Last point peer collapsed too */
+       uint32_t high_collapse_point;
        uint32_t rc_lower_rtt_us_cts;   /* Time our GP rtt was last lowered */
        uint32_t rc_time_probertt_entered;
        uint32_t rc_time_probertt_starts;
@@ -546,7 +549,15 @@ struct tcp_rack {
        struct inpcb *rc_inp;   /* The inpcb Lock(a) */
        uint8_t rc_free_cnt;    /* Number of free entries on the rc_free list
                                 * Lock(a) */
-       uint8_t client_bufferlvl;       /* 0 - 5 normaly, less than or at 2 
means its real low */
+       uint8_t client_bufferlvl : 4, /* Expected range [0,5]: 0=unset, 
1=low/empty */
+               rack_deferred_inited : 1,
+               /* 
******************************************************************** */
+               /* Note for details of next two fields see 
rack_init_retransmit_rate()  */
+               /* 
******************************************************************** */
+               full_size_rxt: 1,
+               shape_rxt_to_pacing_min : 1,
+               /* 
******************************************************************** */
+               spare : 1;
        uint8_t no_prr_addback : 1,
                gp_ready : 1,
                defer_options: 1,
@@ -647,7 +658,9 @@ struct tcp_rack {
                r_late : 1,
                r_wanted_output: 1,
                r_rr_config : 2,
-               rc_avail_bit : 3;
+               r_persist_lt_bw_off : 1,
+               r_collapse_point_valid : 1,
+               rc_avail_bit : 2;
        uint16_t rc_init_win : 8,
                rc_gp_rtt_set : 1,
                rc_gp_dyn_mul : 1,

Reply via email to