SACK has been enabled in GENERIC kernels for over a decade, and it's time to make it an official part of the TCP stack. This grows bsd.rd on amd64 by 8k, but Theo said that's within reason. OK?
diff --git sys/conf/GENERIC sys/conf/GENERIC index 87dd069f514..cd68ae9e651 100644 --- sys/conf/GENERIC +++ sys/conf/GENERIC @@ -43,11 +43,10 @@ option MSDOSFS # MS-DOS file system option FIFO # FIFOs; RECOMMENDED #option TMPFS # efficient memory file system option FUSE # FUSE option SOCKET_SPLICE # Socket Splicing for TCP and UDP -option TCP_SACK # Selective Acknowledgements for TCP option TCP_ECN # Explicit Congestion Notification for TCP option TCP_SIGNATURE # TCP MD5 Signatures, for BGP routing sessions #option TCP_FACK # Forward Acknowledgements for TCP option INET6 # IPv6 diff --git sys/netinet/tcp_input.c sys/netinet/tcp_input.c index 52c206f0bf5..9951923bbdb 100644 --- sys/netinet/tcp_input.c +++ sys/netinet/tcp_input.c @@ -852,14 +852,12 @@ findpcb: */ tp->t_rcvtime = tcp_now; if (TCPS_HAVEESTABLISHED(tp->t_state)) TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); -#ifdef TCP_SACK if (tp->sack_enable) tcp_del_sackholes(tp, th); /* Delete stale SACK holes */ -#endif /* TCP_SACK */ /* * Process options. */ #ifdef TCP_SIGNATURE @@ -962,25 +960,23 @@ findpcb: */ if (tp->t_pmtud_mss_acked < acked) tp->t_pmtud_mss_acked = acked; tp->snd_una = th->th_ack; -#if defined(TCP_SACK) || defined(TCP_ECN) /* * We want snd_last to track snd_una so * as to avoid sequence wraparound problems * for very large transfers. */ #ifdef TCP_ECN if (SEQ_GT(tp->snd_una, tp->snd_last)) #endif tp->snd_last = tp->snd_una; -#endif /* TCP_SACK */ -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK tp->snd_fack = tp->snd_una; tp->retran_data = 0; -#endif /* TCP_FACK */ +#endif m_freem(m); /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer @@ -1012,15 +1008,13 @@ findpcb: /* * This is a pure, in-sequence data packet * with nothing on the reassembly queue and * we have enough buffer space to take it. 
*/ -#ifdef TCP_SACK /* Clean receiver SACK report if present */ if (tp->sack_enable && tp->rcv_numsacks) tcp_clean_sackreport(tp); -#endif /* TCP_SACK */ tcpstat_inc(tcps_preddat); tp->rcv_nxt += tlen; tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen); ND6_HINT(tp); @@ -1137,19 +1131,17 @@ findpcb: /* Reset initial window to 1 segment for retransmit */ if (tp->t_rxtshift > 0) tp->snd_cwnd = tp->t_maxseg; tcp_rcvseqinit(tp); tp->t_flags |= TF_ACKNOW; -#ifdef TCP_SACK /* * If we've sent a SACK_PERMITTED option, and the peer * also replied with one, then TF_SACK_PERMIT should have * been set in tcp_dooptions(). If it was not, disable SACKs. */ if (tp->sack_enable) tp->sack_enable = tp->t_flags & TF_SACK_PERMIT; -#endif #ifdef TCP_ECN /* * if ECE is set but CWR is not set for SYN-ACK, or * both ECE and CWR are set for simultaneous open, * peer is ECN capable. @@ -1569,11 +1561,11 @@ trimthenstep6: * to keep a constant cwnd packets in the * network. */ if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) tp->t_dupacks = 0; -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK /* * In FACK, can enter fast rec. if the receiver * reports a reass. queue longer than 3 segs. */ else if (++tp->t_dupacks == tcprexmtthresh || @@ -1586,34 +1578,31 @@ trimthenstep6: tcp_seq onxt = tp->snd_nxt; u_long win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; -#if defined(TCP_SACK) || defined(TCP_ECN) if (SEQ_LT(th->th_ack, tp->snd_last)){ /* * False fast retx after * timeout. Do not cut window. 
*/ tp->t_dupacks = 0; goto drop; } -#endif if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; -#ifdef TCP_SACK tp->snd_last = tp->snd_max; if (tp->sack_enable) { TCP_TIMER_DISARM(tp, TCPT_REXMT); tp->t_rtttime = 0; #ifdef TCP_ECN tp->t_flags |= TF_SEND_CWR; #endif tcpstat_inc(tcps_cwr_frecovery); tcpstat_inc(tcps_sack_recovery_episode); -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK tp->t_dupacks = tcprexmtthresh; (void) tcp_output(tp); /* * During FR, snd_cwnd is held * constant for FACK. @@ -1628,11 +1617,10 @@ trimthenstep6: tp->snd_cwnd = tp->snd_ssthresh+ tp->t_maxseg * tp->t_dupacks; #endif /* TCP_FACK */ goto drop; } -#endif /* TCP_SACK */ TCP_TIMER_DISARM(tp, TCPT_REXMT); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; #ifdef TCP_ECN @@ -1646,11 +1634,11 @@ trimthenstep6: tp->t_maxseg * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tp->t_dupacks > tcprexmtthresh) { -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK /* * while (awnd < cwnd) * sendsomething(); */ if (tp->sack_enable) { @@ -1676,16 +1664,15 @@ trimthenstep6: } /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. 
*/ -#if defined(TCP_SACK) if (tp->sack_enable) { if (tp->t_dupacks >= tcprexmtthresh) { /* Check for a partial ACK */ if (tcp_sack_partialack(tp, th)) { -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK /* Force call to tcp_output */ if (tp->snd_awnd < tp->snd_cwnd) tp->t_flags |= TF_NEEDOUTPUT; #else tp->snd_cwnd += tp->t_maxseg; @@ -1698,14 +1685,14 @@ trimthenstep6: th->th_ack) < tp->snd_ssthresh) tp->snd_cwnd = tcp_seq_subtract(tp->snd_max, th->th_ack); tp->t_dupacks = 0; -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK if (SEQ_GT(th->th_ack, tp->snd_fack)) tp->snd_fack = th->th_ack; -#endif /* TCP_FACK */ +#endif } } } else { if (tp->t_dupacks >= tcprexmtthresh && !tcp_newreno(tp, th)) { @@ -1719,16 +1706,10 @@ trimthenstep6: tp->t_dupacks = 0; } } if (tp->t_dupacks < tcprexmtthresh) tp->t_dupacks = 0; -#else /* else no TCP_SACK */ - if (tp->t_dupacks >= tcprexmtthresh && - tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; - tp->t_dupacks = 0; -#endif if (SEQ_GT(th->th_ack, tp->snd_max)) { tcpstat_inc(tcps_rcvacktoomuch); goto dropafterack_ratelim; } acked = th->th_ack - tp->snd_una; @@ -1770,14 +1751,13 @@ trimthenstep6: u_int cw = tp->snd_cwnd; u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = incr * incr / cw; -#if defined (TCP_SACK) if (tp->t_dupacks < tcprexmtthresh) -#endif - tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale); + tp->snd_cwnd = ulmin(cw + incr, + TCP_MAXWIN << tp->snd_scale); } ND6_HINT(tp); if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc); @@ -1817,11 +1797,11 @@ trimthenstep6: if (SEQ_GT(tp->snd_una, tp->snd_last)) tp->snd_last = tp->snd_una; #endif if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; -#if defined (TCP_SACK) && defined (TCP_FACK) +#ifdef TCP_FACK if (SEQ_GT(tp->snd_una, tp->snd_fack)) { tp->snd_fack = tp->snd_una; /* Update snd_awnd for partial ACK * without any SACK blocks. 
*/ @@ -1980,14 +1960,13 @@ dodata: /* XXX */ * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (tiflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { -#ifdef TCP_SACK tcp_seq laststart = th->th_seq; tcp_seq lastend = th->th_seq + tlen; -#endif + if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && tp->t_state == TCPS_ESTABLISHED) { TCP_SETUP_ACK(tp, tiflags, m); tp->rcv_nxt += tlen; tiflags = th->th_flags & TH_FIN; @@ -2005,14 +1984,12 @@ dodata: /* XXX */ } else { m_adj(m, hdroptlen); tiflags = tcp_reass(tp, th, m, &tlen); tp->t_flags |= TF_ACKNOW; } -#ifdef TCP_SACK if (tp->sack_enable) tcp_update_sack_list(tp, laststart, lastend); -#endif /* * variable len never referenced again in modern BSD, * so why bother computing it ?? */ @@ -2257,11 +2234,10 @@ tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = oi->ts_val; tp->ts_recent_age = tcp_now; break; -#ifdef TCP_SACK case TCPOPT_SACK_PERMITTED: if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) continue; if (!(th->th_flags & TH_SYN)) continue; @@ -2271,11 +2247,10 @@ tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, tp->t_flags |= TF_SACK_PERMIT; break; case TCPOPT_SACK: tcp_sack_option(tp, th, cp, optlen); break; -#endif #ifdef TCP_SIGNATURE case TCPOPT_SIGNATURE: if (optlen != TCPOLEN_SIGNATURE) continue; @@ -2355,20 +2330,16 @@ tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, #endif /* TCP_SIGNATURE */ return (0); } -#if defined(TCP_SACK) u_long tcp_seq_subtract(u_long a, u_long b) { return ((long)(a - b)); } -#endif - -#ifdef TCP_SACK /* * This function is called upon receipt of new valid data (while not in header * prediction mode), and it updates the ordered list of sacks. 
*/ void @@ -2504,15 +2475,15 @@ tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) tmp_cp += TCPOLEN_SACK; if (SEQ_LEQ(sack.end, sack.start)) continue; /* bad SACK fields */ if (SEQ_LEQ(sack.end, tp->snd_una)) continue; /* old block */ -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK /* Updates snd_fack. */ if (SEQ_GT(sack.end, tp->snd_fack)) tp->snd_fack = sack.end; -#endif /* TCP_FACK */ +#endif if (SEQ_GT(th->th_ack, tp->snd_una)) { if (SEQ_LT(sack.start, th->th_ack)) continue; } if (SEQ_GT(sack.end, tp->snd_max)) @@ -2557,20 +2528,20 @@ tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) cur = cur->next; continue; } if (SEQ_LEQ(sack.start, cur->start)) { /* Data acks at least the beginning of hole */ -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK if (SEQ_GT(sack.end, cur->rxmit)) tp->retran_data -= tcp_seq_subtract(cur->rxmit, cur->start); else tp->retran_data -= tcp_seq_subtract(sack.end, cur->start); -#endif /* TCP_FACK */ +#endif if (SEQ_GEQ(sack.end, cur->end)) { /* Acks entire hole, so delete hole */ if (p != cur) { p->next = cur->next; pool_put(&sackhl_pool, cur); @@ -2591,16 +2562,16 @@ tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) cur = cur->next; continue; } /* move end of hole backward */ if (SEQ_GEQ(sack.end, cur->end)) { -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK if (SEQ_GT(cur->rxmit, sack.start)) tp->retran_data -= tcp_seq_subtract(cur->rxmit, sack.start); -#endif /* TCP_FACK */ +#endif cur->end = sack.start; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); cur->dups++; if (((sack.end - cur->end)/tp->t_maxseg) >= tcprexmtthresh) @@ -2617,20 +2588,20 @@ tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) */ temp = (struct sackhole *) pool_get(&sackhl_pool, PR_NOWAIT); if (temp == NULL) goto done; /* ENOBUFS */ -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK if (SEQ_GT(cur->rxmit, sack.end)) 
tp->retran_data -= tcp_seq_subtract(sack.end, sack.start); else if (SEQ_GT(cur->rxmit, sack.start)) tp->retran_data -= tcp_seq_subtract(cur->rxmit, sack.start); -#endif /* TCP_FACK */ +#endif temp->next = cur->next; temp->start = sack.end; temp->end = cur->end; temp->dups = cur->dups; temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); @@ -2668,11 +2639,11 @@ tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) tp->rcv_lastsack = sack.end; tp->snd_numholes++; } } done: -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK /* * Update retran_data and snd_awnd. Go through the list of * holes. Increment retran_data by (hole->rxmit - hole->start). */ tp->retran_data = 0; @@ -2681,11 +2652,11 @@ done: tp->retran_data += cur->rxmit - cur->start; cur = cur->next; } tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) + tp->retran_data; -#endif /* TCP_FACK */ +#endif return; } /* @@ -2759,11 +2730,10 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) #endif return (1); } return (0); } -#endif /* TCP_SACK */ /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for @@ -3118,11 +3088,10 @@ tcp_mss_update(struct tcpcb *tp) (void)sbreserve(so, &so->so_rcv, bufsize); } } -#if defined (TCP_SACK) /* * Checks for partial ack. If partial ack arrives, force the retransmission * of the next unacknowledged segment, do not clear tp->t_dupacks, and return * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. If the ack advances at least to tp->snd_last, return 0. 
@@ -3163,11 +3132,10 @@ tcp_newreno(struct tcpcb *tp, struct tcphdr *th) return 1; } return 0; } -#endif /* TCP_SACK */ int tcp_mss_adv(struct mbuf *m, int af) { int mss = 0; @@ -3738,37 +3706,30 @@ syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ so = NULL; m_freem(m); goto abort; } -#ifdef TCP_SACK tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT; -#endif - tp->ts_modulate = sc->sc_modulate; tp->ts_recent = sc->sc_timestamp; tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; tcp_sendseqinit(tp); -#if defined (TCP_SACK) || defined(TCP_ECN) tp->snd_last = tp->snd_una; -#endif /* TCP_SACK */ -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK tp->snd_fack = tp->snd_una; tp->retran_data = 0; tp->snd_awnd = 0; -#endif /* TCP_FACK */ +#endif #ifdef TCP_ECN if (sc->sc_flags & SCF_ECN_PERMIT) { tp->t_flags |= TF_ECN_PERMIT; tcpstat_inc(tcps_ecn_accepts); } #endif -#ifdef TCP_SACK if (sc->sc_flags & SCF_SACK_PERMIT) tp->t_flags |= TF_SACK_PERMIT; -#endif #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) tp->t_flags |= TF_SIGNATURE; #endif tcp_rcvseqinit(tp); @@ -3917,13 +3878,11 @@ syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, if (optp || (tp->t_flags & TF_SIGNATURE)) { #else if (optp) { #endif tb.pf = tp->pf; -#ifdef TCP_SACK tb.sack_enable = tp->sack_enable; -#endif tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) tb.t_flags |= TF_SIGNATURE; #endif @@ -4032,18 +3991,16 @@ syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, */ if (tcp_do_ecn && (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) sc->sc_flags |= SCF_ECN_PERMIT; #endif -#ifdef TCP_SACK /* * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT). 
*/ if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT)) sc->sc_flags |= SCF_SACK_PERMIT; -#endif #ifdef TCP_SIGNATURE if (tb.t_flags & TF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; #endif sc->sc_tp = tp; @@ -4087,13 +4044,11 @@ syn_cache_respond(struct syn_cache *sc, struct mbuf *m) return (EAFNOSUPPORT); } /* Compute the size of the TCP options. */ optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + -#ifdef TCP_SACK ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) + -#endif #ifdef TCP_SIGNATURE ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) + #endif ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); @@ -4169,17 +4124,15 @@ syn_cache_respond(struct syn_cache *sc, struct mbuf *m) *optp++ = TCPOPT_MAXSEG; *optp++ = 4; *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; *optp++ = sc->sc_ourmaxseg & 0xff; -#ifdef TCP_SACK /* Include SACK_PERMIT_HDR option if peer has already done so. */ if (sc->sc_flags & SCF_SACK_PERMIT) { *((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR); optp += 4; } -#endif if (sc->sc_request_r_scale != 15) { *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | sc->sc_request_r_scale); diff --git sys/netinet/tcp_output.c sys/netinet/tcp_output.c index d2510bf83e1..f3bf0b8b7a1 100644 --- sys/netinet/tcp_output.c +++ sys/netinet/tcp_output.c @@ -94,15 +94,12 @@ #ifdef notyet extern struct mbuf *m_copypack(); #endif -#ifdef TCP_SACK extern int tcprexmtthresh; -#endif -#ifdef TCP_SACK #ifdef TCP_SACK_DEBUG void tcp_print_holes(struct tcpcb *tp); void tcp_print_holes(struct tcpcb *tp) @@ -191,11 +188,10 @@ tcp_sack_adjust(struct tcpcb *tp) if (SEQ_LT(tp->snd_nxt, cur->end)) return; tp->snd_nxt = tp->rcv_lastsack; return; } -#endif /* TCP_SACK */ /* * Tcp output routine: figure out what should be sent and send it. 
*/ int @@ -208,15 +204,13 @@ tcp_output(struct tcpcb *tp) struct tcphdr *th; u_int32_t optbuf[howmany(MAX_TCPOPTLEN, sizeof(u_int32_t))]; u_char *opt = (u_char *)optbuf; unsigned int optlen, hdrlen, packetlen; int idle, sendalot = 0; -#ifdef TCP_SACK int i, sack_rxmit = 0; struct sackhole *p; int maxburst = TCP_MAXBURST; -#endif #ifdef TCP_SIGNATURE unsigned int sigoff; #endif /* TCP_SIGNATURE */ #ifdef TCP_ECN int needect; @@ -226,14 +220,14 @@ tcp_output(struct tcpcb *tp) tp->t_flags |= TF_NEEDOUTPUT; return (0); } else tp->t_flags &= ~TF_NEEDOUTPUT; -#if defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) +#if defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) if (tp->sack_enable && (tp->t_flags & TF_SIGNATURE)) return (EINVAL); -#endif /* defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) */ +#endif /* defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) */ /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) @@ -254,21 +248,19 @@ tcp_output(struct tcpcb *tp) idle = 0; } else tp->t_flags &= ~TF_LASTIDLE; again: -#ifdef TCP_SACK /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max)) tcp_sack_adjust(tp); -#endif off = tp->snd_nxt - tp->snd_una; -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK /* Normally, sendable data is limited by off < tp->snd_cwnd. * But in FACK, sendable data is limited by snd_awnd < snd_cwnd, * regardless of offset. */ if (tp->sack_enable && (tp->t_dupacks > tcprexmtthresh)) @@ -277,11 +269,10 @@ again: #endif win = ulmin(tp->snd_wnd, tp->snd_cwnd); flags = tcp_outflags[tp->t_state]; -#ifdef TCP_SACK /* * Send any SACK-generated retransmissions. 
If we're explicitly trying * to send out new data (when sendalot is 1), bypass this function. * If we retransmit in fast recovery mode, decrement snd_cwnd, since * we're replacing a (future) new transmission with a retransmission @@ -299,11 +290,10 @@ again: if (SEQ_LT(tp->snd_una, tp->snd_last)) tp->snd_cwnd -= tp->t_maxseg; #endif } } -#endif /* TCP_SACK */ sendalot = 0; /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero @@ -335,28 +325,24 @@ again: TCP_TIMER_DISARM(tp, TCPT_PERSIST); tp->t_rxtshift = 0; } } -#ifdef TCP_SACK if (!sack_rxmit) { -#endif - len = ulmin(so->so_snd.sb_cc, win) - off; + len = ulmin(so->so_snd.sb_cc, win) - off; -#if defined(TCP_SACK) && defined(TCP_FACK) - /* - * If we're in fast recovery (SEQ_GT(tp->snd_last, tp->snd_una)), and - * amount of outstanding data (snd_awnd) is >= snd_cwnd, then - * do not send data (like zero window conditions) - */ - if (tp->sack_enable && len && SEQ_GT(tp->snd_last, tp->snd_una) && - (tp->snd_awnd >= tp->snd_cwnd)) - len = 0; +#ifdef TCP_FACK + /* + * If we're in fast recovery (SEQ_GT(tp->snd_last, tp->snd_una)), + * and amount of outstanding data (snd_awnd) is >= snd_cwnd, then + * do not send data (like zero window conditions) + */ + if (tp->sack_enable && SEQ_GT(tp->snd_last, tp->snd_una) && + len && (tp->snd_awnd >= tp->snd_cwnd)) + len = 0; #endif /* TCP_FACK */ -#ifdef TCP_SACK } -#endif if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, @@ -415,14 +401,12 @@ again: goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) goto send; -#ifdef TCP_SACK if (sack_rxmit) goto send; -#endif } /* * Compare available window to amount of window * known to peer (as advertised window less @@ -460,11 +444,10 @@ again: * then we need to send. 
*/ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; -#ifdef TCP_SACK /* * In SACK, it is possible for tcp_output to fail to send a segment * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. */ @@ -472,11 +455,10 @@ again: TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 && TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); return (0); } -#endif /* TCP_SACK */ /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: @@ -546,11 +528,10 @@ send: memcpy(opt + 2, &mss, sizeof(mss)); optlen = 4; if (flags & TH_ACK) tcp_mss_update(tp); -#ifdef TCP_SACK /* * If this is the first SYN of connection (not a SYN * ACK), include SACK_PERMIT_HDR option. If this is a * SYN ACK, include SACK_PERMIT_HDR option if peer has * already done so. @@ -559,12 +540,10 @@ send: (tp->t_flags & TF_SACK_PERMIT))) { *((u_int32_t *) (opt + optlen)) = htonl(TCPOPT_SACK_PERMIT_HDR); optlen += 4; } -#endif - if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_SCALE))) { *((u_int32_t *) (opt + optlen)) = htonl( TCPOPT_NOP << 24 | @@ -624,11 +603,10 @@ send: optlen += TCPOLEN_SIGLEN; } #endif /* TCP_SIGNATURE */ -#ifdef TCP_SACK /* * Send SACKs if necessary. This should be the last option processed. * Only as many SACKs are sent as are permitted by the maximum options * size. No more than three SACKs are sent. 
*/ @@ -651,11 +629,10 @@ send: count++; } *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2)); optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */ } -#endif /* TCP_SACK */ #ifdef DIAGNOSTIC if (optlen > MAX_TCPOPTLEN) panic("tcp_output: options too long"); #endif /* DIAGNOSTIC */ @@ -805,27 +782,25 @@ send: if (len || (flags & (TH_SYN|TH_FIN)) || TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); -#ifdef TCP_SACK if (sack_rxmit) { /* * If sendalot was turned on (due to option stuffing), turn it * off. Properly set th_seq field. Advance the ret'x pointer * by len. */ if (sendalot) sendalot = 0; th->th_seq = htonl(p->rxmit); p->rxmit += len; -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK tp->retran_data += len; -#endif /* TCP_FACK */ +#endif tcpstat_pkt(tcps_sack_rexmits, tcps_sack_rexmit_bytes, len); } -#endif /* TCP_SACK */ th->th_ack = htonl(tp->rcv_nxt); if (optlen) { memcpy(th + 1, opt, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; @@ -960,17 +935,15 @@ send: if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } -#ifdef TCP_SACK if (tp->sack_enable) { if (sack_rxmit && (p->rxmit != tp->snd_nxt)) { goto timer; } } -#endif tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and @@ -989,22 +962,20 @@ send: * Initial value for retransmit timer is smoothed * round-trip time + 2 * round-trip time variance. * Initialize shift counter which is used for backoff * of retransmit time. 
*/ -#ifdef TCP_SACK timer: if (tp->sack_enable && sack_rxmit && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 && tp->snd_nxt != tp->snd_max) { TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) { TCP_TIMER_DISARM(tp, TCPT_PERSIST); tp->t_rxtshift = 0; } } -#endif if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 && tp->snd_nxt != tp->snd_una) { TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) { @@ -1123,15 +1094,15 @@ send: 0, NULL, tp->t_inpcb); break; #endif /* INET6 */ } -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK /* Update snd_awnd to reflect the new data that was sent. */ tp->snd_awnd = tcp_seq_subtract(tp->snd_max, tp->snd_fack) + tp->retran_data; -#endif /* defined(TCP_SACK) && defined(TCP_FACK) */ +#endif if (error) { out: if (error == ENOBUFS) { /* @@ -1182,15 +1153,11 @@ out: if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + win; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~TF_ACKNOW; TCP_CLEAR_DELACK(tp); -#if defined(TCP_SACK) if (sendalot && --maxburst) -#else - if (sendalot) -#endif goto again; return (0); } void diff --git sys/netinet/tcp_subr.c sys/netinet/tcp_subr.c index 27cab0e05bb..0a426fc4db1 100644 --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -102,13 +102,11 @@ int tcp_mssdflt = TCP_MSS; int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; /* values controllable via sysctl */ int tcp_do_rfc1323 = 1; -#ifdef TCP_SACK int tcp_do_sack = 1; /* RFC 2018 selective ACKs */ -#endif int tcp_ack_on_push = 0; /* set to enable immediate ACK-on-PUSH */ #ifdef TCP_ECN int tcp_do_ecn = 0; /* RFC3168 ECN enabled/disabled? 
*/ #endif int tcp_do_rfc3390 = 2; /* Increase TCP's Initial Window to 10*mss */ @@ -118,19 +116,15 @@ u_int32_t tcp_now = 1; #ifndef TCB_INITIAL_HASH_SIZE #define TCB_INITIAL_HASH_SIZE 128 #endif int tcp_reass_limit = NMBCLUSTERS / 8; /* hardlimit for tcpqe_pool */ -#ifdef TCP_SACK int tcp_sackhole_limit = 32*1024; /* hardlimit for sackhl_pool */ -#endif struct pool tcpcb_pool; struct pool tcpqe_pool; -#ifdef TCP_SACK struct pool sackhl_pool; -#endif struct cpumem *tcpcounters; /* tcp statistics */ tcp_seq tcp_iss; /* @@ -143,15 +137,13 @@ tcp_init(void) pool_init(&tcpcb_pool, sizeof(struct tcpcb), 0, IPL_SOFTNET, 0, "tcpcb", NULL); pool_init(&tcpqe_pool, sizeof(struct tcpqent), 0, IPL_SOFTNET, 0, "tcpqe", NULL); pool_sethardlimit(&tcpqe_pool, tcp_reass_limit, NULL, 0); -#ifdef TCP_SACK pool_init(&sackhl_pool, sizeof(struct sackhole), 0, IPL_SOFTNET, 0, "sackhl", NULL); pool_sethardlimit(&sackhl_pool, tcp_sackhole_limit, NULL, 0); -#endif /* TCP_SACK */ in_pcbinit(&tcbtable, TCB_INITIAL_HASH_SIZE); tcpcounters = counters_alloc(tcps_ncounters); #ifdef INET6 /* @@ -437,13 +429,11 @@ tcp_newtcpcb(struct inpcb *inp) TCP_INIT_DELACK(tp); for (i = 0; i < TCPT_NTIMERS; i++) TCP_TIMER_INIT(tp, i); timeout_set(&tp->t_reap_to, tcp_reaper, tp); -#ifdef TCP_SACK tp->sack_enable = tcp_do_sack; -#endif tp->t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; tp->t_inpcb = inp; /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 2 * rttvar gives @@ -513,30 +503,27 @@ tcp_drop(struct tcpcb *tp, int errno) struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; -#ifdef TCP_SACK struct sackhole *p, *q; -#endif /* free the reassembly queue, if any */ tcp_freeq(tp); tcp_canceltimers(tp); TCP_CLEAR_DELACK(tp); syn_cache_cleanup(tp); -#ifdef TCP_SACK /* Free SACK holes. 
*/ q = p = tp->snd_holes; while (p != 0) { q = p->next; pool_put(&sackhl_pool, p); p = q; } -#endif + m_free(tp->t_template); tp->t_flags |= TF_DEAD; timeout_add(&tp->t_reap_to, 0); diff --git sys/netinet/tcp_timer.c sys/netinet/tcp_timer.c index 3f063ed9936..9c40693acb6 100644 --- sys/netinet/tcp_timer.c +++ sys/netinet/tcp_timer.c @@ -154,11 +154,10 @@ int tcp_totbackoff = 511; /* sum of tcp_backoff[] */ /* * TCP timer processing. */ -#ifdef TCP_SACK void tcp_timer_freesack(struct tcpcb *); void tcp_timer_freesack(struct tcpcb *tp) { @@ -175,13 +174,12 @@ tcp_timer_freesack(struct tcpcb *tp) tp->snd_holes = 0; #ifdef TCP_FACK tp->snd_fack = tp->snd_una; tp->retran_data = 0; tp->snd_awnd = 0; -#endif /* TCP_FACK */ +#endif } -#endif /* TCP_SACK */ void tcp_timer_rexmt(void *arg) { struct tcpcb *tp = arg; @@ -217,13 +215,11 @@ tcp_timer_rexmt(void *arg) in_pcbnotifyall(&tcbtable, sintosa(&sin), tp->t_inpcb->inp_rtableid, EMSGSIZE, tcp_mtudisc); goto out; } -#ifdef TCP_SACK tcp_timer_freesack(tp); -#endif if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; tcpstat_inc(tcps_timeoutdrop); (void)tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); @@ -303,17 +299,15 @@ tcp_timer_rexmt(void *arg) in_losing(tp->t_inpcb); tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } tp->snd_nxt = tp->snd_una; -#if defined(TCP_SACK) /* * Note: We overload snd_last to function also as the * snd_last variable described in RFC 2582 */ tp->snd_last = tp->snd_max; -#endif /* TCP_SACK */ /* * If timing a segment in this window, stop the timer. 
*/ tp->t_rtttime = 0; #ifdef TCP_ECN @@ -460,13 +454,11 @@ tcp_timer_2msl(void *arg) NET_LOCK(); if (tp->t_flags & TF_DEAD) goto out; -#ifdef TCP_SACK tcp_timer_freesack(tp); -#endif if (tp->t_state != TCPS_TIME_WAIT && ((tcp_maxidle == 0) || ((tcp_now - tp->t_rcvtime) <= tcp_maxidle))) TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_keepintvl); else diff --git sys/netinet/tcp_usrreq.c sys/netinet/tcp_usrreq.c index 1e70b9e07c9..2d784ff52ea 100644 --- sys/netinet/tcp_usrreq.c +++ sys/netinet/tcp_usrreq.c @@ -266,14 +266,12 @@ tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam, tcpstat_inc(tcps_connattempt); tp->t_state = TCPS_SYN_SENT; TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); tcp_set_iss_tsm(tp); tcp_sendseqinit(tp); -#if defined(TCP_SACK) tp->snd_last = tp->snd_una; -#endif -#if defined(TCP_SACK) && defined(TCP_FACK) +#ifdef TCP_FACK tp->snd_fack = tp->snd_una; tp->retran_data = 0; tp->snd_awnd = 0; #endif error = tcp_output(tp); @@ -494,11 +492,10 @@ tcp_ctloutput(int op, struct socket *so, int level, int optname, tp->t_maxseg = i; else error = EINVAL; break; -#ifdef TCP_SACK case TCP_SACK_ENABLE: if (m == NULL || m->m_len < sizeof (int)) { error = EINVAL; break; } @@ -516,11 +513,10 @@ tcp_ctloutput(int op, struct socket *so, int level, int optname, if (*mtod(m, int *)) tp->sack_enable = 1; else tp->sack_enable = 0; break; -#endif #ifdef TCP_SIGNATURE case TCP_MD5SIG: if (m == NULL || m->m_len < sizeof (int)) { error = EINVAL; break; @@ -531,13 +527,11 @@ tcp_ctloutput(int op, struct socket *so, int level, int optname, break; } if (*mtod(m, int *)) { tp->t_flags |= TF_SIGNATURE; -#ifdef TCP_SACK tp->sack_enable = 0; -#endif /* TCP_SACK */ } else tp->t_flags &= ~TF_SIGNATURE; break; #endif /* TCP_SIGNATURE */ default: @@ -557,15 +551,13 @@ tcp_ctloutput(int op, struct socket *so, int level, int optname, *mtod(m, int *) = tp->t_flags & TF_NOPUSH; break; case TCP_MAXSEG: *mtod(m, int *) = tp->t_maxseg; break; -#ifdef TCP_SACK case TCP_SACK_ENABLE: 
*mtod(m, int *) = tp->sack_enable; break; -#endif #ifdef TCP_SIGNATURE case TCP_MD5SIG: *mtod(m, int *) = tp->t_flags & TF_SIGNATURE; break; #endif @@ -958,18 +950,17 @@ tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, /* All sysctl names at this level are terminal. */ if (namelen != 1) return (ENOTDIR); switch (name[0]) { -#ifdef TCP_SACK case TCPCTL_SACK: NET_LOCK(); error = sysctl_int(oldp, oldlenp, newp, newlen, &tcp_do_sack); NET_UNLOCK(); return (error); -#endif + case TCPCTL_SLOWHZ: return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ)); case TCPCTL_BADDYNAMIC: NET_LOCK(); @@ -1023,11 +1014,11 @@ tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, if (!error) tcp_reass_limit = nval; } NET_UNLOCK(); return (error); -#ifdef TCP_SACK + case TCPCTL_SACKHOLE_LIMIT: NET_LOCK(); nval = tcp_sackhole_limit; error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); if (!error && nval != tcp_sackhole_limit) { @@ -1035,11 +1026,10 @@ tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, if (!error) tcp_sackhole_limit = nval; } NET_UNLOCK(); return (error); -#endif case TCPCTL_STATS: return (tcp_sysctl_tcpstat(oldp, oldlenp, newp)); case TCPCTL_SYN_USE_LIMIT: diff --git sys/netinet/tcp_var.h sys/netinet/tcp_var.h index de3390b61f1..bc2b77d188b 100644 --- sys/netinet/tcp_var.h +++ sys/netinet/tcp_var.h @@ -114,34 +114,28 @@ struct tcpcb { tcp_seq snd_up; /* send urgent pointer */ tcp_seq snd_wl1; /* window update seg seq number */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq iss; /* initial send sequence number */ u_long snd_wnd; /* send window */ -#if 1 /*def TCP_SACK*/ int sack_enable; /* enable SACK for this connection */ int snd_numholes; /* number of holes seen by sender */ struct sackhole *snd_holes; /* linked list of holes (sorted) */ -#if 1 /*defined(TCP_SACK) && defined(TCP_FACK)*/ +#if 1 /*defined(TCP_FACK)*/ tcp_seq snd_fack; /* for FACK congestion control */ u_long 
snd_awnd; /* snd_nxt - snd_fack + */ /* retransmitted data */ int retran_data; /* amount of outstanding retx. data */ #endif /* TCP_FACK */ -#endif /* TCP_SACK */ -#if 1 /*defined(TCP_SACK) || defined(TCP_ECN)*/ tcp_seq snd_last; /* for use in fast recovery */ -#endif /* receive sequence variables */ u_long rcv_wnd; /* receive window */ tcp_seq rcv_nxt; /* receive next */ tcp_seq rcv_up; /* receive urgent pointer */ tcp_seq irs; /* initial receive sequence number */ -#if 1 /*def TCP_SACK*/ tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/ int rcv_numsacks; /* # distinct sack blks present */ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ -#endif /* * Additional variables for this implementation. */ /* receive variables */ @@ -696,15 +690,13 @@ extern u_int32_t tcp_now; /* for RFC 1323 timestamps */ extern int tcp_do_rfc1323; /* enabled/disabled? */ extern int tcptv_keep_init; /* time to keep alive the initial SYN packet */ extern int tcp_mssdflt; /* default maximum segment size */ extern int tcp_rst_ppslim; /* maximum outgoing RST packet per second */ extern int tcp_ack_on_push; /* ACK immediately on PUSH */ -#ifdef TCP_SACK extern int tcp_do_sack; /* SACK enabled/disabled */ extern struct pool sackhl_pool; extern int tcp_sackhole_limit; /* max entries for tcp sack queues */ -#endif extern int tcp_do_ecn; /* RFC3168 ECN enabled/disabled? 
*/ extern int tcp_do_rfc3390; /* RFC3390 Increasing TCP's Initial Window */ extern struct pool tcpqe_pool; extern int tcp_reass_limit; /* max entries for tcp reass queues */ @@ -765,11 +757,10 @@ int tcp_sysctl(int *, u_int, void *, size_t *, void *, size_t); int tcp_usrreq(struct socket *, int, struct mbuf *, struct mbuf *, struct mbuf *, struct proc *); int tcp_attach(struct socket *, int); void tcp_xmit_timer(struct tcpcb *, int); void tcpdropoldhalfopen(struct tcpcb *, u_int16_t); -#ifdef TCP_SACK void tcp_sack_option(struct tcpcb *,struct tcphdr *,u_char *,int); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq, tcp_seq); void tcp_del_sackholes(struct tcpcb *, struct tcphdr *); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); @@ -777,15 +768,12 @@ struct sackhole * tcp_sack_output(struct tcpcb *tp); int tcp_sack_partialack(struct tcpcb *, struct tcphdr *); #ifdef DEBUG void tcp_print_holes(struct tcpcb *tp); #endif -#endif /* TCP_SACK */ -#if defined(TCP_SACK) int tcp_newreno(struct tcpcb *, struct tcphdr *); u_long tcp_seq_subtract(u_long, u_long ); -#endif /* TCP_SACK */ #ifdef TCP_SIGNATURE int tcp_signature_apply(caddr_t, caddr_t, unsigned int); int tcp_signature(struct tdb *, int, struct mbuf *, struct tcphdr *, int, int, char *); #endif /* TCP_SIGNATURE */