Author: pkelsey
Date: Wed Jul 29 17:59:13 2015
New Revision: 286027
URL: https://svnweb.freebsd.org/changeset/base/286027

Log:
  Revert r265338, r271089 and r271123 as those changes do not handle
  non-inline urgent data and introduce an mbuf exhaustion attack vector
  similar to FreeBSD-SA-15:15.tcp, but not requiring VNETs.
  
  Address the issue described in FreeBSD-SA-15:15.tcp.
  
  Reviewed by:  glebius
  Approved by:  so
  Approved by:  jmallett (mentor)
  Security:     FreeBSD-SA-15:15.tcp
  Sponsored by: Norse Corp, Inc.

Modified:
  head/sys/netinet/tcp_input.c
  head/sys/netinet/tcp_reass.c
  head/sys/netinet/tcp_subr.c
  head/sys/netinet/tcp_usrreq.c
  head/sys/netinet/tcp_var.h
  head/sys/sys/mbuf.h

Modified: head/sys/netinet/tcp_input.c
==============================================================================
--- head/sys/netinet/tcp_input.c        Wed Jul 29 17:50:14 2015        
(r286026)
+++ head/sys/netinet/tcp_input.c        Wed Jul 29 17:59:13 2015        
(r286027)
@@ -1665,7 +1665,8 @@ tcp_do_segment(struct mbuf *m, struct tc
            tp->snd_nxt == tp->snd_max &&
            tiwin && tiwin == tp->snd_wnd && 
            ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
-           tp->t_segq == NULL && ((to.to_flags & TOF_TS) == 0 ||
+           LIST_EMPTY(&tp->t_segq) &&
+           ((to.to_flags & TOF_TS) == 0 ||
             TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
 
                /*
@@ -2903,7 +2904,8 @@ dodata:                                                   
/* XXX */
                 * immediately when segments are out of order (so
                 * fast retransmit can work).
                 */
-               if (th->th_seq == tp->rcv_nxt && tp->t_segq == NULL &&
+               if (th->th_seq == tp->rcv_nxt &&
+                   LIST_EMPTY(&tp->t_segq) &&
                    TCPS_HAVEESTABLISHED(tp->t_state)) {
                        if (DELAY_ACK(tp, tlen))
                                tp->t_flags |= TF_DELACK;

Modified: head/sys/netinet/tcp_reass.c
==============================================================================
--- head/sys/netinet/tcp_reass.c        Wed Jul 29 17:50:14 2015        
(r286026)
+++ head/sys/netinet/tcp_reass.c        Wed Jul 29 17:59:13 2015        
(r286027)
@@ -71,33 +71,80 @@ __FBSDID("$FreeBSD$");
 #include <netinet/tcp_var.h>
 #include <netinet6/tcp6_var.h>
 #include <netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+
+static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
+    "TCP Segment Reassembly Queue");
+
+static int tcp_reass_maxseg = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
+    &tcp_reass_maxseg, 0,
+    "Global maximum number of TCP Segments in Reassembly Queue");
+
+static uma_zone_t tcp_reass_zone;
+SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_VNET,
+    &tcp_reass_zone,
+    "Global number of TCP Segments currently in Reassembly Queue");
+
+/* Initialize TCP reassembly queue */
+static void
+tcp_reass_zone_change(void *tag)
+{
+
+       /* Set the zone limit and read back the effective value. */
+       tcp_reass_maxseg = nmbclusters / 16;
+       tcp_reass_maxseg = uma_zone_set_max(tcp_reass_zone,
+           tcp_reass_maxseg);
+}
+
+void
+tcp_reass_global_init(void)
+{
+
+       tcp_reass_maxseg = nmbclusters / 16;
+       TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
+           &tcp_reass_maxseg);
+       tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
+           NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+       /* Set the zone limit and read back the effective value. */
+       tcp_reass_maxseg = uma_zone_set_max(tcp_reass_zone,
+           tcp_reass_maxseg);
+       EVENTHANDLER_REGISTER(nmbclusters_change,
+           tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
+}
 
 void
 tcp_reass_flush(struct tcpcb *tp)
 {
-       struct mbuf *m;
+       struct tseg_qent *qe;
 
        INP_WLOCK_ASSERT(tp->t_inpcb);
 
-       while ((m = tp->t_segq) != NULL) {
-               tp->t_segq = m->m_nextpkt;
-               tp->t_segqlen -= m->m_pkthdr.len;
-               m_freem(m);
+       while ((qe = LIST_FIRST(&tp->t_segq)) != NULL) {
+               LIST_REMOVE(qe, tqe_q);
+               m_freem(qe->tqe_m);
+               uma_zfree(tcp_reass_zone, qe);
+               tp->t_segqlen--;
        }
 
        KASSERT((tp->t_segqlen == 0),
-           ("TCP reass queue %p length is %d instead of 0 after flush.",
+           ("TCP reass queue %p segment count is %d instead of 0 after flush.",
            tp, tp->t_segqlen));
 }
 
-#define        M_TCPHDR(m)     ((struct tcphdr *)((m)->m_pkthdr.pkt_tcphdr))
-
 int
 tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
 {
+       struct tseg_qent *q;
+       struct tseg_qent *p = NULL;
+       struct tseg_qent *nq;
+       struct tseg_qent *te = NULL;
        struct socket *so = tp->t_inpcb->inp_socket;
-       struct mbuf *mq, *mp;
-       int flags, wakeup;
+       char *s = NULL;
+       int flags;
+       struct tseg_qent tqs;
 
        INP_WLOCK_ASSERT(tp->t_inpcb);
 
@@ -113,10 +160,6 @@ tcp_reass(struct tcpcb *tp, struct tcphd
        if (th == NULL)
                goto present;
 
-       M_ASSERTPKTHDR(m);
-       KASSERT(*tlenp == m->m_pkthdr.len, ("%s: tlenp %u len %u", __func__,
-           *tlenp, m->m_pkthdr.len));
-
        /*
         * Limit the number of segments that can be queued to reduce the
         * potential for mbuf exhaustion. For best performance, we want to be
@@ -127,15 +170,17 @@ tcp_reass(struct tcpcb *tp, struct tcphd
         * Always let the missing segment through which caused this queue.
         * NB: Access to the socket buffer is left intentionally unlocked as we
         * can tolerate stale information here.
+        *
+        * XXXLAS: Using sbspace(so->so_rcv) instead of so->so_rcv.sb_hiwat
+        * should work but causes packets to be dropped when they shouldn't.
+        * Investigate why and re-evaluate the below limit after the behaviour
+        * is understood.
         */
        if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) &&
-           tp->t_segqlen + m->m_pkthdr.len >= sbspace(&so->so_rcv)) {
-               char *s;
-
+           tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) {
                TCPSTAT_INC(tcps_rcvreassfull);
                *tlenp = 0;
-               if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
-                   NULL))) {
+               if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) 
{
                        log(LOG_DEBUG, "%s; %s: queue limit reached, "
                            "segment dropped\n", s, __func__);
                        free(s, M_TCPLOG);
@@ -145,13 +190,46 @@ tcp_reass(struct tcpcb *tp, struct tcphd
        }
 
        /*
+        * Allocate a new queue entry. If we can't, or hit the zone limit
+        * just drop the pkt.
+        *
+        * Use a temporary structure on the stack for the missing segment
+        * when the zone is exhausted. Otherwise we may get stuck.
+        */
+       te = uma_zalloc(tcp_reass_zone, M_NOWAIT);
+       if (te == NULL) {
+               if (th->th_seq != tp->rcv_nxt || 
!TCPS_HAVEESTABLISHED(tp->t_state)) {
+                       TCPSTAT_INC(tcps_rcvmemdrop);
+                       m_freem(m);
+                       *tlenp = 0;
+                       if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
+                           NULL))) {
+                               log(LOG_DEBUG, "%s; %s: global zone limit "
+                                   "reached, segment dropped\n", s, __func__);
+                               free(s, M_TCPLOG);
+                       }
+                       return (0);
+               } else {
+                       bzero(&tqs, sizeof(struct tseg_qent));
+                       te = &tqs;
+                       if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
+                           NULL))) {
+                               log(LOG_DEBUG,
+                                   "%s; %s: global zone limit reached, using "
+                                   "stack for missing segment\n", s, __func__);
+                               free(s, M_TCPLOG);
+                       }
+               }
+       }
+       tp->t_segqlen++;
+
+       /*
         * Find a segment which begins after this one does.
         */
-       mp = NULL;
-       for (mq = tp->t_segq; mq != NULL; mq = mq->m_nextpkt) {
-               if (SEQ_GT(M_TCPHDR(mq)->th_seq, th->th_seq))
+       LIST_FOREACH(q, &tp->t_segq, tqe_q) {
+               if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
                        break;
-               mp = mq;
+               p = q;
        }
 
        /*
@@ -159,16 +237,18 @@ tcp_reass(struct tcpcb *tp, struct tcphd
         * our data already.  If so, drop the data from the incoming
         * segment.  If it provides all of our data, drop us.
         */
-       if (mp != NULL) {
+       if (p != NULL) {
                int i;
-
                /* conversion to int (in i) handles seq wraparound */
-               i = M_TCPHDR(mp)->th_seq + mp->m_pkthdr.len - th->th_seq;
+               i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
                if (i > 0) {
                        if (i >= *tlenp) {
                                TCPSTAT_INC(tcps_rcvduppack);
                                TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
                                m_freem(m);
+                               if (te != &tqs)
+                                       uma_zfree(tcp_reass_zone, te);
+                               tp->t_segqlen--;
                                /*
                                 * Try to present any queued data
                                 * at the left window edge to the user.
@@ -190,54 +270,37 @@ tcp_reass(struct tcpcb *tp, struct tcphd
         * While we overlap succeeding segments trim them or,
         * if they are completely covered, dequeue them.
         */
-       while (mq) {
-               struct mbuf *nq;
-               int i;
-
-               i = (th->th_seq + *tlenp) - M_TCPHDR(mq)->th_seq;
+       while (q) {
+               int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
                if (i <= 0)
                        break;
-               if (i < mq->m_pkthdr.len) {
-                       M_TCPHDR(mq)->th_seq += i;
-                       m_adj(mq, i);
-                       tp->t_segqlen -= i;
+               if (i < q->tqe_len) {
+                       q->tqe_th->th_seq += i;
+                       q->tqe_len -= i;
+                       m_adj(q->tqe_m, i);
                        break;
                }
 
-               nq = mq->m_nextpkt;
-               tp->t_segqlen -= mq->m_pkthdr.len;
-               m_freem(mq);
-               if (mp)
-                       mp->m_nextpkt = nq;
-               else
-                       tp->t_segq = nq;
-               mq = nq;
+               nq = LIST_NEXT(q, tqe_q);
+               LIST_REMOVE(q, tqe_q);
+               m_freem(q->tqe_m);
+               uma_zfree(tcp_reass_zone, q);
+               tp->t_segqlen--;
+               q = nq;
        }
 
-       /*
-        * Insert the new segment queue entry into place.  Try to collapse
-        * mbuf chains if segments are adjacent.
-        */
-       if (mp) {
-               if (M_TCPHDR(mp)->th_seq + mp->m_pkthdr.len == th->th_seq)
-                       m_catpkt(mp, m);
-               else {
-                       m->m_nextpkt = mp->m_nextpkt;
-                       mp->m_nextpkt = m;
-                       m->m_pkthdr.pkt_tcphdr = th;
-               }
+       /* Insert the new segment queue entry into place. */
+       te->tqe_m = m;
+       te->tqe_th = th;
+       te->tqe_len = *tlenp;
+
+       if (p == NULL) {
+               LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
        } else {
-               mq = tp->t_segq;
-               tp->t_segq = m;
-               if (mq && th->th_seq + *tlenp == M_TCPHDR(mq)->th_seq) {
-                       m->m_nextpkt = mq->m_nextpkt;
-                       mq->m_nextpkt = NULL;
-                       m_catpkt(m, mq);
-               } else
-                       m->m_nextpkt = mq;
-               m->m_pkthdr.pkt_tcphdr = th;
+               KASSERT(te != &tqs, ("%s: temporary stack based entry not "
+                   "first element in queue", __func__));
+               LIST_INSERT_AFTER(p, te, tqe_q);
        }
-       tp->t_segqlen += *tlenp;
 
 present:
        /*
@@ -246,30 +309,25 @@ present:
         */
        if (!TCPS_HAVEESTABLISHED(tp->t_state))
                return (0);
-
-       flags = 0;
-       wakeup = 0;
+       q = LIST_FIRST(&tp->t_segq);
+       if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
+               return (0);
        SOCKBUF_LOCK(&so->so_rcv);
-       while ((mq = tp->t_segq) != NULL &&
-           M_TCPHDR(mq)->th_seq == tp->rcv_nxt) {
-               tp->t_segq = mq->m_nextpkt;
-
-               tp->rcv_nxt += mq->m_pkthdr.len;
-               tp->t_segqlen -= mq->m_pkthdr.len;
-               flags = M_TCPHDR(mq)->th_flags & TH_FIN;
-
+       do {
+               tp->rcv_nxt += q->tqe_len;
+               flags = q->tqe_th->th_flags & TH_FIN;
+               nq = LIST_NEXT(q, tqe_q);
+               LIST_REMOVE(q, tqe_q);
                if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
-                       m_freem(mq);
-               else {
-                       mq->m_nextpkt = NULL;
-                       sbappendstream_locked(&so->so_rcv, mq, 0);
-                       wakeup = 1;
-               }
-       }
+                       m_freem(q->tqe_m);
+               else
+                       sbappendstream_locked(&so->so_rcv, q->tqe_m, 0);
+               if (q != &tqs)
+                       uma_zfree(tcp_reass_zone, q);
+               tp->t_segqlen--;
+               q = nq;
+       } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
        ND6_HINT(tp);
-       if (wakeup)
-               sorwakeup_locked(so);
-       else
-               SOCKBUF_UNLOCK(&so->so_rcv);
+       sorwakeup_locked(so);
        return (flags);
 }

Modified: head/sys/netinet/tcp_subr.c
==============================================================================
--- head/sys/netinet/tcp_subr.c Wed Jul 29 17:50:14 2015        (r286026)
+++ head/sys/netinet/tcp_subr.c Wed Jul 29 17:59:13 2015        (r286027)
@@ -385,6 +385,8 @@ tcp_init(void)
        if (!IS_DEFAULT_VNET(curvnet))
                return;
 
+       tcp_reass_global_init();
+
        /* XXX virtualize those bellow? */
        tcp_delacktime = TCPTV_DELACK;
        tcp_keepinit = TCPTV_KEEP_INIT;

Modified: head/sys/netinet/tcp_usrreq.c
==============================================================================
--- head/sys/netinet/tcp_usrreq.c       Wed Jul 29 17:50:14 2015        
(r286026)
+++ head/sys/netinet/tcp_usrreq.c       Wed Jul 29 17:59:13 2015        
(r286027)
@@ -1977,7 +1977,7 @@ db_print_tcpcb(struct tcpcb *tp, const c
 
        db_print_indent(indent);
        db_printf("t_segq first: %p   t_segqlen: %d   t_dupacks: %d\n",
-          tp->t_segq, tp->t_segqlen, tp->t_dupacks);
+          LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
 
        db_print_indent(indent);
        db_printf("tt_rexmt: %p   tt_persist: %p   tt_keep: %p\n",

Modified: head/sys/netinet/tcp_var.h
==============================================================================
--- head/sys/netinet/tcp_var.h  Wed Jul 29 17:50:14 2015        (r286026)
+++ head/sys/netinet/tcp_var.h  Wed Jul 29 17:59:13 2015        (r286027)
@@ -46,6 +46,15 @@ VNET_DECLARE(int, tcp_do_rfc1323);
 
 #endif /* _KERNEL */
 
+/* TCP segment queue entry */
+struct tseg_qent {
+       LIST_ENTRY(tseg_qent) tqe_q;
+       int     tqe_len;                /* TCP segment data length */
+       struct  tcphdr *tqe_th;         /* a pointer to tcp header */
+       struct  mbuf    *tqe_m;         /* mbuf contains packet */
+};
+LIST_HEAD(tsegqe_head, tseg_qent);
+
 struct sackblk {
        tcp_seq start;          /* start seq no. of sack block */
        tcp_seq end;            /* end seq no. */
@@ -91,7 +100,7 @@ do {                                                         
\
  * Organized for 16 byte cacheline efficiency.
  */
 struct tcpcb {
-       struct  mbuf *t_segq;           /* segment reassembly queue */
+       struct  tsegqe_head t_segq;     /* segment reassembly queue */
        void    *t_pspare[2];           /* new reassembly queue */
        int     t_segqlen;              /* segment reassembly queue length */
        int     t_dupacks;              /* consecutive dup acks recd */
@@ -667,6 +676,7 @@ char        *tcp_log_addrs(struct in_conninfo *
 char   *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *,
            const void *);
 int     tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *);
+void    tcp_reass_global_init(void);
 void    tcp_reass_flush(struct tcpcb *);
 int     tcp_input(struct mbuf **, int *, int);
 u_long  tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);

Modified: head/sys/sys/mbuf.h
==============================================================================
--- head/sys/sys/mbuf.h Wed Jul 29 17:50:14 2015        (r286026)
+++ head/sys/sys/mbuf.h Wed Jul 29 17:59:13 2015        (r286027)
@@ -150,7 +150,6 @@ struct pkthdr {
 #define        tso_segsz       PH_per.sixteen[1]
 #define        csum_phsum      PH_per.sixteen[2]
 #define        csum_data       PH_per.thirtytwo[1]
-#define        pkt_tcphdr      PH_loc.ptr
 
 /*
  * Description of external storage mapped into mbuf; valid only if M_EXT is
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to