Author: jch
Date: Thu Oct 30 08:53:56 2014
New Revision: 273850
URL: https://svnweb.freebsd.org/changeset/base/273850

Log:
  Fix a race condition in TCP timewait between tcp_tw_2msl_reuse() and
  tcp_tw_2msl_scan().  This race condition drives unplanned timewait
  timeout cancellation.  Also simplify implementation by holding inpcb
  reference and removing tcptw reference counting.
  
  Differential Revision:        https://reviews.freebsd.org/D826
  Submitted by:         Marc De la Gueronniere <mdelagueronni...@verisign.com>
  Submitted by:         jch
  Reviewed By:          jhb (mentor), adrian, rwatson
  Sponsored by:         Verisign, Inc.
  MFC after:            2 weeks
  X-MFC-With:           r264321

Modified:
  head/sys/netinet/tcp_timer.c
  head/sys/netinet/tcp_timer.h
  head/sys/netinet/tcp_timewait.c
  head/sys/netinet/tcp_usrreq.c
  head/sys/netinet/tcp_var.h

Modified: head/sys/netinet/tcp_timer.c
==============================================================================
--- head/sys/netinet/tcp_timer.c        Thu Oct 30 08:50:01 2014        
(r273849)
+++ head/sys/netinet/tcp_timer.c        Thu Oct 30 08:53:56 2014        
(r273850)
@@ -243,7 +243,7 @@ tcp_slowtimo(void)
        VNET_LIST_RLOCK_NOSLEEP();
        VNET_FOREACH(vnet_iter) {
                CURVNET_SET(vnet_iter);
-               tcp_tw_2msl_scan();
+               (void) tcp_tw_2msl_scan(0);
                CURVNET_RESTORE();
        }
        VNET_LIST_RUNLOCK_NOSLEEP();

Modified: head/sys/netinet/tcp_timer.h
==============================================================================
--- head/sys/netinet/tcp_timer.h        Thu Oct 30 08:50:01 2014        
(r273849)
+++ head/sys/netinet/tcp_timer.h        Thu Oct 30 08:53:56 2014        
(r273850)
@@ -178,8 +178,7 @@ extern int tcp_fast_finwait2_recycle;
 void   tcp_timer_init(void);
 void   tcp_timer_2msl(void *xtp);
 struct tcptw *
-       tcp_tw_2msl_reuse(void);        /* XXX temporary? */
-void   tcp_tw_2msl_scan(void);
+       tcp_tw_2msl_scan(int reuse);    /* XXX temporary? */
 void   tcp_timer_keep(void *xtp);
 void   tcp_timer_persist(void *xtp);
 void   tcp_timer_rexmt(void *xtp);

Modified: head/sys/netinet/tcp_timewait.c
==============================================================================
--- head/sys/netinet/tcp_timewait.c     Thu Oct 30 08:50:01 2014        
(r273849)
+++ head/sys/netinet/tcp_timewait.c     Thu Oct 30 08:53:56 2014        
(r273850)
@@ -49,7 +49,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/random.h>
-#include <sys/refcount.h>
 
 #include <vm/uma.h>
 
@@ -101,6 +100,11 @@ static int maxtcptw;
  * currently in the TIME_WAIT state.  The queue pointers, including the
  * queue pointers in each tcptw structure, are protected using the global
  * timewait lock, which must be held over queue iteration and modification.
+ *
+ * Rules on tcptw usage:
+ *  - a inpcb is always freed _after_ its tcptw
+ *  - a tcptw relies on its inpcb reference counting for memory stability
+ *  - a tcptw is dereferenceable only while its inpcb is locked
  */
 static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
 #define        V_twq_2msl              VNET(twq_2msl)
@@ -124,32 +128,6 @@ static void        tcp_tw_2msl_reset(struct tcp
 static void    tcp_tw_2msl_stop(struct tcptw *, int);
 static int     tcp_twrespond(struct tcptw *, int);
 
-/*
- * tw_pcbref() bumps the reference count on an tw in order to maintain
- * stability of an tw pointer despite the tw lock being released.
- */
-static void
-tw_pcbref(struct tcptw *tw)
-{
-
-       KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
-       refcount_acquire(&tw->tw_refcount);
-}
-
-/*
- * Drop a refcount on an tw elevated using tw_pcbref().
- */
-static int
-tw_pcbrele(struct tcptw *tw)
-{
-
-       KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
-       if (!refcount_release(&tw->tw_refcount))
-               return (0);
-       uma_zfree(V_tcptw_zone, tw);
-       return (1);
-}
-
 static int
 tcptw_auto_size(void)
 {
@@ -279,8 +257,11 @@ tcp_twstart(struct tcpcb *tp)
                 * Reached limit on total number of TIMEWAIT connections
                 * allowed. Remove a connection from TIMEWAIT queue in LRU
                 * fashion to make room for this connection.
+                *
+                * pcbinfo lock is needed here to prevent deadlock as
+                * two inpcb locks can be acquired simultaneously.
                 */
-               tw = tcp_tw_2msl_reuse();
+               tw = tcp_tw_2msl_scan(1);
                if (tw == NULL) {
                        tp = tcp_close(tp);
                        if (tp != NULL)
@@ -288,8 +269,12 @@ tcp_twstart(struct tcpcb *tp)
                        return;
                }
        }
+       /*
+        * The tcptw will hold a reference on its inpcb until tcp_twclose
+        * is called
+        */
        tw->tw_inpcb = inp;
-       refcount_init(&tw->tw_refcount, 1);
+       in_pcbref(inp); /* Reference from tw */
 
        /*
         * Recover last window size sent.
@@ -479,7 +464,6 @@ tcp_twclose(struct tcptw *tw, int reuse)
        INP_INFO_WLOCK_ASSERT(&V_tcbinfo);      /* in_pcbfree() */
        INP_WLOCK_ASSERT(inp);
 
-       tw->tw_inpcb = NULL;
        tcp_tw_2msl_stop(tw, reuse);
        inp->inp_ppcb = NULL;
        in_pcbdrop(inp);
@@ -509,8 +493,13 @@ tcp_twclose(struct tcptw *tw, int reuse)
                         */
                        INP_WUNLOCK(inp);
                }
-       } else
+       } else {
+               /*
+                * The socket has been already cleaned-up for us, only free the
+                * inpcb.
+                */
                in_pcbfree(inp);
+       }
        TCPSTAT_INC(tcps_closed);
 }
 
@@ -641,71 +630,94 @@ tcp_tw_2msl_reset(struct tcptw *tw, int 
 static void
 tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
 {
+       struct ucred *cred;
+       struct inpcb *inp;
+       int released;
 
        INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
        TW_WLOCK(V_tw_lock);
+       inp = tw->tw_inpcb;
+       tw->tw_inpcb = NULL;
+
        TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
-       crfree(tw->tw_cred);
+       cred = tw->tw_cred;
        tw->tw_cred = NULL;
        TW_WUNLOCK(V_tw_lock);
 
+       if (cred != NULL)
+               crfree(cred);
+
+       released = in_pcbrele_wlocked(inp);
+       KASSERT(!released, ("%s: inp should not be released here", __func__));
+
        if (!reuse)
-               tw_pcbrele(tw);
+               uma_zfree(V_tcptw_zone, tw);
 }
 
 struct tcptw *
-tcp_tw_2msl_reuse(void)
+tcp_tw_2msl_scan(int reuse)
 {
        struct tcptw *tw;
+       struct inpcb *inp;
 
-       INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
-
-       TW_WLOCK(V_tw_lock);
-       tw = TAILQ_FIRST(&V_twq_2msl);
-       if (tw == NULL) {
-               TW_WUNLOCK(V_tw_lock);
-               return NULL;
+#ifdef INVARIANTS
+       if (reuse) {
+               /*
+                * pcbinfo lock is needed in reuse case to prevent deadlock
+                * as two inpcb locks can be acquired simultaneously:
+                *  - the inpcb transitioning to TIME_WAIT state in
+                *    tcp_tw_start(),
+                *  - the inpcb closed by tcp_twclose().
+                */
+               INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
        }
-       TW_WUNLOCK(V_tw_lock);
-
-       INP_WLOCK(tw->tw_inpcb);
-       tcp_twclose(tw, 1);
-
-       return (tw);
-}
-
-void
-tcp_tw_2msl_scan(void)
-{
-       struct tcptw *tw;
+#endif
 
        for (;;) {
                TW_RLOCK(V_tw_lock);
                tw = TAILQ_FIRST(&V_twq_2msl);
-               if (tw == NULL || tw->tw_time - ticks > 0) {
+               if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) {
                        TW_RUNLOCK(V_tw_lock);
                        break;
                }
-               tw_pcbref(tw);
+               KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL",
+                   __func__));
+
+               inp = tw->tw_inpcb;
+               in_pcbref(inp);
                TW_RUNLOCK(V_tw_lock);
 
-               /* Close timewait state */
                if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
-                       if (tw_pcbrele(tw)) {
+
+                       INP_WLOCK(inp);
+                       tw = intotw(inp);
+                       if (in_pcbrele_wlocked(inp)) {
+                               KASSERT(tw == NULL, ("%s: held last inp "
+                                   "reference but tw not NULL", __func__));
                                INP_INFO_WUNLOCK(&V_tcbinfo);
                                continue;
                        }
 
-                       KASSERT(tw->tw_inpcb != NULL,
-                           ("%s: tw->tw_inpcb == NULL", __func__));
-                       INP_WLOCK(tw->tw_inpcb);
-                       tcp_twclose(tw, 0);
+                       if (tw == NULL) {
+                               /* tcp_twclose() has already been called */
+                               INP_WUNLOCK(inp);
+                               INP_INFO_WUNLOCK(&V_tcbinfo);
+                               continue;
+                       }
+
+                       tcp_twclose(tw, reuse);
                        INP_INFO_WUNLOCK(&V_tcbinfo);
+                       if (reuse)
+                           return tw;
                } else {
-                       /* INP_INFO lock is busy; continue later. */
-                       tw_pcbrele(tw);
+                       /* INP_INFO lock is busy, continue later. */
+                       INP_WLOCK(inp);
+                       if (!in_pcbrele_wlocked(inp))
+                               INP_WUNLOCK(inp);
                        break;
                }
        }
+
+       return NULL;
 }

Modified: head/sys/netinet/tcp_usrreq.c
==============================================================================
--- head/sys/netinet/tcp_usrreq.c       Thu Oct 30 08:50:01 2014        
(r273849)
+++ head/sys/netinet/tcp_usrreq.c       Thu Oct 30 08:53:56 2014        
(r273850)
@@ -183,6 +183,21 @@ tcp_detach(struct socket *so, struct inp
                 * present until timewait ends.
                 *
                 * XXXRW: Would it be cleaner to free the tcptw here?
+                *
+                * Astute question indeed, from twtcp perspective there are
+                * three cases to consider:
+                *
+                * #1 tcp_detach is called at tcptw creation time by
+                *  tcp_twstart, then do not discard the newly created tcptw
+                *  and leave inpcb present until timewait ends
+                * #2 tcp_detach is called at timewait end (or reuse) by
+                *  tcp_twclose, then the tcptw has already been discarded
+                *  and inpcb is freed here
+                * #3 tcp_detach is called() after timewait ends (or reuse)
+                *  (e.g. by soclose), then tcptw has already been discarded
+                *  and inpcb is freed here
+                *
+                *  In all three cases the tcptw should not be freed here.
                 */
                if (inp->inp_flags & INP_DROPPED) {
                        KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "

Modified: head/sys/netinet/tcp_var.h
==============================================================================
--- head/sys/netinet/tcp_var.h  Thu Oct 30 08:50:01 2014        (r273849)
+++ head/sys/netinet/tcp_var.h  Thu Oct 30 08:53:56 2014        (r273850)
@@ -358,7 +358,6 @@ struct tcptw {
        u_int           t_starttime;
        int             tw_time;
        TAILQ_ENTRY(tcptw) tw_2msl;
-       u_int           tw_refcount;    /* refcount */
 };
 
 #define        intotcpcb(ip)   ((struct tcpcb *)(ip)->inp_ppcb)
@@ -651,7 +650,7 @@ struct tcpcb *
         tcp_close(struct tcpcb *);
 void    tcp_discardcb(struct tcpcb *);
 void    tcp_twstart(struct tcpcb *);
-void    tcp_twclose(struct tcptw *_tw, int _reuse);
+void    tcp_twclose(struct tcptw *, int);
 void    tcp_ctlinput(int, struct sockaddr *, void *);
 int     tcp_ctloutput(struct socket *, struct sockopt *);
 struct tcpcb *
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to