Author: mmacy
Date: Tue Jun 19 01:54:00 2018
New Revision: 335356
URL: https://svnweb.freebsd.org/changeset/base/335356

Log:
  convert inpcbinfo hash and info rwlocks to epoch + mutex
  
  - Convert inpcbinfo info & hash locks to epoch for read and mutex for write
  - Garbage collect code that handled INP_INFO_TRY_RLOCK failures as
    INP_INFO_RLOCK which can no longer fail
  
  When running 64 netperfs sending minimal sized packets on a 2x8x2 reduces
  unhalted core cycles samples in rwlock rlock/runlock in udp_send from 51% to
  3%.
  
  Overall packet throughput rate limited by CPU affinity and NIC driver design
  choices.
  
  On the receiver unhalted core cycles samples in in_pcblookup_hash went from
  13% to to 1.6%
  
  Tested by LLNW and pho@
  
  Reviewed by: jtl
  Sponsored by: Limelight Networks
  Differential Revision: https://reviews.freebsd.org/D15686

Modified:
  head/sys/kern/subr_witness.c
  head/sys/netinet/in_pcb.h
  head/sys/netinet/tcp_hpts.c
  head/sys/netinet/tcp_input.c
  head/sys/netinet/tcp_stacks/rack.c
  head/sys/netinet/tcp_timewait.c

Modified: head/sys/kern/subr_witness.c
==============================================================================
--- head/sys/kern/subr_witness.c        Tue Jun 19 01:33:03 2018        
(r335355)
+++ head/sys/kern/subr_witness.c        Tue Jun 19 01:54:00 2018        
(r335356)
@@ -561,14 +561,14 @@ static struct witness_order_list_entry order_lists[] =
        /*
         * UDP/IP
         */
-       { "udp", &lock_class_rw },
+       { "udp", &lock_class_mtx_sleep },
        { "udpinp", &lock_class_rw },
        { "so_snd", &lock_class_mtx_sleep },
        { NULL, NULL },
        /*
         * TCP/IP
         */
-       { "tcp", &lock_class_rw },
+       { "tcp", &lock_class_mtx_sleep },
        { "tcpinp", &lock_class_rw },
        { "so_snd", &lock_class_mtx_sleep },
        { NULL, NULL },

Modified: head/sys/netinet/in_pcb.h
==============================================================================
--- head/sys/netinet/in_pcb.h   Tue Jun 19 01:33:03 2018        (r335355)
+++ head/sys/netinet/in_pcb.h   Tue Jun 19 01:54:00 2018        (r335356)
@@ -51,6 +51,8 @@
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <net/vnet.h>
+#include <net/if.h>
+#include <net/if_var.h>
 #include <vm/uma.h>
 #endif
 #include <sys/ck.h>
@@ -157,6 +159,7 @@ struct in_conninfo {
  * Key:
  * (b) - Protected by the hpts lock.
  * (c) - Constant after initialization
+ * (e) - Protected by the net_epoch_prempt epoch
  * (g) - Protected by the pcbgroup lock
  * (i) - Protected by the inpcb lock
  * (p) - Protected by the pcbinfo lock for the inpcb
@@ -231,7 +234,7 @@ struct inpcbpolicy;
 struct m_snd_tag;
 struct inpcb {
        /* Cache line #1 (amd64) */
-       CK_LIST_ENTRY(inpcb) inp_hash;  /* (h/i) hash list */
+       CK_LIST_ENTRY(inpcb) inp_hash;  /* [w](h/i) [r](e/i)  hash list */
        CK_LIST_ENTRY(inpcb) inp_pcbgrouphash;  /* (g/i) hash list */
        struct rwlock   inp_lock;
        /* Cache line #2 (amd64) */
@@ -324,8 +327,8 @@ struct inpcb {
                struct route_in6 inp_route6;
        };
        CK_LIST_ENTRY(inpcb) inp_list;  /* (p/l) list for all PCBs for proto */
-                                       /* (p[w]) for list iteration */
-                                       /* (p[r]/l) for addition/removal */
+                                       /* (e[r]) for list iteration */
+                                       /* (p[w]/l) for addition/removal */
        struct epoch_context inp_epoch_ctx;
 };
 #endif /* _KERNEL */
@@ -436,22 +439,23 @@ struct in_pcblist {
  * Locking key:
  *
  * (c) Constant or nearly constant after initialisation
+ * (e) - Protected by the net_epoch_prempt epoch
  * (g) Locked by ipi_lock
  * (l) Locked by ipi_list_lock
- * (h) Read using either ipi_hash_lock or inpcb lock; write requires both
+ * (h) Read using either net_epoch_preempt or inpcb lock; write requires both 
ipi_hash_lock and inpcb lock
  * (p) Protected by one or more pcbgroup locks
  * (x) Synchronisation properties poorly defined
  */
 struct inpcbinfo {
        /*
-        * Global lock protecting full inpcb list traversal
+        * Global lock protecting inpcb list modification
         */
-       struct rwlock            ipi_lock;
+       struct mtx               ipi_lock;
 
        /*
         * Global list of inpcbs on the protocol.
         */
-       struct inpcbhead        *ipi_listhead;          /* (g/l) */
+       struct inpcbhead        *ipi_listhead;          /* [r](e) [w](g/l) */
        u_int                    ipi_count;             /* (l) */
 
        /*
@@ -482,9 +486,9 @@ struct inpcbinfo {
        u_int                    ipi_hashfields;        /* (c) */
 
        /*
-        * Global lock protecting non-pcbgroup hash lookup tables.
+        * Global lock protecting modification non-pcbgroup hash lookup tables.
         */
-       struct rwlock            ipi_hash_lock;
+       struct mtx               ipi_hash_lock;
 
        /*
         * Global hash of inpcbs, hashed by local and foreign addresses and
@@ -626,20 +630,18 @@ int       inp_so_options(const struct inpcb *inp);
 #endif /* _KERNEL */
 
 #define INP_INFO_LOCK_INIT(ipi, d) \
-       rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE)
-#define INP_INFO_LOCK_DESTROY(ipi)  rw_destroy(&(ipi)->ipi_lock)
-#define INP_INFO_RLOCK(ipi)    rw_rlock(&(ipi)->ipi_lock)
-#define INP_INFO_WLOCK(ipi)    rw_wlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_RLOCK(ipi)        rw_try_rlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_WLOCK(ipi)        rw_try_wlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_UPGRADE(ipi)      rw_try_upgrade(&(ipi)->ipi_lock)
-#define INP_INFO_WLOCKED(ipi)  rw_wowned(&(ipi)->ipi_lock)
-#define INP_INFO_RUNLOCK(ipi)  rw_runlock(&(ipi)->ipi_lock)
-#define INP_INFO_WUNLOCK(ipi)  rw_wunlock(&(ipi)->ipi_lock)
-#define        INP_INFO_LOCK_ASSERT(ipi)       rw_assert(&(ipi)->ipi_lock, 
RA_LOCKED)
-#define INP_INFO_RLOCK_ASSERT(ipi)     rw_assert(&(ipi)->ipi_lock, RA_RLOCKED)
-#define INP_INFO_WLOCK_ASSERT(ipi)     rw_assert(&(ipi)->ipi_lock, RA_WLOCKED)
-#define INP_INFO_UNLOCK_ASSERT(ipi)    rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED)
+       mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE)
+#define INP_INFO_LOCK_DESTROY(ipi)  mtx_destroy(&(ipi)->ipi_lock)
+#define INP_INFO_RLOCK(ipi)    NET_EPOCH_ENTER()
+#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock)
+#define INP_INFO_TRY_WLOCK(ipi)        mtx_trylock(&(ipi)->ipi_lock)
+#define INP_INFO_WLOCKED(ipi)  mtx_owned(&(ipi)->ipi_lock)
+#define INP_INFO_RUNLOCK(ipi)  NET_EPOCH_EXIT()
+#define INP_INFO_WUNLOCK(ipi)  mtx_unlock(&(ipi)->ipi_lock)
+#define        INP_INFO_LOCK_ASSERT(ipi)       MPASS(in_epoch() || 
mtx_owned(&(ipi)->ipi_lock))
+#define INP_INFO_RLOCK_ASSERT(ipi)     MPASS(in_epoch())
+#define INP_INFO_WLOCK_ASSERT(ipi)     mtx_assert(&(ipi)->ipi_lock, MA_OWNED)
+#define INP_INFO_UNLOCK_ASSERT(ipi)    MPASS(!in_epoch() && 
!mtx_owned(&(ipi)->ipi_lock))
 
 #define INP_LIST_LOCK_INIT(ipi, d) \
         rw_init_flags(&(ipi)->ipi_list_lock, (d), 0)
@@ -660,17 +662,14 @@ int       inp_so_options(const struct inpcb *inp);
 #define INP_LIST_UNLOCK_ASSERT(ipi) \
        rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED)
 
-#define        INP_HASH_LOCK_INIT(ipi, d) \
-       rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0)
-#define        INP_HASH_LOCK_DESTROY(ipi)      
rw_destroy(&(ipi)->ipi_hash_lock)
-#define        INP_HASH_RLOCK(ipi)             rw_rlock(&(ipi)->ipi_hash_lock)
-#define        INP_HASH_WLOCK(ipi)             rw_wlock(&(ipi)->ipi_hash_lock)
-#define        INP_HASH_RUNLOCK(ipi)           
rw_runlock(&(ipi)->ipi_hash_lock)
-#define        INP_HASH_WUNLOCK(ipi)           
rw_wunlock(&(ipi)->ipi_hash_lock)
-#define        INP_HASH_LOCK_ASSERT(ipi)       
rw_assert(&(ipi)->ipi_hash_lock, \
-                                           RA_LOCKED)
-#define        INP_HASH_WLOCK_ASSERT(ipi)      
rw_assert(&(ipi)->ipi_hash_lock, \
-                                           RA_WLOCKED)
+#define        INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), 
NULL, MTX_DEF)
+#define        INP_HASH_LOCK_DESTROY(ipi)      
mtx_destroy(&(ipi)->ipi_hash_lock)
+#define        INP_HASH_RLOCK(ipi)             NET_EPOCH_ENTER()
+#define        INP_HASH_WLOCK(ipi)             mtx_lock(&(ipi)->ipi_hash_lock)
+#define        INP_HASH_RUNLOCK(ipi)           NET_EPOCH_EXIT()
+#define        INP_HASH_WUNLOCK(ipi)           
mtx_unlock(&(ipi)->ipi_hash_lock)
+#define        INP_HASH_LOCK_ASSERT(ipi)       MPASS(in_epoch() || 
mtx_owned(&(ipi)->ipi_hash_lock))
+#define        INP_HASH_WLOCK_ASSERT(ipi)      
mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED);
 
 #define        INP_GROUP_LOCK_INIT(ipg, d)     mtx_init(&(ipg)->ipg_lock, (d), 
NULL, \
                                            MTX_DEF | MTX_DUPOK)

Modified: head/sys/netinet/tcp_hpts.c
==============================================================================
--- head/sys/netinet/tcp_hpts.c Tue Jun 19 01:33:03 2018        (r335355)
+++ head/sys/netinet/tcp_hpts.c Tue Jun 19 01:54:00 2018        (r335356)
@@ -184,9 +184,6 @@ TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_
 
 static struct tcp_hptsi tcp_pace;
 
-static int
-tcp_hptsi_lock_inpinfo(struct inpcb *inp,
-    struct tcpcb **tp);
 static void tcp_wakehpts(struct tcp_hpts_entry *p);
 static void tcp_wakeinput(struct tcp_hpts_entry *p);
 static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
@@ -498,59 +495,6 @@ SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE
     0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");
 
 
-/*
- * Try to get the INP_INFO lock.
- *
- * This function always succeeds in getting the lock. It will clear
- * *tpp and return (1) if something critical changed while the inpcb
- * was unlocked. Otherwise, it will leave *tpp unchanged and return (0).
- *
- * This function relies on the fact that the hpts always holds a
- * reference on the inpcb while the segment is on the hptsi wheel and
- * in the input queue.
- *
- */
-static int
-tcp_hptsi_lock_inpinfo(struct inpcb *inp, struct tcpcb **tpp)
-{
-       struct tcp_function_block *tfb;
-       struct tcpcb *tp;
-       void *ptr;
-
-       /* Try the easy way. */
-       if (INP_INFO_TRY_RLOCK(&V_tcbinfo))
-               return (0);
-
-       /*
-        * OK, let's try the hard way. We'll save the function pointer block
-        * to make sure that doesn't change while we aren't holding the
-        * lock.
-        */
-       tp = *tpp;
-       tfb = tp->t_fb;
-       ptr = tp->t_fb_ptr;
-       INP_WUNLOCK(inp);
-       INP_INFO_RLOCK(&V_tcbinfo);
-       INP_WLOCK(inp);
-       /* If the session went away, return an error. */
-       if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
-           (inp->inp_flags2 & INP_FREED)) {
-               *tpp = NULL;
-               return (1);
-       }
-       /*
-        * If the function block or stack-specific data block changed,
-        * report an error.
-        */
-       tp = intotcpcb(inp);
-       if ((tp->t_fb != tfb) && (tp->t_fb_ptr != ptr)) {
-               *tpp = NULL;
-               return (1);
-       }
-       return (0);
-}
-
-
 static void
 tcp_wakehpts(struct tcp_hpts_entry *hpts)
 {
@@ -1290,10 +1234,7 @@ out:
                    (m->m_pkthdr.pace_lock == TI_RLOCKED ||
                    tp->t_state != TCPS_ESTABLISHED)) {
                        ti_locked = TI_RLOCKED;
-                       if (tcp_hptsi_lock_inpinfo(inp, &tp)) {
-                               CURVNET_RESTORE();
-                               goto out;
-                       }
+                       INP_INFO_RLOCK(&V_tcbinfo);
                        m = tp->t_in_pkt;
                }
                if (in_newts_every_tcb) {
@@ -1360,7 +1301,6 @@ out:
                                 */
                                if ((inp->inp_flags & (INP_TIMEWAIT | 
INP_DROPPED)) ||
                                    (inp->inp_flags2 & INP_FREED)) {
-                       out_free:
                                        while (m) {
                                                m_freem(m);
                                                m = n;
@@ -1376,8 +1316,7 @@ out:
                                if (ti_locked == TI_UNLOCKED &&
                                    (tp->t_state != TCPS_ESTABLISHED)) {
                                        ti_locked = TI_RLOCKED;
-                                       if (tcp_hptsi_lock_inpinfo(inp, &tp))
-                                               goto out_free;
+                                       INP_INFO_RLOCK(&V_tcbinfo);
                                }
                        }       /** end while(m) */
                }               /** end if ((m != NULL)  && (m == 
tp->t_in_pkt)) */

Modified: head/sys/netinet/tcp_input.c
==============================================================================
--- head/sys/netinet/tcp_input.c        Tue Jun 19 01:33:03 2018        
(r335355)
+++ head/sys/netinet/tcp_input.c        Tue Jun 19 01:54:00 2018        
(r335356)
@@ -960,25 +960,10 @@ findpcb:
         *
         * XXXRW: It may be time to rethink timewait locking.
         */
-relocked:
        if (inp->inp_flags & INP_TIMEWAIT) {
                if (ti_locked == TI_UNLOCKED) {
-                       if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
-                               in_pcbref(inp);
-                               INP_WUNLOCK(inp);
-                               INP_INFO_RLOCK(&V_tcbinfo);
-                               ti_locked = TI_RLOCKED;
-                               INP_WLOCK(inp);
-                               if (in_pcbrele_wlocked(inp)) {
-                                       inp = NULL;
-                                       goto findpcb;
-                               } else if (inp->inp_flags & INP_DROPPED) {
-                                       INP_WUNLOCK(inp);
-                                       inp = NULL;
-                                       goto findpcb;
-                               }
-                       } else
-                               ti_locked = TI_RLOCKED;
+                       INP_INFO_RLOCK();
+                       ti_locked = TI_RLOCKED;
                }
                INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 
@@ -1026,23 +1011,8 @@ relocked:
              (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) &&
               !IS_FASTOPEN(tp->t_flags)))) {
                if (ti_locked == TI_UNLOCKED) {
-                       if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
-                               in_pcbref(inp);
-                               INP_WUNLOCK(inp);
-                               INP_INFO_RLOCK(&V_tcbinfo);
-                               ti_locked = TI_RLOCKED;
-                               INP_WLOCK(inp);
-                               if (in_pcbrele_wlocked(inp)) {
-                                       inp = NULL;
-                                       goto findpcb;
-                               } else if (inp->inp_flags & INP_DROPPED) {
-                                       INP_WUNLOCK(inp);
-                                       inp = NULL;
-                                       goto findpcb;
-                               }
-                               goto relocked;
-                       } else
-                               ti_locked = TI_RLOCKED;
+                       INP_INFO_RLOCK();
+                       ti_locked = TI_RLOCKED;
                }
                INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
        }

Modified: head/sys/netinet/tcp_stacks/rack.c
==============================================================================
--- head/sys/netinet/tcp_stacks/rack.c  Tue Jun 19 01:33:03 2018        
(r335355)
+++ head/sys/netinet/tcp_stacks/rack.c  Tue Jun 19 01:54:00 2018        
(r335356)
@@ -6837,34 +6837,8 @@ rack_do_segment(struct mbuf *m, struct tcphdr *th, str
                 * Initial input (ACK to SYN-ACK etc)lets go ahead and get
                 * it processed
                 */
-               if (ti_locked != TI_RLOCKED && INP_INFO_TRY_RLOCK(&V_tcbinfo))
-                       ti_locked = TI_RLOCKED;
-               if (ti_locked != TI_RLOCKED) {
-                       inp = tp->t_inpcb;
-                       tfb = tp->t_fb;
-                       in_pcbref(inp);
-                       INP_WUNLOCK(inp);
-                       INP_INFO_RLOCK(&V_tcbinfo);
-                       ti_locked = TI_RLOCKED;
-                       INP_WLOCK(inp);
-                       if (in_pcbrele_wlocked(inp))
-                               inp = NULL;
-                       if (inp == NULL || (inp->inp_flags2 & INP_FREED) ||
-                           (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) {
-                               /* The TCPCB went away. Free the packet. */
-                               INP_INFO_RUNLOCK(&V_tcbinfo);
-                               if (inp)
-                                       INP_WUNLOCK(inp);
-                               m_freem(m);
-                               return;
-                       }
-                       /* If the stack changed, call the correct stack. */
-                       if (tp->t_fb != tfb) {
-                               tp->t_fb->tfb_tcp_do_segment(m, th, so, tp,
-                                   drop_hdrlen, tlen, iptos, ti_locked);
-                               return;
-                       }
-               }
+               INP_INFO_RLOCK();
+               ti_locked = TI_RLOCKED;
                tcp_get_usecs(&tv);
                rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
                    tlen, iptos, ti_locked, 0, &tv);

Modified: head/sys/netinet/tcp_timewait.c
==============================================================================
--- head/sys/netinet/tcp_timewait.c     Tue Jun 19 01:33:03 2018        
(r335355)
+++ head/sys/netinet/tcp_timewait.c     Tue Jun 19 01:54:00 2018        
(r335356)
@@ -707,54 +707,46 @@ tcp_tw_2msl_scan(int reuse)
                in_pcbref(inp);
                TW_RUNLOCK(V_tw_lock);
 
-               if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) {
-
-                       INP_WLOCK(inp);
-                       tw = intotw(inp);
-                       if (in_pcbrele_wlocked(inp)) {
-                               if (__predict_true(tw == NULL)) {
-                                       INP_INFO_RUNLOCK(&V_tcbinfo);
-                                       continue;
-                               } else {
-                                       /* This should not happen as in TIMEWAIT
-                                        * state the inp should not be destroyed
-                                        * before its tcptw. If INVARIANTS is
-                                        * defined panic.
-                                        */
+               INP_INFO_RLOCK(&V_tcbinfo);
+               INP_WLOCK(inp);
+               tw = intotw(inp);
+               if (in_pcbrele_wlocked(inp)) {
+                       if (__predict_true(tw == NULL)) {
+                               INP_INFO_RUNLOCK(&V_tcbinfo);
+                               continue;
+                       } else {
+                               /* This should not happen as in TIMEWAIT
+                                * state the inp should not be destroyed
+                                * before its tcptw. If INVARIANTS is
+                                * defined panic.
+                                */
 #ifdef INVARIANTS
-                                       panic("%s: Panic before an infinite "
-                                           "loop: INP_TIMEWAIT && (INP_FREED "
-                                           "|| inp last reference) && tw != "
-                                           "NULL", __func__);
+                               panic("%s: Panic before an infinite "
+                                         "loop: INP_TIMEWAIT && (INP_FREED "
+                                         "|| inp last reference) && tw != "
+                                         "NULL", __func__);
 #else
-                                       log(LOG_ERR, "%s: Avoid an infinite "
-                                           "loop: INP_TIMEWAIT && (INP_FREED "
-                                           "|| inp last reference) && tw != "
-                                           "NULL", __func__);
+                               log(LOG_ERR, "%s: Avoid an infinite "
+                                       "loop: INP_TIMEWAIT && (INP_FREED "
+                                       "|| inp last reference) && tw != "
+                                       "NULL", __func__);
 #endif
-                                       INP_INFO_RUNLOCK(&V_tcbinfo);
-                                       break;
-                               }
-                       }
-
-                       if (tw == NULL) {
-                               /* tcp_twclose() has already been called */
-                               INP_WUNLOCK(inp);
                                INP_INFO_RUNLOCK(&V_tcbinfo);
-                               continue;
+                               break;
                        }
+               }
 
-                       tcp_twclose(tw, reuse);
+               if (tw == NULL) {
+                       /* tcp_twclose() has already been called */
+                       INP_WUNLOCK(inp);
                        INP_INFO_RUNLOCK(&V_tcbinfo);
-                       if (reuse)
-                           return tw;
-               } else {
-                       /* INP_INFO lock is busy, continue later. */
-                       INP_WLOCK(inp);
-                       if (!in_pcbrele_wlocked(inp))
-                               INP_WUNLOCK(inp);
-                       break;
+                       continue;
                }
+
+               tcp_twclose(tw, reuse);
+               INP_INFO_RUNLOCK(&V_tcbinfo);
+               if (reuse)
+                       return tw;
        }
 
        return NULL;
_______________________________________________
[email protected] mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "[email protected]"

Reply via email to