On Sun, May 28, 2017 at 04:22:20PM +0200, Claudio Jeker wrote:
> This is a start at pushing locking down into the socket layer.  Use a
> SRPL list to manage the route PCBs and start running stuff without the
> kernel lock where save. Includes some cleanup since rawcb was removed in
> along the way.
> 
> The goal is to make the block running without the kernel lock larger and
> larger.

Better diff. Keeping an extra reference (refcnt_take) in the SRPL_FOREACH
loop since both the current and previous (last) element are used in the loop.
Also release the rw_lock before calling refcnt_finalize() since sleeping
with that lock set could lead to a deadlock.

-- 
:wq Claudio

Index: net/raw_cb.c
===================================================================
RCS file: /cvs/src/sys/net/raw_cb.c,v
retrieving revision 1.11
diff -u -p -r1.11 raw_cb.c
--- net/raw_cb.c        24 Jan 2017 10:08:30 -0000      1.11
+++ net/raw_cb.c        28 May 2017 07:34:19 -0000
@@ -46,16 +46,10 @@
 
 /*
  * Routines to manage the raw protocol control blocks.
- *
- * TODO:
- *     hash lookups by protocol family/protocol + address family
- *     take care of unique address problems per AF?
- *     redo address binding to allow wildcards
  */
 
 u_long raw_sendspace = RAWSNDQ;
 u_long raw_recvspace = RAWRCVQ;
-struct rawcbhead rawcb;
 
 /*
  * Allocate a control block and a nominal amount
@@ -72,14 +66,13 @@ raw_attach(struct socket *so, int proto)
         * after space has been allocated for the
         * rawcb.
         */
-       if (rp == 0)
+       if (rp == NULL)
                return (ENOBUFS);
        if ((error = soreserve(so, raw_sendspace, raw_recvspace)) != 0)
                return (error);
        rp->rcb_socket = so;
        rp->rcb_proto.sp_family = so->so_proto->pr_domain->dom_family;
        rp->rcb_proto.sp_protocol = proto;
-       LIST_INSERT_HEAD(&rawcb, rp, rcb_list);
        return (0);
 }
 
@@ -94,7 +87,6 @@ raw_detach(struct rawcb *rp)
 
        so->so_pcb = 0;
        sofree(so);
-       LIST_REMOVE(rp, rcb_list);
        free((caddr_t)(rp), M_PCB, 0);
 }
 
@@ -104,7 +96,6 @@ raw_detach(struct rawcb *rp)
 void
 raw_disconnect(struct rawcb *rp)
 {
-
        if (rp->rcb_socket->so_state & SS_NOFDREF)
                raw_detach(rp);
 }
Index: net/raw_cb.h
===================================================================
RCS file: /cvs/src/sys/net/raw_cb.h,v
retrieving revision 1.11
diff -u -p -r1.11 raw_cb.h
--- net/raw_cb.h        23 Jan 2017 16:31:24 -0000      1.11
+++ net/raw_cb.h        27 May 2017 19:36:17 -0000
@@ -40,7 +40,6 @@
  * to tie a socket to the generic raw interface.
  */
 struct rawcb {
-       LIST_ENTRY(rawcb) rcb_list;     /* doubly linked list */
        struct  socket *rcb_socket;     /* back pointer to socket */
        struct  sockaddr *rcb_faddr;    /* destination address */
        struct  sockaddr *rcb_laddr;    /* socket's address */
@@ -54,8 +53,6 @@ struct rawcb {
 #define        RAWRCVQ         8192
 
 #ifdef _KERNEL
-
-extern LIST_HEAD(rawcbhead, rawcb) rawcb;              /* head of list */
 
 #define        sotorawcb(so)           ((struct rawcb *)(so)->so_pcb)
 int     raw_attach(struct socket *, int);
Index: net/raw_usrreq.c
===================================================================
RCS file: /cvs/src/sys/net/raw_usrreq.c,v
retrieving revision 1.31
diff -u -p -r1.31 raw_usrreq.c
--- net/raw_usrreq.c    13 Mar 2017 20:18:21 -0000      1.31
+++ net/raw_usrreq.c    28 May 2017 07:32:16 -0000
@@ -45,15 +45,6 @@
 #include <net/raw_cb.h>
 
 #include <sys/stdarg.h>
-/*
- * Initialize raw connection block q.
- */
-void
-raw_init(void)
-{
-
-       LIST_INIT(&rawcb);
-}
 
 int
 raw_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
@@ -71,7 +62,7 @@ raw_usrreq(struct socket *so, int req, s
                m_freem(m);
                return (EOPNOTSUPP);
        }
-       if (rp == 0) {
+       if (rp == NULL) {
                m_freem(m);
                return (EINVAL);
        }
@@ -81,10 +72,6 @@ raw_usrreq(struct socket *so, int req, s
         * Flush data or not depending on the options.
         */
        case PRU_DETACH:
-               if (rp == 0) {
-                       error = ENOTCONN;
-                       break;
-               }
                raw_detach(rp);
                break;
 
Index: net/rtsock.c
===================================================================
RCS file: /cvs/src/sys/net/rtsock.c,v
retrieving revision 1.237
diff -u -p -r1.237 rtsock.c
--- net/rtsock.c        19 Apr 2017 15:21:54 -0000      1.237
+++ net/rtsock.c        29 May 2017 08:41:01 -0000
@@ -70,6 +70,7 @@
 #include <sys/socketvar.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
+#include <sys/srp.h>
 
 #include <net/if.h>
 #include <net/if_dl.h>
@@ -98,6 +99,9 @@ struct walkarg {
        caddr_t w_where, w_tmem;
 };
 
+void   route_prinit(void);
+void   route_ref(void *, void *);
+void   route_unref(void *, void *);
 int    route_output(struct mbuf *, struct socket *, struct sockaddr *,
            struct mbuf *);
 int    route_ctloutput(int, struct socket *, int, int, struct mbuf *);
@@ -126,19 +130,21 @@ int                sysctl_ifnames(struct walkarg *);
 int             sysctl_rtable_rtstat(void *, size_t *, void *);
 
 struct routecb {
-       struct rawcb    rcb;
-       struct timeout  timeout;
-       unsigned int    msgfilter;
-       unsigned int    flags;
-       u_int           rtableid;
+       struct rawcb            rcb;
+       SRPL_ENTRY(routecb)     rcb_list;
+       struct refcnt           refcnt;
+       struct timeout          timeout;
+       unsigned int            msgfilter;
+       unsigned int            flags;
+       u_int                   rtableid;
 };
 #define        sotoroutecb(so) ((struct routecb *)(so)->so_pcb)
 
 struct route_cb {
-       int             ip_count;
-       int             ip6_count;
-       int             mpls_count;
-       int             any_count;
+       SRPL_HEAD(, routecb)    rcb;
+       struct srpl_rc          rcb_rc;
+       struct rwlock           rcb_lk;
+       unsigned int            any_count;
 };
 
 struct route_cb route_cb;
@@ -154,45 +160,70 @@ struct route_cb route_cb;
 
 #define ROUTE_DESYNC_RESEND_TIMEOUT    (hz / 5)        /* In hz */
 
+void
+route_prinit(void)
+{
+       srpl_rc_init(&route_cb.rcb_rc, route_ref, route_unref, NULL);
+       rw_init(&route_cb.rcb_lk, "rtsock");
+       SRPL_INIT(&route_cb.rcb);
+}
+
+void
+route_ref(void *null, void *v)
+{
+       struct routecb *rop = v;
+
+       refcnt_take(&rop->refcnt);
+}
+
+void
+route_unref(void *null, void *v)
+{
+       struct routecb *rop = v;
+       
+       refcnt_rele_wake(&rop->refcnt);
+}
+
 int
 route_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
     struct mbuf *control, struct proc *p)
 {
-       struct rawcb    *rp;
        struct routecb  *rop;
-       int              af;
        int              error = 0;
 
-       rp = sotorawcb(so);
+       rop = sotoroutecb(so);
+       if (rop == NULL)
+               return ENOTCONN;
 
        switch (req) {
        case PRU_RCVD:
-               rop = (struct routecb *)rp;
-
                /*
                 * If we are in a FLUSH state, check if the buffer is
                 * empty so that we can clear the flag.
                 */
                if (((rop->flags & ROUTECB_FLAG_FLUSH) != 0) &&
-                   ((sbspace(&rp->rcb_socket->so_rcv) ==
-                   rp->rcb_socket->so_rcv.sb_hiwat)))
+                   ((sbspace(&rop->rcb.rcb_socket->so_rcv) ==
+                   rop->rcb.rcb_socket->so_rcv.sb_hiwat)))
                        rop->flags &= ~ROUTECB_FLAG_FLUSH;
                break;
 
        case PRU_DETACH:
-               if (rp) {
-                       timeout_del(&((struct routecb *)rp)->timeout);
-                       af = rp->rcb_proto.sp_protocol;
-                       if (af == AF_INET)
-                               route_cb.ip_count--;
-                       else if (af == AF_INET6)
-                               route_cb.ip6_count--;
-#ifdef MPLS
-                       else if (af == AF_MPLS)
-                               route_cb.mpls_count--;
-#endif
-                       route_cb.any_count--;
-               }
+               KERNEL_UNLOCK();
+
+               error = rw_enter(&route_cb.rcb_lk, RW_WRITE | RW_INTR);
+               if (error != 0)
+                       break;
+
+               timeout_del(&rop->timeout);
+               route_cb.any_count--;
+               SRPL_REMOVE_LOCKED(&route_cb.rcb_rc, &route_cb.rcb,
+                   rop, routecb, rcb_list);
+
+               rw_exit(&route_cb.rcb_lk);
+               /* wait for all references to drop */
+               refcnt_finalize(&rop->refcnt, "rtsockrefs");
+               KERNEL_LOCK();
+
                /* FALLTHROUGH */
        default:
                error = raw_usrreq(so, req, m, nam, control, p);
@@ -206,7 +237,6 @@ route_attach(struct socket *so, int prot
 {
        struct rawcb    *rp;
        struct routecb  *rop;
-       int              af;
        int              error = 0;
 
        /*
@@ -219,6 +249,7 @@ route_attach(struct socket *so, int prot
        so->so_pcb = rp;
        /* Init the timeout structure */
        timeout_set(&rop->timeout, route_senddesync, rp);
+       refcnt_init(&rop->refcnt);
 
        if (curproc == NULL)
                error = EACCES;
@@ -228,20 +259,27 @@ route_attach(struct socket *so, int prot
                free(rop, M_PCB, sizeof(struct routecb));
                return (error);
        }
+
+       KERNEL_UNLOCK();
+
        rop->rtableid = curproc->p_p->ps_rtableid;
-       af = rp->rcb_proto.sp_protocol;
-       if (af == AF_INET)
-               route_cb.ip_count++;
-       else if (af == AF_INET6)
-               route_cb.ip6_count++;
-#ifdef MPLS
-       else if (af == AF_MPLS)
-               route_cb.mpls_count++;
-#endif
        rp->rcb_faddr = &route_src;
+
+       error = rw_enter(&route_cb.rcb_lk, RW_WRITE | RW_INTR);
+       if (error != 0) {
+               free(rop, M_PCB, sizeof(struct routecb));
+               return (error);
+       }
+               
+       SRPL_INSERT_HEAD_LOCKED(&route_cb.rcb_rc, &route_cb.rcb, rop, rcb_list);
        route_cb.any_count++;
-       soisconnected(so);
+
+       rw_exit(&route_cb.rcb_lk);
+
+       KERNEL_LOCK();
+
        so->so_options |= SO_USELOOPBACK;
+       soisconnected(so);
 
        return (error);
 }
@@ -347,6 +385,7 @@ route_input(struct mbuf *m0, struct sock
        int sockets = 0;
        struct socket *last = NULL;
        struct sockaddr *sosrc, *sodst;
+       struct srp_ref sr;
 
        KERNEL_ASSERT_LOCKED();
 
@@ -359,7 +398,8 @@ route_input(struct mbuf *m0, struct sock
                return;
        }
 
-       LIST_FOREACH(rp, &rawcb, rcb_list) {
+       SRPL_FOREACH(rop, &sr, &route_cb.rcb, rcb_list) {
+               rp = &rop->rcb;
                if (rp->rcb_socket->so_state & SS_CANTRCVMORE)
                        continue;
                if (rp->rcb_proto.sp_family != PF_ROUTE)
@@ -393,7 +433,6 @@ route_input(struct mbuf *m0, struct sock
                        continue;
 
                /* filter messages that the process does not want */
-               rop = (struct routecb *)rp;
                rtm = mtod(m, struct rt_msghdr *);
                /* but RTM_DESYNC can't be filtered */
                if (rtm->rtm_type != RTM_DESYNC && rop->msgfilter != 0 &&
@@ -448,7 +487,10 @@ route_input(struct mbuf *m0, struct sock
                                        sockets++;
                                }
                        }
+                       refcnt_rele_wake(&sotoroutecb(last)->refcnt);
                }
+               /* keep a reference for last */
+               refcnt_take(&rop->refcnt);
                last = rp->rcb_socket;
        }
        if (last) {
@@ -464,8 +506,11 @@ route_input(struct mbuf *m0, struct sock
                        sorwakeup(last);
                        sockets++;
                }
+               refcnt_rele_wake(&sotoroutecb(last)->refcnt);
        } else
                m_freem(m);
+
+       SRPL_LEAVE(&sr);
 }
 
 struct rt_msghdr *
@@ -1781,7 +1826,7 @@ struct protosw routesw[] = {
   .pr_ctloutput        = route_ctloutput,
   .pr_usrreq   = route_usrreq,
   .pr_attach   = route_attach,
-  .pr_init     = raw_init,
+  .pr_init     = route_prinit,
   .pr_sysctl   = sysctl_rtable
 }
 };

Reply via email to