On Sun, May 28, 2017 at 04:22:20PM +0200, Claudio Jeker wrote:
> This is a start at pushing locking down into the socket layer. Use a
> SRPL list to manage the route PCBs and start running stuff without the
> kernel lock where save. Includes some cleanup since rawcb was removed in
> along the way.
>
> The goal is to make the block running without the kernel lock larger and
> larger.
Better diff. Keeping an extra reference (refcnt_take) in the SRPL_FOREACH
loop since both the current and previous (last) element are used in the loop.
Also release the rw_lock before calling refcnt_finalize() since sleeping
with that lock set could lead to a deadlock.
--
:wq Claudio
Index: net/raw_cb.c
===================================================================
RCS file: /cvs/src/sys/net/raw_cb.c,v
retrieving revision 1.11
diff -u -p -r1.11 raw_cb.c
--- net/raw_cb.c 24 Jan 2017 10:08:30 -0000 1.11
+++ net/raw_cb.c 28 May 2017 07:34:19 -0000
@@ -46,16 +46,10 @@
/*
* Routines to manage the raw protocol control blocks.
- *
- * TODO:
- * hash lookups by protocol family/protocol + address family
- * take care of unique address problems per AF?
- * redo address binding to allow wildcards
*/
u_long raw_sendspace = RAWSNDQ;
u_long raw_recvspace = RAWRCVQ;
-struct rawcbhead rawcb;
/*
* Allocate a control block and a nominal amount
@@ -72,14 +66,13 @@ raw_attach(struct socket *so, int proto)
* after space has been allocated for the
* rawcb.
*/
- if (rp == 0)
+ if (rp == NULL)
return (ENOBUFS);
if ((error = soreserve(so, raw_sendspace, raw_recvspace)) != 0)
return (error);
rp->rcb_socket = so;
rp->rcb_proto.sp_family = so->so_proto->pr_domain->dom_family;
rp->rcb_proto.sp_protocol = proto;
- LIST_INSERT_HEAD(&rawcb, rp, rcb_list);
return (0);
}
@@ -94,7 +87,6 @@ raw_detach(struct rawcb *rp)
so->so_pcb = 0;
sofree(so);
- LIST_REMOVE(rp, rcb_list);
free((caddr_t)(rp), M_PCB, 0);
}
@@ -104,7 +96,6 @@ raw_detach(struct rawcb *rp)
void
raw_disconnect(struct rawcb *rp)
{
-
if (rp->rcb_socket->so_state & SS_NOFDREF)
raw_detach(rp);
}
Index: net/raw_cb.h
===================================================================
RCS file: /cvs/src/sys/net/raw_cb.h,v
retrieving revision 1.11
diff -u -p -r1.11 raw_cb.h
--- net/raw_cb.h 23 Jan 2017 16:31:24 -0000 1.11
+++ net/raw_cb.h 27 May 2017 19:36:17 -0000
@@ -40,7 +40,6 @@
* to tie a socket to the generic raw interface.
*/
struct rawcb {
- LIST_ENTRY(rawcb) rcb_list; /* doubly linked list */
struct socket *rcb_socket; /* back pointer to socket */
struct sockaddr *rcb_faddr; /* destination address */
struct sockaddr *rcb_laddr; /* socket's address */
@@ -54,8 +53,6 @@ struct rawcb {
#define RAWRCVQ 8192
#ifdef _KERNEL
-
-extern LIST_HEAD(rawcbhead, rawcb) rawcb; /* head of list */
#define sotorawcb(so) ((struct rawcb *)(so)->so_pcb)
int raw_attach(struct socket *, int);
Index: net/raw_usrreq.c
===================================================================
RCS file: /cvs/src/sys/net/raw_usrreq.c,v
retrieving revision 1.31
diff -u -p -r1.31 raw_usrreq.c
--- net/raw_usrreq.c 13 Mar 2017 20:18:21 -0000 1.31
+++ net/raw_usrreq.c 28 May 2017 07:32:16 -0000
@@ -45,15 +45,6 @@
#include <net/raw_cb.h>
#include <sys/stdarg.h>
-/*
- * Initialize raw connection block q.
- */
-void
-raw_init(void)
-{
-
- LIST_INIT(&rawcb);
-}
int
raw_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
@@ -71,7 +62,7 @@ raw_usrreq(struct socket *so, int req, s
m_freem(m);
return (EOPNOTSUPP);
}
- if (rp == 0) {
+ if (rp == NULL) {
m_freem(m);
return (EINVAL);
}
@@ -81,10 +72,6 @@ raw_usrreq(struct socket *so, int req, s
* Flush data or not depending on the options.
*/
case PRU_DETACH:
- if (rp == 0) {
- error = ENOTCONN;
- break;
- }
raw_detach(rp);
break;
Index: net/rtsock.c
===================================================================
RCS file: /cvs/src/sys/net/rtsock.c,v
retrieving revision 1.237
diff -u -p -r1.237 rtsock.c
--- net/rtsock.c 19 Apr 2017 15:21:54 -0000 1.237
+++ net/rtsock.c 29 May 2017 08:41:01 -0000
@@ -70,6 +70,7 @@
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/protosw.h>
+#include <sys/srp.h>
#include <net/if.h>
#include <net/if_dl.h>
@@ -98,6 +99,9 @@ struct walkarg {
caddr_t w_where, w_tmem;
};
+void route_prinit(void);
+void route_ref(void *, void *);
+void route_unref(void *, void *);
int route_output(struct mbuf *, struct socket *, struct sockaddr *,
struct mbuf *);
int route_ctloutput(int, struct socket *, int, int, struct mbuf *);
@@ -126,19 +130,21 @@ int sysctl_ifnames(struct walkarg *);
int sysctl_rtable_rtstat(void *, size_t *, void *);
struct routecb {
- struct rawcb rcb;
- struct timeout timeout;
- unsigned int msgfilter;
- unsigned int flags;
- u_int rtableid;
+ struct rawcb rcb;
+ SRPL_ENTRY(routecb) rcb_list;
+ struct refcnt refcnt;
+ struct timeout timeout;
+ unsigned int msgfilter;
+ unsigned int flags;
+ u_int rtableid;
};
#define sotoroutecb(so) ((struct routecb *)(so)->so_pcb)
struct route_cb {
- int ip_count;
- int ip6_count;
- int mpls_count;
- int any_count;
+ SRPL_HEAD(, routecb) rcb;
+ struct srpl_rc rcb_rc;
+ struct rwlock rcb_lk;
+ unsigned int any_count;
};
struct route_cb route_cb;
@@ -154,45 +160,70 @@ struct route_cb route_cb;
#define ROUTE_DESYNC_RESEND_TIMEOUT (hz / 5) /* In hz */
+void
+route_prinit(void)
+{
+ srpl_rc_init(&route_cb.rcb_rc, route_ref, route_unref, NULL);
+ rw_init(&route_cb.rcb_lk, "rtsock");
+ SRPL_INIT(&route_cb.rcb);
+}
+
+void
+route_ref(void *null, void *v)
+{
+ struct routecb *rop = v;
+
+ refcnt_take(&rop->refcnt);
+}
+
+void
+route_unref(void *null, void *v)
+{
+ struct routecb *rop = v;
+
+ refcnt_rele_wake(&rop->refcnt);
+}
+
int
route_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
struct mbuf *control, struct proc *p)
{
- struct rawcb *rp;
struct routecb *rop;
- int af;
int error = 0;
- rp = sotorawcb(so);
+ rop = sotoroutecb(so);
+ if (rop == NULL)
+ return ENOTCONN;
switch (req) {
case PRU_RCVD:
- rop = (struct routecb *)rp;
-
/*
* If we are in a FLUSH state, check if the buffer is
* empty so that we can clear the flag.
*/
if (((rop->flags & ROUTECB_FLAG_FLUSH) != 0) &&
- ((sbspace(&rp->rcb_socket->so_rcv) ==
- rp->rcb_socket->so_rcv.sb_hiwat)))
+ ((sbspace(&rop->rcb.rcb_socket->so_rcv) ==
+ rop->rcb.rcb_socket->so_rcv.sb_hiwat)))
rop->flags &= ~ROUTECB_FLAG_FLUSH;
break;
case PRU_DETACH:
- if (rp) {
- timeout_del(&((struct routecb *)rp)->timeout);
- af = rp->rcb_proto.sp_protocol;
- if (af == AF_INET)
- route_cb.ip_count--;
- else if (af == AF_INET6)
- route_cb.ip6_count--;
-#ifdef MPLS
- else if (af == AF_MPLS)
- route_cb.mpls_count--;
-#endif
- route_cb.any_count--;
- }
+ KERNEL_UNLOCK();
+
+ error = rw_enter(&route_cb.rcb_lk, RW_WRITE | RW_INTR);
+ if (error != 0)
+ break;
+
+ timeout_del(&rop->timeout);
+ route_cb.any_count--;
+ SRPL_REMOVE_LOCKED(&route_cb.rcb_rc, &route_cb.rcb,
+ rop, routecb, rcb_list);
+
+ rw_exit(&route_cb.rcb_lk);
+ /* wait for all references to drop */
+ refcnt_finalize(&rop->refcnt, "rtsockrefs");
+ KERNEL_LOCK();
+
/* FALLTHROUGH */
default:
error = raw_usrreq(so, req, m, nam, control, p);
@@ -206,7 +237,6 @@ route_attach(struct socket *so, int prot
{
struct rawcb *rp;
struct routecb *rop;
- int af;
int error = 0;
/*
@@ -219,6 +249,7 @@ route_attach(struct socket *so, int prot
so->so_pcb = rp;
/* Init the timeout structure */
timeout_set(&rop->timeout, route_senddesync, rp);
+ refcnt_init(&rop->refcnt);
if (curproc == NULL)
error = EACCES;
@@ -228,20 +259,27 @@ route_attach(struct socket *so, int prot
free(rop, M_PCB, sizeof(struct routecb));
return (error);
}
+
+ KERNEL_UNLOCK();
+
rop->rtableid = curproc->p_p->ps_rtableid;
- af = rp->rcb_proto.sp_protocol;
- if (af == AF_INET)
- route_cb.ip_count++;
- else if (af == AF_INET6)
- route_cb.ip6_count++;
-#ifdef MPLS
- else if (af == AF_MPLS)
- route_cb.mpls_count++;
-#endif
rp->rcb_faddr = &route_src;
+
+ error = rw_enter(&route_cb.rcb_lk, RW_WRITE | RW_INTR);
+ if (error != 0) {
+ free(rop, M_PCB, sizeof(struct routecb));
+ return (error);
+ }
+
+ SRPL_INSERT_HEAD_LOCKED(&route_cb.rcb_rc, &route_cb.rcb, rop, rcb_list);
route_cb.any_count++;
- soisconnected(so);
+
+ rw_exit(&route_cb.rcb_lk);
+
+ KERNEL_LOCK();
+
so->so_options |= SO_USELOOPBACK;
+ soisconnected(so);
return (error);
}
@@ -347,6 +385,7 @@ route_input(struct mbuf *m0, struct sock
int sockets = 0;
struct socket *last = NULL;
struct sockaddr *sosrc, *sodst;
+ struct srp_ref sr;
KERNEL_ASSERT_LOCKED();
@@ -359,7 +398,8 @@ route_input(struct mbuf *m0, struct sock
return;
}
- LIST_FOREACH(rp, &rawcb, rcb_list) {
+ SRPL_FOREACH(rop, &sr, &route_cb.rcb, rcb_list) {
+ rp = &rop->rcb;
if (rp->rcb_socket->so_state & SS_CANTRCVMORE)
continue;
if (rp->rcb_proto.sp_family != PF_ROUTE)
@@ -393,7 +433,6 @@ route_input(struct mbuf *m0, struct sock
continue;
/* filter messages that the process does not want */
- rop = (struct routecb *)rp;
rtm = mtod(m, struct rt_msghdr *);
/* but RTM_DESYNC can't be filtered */
if (rtm->rtm_type != RTM_DESYNC && rop->msgfilter != 0 &&
@@ -448,7 +487,10 @@ route_input(struct mbuf *m0, struct sock
sockets++;
}
}
+ refcnt_rele_wake(&sotoroutecb(last)->refcnt);
}
+ /* keep a reference for last */
+ refcnt_take(&rop->refcnt);
last = rp->rcb_socket;
}
if (last) {
@@ -464,8 +506,11 @@ route_input(struct mbuf *m0, struct sock
sorwakeup(last);
sockets++;
}
+ refcnt_rele_wake(&sotoroutecb(last)->refcnt);
} else
m_freem(m);
+
+ SRPL_LEAVE(&sr);
}
struct rt_msghdr *
@@ -1781,7 +1826,7 @@ struct protosw routesw[] = {
.pr_ctloutput = route_ctloutput,
.pr_usrreq = route_usrreq,
.pr_attach = route_attach,
- .pr_init = raw_init,
+ .pr_init = route_prinit,
.pr_sysctl = sysctl_rtable
}
};