This is a start at pushing locking down into the socket layer.
Use an SRPL list to manage the route PCBs and start running code without
the kernel lock where it is safe. Includes some cleanup, since the global
rawcb list was removed along the way.
The goal is to make the sections that run without the kernel lock
progressively larger.
--
:wq Claudio
Index: net/raw_cb.c
===================================================================
RCS file: /cvs/src/sys/net/raw_cb.c,v
retrieving revision 1.11
diff -u -p -r1.11 raw_cb.c
--- net/raw_cb.c 24 Jan 2017 10:08:30 -0000 1.11
+++ net/raw_cb.c 28 May 2017 07:34:19 -0000
@@ -46,16 +46,10 @@
/*
* Routines to manage the raw protocol control blocks.
- *
- * TODO:
- * hash lookups by protocol family/protocol + address family
- * take care of unique address problems per AF?
- * redo address binding to allow wildcards
*/
u_long raw_sendspace = RAWSNDQ;
u_long raw_recvspace = RAWRCVQ;
-struct rawcbhead rawcb;
/*
* Allocate a control block and a nominal amount
@@ -72,14 +66,13 @@ raw_attach(struct socket *so, int proto)
* after space has been allocated for the
* rawcb.
*/
- if (rp == 0)
+ if (rp == NULL)
return (ENOBUFS);
if ((error = soreserve(so, raw_sendspace, raw_recvspace)) != 0)
return (error);
rp->rcb_socket = so;
rp->rcb_proto.sp_family = so->so_proto->pr_domain->dom_family;
rp->rcb_proto.sp_protocol = proto;
- LIST_INSERT_HEAD(&rawcb, rp, rcb_list);
return (0);
}
@@ -94,7 +87,6 @@ raw_detach(struct rawcb *rp)
so->so_pcb = 0;
sofree(so);
- LIST_REMOVE(rp, rcb_list);
free((caddr_t)(rp), M_PCB, 0);
}
@@ -104,7 +96,6 @@ raw_detach(struct rawcb *rp)
void
raw_disconnect(struct rawcb *rp)
{
-
if (rp->rcb_socket->so_state & SS_NOFDREF)
raw_detach(rp);
}
Index: net/raw_cb.h
===================================================================
RCS file: /cvs/src/sys/net/raw_cb.h,v
retrieving revision 1.11
diff -u -p -r1.11 raw_cb.h
--- net/raw_cb.h 23 Jan 2017 16:31:24 -0000 1.11
+++ net/raw_cb.h 27 May 2017 19:36:17 -0000
@@ -40,7 +40,6 @@
* to tie a socket to the generic raw interface.
*/
struct rawcb {
- LIST_ENTRY(rawcb) rcb_list; /* doubly linked list */
struct socket *rcb_socket; /* back pointer to socket */
struct sockaddr *rcb_faddr; /* destination address */
struct sockaddr *rcb_laddr; /* socket's address */
@@ -54,8 +53,6 @@ struct rawcb {
#define RAWRCVQ 8192
#ifdef _KERNEL
-
-extern LIST_HEAD(rawcbhead, rawcb) rawcb; /* head of list */
#define sotorawcb(so) ((struct rawcb *)(so)->so_pcb)
int raw_attach(struct socket *, int);
Index: net/raw_usrreq.c
===================================================================
RCS file: /cvs/src/sys/net/raw_usrreq.c,v
retrieving revision 1.31
diff -u -p -r1.31 raw_usrreq.c
--- net/raw_usrreq.c 13 Mar 2017 20:18:21 -0000 1.31
+++ net/raw_usrreq.c 28 May 2017 07:32:16 -0000
@@ -45,15 +45,6 @@
#include <net/raw_cb.h>
#include <sys/stdarg.h>
-/*
- * Initialize raw connection block q.
- */
-void
-raw_init(void)
-{
-
- LIST_INIT(&rawcb);
-}
int
raw_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
@@ -71,7 +62,7 @@ raw_usrreq(struct socket *so, int req, s
m_freem(m);
return (EOPNOTSUPP);
}
- if (rp == 0) {
+ if (rp == NULL) {
m_freem(m);
return (EINVAL);
}
@@ -81,10 +72,6 @@ raw_usrreq(struct socket *so, int req, s
* Flush data or not depending on the options.
*/
case PRU_DETACH:
- if (rp == 0) {
- error = ENOTCONN;
- break;
- }
raw_detach(rp);
break;
Index: net/rtsock.c
===================================================================
RCS file: /cvs/src/sys/net/rtsock.c,v
retrieving revision 1.237
diff -u -p -r1.237 rtsock.c
--- net/rtsock.c 19 Apr 2017 15:21:54 -0000 1.237
+++ net/rtsock.c 28 May 2017 13:58:14 -0000
@@ -70,6 +70,7 @@
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/protosw.h>
+#include <sys/srp.h>
#include <net/if.h>
#include <net/if_dl.h>
@@ -98,6 +99,9 @@ struct walkarg {
caddr_t w_where, w_tmem;
};
+void route_prinit(void);
+void route_ref(void *, void *);
+void route_unref(void *, void *);
int route_output(struct mbuf *, struct socket *, struct sockaddr *,
struct mbuf *);
int route_ctloutput(int, struct socket *, int, int, struct mbuf *);
@@ -126,19 +130,21 @@ int sysctl_ifnames(struct walkarg *);
int sysctl_rtable_rtstat(void *, size_t *, void *);
struct routecb {
- struct rawcb rcb;
- struct timeout timeout;
- unsigned int msgfilter;
- unsigned int flags;
- u_int rtableid;
+ struct rawcb rcb;
+ SRPL_ENTRY(routecb) rcb_list;
+ struct refcnt refcnt;
+ struct timeout timeout;
+ unsigned int msgfilter;
+ unsigned int flags;
+ u_int rtableid;
};
#define sotoroutecb(so) ((struct routecb *)(so)->so_pcb)
struct route_cb {
- int ip_count;
- int ip6_count;
- int mpls_count;
- int any_count;
+ SRPL_HEAD(, routecb) rcb;
+ struct srpl_rc rcb_rc;
+ struct rwlock rcb_lk;
+ unsigned int any_count;
};
struct route_cb route_cb;
@@ -154,46 +160,70 @@ struct route_cb route_cb;
#define ROUTE_DESYNC_RESEND_TIMEOUT (hz / 5) /* In hz */
+void
+route_prinit(void)
+{
+ srpl_rc_init(&route_cb.rcb_rc, route_ref, route_unref, NULL);
+ rw_init(&route_cb.rcb_lk, "rtsock");
+ SRPL_INIT(&route_cb.rcb);
+}
+
+void
+route_ref(void *null, void *v)
+{
+ struct routecb *rop = v;
+
+ refcnt_take(&rop->refcnt);
+}
+
+void
+route_unref(void *null, void *v)
+{
+ struct routecb *rop = v;
+
+ refcnt_rele_wake(&rop->refcnt);
+}
+
int
route_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
struct mbuf *control, struct proc *p)
{
- struct rawcb *rp;
struct routecb *rop;
- int af;
int error = 0;
- rp = sotorawcb(so);
+ rop = sotoroutecb(so);
+ if (rop == NULL)
+ return ENOTCONN;
switch (req) {
case PRU_RCVD:
- rop = (struct routecb *)rp;
-
/*
* If we are in a FLUSH state, check if the buffer is
* empty so that we can clear the flag.
*/
if (((rop->flags & ROUTECB_FLAG_FLUSH) != 0) &&
- ((sbspace(&rp->rcb_socket->so_rcv) ==
- rp->rcb_socket->so_rcv.sb_hiwat)))
+ ((sbspace(&rop->rcb.rcb_socket->so_rcv) ==
+ rop->rcb.rcb_socket->so_rcv.sb_hiwat)))
rop->flags &= ~ROUTECB_FLAG_FLUSH;
break;
case PRU_DETACH:
- if (rp) {
- timeout_del(&((struct routecb *)rp)->timeout);
- af = rp->rcb_proto.sp_protocol;
- if (af == AF_INET)
- route_cb.ip_count--;
- else if (af == AF_INET6)
- route_cb.ip6_count--;
-#ifdef MPLS
- else if (af == AF_MPLS)
- route_cb.mpls_count--;
-#endif
- route_cb.any_count--;
- }
+ KERNEL_UNLOCK();
+
+ error = rw_enter(&route_cb.rcb_lk, RW_WRITE | RW_INTR);
+ if (error != 0)
+ break;
+
+ timeout_del(&rop->timeout);
+ route_cb.any_count--;
+ SRPL_REMOVE_LOCKED(&route_cb.rcb_rc, &route_cb.rcb,
+ rop, routecb, rcb_list);
+ refcnt_finalize(&rop->refcnt, "rtsockrefs");
+
+ rw_exit(&route_cb.rcb_lk);
/* FALLTHROUGH */
+
+ KERNEL_LOCK();
default:
error = raw_usrreq(so, req, m, nam, control, p);
}
@@ -206,7 +236,6 @@ route_attach(struct socket *so, int prot
{
struct rawcb *rp;
struct routecb *rop;
- int af;
int error = 0;
/*
@@ -219,6 +248,7 @@ route_attach(struct socket *so, int prot
so->so_pcb = rp;
/* Init the timeout structure */
timeout_set(&rop->timeout, route_senddesync, rp);
+ refcnt_init(&rop->refcnt);
if (curproc == NULL)
error = EACCES;
@@ -228,20 +258,27 @@ route_attach(struct socket *so, int prot
free(rop, M_PCB, sizeof(struct routecb));
return (error);
}
+
+ KERNEL_UNLOCK();
+
rop->rtableid = curproc->p_p->ps_rtableid;
- af = rp->rcb_proto.sp_protocol;
- if (af == AF_INET)
- route_cb.ip_count++;
- else if (af == AF_INET6)
- route_cb.ip6_count++;
-#ifdef MPLS
- else if (af == AF_MPLS)
- route_cb.mpls_count++;
-#endif
rp->rcb_faddr = &route_src;
+
+ error = rw_enter(&route_cb.rcb_lk, RW_WRITE | RW_INTR);
+ if (error != 0) {
+ free(rop, M_PCB, sizeof(struct routecb));
+ return (error);
+ }
+
+ SRPL_INSERT_HEAD_LOCKED(&route_cb.rcb_rc, &route_cb.rcb, rop, rcb_list);
route_cb.any_count++;
- soisconnected(so);
+
+ rw_exit(&route_cb.rcb_lk);
+
+ KERNEL_LOCK();
+
so->so_options |= SO_USELOOPBACK;
+ soisconnected(so);
return (error);
}
@@ -347,6 +384,7 @@ route_input(struct mbuf *m0, struct sock
int sockets = 0;
struct socket *last = NULL;
struct sockaddr *sosrc, *sodst;
+ struct srp_ref sr;
KERNEL_ASSERT_LOCKED();
@@ -359,7 +397,8 @@ route_input(struct mbuf *m0, struct sock
return;
}
- LIST_FOREACH(rp, &rawcb, rcb_list) {
+ SRPL_FOREACH(rop, &sr, &route_cb.rcb, rcb_list) {
+ rp = &rop->rcb;
if (rp->rcb_socket->so_state & SS_CANTRCVMORE)
continue;
if (rp->rcb_proto.sp_family != PF_ROUTE)
@@ -393,7 +432,6 @@ route_input(struct mbuf *m0, struct sock
continue;
/* filter messages that the process does not want */
- rop = (struct routecb *)rp;
rtm = mtod(m, struct rt_msghdr *);
/* but RTM_DESYNC can't be filtered */
if (rtm->rtm_type != RTM_DESYNC && rop->msgfilter != 0 &&
@@ -466,6 +504,8 @@ route_input(struct mbuf *m0, struct sock
}
} else
m_freem(m);
+
+ SRPL_LEAVE(&sr);
}
struct rt_msghdr *
@@ -1781,7 +1821,7 @@ struct protosw routesw[] = {
.pr_ctloutput = route_ctloutput,
.pr_usrreq = route_usrreq,
.pr_attach = route_attach,
- .pr_init = raw_init,
+ .pr_init = route_prinit,
.pr_sysctl = sysctl_rtable
}
};