On Mon, Jun 04, 2018 at 03:56:02PM +0200, Martin Pieuchot wrote:
> Diff below adds a 'struct socket *' argument to sounlock() in order to
> prepare the stack for per-socket locks.
>
> That means sofree() will now unlock a given socket before freeing it.
> But since we do not want to not release the NET_LOCK() when processing
> incoming TCP packets, in_pcbdetach() needs a special treatment. That's
> also true for unp_drop() as long as Unix sockets will required the
> KERNEL_LOCK().
>
> This is on top of my previous diff to reduce the number of sofree().
>
> Comments? Oks?
I am working on the inpcb layer and also came to the conclusion
that this is necessary. I have added a mutex to struct inpcb that
needs some control from the socket layer. My idea was to add
PRU_LOCK and PRU_UNLOCK. solock() and sounlock() would have to
call them. Adding the socket to the sounlock() parameter is the
right thing.
The passing s to sofree() and unlock there in looks odd, but may
be a necessary intermediate step. Basically I have to solve the
same problem in in_pcbdetach().
My diff is not ready yet, go ahead, OK bluhm@
> diff --git sys/kern/sys_socket.c sys/kern/sys_socket.c
> index 916c33a0c1a..a754a7b2698 100644
> --- sys/kern/sys_socket.c
> +++ sys/kern/sys_socket.c
> @@ -88,7 +88,7 @@ soo_ioctl(struct file *fp, u_long cmd, caddr_t data, struct
> proc *p)
> so->so_state |= SS_NBIO;
> else
> so->so_state &= ~SS_NBIO;
> - sounlock(s);
> + sounlock(so, s);
> break;
>
> case FIOASYNC:
> @@ -102,7 +102,7 @@ soo_ioctl(struct file *fp, u_long cmd, caddr_t data,
> struct proc *p)
> so->so_rcv.sb_flags &= ~SB_ASYNC;
> so->so_snd.sb_flags &= ~SB_ASYNC;
> }
> - sounlock(s);
> + sounlock(so, s);
> break;
>
> case FIONREAD:
> @@ -176,7 +176,7 @@ soo_poll(struct file *fp, int events, struct proc *p)
> so->so_snd.sb_flags |= SB_SEL;
> }
> }
> - sounlock(s);
> + sounlock(so, s);
> return (revents);
> }
>
> @@ -197,7 +197,7 @@ soo_stat(struct file *fp, struct stat *ub, struct proc *p)
> ub->st_gid = so->so_egid;
> (void) ((*so->so_proto->pr_usrreq)(so, PRU_SENSE,
> (struct mbuf *)ub, NULL, NULL, p));
> - sounlock(s);
> + sounlock(so, s);
> return (0);
> }
>
> diff --git sys/kern/uipc_socket.c sys/kern/uipc_socket.c
> index 211966c79c8..aa789d403cc 100644
> --- sys/kern/uipc_socket.c
> +++ sys/kern/uipc_socket.c
> @@ -142,11 +142,11 @@ socreate(int dom, struct socket **aso, int type, int
> proto)
> error = (*prp->pr_attach)(so, proto);
> if (error) {
> so->so_state |= SS_NOFDREF;
> - sofree(so);
> - sounlock(s);
> + /* sofree() calls sounlock(). */
> + sofree(so, s);
> return (error);
> }
> - sounlock(s);
> + sounlock(so, s);
> *aso = so;
> return (0);
> }
> @@ -177,7 +177,7 @@ solisten(struct socket *so, int backlog)
> error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL,
> curproc);
> if (error) {
> - sounlock(s);
> + sounlock(so, s);
> return (error);
> }
> if (TAILQ_FIRST(&so->so_q) == NULL)
> @@ -187,25 +187,29 @@ solisten(struct socket *so, int backlog)
> if (backlog < sominconn)
> backlog = sominconn;
> so->so_qlimit = backlog;
> - sounlock(s);
> + sounlock(so, s);
> return (0);
> }
>
> void
> -sofree(struct socket *so)
> +sofree(struct socket *so, int s)
> {
> soassertlocked(so);
>
> - if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
> + if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
> + sounlock(so, s);
> return;
> + }
> if (so->so_head) {
> /*
> * We must not decommission a socket that's on the accept(2)
> * queue. If we do, then accept(2) may hang after select(2)
> * indicated that the listening socket was ready.
> */
> - if (!soqremque(so, 0))
> + if (!soqremque(so, 0)) {
> + sounlock(so, s);
> return;
> + }
> }
> #ifdef SOCKET_SPLICE
> if (so->so_sp) {
> @@ -218,6 +222,7 @@ sofree(struct socket *so)
> #endif /* SOCKET_SPLICE */
> sbrelease(so, &so->so_snd);
> sorflush(so);
> + sounlock(so, s);
> #ifdef SOCKET_SPLICE
> if (so->so_sp) {
> /* Reuse splice idle, sounsplice() has been called before. */
> @@ -284,8 +289,8 @@ drop:
> discard:
> KASSERT((so->so_state & SS_NOFDREF) == 0);
> so->so_state |= SS_NOFDREF;
> - sofree(so);
> - sounlock(s);
> + /* sofree() calls sounlock(). */
> + sofree(so, s);
> return (error);
> }
>
> @@ -349,7 +354,7 @@ soconnect2(struct socket *so1, struct socket *so2)
> s = solock(so1);
> error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL,
> (struct mbuf *)so2, NULL, curproc);
> - sounlock(s);
> + sounlock(so1, s);
> return (error);
> }
>
> @@ -478,7 +483,7 @@ restart:
> if (flags & MSG_EOR)
> top->m_flags |= M_EOR;
> } else {
> - sounlock(s);
> + sounlock(so, s);
> error = m_getuio(&top, atomic, space, uio);
> s = solock(so);
> if (error)
> @@ -507,7 +512,7 @@ release:
> so->so_state &= ~SS_ISSENDING;
> sbunlock(so, &so->so_snd);
> out:
> - sounlock(s);
> + sounlock(so, s);
> m_freem(top);
> m_freem(control);
> return (error);
> @@ -661,7 +666,7 @@ soreceive(struct socket *so, struct mbuf **paddr, struct
> uio *uio,
> s = solock(so);
> error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
> (struct mbuf *)(long)(flags & MSG_PEEK), NULL, curproc);
> - sounlock(s);
> + sounlock(so, s);
> if (error)
> goto bad;
> do {
> @@ -679,7 +684,7 @@ bad:
> s = solock(so);
> restart:
> if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
> - sounlock(s);
> + sounlock(so, s);
> return (error);
> }
>
> @@ -747,7 +752,7 @@ restart:
> sbunlock(so, &so->so_rcv);
> error = sbwait(so, &so->so_rcv);
> if (error) {
> - sounlock(s);
> + sounlock(so, s);
> return (error);
> }
> goto restart;
> @@ -883,7 +888,7 @@ dontblock:
> SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
> SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
> resid = uio->uio_resid;
> - sounlock(s);
> + sounlock(so, s);
> uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
> s = solock(so);
> if (uio_error)
> @@ -967,7 +972,7 @@ dontblock:
> error = sbwait(so, &so->so_rcv);
> if (error) {
> sbunlock(so, &so->so_rcv);
> - sounlock(s);
> + sounlock(so, s);
> return (0);
> }
> if ((m = so->so_rcv.sb_mb) != NULL)
> @@ -1013,7 +1018,7 @@ dontblock:
> *flagsp |= flags;
> release:
> sbunlock(so, &so->so_rcv);
> - sounlock(s);
> + sounlock(so, s);
> return (error);
> }
>
> @@ -1039,7 +1044,7 @@ soshutdown(struct socket *so, int how)
> error = EINVAL;
> break;
> }
> - sounlock(s);
> + sounlock(so, s);
>
> return (error);
> }
> @@ -1218,7 +1223,7 @@ soidle(void *arg)
> so->so_error = ETIMEDOUT;
> sounsplice(so, so->so_sp->ssp_socket, 1);
> }
> - sounlock(s);
> + sounlock(so, s);
> }
>
> void
> @@ -1236,7 +1241,7 @@ sotask(void *arg)
> */
> somove(so, M_DONTWAIT);
> }
> - sounlock(s);
> + sounlock(so, s);
>
> /* Avoid user land starvation. */
> yield();
> diff --git sys/kern/uipc_socket2.c sys/kern/uipc_socket2.c
> index 8bb11fd97a1..0cb8b6dc98f 100644
> --- sys/kern/uipc_socket2.c
> +++ sys/kern/uipc_socket2.c
> @@ -277,25 +277,38 @@ solock(struct socket *so)
> {
> int s = 0;
>
> - if ((so->so_proto->pr_domain->dom_family != PF_UNIX) &&
> - (so->so_proto->pr_domain->dom_family != PF_ROUTE) &&
> - (so->so_proto->pr_domain->dom_family != PF_KEY))
> + switch (so->so_proto->pr_domain->dom_family) {
> + case PF_INET:
> + case PF_INET6:
> + s = -42;
> NET_LOCK();
> - else {
> + break;
> + case PF_UNIX:
> + case PF_ROUTE:
> + case PF_KEY:
> + default:
> KERNEL_LOCK();
> - s = -42;
> + break;
> }
>
> return (s);
> }
>
> void
> -sounlock(int s)
> +sounlock(struct socket *so, int s)
> {
> - if (s != -42)
> - NET_UNLOCK();
> - else {
> + switch (so->so_proto->pr_domain->dom_family) {
> + case PF_INET:
> + case PF_INET6:
> + if (s == -42)
> + NET_UNLOCK();
> + break;
> + case PF_UNIX:
> + case PF_ROUTE:
> + case PF_KEY:
> + default:
> KERNEL_UNLOCK();
> + break;
> }
> }
>
> diff --git sys/kern/uipc_syscalls.c sys/kern/uipc_syscalls.c
> index 1c23bb59091..a6a6aee173d 100644
> --- sys/kern/uipc_syscalls.c
> +++ sys/kern/uipc_syscalls.c
> @@ -209,7 +209,7 @@ sys_bind(struct proc *p, void *v, register_t *retval)
> #endif
> s = solock(so);
> error = sobind(so, nam, p);
> - sounlock(s);
> + sounlock(so, s);
> m_freem(nam);
> out:
> FRELE(fp, p);
> @@ -351,7 +351,7 @@ out:
> so->so_state |= SS_NBIO;
> else
> so->so_state &= ~SS_NBIO;
> - sounlock(s);
> + sounlock(head, s);
> fdplock(fdp);
> fp->f_data = so;
> fdinsert(fdp, tmpfd, cloexec, fp);
> @@ -359,7 +359,7 @@ out:
> FRELE(fp, p);
> *retval = tmpfd;
> } else {
> - sounlock(s);
> + sounlock(head, s);
> fdplock(fdp);
> fdremove(fdp, tmpfd);
> closef(fp, p);
> @@ -437,7 +437,7 @@ bad:
> if (!interrupted)
> so->so_state &= ~SS_ISCONNECTING;
> out:
> - sounlock(s);
> + sounlock(so, s);
> FRELE(fp, p);
> m_freem(nam);
> if (error == ERESTART)
> @@ -1000,7 +1000,7 @@ sys_setsockopt(struct proc *p, void *v, register_t
> *retval)
> so = fp->f_data;
> s = solock(so);
> error = sosetopt(so, SCARG(uap, level), SCARG(uap, name), m);
> - sounlock(s);
> + sounlock(so, s);
> bad:
> m_freem(m);
> FRELE(fp, p);
> @@ -1039,7 +1039,7 @@ sys_getsockopt(struct proc *p, void *v, register_t
> *retval)
> so = fp->f_data;
> s = solock(so);
> error = sogetopt(so, SCARG(uap, level), SCARG(uap, name), m);
> - sounlock(s);
> + sounlock(so, s);
> if (error == 0 && SCARG(uap, val) && valsize && m != NULL) {
> if (valsize > m->m_len)
> valsize = m->m_len;
> @@ -1083,7 +1083,7 @@ sys_getsockname(struct proc *p, void *v, register_t
> *retval)
> m = m_getclr(M_WAIT, MT_SONAME);
> s = solock(so);
> error = (*so->so_proto->pr_usrreq)(so, PRU_SOCKADDR, 0, m, 0, p);
> - sounlock(s);
> + sounlock(so, s);
> if (error)
> goto bad;
> error = copyaddrout(p, m, SCARG(uap, asa), len, SCARG(uap, alen));
> @@ -1126,7 +1126,7 @@ sys_getpeername(struct proc *p, void *v, register_t
> *retval)
> m = m_getclr(M_WAIT, MT_SONAME);
> s = solock(so);
> error = (*so->so_proto->pr_usrreq)(so, PRU_PEERADDR, 0, m, 0, p);
> - sounlock(s);
> + sounlock(so, s);
> if (error)
> goto bad;
> error = copyaddrout(p, m, SCARG(uap, asa), len, SCARG(uap, alen));
> diff --git sys/kern/uipc_usrreq.c sys/kern/uipc_usrreq.c
> index 5d95208adc6..81a64a8554a 100644
> --- sys/kern/uipc_usrreq.c
> +++ sys/kern/uipc_usrreq.c
> @@ -612,11 +612,19 @@ unp_drop(struct unpcb *unp, int errno)
> {
> struct socket *so = unp->unp_socket;
>
> + KERNEL_ASSERT_LOCKED();
> +
> so->so_error = errno;
> unp_disconnect(unp);
> if (so->so_head) {
> so->so_pcb = NULL;
> - sofree(so);
> + /*
> + * sofree() releases the socket lock, so we need to
> + * grab it beforehand as long as Unix sockets rely on
> + * the KERNEL_LOCK();
> + */
> + KERNEL_LOCK();
> + sofree(so, 0);
> m_freem(unp->unp_addr);
> free(unp, M_PCB, sizeof *unp);
> }
> diff --git sys/miscfs/fifofs/fifo_vnops.c sys/miscfs/fifofs/fifo_vnops.c
> index 03a5677a05d..472bfd408a0 100644
> --- sys/miscfs/fifofs/fifo_vnops.c
> +++ sys/miscfs/fifofs/fifo_vnops.c
> @@ -170,7 +170,7 @@ fifo_open(void *v)
> fip->fi_writers++;
> if ((ap->a_mode & O_NONBLOCK) && fip->fi_readers == 0) {
> error = ENXIO;
> - sounlock(s);
> + sounlock(wso, s);
> goto bad;
> }
> if (fip->fi_writers == 1) {
> @@ -179,7 +179,7 @@ fifo_open(void *v)
> wakeup(&fip->fi_readers);
> }
> }
> - sounlock(s);
> + sounlock(wso, s);
> if ((ap->a_mode & O_NONBLOCK) == 0) {
> if ((ap->a_mode & FREAD) && fip->fi_writers == 0) {
> VOP_UNLOCK(vp);
> @@ -334,7 +334,7 @@ fifo_poll(void *v)
> wso->so_snd.sb_flags |= SB_SEL;
> }
> }
> - sounlock(s);
> + sounlock(rso, s);
> return (revents);
> }
>
> @@ -369,7 +369,7 @@ fifo_close(void *v)
>
> s = solock(wso);
> socantsendmore(wso);
> - sounlock(s);
> + sounlock(wso, s);
> }
> }
> if (ap->a_fflag & FWRITE) {
> @@ -380,7 +380,7 @@ fifo_close(void *v)
> /* SS_ISDISCONNECTED will result in POLLHUP */
> rso->so_state |= SS_ISDISCONNECTED;
> socantrcvmore(rso);
> - sounlock(s);
> + sounlock(rso, s);
> }
> }
> if (fip->fi_readers == 0 && fip->fi_writers == 0) {
> diff --git sys/net/bfd.c sys/net/bfd.c
> index e3e557e0d37..8bcfb305c99 100644
> --- sys/net/bfd.c
> +++ sys/net/bfd.c
> @@ -611,7 +611,7 @@ bfd_sender(struct bfd_config *bfd, unsigned int port)
>
> s = solock(so);
> error = soconnect(so, m);
> - sounlock(s);
> + sounlock(so, s);
> if (error && error != ECONNREFUSED) {
> printf("%s: soconnect error %d\n",
> __func__, error);
> diff --git sys/net/if_pflow.c sys/net/if_pflow.c
> index 9ba382d5069..1b1c2a4e33e 100644
> --- sys/net/if_pflow.c
> +++ sys/net/if_pflow.c
> @@ -442,7 +442,7 @@ pflow_set(struct pflow_softc *sc, struct pflowreq *pflowr)
>
> s = solock(so);
> error = sobind(so, m, p);
> - sounlock(s);
> + sounlock(so, s);
> m_freem(m);
> if (error) {
> soclose(so);
> diff --git sys/netinet/in_pcb.c sys/netinet/in_pcb.c
> index 413f9de9df5..62bb9af000c 100644
> --- sys/netinet/in_pcb.c
> +++ sys/netinet/in_pcb.c
> @@ -584,8 +584,13 @@ in_pcbdetach(struct inpcb *inp)
>
> NET_ASSERT_LOCKED();
>
> - so->so_pcb = 0;
> - sofree(so);
> + so->so_pcb = NULL;
> + /*
> + * As long as the NET_LOCK() is the default lock for Internet
> + * sockets, do not release it to not introduce new sleeping
> + * points.
> + */
> + sofree(so, 0);
> m_freem(inp->inp_options);
> if (inp->inp_route.ro_rt) {
> rtfree(inp->inp_route.ro_rt);
> diff --git sys/nfs/krpc_subr.c sys/nfs/krpc_subr.c
> index 346ff9ec989..e487867db1b 100644
> --- sys/nfs/krpc_subr.c
> +++ sys/nfs/krpc_subr.c
> @@ -241,7 +241,7 @@ krpc_call(struct sockaddr_in *sa, u_int prog, u_int vers,
> u_int func,
> m->m_len = sizeof(tv);
> s = solock(so);
> error = sosetopt(so, SOL_SOCKET, SO_RCVTIMEO, m);
> - sounlock(s);
> + sounlock(so, s);
> m_freem(m);
> if (error)
> goto out;
> @@ -257,7 +257,7 @@ krpc_call(struct sockaddr_in *sa, u_int prog, u_int vers,
> u_int func,
> *on = 1;
> s = solock(so);
> error = sosetopt(so, SOL_SOCKET, SO_BROADCAST, m);
> - sounlock(s);
> + sounlock(so, s);
> m_freem(m);
> if (error)
> goto out;
> @@ -274,7 +274,7 @@ krpc_call(struct sockaddr_in *sa, u_int prog, u_int vers,
> u_int func,
> *ip = IP_PORTRANGE_LOW;
> s = solock(so);
> error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
> - sounlock(s);
> + sounlock(so, s);
> m_freem(mopt);
> if (error)
> goto out;
> @@ -288,7 +288,7 @@ krpc_call(struct sockaddr_in *sa, u_int prog, u_int vers,
> u_int func,
> sin->sin_port = htons(0);
> s = solock(so);
> error = sobind(so, m, &proc0);
> - sounlock(s);
> + sounlock(so, s);
> m_freem(m);
> if (error) {
> printf("bind failed\n");
> @@ -301,7 +301,7 @@ krpc_call(struct sockaddr_in *sa, u_int prog, u_int vers,
> u_int func,
> *ip = IP_PORTRANGE_DEFAULT;
> s = solock(so);
> error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
> - sounlock(s);
> + sounlock(so, s);
> m_freem(mopt);
> if (error)
> goto out;
> diff --git sys/nfs/nfs_socket.c sys/nfs/nfs_socket.c
> index 383db1cb930..0119f135ef2 100644
> --- sys/nfs/nfs_socket.c
> +++ sys/nfs/nfs_socket.c
> @@ -365,7 +365,7 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
> goto bad;
> so->so_rcv.sb_flags |= SB_NOINTR;
> so->so_snd.sb_flags |= SB_NOINTR;
> - sounlock(s);
> + sounlock(so, s);
>
> m_freem(mopt);
> m_freem(nam);
> @@ -378,7 +378,7 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
> return (0);
>
> bad:
> - sounlock(s);
> + sounlock(so, s);
>
> m_freem(mopt);
> m_freem(nam);
> diff --git sys/nfs/nfs_syscalls.c sys/nfs/nfs_syscalls.c
> index 527a61a37de..6b501cc419f 100644
> --- sys/nfs/nfs_syscalls.c
> +++ sys/nfs/nfs_syscalls.c
> @@ -250,7 +250,7 @@ nfssvc_addsock(struct file *fp, struct mbuf *mynam)
> s = solock(so);
> error = soreserve(so, siz, siz);
> if (error) {
> - sounlock(s);
> + sounlock(so, s);
> m_freem(mynam);
> return (error);
> }
> @@ -279,7 +279,7 @@ nfssvc_addsock(struct file *fp, struct mbuf *mynam)
> so->so_rcv.sb_timeo = 0;
> so->so_snd.sb_flags &= ~SB_NOINTR;
> so->so_snd.sb_timeo = 0;
> - sounlock(s);
> + sounlock(so, s);
> if (tslp)
> slp = tslp;
> else {
> diff --git sys/sys/socketvar.h sys/sys/socketvar.h
> index 097ae3a4ab9..ce8bbdf9f2d 100644
> --- sys/sys/socketvar.h
> +++ sys/sys/socketvar.h
> @@ -311,7 +311,7 @@ int soconnect(struct socket *so, struct mbuf *nam);
> int soconnect2(struct socket *so1, struct socket *so2);
> int socreate(int dom, struct socket **aso, int type, int proto);
> int sodisconnect(struct socket *so);
> -void sofree(struct socket *so);
> +void sofree(struct socket *so, int);
> int sogetopt(struct socket *so, int level, int optname, struct mbuf *m);
> void sohasoutofband(struct socket *so);
> void soisconnected(struct socket *so);
> @@ -338,7 +338,7 @@ int sockargs(struct mbuf **, const void *, size_t,
> int);
>
> int sosleep(struct socket *, void *, int, const char *, int);
> int solock(struct socket *);
> -void sounlock(int);
> +void sounlock(struct socket *, int);
>
> int sendit(struct proc *, int, struct msghdr *, int, register_t *);
> int recvit(struct proc *, int, struct msghdr *, caddr_t,