On 20/01/17(Fri) 12:04, Martin Pieuchot wrote: > Diff below enables the NET_LOCK(), again. > > What's new? > > - The lock ordering problem with fdplock() should now be fixed. It is > also documented, fdplock() should be taken first if both locks are > needed. > > - Page faults involving NFS, or any other code path already holding the > NET_LOCK(), have been fixed. The recursion is similar to the existing > vnode problem, so I simply added a hack there. > > - I added some NET_ASSERT_UNLOCKED() to help finding other possible > deadlocks.
Thanks to all the testers! Updated diff with: - A hack to mark and prevent pflow(4)/pf(4) recursion reported by many - A fix to prevent the deadlock in accept(2) - The introduction of inifioctl to prevent false positive when checking that no unwanted tsleep(9) are done while holding the NET_LOCK(). diff --git sys/kern/kern_rwlock.c sys/kern/kern_rwlock.c index bdaee0c1ae7..9182b42706a 100644 --- sys/kern/kern_rwlock.c +++ sys/kern/kern_rwlock.c @@ -195,6 +195,9 @@ retry: unsigned long set = o | op->wait_set; int do_sleep; + if (panicstr) + return (0); + rw_enter_diag(rwl, flags); if (flags & RW_NOSLEEP) @@ -205,7 +208,7 @@ retry: sleep_setup_signal(&sls, op->wait_prio | PCATCH); do_sleep = !rw_cas(&rwl->rwl_owner, o, set); - + NET_ASSERT_UNLOCKED(); sleep_finish(&sls, do_sleep); if ((flags & RW_INTR) && (error = sleep_finish_signal(&sls)) != 0) diff --git sys/kern/kern_synch.c sys/kern/kern_synch.c index beeefe06458..68ad163f839 100644 --- sys/kern/kern_synch.c +++ sys/kern/kern_synch.c @@ -111,6 +111,11 @@ tsleep(const volatile void *ident, int priority, const char *wmesg, int timo) int hold_count; #endif +#if 1 + extern int inifioctl; + if (!inifioctl) + NET_ASSERT_UNLOCKED(); +#endif KASSERT((priority & ~(PRIMASK | PCATCH)) == 0); #ifdef MULTIPROCESSOR @@ -170,6 +175,11 @@ msleep(const volatile void *ident, struct mutex *mtx, int priority, int hold_count; #endif +#if 1 + extern int inifioctl; + if (!inifioctl) + NET_ASSERT_UNLOCKED(); +#endif KASSERT((priority & ~(PRIMASK | PCATCH | PNORELOCK)) == 0); KASSERT(mtx != NULL); diff --git sys/kern/sys_socket.c sys/kern/sys_socket.c index 7c8d8f728d1..a3b7db9ab7c 100644 --- sys/kern/sys_socket.c +++ sys/kern/sys_socket.c @@ -121,7 +121,14 @@ soo_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p) */ if (IOCGROUP(cmd) == 'i') { NET_LOCK(s); +#if 1 + extern int inifioctl; + inifioctl = 1; +#endif error = ifioctl(so, cmd, data, p); +#if 1 + inifioctl = 0; +#endif NET_UNLOCK(s); return (error); } diff --git sys/kern/uipc_socket.c sys/kern/uipc_socket.c index 245210e594a..edc419eb937 100644 --- sys/kern/uipc_socket.c +++ sys/kern/uipc_socket.c @@ -256,7 +256,7 @@ soclose(struct socket *so) (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { - error = tsleep(&so->so_timeo, + error = rwsleep(&so->so_timeo, &netlock, PSOCK | PCATCH, "netcls", so->so_linger * hz); if (error) @@ -615,7 +615,7 @@ sbsync(struct sockbuf *sb, struct mbuf *nextrecord) * followed by an optional mbuf or mbufs containing ancillary data, * and then zero or more mbufs of data. * In order to avoid blocking network for the entire time here, we splx() - * and release NET_LOCK() while doing the actual copy to user space. + * and release ``netlock'' while doing the actual copy to user space. * Although the sockbuf is locked, new data may still be appended, * and thus we must maintain consistency of the sockbuf during that time. * @@ -800,9 +800,13 @@ dontblock: if (controlp) { if (pr->pr_domain->dom_externalize && mtod(cm, struct cmsghdr *)->cmsg_type == - SCM_RIGHTS) - error = (*pr->pr_domain->dom_externalize)(cm, - controllen, flags); + SCM_RIGHTS) { + NET_UNLOCK(s); + error = + (*pr->pr_domain->dom_externalize) + (cm, controllen, flags); + NET_LOCK(s); + } *controlp = cm; } else { /* @@ -1039,7 +1043,7 @@ sorflush(struct socket *so) struct sockbuf asb; sb->sb_flags |= SB_NOINTR; - (void) sblock(sb, M_WAITOK, NULL); + (void) sblock(sb, M_WAITOK, &netlock); socantrcvmore(so); sbunlock(sb); asb = *sb; @@ -1528,7 +1532,10 @@ sorwakeup(struct socket *so) #endif sowakeup(so, &so->so_rcv); if (so->so_upcall) { + /* XXXSMP breaks atomicity */ + rw_exit_write(&netlock); (*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT); + rw_enter_write(&netlock); } } diff --git sys/kern/uipc_socket2.c sys/kern/uipc_socket2.c index 1ac5f515bc5..6dd5abbd473 100644 --- sys/kern/uipc_socket2.c +++ sys/kern/uipc_socket2.c @@ -276,7 +276,7 @@ sbwait(struct sockbuf *sb) NET_ASSERT_LOCKED(); sb->sb_flagsintr |= SB_WAIT; - return (tsleep(&sb->sb_cc, + return (rwsleep(&sb->sb_cc, &netlock, (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "netio", sb->sb_timeo)); } diff --git sys/kern/uipc_syscalls.c sys/kern/uipc_syscalls.c index ed24be8ab85..1a9460f78a4 100644 --- sys/kern/uipc_syscalls.c +++ sys/kern/uipc_syscalls.c @@ -265,7 +265,7 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, socklen_t *anamelen, { struct filedesc *fdp = p->p_fd; struct file *fp, *headfp; - struct mbuf *nam; + struct mbuf *nam = NULL; socklen_t namelen; int error, s, tmpfd; struct socket *head, *so; @@ -277,55 +277,56 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, socklen_t *anamelen, return (error); headfp = fp; + head = headfp->f_data; + + fdplock(fdp); + error = falloc(p, &fp, &tmpfd); + if (error == 0 && (flags & SOCK_CLOEXEC)) + fdp->fd_ofileflags[tmpfd] |= UF_EXCLOSE; + fdpunlock(fdp); + if (error != 0) { + /* + * Probably ran out of file descriptors. Wakeup + * so some other process might have a chance at it. + */ + wakeup_one(&head->so_timeo); + FRELE(headfp, p); + return (error); + } + redo: NET_LOCK(s); - head = headfp->f_data; if (isdnssocket(head) || (head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; - goto bad; + goto out; } if ((head->so_state & SS_NBIO) && head->so_qlen == 0) { if (head->so_state & SS_CANTRCVMORE) error = ECONNABORTED; else error = EWOULDBLOCK; - goto bad; + goto out; } while (head->so_qlen == 0 && head->so_error == 0) { if (head->so_state & SS_CANTRCVMORE) { head->so_error = ECONNABORTED; break; } - error = tsleep(&head->so_timeo, PSOCK | PCATCH, + error = rwsleep(&head->so_timeo, &netlock, PSOCK | PCATCH, "netcon", 0); - if (error) { - goto bad; - } + if (error) + goto out; } if (head->so_error) { error = head->so_error; head->so_error = 0; - goto bad; + goto out; } /* Figure out whether the new socket should be non-blocking. */ nflag = flags & SOCK_NONBLOCK_INHERIT ? (headfp->f_flag & FNONBLOCK) : (flags & SOCK_NONBLOCK ? FNONBLOCK : 0); - fdplock(fdp); - error = falloc(p, &fp, &tmpfd); - if (error == 0 && (flags & SOCK_CLOEXEC)) - fdp->fd_ofileflags[tmpfd] |= UF_EXCLOSE; - fdpunlock(fdp); - if (error != 0) { - /* - * Probably ran out of file descriptors. Wakeup - * so some other process might have a chance at it. - */ - wakeup_one(&head->so_timeo); - goto bad; - } - nam = m_get(M_WAIT, MT_SONAME); /* @@ -360,24 +361,20 @@ redo: error = soaccept(so, nam); if (!error && name != NULL) error = copyaddrout(p, nam, name, namelen, anamelen); - - if (error) { - /* if an error occurred, free the file descriptor */ - NET_UNLOCK(s); - m_freem(nam); + if (!error) { + (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&nflag, p); + FILE_SET_MATURE(fp, p); + *retval = tmpfd; + } +out: + NET_UNLOCK(s); + m_freem(nam); + if (error != 0) { fdplock(fdp); fdremove(fdp, tmpfd); closef(fp, p); fdpunlock(fdp); - goto out; } - (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&nflag, p); - FILE_SET_MATURE(fp, p); - *retval = tmpfd; - m_freem(nam); -bad: - NET_UNLOCK(s); -out: FRELE(headfp, p); return (error); } @@ -436,7 +433,7 @@ sys_connect(struct proc *p, void *v, register_t *retval) } NET_LOCK(s); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { - error = tsleep(&so->so_timeo, PSOCK | PCATCH, + error = rwsleep(&so->so_timeo, &netlock, PSOCK | PCATCH, "netcon2", 0); if (error) { if (error == EINTR || error == ERESTART) diff --git sys/kern/uipc_usrreq.c sys/kern/uipc_usrreq.c index 94204dc527b..556d1d08069 100644 --- sys/kern/uipc_usrreq.c +++ sys/kern/uipc_usrreq.c @@ -131,7 +131,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam, break; case PRU_BIND: + /* XXXSMP breaks atomicity */ + rw_exit_write(&netlock); error = unp_bind(unp, nam, p); + rw_enter_write(&netlock); break; case PRU_LISTEN: @@ -140,7 +143,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam, break; case PRU_CONNECT: + /* XXXSMP breaks atomicity */ + rw_exit_write(&netlock); error = unp_connect(so, nam, p); + rw_enter_write(&netlock); break; case PRU_CONNECT2: @@ -208,7 +214,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam, error = EISCONN; break; } + /* XXXSMP breaks atomicity */ + rw_exit_write(&netlock); error = unp_connect(so, nam, p); + rw_enter_write(&netlock); if (error) break; } else { @@ -485,6 +494,8 @@ unp_connect(struct socket *so, struct mbuf *nam, struct proc *p) struct nameidata nd; int error, s; + NET_ASSERT_UNLOCKED(); + if (soun->sun_family != AF_UNIX) return (EAFNOSUPPORT); diff --git sys/net/if.c sys/net/if.c index 20e9dcbc9de..ffa1d99347f 100644 --- sys/net/if.c +++ sys/net/if.c @@ -230,6 +230,25 @@ struct taskq *softnettq; struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL); /* + * Serialize socket operations to ensure no new sleeping points + * are introduced in IP output paths. + * + * If a code path needs both the NET_LOCK() and fdplock(), they + * have to be taken in the following order: + * + * fdplock() + * NET_LOCK() + * ... + * NET_UNLOCK() + * fdpunlock() + */ +struct rwlock netlock = RWLOCK_INITIALIZER("netlock"); +#if 1 +/* tsleep() is ok if inside ifioctl(). */ +int inifioctl; +#endif + +/* * Network interface utility routines. */ void @@ -1076,7 +1095,10 @@ if_clone_create(const char *name, int rdomain) if (ifunit(name) != NULL) return (EEXIST); + /* XXXSMP breaks atomicity */ + rw_exit_write(&netlock); ret = (*ifc->ifc_create)(ifc, unit); + rw_enter_write(&netlock); if (ret != 0 || (ifp = ifunit(name)) == NULL) return (ret); @@ -1118,7 +1140,10 @@ if_clone_destroy(const char *name) splx(s); } + /* XXXSMP breaks atomicity */ + rw_exit_write(&netlock); ret = (*ifc->ifc_destroy)(ifp); + rw_enter_write(&netlock); return (ret); } diff --git sys/net/if_pflow.c sys/net/if_pflow.c index 50a2a2cd1da..f929b86c5a9 100644 --- sys/net/if_pflow.c +++ sys/net/if_pflow.c @@ -504,6 +504,8 @@ pflowioctl(struct ifnet *ifp, u_long cmd, caddr_t data) sizeof(pflowr)))) return (error); + /* XXXSMP breaks atomicity */ + rw_exit_write(&netlock); s = splnet(); error = pflow_set(sc, &pflowr); splx(s); @@ -521,6 +523,7 @@ pflowioctl(struct ifnet *ifp, u_long cmd, caddr_t data) } else ifp->if_flags &= ~IFF_RUNNING; + rw_enter_write(&netlock); break; default: diff --git sys/net/pf.c sys/net/pf.c index 98578708bab..2ae434e405d 100644 --- sys/net/pf.c +++ sys/net/pf.c @@ -1305,8 +1305,12 @@ pf_remove_state(struct pf_state *cur) } RB_REMOVE(pf_state_tree_id, &tree_id, cur); #if NPFLOW > 0 - if (cur->state_flags & PFSTATE_PFLOW) + if (cur->state_flags & PFSTATE_PFLOW) { + /* XXXSMP breaks atomicity */ + rw_exit_write(&netlock); export_pflow(cur); + rw_enter_write(&netlock); + } #endif /* NPFLOW > 0 */ #if NPFSYNC > 0 pfsync_delete_state(cur); diff --git sys/sys/filedesc.h sys/sys/filedesc.h index 148c8bc0182..730e5794eb1 100644 --- sys/sys/filedesc.h +++ sys/sys/filedesc.h @@ -138,7 +138,7 @@ struct file *fd_getfile_mode(struct filedesc *, int, int); int closef(struct file *, struct proc *); int getsock(struct proc *, int, struct file **); -#define fdplock(fdp) rw_enter_write(&(fdp)->fd_lock) +#define fdplock(fdp) do { NET_ASSERT_UNLOCKED(); rw_enter_write(&(fdp)->fd_lock); } while (0) #define fdpunlock(fdp) rw_exit_write(&(fdp)->fd_lock) #define fdpassertlocked(fdp) rw_assert_wrlock(&(fdp)->fd_lock) #endif diff --git sys/sys/systm.h sys/sys/systm.h index f9d34f34a17..834de358be5 100644 --- sys/sys/systm.h +++ sys/sys/systm.h @@ -291,21 +291,35 @@ int uiomove(void *, size_t, struct uio *); #if defined(_KERNEL) +#include <sys/rwlock.h> + +extern struct rwlock netlock; + #define NET_LOCK(s) \ do { \ + rw_enter_write(&netlock); \ s = splsoftnet(); \ } while (0) -#define NET_UNLOCK(s) \ +#define NET_UNLOCK(s) \ do { \ splx(s); \ + rw_exit_write(&netlock); \ } while (0) #define NET_ASSERT_LOCKED() \ do { \ + if (rw_status(&netlock) != RW_WRITE) \ + splassert_fail(RW_WRITE, rw_status(&netlock), __func__);\ splsoftassert(IPL_SOFTNET); \ } while (0) +#define NET_ASSERT_UNLOCKED() \ +do { \ + if (rw_status(&netlock) == RW_WRITE) \ + splassert_fail(0, rw_status(&netlock), __func__); \ +} while (0) + __returns_twice int setjmp(label_t *); __dead void longjmp(label_t *); #endif diff --git sys/uvm/uvm_vnode.c sys/uvm/uvm_vnode.c index 2abfac2f14d..b7a7f746806 100644 --- sys/uvm/uvm_vnode.c +++ sys/uvm/uvm_vnode.c @@ -1176,6 +1176,15 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int npages, int flags, int rw) result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL, curproc); if (result == 0) { + int netlocked = (rw_status(&netlock) == RW_WRITE); + + /* + * This process may already have the NET_LOCK(), if we + * faulted in copyin() or copyout() in the network stack. + */ + if (netlocked) + rw_exit_write(&netlock); + /* NOTE: vnode now locked! */ if (rw == UIO_READ) result = VOP_READ(vn, &uio, 0, curproc->p_ucred); @@ -1184,6 +1193,9 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int npages, int flags, int rw) (flags & PGO_PDFREECLUST) ? IO_NOCACHE : 0, curproc->p_ucred); + if (netlocked) + rw_enter_write(&netlock); + if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0) VOP_UNLOCK(vn, curproc); }