On 20/01/17(Fri) 12:04, Martin Pieuchot wrote:
> Diff below enables the NET_LOCK(), again.
>
> What's new?
>
> - The lock ordering problem with fdplock() should now be fixed. It is
> also documented: fdplock() should be taken first if both locks are
> needed.
>
> - Page faults involving NFS, or any other code path already holding the
> NET_LOCK(), have been fixed. The recursion is similar to the existing
> vnode problem, so I simply added a hack there.
>
> - I added some NET_ASSERT_UNLOCKED() to help find other possible
> deadlocks.
Thanks to all the testers!

Updated diff with:

- A hack to mark and prevent the pflow(4)/pf(4) recursion reported by
  many.
- A fix to prevent the deadlock in accept(2).
- The introduction of inifioctl to prevent false positives when
  checking that no unwanted tsleep(9) calls are made while holding the
  NET_LOCK().
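
To make the ordering rule concrete, a path that needs both locks should
look roughly like this (a sketch of the documented convention, not an
excerpt from the diff; error handling omitted):

	fdplock(fdp);		/* fd table lock first... */
	NET_LOCK(s);		/* ...then the network lock */
	/* work that needs both the fd table and the network stack */
	NET_UNLOCK(s);
	fdpunlock(fdp);

Taking the two locks in the opposite order somewhere else is what
produces an AB/BA deadlock, which is why fdplock() now asserts that the
netlock is not held.
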
diff --git sys/kern/kern_rwlock.c sys/kern/kern_rwlock.c
index bdaee0c1ae7..9182b42706a 100644
--- sys/kern/kern_rwlock.c
+++ sys/kern/kern_rwlock.c
@@ -195,6 +195,9 @@ retry:
unsigned long set = o | op->wait_set;
int do_sleep;
+ if (panicstr)
+ return (0);
+
rw_enter_diag(rwl, flags);
if (flags & RW_NOSLEEP)
@@ -205,7 +208,7 @@ retry:
sleep_setup_signal(&sls, op->wait_prio | PCATCH);
do_sleep = !rw_cas(&rwl->rwl_owner, o, set);
-
+ NET_ASSERT_UNLOCKED();
sleep_finish(&sls, do_sleep);
if ((flags & RW_INTR) &&
(error = sleep_finish_signal(&sls)) != 0)
diff --git sys/kern/kern_synch.c sys/kern/kern_synch.c
index beeefe06458..68ad163f839 100644
--- sys/kern/kern_synch.c
+++ sys/kern/kern_synch.c
@@ -111,6 +111,11 @@ tsleep(const volatile void *ident, int priority, const char *wmesg, int timo)
int hold_count;
#endif
+#if 1
+ extern int inifioctl;
+ if (!inifioctl)
+ NET_ASSERT_UNLOCKED();
+#endif
KASSERT((priority & ~(PRIMASK | PCATCH)) == 0);
#ifdef MULTIPROCESSOR
@@ -170,6 +175,11 @@ msleep(const volatile void *ident, struct mutex *mtx, int priority,
int hold_count;
#endif
+#if 1
+ extern int inifioctl;
+ if (!inifioctl)
+ NET_ASSERT_UNLOCKED();
+#endif
KASSERT((priority & ~(PRIMASK | PCATCH | PNORELOCK)) == 0);
KASSERT(mtx != NULL);
diff --git sys/kern/sys_socket.c sys/kern/sys_socket.c
index 7c8d8f728d1..a3b7db9ab7c 100644
--- sys/kern/sys_socket.c
+++ sys/kern/sys_socket.c
@@ -121,7 +121,14 @@ soo_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
*/
if (IOCGROUP(cmd) == 'i') {
NET_LOCK(s);
+#if 1
+ extern int inifioctl;
+ inifioctl = 1;
+#endif
error = ifioctl(so, cmd, data, p);
+#if 1
+ inifioctl = 0;
+#endif
NET_UNLOCK(s);
return (error);
}
diff --git sys/kern/uipc_socket.c sys/kern/uipc_socket.c
index 245210e594a..edc419eb937 100644
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -256,7 +256,7 @@ soclose(struct socket *so)
(so->so_state & SS_NBIO))
goto drop;
while (so->so_state & SS_ISCONNECTED) {
- error = tsleep(&so->so_timeo,
+ error = rwsleep(&so->so_timeo, &netlock,
PSOCK | PCATCH, "netcls",
so->so_linger * hz);
if (error)
@@ -615,7 +615,7 @@ sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
* followed by an optional mbuf or mbufs containing ancillary data,
* and then zero or more mbufs of data.
* In order to avoid blocking network for the entire time here, we splx()
- * and release NET_LOCK() while doing the actual copy to user space.
+ * and release ``netlock'' while doing the actual copy to user space.
* Although the sockbuf is locked, new data may still be appended,
* and thus we must maintain consistency of the sockbuf during that time.
*
@@ -800,9 +800,13 @@ dontblock:
if (controlp) {
if (pr->pr_domain->dom_externalize &&
mtod(cm, struct cmsghdr *)->cmsg_type ==
- SCM_RIGHTS)
- error = (*pr->pr_domain->dom_externalize)(cm,
- controllen, flags);
+ SCM_RIGHTS) {
+ NET_UNLOCK(s);
+ error =
+ (*pr->pr_domain->dom_externalize)
+ (cm, controllen, flags);
+ NET_LOCK(s);
+ }
*controlp = cm;
} else {
/*
@@ -1039,7 +1043,7 @@ sorflush(struct socket *so)
struct sockbuf asb;
sb->sb_flags |= SB_NOINTR;
- (void) sblock(sb, M_WAITOK, NULL);
+ (void) sblock(sb, M_WAITOK, &netlock);
socantrcvmore(so);
sbunlock(sb);
asb = *sb;
@@ -1528,7 +1532,10 @@ sorwakeup(struct socket *so)
#endif
sowakeup(so, &so->so_rcv);
if (so->so_upcall) {
+ /* XXXSMP breaks atomicity */
+ rw_exit_write(&netlock);
(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
+ rw_enter_write(&netlock);
}
}
diff --git sys/kern/uipc_socket2.c sys/kern/uipc_socket2.c
index 1ac5f515bc5..6dd5abbd473 100644
--- sys/kern/uipc_socket2.c
+++ sys/kern/uipc_socket2.c
@@ -276,7 +276,7 @@ sbwait(struct sockbuf *sb)
NET_ASSERT_LOCKED();
sb->sb_flagsintr |= SB_WAIT;
- return (tsleep(&sb->sb_cc,
+ return (rwsleep(&sb->sb_cc, &netlock,
(sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "netio",
sb->sb_timeo));
}
diff --git sys/kern/uipc_syscalls.c sys/kern/uipc_syscalls.c
index ed24be8ab85..1a9460f78a4 100644
--- sys/kern/uipc_syscalls.c
+++ sys/kern/uipc_syscalls.c
@@ -265,7 +265,7 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, socklen_t *anamelen,
{
struct filedesc *fdp = p->p_fd;
struct file *fp, *headfp;
- struct mbuf *nam;
+ struct mbuf *nam = NULL;
socklen_t namelen;
int error, s, tmpfd;
struct socket *head, *so;
@@ -277,55 +277,56 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, socklen_t *anamelen,
return (error);
headfp = fp;
+ head = headfp->f_data;
+
+ fdplock(fdp);
+ error = falloc(p, &fp, &tmpfd);
+ if (error == 0 && (flags & SOCK_CLOEXEC))
+ fdp->fd_ofileflags[tmpfd] |= UF_EXCLOSE;
+ fdpunlock(fdp);
+ if (error != 0) {
+ /*
+ * Probably ran out of file descriptors. Wakeup
+ * so some other process might have a chance at it.
+ */
+ wakeup_one(&head->so_timeo);
+ FRELE(headfp, p);
+ return (error);
+ }
+
redo:
NET_LOCK(s);
- head = headfp->f_data;
if (isdnssocket(head) || (head->so_options & SO_ACCEPTCONN) == 0) {
error = EINVAL;
- goto bad;
+ goto out;
}
if ((head->so_state & SS_NBIO) && head->so_qlen == 0) {
if (head->so_state & SS_CANTRCVMORE)
error = ECONNABORTED;
else
error = EWOULDBLOCK;
- goto bad;
+ goto out;
}
while (head->so_qlen == 0 && head->so_error == 0) {
if (head->so_state & SS_CANTRCVMORE) {
head->so_error = ECONNABORTED;
break;
}
- error = tsleep(&head->so_timeo, PSOCK | PCATCH,
+ error = rwsleep(&head->so_timeo, &netlock, PSOCK | PCATCH,
"netcon", 0);
- if (error) {
- goto bad;
- }
+ if (error)
+ goto out;
}
if (head->so_error) {
error = head->so_error;
head->so_error = 0;
- goto bad;
+ goto out;
}
/* Figure out whether the new socket should be non-blocking. */
nflag = flags & SOCK_NONBLOCK_INHERIT ? (headfp->f_flag & FNONBLOCK)
: (flags & SOCK_NONBLOCK ? FNONBLOCK : 0);
- fdplock(fdp);
- error = falloc(p, &fp, &tmpfd);
- if (error == 0 && (flags & SOCK_CLOEXEC))
- fdp->fd_ofileflags[tmpfd] |= UF_EXCLOSE;
- fdpunlock(fdp);
- if (error != 0) {
- /*
- * Probably ran out of file descriptors. Wakeup
- * so some other process might have a chance at it.
- */
- wakeup_one(&head->so_timeo);
- goto bad;
- }
-
nam = m_get(M_WAIT, MT_SONAME);
/*
@@ -360,24 +361,20 @@ redo:
error = soaccept(so, nam);
if (!error && name != NULL)
error = copyaddrout(p, nam, name, namelen, anamelen);
-
- if (error) {
- /* if an error occurred, free the file descriptor */
- NET_UNLOCK(s);
- m_freem(nam);
+ if (!error) {
+ (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&nflag, p);
+ FILE_SET_MATURE(fp, p);
+ *retval = tmpfd;
+ }
+out:
+ NET_UNLOCK(s);
+ m_freem(nam);
+ if (error != 0) {
fdplock(fdp);
fdremove(fdp, tmpfd);
closef(fp, p);
fdpunlock(fdp);
- goto out;
}
- (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&nflag, p);
- FILE_SET_MATURE(fp, p);
- *retval = tmpfd;
- m_freem(nam);
-bad:
- NET_UNLOCK(s);
-out:
FRELE(headfp, p);
return (error);
}
@@ -436,7 +433,7 @@ sys_connect(struct proc *p, void *v, register_t *retval)
}
NET_LOCK(s);
while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
- error = tsleep(&so->so_timeo, PSOCK | PCATCH,
+ error = rwsleep(&so->so_timeo, &netlock, PSOCK | PCATCH,
"netcon2", 0);
if (error) {
if (error == EINTR || error == ERESTART)
diff --git sys/kern/uipc_usrreq.c sys/kern/uipc_usrreq.c
index 94204dc527b..556d1d08069 100644
--- sys/kern/uipc_usrreq.c
+++ sys/kern/uipc_usrreq.c
@@ -131,7 +131,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
break;
case PRU_BIND:
+ /* XXXSMP breaks atomicity */
+ rw_exit_write(&netlock);
error = unp_bind(unp, nam, p);
+ rw_enter_write(&netlock);
break;
case PRU_LISTEN:
@@ -140,7 +143,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
break;
case PRU_CONNECT:
+ /* XXXSMP breaks atomicity */
+ rw_exit_write(&netlock);
error = unp_connect(so, nam, p);
+ rw_enter_write(&netlock);
break;
case PRU_CONNECT2:
@@ -208,7 +214,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
error = EISCONN;
break;
}
+ /* XXXSMP breaks atomicity */
+ rw_exit_write(&netlock);
error = unp_connect(so, nam, p);
+ rw_enter_write(&netlock);
if (error)
break;
} else {
@@ -485,6 +494,8 @@ unp_connect(struct socket *so, struct mbuf *nam, struct proc *p)
struct nameidata nd;
int error, s;
+ NET_ASSERT_UNLOCKED();
+
if (soun->sun_family != AF_UNIX)
return (EAFNOSUPPORT);
diff --git sys/net/if.c sys/net/if.c
index 20e9dcbc9de..ffa1d99347f 100644
--- sys/net/if.c
+++ sys/net/if.c
@@ -230,6 +230,25 @@ struct taskq *softnettq;
struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL);
/*
+ * Serialize socket operations to ensure no new sleeping points
+ * are introduced in IP output paths.
+ *
+ * If a code path needs both the NET_LOCK() and fdplock(), they
+ * have to be taken in the following order:
+ *
+ * fdplock()
+ * NET_LOCK()
+ * ...
+ * NET_UNLOCK()
+ * fdpunlock()
+ */
+struct rwlock netlock = RWLOCK_INITIALIZER("netlock");
+#if 1
+/* tsleep() is ok if inside ifioctl(). */
+int inifioctl;
+#endif
+
+/*
* Network interface utility routines.
*/
void
@@ -1076,7 +1095,10 @@ if_clone_create(const char *name, int rdomain)
if (ifunit(name) != NULL)
return (EEXIST);
+ /* XXXSMP breaks atomicity */
+ rw_exit_write(&netlock);
ret = (*ifc->ifc_create)(ifc, unit);
+ rw_enter_write(&netlock);
if (ret != 0 || (ifp = ifunit(name)) == NULL)
return (ret);
@@ -1118,7 +1140,10 @@ if_clone_destroy(const char *name)
splx(s);
}
+ /* XXXSMP breaks atomicity */
+ rw_exit_write(&netlock);
ret = (*ifc->ifc_destroy)(ifp);
+ rw_enter_write(&netlock);
return (ret);
}
diff --git sys/net/if_pflow.c sys/net/if_pflow.c
index 50a2a2cd1da..f929b86c5a9 100644
--- sys/net/if_pflow.c
+++ sys/net/if_pflow.c
@@ -504,6 +504,8 @@ pflowioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
sizeof(pflowr))))
return (error);
+ /* XXXSMP breaks atomicity */
+ rw_exit_write(&netlock);
s = splnet();
error = pflow_set(sc, &pflowr);
splx(s);
@@ -521,6 +523,7 @@ pflowioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
} else
ifp->if_flags &= ~IFF_RUNNING;
+ rw_enter_write(&netlock);
break;
default:
diff --git sys/net/pf.c sys/net/pf.c
index 98578708bab..2ae434e405d 100644
--- sys/net/pf.c
+++ sys/net/pf.c
@@ -1305,8 +1305,12 @@ pf_remove_state(struct pf_state *cur)
}
RB_REMOVE(pf_state_tree_id, &tree_id, cur);
#if NPFLOW > 0
- if (cur->state_flags & PFSTATE_PFLOW)
+ if (cur->state_flags & PFSTATE_PFLOW) {
+ /* XXXSMP breaks atomicity */
+ rw_exit_write(&netlock);
export_pflow(cur);
+ rw_enter_write(&netlock);
+ }
#endif /* NPFLOW > 0 */
#if NPFSYNC > 0
pfsync_delete_state(cur);
diff --git sys/sys/filedesc.h sys/sys/filedesc.h
index 148c8bc0182..730e5794eb1 100644
--- sys/sys/filedesc.h
+++ sys/sys/filedesc.h
@@ -138,7 +138,7 @@ struct file *fd_getfile_mode(struct filedesc *, int, int);
int closef(struct file *, struct proc *);
int getsock(struct proc *, int, struct file **);
-#define fdplock(fdp) rw_enter_write(&(fdp)->fd_lock)
+#define fdplock(fdp) do { NET_ASSERT_UNLOCKED(); rw_enter_write(&(fdp)->fd_lock); } while (0)
#define fdpunlock(fdp) rw_exit_write(&(fdp)->fd_lock)
#define fdpassertlocked(fdp) rw_assert_wrlock(&(fdp)->fd_lock)
#endif
diff --git sys/sys/systm.h sys/sys/systm.h
index f9d34f34a17..834de358be5 100644
--- sys/sys/systm.h
+++ sys/sys/systm.h
@@ -291,21 +291,35 @@ int uiomove(void *, size_t, struct uio *);
#if defined(_KERNEL)
+#include <sys/rwlock.h>
+
+extern struct rwlock netlock;
+
#define NET_LOCK(s) \
do { \
+ rw_enter_write(&netlock); \
s = splsoftnet(); \
} while (0)
-#define NET_UNLOCK(s) \
+#define NET_UNLOCK(s) \
do { \
splx(s); \
+ rw_exit_write(&netlock); \
} while (0)
#define NET_ASSERT_LOCKED() \
do { \
+ if (rw_status(&netlock) != RW_WRITE) \
+ splassert_fail(RW_WRITE, rw_status(&netlock), __func__);\
splsoftassert(IPL_SOFTNET); \
} while (0)
+#define NET_ASSERT_UNLOCKED() \
+do { \
+ if (rw_status(&netlock) == RW_WRITE) \
+ splassert_fail(0, rw_status(&netlock), __func__); \
+} while (0)
+
__returns_twice int setjmp(label_t *);
__dead void longjmp(label_t *);
#endif
diff --git sys/uvm/uvm_vnode.c sys/uvm/uvm_vnode.c
index 2abfac2f14d..b7a7f746806 100644
--- sys/uvm/uvm_vnode.c
+++ sys/uvm/uvm_vnode.c
@@ -1176,6 +1176,15 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int npages, int flags, int rw)
result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL, curproc);
if (result == 0) {
+ int netlocked = (rw_status(&netlock) == RW_WRITE);
+
+ /*
+ * This process may already have the NET_LOCK(), if we
+ * faulted in copyin() or copyout() in the network stack.
+ */
+ if (netlocked)
+ rw_exit_write(&netlock);
+
/* NOTE: vnode now locked! */
if (rw == UIO_READ)
result = VOP_READ(vn, &uio, 0, curproc->p_ucred);
@@ -1184,6 +1193,9 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int npages, int flags, int rw)
(flags & PGO_PDFREECLUST) ? IO_NOCACHE : 0,
curproc->p_ucred);
+ if (netlocked)
+ rw_enter_write(&netlock);
+
if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
VOP_UNLOCK(vn, curproc);
}
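
For reviewers: the recurring /* XXXSMP breaks atomicity */ dance in the
diff is always the same shape. Roughly (a sketch of the idiom; the
called function is a placeholder standing in for unp_connect(),
ifc_create, export_pflow() and friends):

	/* XXXSMP breaks atomicity */
	rw_exit_write(&netlock);	/* drop the lock around code that may sleep */
	error = function_that_may_sleep();
	rw_enter_write(&netlock);	/* re-take it before touching net state again */

Any state examined before the rw_exit_write() must be revalidated after
rw_enter_write(), because other threads may have run in between; the
XXXSMP markers flag exactly those windows so they can be revisited.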