On 20/01/17(Fri) 12:04, Martin Pieuchot wrote:
> Diff below enables the NET_LOCK(), again.
> 
> What's new?
> 
>  - The lock ordering problem with fdplock() should now be fixed.  It is
>    also documented, fdplock() should be taken first if both locks are 
>    needed.
> 
>  - Page faults involving NFS, or any other code path already holding the
>    NET_LOCK(), have been fixed.  The recursion is similar to the existing
>    vnode problem, so I simply added a hack there.
> 
>  - I added some NET_ASSERT_UNLOCKED() to help finding other possible
>    deadlocks.

Thanks to all the testers!

Updated diff with:

   - A hack to mark and prevent the pflow(4)/pf(4) recursion reported by many testers

   - A fix to prevent the deadlock in accept(2) 

   - The introduction of inifioctl to prevent false positives when
     checking that no unwanted tsleep(9) calls are made while holding the
     NET_LOCK().

diff --git sys/kern/kern_rwlock.c sys/kern/kern_rwlock.c
index bdaee0c1ae7..9182b42706a 100644
--- sys/kern/kern_rwlock.c
+++ sys/kern/kern_rwlock.c
@@ -195,6 +195,9 @@ retry:
                unsigned long set = o | op->wait_set;
                int do_sleep;
 
+               if (panicstr)
+                       return (0);
+
                rw_enter_diag(rwl, flags);
 
                if (flags & RW_NOSLEEP)
@@ -205,7 +208,7 @@ retry:
                        sleep_setup_signal(&sls, op->wait_prio | PCATCH);
 
                do_sleep = !rw_cas(&rwl->rwl_owner, o, set);
-
+               NET_ASSERT_UNLOCKED();
                sleep_finish(&sls, do_sleep);
                if ((flags & RW_INTR) &&
                    (error = sleep_finish_signal(&sls)) != 0)
diff --git sys/kern/kern_synch.c sys/kern/kern_synch.c
index beeefe06458..68ad163f839 100644
--- sys/kern/kern_synch.c
+++ sys/kern/kern_synch.c
@@ -111,6 +111,11 @@ tsleep(const volatile void *ident, int priority, const 
char *wmesg, int timo)
        int hold_count;
 #endif
 
+#if 1
+       extern int inifioctl;
+       if (!inifioctl)
+               NET_ASSERT_UNLOCKED();
+#endif
        KASSERT((priority & ~(PRIMASK | PCATCH)) == 0);
 
 #ifdef MULTIPROCESSOR
@@ -170,6 +175,11 @@ msleep(const volatile void *ident, struct mutex *mtx, int 
priority,
        int hold_count;
 #endif
 
+#if 1
+       extern int inifioctl;
+       if (!inifioctl)
+               NET_ASSERT_UNLOCKED();
+#endif
        KASSERT((priority & ~(PRIMASK | PCATCH | PNORELOCK)) == 0);
        KASSERT(mtx != NULL);
 
diff --git sys/kern/sys_socket.c sys/kern/sys_socket.c
index 7c8d8f728d1..a3b7db9ab7c 100644
--- sys/kern/sys_socket.c
+++ sys/kern/sys_socket.c
@@ -121,7 +121,14 @@ soo_ioctl(struct file *fp, u_long cmd, caddr_t data, 
struct proc *p)
         */
        if (IOCGROUP(cmd) == 'i') {
                NET_LOCK(s);
+#if 1
+               extern int inifioctl;
+               inifioctl = 1;
+#endif
                error = ifioctl(so, cmd, data, p);
+#if 1
+               inifioctl = 0;
+#endif
                NET_UNLOCK(s);
                return (error);
        }
diff --git sys/kern/uipc_socket.c sys/kern/uipc_socket.c
index 245210e594a..edc419eb937 100644
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -256,7 +256,7 @@ soclose(struct socket *so)
                            (so->so_state & SS_NBIO))
                                goto drop;
                        while (so->so_state & SS_ISCONNECTED) {
-                               error = tsleep(&so->so_timeo,
+                               error = rwsleep(&so->so_timeo, &netlock,
                                    PSOCK | PCATCH, "netcls",
                                    so->so_linger * hz);
                                if (error)
@@ -615,7 +615,7 @@ sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
  * followed by an optional mbuf or mbufs containing ancillary data,
  * and then zero or more mbufs of data.
  * In order to avoid blocking network for the entire time here, we splx()
- * and release NET_LOCK() while doing the actual copy to user space.
+ * and release ``netlock'' while doing the actual copy to user space.
  * Although the sockbuf is locked, new data may still be appended,
  * and thus we must maintain consistency of the sockbuf during that time.
  *
@@ -800,9 +800,13 @@ dontblock:
                        if (controlp) {
                                if (pr->pr_domain->dom_externalize &&
                                    mtod(cm, struct cmsghdr *)->cmsg_type ==
-                                   SCM_RIGHTS)
-                                  error = (*pr->pr_domain->dom_externalize)(cm,
-                                      controllen, flags);
+                                   SCM_RIGHTS) {
+                                       NET_UNLOCK(s);
+                                       error =
+                                           (*pr->pr_domain->dom_externalize)
+                                           (cm, controllen, flags);
+                                       NET_LOCK(s);
+                               }
                                *controlp = cm;
                        } else {
                                /*
@@ -1039,7 +1043,7 @@ sorflush(struct socket *so)
        struct sockbuf asb;
 
        sb->sb_flags |= SB_NOINTR;
-       (void) sblock(sb, M_WAITOK, NULL);
+       (void) sblock(sb, M_WAITOK, &netlock);
        socantrcvmore(so);
        sbunlock(sb);
        asb = *sb;
@@ -1528,7 +1532,10 @@ sorwakeup(struct socket *so)
 #endif
        sowakeup(so, &so->so_rcv);
        if (so->so_upcall) {
+               /* XXXSMP breaks atomicity */
+               rw_exit_write(&netlock);
                (*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
+               rw_enter_write(&netlock);
        }
 }
 
diff --git sys/kern/uipc_socket2.c sys/kern/uipc_socket2.c
index 1ac5f515bc5..6dd5abbd473 100644
--- sys/kern/uipc_socket2.c
+++ sys/kern/uipc_socket2.c
@@ -276,7 +276,7 @@ sbwait(struct sockbuf *sb)
        NET_ASSERT_LOCKED();
 
        sb->sb_flagsintr |= SB_WAIT;
-       return (tsleep(&sb->sb_cc,
+       return (rwsleep(&sb->sb_cc, &netlock,
            (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "netio",
            sb->sb_timeo));
 }
diff --git sys/kern/uipc_syscalls.c sys/kern/uipc_syscalls.c
index ed24be8ab85..1a9460f78a4 100644
--- sys/kern/uipc_syscalls.c
+++ sys/kern/uipc_syscalls.c
@@ -265,7 +265,7 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, 
socklen_t *anamelen,
 {
        struct filedesc *fdp = p->p_fd;
        struct file *fp, *headfp;
-       struct mbuf *nam;
+       struct mbuf *nam = NULL;
        socklen_t namelen;
        int error, s, tmpfd;
        struct socket *head, *so;
@@ -277,55 +277,56 @@ doaccept(struct proc *p, int sock, struct sockaddr *name, 
socklen_t *anamelen,
                return (error);
 
        headfp = fp;
+       head = headfp->f_data;
+
+       fdplock(fdp);
+       error = falloc(p, &fp, &tmpfd);
+       if (error == 0 && (flags & SOCK_CLOEXEC))
+               fdp->fd_ofileflags[tmpfd] |= UF_EXCLOSE;
+       fdpunlock(fdp);
+       if (error != 0) {
+               /*
+                * Probably ran out of file descriptors.  Wakeup
+                * so some other process might have a chance at it.
+                */
+               wakeup_one(&head->so_timeo);
+               FRELE(headfp, p);
+               return (error);
+       }
+
 redo:
        NET_LOCK(s);
-       head = headfp->f_data;
        if (isdnssocket(head) || (head->so_options & SO_ACCEPTCONN) == 0) {
                error = EINVAL;
-               goto bad;
+               goto out;
        }
        if ((head->so_state & SS_NBIO) && head->so_qlen == 0) {
                if (head->so_state & SS_CANTRCVMORE)
                        error = ECONNABORTED;
                else
                        error = EWOULDBLOCK;
-               goto bad;
+               goto out;
        }
        while (head->so_qlen == 0 && head->so_error == 0) {
                if (head->so_state & SS_CANTRCVMORE) {
                        head->so_error = ECONNABORTED;
                        break;
                }
-               error = tsleep(&head->so_timeo, PSOCK | PCATCH,
+               error = rwsleep(&head->so_timeo, &netlock, PSOCK | PCATCH,
                    "netcon", 0);
-               if (error) {
-                       goto bad;
-               }
+               if (error)
+                       goto out;
        }
        if (head->so_error) {
                error = head->so_error;
                head->so_error = 0;
-               goto bad;
+               goto out;
        }
 
        /* Figure out whether the new socket should be non-blocking. */
        nflag = flags & SOCK_NONBLOCK_INHERIT ? (headfp->f_flag & FNONBLOCK)
            : (flags & SOCK_NONBLOCK ? FNONBLOCK : 0);
 
-       fdplock(fdp);
-       error = falloc(p, &fp, &tmpfd);
-       if (error == 0 && (flags & SOCK_CLOEXEC))
-               fdp->fd_ofileflags[tmpfd] |= UF_EXCLOSE;
-       fdpunlock(fdp);
-       if (error != 0) {
-               /*
-                * Probably ran out of file descriptors.  Wakeup
-                * so some other process might have a chance at it.
-                */
-               wakeup_one(&head->so_timeo);
-               goto bad;
-       }
-
        nam = m_get(M_WAIT, MT_SONAME);
 
        /*
@@ -360,24 +361,20 @@ redo:
        error = soaccept(so, nam);
        if (!error && name != NULL)
                error = copyaddrout(p, nam, name, namelen, anamelen);
-
-       if (error) {
-               /* if an error occurred, free the file descriptor */
-               NET_UNLOCK(s);
-               m_freem(nam);
+       if (!error) {
+               (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&nflag, p);
+               FILE_SET_MATURE(fp, p);
+               *retval = tmpfd;
+       }
+out:
+       NET_UNLOCK(s);
+       m_freem(nam);
+       if (error != 0) {
                fdplock(fdp);
                fdremove(fdp, tmpfd);
                closef(fp, p);
                fdpunlock(fdp);
-               goto out;
        }
-       (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&nflag, p);
-       FILE_SET_MATURE(fp, p);
-       *retval = tmpfd;
-       m_freem(nam);
-bad:
-       NET_UNLOCK(s);
-out:
        FRELE(headfp, p);
        return (error);
 }
@@ -436,7 +433,7 @@ sys_connect(struct proc *p, void *v, register_t *retval)
        }
        NET_LOCK(s);
        while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
-               error = tsleep(&so->so_timeo, PSOCK | PCATCH,
+               error = rwsleep(&so->so_timeo, &netlock, PSOCK | PCATCH,
                    "netcon2", 0);
                if (error) {
                        if (error == EINTR || error == ERESTART)
diff --git sys/kern/uipc_usrreq.c sys/kern/uipc_usrreq.c
index 94204dc527b..556d1d08069 100644
--- sys/kern/uipc_usrreq.c
+++ sys/kern/uipc_usrreq.c
@@ -131,7 +131,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, 
struct mbuf *nam,
                break;
 
        case PRU_BIND:
+               /* XXXSMP breaks atomicity */
+               rw_exit_write(&netlock);
                error = unp_bind(unp, nam, p);
+               rw_enter_write(&netlock);
                break;
 
        case PRU_LISTEN:
@@ -140,7 +143,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, 
struct mbuf *nam,
                break;
 
        case PRU_CONNECT:
+               /* XXXSMP breaks atomicity */
+               rw_exit_write(&netlock);
                error = unp_connect(so, nam, p);
+               rw_enter_write(&netlock);
                break;
 
        case PRU_CONNECT2:
@@ -208,7 +214,10 @@ uipc_usrreq(struct socket *so, int req, struct mbuf *m, 
struct mbuf *nam,
                                        error = EISCONN;
                                        break;
                                }
+                               /* XXXSMP breaks atomicity */
+                               rw_exit_write(&netlock);
                                error = unp_connect(so, nam, p);
+                               rw_enter_write(&netlock);
                                if (error)
                                        break;
                        } else {
@@ -485,6 +494,8 @@ unp_connect(struct socket *so, struct mbuf *nam, struct 
proc *p)
        struct nameidata nd;
        int error, s;
 
+       NET_ASSERT_UNLOCKED();
+
        if (soun->sun_family != AF_UNIX)
                return (EAFNOSUPPORT);
 
diff --git sys/net/if.c sys/net/if.c
index 20e9dcbc9de..ffa1d99347f 100644
--- sys/net/if.c
+++ sys/net/if.c
@@ -230,6 +230,25 @@ struct taskq       *softnettq;
 struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL);
 
 /*
+ * Serialize socket operations to ensure no new sleeping points
+ * are introduced in IP output paths.
+ *
+ * If a code path needs both the NET_LOCK() and fdplock(), they
+ * have to be taken in the following order:
+ *
+ *  fdplock()
+ *  NET_LOCK()
+ *  ...
+ *  NET_UNLOCK()
+ *  fdpunlock()
+ */
+struct rwlock netlock = RWLOCK_INITIALIZER("netlock");
+#if 1
+/* tsleep() is ok if inside ifioctl(). */
+int inifioctl;
+#endif
+
+/*
  * Network interface utility routines.
  */
 void
@@ -1076,7 +1095,10 @@ if_clone_create(const char *name, int rdomain)
        if (ifunit(name) != NULL)
                return (EEXIST);
 
+       /* XXXSMP breaks atomicity */
+       rw_exit_write(&netlock);
        ret = (*ifc->ifc_create)(ifc, unit);
+       rw_enter_write(&netlock);
 
        if (ret != 0 || (ifp = ifunit(name)) == NULL)
                return (ret);
@@ -1118,7 +1140,10 @@ if_clone_destroy(const char *name)
                splx(s);
        }
 
+       /* XXXSMP breaks atomicity */
+       rw_exit_write(&netlock);
        ret = (*ifc->ifc_destroy)(ifp);
+       rw_enter_write(&netlock);
 
        return (ret);
 }
diff --git sys/net/if_pflow.c sys/net/if_pflow.c
index 50a2a2cd1da..f929b86c5a9 100644
--- sys/net/if_pflow.c
+++ sys/net/if_pflow.c
@@ -504,6 +504,8 @@ pflowioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
                    sizeof(pflowr))))
                        return (error);
 
+               /* XXXSMP breaks atomicity */
+               rw_exit_write(&netlock);
                s = splnet();
                error = pflow_set(sc, &pflowr);
                splx(s);
@@ -521,6 +523,7 @@ pflowioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
                } else
                        ifp->if_flags &= ~IFF_RUNNING;
 
+               rw_enter_write(&netlock);
                break;
 
        default:
diff --git sys/net/pf.c sys/net/pf.c
index 98578708bab..2ae434e405d 100644
--- sys/net/pf.c
+++ sys/net/pf.c
@@ -1305,8 +1305,12 @@ pf_remove_state(struct pf_state *cur)
        }
        RB_REMOVE(pf_state_tree_id, &tree_id, cur);
 #if NPFLOW > 0
-       if (cur->state_flags & PFSTATE_PFLOW)
+       if (cur->state_flags & PFSTATE_PFLOW) {
+               /* XXXSMP breaks atomicity */
+               rw_exit_write(&netlock);
                export_pflow(cur);
+               rw_enter_write(&netlock);
+       }
 #endif /* NPFLOW > 0 */
 #if NPFSYNC > 0
        pfsync_delete_state(cur);
diff --git sys/sys/filedesc.h sys/sys/filedesc.h
index 148c8bc0182..730e5794eb1 100644
--- sys/sys/filedesc.h
+++ sys/sys/filedesc.h
@@ -138,7 +138,7 @@ struct file *fd_getfile_mode(struct filedesc *, int, int);
 int    closef(struct file *, struct proc *);
 int    getsock(struct proc *, int, struct file **);
 
-#define        fdplock(fdp)    rw_enter_write(&(fdp)->fd_lock)
+#define        fdplock(fdp)    do { NET_ASSERT_UNLOCKED(); 
rw_enter_write(&(fdp)->fd_lock); } while (0)
 #define        fdpunlock(fdp)  rw_exit_write(&(fdp)->fd_lock)
 #define        fdpassertlocked(fdp)    rw_assert_wrlock(&(fdp)->fd_lock)
 #endif
diff --git sys/sys/systm.h sys/sys/systm.h
index f9d34f34a17..834de358be5 100644
--- sys/sys/systm.h
+++ sys/sys/systm.h
@@ -291,21 +291,35 @@ int       uiomove(void *, size_t, struct uio *);
 
 #if defined(_KERNEL)
 
+#include <sys/rwlock.h>
+
+extern struct rwlock netlock;
+
 #define        NET_LOCK(s)                                                     
\
 do {                                                                   \
+       rw_enter_write(&netlock);                                       \
        s = splsoftnet();                                               \
 } while (0)
 
-#define        NET_UNLOCK(s)                                                   
\
+#define        NET_UNLOCK(s)                                           \
 do {                                                                   \
        splx(s);                                                        \
+       rw_exit_write(&netlock);                                        \
 } while (0)
 
 #define        NET_ASSERT_LOCKED()                                             
\
 do {                                                                   \
+       if (rw_status(&netlock) != RW_WRITE)                            \
+               splassert_fail(RW_WRITE, rw_status(&netlock), __func__);\
        splsoftassert(IPL_SOFTNET);                                     \
 } while (0)
 
+#define        NET_ASSERT_UNLOCKED()                                           
\
+do {                                                                   \
+       if (rw_status(&netlock) == RW_WRITE)                            \
+               splassert_fail(0, rw_status(&netlock), __func__);       \
+} while (0)
+
 __returns_twice int    setjmp(label_t *);
 __dead void    longjmp(label_t *);
 #endif
diff --git sys/uvm/uvm_vnode.c sys/uvm/uvm_vnode.c
index 2abfac2f14d..b7a7f746806 100644
--- sys/uvm/uvm_vnode.c
+++ sys/uvm/uvm_vnode.c
@@ -1176,6 +1176,15 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int 
npages, int flags, int rw)
                result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL, curproc);
 
        if (result == 0) {
+               int netlocked = (rw_status(&netlock) == RW_WRITE);
+
+               /*
+                * This process may already have the NET_LOCK(), if we
+                * faulted in copyin() or copyout() in the network stack.
+                */
+               if (netlocked)
+                       rw_exit_write(&netlock);
+
                /* NOTE: vnode now locked! */
                if (rw == UIO_READ)
                        result = VOP_READ(vn, &uio, 0, curproc->p_ucred);
@@ -1184,6 +1193,9 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int npages, 
int flags, int rw)
                            (flags & PGO_PDFREECLUST) ? IO_NOCACHE : 0,
                            curproc->p_ucred);
 
+               if (netlocked)
+                       rw_enter_write(&netlock);
+
                if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
                        VOP_UNLOCK(vn, curproc);
        }

Reply via email to