On Fri, Nov 05, 2021 at 09:50:05AM +0100, Mark Kettenis wrote:
> > Date: Fri, 5 Nov 2021 07:18:03 +0100
> > From: Martin Pieuchot <[email protected]>
> > 
> > On 30/10/21(Sat) 21:22, Vitaliy Makkoveev wrote:
> > > This completely removes global rwlock(9) from the unp_internalize() and
> > > unp_externalize() normal paths but only leaves it in unp_externalize()
> > > error path. Also we don't need to simultaneously hold both fdplock()
> > > and `unp_lock' in unp_internalize(). As non obvious profit this
> > > simplifies the future lock dances in the UNIX sockets layer.
> > > 
> > > It's safe to call fptounp() without `unp_lock' held. We always got this
> > > file descriptor by fd_getfile(9) so we always have the extra reference
> > > and this descriptor can't be closed by concurrent thread. Some sockets
> > > could be destroyed through 'PRU_ABORT' path but they don't have
> > > associated file descriptor and they are not accessible in the
> > > unp_internalize() path.
> > > 
> > > The `unp_file' access without `unp_lock' held is also safe. Each socket
> > > could have the only associated file descriptor and each file descriptor
> > > could have the only associated socket. We only assign `unp_file' in the
> > > unp_internalize() path where we got the socket by fd_getfile(9). This
> > > descriptor has the extra reference and couldn't be closed concurrently.
> > > We could override `unp_file' but with the same address because the
> > > associated file descriptor can't be changed so the address will be also
> > > the same. So while unp_gc() concurrently runs the dereference of
> > > non-NULL `unp_file' is always safe.
> > 
> > Using an atomic operation for `unp_msgcount' is ok with me, one comment
> > about `unp_rights' below.
> > 
> > [...] 
> > 
> > >  
> > > - rw_enter_write(&unp_lock);
> > > - if (unp_rights + nfds > maxfiles / 10) {
> > > -         rw_exit_write(&unp_lock);
> > > + if (atomic_add_int_nv(&unp_rights, nfds) > maxfiles / 10) {
> > > +         atomic_sub_int(&unp_rights, nfds);
> > 
> > I can't believe this is race free. If two threads, T1 and T2, call
> > atomic_add at the same time both might end up returning EMFILE even
> > if only the first one currently does.  This could happen if T1 exceeds
> > the limit and T2 does atomic_add on an already-exceeded `unp_rights'
> > before T1 could do atomic_sub.
> > 
> > I suggest using a mutex to protect `unp_rights' instead to solve this
> > issue.
> 
> Yes, that would be better.  Otherwise it would be trivial to DOS
> anything that does file descriptor passing.
>

Thanks for pointing, this sounds reasonable. The updated diff introduces
`unp_rights_mtx' mutex(9) to protect `unp_rights'.

Index: sys/kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.153
diff -u -p -r1.153 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c      30 Oct 2021 16:35:31 -0000      1.153
+++ sys/kern/uipc_usrreq.c      5 Nov 2021 12:56:41 -0000
@@ -52,14 +52,19 @@
 #include <sys/pledge.h>
 #include <sys/pool.h>
 #include <sys/rwlock.h>
+#include <sys/mutex.h>
 #include <sys/sysctl.h>
 
 /*
  * Locks used to protect global data and struct members:
  *      I       immutable after creation
  *      U       unp_lock
+ *      R       unp_rights_mtx
+ *      a       atomic
  */
+
 struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
+struct mutex unp_rights_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
 
 /*
  * Stack of sets of files that were passed over a socket but were
@@ -99,7 +104,7 @@ SLIST_HEAD(,unp_deferral)    unp_deferred =
        SLIST_HEAD_INITIALIZER(unp_deferred);
 
 ino_t  unp_ino;        /* [U] prototype for fake inode numbers */
-int    unp_rights;     /* [U] file descriptors in flight */
+int    unp_rights;     /* [R] file descriptors in flight */
 int    unp_defer;      /* [U] number of deferred fp to close by the GC task */
 int    unp_gcing;      /* [U] GC task currently running */
 
@@ -927,17 +932,18 @@ restart:
         */
        rp = (struct fdpass *)CMSG_DATA(cm);
 
-       rw_enter_write(&unp_lock);
        for (i = 0; i < nfds; i++) {
                struct unpcb *unp;
 
                fp = rp->fp;
                rp++;
                if ((unp = fptounp(fp)) != NULL)
-                       unp->unp_msgcount--;
-               unp_rights--;
+                       atomic_dec_long(&unp->unp_msgcount);
        }
-       rw_exit_write(&unp_lock);
+
+       mtx_enter(&unp_rights_mtx);
+       unp_rights -= nfds;
+       mtx_leave(&unp_rights_mtx);
 
        /*
         * Copy temporary array to message and adjust length, in case of
@@ -985,13 +991,13 @@ unp_internalize(struct mbuf *control, st
                return (EINVAL);
        nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);
 
-       rw_enter_write(&unp_lock);
+       mtx_enter(&unp_rights_mtx);
        if (unp_rights + nfds > maxfiles / 10) {
-               rw_exit_write(&unp_lock);
+               mtx_leave(&unp_rights_mtx);
                return (EMFILE);
        }
        unp_rights += nfds;
-       rw_exit_write(&unp_lock);
+       mtx_leave(&unp_rights_mtx);
 
        /* Make sure we have room for the struct file pointers */
 morespace:
@@ -1031,7 +1037,6 @@ morespace:
        ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
        rp = ((struct fdpass *)CMSG_DATA(cm)) + nfds - 1;
        fdplock(fdp);
-       rw_enter_write(&unp_lock);
        for (i = 0; i < nfds; i++) {
                memcpy(&fd, ip, sizeof fd);
                ip--;
@@ -1056,15 +1061,13 @@ morespace:
                rp->flags = fdp->fd_ofileflags[fd] & UF_PLEDGED;
                rp--;
                if ((unp = fptounp(fp)) != NULL) {
+                       atomic_inc_long(&unp->unp_msgcount);
                        unp->unp_file = fp;
-                       unp->unp_msgcount++;
                }
        }
-       rw_exit_write(&unp_lock);
        fdpunlock(fdp);
        return (0);
 fail:
-       rw_exit_write(&unp_lock);
        fdpunlock(fdp);
        if (fp != NULL)
                FRELE(fp, p);
@@ -1072,17 +1075,15 @@ fail:
        for ( ; i > 0; i--) {
                rp++;
                fp = rp->fp;
-               rw_enter_write(&unp_lock);
                if ((unp = fptounp(fp)) != NULL)
-                       unp->unp_msgcount--;
-               rw_exit_write(&unp_lock);
+                       atomic_dec_long(&unp->unp_msgcount);
                FRELE(fp, p);
        }
 
 nospace:
-       rw_enter_write(&unp_lock);
+       mtx_enter(&unp_rights_mtx);
        unp_rights -= nfds;
-       rw_exit_write(&unp_lock);
+       mtx_leave(&unp_rights_mtx);
 
        return (error);
 }
@@ -1105,21 +1106,23 @@ unp_gc(void *arg __unused)
        /* close any fds on the deferred list */
        while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
                SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
+               rw_exit_write(&unp_lock);
                for (i = 0; i < defer->ud_n; i++) {
                        fp = defer->ud_fp[i].fp;
                        if (fp == NULL)
                                continue;
-                        /* closef() expects a refcount of 2 */
-                       FREF(fp);
                        if ((unp = fptounp(fp)) != NULL)
-                               unp->unp_msgcount--;
+                               atomic_dec_long(&unp->unp_msgcount);
+                       mtx_enter(&unp_rights_mtx);
                        unp_rights--;
-                       rw_exit_write(&unp_lock);
+                       mtx_leave(&unp_rights_mtx);
+                        /* closef() expects a refcount of 2 */
+                       FREF(fp);
                        (void) closef(fp, NULL);
-                       rw_enter_write(&unp_lock);
                }
                free(defer, M_TEMP, sizeof(*defer) +
                    sizeof(struct fdpass) * defer->ud_n);
+               rw_enter_write(&unp_lock);
        }
 
        unp_defer = 0;
Index: sys/sys/unpcb.h
===================================================================
RCS file: /cvs/src/sys/sys/unpcb.h,v
retrieving revision 1.18
diff -u -p -r1.18 unpcb.h
--- sys/sys/unpcb.h     10 Feb 2021 08:20:09 -0000      1.18
+++ sys/sys/unpcb.h     5 Nov 2021 12:56:41 -0000
@@ -60,19 +60,20 @@
  * Locks used to protect struct members:
  *      I       immutable after creation
  *      U       unp_lock
+ *      a       atomic
  */
 
 
 struct unpcb {
        struct  socket *unp_socket;     /* [I] pointer back to socket */
        struct  vnode *unp_vnode;       /* [U] if associated with file */
-       struct  file *unp_file;         /* [U] backpointer for unp_gc() */
+       struct  file *unp_file;         /* [a] backpointer for unp_gc() */
        struct  unpcb *unp_conn;        /* [U] control block of connected 
socket */
        ino_t   unp_ino;                /* [U] fake inode number */
        SLIST_HEAD(,unpcb) unp_refs;    /* [U] referencing socket linked list */
        SLIST_ENTRY(unpcb) unp_nextref; /* [U] link in unp_refs list */
        struct  mbuf *unp_addr;         /* [U] bound address of socket */
-       long    unp_msgcount;           /* [U] references from socket rcv buf */
+       long    unp_msgcount;           /* [a] references from socket rcv buf */
        int     unp_flags;              /* [U] this unpcb contains peer eids */
        struct  sockpeercred unp_connid;/* [U] id of peer process */
        struct  timespec unp_ctime;     /* [I] holds creation time */

Reply via email to