On Mon, Nov 15, 2021 at 02:16:34PM +0300, Vitaliy Makkoveev wrote:
> ping...

OK bluhm@

> On Sat, Nov 13, 2021 at 07:20:23PM +0300, Vitaliy Makkoveev wrote:
> > On Fri, Nov 12, 2021 at 03:28:42AM +0300, Vitaliy Makkoveev wrote:
> > > The final step before rework UNIX sockets to fine grained locks. Except
> > > `unp_ino' this leaves only per-socket data protected by `unp_lock'. The
> > > `unp_ino' protection is not the big deal and will be done with mutex(9)
> > > in the future diff.
> > > 
> > > The `unp_gc_lock' rwlock(9) protects `unp_defer', `unp_gcing',
> > > `unp_gcflags' and `unp_link' list.
> > > 
> > > We need to simultaneously lock `unp_gc_lock' and `unp_lock'. When we
> > > perform unp_attach() or unp_detach() we link PCB to `unp_link' list with
> > > `unp_lock' held and the lock order should be `unp_lock' ->
> > > `unp_gc_lock'. But when unp_gc() does `unp_link' list walkthrough with
> > > the `unp_gc_lock' lock held it should lock socket while performs
> > > `so_rcv' buffer scan and the lock order should be the opposite.
> > > 
> > > In the future diff `unp_lock' will be replaced by per-socket `so_lock'
> > > so it's better to enforce `unp_gc_lock' -> `unp_lock' (solock()) lock
> > > order and release `unp_lock' in the unp_attach() and unp_detach() paths.
> > > The previously committed diffs made this safe.
> > > 
> > > The `unp_df_lock' introduced because the `unp_lock' and `unp_gc_lock'
> > > state are unknown when unp_discard() called. Since it touches only
> > > `ud_link' list the re-lock dances are unwanted in this path. Also this
> > > keeps M_WAITOK allocation outside rwlock(9) when unp_discard() called
> > > from unp_externalize() error path.
> > > 
> > > The garbage collector flags moved from solock() protected `unp_flags' to
> > > `unp_gc_lock' protected `unp_gcflags'
> > > 
> > > The regress/sys/kern/ungc could be used to test this diff.
> > >
> > 
> > Updated diff. solock() used to protect sockets in the unp_gc(). This
> > changes nothing but makes next diffs shorter.
> > 
> > Index: sys/kern/uipc_usrreq.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> > retrieving revision 1.156
> > diff -u -p -r1.156 uipc_usrreq.c
> > --- sys/kern/uipc_usrreq.c  11 Nov 2021 18:36:59 -0000      1.156
> > +++ sys/kern/uipc_usrreq.c  13 Nov 2021 16:18:51 -0000
> > @@ -59,12 +59,17 @@
> >  /*
> >   * Locks used to protect global data and struct members:
> >   *      I       immutable after creation
> > + *      D       unp_df_lock
> > + *      G       unp_gc_lock
> >   *      U       unp_lock
> >   *      R       unp_rights_mtx
> >   *      a       atomic
> >   */
> >  
> >  struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
> > +struct rwlock unp_df_lock = RWLOCK_INITIALIZER("unpdflk");
> > +struct rwlock unp_gc_lock = RWLOCK_INITIALIZER("unpgclk");
> > +
> >  struct mutex unp_rights_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
> >  
> >  /*
> > @@ -72,7 +77,7 @@ struct mutex unp_rights_mtx = MUTEX_INIT
> >   * not received and need to be closed.
> >   */
> >  struct     unp_deferral {
> > -   SLIST_ENTRY(unp_deferral)       ud_link;        /* [U] */
> > +   SLIST_ENTRY(unp_deferral)       ud_link;        /* [D] */
> >     int                             ud_n;           /* [I] */
> >     /* followed by ud_n struct fdpass */
> >     struct fdpass                   ud_fp[];        /* [I] */
> > @@ -97,17 +102,17 @@ struct task unp_gc_task = TASK_INITIALIZ
> >   */
> >  const struct       sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
> >  
> > -/* [U] list of all UNIX domain sockets, for unp_gc() */
> > +/* [G] list of all UNIX domain sockets, for unp_gc() */
> >  LIST_HEAD(unp_head, unpcb) unp_head =
> >     LIST_HEAD_INITIALIZER(unp_head);
> > -/* [U] list of sets of files that were sent over sockets that are now 
> > closed */
> > +/* [D] list of sets of files that were sent over sockets that are now 
> > closed */
> >  SLIST_HEAD(,unp_deferral)  unp_deferred =
> >     SLIST_HEAD_INITIALIZER(unp_deferred);
> >  
> >  ino_t      unp_ino;        /* [U] prototype for fake inode numbers */
> >  int        unp_rights;     /* [R] file descriptors in flight */
> > -int        unp_defer;      /* [U] number of deferred fp to close by the GC 
> > task */
> > -int        unp_gcing;      /* [U] GC task currently running */
> > +int        unp_defer;      /* [G] number of deferred fp to close by the GC 
> > task */
> > +int        unp_gcing;      /* [G] GC task currently running */
> >  
> >  void
> >  unp_init(void)
> > @@ -430,7 +435,21 @@ uipc_attach(struct socket *so, int proto
> >     unp->unp_socket = so;
> >     so->so_pcb = unp;
> >     getnanotime(&unp->unp_ctime);
> > +
> > +   /*
> > +    * Enforce `unp_gc_lock' -> `solock()' lock order.
> > +    */
> > +   /*
> > +    * We also release the lock on listening socket and on our peer
> > +    * socket when called from unp_connect(). This is safe. The
> > +    * listening socket protected by vnode(9) lock. The peer socket
> > +    * has 'UNP_CONNECTING' flag set.
> > +    */
> > +   sounlock(so, SL_LOCKED);
> > +   rw_enter_write(&unp_gc_lock);
> >     LIST_INSERT_HEAD(&unp_head, unp, unp_link);
> > +   rw_exit_write(&unp_gc_lock);
> > +   solock(so);
> >     return (0);
> >  }
> >  
> > @@ -490,28 +509,29 @@ unp_detach(struct unpcb *unp)
> >  
> >     rw_assert_wrlock(&unp_lock);
> >  
> > -   LIST_REMOVE(unp, unp_link);
> > +   unp->unp_vnode = NULL;
> >  
> > -   if (vp != NULL) {
> > -           unp->unp_vnode = NULL;
> > -
> > -           /*
> > -            * Enforce `i_lock' -> `unp_lock' because fifo
> > -            * subsystem requires it.
> > -            */
> > +   /*
> > +    * Enforce `unp_gc_lock' -> `solock()' lock order.
> > +    * Enforce `i_lock' -> `unp_lock' lock order.
> > +    */
> > +   sounlock(so, SL_LOCKED);
> >  
> > -           sounlock(so, SL_LOCKED);
> > +   rw_enter_write(&unp_gc_lock);
> > +   LIST_REMOVE(unp, unp_link);
> > +   rw_exit_write(&unp_gc_lock);
> >  
> > +   if (vp != NULL) {
> >             VOP_LOCK(vp, LK_EXCLUSIVE);
> >             vp->v_socket = NULL;
> >  
> >             KERNEL_LOCK();
> >             vput(vp);
> >             KERNEL_UNLOCK();
> > -
> > -           solock(so);
> >     }
> >  
> > +   solock(so);
> > +
> >     if (unp->unp_conn)
> >             unp_disconnect(unp);
> >     while (!SLIST_EMPTY(&unp->unp_refs))
> > @@ -954,10 +974,11 @@ restart:
> >  
> >     if (error) {
> >             if (nfds > 0) {
> > +                   /*
> > +                    * No lock required. We are the only `cm' holder.
> > +                    */
> >                     rp = ((struct fdpass *)CMSG_DATA(cm));
> > -                   rw_enter_write(&unp_lock);
> >                     unp_discard(rp, nfds);
> > -                   rw_exit_write(&unp_lock);
> >             }
> >     }
> >  
> > @@ -1093,16 +1114,17 @@ unp_gc(void *arg __unused)
> >     struct unpcb *unp;
> >     int nunref, i;
> >  
> > -   rw_enter_write(&unp_lock);
> > -
> > +   rw_enter_write(&unp_gc_lock);
> >     if (unp_gcing)
> >             goto unlock;
> >     unp_gcing = 1;
> > +   rw_exit_write(&unp_gc_lock);
> >  
> > +   rw_enter_write(&unp_df_lock);
> >     /* close any fds on the deferred list */
> >     while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
> >             SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
> > -           rw_exit_write(&unp_lock);
> > +           rw_exit_write(&unp_df_lock);
> >             for (i = 0; i < defer->ud_n; i++) {
> >                     fp = defer->ud_fp[i].fp;
> >                     if (fp == NULL)
> > @@ -1118,25 +1140,27 @@ unp_gc(void *arg __unused)
> >             }
> >             free(defer, M_TEMP, sizeof(*defer) +
> >                 sizeof(struct fdpass) * defer->ud_n);
> > -           rw_enter_write(&unp_lock);
> > +           rw_enter_write(&unp_df_lock);
> >     }
> > +   rw_exit_write(&unp_df_lock);
> >  
> > +   rw_enter_write(&unp_gc_lock);
> >     unp_defer = 0;
> >     LIST_FOREACH(unp, &unp_head, unp_link)
> > -           unp->unp_flags &= ~(UNP_GCMARK | UNP_GCDEFER | UNP_GCDEAD);
> > +           unp->unp_gcflags = 0;
> >     do {
> >             nunref = 0;
> >             LIST_FOREACH(unp, &unp_head, unp_link) {
> >                     fp = unp->unp_file;
> > -                   if (unp->unp_flags & UNP_GCDEFER) {
> > +                   if (unp->unp_gcflags & UNP_GCDEFER) {
> >                             /*
> >                              * This socket is referenced by another
> >                              * socket which is known to be live,
> >                              * so it's certainly live.
> >                              */
> > -                           unp->unp_flags &= ~UNP_GCDEFER;
> > +                           unp->unp_gcflags &= ~UNP_GCDEFER;
> >                             unp_defer--;
> > -                   } else if (unp->unp_flags & UNP_GCMARK) {
> > +                   } else if (unp->unp_gcflags & UNP_GCMARK) {
> >                             /* marked as live in previous pass */
> >                             continue;
> >                     } else if (fp == NULL) {
> > @@ -1155,7 +1179,7 @@ unp_gc(void *arg __unused)
> >                              */
> >                             if (fp->f_count == unp->unp_msgcount) {
> >                                     nunref++;
> > -                                   unp->unp_flags |= UNP_GCDEAD;
> > +                                   unp->unp_gcflags |= UNP_GCDEAD;
> >                                     continue;
> >                             }
> >                     }
> > @@ -1167,10 +1191,12 @@ unp_gc(void *arg __unused)
> >                      * sockets and note them as deferred (== referenced,
> >                      * but not yet marked).
> >                      */
> > -                   unp->unp_flags |= UNP_GCMARK;
> > +                   unp->unp_gcflags |= UNP_GCMARK;
> >  
> >                     so = unp->unp_socket;
> > +                   solock(so);
> >                     unp_scan(so->so_rcv.sb_mb, unp_mark);
> > +                   sounlock(so, SL_LOCKED);
> >             }
> >     } while (unp_defer);
> >  
> > @@ -1180,14 +1206,23 @@ unp_gc(void *arg __unused)
> >      */
> >     if (nunref) {
> >             LIST_FOREACH(unp, &unp_head, unp_link) {
> > -                   if (unp->unp_flags & UNP_GCDEAD)
> > -                           unp_scan(unp->unp_socket->so_rcv.sb_mb,
> > -                               unp_discard);
> > +                   if (unp->unp_gcflags & UNP_GCDEAD) {
> > +                           /*
> > +                            * This socket could still be connected
> > +                            * and if so it's `so_rcv' is still
> > +                            * accessible by concurrent PRU_SEND
> > +                            * thread.
> > +                            */
> > +                           so = unp->unp_socket;
> > +                           solock(so);
> > +                           unp_scan(so->so_rcv.sb_mb, unp_discard);
> > +                           sounlock(so, SL_LOCKED);
> > +                   }
> >             }
> >     }
> >     unp_gcing = 0;
> >  unlock:
> > -   rw_exit_write(&unp_lock);
> > +   rw_exit_write(&unp_gc_lock);
> >  }
> >  
> >  void
> > @@ -1233,7 +1268,7 @@ unp_mark(struct fdpass *rp, int nfds)
> >     struct unpcb *unp;
> >     int i;
> >  
> > -   rw_assert_wrlock(&unp_lock);
> > +   rw_assert_wrlock(&unp_gc_lock);
> >  
> >     for (i = 0; i < nfds; i++) {
> >             if (rp[i].fp == NULL)
> > @@ -1243,12 +1278,12 @@ unp_mark(struct fdpass *rp, int nfds)
> >             if (unp == NULL)
> >                     continue;
> >  
> > -           if (unp->unp_flags & (UNP_GCMARK|UNP_GCDEFER))
> > +           if (unp->unp_gcflags & (UNP_GCMARK|UNP_GCDEFER))
> >                     continue;
> >  
> >             unp_defer++;
> > -           unp->unp_flags |= UNP_GCDEFER;
> > -           unp->unp_flags &= ~UNP_GCDEAD;
> > +           unp->unp_gcflags |= UNP_GCDEFER;
> > +           unp->unp_gcflags &= ~UNP_GCDEAD;
> >     }
> >  }
> >  
> > @@ -1257,14 +1292,15 @@ unp_discard(struct fdpass *rp, int nfds)
> >  {
> >     struct unp_deferral *defer;
> >  
> > -   rw_assert_wrlock(&unp_lock);
> > -
> >     /* copy the file pointers to a deferral structure */
> >     defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
> >     defer->ud_n = nfds;
> >     memcpy(&defer->ud_fp[0], rp, sizeof(*rp) * nfds);
> >     memset(rp, 0, sizeof(*rp) * nfds);
> > +
> > +   rw_enter_write(&unp_df_lock);
> >     SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link);
> > +   rw_exit_write(&unp_df_lock);
> >  
> >     task_add(systqmp, &unp_gc_task);
> >  }
> > Index: sys/sys/unpcb.h
> > ===================================================================
> > RCS file: /cvs/src/sys/sys/unpcb.h,v
> > retrieving revision 1.19
> > diff -u -p -r1.19 unpcb.h
> > --- sys/sys/unpcb.h 6 Nov 2021 17:35:14 -0000       1.19
> > +++ sys/sys/unpcb.h 13 Nov 2021 16:18:51 -0000
> > @@ -59,6 +59,7 @@
> >   *
> >   * Locks used to protect struct members:
> >   *      I       immutable after creation
> > + *      G       unp_gc_lock
> >   *      U       unp_lock
> >   *      a       atomic
> >   */
> > @@ -75,9 +76,10 @@ struct   unpcb {
> >     struct  mbuf *unp_addr;         /* [U] bound address of socket */
> >     long    unp_msgcount;           /* [a] references from socket rcv buf */
> >     int     unp_flags;              /* [U] this unpcb contains peer eids */
> > +   int     unp_gcflags;            /* [G] garbge collector flags */
> >     struct  sockpeercred unp_connid;/* [U] id of peer process */
> >     struct  timespec unp_ctime;     /* [I] holds creation time */
> > -   LIST_ENTRY(unpcb) unp_link;     /* [U] link in per-AF list of sockets */
> > +   LIST_ENTRY(unpcb) unp_link;     /* [G] link in per-AF list of sockets */
> >  };
> >  
> >  /*
> > @@ -85,11 +87,15 @@ struct  unpcb {
> >   */
> >  #define UNP_FEIDS  0x01            /* unp_connid contains information */
> >  #define UNP_FEIDSBIND      0x02            /* unp_connid was set by a bind 
> > */
> > -#define UNP_GCMARK 0x04            /* mark during unp_gc() */
> > -#define UNP_GCDEFER        0x08            /* ref'd, but not marked in 
> > this pass */
> > -#define UNP_GCDEAD 0x10            /* unref'd in this pass */
> > -#define UNP_BINDING        0x20            /* unp is binding now */
> > -#define UNP_CONNECTING     0x40            /* unp is connecting now */
> > +#define UNP_BINDING        0x04            /* unp is binding now */
> > +#define UNP_CONNECTING     0x08            /* unp is connecting now */
> > +
> > +/*
> > + * flag bits in unp_gcflags
> > + */
> > +#define UNP_GCMARK 0x01            /* mark during unp_gc() */
> > +#define UNP_GCDEFER        0x02            /* ref'd, but not marked in 
> > this pass */
> > +#define UNP_GCDEAD 0x04            /* unref'd in this pass */
> >  
> >  #define    sotounpcb(so)   ((struct unpcb *)((so)->so_pcb))
> >  
> > 

Reply via email to