On Mon, Nov 15, 2021 at 02:16:34PM +0300, Vitaliy Makkoveev wrote:
> ping...
OK bluhm@
> On Sat, Nov 13, 2021 at 07:20:23PM +0300, Vitaliy Makkoveev wrote:
> > On Fri, Nov 12, 2021 at 03:28:42AM +0300, Vitaliy Makkoveev wrote:
> > > The final step before rework UNIX sockets to fine grained locks. Except
> > > `unp_ino' this leaves only per-socket data protected by `unp_lock'. The
> > > `unp_ino' protection is not the big deal and will be done with mutex(9)
> > > in the future diff.
> > >
> > > The `unp_gc_lock' rwlock(9) protects `unp_defer', `unp_gcing',
> > > `unp_gcflags' and `unp_link' list.
> > >
> > > We need to simultaneously lock `unp_gc_lock' and `unp_lock'. When we
> > > perform unp_attach() or unp_detach() we link PCB to `unp_link' list with
> > > `unp_lock' held and the lock order should be `unp_lock' ->
> > > `unp_gc_lock'. But when unp_gc() does `unp_link' list walkthrough with
> > > the `unp_gc_lock' lock held it should lock socket while performs
> > > `so_rcv' buffer scan and the lock order should be the opposite.
> > >
> > > In the future diff `unp_lock' will be replaced by per-socket `so_lock'
> > > so it's better to enforce `unp_gc_lock' -> `unp_lock' (solock()) lock
> > > order and release `unp_lock' in the unp_attach() and unp_detach() paths.
> > > The previously committed diffs made this safe.
> > >
> > > The `unp_df_lock' introduced because the `unp_lock' and `unp_gc_lock'
> > > state are unknown when unp_discard() called. Since it touches only
> > > `ud_link' list the re-lock dances are unwanted in this path. Also this
> > > keeps M_WAITOK allocation outside rwlock(9) when unp_discard() called
> > > from unp_externalize() error path.
> > >
> > > The garbage collector flags moved from solock() protected `unp_flags' to
> > > `unp_gc_lock' protected `unp_gcflags'
> > >
> > > The regress/sys/kern/ungc could be used to test this diff.
> > >
> >
> > Updated diff. solock() used to protect sockets in the unp_gc(). This
> > changes nothing but makes next diffs shorter.
> >
> > Index: sys/kern/uipc_usrreq.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> > retrieving revision 1.156
> > diff -u -p -r1.156 uipc_usrreq.c
> > --- sys/kern/uipc_usrreq.c 11 Nov 2021 18:36:59 -0000 1.156
> > +++ sys/kern/uipc_usrreq.c 13 Nov 2021 16:18:51 -0000
> > @@ -59,12 +59,17 @@
> > /*
> > * Locks used to protect global data and struct members:
> > * I immutable after creation
> > + * D unp_df_lock
> > + * G unp_gc_lock
> > * U unp_lock
> > * R unp_rights_mtx
> > * a atomic
> > */
> >
> > struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
> > +struct rwlock unp_df_lock = RWLOCK_INITIALIZER("unpdflk");
> > +struct rwlock unp_gc_lock = RWLOCK_INITIALIZER("unpgclk");
> > +
> > struct mutex unp_rights_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
> >
> > /*
> > @@ -72,7 +77,7 @@ struct mutex unp_rights_mtx = MUTEX_INIT
> > * not received and need to be closed.
> > */
> > struct unp_deferral {
> > - SLIST_ENTRY(unp_deferral) ud_link; /* [U] */
> > + SLIST_ENTRY(unp_deferral) ud_link; /* [D] */
> > int ud_n; /* [I] */
> > /* followed by ud_n struct fdpass */
> > struct fdpass ud_fp[]; /* [I] */
> > @@ -97,17 +102,17 @@ struct task unp_gc_task = TASK_INITIALIZ
> > */
> > const struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
> >
> > -/* [U] list of all UNIX domain sockets, for unp_gc() */
> > +/* [G] list of all UNIX domain sockets, for unp_gc() */
> > LIST_HEAD(unp_head, unpcb) unp_head =
> > LIST_HEAD_INITIALIZER(unp_head);
> > -/* [U] list of sets of files that were sent over sockets that are now
> > closed */
> > +/* [D] list of sets of files that were sent over sockets that are now
> > closed */
> > SLIST_HEAD(,unp_deferral) unp_deferred =
> > SLIST_HEAD_INITIALIZER(unp_deferred);
> >
> > ino_t unp_ino; /* [U] prototype for fake inode numbers */
> > int unp_rights; /* [R] file descriptors in flight */
> > -int unp_defer; /* [U] number of deferred fp to close by the GC
> > task */
> > -int unp_gcing; /* [U] GC task currently running */
> > +int unp_defer; /* [G] number of deferred fp to close by the GC
> > task */
> > +int unp_gcing; /* [G] GC task currently running */
> >
> > void
> > unp_init(void)
> > @@ -430,7 +435,21 @@ uipc_attach(struct socket *so, int proto
> > unp->unp_socket = so;
> > so->so_pcb = unp;
> > getnanotime(&unp->unp_ctime);
> > +
> > + /*
> > + * Enforce `unp_gc_lock' -> `solock()' lock order.
> > + */
> > + /*
> > + * We also release the lock on listening socket and on our peer
> > + * socket when called from unp_connect(). This is safe. The
> > + * listening socket protected by vnode(9) lock. The peer socket
> > + * has 'UNP_CONNECTING' flag set.
> > + */
> > + sounlock(so, SL_LOCKED);
> > + rw_enter_write(&unp_gc_lock);
> > LIST_INSERT_HEAD(&unp_head, unp, unp_link);
> > + rw_exit_write(&unp_gc_lock);
> > + solock(so);
> > return (0);
> > }
> >
> > @@ -490,28 +509,29 @@ unp_detach(struct unpcb *unp)
> >
> > rw_assert_wrlock(&unp_lock);
> >
> > - LIST_REMOVE(unp, unp_link);
> > + unp->unp_vnode = NULL;
> >
> > - if (vp != NULL) {
> > - unp->unp_vnode = NULL;
> > -
> > - /*
> > - * Enforce `i_lock' -> `unp_lock' because fifo
> > - * subsystem requires it.
> > - */
> > + /*
> > + * Enforce `unp_gc_lock' -> `solock()' lock order.
> > + * Enforce `i_lock' -> `unp_lock' lock order.
> > + */
> > + sounlock(so, SL_LOCKED);
> >
> > - sounlock(so, SL_LOCKED);
> > + rw_enter_write(&unp_gc_lock);
> > + LIST_REMOVE(unp, unp_link);
> > + rw_exit_write(&unp_gc_lock);
> >
> > + if (vp != NULL) {
> > VOP_LOCK(vp, LK_EXCLUSIVE);
> > vp->v_socket = NULL;
> >
> > KERNEL_LOCK();
> > vput(vp);
> > KERNEL_UNLOCK();
> > -
> > - solock(so);
> > }
> >
> > + solock(so);
> > +
> > if (unp->unp_conn)
> > unp_disconnect(unp);
> > while (!SLIST_EMPTY(&unp->unp_refs))
> > @@ -954,10 +974,11 @@ restart:
> >
> > if (error) {
> > if (nfds > 0) {
> > + /*
> > + * No lock required. We are the only `cm' holder.
> > + */
> > rp = ((struct fdpass *)CMSG_DATA(cm));
> > - rw_enter_write(&unp_lock);
> > unp_discard(rp, nfds);
> > - rw_exit_write(&unp_lock);
> > }
> > }
> >
> > @@ -1093,16 +1114,17 @@ unp_gc(void *arg __unused)
> > struct unpcb *unp;
> > int nunref, i;
> >
> > - rw_enter_write(&unp_lock);
> > -
> > + rw_enter_write(&unp_gc_lock);
> > if (unp_gcing)
> > goto unlock;
> > unp_gcing = 1;
> > + rw_exit_write(&unp_gc_lock);
> >
> > + rw_enter_write(&unp_df_lock);
> > /* close any fds on the deferred list */
> > while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
> > SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
> > - rw_exit_write(&unp_lock);
> > + rw_exit_write(&unp_df_lock);
> > for (i = 0; i < defer->ud_n; i++) {
> > fp = defer->ud_fp[i].fp;
> > if (fp == NULL)
> > @@ -1118,25 +1140,27 @@ unp_gc(void *arg __unused)
> > }
> > free(defer, M_TEMP, sizeof(*defer) +
> > sizeof(struct fdpass) * defer->ud_n);
> > - rw_enter_write(&unp_lock);
> > + rw_enter_write(&unp_df_lock);
> > }
> > + rw_exit_write(&unp_df_lock);
> >
> > + rw_enter_write(&unp_gc_lock);
> > unp_defer = 0;
> > LIST_FOREACH(unp, &unp_head, unp_link)
> > - unp->unp_flags &= ~(UNP_GCMARK | UNP_GCDEFER | UNP_GCDEAD);
> > + unp->unp_gcflags = 0;
> > do {
> > nunref = 0;
> > LIST_FOREACH(unp, &unp_head, unp_link) {
> > fp = unp->unp_file;
> > - if (unp->unp_flags & UNP_GCDEFER) {
> > + if (unp->unp_gcflags & UNP_GCDEFER) {
> > /*
> > * This socket is referenced by another
> > * socket which is known to be live,
> > * so it's certainly live.
> > */
> > - unp->unp_flags &= ~UNP_GCDEFER;
> > + unp->unp_gcflags &= ~UNP_GCDEFER;
> > unp_defer--;
> > - } else if (unp->unp_flags & UNP_GCMARK) {
> > + } else if (unp->unp_gcflags & UNP_GCMARK) {
> > /* marked as live in previous pass */
> > continue;
> > } else if (fp == NULL) {
> > @@ -1155,7 +1179,7 @@ unp_gc(void *arg __unused)
> > */
> > if (fp->f_count == unp->unp_msgcount) {
> > nunref++;
> > - unp->unp_flags |= UNP_GCDEAD;
> > + unp->unp_gcflags |= UNP_GCDEAD;
> > continue;
> > }
> > }
> > @@ -1167,10 +1191,12 @@ unp_gc(void *arg __unused)
> > * sockets and note them as deferred (== referenced,
> > * but not yet marked).
> > */
> > - unp->unp_flags |= UNP_GCMARK;
> > + unp->unp_gcflags |= UNP_GCMARK;
> >
> > so = unp->unp_socket;
> > + solock(so);
> > unp_scan(so->so_rcv.sb_mb, unp_mark);
> > + sounlock(so, SL_LOCKED);
> > }
> > } while (unp_defer);
> >
> > @@ -1180,14 +1206,23 @@ unp_gc(void *arg __unused)
> > */
> > if (nunref) {
> > LIST_FOREACH(unp, &unp_head, unp_link) {
> > - if (unp->unp_flags & UNP_GCDEAD)
> > - unp_scan(unp->unp_socket->so_rcv.sb_mb,
> > - unp_discard);
> > + if (unp->unp_gcflags & UNP_GCDEAD) {
> > + /*
> > + * This socket could still be connected
> > + * and if so it's `so_rcv' is still
> > + * accessible by concurrent PRU_SEND
> > + * thread.
> > + */
> > + so = unp->unp_socket;
> > + solock(so);
> > + unp_scan(so->so_rcv.sb_mb, unp_discard);
> > + sounlock(so, SL_LOCKED);
> > + }
> > }
> > }
> > unp_gcing = 0;
> > unlock:
> > - rw_exit_write(&unp_lock);
> > + rw_exit_write(&unp_gc_lock);
> > }
> >
> > void
> > @@ -1233,7 +1268,7 @@ unp_mark(struct fdpass *rp, int nfds)
> > struct unpcb *unp;
> > int i;
> >
> > - rw_assert_wrlock(&unp_lock);
> > + rw_assert_wrlock(&unp_gc_lock);
> >
> > for (i = 0; i < nfds; i++) {
> > if (rp[i].fp == NULL)
> > @@ -1243,12 +1278,12 @@ unp_mark(struct fdpass *rp, int nfds)
> > if (unp == NULL)
> > continue;
> >
> > - if (unp->unp_flags & (UNP_GCMARK|UNP_GCDEFER))
> > + if (unp->unp_gcflags & (UNP_GCMARK|UNP_GCDEFER))
> > continue;
> >
> > unp_defer++;
> > - unp->unp_flags |= UNP_GCDEFER;
> > - unp->unp_flags &= ~UNP_GCDEAD;
> > + unp->unp_gcflags |= UNP_GCDEFER;
> > + unp->unp_gcflags &= ~UNP_GCDEAD;
> > }
> > }
> >
> > @@ -1257,14 +1292,15 @@ unp_discard(struct fdpass *rp, int nfds)
> > {
> > struct unp_deferral *defer;
> >
> > - rw_assert_wrlock(&unp_lock);
> > -
> > /* copy the file pointers to a deferral structure */
> > defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
> > defer->ud_n = nfds;
> > memcpy(&defer->ud_fp[0], rp, sizeof(*rp) * nfds);
> > memset(rp, 0, sizeof(*rp) * nfds);
> > +
> > + rw_enter_write(&unp_df_lock);
> > SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link);
> > + rw_exit_write(&unp_df_lock);
> >
> > task_add(systqmp, &unp_gc_task);
> > }
> > Index: sys/sys/unpcb.h
> > ===================================================================
> > RCS file: /cvs/src/sys/sys/unpcb.h,v
> > retrieving revision 1.19
> > diff -u -p -r1.19 unpcb.h
> > --- sys/sys/unpcb.h 6 Nov 2021 17:35:14 -0000 1.19
> > +++ sys/sys/unpcb.h 13 Nov 2021 16:18:51 -0000
> > @@ -59,6 +59,7 @@
> > *
> > * Locks used to protect struct members:
> > * I immutable after creation
> > + * G unp_gc_lock
> > * U unp_lock
> > * a atomic
> > */
> > @@ -75,9 +76,10 @@ struct unpcb {
> > struct mbuf *unp_addr; /* [U] bound address of socket */
> > long unp_msgcount; /* [a] references from socket rcv buf */
> > int unp_flags; /* [U] this unpcb contains peer eids */
> > + int unp_gcflags; /* [G] garbge collector flags */
> > struct sockpeercred unp_connid;/* [U] id of peer process */
> > struct timespec unp_ctime; /* [I] holds creation time */
> > - LIST_ENTRY(unpcb) unp_link; /* [U] link in per-AF list of sockets */
> > + LIST_ENTRY(unpcb) unp_link; /* [G] link in per-AF list of sockets */
> > };
> >
> > /*
> > @@ -85,11 +87,15 @@ struct unpcb {
> > */
> > #define UNP_FEIDS 0x01 /* unp_connid contains information */
> > #define UNP_FEIDSBIND 0x02 /* unp_connid was set by a bind
> > */
> > -#define UNP_GCMARK 0x04 /* mark during unp_gc() */
> > -#define UNP_GCDEFER 0x08 /* ref'd, but not marked in
> > this pass */
> > -#define UNP_GCDEAD 0x10 /* unref'd in this pass */
> > -#define UNP_BINDING 0x20 /* unp is binding now */
> > -#define UNP_CONNECTING 0x40 /* unp is connecting now */
> > +#define UNP_BINDING 0x04 /* unp is binding now */
> > +#define UNP_CONNECTING 0x08 /* unp is connecting now */
> > +
> > +/*
> > + * flag bits in unp_gcflags
> > + */
> > +#define UNP_GCMARK 0x01 /* mark during unp_gc() */
> > +#define UNP_GCDEFER 0x02 /* ref'd, but not marked in
> > this pass */
> > +#define UNP_GCDEAD 0x04 /* unref'd in this pass */
> >
> > #define sotounpcb(so) ((struct unpcb *)((so)->so_pcb))
> >
> >