ping...
On Sat, Nov 13, 2021 at 07:20:23PM +0300, Vitaliy Makkoveev wrote:
> On Fri, Nov 12, 2021 at 03:28:42AM +0300, Vitaliy Makkoveev wrote:
> > The final step before rework UNIX sockets to fine grained locks. Except
> > `unp_ino' this leaves only per-socket data protected by `unp_lock'. The
> > `unp_ino' protection is not the big deal and will be done with mutex(9)
> > in the future diff.
> >
> > The `unp_gc_lock' rwlock(9) protects `unp_defer', `unp_gcing',
> > `unp_gcflags' and `unp_link' list.
> >
> > We need to simultaneously lock `unp_gc_lock' and `unp_lock'. When we
> > perform unp_attach() or unp_detach() we link PCB to `unp_link' list with
> > `unp_lock' held and the lock order should be `unp_lock' ->
> > `unp_gc_lock'. But when unp_gc() does `unp_link' list walkthrough with
> > the `unp_gc_lock' lock held it should lock socket while performs
> > `so_rcv' buffer scan and the lock order should be the opposite.
> >
> > In the future diff `unp_lock' will be replaced by per-socket `so_lock'
> > so it's better to enforce `unp_gc_lock' -> `unp_lock' (solock()) lock
> > order and release `unp_lock' in the unp_attach() and unp_detach() paths.
> > The previously committed diffs made this safe.
> >
> > The `unp_df_lock' introduced because the `unp_lock' and `unp_gc_lock'
> > state are unknown when unp_discard() called. Since it touches only
> > `ud_link' list the re-lock dances are unwanted in this path. Also this
> > keeps M_WAITOK allocation outside rwlock(9) when unp_discard() called
> > from unp_externalize() error path.
> >
> > The garbage collector flags moved from solock() protected `unp_flags' to
> > `unp_gc_lock' protected `unp_gcflags'
> >
> > The regress/sys/kern/ungc could be used to test this diff.
> >
>
> Updated diff. solock() used to protect sockets in the unp_gc(). This
> changes nothing but makes next diffs shorter.
>
> Index: sys/kern/uipc_usrreq.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> retrieving revision 1.156
> diff -u -p -r1.156 uipc_usrreq.c
> --- sys/kern/uipc_usrreq.c 11 Nov 2021 18:36:59 -0000 1.156
> +++ sys/kern/uipc_usrreq.c 13 Nov 2021 16:18:51 -0000
> @@ -59,12 +59,17 @@
> /*
> * Locks used to protect global data and struct members:
> * I immutable after creation
> + * D unp_df_lock
> + * G unp_gc_lock
> * U unp_lock
> * R unp_rights_mtx
> * a atomic
> */
>
> struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
> +struct rwlock unp_df_lock = RWLOCK_INITIALIZER("unpdflk");
> +struct rwlock unp_gc_lock = RWLOCK_INITIALIZER("unpgclk");
> +
> struct mutex unp_rights_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
>
> /*
> @@ -72,7 +77,7 @@ struct mutex unp_rights_mtx = MUTEX_INIT
> * not received and need to be closed.
> */
> struct unp_deferral {
> - SLIST_ENTRY(unp_deferral) ud_link; /* [U] */
> + SLIST_ENTRY(unp_deferral) ud_link; /* [D] */
> int ud_n; /* [I] */
> /* followed by ud_n struct fdpass */
> struct fdpass ud_fp[]; /* [I] */
> @@ -97,17 +102,17 @@ struct task unp_gc_task = TASK_INITIALIZ
> */
> const struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
>
> -/* [U] list of all UNIX domain sockets, for unp_gc() */
> +/* [G] list of all UNIX domain sockets, for unp_gc() */
> LIST_HEAD(unp_head, unpcb) unp_head =
> LIST_HEAD_INITIALIZER(unp_head);
> -/* [U] list of sets of files that were sent over sockets that are now closed
> */
> +/* [D] list of sets of files that were sent over sockets that are now closed
> */
> SLIST_HEAD(,unp_deferral) unp_deferred =
> SLIST_HEAD_INITIALIZER(unp_deferred);
>
> ino_t unp_ino; /* [U] prototype for fake inode numbers */
> int unp_rights; /* [R] file descriptors in flight */
> -int unp_defer; /* [U] number of deferred fp to close by the GC task */
> -int unp_gcing; /* [U] GC task currently running */
> +int unp_defer; /* [G] number of deferred fp to close by the GC task */
> +int unp_gcing; /* [G] GC task currently running */
>
> void
> unp_init(void)
> @@ -430,7 +435,21 @@ uipc_attach(struct socket *so, int proto
> unp->unp_socket = so;
> so->so_pcb = unp;
> getnanotime(&unp->unp_ctime);
> +
> + /*
> + * Enforce `unp_gc_lock' -> `solock()' lock order.
> + */
> + /*
> + * We also release the lock on listening socket and on our peer
> + * socket when called from unp_connect(). This is safe. The
> + * listening socket protected by vnode(9) lock. The peer socket
> + * has 'UNP_CONNECTING' flag set.
> + */
> + sounlock(so, SL_LOCKED);
> + rw_enter_write(&unp_gc_lock);
> LIST_INSERT_HEAD(&unp_head, unp, unp_link);
> + rw_exit_write(&unp_gc_lock);
> + solock(so);
> return (0);
> }
>
> @@ -490,28 +509,29 @@ unp_detach(struct unpcb *unp)
>
> rw_assert_wrlock(&unp_lock);
>
> - LIST_REMOVE(unp, unp_link);
> + unp->unp_vnode = NULL;
>
> - if (vp != NULL) {
> - unp->unp_vnode = NULL;
> -
> - /*
> - * Enforce `i_lock' -> `unp_lock' because fifo
> - * subsystem requires it.
> - */
> + /*
> + * Enforce `unp_gc_lock' -> `solock()' lock order.
> + * Enforce `i_lock' -> `unp_lock' lock order.
> + */
> + sounlock(so, SL_LOCKED);
>
> - sounlock(so, SL_LOCKED);
> + rw_enter_write(&unp_gc_lock);
> + LIST_REMOVE(unp, unp_link);
> + rw_exit_write(&unp_gc_lock);
>
> + if (vp != NULL) {
> VOP_LOCK(vp, LK_EXCLUSIVE);
> vp->v_socket = NULL;
>
> KERNEL_LOCK();
> vput(vp);
> KERNEL_UNLOCK();
> -
> - solock(so);
> }
>
> + solock(so);
> +
> if (unp->unp_conn)
> unp_disconnect(unp);
> while (!SLIST_EMPTY(&unp->unp_refs))
> @@ -954,10 +974,11 @@ restart:
>
> if (error) {
> if (nfds > 0) {
> + /*
> + * No lock required. We are the only `cm' holder.
> + */
> rp = ((struct fdpass *)CMSG_DATA(cm));
> - rw_enter_write(&unp_lock);
> unp_discard(rp, nfds);
> - rw_exit_write(&unp_lock);
> }
> }
>
> @@ -1093,16 +1114,17 @@ unp_gc(void *arg __unused)
> struct unpcb *unp;
> int nunref, i;
>
> - rw_enter_write(&unp_lock);
> -
> + rw_enter_write(&unp_gc_lock);
> if (unp_gcing)
> goto unlock;
> unp_gcing = 1;
> + rw_exit_write(&unp_gc_lock);
>
> + rw_enter_write(&unp_df_lock);
> /* close any fds on the deferred list */
> while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
> SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
> - rw_exit_write(&unp_lock);
> + rw_exit_write(&unp_df_lock);
> for (i = 0; i < defer->ud_n; i++) {
> fp = defer->ud_fp[i].fp;
> if (fp == NULL)
> @@ -1118,25 +1140,27 @@ unp_gc(void *arg __unused)
> }
> free(defer, M_TEMP, sizeof(*defer) +
> sizeof(struct fdpass) * defer->ud_n);
> - rw_enter_write(&unp_lock);
> + rw_enter_write(&unp_df_lock);
> }
> + rw_exit_write(&unp_df_lock);
>
> + rw_enter_write(&unp_gc_lock);
> unp_defer = 0;
> LIST_FOREACH(unp, &unp_head, unp_link)
> - unp->unp_flags &= ~(UNP_GCMARK | UNP_GCDEFER | UNP_GCDEAD);
> + unp->unp_gcflags = 0;
> do {
> nunref = 0;
> LIST_FOREACH(unp, &unp_head, unp_link) {
> fp = unp->unp_file;
> - if (unp->unp_flags & UNP_GCDEFER) {
> + if (unp->unp_gcflags & UNP_GCDEFER) {
> /*
> * This socket is referenced by another
> * socket which is known to be live,
> * so it's certainly live.
> */
> - unp->unp_flags &= ~UNP_GCDEFER;
> + unp->unp_gcflags &= ~UNP_GCDEFER;
> unp_defer--;
> - } else if (unp->unp_flags & UNP_GCMARK) {
> + } else if (unp->unp_gcflags & UNP_GCMARK) {
> /* marked as live in previous pass */
> continue;
> } else if (fp == NULL) {
> @@ -1155,7 +1179,7 @@ unp_gc(void *arg __unused)
> */
> if (fp->f_count == unp->unp_msgcount) {
> nunref++;
> - unp->unp_flags |= UNP_GCDEAD;
> + unp->unp_gcflags |= UNP_GCDEAD;
> continue;
> }
> }
> @@ -1167,10 +1191,12 @@ unp_gc(void *arg __unused)
> * sockets and note them as deferred (== referenced,
> * but not yet marked).
> */
> - unp->unp_flags |= UNP_GCMARK;
> + unp->unp_gcflags |= UNP_GCMARK;
>
> so = unp->unp_socket;
> + solock(so);
> unp_scan(so->so_rcv.sb_mb, unp_mark);
> + sounlock(so, SL_LOCKED);
> }
> } while (unp_defer);
>
> @@ -1180,14 +1206,23 @@ unp_gc(void *arg __unused)
> */
> if (nunref) {
> LIST_FOREACH(unp, &unp_head, unp_link) {
> - if (unp->unp_flags & UNP_GCDEAD)
> - unp_scan(unp->unp_socket->so_rcv.sb_mb,
> - unp_discard);
> + if (unp->unp_gcflags & UNP_GCDEAD) {
> + /*
> + * This socket could still be connected
> + * and if so it's `so_rcv' is still
> + * accessible by concurrent PRU_SEND
> + * thread.
> + */
> + so = unp->unp_socket;
> + solock(so);
> + unp_scan(so->so_rcv.sb_mb, unp_discard);
> + sounlock(so, SL_LOCKED);
> + }
> }
> }
> unp_gcing = 0;
> unlock:
> - rw_exit_write(&unp_lock);
> + rw_exit_write(&unp_gc_lock);
> }
>
> void
> @@ -1233,7 +1268,7 @@ unp_mark(struct fdpass *rp, int nfds)
> struct unpcb *unp;
> int i;
>
> - rw_assert_wrlock(&unp_lock);
> + rw_assert_wrlock(&unp_gc_lock);
>
> for (i = 0; i < nfds; i++) {
> if (rp[i].fp == NULL)
> @@ -1243,12 +1278,12 @@ unp_mark(struct fdpass *rp, int nfds)
> if (unp == NULL)
> continue;
>
> - if (unp->unp_flags & (UNP_GCMARK|UNP_GCDEFER))
> + if (unp->unp_gcflags & (UNP_GCMARK|UNP_GCDEFER))
> continue;
>
> unp_defer++;
> - unp->unp_flags |= UNP_GCDEFER;
> - unp->unp_flags &= ~UNP_GCDEAD;
> + unp->unp_gcflags |= UNP_GCDEFER;
> + unp->unp_gcflags &= ~UNP_GCDEAD;
> }
> }
>
> @@ -1257,14 +1292,15 @@ unp_discard(struct fdpass *rp, int nfds)
> {
> struct unp_deferral *defer;
>
> - rw_assert_wrlock(&unp_lock);
> -
> /* copy the file pointers to a deferral structure */
> defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
> defer->ud_n = nfds;
> memcpy(&defer->ud_fp[0], rp, sizeof(*rp) * nfds);
> memset(rp, 0, sizeof(*rp) * nfds);
> +
> + rw_enter_write(&unp_df_lock);
> SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link);
> + rw_exit_write(&unp_df_lock);
>
> task_add(systqmp, &unp_gc_task);
> }
> Index: sys/sys/unpcb.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/unpcb.h,v
> retrieving revision 1.19
> diff -u -p -r1.19 unpcb.h
> --- sys/sys/unpcb.h 6 Nov 2021 17:35:14 -0000 1.19
> +++ sys/sys/unpcb.h 13 Nov 2021 16:18:51 -0000
> @@ -59,6 +59,7 @@
> *
> * Locks used to protect struct members:
> * I immutable after creation
> + * G unp_gc_lock
> * U unp_lock
> * a atomic
> */
> @@ -75,9 +76,10 @@ struct unpcb {
> struct mbuf *unp_addr; /* [U] bound address of socket */
> long unp_msgcount; /* [a] references from socket rcv buf */
> int unp_flags; /* [U] this unpcb contains peer eids */
> + int unp_gcflags; /* [G] garbge collector flags */
> struct sockpeercred unp_connid;/* [U] id of peer process */
> struct timespec unp_ctime; /* [I] holds creation time */
> - LIST_ENTRY(unpcb) unp_link; /* [U] link in per-AF list of sockets */
> + LIST_ENTRY(unpcb) unp_link; /* [G] link in per-AF list of sockets */
> };
>
> /*
> @@ -85,11 +87,15 @@ struct unpcb {
> */
> #define UNP_FEIDS 0x01 /* unp_connid contains information */
> #define UNP_FEIDSBIND 0x02 /* unp_connid was set by a bind
> */
> -#define UNP_GCMARK 0x04 /* mark during unp_gc() */
> -#define UNP_GCDEFER 0x08 /* ref'd, but not marked in this pass */
> -#define UNP_GCDEAD 0x10 /* unref'd in this pass */
> -#define UNP_BINDING 0x20 /* unp is binding now */
> -#define UNP_CONNECTING 0x40 /* unp is connecting now */
> +#define UNP_BINDING 0x04 /* unp is binding now */
> +#define UNP_CONNECTING 0x08 /* unp is connecting now */
> +
> +/*
> + * flag bits in unp_gcflags
> + */
> +#define UNP_GCMARK 0x01 /* mark during unp_gc() */
> +#define UNP_GCDEFER 0x02 /* ref'd, but not marked in this pass */
> +#define UNP_GCDEAD 0x04 /* unref'd in this pass */
>
> #define sotounpcb(so) ((struct unpcb *)((so)->so_pcb))
>
>