The final step before rework UNIX sockets to fine grained locks. Except
`unp_ino' this leaves only per-socket data protected by `unp_lock'. The
`unp_ino' protection is not the big deal and will be done with mutex(9)
in the future diff.
The `unp_gc_lock' rwlock(9) protects `unp_defer', `unp_gcing',
`unp_gcflags' and `unp_link' list.
We need to simultaneously lock `unp_gc_lock' and `unp_lock'. When we
perform unp_attach() or unp_detach() we link PCB to `unp_link' list with
`unp_lock' held and the lock order should be `unp_lock' ->
`unp_gc_lock'. But when unp_gc() does `unp_link' list walkthrough with
the `unp_gc_lock' lock held it should lock socket while performs
`so_rcv' buffer scan and the lock order should be the opposite.
In the future diff `unp_lock' will be replaced by per-socket `so_lock'
so it's better to enforce `unp_gc_lock' -> `unp_lock' (solock()) lock
order and release `unp_lock' in the unp_attach() and unp_detach() paths.
The previously committed diffs made this safe.
The `unp_df_lock' introduced because the `unp_lock' and `unp_gc_lock'
state are unknown when unp_discard() called. Since it touches only
`ud_link' list the re-lock dances are unwanted in this path. Also this
keeps M_WAITOK allocation outside rwlock(9) when unp_discard() called
from unp_externalize() error path.
The garbage collector flags moved from solock() protected `unp_flags' to
`unp_gc_lock' protected `unp_gcflags'
The regress/sys/kern/ungc could be used to test this diff.
Index: sys/kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.156
diff -u -p -r1.156 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c 11 Nov 2021 18:36:59 -0000 1.156
+++ sys/kern/uipc_usrreq.c 12 Nov 2021 00:18:21 -0000
@@ -59,12 +59,17 @@
/*
* Locks used to protect global data and struct members:
* I immutable after creation
+ * D unp_df_lock
+ * G unp_gc_lock
* U unp_lock
* R unp_rights_mtx
* a atomic
*/
struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
+struct rwlock unp_df_lock = RWLOCK_INITIALIZER("unpdflk");
+struct rwlock unp_gc_lock = RWLOCK_INITIALIZER("unpgclk");
+
struct mutex unp_rights_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
/*
@@ -72,7 +77,7 @@ struct mutex unp_rights_mtx = MUTEX_INIT
* not received and need to be closed.
*/
struct unp_deferral {
- SLIST_ENTRY(unp_deferral) ud_link; /* [U] */
+ SLIST_ENTRY(unp_deferral) ud_link; /* [D] */
int ud_n; /* [I] */
/* followed by ud_n struct fdpass */
struct fdpass ud_fp[]; /* [I] */
@@ -97,17 +102,17 @@ struct task unp_gc_task = TASK_INITIALIZ
*/
const struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
-/* [U] list of all UNIX domain sockets, for unp_gc() */
+/* [G] list of all UNIX domain sockets, for unp_gc() */
LIST_HEAD(unp_head, unpcb) unp_head =
LIST_HEAD_INITIALIZER(unp_head);
-/* [U] list of sets of files that were sent over sockets that are now closed */
+/* [D] list of sets of files that were sent over sockets that are now closed */
SLIST_HEAD(,unp_deferral) unp_deferred =
SLIST_HEAD_INITIALIZER(unp_deferred);
ino_t unp_ino; /* [U] prototype for fake inode numbers */
int unp_rights; /* [R] file descriptors in flight */
-int unp_defer; /* [U] number of deferred fp to close by the GC task */
-int unp_gcing; /* [U] GC task currently running */
+int unp_defer; /* [G] number of deferred fp to close by the GC task */
+int unp_gcing; /* [G] GC task currently running */
void
unp_init(void)
@@ -430,7 +435,21 @@ uipc_attach(struct socket *so, int proto
unp->unp_socket = so;
so->so_pcb = unp;
getnanotime(&unp->unp_ctime);
+
+ /*
+ * Enforce `unp_gc_lock' -> `solock()' lock order.
+ */
+ /*
+ * We also release the lock on listening socket and on our peer
+ * socket when called from unp_connect(). This is safe. The
+ * listening socket protected by vnode(9) lock. The peer socket
+ * has 'UNP_CONNECTING' flag set.
+ */
+ sounlock(so, SL_LOCKED);
+ rw_enter_write(&unp_gc_lock);
LIST_INSERT_HEAD(&unp_head, unp, unp_link);
+ rw_exit_write(&unp_gc_lock);
+ solock(so);
return (0);
}
@@ -490,28 +509,29 @@ unp_detach(struct unpcb *unp)
rw_assert_wrlock(&unp_lock);
- LIST_REMOVE(unp, unp_link);
-
- if (vp != NULL) {
- unp->unp_vnode = NULL;
+ unp->unp_vnode = NULL;
- /*
- * Enforce `i_lock' -> `unp_lock' because fifo
- * subsystem requires it.
- */
+ /*
+ * Enforce `unp_gc_lock' -> `solock()' lock order.
+ * Enforce `i_lock' -> `unp_lock' lock order.
+ */
+ sounlock(so, SL_LOCKED);
- sounlock(so, SL_LOCKED);
+ rw_enter_write(&unp_gc_lock);
+ LIST_REMOVE(unp, unp_link);
+ rw_exit_write(&unp_gc_lock);
+ if (vp != NULL) {
VOP_LOCK(vp, LK_EXCLUSIVE);
vp->v_socket = NULL;
KERNEL_LOCK();
vput(vp);
KERNEL_UNLOCK();
-
- solock(so);
}
+ solock(so);
+
if (unp->unp_conn)
unp_disconnect(unp);
while (!SLIST_EMPTY(&unp->unp_refs))
@@ -954,10 +974,11 @@ restart:
if (error) {
if (nfds > 0) {
+ /*
+ * No lock required. We are the only `cm' holder.
+ */
rp = ((struct fdpass *)CMSG_DATA(cm));
- rw_enter_write(&unp_lock);
unp_discard(rp, nfds);
- rw_exit_write(&unp_lock);
}
}
@@ -1093,16 +1114,17 @@ unp_gc(void *arg __unused)
struct unpcb *unp;
int nunref, i;
- rw_enter_write(&unp_lock);
-
+ rw_enter_write(&unp_gc_lock);
if (unp_gcing)
goto unlock;
unp_gcing = 1;
+ rw_exit_write(&unp_gc_lock);
+ rw_enter_write(&unp_df_lock);
/* close any fds on the deferred list */
while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
- rw_exit_write(&unp_lock);
+ rw_exit_write(&unp_df_lock);
for (i = 0; i < defer->ud_n; i++) {
fp = defer->ud_fp[i].fp;
if (fp == NULL)
@@ -1118,25 +1140,27 @@ unp_gc(void *arg __unused)
}
free(defer, M_TEMP, sizeof(*defer) +
sizeof(struct fdpass) * defer->ud_n);
- rw_enter_write(&unp_lock);
+ rw_enter_write(&unp_df_lock);
}
+ rw_exit_write(&unp_df_lock);
+ rw_enter_write(&unp_gc_lock);
unp_defer = 0;
LIST_FOREACH(unp, &unp_head, unp_link)
- unp->unp_flags &= ~(UNP_GCMARK | UNP_GCDEFER | UNP_GCDEAD);
+ unp->unp_gcflags = 0;
do {
nunref = 0;
LIST_FOREACH(unp, &unp_head, unp_link) {
fp = unp->unp_file;
- if (unp->unp_flags & UNP_GCDEFER) {
+ if (unp->unp_gcflags & UNP_GCDEFER) {
/*
* This socket is referenced by another
* socket which is known to be live,
* so it's certainly live.
*/
- unp->unp_flags &= ~UNP_GCDEFER;
+ unp->unp_gcflags &= ~UNP_GCDEFER;
unp_defer--;
- } else if (unp->unp_flags & UNP_GCMARK) {
+ } else if (unp->unp_gcflags & UNP_GCMARK) {
/* marked as live in previous pass */
continue;
} else if (fp == NULL) {
@@ -1155,7 +1179,7 @@ unp_gc(void *arg __unused)
*/
if (fp->f_count == unp->unp_msgcount) {
nunref++;
- unp->unp_flags |= UNP_GCDEAD;
+ unp->unp_gcflags |= UNP_GCDEAD;
continue;
}
}
@@ -1167,10 +1191,12 @@ unp_gc(void *arg __unused)
* sockets and note them as deferred (== referenced,
* but not yet marked).
*/
- unp->unp_flags |= UNP_GCMARK;
+ unp->unp_gcflags |= UNP_GCMARK;
+ rw_enter_write(&unp_lock);
so = unp->unp_socket;
unp_scan(so->so_rcv.sb_mb, unp_mark);
+ rw_exit_write(&unp_lock);
}
} while (unp_defer);
@@ -1180,14 +1206,23 @@ unp_gc(void *arg __unused)
*/
if (nunref) {
LIST_FOREACH(unp, &unp_head, unp_link) {
- if (unp->unp_flags & UNP_GCDEAD)
+ if (unp->unp_gcflags & UNP_GCDEAD) {
+ /*
+ * This socket could still be connected
+ * and if so it's `so_rcv' is still
+ * accessible by concurrent PRU_SEND
+ * thread.
+ */
+ rw_enter_write(&unp_lock);
unp_scan(unp->unp_socket->so_rcv.sb_mb,
unp_discard);
+ rw_exit_write(&unp_lock);
+ }
}
}
unp_gcing = 0;
unlock:
- rw_exit_write(&unp_lock);
+ rw_exit_write(&unp_gc_lock);
}
void
@@ -1233,7 +1268,7 @@ unp_mark(struct fdpass *rp, int nfds)
struct unpcb *unp;
int i;
- rw_assert_wrlock(&unp_lock);
+ rw_assert_wrlock(&unp_gc_lock);
for (i = 0; i < nfds; i++) {
if (rp[i].fp == NULL)
@@ -1243,12 +1278,12 @@ unp_mark(struct fdpass *rp, int nfds)
if (unp == NULL)
continue;
- if (unp->unp_flags & (UNP_GCMARK|UNP_GCDEFER))
+ if (unp->unp_gcflags & (UNP_GCMARK|UNP_GCDEFER))
continue;
unp_defer++;
- unp->unp_flags |= UNP_GCDEFER;
- unp->unp_flags &= ~UNP_GCDEAD;
+ unp->unp_gcflags |= UNP_GCDEFER;
+ unp->unp_gcflags &= ~UNP_GCDEAD;
}
}
@@ -1257,14 +1292,15 @@ unp_discard(struct fdpass *rp, int nfds)
{
struct unp_deferral *defer;
- rw_assert_wrlock(&unp_lock);
-
/* copy the file pointers to a deferral structure */
defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
defer->ud_n = nfds;
memcpy(&defer->ud_fp[0], rp, sizeof(*rp) * nfds);
memset(rp, 0, sizeof(*rp) * nfds);
+
+ rw_enter_write(&unp_df_lock);
SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link);
+ rw_exit_write(&unp_df_lock);
task_add(systqmp, &unp_gc_task);
}
Index: sys/sys/unpcb.h
===================================================================
RCS file: /cvs/src/sys/sys/unpcb.h,v
retrieving revision 1.19
diff -u -p -r1.19 unpcb.h
--- sys/sys/unpcb.h 6 Nov 2021 17:35:14 -0000 1.19
+++ sys/sys/unpcb.h 12 Nov 2021 00:18:22 -0000
@@ -59,6 +59,7 @@
*
* Locks used to protect struct members:
* I immutable after creation
+ * G unp_gc_lock
* U unp_lock
* a atomic
*/
@@ -75,9 +76,10 @@ struct unpcb {
struct mbuf *unp_addr; /* [U] bound address of socket */
long unp_msgcount; /* [a] references from socket rcv buf */
int unp_flags; /* [U] this unpcb contains peer eids */
+ int unp_gcflags; /* [G] garbge collector flags */
struct sockpeercred unp_connid;/* [U] id of peer process */
struct timespec unp_ctime; /* [I] holds creation time */
- LIST_ENTRY(unpcb) unp_link; /* [U] link in per-AF list of sockets */
+ LIST_ENTRY(unpcb) unp_link; /* [G] link in per-AF list of sockets */
};
/*
@@ -85,11 +87,15 @@ struct unpcb {
*/
#define UNP_FEIDS 0x01 /* unp_connid contains information */
#define UNP_FEIDSBIND 0x02 /* unp_connid was set by a bind */
-#define UNP_GCMARK 0x04 /* mark during unp_gc() */
-#define UNP_GCDEFER 0x08 /* ref'd, but not marked in this pass */
-#define UNP_GCDEAD 0x10 /* unref'd in this pass */
-#define UNP_BINDING 0x20 /* unp is binding now */
-#define UNP_CONNECTING 0x40 /* unp is connecting now */
+#define UNP_BINDING 0x04 /* unp is binding now */
+#define UNP_CONNECTING 0x08 /* unp is connecting now */
+
+/*
+ * flag bits in unp_gcflags
+ */
+#define UNP_GCMARK 0x01 /* mark during unp_gc() */
+#define UNP_GCDEFER 0x02 /* ref'd, but not marked in this pass */
+#define UNP_GCDEAD 0x04 /* unref'd in this pass */
#define sotounpcb(so) ((struct unpcb *)((so)->so_pcb))