I want to move a little bit forward with fine grained locking in UNIX
domain sockets area. My goal is to introduce the new rwlock(9) for
garbage collector data and keep existing `unp_lock' only for per-socket
data. This will allow to replace it by per-socket `so_lock' in the
future.
I like to start from unp_externalize(). It touches only garbage
collector data such as `unp_rights', `unp_msgcount' directly and
`unp_deferred' through unp_discard(). So `unp_lock' could be dropped
outside unp_externalize(). I don't want to introduce the new gc related
rwlock(9) with this diff, so `unp_lock' is still taken around garbage
collector data access. But right now this diff removes M_WAITOK
allocation from rwlock(9) and useless and weird fdplock() dances and
memory allocation from the error path. Also with this diff the only
place where we have `unp_lock' and fdplock() are both taken will be the
unp_internalize(). This allows to change their existing lock order or
completely avoid the case where these locks are both taken. This should
help to mpi@'s knote(9) work.
I want to do the unp_internalize() related work and introduce the new
garbage collector related rwlock(9) in the further diffs.
The socket locking could be not clean to somebody, so I like to put a
little explanation here. The solock() (`unp_lock') release is absolutely
safe around unp_externalize() within soreceive() path. The socket's
`so_rcv' buffer is still locked by sblock(). Concurrent thread could
disconnect this socket, but our soreceive() thread will be finished fine.
Such races are application specific. Also the lock sequence:
s = solock(so);
sblock(so, &so->so_rcv, ...);
sounlock(so, s);
s = solock(so);
does not provide lock order issue because the rwsleep_nsec(9) called
within sblock() releases passed rwlock(9). Since sblock() and sosleep()
release rwlock(9) taken by preceding solock(), the only `so_state' flags
and the PCB flags like `unp_flags' make socket consistent but not
solock().
Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.264
diff -u -p -r1.264 uipc_socket.c
--- sys/kern/uipc_socket.c 26 Jul 2021 05:51:13 -0000 1.264
+++ sys/kern/uipc_socket.c 2 Oct 2021 23:07:04 -0000
@@ -892,9 +892,11 @@ dontblock:
sbsync(&so->so_rcv, nextrecord);
if (controlp) {
if (pr->pr_domain->dom_externalize) {
+ sounlock(so, s);
error =
(*pr->pr_domain->dom_externalize)
(cm, controllen, flags);
+ s = solock(so);
}
*controlp = cm;
} else {
Index: sys/kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.148
diff -u -p -r1.148 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c 25 May 2021 22:45:09 -0000 1.148
+++ sys/kern/uipc_usrreq.c 2 Oct 2021 23:07:04 -0000
@@ -817,8 +817,6 @@ unp_externalize(struct mbuf *rights, soc
struct file *fp;
int nfds, error = 0;
- rw_assert_wrlock(&unp_lock);
-
/*
* This code only works because SCM_RIGHTS is the only supported
* control message type on unix sockets. Enforce this here.
@@ -834,7 +832,7 @@ unp_externalize(struct mbuf *rights, soc
controllen -= CMSG_ALIGN(sizeof(struct cmsghdr));
if (nfds > controllen / sizeof(int)) {
error = EMSGSIZE;
- goto restart;
+ goto out;
}
/* Make sure the recipient should be able to see the descriptors.. */
@@ -868,18 +866,13 @@ unp_externalize(struct mbuf *rights, soc
KERNEL_UNLOCK();
+ if (error)
+ goto out;
+
fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK);
-restart:
fdplock(fdp);
- if (error != 0) {
- if (nfds > 0) {
- rp = ((struct fdpass *)CMSG_DATA(cm));
- unp_discard(rp, nfds);
- }
- goto out;
- }
-
+restart:
/*
* First loop -- allocate file descriptor table slots for the
* new descriptors.
@@ -895,17 +888,19 @@ restart:
if (error == ENOSPC) {
fdexpand(p);
- error = 0;
- } else {
- /*
- * This is the error that has historically
- * been returned, and some callers may
- * expect it.
- */
- error = EMSGSIZE;
+ goto restart;
}
+
fdpunlock(fdp);
- goto restart;
+
+ /*
+ * This is the error that has historically
+ * been returned, and some callers may
+ * expect it.
+ */
+
+ error = EMSGSIZE;
+ goto out;
}
/*
@@ -924,12 +919,15 @@ restart:
rp++;
}
+ fdpunlock(fdp);
/*
* Now that adding them has succeeded, update all of the
* descriptor passing state.
*/
rp = (struct fdpass *)CMSG_DATA(cm);
+
+ rw_enter_write(&unp_lock);
for (i = 0; i < nfds; i++) {
struct unpcb *unp;
@@ -939,6 +937,7 @@ restart:
unp->unp_msgcount--;
unp_rights--;
}
+ rw_exit_write(&unp_lock);
/*
* Copy temporary array to message and adjust length, in case of
@@ -947,10 +946,19 @@ restart:
memcpy(CMSG_DATA(cm), fds, nfds * sizeof(int));
cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
rights->m_len = CMSG_LEN(nfds * sizeof(int));
- out:
- fdpunlock(fdp);
+out:
if (fds != NULL)
free(fds, M_TEMP, nfds * sizeof(int));
+
+ if (error) {
+ if (nfds > 0) {
+ rp = ((struct fdpass *)CMSG_DATA(cm));
+ rw_enter_write(&unp_lock);
+ unp_discard(rp, nfds);
+ rw_exit_write(&unp_lock);
+ }
+ }
+
return (error);
}