This diff introduces a mutex that serializes access to the kernel message buffers. At the moment, the buffers do not have clear concurrency control.
The mutex controls access to all the modifiable fields of struct msgbuf. It also protects logsoftc.sc_state for managing thread wakeups. A tricky thing with the diff is that the code is indirectly used in many different contexts and in varied machine-specific functions. Luckily, the code already utilizes splhigh() in critical parts, so the transition to using a mutex possibly is not such a big step any longer. To avoid creating problems with lock order, the code does not take new locks when the log mutex is held. This is visible in logwakeup() and logread(). It looks that selrecord() does not acquire new locks, so no extra steps are needed in logpoll(). In logread(), the code assumes that there is only one reader. This keeps the data extraction logic simple. An additional benefit (?) of the diff is that now uiomove(9) is called without splhigh(). I have tested the diff on amd64 and octeon. However, additional tests on these and other platforms would be very helpful. OK? Index: kern/subr_log.c =================================================================== RCS file: src/sys/kern/subr_log.c,v retrieving revision 1.68 diff -u -p -r1.68 subr_log.c --- kern/subr_log.c 18 Aug 2020 13:41:49 -0000 1.68 +++ kern/subr_log.c 18 Aug 2020 14:10:25 -0000 @@ -52,6 +52,7 @@ #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/fcntl.h> +#include <sys/mutex.h> #include <sys/timeout.h> #ifdef KTRACE @@ -69,8 +70,12 @@ #define LOG_ASYNC 0x04 #define LOG_RDWAIT 0x08 +/* + * Locking: + * L log_mtx + */ struct logsoftc { - int sc_state; /* see above for possibilities */ + int sc_state; /* [L] see above for possibilities */ struct selinfo sc_selp; /* process waiting on select call */ struct sigio_ref sc_sigio; /* async I/O registration */ int sc_need_wakeup; /* if set, wake up waiters */ @@ -83,6 +88,14 @@ struct msgbuf *msgbufp; /* the mapped b struct msgbuf *consbufp; /* console message buffer. */ struct file *syslogf; +/* + * Lock that serializes access to log message buffers. + * This should be kept as a leaf lock in order not to constrain where + * printf(9) can be used. + */ +struct mutex log_mtx = + MUTEX_INITIALIZER_FLAGS(IPL_HIGH, "logmtx", MTX_NOWITNESS); + void filt_logrdetach(struct knote *kn); int filt_logread(struct knote *kn, long hint); @@ -144,13 +157,11 @@ initconsbuf(void) void msgbuf_putchar(struct msgbuf *mbp, const char c) { - int s; - if (mbp->msg_magic != MSG_MAGIC) /* Nothing we can do */ return; - s = splhigh(); + mtx_enter(&log_mtx); mbp->msg_bufc[mbp->msg_bufx++] = c; if (mbp->msg_bufx < 0 || mbp->msg_bufx >= mbp->msg_bufs) mbp->msg_bufx = 0; @@ -160,20 +171,19 @@ msgbuf_putchar(struct msgbuf *mbp, const mbp->msg_bufr = 0; mbp->msg_bufd++; } - splx(s); + mtx_leave(&log_mtx); } size_t msgbuf_getlen(struct msgbuf *mbp) { long len; - int s; - s = splhigh(); + mtx_enter(&log_mtx); len = mbp->msg_bufx - mbp->msg_bufr; if (len < 0) len += mbp->msg_bufs; - splx(s); + mtx_leave(&log_mtx); return (len); } @@ -208,34 +218,47 @@ logclose(dev_t dev, int flag, int mode, int logread(dev_t dev, struct uio *uio, int flag) { + struct sleep_state sls; struct msgbuf *mbp = msgbufp; - size_t l; - int s, error = 0; + size_t l, rpos; + int error = 0; - s = splhigh(); + mtx_enter(&log_mtx); while (mbp->msg_bufr == mbp->msg_bufx) { if (flag & IO_NDELAY) { error = EWOULDBLOCK; goto out; } logsoftc.sc_state |= LOG_RDWAIT; - error = tsleep_nsec(mbp, LOG_RDPRI | PCATCH, "klog", INFSLP); + mtx_leave(&log_mtx); + /* + * Set up and enter sleep manually instead of using msleep() + * to keep log_mtx as a leaf lock. + */ + sleep_setup(&sls, mbp, LOG_RDWAIT | PCATCH, "klog"); + sleep_setup_signal(&sls); + sleep_finish(&sls, logsoftc.sc_state & LOG_RDWAIT); + error = sleep_finish_signal(&sls); + mtx_enter(&log_mtx); if (error) goto out; } - logsoftc.sc_state &= ~LOG_RDWAIT; if (mbp->msg_bufd > 0) { char buf[64]; + long ndropped; + ndropped = mbp->msg_bufd; + mtx_leave(&log_mtx); l = snprintf(buf, sizeof(buf), "<%d>klog: dropped %ld byte%s, message buffer full\n", - LOG_KERN|LOG_WARNING, mbp->msg_bufd, - mbp->msg_bufd == 1 ? "" : "s"); + LOG_KERN|LOG_WARNING, ndropped, + ndropped == 1 ? "" : "s"); error = uiomove(buf, ulmin(l, sizeof(buf) - 1), uio); + mtx_enter(&log_mtx); if (error) goto out; - mbp->msg_bufd = 0; + mbp->msg_bufd -= ndropped; } while (uio->uio_resid > 0) { @@ -246,7 +269,11 @@ logread(dev_t dev, struct uio *uio, int l = ulmin(l, uio->uio_resid); if (l == 0) break; - error = uiomove(&mbp->msg_bufc[mbp->msg_bufr], l, uio); + rpos = mbp->msg_bufr; + mtx_leave(&log_mtx); + /* Ignore that concurrent readers may consume the same data. */ + error = uiomove(&mbp->msg_bufc[rpos], l, uio); + mtx_enter(&log_mtx); if (error) break; mbp->msg_bufr += l; @@ -254,23 +281,23 @@ logread(dev_t dev, struct uio *uio, int mbp->msg_bufr = 0; } out: - splx(s); + mtx_leave(&log_mtx); return (error); } int logpoll(dev_t dev, int events, struct proc *p) { - int s, revents = 0; + int revents = 0; - s = splhigh(); + mtx_enter(&log_mtx); if (events & (POLLIN | POLLRDNORM)) { if (msgbufp->msg_bufr != msgbufp->msg_bufx) revents |= events & (POLLIN | POLLRDNORM); else selrecord(p, &logsoftc.sc_selp); } - splx(s); + mtx_leave(&log_mtx); return (revents); } @@ -339,6 +366,8 @@ logwakeup(void) void logtick(void *arg) { + int state; + if (!log_open) return; @@ -348,17 +377,20 @@ logtick(void *arg) /* * sc_need_wakeup has to be cleared before handling the wakeup. - * This ensures that no wakeup is lost. + * Visiting log_mtx ensures the proper order. */ - membar_enter(); + + mtx_enter(&log_mtx); + state = logsoftc.sc_state; + if (logsoftc.sc_state & LOG_RDWAIT) + logsoftc.sc_state &= ~LOG_RDWAIT; + mtx_leave(&log_mtx); selwakeup(&logsoftc.sc_selp); - if (logsoftc.sc_state & LOG_ASYNC) + if (state & LOG_ASYNC) pgsigio(&logsoftc.sc_sigio, SIGIO, 0); - if (logsoftc.sc_state & LOG_RDWAIT) { + if (state & LOG_RDWAIT) wakeup(msgbufp); - logsoftc.sc_state &= ~LOG_RDWAIT; - } out: timeout_add_msec(&logsoftc.sc_tick, LOG_TICK); } @@ -380,10 +412,12 @@ logioctl(dev_t dev, u_long com, caddr_t break; case FIOASYNC: + mtx_enter(&log_mtx); if (*(int *)data) logsoftc.sc_state |= LOG_ASYNC; else logsoftc.sc_state &= ~LOG_ASYNC; + mtx_leave(&log_mtx); break; case FIOSETOWN: Index: sys/msgbuf.h =================================================================== RCS file: src/sys/sys/msgbuf.h,v retrieving revision 1.12 diff -u -p -r1.12 msgbuf.h --- sys/msgbuf.h 18 Aug 2020 13:38:24 -0000 1.12 +++ sys/msgbuf.h 18 Aug 2020 14:10:25 -0000 @@ -32,14 +32,20 @@ * @(#)msgbuf.h 8.1 (Berkeley) 6/2/93 */ +/* + * Locking: + * I immutable after creation + * L log_mtx + * Lw log_mtx for writing + */ struct msgbuf { #define MSG_MAGIC 0x063061 - long msg_magic; - long msg_bufx; /* write pointer */ - long msg_bufr; /* read pointer */ - long msg_bufs; /* real msg_bufc size (bytes) */ - long msg_bufd; /* number of dropped bytes */ - char msg_bufc[1]; /* buffer */ + long msg_magic; /* [I] buffer magic value */ + long msg_bufx; /* [L] write pointer */ + long msg_bufr; /* [L] read pointer */ + long msg_bufs; /* [I] real msg_bufc size (bytes) */ + long msg_bufd; /* [L] number of dropped bytes */ + char msg_bufc[1]; /* [Lw] buffer */ }; #ifdef _KERNEL #define CONSBUFSIZE (16 * 1024) /* console message buffer size */