> On Jul 13, 2022, at 7:18 PM, Jason Thorpe <thor...@me.com> wrote:
> 
> Ok, new version.  Main differences:
Aaaaand another new version.  This:

==> Creates a knote_impl structure, private to kern_event.c, that holds
    the new lock.  I took the opportunity to move kn_influx to the
    knote_impl as well, since absolutely no one outside of kern_event.c
    should touch it.  This change is ABI-compatible.

==> Improves the comments about the locking rules.

There are no functional changes since the last patch.
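For anyone who hasn't seen the pattern before: the ABI trick is plain
structure embedding plus container_of().  Here's a minimal userland
sketch -- the "pub" names are hypothetical and not part of the patch,
and container_of() is shown with its usual definition:

#include <stddef.h>
#include <stdlib.h>

/* Sketch only: recover the enclosing structure from a member pointer. */
#define	container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct pub {				/* public, ABI-visible structure */
	int	p_state;
};

struct pub_impl {			/* private to a single .c file */
	struct pub	pi_pub;		/* embedded public part (first member) */
	unsigned int	pi_influx;	/* private field, invisible outside */
};

static struct pub *
pub_alloc(void)
{
	struct pub_impl *pi = calloc(1, sizeof(*pi));

	return pi != NULL ? &pi->pi_pub : NULL;
}

static void
pub_free(struct pub *p)
{

	free(container_of(p, struct pub_impl, pi_pub));
}

Since allocation and the pointer conversions all happen in one file,
nothing outside it ever needs to know the private fields exist.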
Index: kern/kern_event.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_event.c,v
retrieving revision 1.143
diff -u -p -r1.143 kern_event.c
--- kern/kern_event.c	13 Jul 2022 14:11:46 -0000	1.143
+++ kern/kern_event.c	17 Jul 2022 19:44:06 -0000
@@ -120,6 +120,61 @@ static void filt_userdetach(struct knote
 static int	filt_user(struct knote *, long hint);
 static int	filt_usertouch(struct knote *, struct kevent *, long type);
 
+/*
+ * Private knote state that should never be exposed outside
+ * of kern_event.c
+ *
+ * Field locking:
+ *
+ * q	kn_kq->kq_lock
+ */
+struct knote_impl {
+	struct knote	ki_knote;
+	unsigned int	ki_influx;	/* q: in-flux counter */
+	kmutex_t	ki_foplock;	/* for kn_filterops */
+};
+
+#define	KIMPL_TO_KNOTE(kip)	(&(kip)->ki_knote)
+#define	KNOTE_TO_KIMPL(knp)	container_of((knp), struct knote_impl, ki_knote)
+
+static inline struct knote *
+knote_alloc(bool sleepok)
+{
+	struct knote_impl *ki;
+
+	ki = kmem_zalloc(sizeof(*ki), sleepok ? KM_SLEEP : KM_NOSLEEP);
+	mutex_init(&ki->ki_foplock, MUTEX_DEFAULT, IPL_NONE);
+
+	return KIMPL_TO_KNOTE(ki);
+}
+
+static inline void
+knote_free(struct knote *kn)
+{
+	struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
+
+	mutex_destroy(&ki->ki_foplock);
+	kmem_free(ki, sizeof(*ki));
+}
+
+static inline void
+knote_foplock_enter(struct knote *kn)
+{
+	mutex_enter(&KNOTE_TO_KIMPL(kn)->ki_foplock);
+}
+
+static inline void
+knote_foplock_exit(struct knote *kn)
+{
+	mutex_exit(&KNOTE_TO_KIMPL(kn)->ki_foplock);
+}
+
+static inline bool
+knote_foplock_owned(struct knote *kn)
+{
+	return mutex_owned(&KNOTE_TO_KIMPL(kn)->ki_foplock);
+}
+
 static const struct fileops kqueueops = {
 	.fo_name = "kqueue",
 	.fo_read = (void *)enxio,
@@ -133,6 +188,31 @@ static const struct fileops kqueueops =
 	.fo_restart = kqueue_restart,
 };
 
+static void
+filt_nopdetach(struct knote *kn __unused)
+{
+}
+
+static int
+filt_nopevent(struct knote *kn __unused, long hint __unused)
+{
+	return 0;
+}
+
+static const struct filterops nop_fd_filtops = {
+	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
+	.f_attach = NULL,
+	.f_detach = filt_nopdetach,
+	.f_event = filt_nopevent,
+};
+
+static const struct filterops nop_filtops = {
+	.f_flags = FILTEROP_MPSAFE,
+	.f_attach = NULL,
+	.f_detach = filt_nopdetach,
+	.f_event = filt_nopevent,
+};
+
 static const struct filterops kqread_filtops = {
 	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
 	.f_attach = NULL,
@@ -232,15 +312,41 @@ static size_t user_kfiltersz; /* size
  *	-> object lock (e.g., device driver lock, &c.)
  *	-> kn_kq->kq_lock
  *
- * Locking rules:
+ * Locking rules.  ==> indicates the lock is acquired by the backing
+ * object; locks prior to ==> are acquired before calling filter ops:
+ *
+ *	f_attach: fdp->fd_lock -> knote foplock ->
+ *	    (maybe) KERNEL_LOCK ==> backing object lock
+ *
+ *	f_detach: fdp->fd_lock -> knote foplock ->
+ *	    (maybe) KERNEL_LOCK ==> backing object lock
+ *
+ *	f_event via kevent: fdp->fd_lock -> knote foplock ->
+ *	    (maybe) KERNEL_LOCK ==> backing object lock
+ *	    N.B. NOTE_SUBMIT will never be set in the "hint" argument
+ *	    in this case.
  *
- *	f_attach: fdp->fd_lock, KERNEL_LOCK
- *	f_detach: fdp->fd_lock, KERNEL_LOCK
- *	f_event(!NOTE_SUBMIT) via kevent: fdp->fd_lock, _no_ object lock
- *	f_event via knote: whatever caller guarantees
- *	Typically,	f_event(NOTE_SUBMIT) via knote: object lock
- *			f_event(!NOTE_SUBMIT) via knote: nothing,
- *				acquires/releases object lock inside.
+ *	f_event via knote (via backing object): Whatever caller guarantees.
+ *	Typically:
+ *		f_event(NOTE_SUBMIT): caller has already acquired backing
+ *		    object lock.
+ *		f_event(!NOTE_SUBMIT): caller has not acquired backing object
+ *		    lock, or has possibly acquired KERNEL_LOCK.  Backing object
+ *		    lock may or may not be acquired as-needed.
+ *	N.B. the knote foplock will **not** be acquired in this case.  The
+ *	caller guarantees that klist_fini() will not be called concurrently
+ *	with knote().
+ *
+ *	f_touch: fdp->fd_lock -> kn_kq->kq_lock (spin lock)
+ *	    N.B. knote foplock is **not** acquired in this case and
+ *	    the caller must guarantee that klist_fini() will never
+ *	    be called.  kqueue_register() restricts filters that
+ *	    provide f_touch to known-safe cases.
+ *
+ *	klist_fini(): Caller must guarantee that no more knotes can
+ *	    be attached to the klist, and must **not** hold the backing
+ *	    object's lock; klist_fini() itself will acquire the foplock
+ *	    of each knote on the klist.
  *
  * Locking rules when detaching knotes:
  *
@@ -286,7 +392,7 @@ static inline bool
 kn_in_flux(struct knote *kn)
 {
 	KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
-	return kn->kn_influx != 0;
+	return KNOTE_TO_KIMPL(kn)->ki_influx != 0;
 }
 
 static inline bool
@@ -298,8 +404,9 @@ kn_enter_flux(struct knote *kn)
 		return false;
 	}
 
-	KASSERT(kn->kn_influx < UINT_MAX);
-	kn->kn_influx++;
+	struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
+	KASSERT(ki->ki_influx < UINT_MAX);
+	ki->ki_influx++;
 	return true;
 }
 
@@ -308,14 +415,17 @@ static inline bool
 kn_leave_flux(struct knote *kn)
 {
 	KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
-	KASSERT(kn->kn_influx > 0);
-	kn->kn_influx--;
-	return kn->kn_influx == 0;
+
+	struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
+	KASSERT(ki->ki_influx > 0);
+	ki->ki_influx--;
+	return ki->ki_influx == 0;
 }
 
 static void
 kn_wait_flux(struct knote *kn, bool can_loop)
 {
+	struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
 	bool loop;
 
 	KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
@@ -325,7 +435,7 @@ kn_wait_flux(struct knote *kn, bool can_
 	 * dropping the kq_lock.  The caller has let us know in
 	 * 'can_loop'.
 	 */
-	for (loop = true; loop && kn->kn_influx != 0; loop = can_loop) {
+	for (loop = true; loop && ki->ki_influx != 0; loop = can_loop) {
 		KQ_FLUX_WAIT(kn->kn_kq);
 	}
 }
@@ -423,33 +533,31 @@ knote_detach_quiesce(struct knote *kn)
 	return false;
 }
 
-static inline struct knote *
-knote_alloc(bool sleepok)
-{
-	struct knote *kn;
-
-	kn = kmem_zalloc(sizeof(*kn), sleepok ? KM_SLEEP : KM_NOSLEEP);
-
-	return kn;
-}
-
-static inline void
-knote_free(struct knote *kn)
-{
-	kmem_free(kn, sizeof(*kn));
-}
+/*
+ * Calls into the filterops need to be resilient against things which
+ * destroy a klist, e.g. device detach, freeing a vnode, etc., to avoid
+ * chasing garbage pointers (to data, or even potentially code in a
+ * module about to be unloaded).  To that end, we acquire the
+ * knote foplock before calling into the filter ops.  When a driver
+ * (or anything else) is tearing down its klist, klist_fini() enumerates
+ * each knote, acquires its foplock, and replaces the filterops with a
+ * nop stub, allowing knote detach (when descriptors are closed) to safely
+ * proceed.
+ */
 
 static int
 filter_attach(struct knote *kn)
 {
 	int rv;
 
+	KASSERT(knote_foplock_owned(kn));
 	KASSERT(kn->kn_fop != NULL);
 	KASSERT(kn->kn_fop->f_attach != NULL);
 
 	/*
 	 * N.B. that kn->kn_fop may change as the result of calling
-	 * f_attach().
+	 * f_attach().  After f_attach() returns, kn->kn_fop may not
+	 * be modified by code outside of klist_fini().
 	 */
 	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
 		rv = kn->kn_fop->f_attach(kn);
@@ -465,6 +573,8 @@ filter_attach(struct knote *kn)
 static void
 filter_detach(struct knote *kn)
 {
+
+	KASSERT(knote_foplock_owned(kn));
 	KASSERT(kn->kn_fop != NULL);
 	KASSERT(kn->kn_fop->f_detach != NULL);
 
@@ -478,10 +588,12 @@ filter_detach(struct knote *kn)
 }
 
 static int
-filter_event(struct knote *kn, long hint)
+filter_event(struct knote *kn, long hint, bool submitting)
 {
 	int rv;
 
+	/* See knote(). */
+	KASSERT(submitting || knote_foplock_owned(kn));
 	KASSERT(kn->kn_fop != NULL);
 	KASSERT(kn->kn_fop->f_event != NULL);
 
@@ -499,6 +611,19 @@ static int
 filter_touch(struct knote *kn, struct kevent *kev, long type)
 {
+
+	/*
+	 * XXX We cannot assert that the knote foplock is held here
+	 * XXX because we cannot safely acquire it in all cases
+	 * XXX where "touch" will be used in kqueue_scan().  We just
+	 * XXX have to assume that f_touch will always be safe to call,
+	 * XXX and kqueue_register() allows only the two known-safe
+	 * XXX users of that op.
+	 */
+
+	KASSERT(kn->kn_fop != NULL);
+	KASSERT(kn->kn_fop->f_touch != NULL);
+
 	return kn->kn_fop->f_touch(kn, kev, type);
 }
 
@@ -1849,6 +1974,17 @@ kqueue_register(struct kqueue *kq, struc
 			KASSERT(kn->kn_fop != NULL);
 			/*
+			 * XXX Allow only known-safe users of f_touch.
+			 * XXX See filter_touch() for details.
+			 */
+			if (kn->kn_fop->f_touch != NULL &&
+			    kn->kn_fop != &timer_filtops &&
+			    kn->kn_fop != &user_filtops) {
+				error = ENOTSUP;
+				goto fail_ev_add;
+			}
+
+			/*
 			 * apply reference count to knote structure, and
 			 * do not release it at the end of this routine.
 			 */
@@ -1880,6 +2016,7 @@ kqueue_register(struct kqueue *kq, struc
 			 * N.B. kn->kn_fop may change as the result
 			 * of filter_attach()!
 			 */
+			knote_foplock_enter(kn);
 			error = filter_attach(kn);
 			if (error != 0) {
 #ifdef DEBUG
@@ -1893,6 +2030,7 @@ kqueue_register(struct kqueue *kq, struc
 				    ft ? ft->f_ops->fo_name : "?", error);
 #endif
+ fail_ev_add:
 				/*
 				 * N.B. no need to check for this note to
 				 * be in-flux, since it was never visible
@@ -1900,6 +2038,7 @@ kqueue_register(struct kqueue *kq, struc
 				 *
 				 * knote_detach() drops fdp->fd_lock
 				 */
+				knote_foplock_exit(kn);
 				mutex_enter(&kq->kq_lock);
 				KNOTE_WILLDETACH(kn);
 				KASSERT(kn_in_flux(kn) == false);
@@ -1957,6 +2096,7 @@ kqueue_register(struct kqueue *kq, struc
 		 * initial EV_ADD, but doing so will not reset any
 		 * filter which have already been triggered.
 		 */
+		knote_foplock_enter(kn);
 		kn->kn_kevent.udata = kev->udata;
 		KASSERT(kn->kn_fop != NULL);
 		if (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
@@ -1967,6 +2107,7 @@ kqueue_register(struct kqueue *kq, struc
 			if (__predict_false(error != 0)) {
 				/* Never a new knote (which would consume newkn). */
 				KASSERT(newkn != NULL);
+				knote_foplock_exit(kn);
 				goto doneunlock;
 			}
 		} else {
@@ -1981,10 +2122,12 @@ kqueue_register(struct kqueue *kq, struc
 	 * broken and does not return an error.
 	 */
  done_ev_add:
-	rv = filter_event(kn, 0);
+	rv = filter_event(kn, 0, false);
 	if (rv)
 		knote_activate(kn);
 
+	knote_foplock_exit(kn);
+
 	/* disable knote */
 	if ((kev->flags & EV_DISABLE)) {
 		mutex_spin_enter(&kq->kq_lock);
@@ -2256,7 +2399,9 @@ relock:
 		if ((kn->kn_flags & EV_ONESHOT) == 0) {
 			mutex_spin_exit(&kq->kq_lock);
 			KASSERT(mutex_owned(&fdp->fd_lock));
-			rv = filter_event(kn, 0);
+			knote_foplock_enter(kn);
+			rv = filter_event(kn, 0, false);
+			knote_foplock_exit(kn);
 			mutex_spin_enter(&kq->kq_lock);
 			/* Re-poll if note was re-enqueued. */
 			if ((kn->kn_status & KN_QUEUED) != 0) {
@@ -2615,7 +2760,15 @@ knote(struct klist *list, long hint)
 	struct knote *kn, *tmpkn;
 
 	SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
-		if (filter_event(kn, hint)) {
+		/*
+		 * We assume here that the backing object's lock is
+		 * already held if we're traversing the klist, and
+		 * so acquiring the knote foplock would create a
+		 * deadlock scenario.  But we also know that the klist
+		 * won't disappear on us while we're here, so not
+		 * acquiring it is safe.
+		 */
+		if (filter_event(kn, hint, true)) {
 			knote_activate(kn);
 		}
 	}
@@ -2664,7 +2817,9 @@ knote_detach(struct knote *kn, filedesc_
 
 	/* Remove from monitored object. */
 	if (dofop) {
+		knote_foplock_enter(kn);
 		filter_detach(kn);
+		knote_foplock_exit(kn);
 	}
 
 	/* Remove from descriptor table. */
@@ -2829,7 +2984,26 @@ klist_init(struct klist *list)
 void
 klist_fini(struct klist *list)
 {
-	/* Nothing, for now. */
+	struct knote *kn;
+
+	/*
+	 * Neuter all existing knotes on the klist because the list is
+	 * being destroyed.  The caller has guaranteed that no additional
+	 * knotes will be added to the list, that the backing object's
+	 * locks are not held (otherwise there is a locking order issue
+	 * with acquiring the knote foplock), and that we can traverse
+	 * the list safely in this state.
+	 */
+	SLIST_FOREACH(kn, list, kn_selnext) {
+		knote_foplock_enter(kn);
+		KASSERT(kn->kn_fop != NULL);
+		if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
+			kn->kn_fop = &nop_fd_filtops;
+		} else {
+			kn->kn_fop = &nop_filtops;
+		}
+		knote_foplock_exit(kn);
+	}
 }
 
 /*
Index: sys/event.h
===================================================================
RCS file: /cvsroot/src/sys/sys/event.h,v
retrieving revision 1.53
diff -u -p -r1.53 event.h
--- sys/event.h	13 Jul 2022 14:11:46 -0000	1.53
+++ sys/event.h	17 Jul 2022 19:44:06 -0000
@@ -262,7 +262,6 @@ struct knote {
 	struct kfilter	*kn_kfilter;
 	void		*kn_hook;
 	int		kn_hookid;
-	unsigned int	kn_influx;	/* q: in-flux counter */
 
 #define	KN_ACTIVE	0x01U		/* event has been triggered */
 #define	KN_QUEUED	0x02U		/* event is on queue */
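One consequence worth spelling out for backing-object owners (drivers,
vnodes, &c.): klist_fini() now acquires each knote's foplock, so the
object's own lock must *not* be held across the call.  A hedged
teardown sketch -- the softc and its fields are hypothetical, not from
the patch:

#include <sys/param.h>
#include <sys/mutex.h>
#include <sys/event.h>

struct xyz_softc {			/* hypothetical backing object */
	kmutex_t	sc_lock;	/* backing object lock */
	struct klist	sc_klist;	/* knotes attached to this object */
	bool		sc_dying;	/* set once detach begins */
};

static void
xyz_detach(struct xyz_softc *sc)
{

	/* Stop generating new events, under the object lock. */
	mutex_enter(&sc->sc_lock);
	sc->sc_dying = true;
	mutex_exit(&sc->sc_lock);

	/*
	 * Neuter any remaining knotes.  Per the locking rules above,
	 * sc_lock must NOT be held here: klist_fini() takes each
	 * knote's foplock, and the filter ops take the foplock
	 * before the backing object lock.
	 */
	klist_fini(&sc->sc_klist);

	mutex_destroy(&sc->sc_lock);
}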
-- thorpej