The diff below is the latest version of my kqueue diff to allow grabbing
the solock(), and possibly sleeping, inside kqueue_scan().
Sleeping in kqueue_scan() is not a problem per se: threads already do it
when no events are available, in which case `kq_count' is 0. When this
happens, a thread going to sleep first removes its own marker from the
given kqueue's TAILQ.
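For reference, this existing sleeping path looks roughly like this (a
trimmed sketch of kqueue_scan(), error handling omitted):

retry:
	s = splhigh();
	if (kq->kq_count == 0) {
		/*
		 * Our marker is not on the queue at this point, so
		 * sleeping here cannot stall another thread's scan.
		 */
		kq->kq_state |= KQ_SLEEP;
		error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout);
		splx(s);
		if (error == 0)
			goto retry;
		goto done;
	}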
The diff below allows threads to sleep *inside* the event collection
loop. For that, markers are now flagged with EVFILT_MARKER and threads
are taught how to skip other threads' markers.
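Schematically, when tA's collection loop finds tB's marker at the head
of the queue, it moves the knote sitting behind that marker in front of
it, instead of waiting for tB:

	kq_head -> [tB marker] -> [kn] -> ... -> [tA marker]

becomes, after one iteration of tA's loop:

	kq_head -> [kn] -> [tB marker] -> ... -> [tA marker]

so the next iteration can collect [kn] as usual.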
Now if a thread, tA, is sleeping inside `f_event' it "owns" the
corresponding knote. That means the knote is no longer present in the
kqueue's TAILQ and `kq_count' no longer accounts for it.
However a sibling thread, tB, could decide it's a good time to close the
file descriptor associated with this knote. So, to make sure tA has
finished processing the knote before tB frees it, we use the
KN_PROCESSING flag. That means fdrelease() will sleep until it can
acquire the knote.
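The release side of that handshake looks roughly like this (a sketch
following the DragonflyBSD logic, not the exact body from the diff; the
0 return value is a placeholder):

int
knote_release(struct knote *kn)
{
	KASSERT(kn->kn_status & KN_PROCESSING);

	if (kn->kn_status & KN_WAITING) {
		kn->kn_status &= ~KN_WAITING;
		/* Wake threads sleeping in knote_acquire(). */
		wakeup(kn);
	}
	kn->kn_status &= ~KN_PROCESSING;

	return (0);
}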
This logic is taken from DragonflyBSD, where it is used as part of a
finer-grained locking scheme. We still rely on the KERNEL_LOCK() for
serializing accesses to the data structures.
The sys_close() vs sys_kevent() race described above has been triggered
by abieber@ with www/gitea and homestead using a previous version of
this diff. Base doesn't seem to be good enough to exercise it.
The second part of the diff below enables the solock() inside the
filters.
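To illustrate what that means for a socket filter, the read filter ends
up with roughly this shape (hypothetical sketch: the solock()/sounlock()
calls are schematic, and I'm assuming a NOTE_SUBMIT-style hint to skip
locking when the event is submitted from the socket layer, which already
holds the lock):

int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	/* May now sleep: the scanning thread owns the knote. */
	if ((hint & NOTE_SUBMIT) == 0)
		solock(so);
	kn->kn_data = so->so_rcv.sb_cc;
	rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	if ((hint & NOTE_SUBMIT) == 0)
		sounlock(so);

	return (rv);
}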
Comments and tests welcome.
Index: kern/kern_event.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.81
diff -u -p -r1.81 kern_event.c
--- kern/kern_event.c	11 Oct 2017 08:06:56 -0000	1.81
+++ kern/kern_event.c	11 Oct 2017 10:40:10 -0000
@@ -84,6 +84,8 @@ void knote_attach(struct knote *kn, stru
void knote_drop(struct knote *kn, struct proc *p, struct filedesc *fdp);
void knote_enqueue(struct knote *kn);
void knote_dequeue(struct knote *kn);
+int knote_acquire(struct knote *kn);
+int knote_release(struct knote *kn);
#define knote_alloc() ((struct knote *)pool_get(&knote_pool, PR_WAITOK))
#define knote_free(kn) pool_put(&knote_pool, (kn))
@@ -759,27 +761,43 @@ start:
goto done;
}
+ marker.kn_filter = EVFILT_MARKER;
+ marker.kn_status = KN_PROCESSING;
TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe);
while (count) {
kn = TAILQ_FIRST(&kq->kq_head);
if (kn == &marker) {
- TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
+ TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe);
splx(s);
if (count == maxevents)
goto retry;
goto done;
}
+ if (kn->kn_filter == EVFILT_MARKER) {
+ struct knote *other_marker = kn;
+
+ /* Move some other thread's marker past this kn */
+ kn = TAILQ_NEXT(other_marker, kn_tqe);
+ TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
+ TAILQ_INSERT_BEFORE(other_marker, kn, kn_tqe);
+ continue;
+ }
+
+ if (!knote_acquire(kn))
+ continue;
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
kq->kq_count--;
if (kn->kn_status & KN_DISABLED) {
kn->kn_status &= ~KN_QUEUED;
+ knote_release(kn);
continue;
}
if ((kn->kn_flags & EV_ONESHOT) == 0 &&
kn->kn_fop->f_event(kn, 0) == 0) {
kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
+ knote_release(kn);
continue;
}
*kevp = kn->kn_kevent;
@@ -799,9 +817,11 @@ start:
if (kn->kn_flags & EV_DISPATCH)
kn->kn_status |= KN_DISABLED;
kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
+ knote_release(kn);
} else {
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
kq->kq_count++;
+ knote_release(kn);
}
count--;
if (nkev == KQ_NEVENTS) {
@@ -956,6 +976,41 @@ kqueue_wakeup(struct kqueue *kq)
}
/*
+ * Acquire a knote, return non-zero on success, 0 on failure.
+ *
+ * If we cannot acquire the knote we sleep and return 0. The knote
+ * may be stale on return in this case and the caller must restart
+ * whatever loop they are in.
+ */
+int
+knote_acquire(struct knote *kn)
+{
+ if (kn->kn_status & KN_PROCESSING) {
+ kn->kn_status |= KN_WAITING;
+ tsleep(kn, 0, "kqepts", hz);
+ /* knote may be stale now */
+