The new poll/select(2) implementation converts 'struct pollfd' and 'fd_set'
entries to knotes (kqueue event descriptors) and then passes them to the
kqueue subsystem. A knote is allocated, with kqueue_register(), for every
read, write and except condition watched on a given FD. That means at most
3 allocations might be necessary per FD.
The diff below reduces the overhead of the per-syscall allocation/free of
those descriptors by leaving those which didn't trigger on the kqueue
across syscalls. Leaving knotes on the kqueue allows kqueue_register() to
re-use an existing descriptor instead of allocating a new one.
With this, knotes are now lazily removed. The mechanism uses a serial
number, incremented on every syscall, that indicates whether a knote
sitting in the kqueue is still valid or should be freed.
Note that performance improvements might not be visible with this diff
alone because kqueue_register() still pre-allocates a descriptor and then
drops it.
visa@ already pointed out that the lazy-removal logic could be integrated
into kqueue_scan(), which would reduce the complexity of those two
syscalls. I'm arguing for doing this as a next step in-tree.
Please test and review :)
Index: kern/sys_generic.c
===================================================================
RCS file: /cvs/src/sys/kern/sys_generic.c,v
retrieving revision 1.139
diff -u -p -r1.139 sys_generic.c
--- kern/sys_generic.c 29 Oct 2021 15:52:44 -0000 1.139
+++ kern/sys_generic.c 5 Nov 2021 08:11:05 -0000
@@ -598,7 +598,7 @@ sys_pselect(struct proc *p, void *v, reg
int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
- struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
+ struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
{
struct kqueue_scan_state scan;
fd_mask bits[6];
@@ -666,10 +666,10 @@ dopselect(struct proc *p, int nd, fd_set
if (nevents == 0 && ncollected == 0) {
uint64_t nsecs = INFSLP;
- if (timeout != NULL) {
- if (!timespecisset(timeout))
+ if (tsp != NULL) {
+ if (!timespecisset(tsp))
goto done;
- nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
+ nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP));
}
error = tsleep_nsec(&p->p_kq, PSOCK | PCATCH, "kqsel", nsecs);
/* select is not restarted after signals... */
@@ -682,28 +682,37 @@ dopselect(struct proc *p, int nd, fd_set
/* Collect at most `nevents' possibly waiting in kqueue_scan() */
kqueue_scan_setup(&scan, p->p_kq);
- while (nevents > 0) {
+ while ((nevents - ncollected) > 0) {
struct kevent kev[KQ_NEVENTS];
int i, ready, count;
- /* Maximum number of events per iteration */
- count = MIN(nitems(kev), nevents);
- ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
+ /*
+ * Maximum number of events per iteration. Use the whole
+ * array to gather as many spurious events as possible.
+ */
+ count = nitems(kev);
+ ready = kqueue_scan(&scan, count, kev, tsp, p, &error);
#ifdef KTRACE
if (KTRPOINT(p, KTR_STRUCT))
ktrevent(p, kev, ready);
#endif
- /* Convert back events that are ready. */
+ /* Convert back events that are ready/delete spurious ones. */
for (i = 0; i < ready && error == 0; i++)
error = pselcollect(p, &kev[i], pobits, &ncollected);
+
/*
- * Stop if there was an error or if we had enough
- * space to collect all events that were ready.
+ * Stop if there was an error or if we had enough space
+ * to collect all non-spurious events that were ready.
*/
- if (error || ready < count)
+ if (error || !ready || (ncollected > 0 && ready < count))
break;
- nevents -= ready;
+ /*
+ * If we only got spurious events try again repositioning
+ * the marker.
+ */
+ if (ncollected == 0 && ((tsp == NULL) || timespecisset(tsp)))
+ scan.kqs_nevent = 0;
}
kqueue_scan_finish(&scan);
*retval = ncollected;
@@ -730,7 +739,7 @@ done:
if (pibits[0] != (fd_set *)&bits[0])
free(pibits[0], M_TEMP, 6 * ni);
- kqueue_purge(p, p->p_kq);
+ /* Needed to remove events lazily. */
p->p_kq_serial += nd;
return (error);
@@ -759,7 +768,7 @@ pselregister(struct proc *p, fd_set *pib
DPRINTFN(2, "select fd %d mask %d serial %lu\n",
fd, msk, p->p_kq_serial);
EV_SET(&kev, fd, evf[msk],
- EV_ADD|EV_ENABLE|EV_ONESHOT|__EV_POLL,
+ EV_ADD|EV_ENABLE|__EV_POLL,
evff[msk], 0, (void *)(p->p_kq_serial));
#ifdef KTRACE
if (KTRPOINT(p, KTR_STRUCT))
@@ -806,8 +815,9 @@ pselcollect(struct proc *p, struct keven
{
/* Filter out and lazily delete spurious events */
if ((unsigned long)kevp->udata != p->p_kq_serial) {
- DPRINTFN(0, "select fd %u mismatched serial %lu\n",
- (int)kevp->ident, p->p_kq_serial);
+ DPRINTFN(0, "select fd %u lazily deleted (%lu != %lu)\n",
+ (int)kevp->ident, p->p_kq_serial,
+ (unsigned long)kevp->udata);
kevp->flags = EV_DISABLE|EV_DELETE;
kqueue_register(p->p_kq, kevp, p);
return (0);
@@ -816,6 +826,8 @@ pselcollect(struct proc *p, struct keven
if (kevp->flags & EV_ERROR) {
DPRINTFN(2, "select fd %d filt %d error %d\n",
(int)kevp->ident, kevp->filter, (int)kevp->data);
+ kevp->flags = EV_DISABLE|EV_DELETE;
+ kqueue_register(p->p_kq, kevp, p);
return (kevp->data);
}
@@ -1001,14 +1013,14 @@ ppollregister(struct proc *p, struct pol
kevp = kev;
if (pl[i].events & (POLLIN | POLLRDNORM)) {
EV_SET(kevp, pl[i].fd, EVFILT_READ,
- EV_ADD|EV_ENABLE|EV_ONESHOT|__EV_POLL, 0, 0,
+ EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
(void *)(p->p_kq_serial + i));
nkev++;
kevp++;
}
if (pl[i].events & (POLLOUT | POLLWRNORM)) {
EV_SET(kevp, pl[i].fd, EVFILT_WRITE,
- EV_ADD|EV_ENABLE|EV_ONESHOT|__EV_POLL, 0, 0,
+ EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
(void *)(p->p_kq_serial + i));
nkev++;
kevp++;
@@ -1124,7 +1136,7 @@ sys_ppoll(struct proc *p, void *v, regis
int
doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
- struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
+ struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
{
struct kqueue_scan_state scan;
struct pollfd pfds[4], *pl = pfds;
@@ -1164,10 +1176,10 @@ doppoll(struct proc *p, struct pollfd *f
if (nevents == 0 && ncollected == 0) {
uint64_t nsecs = INFSLP;
- if (timeout != NULL) {
- if (!timespecisset(timeout))
+ if (tsp != NULL) {
+ if (!timespecisset(tsp))
goto done;
- nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
+ nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP));
}
error = tsleep_nsec(&p->p_kq, PSOCK | PCATCH, "kqpoll", nsecs);
@@ -1180,29 +1192,37 @@ doppoll(struct proc *p, struct pollfd *f
/* Collect at most `nevents' possibly waiting in kqueue_scan() */
kqueue_scan_setup(&scan, p->p_kq);
- while (nevents > 0) {
+ while ((nevents - ncollected) > 0) {
struct kevent kev[KQ_NEVENTS];
int i, ready, count;
- /* Maximum number of events per iteration */
- count = MIN(nitems(kev), nevents);
- ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
+ /*
+ * Maximum number of events per iteration. Use the whole
+ * array to gather as many spurious events as possible.
+ */
+ count = nitems(kev);
+ ready = kqueue_scan(&scan, count, kev, tsp, p, &error);
#ifdef KTRACE
if (KTRPOINT(p, KTR_STRUCT))
ktrevent(p, kev, ready);
#endif
- /* Convert back events that are ready. */
+ /* Convert back events that are ready/delete spurious ones. */
for (i = 0; i < ready; i++)
ncollected += ppollcollect(p, &kev[i], pl, nfds);
/*
- * Stop if there was an error or if we had enough
- * place to collect all events that were ready.
+ * Stop if there was an error or if we had enough space
+ * to collect all non-spurious events that were ready.
*/
- if (error || ready < count)
+ if (error || !ready || (ncollected > 0 && ready < count))
break;
- nevents -= ready;
+ /*
+ * If we only got spurious events try again repositioning
+ * the marker.
+ */
+ if (ncollected == 0 && ((tsp == NULL) || timespecisset(tsp)))
+ scan.kqs_nevent = 0;
}
kqueue_scan_finish(&scan);
*retval = ncollected;
@@ -1230,7 +1250,7 @@ bad:
if (pl != pfds)
free(pl, M_TEMP, sz);
- kqueue_purge(p, p->p_kq);
+ /* Needed to remove events lazily. */
p->p_kq_serial += nfds;
return (error);
@@ -1248,17 +1268,11 @@ ppollcollect(struct proc *p, struct keve
/* Extract poll array index */
i = (unsigned long)kevp->udata - p->p_kq_serial;
- /*
- * Lazily delete spurious events.
- *
- * This should not happen as long as kqueue_purge() is called
- * at the end of every syscall. It migh be interesting to do
- * like DragonFlyBSD and not always allocated a new knote in
- * kqueue_register() with that lazy removal makes sense.
- */
+ /* Filter out and lazily delete spurious events */
if (i >= nfds) {
- DPRINTFN(0, "poll get out of range udata %lu vs serial %lu\n",
- (unsigned long)kevp->udata, p->p_kq_serial);
+ DPRINTFN(0, "poll fd %d lazily deleted (%lu != %lu)\n",
+ (int)kevp->ident, p->p_kq_serial,
+ (unsigned long)kevp->udata);
kevp->flags = EV_DISABLE|EV_DELETE;
kqueue_register(p->p_kq, kevp, p);
return (0);
@@ -1266,6 +1280,8 @@ ppollcollect(struct proc *p, struct keve
if ((int)kevp->ident != pl[i].fd) {
DPRINTFN(0, "poll get %lu/%d mismatch fd %u!=%d serial %lu\n",
i+1, nfds, (int)kevp->ident, pl[i].fd, p->p_kq_serial);
+ kevp->flags = EV_DISABLE|EV_DELETE;
+ kqueue_register(p->p_kq, kevp, p);
return (0);
}
@@ -1309,9 +1325,10 @@ ppollcollect(struct proc *p, struct keve
KASSERT(0);
}
- DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n",
- i+1, nfds, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata,
- kevp->filter);
+ DPRINTFN(1, "poll get %lu/%d fd %d events %02x revents %02x serial %lu"
+ " filt %d flags %x\n", i+1, nfds, pl[i].fd, pl[i].events,
+ pl[i].revents, (unsigned long)kevp->udata, kevp->filter,
+ kevp->flags);
if (!already_seen && (pl[i].revents != 0))
return (1);