On Sat, Dec 25, 2021 at 01:23:09PM +0000, Visa Hankala wrote:
> Here is a revised version of the patch.
>
> A number of fixes to event filter routines have already been committed.
>
> Changes to the previous version:
>
> * Prevent excessive use of kernel memory with poll(2). Now the code
>   follows how many knotes are registered. If the number looks high,
>   kqpoll_done() removes knotes eagerly. The condition could include
>   `num' as well, but I think the heuristic is good enough already.
>
> * Set forcehup anew on each iteration in ppollregister().
>
> Please test.
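
To make the quoted heuristic easier to follow: below is a minimal userland
sketch of the check the diff adds to kqpoll_done().  The struct and function
names (fake_kq, should_purge) are made up for illustration; only the
comparison against 4 * knlistsize mirrors the patch.

#include <stdio.h>

struct fake_kq {
	unsigned int	nknotes;	/* # of registered knotes */
	int		knlistsize;	/* size of the fd-indexed knote list */
};

static int
should_purge(const struct fake_kq *kq)
{
	/* Same condition as in the patch: kq_nknotes > 4 * kq_knlistsize. */
	return (kq->nknotes > 4 * (unsigned int)kq->knlistsize);
}

int
main(void)
{
	struct fake_kq kq = { .nknotes = 0, .knlistsize = 8 };
	unsigned int i;

	/* Register knotes until the heuristic decides to purge. */
	for (i = 1; i <= 64; i++) {
		kq.nknotes = i;
		if (should_purge(&kq)) {
			printf("purge at %u knotes (knlistsize %d)\n",
			    i, kq.knlistsize);
			break;
		}
	}
	return (0);
}

The effect, as described above, is that the per-thread kqueue gets flushed
once the knote count looks disproportionate to the knote list size.
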
I did run a full regress on amd64 with it. regress/sys/kern/poll fails. http://bluhm.genua.de/regress/results/2021-12-25T15%3A50%3A25Z/logs/sys/kern/poll/make.log But this test started failing a day before, so it may be unrelated to the diff. run-regress-poll_iocond-pty and run-regress-poll_iocond-socket-tcp are unreliable now. run-regress-poll_iocond-socket-unix hangs for an hour and is aborted. bluhm > Index: kern/kern_event.c > =================================================================== > RCS file: src/sys/kern/kern_event.c,v > retrieving revision 1.178 > diff -u -p -r1.178 kern_event.c > --- kern/kern_event.c 25 Dec 2021 11:04:58 -0000 1.178 > +++ kern/kern_event.c 25 Dec 2021 13:09:06 -0000 > @@ -221,6 +221,7 @@ KQRELE(struct kqueue *kq) > } > > KASSERT(TAILQ_EMPTY(&kq->kq_head)); > + KASSERT(kq->kq_nknotes == 0); > > free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize * > sizeof(struct knlist)); > @@ -451,7 +452,7 @@ filt_proc(struct knote *kn, long hint) > kev.fflags = kn->kn_sfflags; > kev.data = kn->kn_id; /* parent */ > kev.udata = kn->kn_udata; /* preserve udata */ > - error = kqueue_register(kq, &kev, NULL); > + error = kqueue_register(kq, &kev, 0, NULL); > if (error) > kn->kn_fflags |= NOTE_TRACKERR; > } > @@ -814,11 +815,15 @@ void > kqpoll_done(unsigned int num) > { > struct proc *p = curproc; > + struct kqueue *kq = p->p_kq; > > KASSERT(p->p_kq != NULL); > KASSERT(p->p_kq_serial + num >= p->p_kq_serial); > > p->p_kq_serial += num; > + > + if (kq->kq_nknotes > 4 * kq->kq_knlistsize) > + kqueue_purge(p, kq); > } > > void > @@ -944,7 +949,7 @@ sys_kevent(struct proc *p, void *v, regi > for (i = 0; i < n; i++) { > kevp = &kev[i]; > kevp->flags &= ~EV_SYSFLAGS; > - error = kqueue_register(kq, kevp, p); > + error = kqueue_register(kq, kevp, 0, p); > if (error || (kevp->flags & EV_RECEIPT)) { > if (SCARG(uap, nevents) != 0) { > kevp->flags = EV_ERROR; > @@ -1040,7 +1045,8 @@ bad: > #endif > > int > -kqueue_register(struct kqueue *kq, struct kevent *kev, struct proc *p) > +kqueue_register(struct kqueue *kq, struct kevent *kev, unsigned int pollid, > + struct proc *p) > { > struct filedesc *fdp = kq->kq_fdp; > const struct filterops *fops = NULL; > @@ -1049,6 +1055,8 @@ kqueue_register(struct kqueue *kq, struc > struct knlist *list = NULL; > int active, error = 0; > > + KASSERT(pollid == 0 || (p != NULL && p->p_kq == kq)); > + > if (kev->filter < 0) { > if (kev->filter + EVFILT_SYSCOUNT < 0) > return (EINVAL); > @@ -1096,7 +1104,8 @@ again: > if (list != NULL) { > SLIST_FOREACH(kn, list, kn_link) { > if (kev->filter == kn->kn_filter && > - kev->ident == kn->kn_id) { > + kev->ident == kn->kn_id && > + pollid == kn->kn_pollid) { > if (!knote_acquire(kn, NULL, 0)) { > /* knote_acquire() has released > * kq_lock. 
*/ > @@ -1141,6 +1150,7 @@ again: > kev->fflags = 0; > kev->data = 0; > kn->kn_kevent = *kev; > + kn->kn_pollid = pollid; > > knote_attach(kn); > mtx_leave(&kq->kq_lock); > @@ -1905,6 +1915,7 @@ knote_attach(struct knote *kn) > list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; > } > SLIST_INSERT_HEAD(list, kn, kn_link); > + kq->kq_nknotes++; > } > > void > @@ -1916,6 +1927,7 @@ knote_detach(struct knote *kn) > MUTEX_ASSERT_LOCKED(&kq->kq_lock); > KASSERT(kn->kn_status & KN_PROCESSING); > > + kq->kq_nknotes--; > if (kn->kn_fop->f_flags & FILTEROP_ISFD) > list = &kq->kq_knlist[kn->kn_id]; > else > Index: kern/sys_generic.c > =================================================================== > RCS file: src/sys/kern/sys_generic.c,v > retrieving revision 1.146 > diff -u -p -r1.146 sys_generic.c > --- kern/sys_generic.c 11 Dec 2021 09:28:26 -0000 1.146 > +++ kern/sys_generic.c 25 Dec 2021 13:09:06 -0000 > @@ -81,8 +81,9 @@ int kqpoll_debug = 0; > > int pselregister(struct proc *, fd_set *[], fd_set *[], int, int *, int *); > int pselcollect(struct proc *, struct kevent *, fd_set *[], int *); > +void ppollregister(struct proc *, struct pollfd *, int, int *, int *); > +int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int); > > -void pollscan(struct proc *, struct pollfd *, u_int, register_t *); > int pollout(struct pollfd *, struct pollfd *, u_int); > int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *, > struct timespec *, const sigset_t *, register_t *); > @@ -768,7 +769,7 @@ pselregister(struct proc *p, fd_set *pib > if (KTRPOINT(p, KTR_STRUCT)) > ktrevent(p, &kev, 1); > #endif > - error = kqueue_register(p->p_kq, &kev, p); > + error = kqueue_register(p->p_kq, &kev, 0, p); > switch (error) { > case 0: > nevents++; > @@ -910,33 +911,6 @@ doselwakeup(struct selinfo *sip) > } > } > > -void > -pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval) > -{ > - struct filedesc *fdp = p->p_fd; > - struct file *fp; > - u_int i; > - int n = 0; > - > - for (i = 0; i < nfd; i++, pl++) { > - /* Check the file descriptor. */ > - if (pl->fd < 0) { > - pl->revents = 0; > - continue; > - } > - if ((fp = fd_getfile(fdp, pl->fd)) == NULL) { > - pl->revents = POLLNVAL; > - n++; > - continue; > - } > - pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p); > - FRELE(fp, p); > - if (pl->revents != 0) > - n++; > - } > - *retval = n; > -} > - > /* > * Only copyout the revents field. > */ > @@ -1024,11 +998,11 @@ int > doppoll(struct proc *p, struct pollfd *fds, u_int nfds, > struct timespec *timeout, const sigset_t *sigmask, register_t *retval) > { > - size_t sz; > + struct kqueue_scan_state scan; > + struct timespec zerots = {}; > struct pollfd pfds[4], *pl = pfds; > - struct timespec elapsed, start, stop; > - uint64_t nsecs; > - int ncoll, i, s, error; > + int error, ncollected = 0, nevents = 0; > + size_t sz; > > /* Standards say no more than MAX_OPEN; this is possibly better. 
*/ > if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles)) > @@ -1042,58 +1016,80 @@ doppoll(struct proc *p, struct pollfd *f > return (EINVAL); > } > > + kqpoll_init(nfds); > + > sz = nfds * sizeof(*pl); > > if ((error = copyin(fds, pl, sz)) != 0) > goto bad; > > - for (i = 0; i < nfds; i++) { > - pl[i].events &= ~POLL_NOHUP; > - pl[i].revents = 0; > - } > - > if (sigmask) > dosigsuspend(p, *sigmask &~ sigcantmask); > > -retry: > - ncoll = nselcoll; > - atomic_setbits_int(&p->p_flag, P_SELECT); > - pollscan(p, pl, nfds, retval); > - if (*retval) > - goto done; > - if (timeout == NULL || timespecisset(timeout)) { > - if (timeout != NULL) { > - getnanouptime(&start); > - nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP); > - } else > - nsecs = INFSLP; > - s = splhigh(); > - if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) { > - splx(s); > - goto retry; > - } > - atomic_clearbits_int(&p->p_flag, P_SELECT); > - error = tsleep_nsec(&selwait, PSOCK | PCATCH, "poll", nsecs); > - splx(s); > + /* Register kqueue events */ > + ppollregister(p, pl, nfds, &nevents, &ncollected); > + > + /* > + * The poll/select family of syscalls has been designed to > + * block when file descriptors are not available, even if > + * there's nothing to wait for. > + */ > + if (nevents == 0 && ncollected == 0) { > + uint64_t nsecs = INFSLP; > + > if (timeout != NULL) { > - getnanouptime(&stop); > - timespecsub(&stop, &start, &elapsed); > - timespecsub(timeout, &elapsed, timeout); > - if (timeout->tv_sec < 0) > - timespecclear(timeout); > + if (!timespecisset(timeout)) > + goto done; > + nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP)); > } > - if (error == 0 || error == EWOULDBLOCK) > - goto retry; > + > + error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqpoll", nsecs); > + if (error == ERESTART) > + error = EINTR; > + if (error == EWOULDBLOCK) > + error = 0; > + goto done; > } > > + /* Do not block if registering found pending events. */ > + if (ncollected > 0) > + timeout = &zerots; > + > + /* Collect at most `nevents' possibly waiting in kqueue_scan() */ > + kqueue_scan_setup(&scan, p->p_kq); > + while (nevents > 0) { > + struct kevent kev[KQ_NEVENTS]; > + int i, ready, count; > + > + /* Maximum number of events per iteration */ > + count = MIN(nitems(kev), nevents); > + ready = kqueue_scan(&scan, count, kev, timeout, p, &error); > +#ifdef KTRACE > + if (KTRPOINT(p, KTR_STRUCT)) > + ktrevent(p, kev, ready); > +#endif > + /* Convert back events that are ready. */ > + for (i = 0; i < ready; i++) > + ncollected += ppollcollect(p, &kev[i], pl, nfds); > + > + /* > + * Stop if there was an error or if we had enough > + * place to collect all events that were ready. > + */ > + if (error || ready < count) > + break; > + > + nevents -= ready; > + } > + kqueue_scan_finish(&scan); > + *retval = ncollected; > done: > - atomic_clearbits_int(&p->p_flag, P_SELECT); > /* > * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is > * ignored (since the whole point is to see what would block). 
> */ > switch (error) { > - case ERESTART: > + case EINTR: > error = pollout(pl, fds, nfds); > if (error == 0) > error = EINTR; > @@ -1110,9 +1106,235 @@ done: > bad: > if (pl != pfds) > free(pl, M_TEMP, sz); > + > + kqpoll_done(nfds); > + > return (error); > } > > +int > +ppollregister_evts(struct proc *p, struct kevent *kevp, int nkev, > + struct pollfd *pl, unsigned int pollid) > +{ > + int i, error, nevents = 0; > + > + KASSERT(pl->revents == 0); > + > +#ifdef KTRACE > + if (KTRPOINT(p, KTR_STRUCT)) > + ktrevent(p, kevp, nkev); > +#endif > + for (i = 0; i < nkev; i++, kevp++) { > +again: > + error = kqueue_register(p->p_kq, kevp, pollid, p); > + switch (error) { > + case 0: > + nevents++; > + break; > + case EOPNOTSUPP:/* No underlying kqfilter */ > + case EINVAL: /* Unimplemented filter */ > + break; > + case EBADF: /* Bad file descriptor */ > + pl->revents |= POLLNVAL; > + break; > + case EPERM: /* Specific to FIFO */ > + KASSERT(kevp->filter == EVFILT_WRITE); > + if (nkev == 1) { > + /* > + * If this is the only filter make sure > + * POLLHUP is passed to userland. > + */ > + kevp->filter = EVFILT_EXCEPT; > + goto again; > + } > + break; > + case EPIPE: /* Specific to pipes */ > + KASSERT(kevp->filter == EVFILT_WRITE); > + pl->revents |= POLLHUP; > + break; > + default: > + DPRINTFN(0, "poll err %lu fd %d revents %02x serial" > + " %lu filt %d ERROR=%d\n", > + ((unsigned long)kevp->udata - p->p_kq_serial), > + pl->fd, pl->revents, p->p_kq_serial, kevp->filter, > + error); > + /* FALLTHROUGH */ > + case ENXIO: /* Device has been detached */ > + pl->revents |= POLLERR; > + break; > + } > + } > + > + return (nevents); > +} > + > +/* > + * Convert pollfd into kqueue events and register them on the > + * per-thread queue. > + * > + * At most 3 events can correspond to a single pollfd. > + */ > +void > +ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered, > + int *ncollected) > +{ > + int i, nkev, nevt, forcehup; > + struct kevent kev[3], *kevp; > + > + for (i = 0; i < nfds; i++) { > + pl[i].events &= ~POLL_NOHUP; > + pl[i].revents = 0; > + > + if (pl[i].fd < 0) > + continue; > + > + /* > + * POLLHUP checking is implicit in the event filters. > + * However, the checking must be even if no events are > + * requested. > + */ > + forcehup = ((pl[i].events & ~POLLHUP) == 0); > + > + DPRINTFN(1, "poll set %d/%d fd %d events %02x serial %lu\n", > + i+1, nfds, pl[i].fd, pl[i].events, p->p_kq_serial); > + > + nevt = 0; > + nkev = 0; > + kevp = kev; > + if (pl[i].events & (POLLIN | POLLRDNORM)) { > + EV_SET(kevp, pl[i].fd, EVFILT_READ, > + EV_ADD|EV_ENABLE|__EV_POLL, 0, 0, > + (void *)(p->p_kq_serial + i)); > + nkev++; > + kevp++; > + } > + if (pl[i].events & (POLLOUT | POLLWRNORM)) { > + EV_SET(kevp, pl[i].fd, EVFILT_WRITE, > + EV_ADD|EV_ENABLE|__EV_POLL, 0, 0, > + (void *)(p->p_kq_serial + i)); > + nkev++; > + kevp++; > + } > + if ((pl[i].events & (POLLPRI | POLLRDBAND)) || forcehup) { > + int evff = forcehup ? 0 : NOTE_OOB; > + > + EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT, > + EV_ADD|EV_ENABLE|__EV_POLL, evff, 0, > + (void *)(p->p_kq_serial + i)); > + nkev++; > + kevp++; > + } > + > + if (nkev == 0) > + continue; > + > + *nregistered += ppollregister_evts(p, kev, nkev, &pl[i], i); > + > + if (pl[i].revents != 0) > + (*ncollected)++; > + } > + > + DPRINTFN(1, "poll registered = %d, collected = %d\n", *nregistered, > + *ncollected); > +} > + > +/* > + * Convert given kqueue event into corresponding poll(2) revents bit. 
> + */ > +int > +ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl, u_int > nfds) > +{ > + static struct timeval poll_errintvl = { 5, 0 }; > + static struct timeval poll_lasterr; > + int already_seen; > + unsigned long i; > + > + /* Extract poll array index */ > + i = (unsigned long)kevp->udata - p->p_kq_serial; > + > + if (i >= nfds) { > + panic("%s: spurious kevp %p nfds %u udata 0x%lx serial 0x%lx", > + __func__, kevp, nfds, > + (unsigned long)kevp->udata, p->p_kq_serial); > + } > + if ((int)kevp->ident != pl[i].fd) { > + panic("%s: kevp %p %lu/%d mismatch fd %d!=%d serial 0x%lx", > + __func__, kevp, i + 1, nfds, (int)kevp->ident, pl[i].fd, > + p->p_kq_serial); > + } > + > + /* > + * A given descriptor may already have generated an error > + * against another filter during kqueue_register(). > + * > + * Make sure to set the appropriate flags but do not > + * increment `*retval' more than once. > + */ > + already_seen = (pl[i].revents != 0); > + > + /* POLLNVAL preempts other events. */ > + if ((kevp->flags & EV_ERROR) && kevp->data == EBADF) { > + pl[i].revents = POLLNVAL; > + goto done; > + } else if (pl[i].revents & POLLNVAL) { > + goto done; > + } > + > + switch (kevp->filter) { > + case EVFILT_READ: > + if (kevp->flags & __EV_HUP) > + pl[i].revents |= POLLHUP; > + if (pl[i].events & (POLLIN | POLLRDNORM)) > + pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM); > + break; > + case EVFILT_WRITE: > + /* POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */ > + if (kevp->flags & __EV_HUP) { > + pl[i].revents |= POLLHUP; > + } else if (pl[i].events & (POLLOUT | POLLWRNORM)) { > + pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM); > + } > + break; > + case EVFILT_EXCEPT: > + if (kevp->flags & __EV_HUP) { > + if (pl[i].events != 0 && pl[i].events != POLLOUT) > + DPRINTFN(0, "weird events %x\n", pl[i].events); > + pl[i].revents |= POLLHUP; > + break; > + } > + if (pl[i].events & (POLLPRI | POLLRDBAND)) > + pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND); > + break; > + default: > + KASSERT(0); > + } > + > +done: > + DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n", > + i+1, nfds, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata, > + kevp->filter); > + > + /* > + * Make noise about unclaimed events as they might indicate a bug > + * and can result in spurious-looking wakeups of poll(2). > + * > + * Live-locking within the system call should not happen because > + * the scan loop in doppoll() has an upper limit for the number > + * of events to process. 
> + */ > + if (pl[i].revents == 0 && ratecheck(&poll_lasterr, &poll_errintvl)) { > + printf("%s[%d]: poll index %lu fd %d events 0x%x " > + "filter %d/0x%x unclaimed\n", > + p->p_p->ps_comm, p->p_tid, i, pl[i].fd, > + pl[i].events, kevp->filter, kevp->flags); > + } > + > + if (!already_seen && (pl[i].revents != 0)) > + return (1); > + > + return (0); > +} > + > /* > * utrace system call > */ > Index: sys/event.h > =================================================================== > RCS file: src/sys/sys/event.h,v > retrieving revision 1.61 > diff -u -p -r1.61 event.h > --- sys/event.h 11 Dec 2021 09:28:26 -0000 1.61 > +++ sys/event.h 25 Dec 2021 13:09:06 -0000 > @@ -246,6 +246,8 @@ struct knote { > } kn_ptr; > const struct filterops *kn_fop; > void *kn_hook; /* [o] */ > + unsigned int kn_pollid; /* [I] */ > + > #define KN_ACTIVE 0x0001 /* event has been triggered */ > #define KN_QUEUED 0x0002 /* event is on queue */ > #define KN_DISABLED 0x0004 /* event is disabled */ > @@ -296,8 +298,8 @@ extern void knote_modify(const struct ke > extern void knote_submit(struct knote *, struct kevent *); > extern void kqueue_init(void); > extern void kqueue_init_percpu(void); > -extern int kqueue_register(struct kqueue *kq, > - struct kevent *kev, struct proc *p); > +extern int kqueue_register(struct kqueue *kq, struct kevent *kev, > + unsigned int pollid, struct proc *p); > extern int kqueue_scan(struct kqueue_scan_state *, int, struct kevent *, > struct timespec *, struct proc *, int *); > extern void kqueue_scan_setup(struct kqueue_scan_state *, struct kqueue *); > Index: sys/eventvar.h > =================================================================== > RCS file: src/sys/sys/eventvar.h,v > retrieving revision 1.12 > diff -u -p -r1.12 eventvar.h > --- sys/eventvar.h 10 Jun 2021 15:10:56 -0000 1.12 > +++ sys/eventvar.h 25 Dec 2021 13:09:06 -0000 > @@ -53,6 +53,8 @@ struct kqueue { > > LIST_ENTRY(kqueue) kq_next; > > + u_int kq_nknotes; /* [q] # of registered knotes */ > + > int kq_knlistsize; /* [q] size of kq_knlist */ > struct knlist *kq_knlist; /* [q] list of > * attached knotes */
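
Independent of the regress fallout above, a quick way to poke at the forcehup
case (POLLHUP must still be reported even when no events are requested) is a
throwaway program like the sketch below.  It is only an illustration,
presumably in the spirit of what the poll_iocond tests exercise; the exact
revents depend on the file type, so the program just prints what it gets.

#include <err.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct pollfd pfd;
	int fds[2], n;

	if (pipe(fds) == -1)
		err(1, "pipe");
	close(fds[1]);			/* drop the write side */

	pfd.fd = fds[0];
	pfd.events = 0;			/* nothing requested: forcehup case */
	pfd.revents = 0;
	n = poll(&pfd, 1, 0);
	if (n == -1)
		err(1, "poll");
	printf("events=0 -> n=%d revents=0x%x\n", n, (unsigned)pfd.revents);

	pfd.events = POLLIN;		/* normal readable/EOF check */
	pfd.revents = 0;
	n = poll(&pfd, 1, 0);
	if (n == -1)
		err(1, "poll");
	printf("events=POLLIN -> n=%d revents=0x%x\n", n,
	    (unsigned)pfd.revents);

	close(fds[0]);
	return (0);
}

Variants of this for sockets and ptys are what the unstable poll_iocond
regress tests mentioned above cover, so their results should be rechecked
once they behave reliably again.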