On Fri, Oct 02, 2020 at 12:19:35PM +0200, Martin Pieuchot wrote:
> 
> [...]
> 
> I'd like to get this in early in this release cycle, so please test and
> report back :o)

You removed the resleep logic that accounted for tsleep_nsec(9)
returning early.  So select and pselect can now return before their
full timeout has elapsed.
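
For anyone skimming: the removed dopselect() code (hunk further down)
implemented the usual resleep pattern, condensed here as a sketch:

	getnanouptime(&start);
	nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
	error = tsleep_nsec(&selwait, PSOCK | PCATCH, "select", nsecs);
	getnanouptime(&stop);

	/* Debit the time actually slept from the caller's timeout... */
	timespecsub(&stop, &start, &elapsed);
	timespecsub(timeout, &elapsed, timeout);
	if (timeout->tv_sec < 0)
		timespecclear(timeout);

	/* ...and go around again if we woke up early. */
	if (error == 0 || error == EWOULDBLOCK)
		goto retry;

tsleep_nsec(9) may return EWOULDBLOCK before the caller's full
timeout has elapsed, e.g. when the timeout is longer than a single
sleep can cover, so the retry is what makes long timeouts work.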

I've left questions below at the spots that look off to me.

> Index: kern/kern_event.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_event.c,v
> retrieving revision 1.142
> diff -u -p -r1.142 kern_event.c
> --- kern/kern_event.c 12 Aug 2020 13:49:24 -0000      1.142
> +++ kern/kern_event.c 1 Oct 2020 12:53:54 -0000
> @@ -64,9 +64,6 @@ void        KQREF(struct kqueue *);
>  void KQRELE(struct kqueue *);
>  
>  int  kqueue_sleep(struct kqueue *, struct timespec *);
> -int  kqueue_scan(struct kqueue *kq, int maxevents,
> -                 struct kevent *ulistp, struct timespec *timeout,
> -                 struct kevent *kev, struct proc *p, int *retval);
>  
>  int  kqueue_read(struct file *, struct uio *, int);
>  int  kqueue_write(struct file *, struct uio *, int);
> @@ -521,6 +518,14 @@ kqueue_alloc(struct filedesc *fdp)
>       return (kq);
>  }
>  
> +void
> +kqueue_exit(struct proc *p)
> +{
> +     kqueue_terminate(p, p->p_kq);
> +     kqueue_free(p->p_kq);
> +     p->p_kq = NULL;
> +}
> +
>  int
>  sys_kqueue(struct proc *p, void *v, register_t *retval)
>  {
> @@ -554,6 +559,7 @@ out:
>  int
>  sys_kevent(struct proc *p, void *v, register_t *retval)
>  {
> +     struct kqueue_scan_state scan;
>       struct filedesc* fdp = p->p_fd;
>       struct sys_kevent_args /* {
>               syscallarg(int) fd;
> @@ -569,6 +575,7 @@ sys_kevent(struct proc *p, void *v, regi
>       struct timespec ts;
>       struct timespec *tsp = NULL;
>       int i, n, nerrors, error;
> +     int ready, total;
>       struct kevent kev[KQ_NEVENTS];
>  
>       if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
> @@ -597,9 +604,9 @@ sys_kevent(struct proc *p, void *v, regi
>       kq = fp->f_data;
>       nerrors = 0;
>  
> -     while (SCARG(uap, nchanges) > 0) {
> -             n = SCARG(uap, nchanges) > KQ_NEVENTS ?
> -                 KQ_NEVENTS : SCARG(uap, nchanges);
> +     while ((n = SCARG(uap, nchanges)) > 0) {
> +             if (n > nitems(kev))
> +                     n = nitems(kev);
>               error = copyin(SCARG(uap, changelist), kev,
>                   n * sizeof(struct kevent));
>               if (error)
> @@ -635,12 +642,39 @@ sys_kevent(struct proc *p, void *v, regi
>               goto done;
>       }
>  
> +
>       KQREF(kq);
>       FRELE(fp, p);
> -     error = kqueue_scan(kq, SCARG(uap, nevents), SCARG(uap, eventlist),
> -         tsp, kev, p, &n);
> +     /*
> +      * Collect as many events as we can.  The timeout on successive
> +      * loops is disabled (kqueue_scan() becomes non-blocking).
> +      */
> +     total = 0;
> +     error = 0;
> +     kqueue_scan_setup(&scan, kq);
> +     while ((n = SCARG(uap, nevents) - total) > 0) {
> +             if (n > nitems(kev))
> +                     n = nitems(kev);
> +             ready = kqueue_scan(&scan, n, kev, tsp, p, &error);
> +             if (ready == 0)
> +                     break;
> +             error = copyout(kev, SCARG(uap, eventlist) + total,
> +                 sizeof(struct kevent) * ready);
> +#ifdef KTRACE
> +             if (KTRPOINT(p, KTR_STRUCT))
> +                     ktrevent(p, kev, ready);
> +#endif
> +             total += ready;
> +             if (error || ready < n)
> +                     break;
> +             tsp = &ts;              /* successive loops non-blocking */
> +             timespecclear(tsp);

Here, this spot.  Why force the scan to be non-blocking from the
second iteration on?  Once tsp points at the zeroed local ts, every
subsequent kqueue_scan() call takes the
`tsp != NULL && !timespecisset(tsp)' early exit and cannot sleep.

> +     }
> +     kqueue_scan_finish(&scan);
>       KQRELE(kq);
> -     *retval = n;
> +     if (error == EWOULDBLOCK)
> +             error = 0;
> +     *retval = total;
>       return (error);
>  
>   done:
> @@ -894,24 +928,22 @@ kqueue_sleep(struct kqueue *kq, struct t
>       return (error);
>  }
>  
> +/*
> + * Scan the kqueue, blocking if necessary until the target time is reached.
> + * If tsp is NULL we block indefinitely.  If tsp->tv_sec/tv_nsec are both
> + * 0 we do not block at all.
> + */
>  int
> -kqueue_scan(struct kqueue *kq, int maxevents, struct kevent *ulistp,
> -    struct timespec *tsp, struct kevent *kev, struct proc *p, int *retval)
> +kqueue_scan(struct kqueue_scan_state *scan, int maxevents,
> +    struct kevent *kevp, struct timespec *tsp, struct proc *p, int *errorp)
>  {
> -     struct kevent *kevp;
> -     struct knote mend, mstart, *kn;
> -     int s, count, nkev, error = 0;
> -
> -     nkev = 0;
> -     kevp = kev;
> +     struct knote *kn;
> +     struct kqueue *kq = scan->kqs_kq;
> +     int s, count, nkev = 0, error = 0;
>  
>       count = maxevents;
>       if (count == 0)
>               goto done;
> -
> -     memset(&mstart, 0, sizeof(mstart));
> -     memset(&mend, 0, sizeof(mend));
> -
>  retry:
>       KASSERT(count == maxevents);
>       KASSERT(nkev == 0);
> @@ -923,7 +955,8 @@ retry:
>  
>       s = splhigh();
>       if (kq->kq_count == 0) {
> -             if (tsp != NULL && !timespecisset(tsp)) {
> +             if ((tsp != NULL && !timespecisset(tsp)) ||
> +                 scan->kqs_nevent != 0) {
>                       splx(s);
>                       error = 0;
>                       goto done;
> @@ -931,7 +964,7 @@ retry:
>               kq->kq_state |= KQ_SLEEP;
>               error = kqueue_sleep(kq, tsp);
>               splx(s);
> -             if (error == 0 || error == EWOULDBLOCK)
> +             if (error == 0)
>                       goto retry;

Why wouldn't we want to retry in the EWOULDBLOCK case?  You have a
check for

        tsp != NULL && !timespecisset(tsp)

that handles the case where the timeout has been exhausted.
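
For context: kqueue_sleep() (not touched by this diff; quoting rev
1.142 from memory, so double-check me) clamps each individual sleep
and debits the time actually slept from *tsp:

	if (tsp != NULL) {
		getnanouptime(&start);
		nsecs = MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP);
	} else
		nsecs = INFSLP;
	error = tsleep_nsec(kq, PSOCK | PCATCH, "kqread", nsecs);
	if (tsp != NULL) {
		getnanouptime(&stop);
		timespecsub(&stop, &start, &elapsed);
		timespecsub(tsp, &elapsed, tsp);
		if (tsp->tv_sec < 0)
			timespecclear(tsp);
	}

With the old `goto retry' on EWOULDBLOCK, kqueue_scan() kept
resleeping for the remainder until *tsp reached zero and the check
above ended the loop.  Without it, a single clamped sleep ends the
wait.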

>               /* don't restart after signals... */
>               if (error == ERESTART)
> @@ -939,27 +972,40 @@ retry:
>               goto done;
>       }
>  
> -     mstart.kn_filter = EVFILT_MARKER;
> -     mstart.kn_status = KN_PROCESSING;
> -     TAILQ_INSERT_HEAD(&kq->kq_head, &mstart, kn_tqe);
> -     mend.kn_filter = EVFILT_MARKER;
> -     mend.kn_status = KN_PROCESSING;
> -     TAILQ_INSERT_TAIL(&kq->kq_head, &mend, kn_tqe);
> +     /*
> +      * Put the end marker in the queue to limit the scan to the events
> +      * that are currently active.  This prevents events from being
> +      * recollected if they reactivate during scan.
> +      *
> +      * If a partial scan has been performed already but no events have
> +      * been collected, reposition the end marker to make any new events
> +      * reachable.
> +      */
> +     if (!scan->kqs_queued) {
> +             TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe);
> +             scan->kqs_queued = 1;
> +     } else if (scan->kqs_nevent == 0) {
> +             TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
> +             TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe);
> +     }
> +
> +     TAILQ_INSERT_HEAD(&kq->kq_head, &scan->kqs_start, kn_tqe);
>       while (count) {
> -             kn = TAILQ_NEXT(&mstart, kn_tqe);
> +             kn = TAILQ_NEXT(&scan->kqs_start, kn_tqe);
>               if (kn->kn_filter == EVFILT_MARKER) {
> -                     if (kn == &mend) {
> -                             TAILQ_REMOVE(&kq->kq_head, &mend, kn_tqe);
> -                             TAILQ_REMOVE(&kq->kq_head, &mstart, kn_tqe);
> +                     if (kn == &scan->kqs_end) {
> +                             TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start,
> +                                 kn_tqe);
>                               splx(s);
> -                             if (count == maxevents)
> +                             if (scan->kqs_nevent == 0)
>                                       goto retry;
>                               goto done;
>                       }
>  
>                       /* Move start marker past another thread's marker. */
> -                     TAILQ_REMOVE(&kq->kq_head, &mstart, kn_tqe);
> -                     TAILQ_INSERT_AFTER(&kq->kq_head, kn, &mstart, kn_tqe);
> +                     TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
> +                     TAILQ_INSERT_AFTER(&kq->kq_head, kn, &scan->kqs_start,
> +                         kn_tqe);
>                       continue;
>               }
>  
> @@ -987,6 +1033,12 @@ retry:
>               *kevp = kn->kn_kevent;
>               kevp++;
>               nkev++;
> +             count--;
> +             scan->kqs_nevent++;
> +
> +             /*
> +              * Post-event action on the note
> +              */
>               if (kn->kn_flags & EV_ONESHOT) {
>                       splx(s);
>                       kn->kn_fop->f_detach(kn);
> @@ -1012,37 +1064,44 @@ retry:
>                       knote_release(kn);
>               }
>               kqueue_check(kq);
> -             count--;
> -             if (nkev == KQ_NEVENTS) {
> -                     splx(s);
> -#ifdef KTRACE
> -                     if (KTRPOINT(p, KTR_STRUCT))
> -                             ktrevent(p, kev, nkev);
> -#endif
> -                     error = copyout(kev, ulistp,
> -                         sizeof(struct kevent) * nkev);
> -                     ulistp += nkev;
> -                     nkev = 0;
> -                     kevp = kev;
> -                     s = splhigh();
> -                     if (error)
> -                             break;
> -             }
>       }
> -     TAILQ_REMOVE(&kq->kq_head, &mend, kn_tqe);
> -     TAILQ_REMOVE(&kq->kq_head, &mstart, kn_tqe);
> +     TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
>       splx(s);
> +     if (scan->kqs_nevent == 0)
> +             goto retry;
>  done:
> -     if (nkev != 0) {
> -#ifdef KTRACE
> -             if (KTRPOINT(p, KTR_STRUCT))
> -                     ktrevent(p, kev, nkev);
> -#endif
> -             error = copyout(kev, ulistp,
> -                 sizeof(struct kevent) * nkev);
> +     *errorp = error;
> +     return (nkev);
> +}
> +
> +void
> +kqueue_scan_setup(struct kqueue_scan_state *scan, struct kqueue *kq)
> +{
> +     memset(scan, 0, sizeof(*scan));
> +     scan->kqs_kq = kq;
> +     scan->kqs_start.kn_filter = EVFILT_MARKER;
> +     scan->kqs_start.kn_status = KN_PROCESSING;
> +     scan->kqs_end.kn_filter = EVFILT_MARKER;
> +     scan->kqs_end.kn_status = KN_PROCESSING;
> +}
> +
> +void
> +kqueue_scan_finish(struct kqueue_scan_state *scan)
> +{
> +     struct kqueue *kq = scan->kqs_kq;
> +     int s;
> +
> +     KASSERT(scan->kqs_start.kn_filter == EVFILT_MARKER);
> +     KASSERT(scan->kqs_start.kn_status == KN_PROCESSING);
> +     KASSERT(scan->kqs_end.kn_filter == EVFILT_MARKER);
> +     KASSERT(scan->kqs_end.kn_status == KN_PROCESSING);
> +
> +     if (scan->kqs_queued) {
> +             scan->kqs_queued = 0;
> +             s = splhigh();
> +             TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
> +             splx(s);
>       }
> -     *retval = maxevents - count;
> -     return (error);
>  }
>  
>  /*
> @@ -1099,7 +1158,7 @@ kqueue_stat(struct file *fp, struct stat
>  }
>  
>  void
> -kqueue_terminate(struct proc *p, struct kqueue *kq)
> +kqueue_purge(struct proc *p, struct kqueue *kq)
>  {
>       int i;
>  
> @@ -1111,6 +1170,12 @@ kqueue_terminate(struct proc *p, struct 
>               for (i = 0; i < kq->kq_knhashmask + 1; i++)
>                       knote_remove(p, &kq->kq_knhash[i]);
>       }
> +}
> +
> +void
> +kqueue_terminate(struct proc *p, struct kqueue *kq)
> +{
> +     kqueue_purge(p, kq);
>       kq->kq_state |= KQ_DYING;
>       kqueue_wakeup(kq);
>  
> Index: kern/kern_exit.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_exit.c,v
> retrieving revision 1.188
> diff -u -p -r1.188 kern_exit.c
> --- kern/kern_exit.c  18 Mar 2020 15:48:21 -0000      1.188
> +++ kern/kern_exit.c  1 Oct 2020 12:53:54 -0000
> @@ -184,6 +184,8 @@ exit1(struct proc *p, int xexit, int xsi
>       if ((p->p_flag & P_THREAD) == 0)
>               pr->ps_siglist = 0;
>  
> +     kqueue_exit(p);
> +
>  #if NKCOV > 0
>       kcov_exit(p);
>  #endif
> Index: kern/kern_fork.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_fork.c,v
> retrieving revision 1.225
> diff -u -p -r1.225 kern_fork.c
> --- kern/kern_fork.c  20 Mar 2020 08:14:07 -0000      1.225
> +++ kern/kern_fork.c  1 Oct 2020 13:04:36 -0000
> @@ -422,6 +422,8 @@ fork1(struct proc *curp, int flags, void
>               newptstat = malloc(sizeof(*newptstat), M_SUBPROC, M_WAITOK);
>  
>       p->p_tid = alloctid();
> +     p->p_kq = kqueue_alloc(p->p_fd);
> +     p->p_kq_serial = arc4random();
>  
>       LIST_INSERT_HEAD(&allproc, p, p_list);
>       LIST_INSERT_HEAD(TIDHASH(p->p_tid), p, p_hash);
> @@ -553,6 +555,8 @@ thread_fork(struct proc *curp, void *sta
>       cpu_fork(curp, p, stack, tcb, child_return, p);
>  
>       p->p_tid = alloctid();
> +     p->p_kq = kqueue_alloc(p->p_fd);
> +     p->p_kq_serial = arc4random();
>  
>       LIST_INSERT_HEAD(&allproc, p, p_list);
>       LIST_INSERT_HEAD(TIDHASH(p->p_tid), p, p_hash);
> Index: kern/sys_generic.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/sys_generic.c,v
> retrieving revision 1.131
> diff -u -p -r1.131 sys_generic.c
> --- kern/sys_generic.c        20 Mar 2020 04:11:05 -0000      1.131
> +++ kern/sys_generic.c        2 Oct 2020 09:20:59 -0000
> @@ -55,6 +55,7 @@
>  #include <sys/time.h>
>  #include <sys/malloc.h>
>  #include <sys/poll.h>
> +#include <sys/eventvar.h>
>  #ifdef KTRACE
>  #include <sys/ktrace.h>
>  #endif
> @@ -66,8 +67,20 @@
>  
>  #include <uvm/uvm_extern.h>
>  
> -int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
> -void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
> +/*
> + * Debug values:
> + *  1 - print implementation errors, things that should not happen.
> + *  2 - print ppoll(2) information, somewhat verbose
> + *  3 - print pselect(2) and ppoll(2) information, very verbose
> + */
> +int kqpoll_debug = 0;
> +#define DPRINTFN(v, x...) if (kqpoll_debug > v) {            \
> +    printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);       \
> +    printf(x);                                                       \
> +}
> +int pselregister(struct proc *, fd_set *, int, int, int *);
> +int pselcollect(struct proc *, struct kevent *, fd_set *[]);
> +
>  int pollout(struct pollfd *, struct pollfd *, u_int);
>  int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
>      struct timespec *, const sigset_t *, register_t *);
> @@ -584,11 +597,11 @@ int
>  dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
>      struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
>  {
> +     struct kqueue_scan_state scan;
>       fd_mask bits[6];
>       fd_set *pibits[3], *pobits[3];
> -     struct timespec elapsed, start, stop;
> -     uint64_t nsecs;
> -     int s, ncoll, error = 0;
> +     struct timespec ts;
> +     int error, nevents = 0;
>       u_int ni;
>  
>       if (nd < 0)
> @@ -636,43 +649,61 @@ dopselect(struct proc *p, int nd, fd_set
>       if (sigmask)
>               dosigsuspend(p, *sigmask &~ sigcantmask);
>  
> -retry:
> -     ncoll = nselcoll;
> -     atomic_setbits_int(&p->p_flag, P_SELECT);
> -     error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
> -     if (error || *retval)
> +     /* Register kqueue events */
> +     if ((error = pselregister(p, pibits[0], nd, ni, &nevents)) != 0)
>               goto done;
> -     if (timeout == NULL || timespecisset(timeout)) {
> -             if (timeout != NULL) {
> -                     getnanouptime(&start);
> -                     nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
> -             } else
> -                     nsecs = INFSLP;
> -             s = splhigh();
> -             if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
> -                     splx(s);
> -                     goto retry;
> -             }
> -             atomic_clearbits_int(&p->p_flag, P_SELECT);
> -             error = tsleep_nsec(&selwait, PSOCK | PCATCH, "select", nsecs);
> -             splx(s);
> -             if (timeout != NULL) {
> -                     getnanouptime(&stop);
> -                     timespecsub(&stop, &start, &elapsed);
> -                     timespecsub(timeout, &elapsed, timeout);
> -                     if (timeout->tv_sec < 0)
> -                             timespecclear(timeout);
> -             }
> -             if (error == 0 || error == EWOULDBLOCK)
> -                     goto retry;
> +
> +     /*
> +      * The poll/select family of syscalls has been designed to
> +      * block when file descriptors are not available, even if
> +      * there's nothing to wait for.
> +      */
> +     if (nevents == 0) {
> +             uint64_t nsecs = INFSLP;
> +
> +             if (timeout != NULL)
> +                     nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
> +
> +             error = tsleep_nsec(&p->p_kq, PSOCK | PCATCH, "kqsel", nsecs);
> +             /* select is not restarted after signals... */
> +             if (error == ERESTART)
> +                     error = EINTR;
> +     }

Aside: can the new logic (below) not handle the case where
nevents == 0?  That is, what happens if we go into kqueue_scan()
with count == 0?
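
(Answering my own question from the kqueue_scan() hunk above: it
looks like it can't, because the scan bails out before it can ever
sleep,

	count = maxevents;
	if (count == 0)
		goto done;

so the separate tsleep_nsec() seems necessary as written.)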

> +
> +     /* Collect at most `nevents' events, possibly waiting in kqueue_scan() */
> +     kqueue_scan_setup(&scan, p->p_kq);
> +     while (nevents > 0) {
> +             struct kevent kev[KQ_NEVENTS];
> +             int i, ready, count;
> +
> +             /* Maximum number of events per iteration */
> +             count = MIN(nitems(kev), nevents);
> +             ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
> +#ifdef KTRACE
> +             if (KTRPOINT(p, KTR_STRUCT))
> +                     ktrevent(p, kev, ready);
> +#endif
> +             /* Convert back events that are ready. */
> +             for (i = 0; i < ready; i++)
> +                     *retval += pselcollect(p, &kev[i], pobits);
> +
> +             /*
> +              * Stop if there was an error or if we had enough
> +              * room to collect all the events that were ready.
> +              */
> +             if (error || ready < count)
> +                     break;
> +
> +             timeout = &ts;          /* successive loops non-blocking */
> +             timespecclear(timeout);

... again, it looks like we now truncate the timeout instead of
resleeping for the remainder.
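
To make the failure concrete, the sequence I'm worried about
(assuming tsleep_nsec(9) can return EWOULDBLOCK before the caller's
full timeout has elapsed, as with very long timeouts):

	/* First pass, with the caller's original timeout: */
	ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
	/*
	 * kqueue_sleep() wakes early with EWOULDBLOCK; kqueue_scan()
	 * no longer retries, so ready == 0 and error == EWOULDBLOCK.
	 */
	if (error || ready < count)
		break;	/* taken: select(2) reports a timeout too soon */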
