On Fri, Aug 14, 2020 at 12:31:33PM +0200, Martin Pieuchot wrote:
> The previous change introducing the kqueue_scan_setup()/finish() API
> required to switch poll(2) internals to use the kqueue mechanism has
> been backed out.  The reason for the regression is still unknown, so
> let's take a baby step approach.
> 
> Diff below introduces the new API with only minimal changes.  It should
> not introduce any change in behavior.

There is a subtle change in behaviour: the markers' kn_filter and
kn_status are now initialized only once, in kqueue_scan_setup(), at the
start of the scan.  Previously, they were also re-set at the start of
each retry pass in kqueue_scan().

This diff should be tested at least by those who had problems with
the original kqueue_scan_state patch. The original patch could cause
data loss, and the exact cause is still unknown. Hence extra caution is
warranted.

> Index: kern/kern_event.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_event.c,v
> retrieving revision 1.142
> diff -u -p -r1.142 kern_event.c
> --- kern/kern_event.c 12 Aug 2020 13:49:24 -0000      1.142
> +++ kern/kern_event.c 14 Aug 2020 10:13:38 -0000
> @@ -64,9 +64,6 @@ void        KQREF(struct kqueue *);
>  void KQRELE(struct kqueue *);
>  
>  int  kqueue_sleep(struct kqueue *, struct timespec *);
> -int  kqueue_scan(struct kqueue *kq, int maxevents,
> -                 struct kevent *ulistp, struct timespec *timeout,
> -                 struct kevent *kev, struct proc *p, int *retval);
>  
>  int  kqueue_read(struct file *, struct uio *, int);
>  int  kqueue_write(struct file *, struct uio *, int);
> @@ -554,6 +551,7 @@ out:
>  int
>  sys_kevent(struct proc *p, void *v, register_t *retval)
>  {
> +     struct kqueue_scan_state scan;
>       struct filedesc* fdp = p->p_fd;
>       struct sys_kevent_args /* {
>               syscallarg(int) fd;
> @@ -635,11 +633,12 @@ sys_kevent(struct proc *p, void *v, regi
>               goto done;
>       }
>  
> -     KQREF(kq);
> +     kqueue_scan_setup(&scan, kq);
>       FRELE(fp, p);
> -     error = kqueue_scan(kq, SCARG(uap, nevents), SCARG(uap, eventlist),
> +     error = kqueue_scan(&scan, SCARG(uap, nevents), SCARG(uap, eventlist),
>           tsp, kev, p, &n);
> -     KQRELE(kq);
> +     kqueue_scan_finish(&scan);
> +
>       *retval = n;
>       return (error);
>  
> @@ -895,11 +894,13 @@ kqueue_sleep(struct kqueue *kq, struct t
>  }
>  
>  int
> -kqueue_scan(struct kqueue *kq, int maxevents, struct kevent *ulistp,
> -    struct timespec *tsp, struct kevent *kev, struct proc *p, int *retval)
> +kqueue_scan(struct kqueue_scan_state *scan, int maxevents,
> +    struct kevent *ulistp, struct timespec *tsp, struct kevent *kev,
> +    struct proc *p, int *retval)
>  {
> +     struct kqueue *kq = scan->kqs_kq;
>       struct kevent *kevp;
> -     struct knote mend, mstart, *kn;
> +     struct knote *kn;
>       int s, count, nkev, error = 0;
>  
>       nkev = 0;
> @@ -909,9 +910,6 @@ kqueue_scan(struct kqueue *kq, int maxev
>       if (count == 0)
>               goto done;
>  
> -     memset(&mstart, 0, sizeof(mstart));
> -     memset(&mend, 0, sizeof(mend));
> -
>  retry:
>       KASSERT(count == maxevents);
>       KASSERT(nkev == 0);
> @@ -939,18 +937,16 @@ retry:
>               goto done;
>       }
>  
> -     mstart.kn_filter = EVFILT_MARKER;
> -     mstart.kn_status = KN_PROCESSING;
> -     TAILQ_INSERT_HEAD(&kq->kq_head, &mstart, kn_tqe);
> -     mend.kn_filter = EVFILT_MARKER;
> -     mend.kn_status = KN_PROCESSING;
> -     TAILQ_INSERT_TAIL(&kq->kq_head, &mend, kn_tqe);
> +     TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe);
> +     TAILQ_INSERT_HEAD(&kq->kq_head, &scan->kqs_start, kn_tqe);
>       while (count) {
> -             kn = TAILQ_NEXT(&mstart, kn_tqe);
> +             kn = TAILQ_NEXT(&scan->kqs_start, kn_tqe);
>               if (kn->kn_filter == EVFILT_MARKER) {
> -                     if (kn == &mend) {
> -                             TAILQ_REMOVE(&kq->kq_head, &mend, kn_tqe);
> -                             TAILQ_REMOVE(&kq->kq_head, &mstart, kn_tqe);
> +                     if (kn == &scan->kqs_end) {
> +                             TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end,
> +                                 kn_tqe);
> +                             TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start,
> +                                 kn_tqe);
>                               splx(s);
>                               if (count == maxevents)
>                                       goto retry;
> @@ -958,8 +954,9 @@ retry:
>                       }
>  
>                       /* Move start marker past another thread's marker. */
> -                     TAILQ_REMOVE(&kq->kq_head, &mstart, kn_tqe);
> -                     TAILQ_INSERT_AFTER(&kq->kq_head, kn, &mstart, kn_tqe);
> +                     TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
> +                     TAILQ_INSERT_AFTER(&kq->kq_head, kn, &scan->kqs_start,
> +                         kn_tqe);
>                       continue;
>               }
>  
> @@ -1029,8 +1026,8 @@ retry:
>                               break;
>               }
>       }
> -     TAILQ_REMOVE(&kq->kq_head, &mend, kn_tqe);
> -     TAILQ_REMOVE(&kq->kq_head, &mstart, kn_tqe);
> +     TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
> +     TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
>       splx(s);
>  done:
>       if (nkev != 0) {
> @@ -1044,6 +1041,33 @@ done:
>       *retval = maxevents - count;
>       return (error);
>  }
> +
> +void
> +kqueue_scan_setup(struct kqueue_scan_state *scan, struct kqueue *kq)
> +{
> +     memset(scan, 0, sizeof(*scan));
> +
> +     KQREF(kq);
> +     scan->kqs_kq = kq;
> +     scan->kqs_start.kn_filter = EVFILT_MARKER;
> +     scan->kqs_start.kn_status = KN_PROCESSING;
> +     scan->kqs_end.kn_filter = EVFILT_MARKER;
> +     scan->kqs_end.kn_status = KN_PROCESSING;
> +}
> +
> +void
> +kqueue_scan_finish(struct kqueue_scan_state *scan)
> +{
> +     struct kqueue *kq = scan->kqs_kq;
> +
> +     KASSERT(scan->kqs_start.kn_filter == EVFILT_MARKER);
> +     KASSERT(scan->kqs_start.kn_status == KN_PROCESSING);
> +     KASSERT(scan->kqs_end.kn_filter == EVFILT_MARKER);
> +     KASSERT(scan->kqs_end.kn_status == KN_PROCESSING);
> +
> +     KQRELE(kq);
> +}
> +
>  
>  /*
>   * XXX
> Index: sys/event.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/event.h,v
> retrieving revision 1.44
> diff -u -p -r1.44 event.h
> --- sys/event.h       22 Jun 2020 13:14:32 -0000      1.44
> +++ sys/event.h       14 Aug 2020 10:14:18 -0000
> @@ -200,7 +200,14 @@ struct knote {
>  #define kn_fp                kn_ptr.p_fp
>  };
>  
> +struct kqueue_scan_state {
> +     struct kqueue   *kqs_kq;                /* kqueue of this scan */
> +     struct knote     kqs_start;             /* start marker */
> +     struct knote     kqs_end;               /* end marker */
> +};
> +
>  struct proc;
> +struct timespec;
>  
>  extern const struct filterops sig_filtops;
>  extern const struct filterops dead_filtops;
> @@ -212,6 +219,10 @@ extern void      knote_fdclose(struct proc *p
>  extern void  knote_processexit(struct proc *);
>  extern int   kqueue_register(struct kqueue *kq,
>                   struct kevent *kev, struct proc *p);
> +int  kqueue_scan(struct kqueue_scan_state *, int, struct kevent *,
> +                 struct timespec *, struct kevent *, struct proc *, int *);
> +extern void  kqueue_scan_setup(struct kqueue_scan_state *, struct kqueue *);
> +extern void  kqueue_scan_finish(struct kqueue_scan_state *);
>  extern int   filt_seltrue(struct knote *kn, long hint);
>  extern int   seltrue_kqfilter(dev_t, struct knote *);
>  extern void  klist_insert(struct klist *, struct knote *);
> 
