On 18/05/21(Tue) 14:22, Visa Hankala wrote:
> This diff adds f_modify and f_process callbacks to socket event filters.
> As a result, socket events are handled using the non-legacy paths in
> filter_modify() and filter_process() of kern_event.c This a step toward
> MP-safety. However, everything still runs under the kernel lock.
> 
> The change has three intended effects:
> 
> * Socket events are handled without raising the system priority level.
>   This makes the activity observable with btrace(8).
> 
> * kqueue itself no longer calls f_event of socket filterops, which
>   allows replacing the conditional, NOTE_SUBMIT-based locking with
>   a fixed call pattern.

I love this.

> * The state of a socket event is now always rechecked before delivery
>   to user. Before, the recheck was skipped if the event was registered
>   with EV_ONESHOT.

To me this sounds sane.  I can't think of a way to rely on the current
behavior.  However if there's an easy way to split these changes in two
commits, I'd prefer to stay on the safe side.

> However, the change of behaviour with EV_ONESHOT is questionable.
> When an activated event is being processed, the code will acquire the
> socket lock anyway. Skipping the state check would only be a minor
> optimization. In addition, I think the behaviour becomes more
> consistent as now a delivered EV_ONESHOT event really was active at
> the time of delivery.
> 
> Consider the following program. It creates a socket pair, writes a byte
> to the socket, registers an EV_ONESHOT event, and reads the byte from
> the socket. Next it checks how kevent(2) behaves.
> 
> #include <sys/types.h>
> #include <sys/event.h>
> #include <sys/socket.h>
> #include <sys/time.h>
> #include <err.h>
> #include <fcntl.h>
> #include <stdio.h>
> #include <unistd.h>
> 
> int
> main(void)
> {
>       struct kevent kev[1];
>       struct timespec ts = {};
>       int fds[2], flags, kq, n;
>       char b;
> 
>       if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) == -1)
>               err(1, "socketpair");
>       flags = fcntl(fds[0], F_GETFL, 0);
>       fcntl(fds[0], F_SETFL, flags | O_NONBLOCK);
> 
>       printf("write 1\n");
>       write(fds[1], "x", 1);
> 
>       kq = kqueue();
>       if (kq == -1)
>               err(1, "kqueue");
>       EV_SET(&kev[0], fds[0], EVFILT_READ, EV_ADD | EV_ONESHOT, 0, 0, NULL);
>       if (kevent(kq, kev, 1, NULL, 0, NULL) == -1)
>               err(1, "kevent");
> 
>       n = read(fds[0], &b, 1);
>       printf("read %d\n", n);
>       n = read(fds[0], &b, 1);
>       printf("read %d\n", n);
> 
>       n = kevent(kq, NULL, 0, kev, 1, &ts);
>       printf("kevent %d\n", n);
>       n = read(fds[0], &b, 1);
>       printf("read %d\n", n);
> 
>       n = kevent(kq, NULL, 0, kev, 1, &ts);
>       printf("kevent %d\n", n);
> 
>       printf("write 1\n");
>       write(fds[1], "x", 1);
> 
>       n = kevent(kq, NULL, 0, kev, 1, &ts);
>       printf("kevent %d\n", n);
>       n = read(fds[0], &b, 1);
>       printf("read %d\n", n);
> 
>       return 0;
> }
> 
> With an unpatched kernel, the EV_ONESHOT event gets activated by the
> pending byte when the event is registered. The event remains active
> until delivery, and the delivery happens even though it is clear that
> reading from the socket will fail. The program prints:
> 
> write 1
> read 1
> read -1
> kevent 1
> read -1
> kevent 0
> write 1
> kevent 0
> read 1
> 
> With the patch applied, the event gets delivered only if the socket
> has bytes pending.
> 
> write 1
> read 1
> read -1
> kevent 0
> read -1
> kevent 0
> write 1
> kevent 1
> read 1
> 
> So, is this EV_ONESHOT change reasonable, or should the implementation
> stick with the old way? FreeBSD appears to follow the old way. MacOS
> might perform differently, though I am not sure about that.
> 
> It is not essential to change EV_ONESHOT, however.
> 
> Feedback and tests are welcome.
> 
> Index: kern/uipc_socket.c
> ===================================================================
> RCS file: src/sys/kern/uipc_socket.c,v
> retrieving revision 1.261
> diff -u -p -r1.261 uipc_socket.c
> --- kern/uipc_socket.c        13 May 2021 19:43:11 -0000      1.261
> +++ kern/uipc_socket.c        18 May 2021 12:56:24 -0000
> @@ -70,15 +70,26 @@ void      sorflush(struct socket *);
>  
>  void filt_sordetach(struct knote *kn);
>  int  filt_soread(struct knote *kn, long hint);
> +int  filt_soreadmodify(struct kevent *kev, struct knote *kn);
> +int  filt_soreadprocess(struct knote *kn, struct kevent *kev);
> +int  filt_soread_common(struct knote *kn, struct socket *so);
>  void filt_sowdetach(struct knote *kn);
>  int  filt_sowrite(struct knote *kn, long hint);
> +int  filt_sowritemodify(struct kevent *kev, struct knote *kn);
> +int  filt_sowriteprocess(struct knote *kn, struct kevent *kev);
> +int  filt_sowrite_common(struct knote *kn, struct socket *so);
>  int  filt_solisten(struct knote *kn, long hint);
> +int  filt_solistenmodify(struct kevent *kev, struct knote *kn);
> +int  filt_solistenprocess(struct knote *kn, struct kevent *kev);
> +int  filt_solisten_common(struct knote *kn, struct socket *so);
>  
>  const struct filterops solisten_filtops = {
>       .f_flags        = FILTEROP_ISFD,
>       .f_attach       = NULL,
>       .f_detach       = filt_sordetach,
>       .f_event        = filt_solisten,
> +     .f_modify       = filt_solistenmodify,
> +     .f_process      = filt_solistenprocess,
>  };
>  
>  const struct filterops soread_filtops = {
> @@ -86,6 +97,8 @@ const struct filterops soread_filtops = 
>       .f_attach       = NULL,
>       .f_detach       = filt_sordetach,
>       .f_event        = filt_soread,
> +     .f_modify       = filt_soreadmodify,
> +     .f_process      = filt_soreadprocess,
>  };
>  
>  const struct filterops sowrite_filtops = {
> @@ -93,6 +106,8 @@ const struct filterops sowrite_filtops =
>       .f_attach       = NULL,
>       .f_detach       = filt_sowdetach,
>       .f_event        = filt_sowrite,
> +     .f_modify       = filt_sowritemodify,
> +     .f_process      = filt_sowriteprocess,
>  };
>  
>  const struct filterops soexcept_filtops = {
> @@ -100,6 +115,8 @@ const struct filterops soexcept_filtops 
>       .f_attach       = NULL,
>       .f_detach       = filt_sordetach,
>       .f_event        = filt_soread,
> +     .f_modify       = filt_soreadmodify,
> +     .f_process      = filt_soreadprocess,
>  };
>  
>  #ifndef SOMINCONN
> @@ -2056,13 +2073,12 @@ filt_sordetach(struct knote *kn)
>  }
>  
>  int
> -filt_soread(struct knote *kn, long hint)
> +filt_soread_common(struct knote *kn, struct socket *so)
>  {
> -     struct socket *so = kn->kn_fp->f_data;
> -     int s, rv = 0;
> +     int rv = 0;
> +
> +     soassertlocked(so);
>  
> -     if ((hint & NOTE_SUBMIT) == 0)
> -             s = solock(so);
>       kn->kn_data = so->so_rcv.sb_cc;
>  #ifdef SOCKET_SPLICE
>       if (isspliced(so)) {
> @@ -2090,12 +2106,47 @@ filt_soread(struct knote *kn, long hint)
>       } else {
>               rv = (kn->kn_data >= so->so_rcv.sb_lowat);
>       }
> -     if ((hint & NOTE_SUBMIT) == 0)
> -             sounlock(so, s);
>  
>       return rv;
>  }
>  
> +int
> +filt_soread(struct knote *kn, long hint)
> +{
> +     struct socket *so = kn->kn_fp->f_data;
> +
> +     return (filt_soread_common(kn, so));
> +}
> +
> +int
> +filt_soreadmodify(struct kevent *kev, struct knote *kn)
> +{
> +     struct socket *so = kn->kn_fp->f_data;
> +     int rv, s;
> +
> +     s = solock(so);
> +     knote_modify(kev, kn);
> +     rv = filt_soread_common(kn, so);
> +     sounlock(so, s);
> +
> +     return (rv);
> +}
> +
> +int
> +filt_soreadprocess(struct knote *kn, struct kevent *kev)
> +{
> +     struct socket *so = kn->kn_fp->f_data;
> +     int rv, s;
> +
> +     s = solock(so);
> +     rv = filt_soread_common(kn, so);
> +     if (rv != 0)
> +             knote_submit(kn, kev);
> +     sounlock(so, s);
> +
> +     return (rv);
> +}
> +
>  void
>  filt_sowdetach(struct knote *kn)
>  {
> @@ -2107,13 +2158,12 @@ filt_sowdetach(struct knote *kn)
>  }
>  
>  int
> -filt_sowrite(struct knote *kn, long hint)
> +filt_sowrite_common(struct knote *kn, struct socket *so)
>  {
> -     struct socket *so = kn->kn_fp->f_data;
> -     int s, rv;
> +     int rv;
> +
> +     soassertlocked(so);
>  
> -     if ((hint & NOTE_SUBMIT) == 0)
> -             s = solock(so);
>       kn->kn_data = sbspace(so, &so->so_snd);
>       if (so->so_state & SS_CANTSENDMORE) {
>               kn->kn_flags |= EV_EOF;
> @@ -2133,27 +2183,94 @@ filt_sowrite(struct knote *kn, long hint
>       } else {
>               rv = (kn->kn_data >= so->so_snd.sb_lowat);
>       }
> -     if ((hint & NOTE_SUBMIT) == 0)
> -             sounlock(so, s);
>  
>       return (rv);
>  }
>  
>  int
> -filt_solisten(struct knote *kn, long hint)
> +filt_sowrite(struct knote *kn, long hint)
>  {
>       struct socket *so = kn->kn_fp->f_data;
> -     int s;
>  
> -     if ((hint & NOTE_SUBMIT) == 0)
> -             s = solock(so);
> +     return (filt_sowrite_common(kn, so));
> +}
> +
> +int
> +filt_sowritemodify(struct kevent *kev, struct knote *kn)
> +{
> +     struct socket *so = kn->kn_fp->f_data;
> +     int rv, s;
> +
> +     s = solock(so);
> +     knote_modify(kev, kn);
> +     rv = filt_sowrite_common(kn, so);
> +     sounlock(so, s);
> +
> +     return (rv);
> +}
> +
> +int
> +filt_sowriteprocess(struct knote *kn, struct kevent *kev)
> +{
> +     struct socket *so = kn->kn_fp->f_data;
> +     int rv, s;
> +
> +     s = solock(so);
> +     rv = filt_sowrite_common(kn, so);
> +     if (rv != 0)
> +             knote_submit(kn, kev);
> +     sounlock(so, s);
> +
> +     return (rv);
> +}
> +
> +int
> +filt_solisten_common(struct knote *kn, struct socket *so)
> +{
> +     soassertlocked(so);
> +
>       kn->kn_data = so->so_qlen;
> -     if ((hint & NOTE_SUBMIT) == 0)
> -             sounlock(so, s);
>  
>       return (kn->kn_data != 0);
>  }
>  
> +int
> +filt_solisten(struct knote *kn, long hint)
> +{
> +     struct socket *so = kn->kn_fp->f_data;
> +
> +     return (filt_solisten_common(kn, so));
> +}
> +
> +int
> +filt_solistenmodify(struct kevent *kev, struct knote *kn)
> +{
> +     struct socket *so = kn->kn_fp->f_data;
> +     int rv, s;
> +
> +     s = solock(so);
> +     knote_modify(kev, kn);
> +     rv = filt_solisten_common(kn, so);
> +     sounlock(so, s);
> +
> +     return (rv);
> +}
> +
> +int
> +filt_solistenprocess(struct knote *kn, struct kevent *kev)
> +{
> +     struct socket *so = kn->kn_fp->f_data;
> +     int rv, s;
> +
> +     s = solock(so);
> +     rv = filt_solisten_common(kn, so);
> +     if (rv != 0)
> +             knote_submit(kn, kev);
> +     sounlock(so, s);
> +
> +     return (rv);
> +}
> +
>  #ifdef DDB
>  void
>  sobuf_print(struct sockbuf *,
> Index: kern/uipc_syscalls.c
> ===================================================================
> RCS file: src/sys/kern/uipc_syscalls.c,v
> retrieving revision 1.190
> diff -u -p -r1.190 uipc_syscalls.c
> --- kern/uipc_syscalls.c      13 May 2021 17:31:59 -0000      1.190
> +++ kern/uipc_syscalls.c      18 May 2021 12:56:24 -0000
> @@ -308,7 +308,7 @@ doaccept(struct proc *p, int sock, struc
>           : (flags & SOCK_NONBLOCK ? FNONBLOCK : 0);
>  
>       /* connection has been removed from the listen queue */
> -     KNOTE(&head->so_rcv.sb_sel.si_note, NOTE_SUBMIT);
> +     KNOTE(&head->so_rcv.sb_sel.si_note, 0);
>  
>       fp->f_type = DTYPE_SOCKET;
>       fp->f_flag = FREAD | FWRITE | nflag;
> 

Reply via email to