On 18/05/21(Tue) 14:22, Visa Hankala wrote: > This diff adds f_modify and f_process callbacks to socket event filters. > As a result, socket events are handled using the non-legacy paths in > filter_modify() and filter_process() of kern_event.c This a step toward > MP-safety. However, everything still runs under the kernel lock. > > The change has three intended effects: > > * Socket events are handled without raising the system priority level. > This makes the activity observable with btrace(8). > > * kqueue itself no longer calls f_event of socket filterops, which > allows replacing the conditional, NOTE_SUBMIT-based locking with > a fixed call pattern.
I love this. > * The state of a socket event is now always rechecked before delivery > to user. Before, the recheck was skipped if the event was registered > with EV_ONESHOT. To me this sounds sane. I can't think of a way to rely on the current behavior. However if there's an easy way to split these changes in two commits, I'd prefer to stay on the safe side. > However, the change of behaviour with EV_ONESHOT is questionable. > When an activated event is being processed, the code will acquire the > socket lock anyway. Skipping the state check would only be a minor > optimization. In addition, I think the behaviour becomes more > consistent as now a delivered EV_ONESHOT event really was active at > the time of delivery. > > Consider the following program. It creates a socket pair, writes a byte > to the socket, registers an EV_ONESHOT event, and reads the byte from > the socket. Next it checks how kevent(2) behaves. > > #include <sys/types.h> > #include <sys/event.h> > #include <sys/socket.h> > #include <sys/time.h> > #include <err.h> > #include <fcntl.h> > #include <stdio.h> > #include <unistd.h> > > int > main(void) > { > struct kevent kev[1]; > struct timespec ts = {}; > int fds[2], flags, kq, n; > char b; > > if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) == -1) > err(1, "socketpair"); > flags = fcntl(fds[0], F_GETFL, 0); > fcntl(fds[0], F_SETFL, flags | O_NONBLOCK); > > printf("write 1\n"); > write(fds[1], "x", 1); > > kq = kqueue(); > if (kq == -1) > err(1, "kqueue"); > EV_SET(&kev[0], fds[0], EVFILT_READ, EV_ADD | EV_ONESHOT, 0, 0, NULL); > if (kevent(kq, kev, 1, NULL, 0, NULL) == -1) > err(1, "kevent"); > > n = read(fds[0], &b, 1); > printf("read %d\n", n); > n = read(fds[0], &b, 1); > printf("read %d\n", n); > > n = kevent(kq, NULL, 0, kev, 1, &ts); > printf("kevent %d\n", n); > n = read(fds[0], &b, 1); > printf("read %d\n", n); > > n = kevent(kq, NULL, 0, kev, 1, &ts); > printf("kevent %d\n", n); > > printf("write 1\n"); > write(fds[1], "x", 1); > > n = kevent(kq, NULL, 0, kev, 1, &ts); > printf("kevent %d\n", n); > n = read(fds[0], &b, 1); > printf("read %d\n", n); > > return 0; > } > > With an unpatched kernel, the EV_ONESHOT event gets activated by the > pending byte when the event is registered. The event remains active > until delivery, and the delivery happens even though it is clear that > reading from the socket will fail. The program prints: > > write 1 > read 1 > read -1 > kevent 1 > read -1 > kevent 0 > write 1 > kevent 0 > read 1 > > With the patch applied, the event gets delivered only if the socket > has bytes pending. > > write 1 > read 1 > read -1 > kevent 0 > read -1 > kevent 0 > write 1 > kevent 1 > read 1 > > So, is this EV_ONESHOT change reasonable, or should the implementation > stick with the old way? FreeBSD appears to follow the old way. MacOS > might perform differently, though I am not sure about that. > > It is not essential to change EV_ONESHOT, however. > > Feedback and tests are welcome. > > Index: kern/uipc_socket.c > =================================================================== > RCS file: src/sys/kern/uipc_socket.c,v > retrieving revision 1.261 > diff -u -p -r1.261 uipc_socket.c > --- kern/uipc_socket.c 13 May 2021 19:43:11 -0000 1.261 > +++ kern/uipc_socket.c 18 May 2021 12:56:24 -0000 > @@ -70,15 +70,26 @@ void sorflush(struct socket *); > > void filt_sordetach(struct knote *kn); > int filt_soread(struct knote *kn, long hint); > +int filt_soreadmodify(struct kevent *kev, struct knote *kn); > +int filt_soreadprocess(struct knote *kn, struct kevent *kev); > +int filt_soread_common(struct knote *kn, struct socket *so); > void filt_sowdetach(struct knote *kn); > int filt_sowrite(struct knote *kn, long hint); > +int filt_sowritemodify(struct kevent *kev, struct knote *kn); > +int filt_sowriteprocess(struct knote *kn, struct kevent *kev); > +int filt_sowrite_common(struct knote *kn, struct socket *so); > int filt_solisten(struct knote *kn, long hint); > +int filt_solistenmodify(struct kevent *kev, struct knote *kn); > +int filt_solistenprocess(struct knote *kn, struct kevent *kev); > +int filt_solisten_common(struct knote *kn, struct socket *so); > > const struct filterops solisten_filtops = { > .f_flags = FILTEROP_ISFD, > .f_attach = NULL, > .f_detach = filt_sordetach, > .f_event = filt_solisten, > + .f_modify = filt_solistenmodify, > + .f_process = filt_solistenprocess, > }; > > const struct filterops soread_filtops = { > @@ -86,6 +97,8 @@ const struct filterops soread_filtops = > .f_attach = NULL, > .f_detach = filt_sordetach, > .f_event = filt_soread, > + .f_modify = filt_soreadmodify, > + .f_process = filt_soreadprocess, > }; > > const struct filterops sowrite_filtops = { > @@ -93,6 +106,8 @@ const struct filterops sowrite_filtops = > .f_attach = NULL, > .f_detach = filt_sowdetach, > .f_event = filt_sowrite, > + .f_modify = filt_sowritemodify, > + .f_process = filt_sowriteprocess, > }; > > const struct filterops soexcept_filtops = { > @@ -100,6 +115,8 @@ const struct filterops soexcept_filtops > .f_attach = NULL, > .f_detach = filt_sordetach, > .f_event = filt_soread, > + .f_modify = filt_soreadmodify, > + .f_process = filt_soreadprocess, > }; > > #ifndef SOMINCONN > @@ -2056,13 +2073,12 @@ filt_sordetach(struct knote *kn) > } > > int > -filt_soread(struct knote *kn, long hint) > +filt_soread_common(struct knote *kn, struct socket *so) > { > - struct socket *so = kn->kn_fp->f_data; > - int s, rv = 0; > + int rv = 0; > + > + soassertlocked(so); > > - if ((hint & NOTE_SUBMIT) == 0) > - s = solock(so); > kn->kn_data = so->so_rcv.sb_cc; > #ifdef SOCKET_SPLICE > if (isspliced(so)) { > @@ -2090,12 +2106,47 @@ filt_soread(struct knote *kn, long hint) > } else { > rv = (kn->kn_data >= so->so_rcv.sb_lowat); > } > - if ((hint & NOTE_SUBMIT) == 0) > - sounlock(so, s); > > return rv; > } > > +int > +filt_soread(struct knote *kn, long hint) > +{ > + struct socket *so = kn->kn_fp->f_data; > + > + return (filt_soread_common(kn, so)); > +} > + > +int > +filt_soreadmodify(struct kevent *kev, struct knote *kn) > +{ > + struct socket *so = kn->kn_fp->f_data; > + int rv, s; > + > + s = solock(so); > + knote_modify(kev, kn); > + rv = filt_soread_common(kn, so); > + sounlock(so, s); > + > + return (rv); > +} > + > +int > +filt_soreadprocess(struct knote *kn, struct kevent *kev) > +{ > + struct socket *so = kn->kn_fp->f_data; > + int rv, s; > + > + s = solock(so); > + rv = filt_soread_common(kn, so); > + if (rv != 0) > + knote_submit(kn, kev); > + sounlock(so, s); > + > + return (rv); > +} > + > void > filt_sowdetach(struct knote *kn) > { > @@ -2107,13 +2158,12 @@ filt_sowdetach(struct knote *kn) > } > > int > -filt_sowrite(struct knote *kn, long hint) > +filt_sowrite_common(struct knote *kn, struct socket *so) > { > - struct socket *so = kn->kn_fp->f_data; > - int s, rv; > + int rv; > + > + soassertlocked(so); > > - if ((hint & NOTE_SUBMIT) == 0) > - s = solock(so); > kn->kn_data = sbspace(so, &so->so_snd); > if (so->so_state & SS_CANTSENDMORE) { > kn->kn_flags |= EV_EOF; > @@ -2133,27 +2183,94 @@ filt_sowrite(struct knote *kn, long hint > } else { > rv = (kn->kn_data >= so->so_snd.sb_lowat); > } > - if ((hint & NOTE_SUBMIT) == 0) > - sounlock(so, s); > > return (rv); > } > > int > -filt_solisten(struct knote *kn, long hint) > +filt_sowrite(struct knote *kn, long hint) > { > struct socket *so = kn->kn_fp->f_data; > - int s; > > - if ((hint & NOTE_SUBMIT) == 0) > - s = solock(so); > + return (filt_sowrite_common(kn, so)); > +} > + > +int > +filt_sowritemodify(struct kevent *kev, struct knote *kn) > +{ > + struct socket *so = kn->kn_fp->f_data; > + int rv, s; > + > + s = solock(so); > + knote_modify(kev, kn); > + rv = filt_sowrite_common(kn, so); > + sounlock(so, s); > + > + return (rv); > +} > + > +int > +filt_sowriteprocess(struct knote *kn, struct kevent *kev) > +{ > + struct socket *so = kn->kn_fp->f_data; > + int rv, s; > + > + s = solock(so); > + rv = filt_sowrite_common(kn, so); > + if (rv != 0) > + knote_submit(kn, kev); > + sounlock(so, s); > + > + return (rv); > +} > + > +int > +filt_solisten_common(struct knote *kn, struct socket *so) > +{ > + soassertlocked(so); > + > kn->kn_data = so->so_qlen; > - if ((hint & NOTE_SUBMIT) == 0) > - sounlock(so, s); > > return (kn->kn_data != 0); > } > > +int > +filt_solisten(struct knote *kn, long hint) > +{ > + struct socket *so = kn->kn_fp->f_data; > + > + return (filt_solisten_common(kn, so)); > +} > + > +int > +filt_solistenmodify(struct kevent *kev, struct knote *kn) > +{ > + struct socket *so = kn->kn_fp->f_data; > + int rv, s; > + > + s = solock(so); > + knote_modify(kev, kn); > + rv = filt_solisten_common(kn, so); > + sounlock(so, s); > + > + return (rv); > +} > + > +int > +filt_solistenprocess(struct knote *kn, struct kevent *kev) > +{ > + struct socket *so = kn->kn_fp->f_data; > + int rv, s; > + > + s = solock(so); > + rv = filt_solisten_common(kn, so); > + if (rv != 0) > + knote_submit(kn, kev); > + sounlock(so, s); > + > + return (rv); > +} > + > #ifdef DDB > void > sobuf_print(struct sockbuf *, > Index: kern/uipc_syscalls.c > =================================================================== > RCS file: src/sys/kern/uipc_syscalls.c,v > retrieving revision 1.190 > diff -u -p -r1.190 uipc_syscalls.c > --- kern/uipc_syscalls.c 13 May 2021 17:31:59 -0000 1.190 > +++ kern/uipc_syscalls.c 18 May 2021 12:56:24 -0000 > @@ -308,7 +308,7 @@ doaccept(struct proc *p, int sock, struc > : (flags & SOCK_NONBLOCK ? FNONBLOCK : 0); > > /* connection has been removed from the listen queue */ > - KNOTE(&head->so_rcv.sb_sel.si_note, NOTE_SUBMIT); > + KNOTE(&head->so_rcv.sb_sel.si_note, 0); > > fp->f_type = DTYPE_SOCKET; > fp->f_flag = FREAD | FWRITE | nflag; >