Module Name: src Committed By: christos Date: Sat Oct 31 01:08:32 UTC 2020
Modified Files: src/lib/libc/sys: kqueue.2 src/sys/kern: kern_event.c src/sys/sys: event.h src/tests/lib/libc/sys: t_kevent.c Log Message: PR/55663: Ruslan Nikolaev: Add support for EVFILT_USER in kqueue(2) To generate a diff of this commit: cvs rdiff -u -r1.50 -r1.51 src/lib/libc/sys/kqueue.2 cvs rdiff -u -r1.107 -r1.108 src/sys/kern/kern_event.c cvs rdiff -u -r1.38 -r1.39 src/sys/sys/event.h cvs rdiff -u -r1.8 -r1.9 src/tests/lib/libc/sys/t_kevent.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/lib/libc/sys/kqueue.2 diff -u src/lib/libc/sys/kqueue.2:1.50 src/lib/libc/sys/kqueue.2:1.51 --- src/lib/libc/sys/kqueue.2:1.50 Sun Dec 22 20:46:09 2019 +++ src/lib/libc/sys/kqueue.2 Fri Oct 30 21:08:31 2020 @@ -1,4 +1,4 @@ -.\" $NetBSD: kqueue.2,v 1.50 2019/12/23 01:46:09 kamil Exp $ +.\" $NetBSD: kqueue.2,v 1.51 2020/10/31 01:08:31 christos Exp $ .\" .\" Copyright (c) 2000 Jonathan Lemon .\" All rights reserved. @@ -32,7 +32,7 @@ .\" .\" $FreeBSD: src/lib/libc/sys/kqueue.2,v 1.22 2001/06/27 19:55:57 dd Exp $ .\" -.Dd December 22, 2019 +.Dd October 30, 2020 .Dt KQUEUE 2 .Os .Sh NAME @@ -54,7 +54,9 @@ .Fn kevent "int kq" "const struct kevent *changelist" "size_t nchanges" "struct kevent *eventlist" "size_t nevents" "const struct timespec *timeout" .Fn EV_SET "&kev" ident filter flags fflags data udata .Sh DESCRIPTION +The .Fn kqueue +system call provides a generic method of notifying the user when an event happens or a condition holds, based on the results of small pieces of kernel code termed filters. @@ -80,12 +82,14 @@ Calling .Xr close 2 on a file descriptor will remove any kevents that reference the descriptor. .Pp +The .Fn kqueue +system call creates a new kernel event queue and returns a descriptor. .Pp The .Fn kqueue1 -function also allows to set the following +system call also allows to set the following .Fa flags on the returned file descriptor: .Bl -column O_NONBLOCK -offset indent @@ -109,10 +113,14 @@ The queue is not inherited by a child cr .\" flag, then the descriptor table is shared, .\" which will allow sharing of the kqueue between two processes. .Pp +The .Fn kevent +system call is used to register events with the queue, and return any pending events to the user. +The .Fa changelist +argument is a pointer to an array of .Va kevent structures, as defined in @@ -120,14 +128,28 @@ structures, as defined in All changes contained in the .Fa changelist are applied before any pending events are read from the queue. 
+The .Fa nchanges +argument gives the size of .Fa changelist . +The .Fa eventlist +argument is a pointer to an array of kevent structures. +The .Fa nevents +argument determines the size of .Fa eventlist . +When +.Fa nevents +is zero, +.Fn kevent +will return immediately even if there is a +.Fa timeout +specified unlike +.Xr select 2 . If .Fa timeout is a @@ -154,8 +176,9 @@ The same array may be used for the and .Fa eventlist . .Pp +The .Fn EV_SET -is a macro which is provided for ease of initializing a kevent structure. +macro is provided for ease of initializing a kevent structure. This macro does not evaluate its parameters multiple times. .Pp The @@ -175,22 +198,22 @@ struct kevent { The fields of .Fa struct kevent are: -.Bl -tag -width XXXfilter -offset indent +.Bl -tag -width "Fa filter" -offset indent .It ident Value used to identify this event. The exact interpretation is determined by the attached filter, but often is a file descriptor. -.It filter +.It Fa filter Identifies the kernel filter used to process this event. There are pre-defined system filters (which are described below), and other filters may be added by kernel subsystems as necessary. -.It flags +.It Fa flags Actions to perform on the event. -.It fflags +.It Fa fflags Filter-specific flags. -.It data +.It Fa data Filter-specific data value. -.It udata +.It Fa udata Opaque user-defined value passed through the kernel unchanged. .El .Pp @@ -231,6 +254,11 @@ to always be returned. When a filter is successfully added the .Va data field will be zero. +Note that if this flag is encountered and there is no remaining space in +.Fa eventlist +to hold the +.Dv EV_ERROR +event, then subsequent changes will not get processed. .It Dv EV_ONESHOT Causes the event to return only the first occurrence of the filter being triggered. @@ -289,6 +317,7 @@ struct kfilter_mapping { }; .Ed .Pp +The predefined system filters are listed below. 
Arguments may be passed to and from the filter via the .Va fflags and @@ -339,6 +368,14 @@ Returns when the file pointer is not at .Va data contains the offset from current position to end of file, and may be negative. +.\" .Pp +.\" This behavior is different from +.\" .Xr poll 2 , +.\" where read events are triggered for regular files unconditionally. +.\" This event can be triggered unconditionally by setting the +.\" .Dv NOTE_FILE_POLL +.\" flag in +.\" .Va fflags . .It "Fifos, Pipes" Returns when there is data to read; .Va data @@ -349,6 +386,12 @@ When the last writer disconnects, the fi This may be cleared by passing in EV_CLEAR, at which point the filter will resume waiting for data to become available before returning. +.It "BPF devices" +Returns when the BPF buffer is full, the BPF timeout has expired, or +when the BPF has +.Dq immediate mode +enabled and there is any data to read; +.Va data .El .It Dv EVFILT_WRITE Takes a descriptor as the identifier, and returns whenever @@ -487,16 +530,68 @@ This filter automatically sets the EV_CL .It Dv EVFILT_FS Establishes a file system monitor. Currently it only monitors file system mount and unmount actions. +.It Dv EVFILT_USER +Establishes a user event identified by +.Va ident +which is not associated with any kernel mechanism but is triggered by +user level code. +The lower 24 bits of the +.Va fflags +may be used for user defined flags and manipulated using the following: +.Bl -tag -width "Dv NOTE_FFLAGSMASK" +.It Dv NOTE_FFNOP +Ignore the input +.Va fflags . +.It Dv NOTE_FFAND +Bitwise AND +.Va fflags . +.It Dv NOTE_FFOR +Bitwise OR +.Va fflags . +.It Dv NOTE_FFCOPY +Copy +.Va fflags . +.It Dv NOTE_FFCTRLMASK +Control mask for +.Va fflags . +.It Dv NOTE_FFLAGSMASK +User defined flag mask for +.Va fflags . +.El +.Pp +A user event is triggered for output with the following: +.Bl -tag -width "Dv NOTE_FFLAGSMASK" +.It Dv NOTE_TRIGGER +Cause the event to be triggered. 
+.El +.Pp +On return, +.Va fflags +contains the user-defined flags in the lower 24 bits. .El +.Sh CANCELLATION BEHAVIOUR +If +.Fa nevents +is non-zero, i.e., the function is potentially blocking, the call +is a cancellation point. +Otherwise, i.e., if +.Fa nevents +is zero, the call is not cancellable. +Cancellation can only occur before any changes are made to the kqueue, +or when the call was blocked and no changes to the queue were requested. .Sh RETURN VALUES +The .Fn kqueue +system call creates a new kernel event queue and returns a file descriptor. If there was an error creating the kernel event queue, a value of \-1 is returned and .Dv errno is set. .Pp +The .Fn kevent +system call returns the number of events placed in the .Fa eventlist , up to the value given by Index: src/sys/kern/kern_event.c diff -u src/sys/kern/kern_event.c:1.107 src/sys/kern/kern_event.c:1.108 --- src/sys/kern/kern_event.c:1.107 Sat May 23 19:42:43 2020 +++ src/sys/kern/kern_event.c Fri Oct 30 21:08:32 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: kern_event.c,v 1.107 2020/05/23 23:42:43 ad Exp $ */ +/* $NetBSD: kern_event.c,v 1.108 2020/10/31 01:08:32 christos Exp $ */ /*- * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. @@ -31,6 +31,7 @@ /*- * Copyright (c) 1999,2000,2001 Jonathan Lemon <jle...@freebsd.org> + * Copyright (c) 2009 Apple, Inc * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -58,7 +59,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.107 2020/05/23 23:42:43 ad Exp $"); +__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.108 2020/10/31 01:08:32 christos Exp $"); #include <sys/param.h> #include <sys/systm.h> @@ -109,6 +110,10 @@ static int filt_timer(struct knote *, lo static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fs(struct knote *kn, long hint); +static int filt_userattach(struct knote *); +static void filt_userdetach(struct knote *); +static int filt_user(struct knote *, long hint); +static void filt_usertouch(struct knote *, struct kevent *, long type); static const struct fileops kqueueops = { .fo_name = "kqueue", @@ -158,6 +163,14 @@ static const struct filterops fs_filtops .f_event = filt_fs, }; +static const struct filterops user_filtops = { + .f_isfd = 0, + .f_attach = filt_userattach, + .f_detach = filt_userdetach, + .f_event = filt_user, + .f_touch = filt_usertouch, +}; + static u_int kq_ncallouts = 0; static int kq_calloutmax = (4 * 1024); @@ -192,6 +205,7 @@ static struct kfilter sys_kfilters[] = { { "EVFILT_SIGNAL", EVFILT_SIGNAL, 0, &sig_filtops, 0 }, { "EVFILT_TIMER", EVFILT_TIMER, 0, &timer_filtops, 0 }, { "EVFILT_FS", EVFILT_FS, 0, &fs_filtops, 0 }, + { "EVFILT_USER", EVFILT_USER, 0, &user_filtops, 0 }, { NULL, 0, 0, NULL, 0 }, }; @@ -776,6 +790,106 @@ filt_fs(struct knote *kn, long hint) return rv; } +static int +filt_userattach(struct knote *kn) +{ + struct kqueue *kq = kn->kn_kq; + + /* + * EVFILT_USER knotes are not attached to anything in the kernel. 
+ */ + mutex_spin_enter(&kq->kq_lock); + kn->kn_hook = NULL; + if (kn->kn_fflags & NOTE_TRIGGER) + kn->kn_hookid = 1; + else + kn->kn_hookid = 0; + mutex_spin_exit(&kq->kq_lock); + return (0); +} + +static void +filt_userdetach(struct knote *kn) +{ + + /* + * EVFILT_USER knotes are not attached to anything in the kernel. + */ +} + +static int +filt_user(struct knote *kn, long hint) +{ + struct kqueue *kq = kn->kn_kq; + int hookid; + + mutex_spin_enter(&kq->kq_lock); + hookid = kn->kn_hookid; + mutex_spin_exit(&kq->kq_lock); + + return hookid; +} + +static void +filt_usertouch(struct knote *kn, struct kevent *kev, long type) +{ + struct kqueue *kq = kn->kn_kq; + int ffctrl; + + mutex_spin_enter(&kq->kq_lock); + switch (type) { + case EVENT_REGISTER: + if (kev->fflags & NOTE_TRIGGER) + kn->kn_hookid = 1; + + ffctrl = kev->fflags & NOTE_FFCTRLMASK; + kev->fflags &= NOTE_FFLAGSMASK; + switch (ffctrl) { + case NOTE_FFNOP: + break; + + case NOTE_FFAND: + kn->kn_sfflags &= kev->fflags; + break; + + case NOTE_FFOR: + kn->kn_sfflags |= kev->fflags; + break; + + case NOTE_FFCOPY: + kn->kn_sfflags = kev->fflags; + break; + + default: + /* XXX Return error? 
*/ + break; + } + kn->kn_sdata = kev->data; + if (kev->flags & EV_CLEAR) { + kn->kn_hookid = 0; + kn->kn_data = 0; + kn->kn_fflags = 0; + } + break; + + case EVENT_PROCESS: + *kev = kn->kn_kevent; + kev->fflags = kn->kn_sfflags; + kev->data = kn->kn_sdata; + if (kn->kn_flags & EV_CLEAR) { + kn->kn_hookid = 0; + kn->kn_data = 0; + kn->kn_fflags = 0; + } + break; + + default: + panic("filt_usertouch() - invalid type (%ld)", type); + break; + } + mutex_spin_exit(&kq->kq_lock); +} + /* * filt_seltrue: * @@ -809,6 +923,7 @@ const struct filterops seltrue_filtops = .f_attach = NULL, .f_detach = filt_seltruedetach, .f_event = filt_seltrue, + .f_touch = NULL, }; int @@ -1072,8 +1187,8 @@ kqueue_register(struct kqueue *kq, struc /* * kn now contains the matching knote, or NULL if no match */ - if (kev->flags & EV_ADD) { - if (kn == NULL) { + if (kn == NULL) { + if (kev->flags & EV_ADD) { /* create new knote */ kn = newkn; newkn = NULL; @@ -1137,41 +1252,51 @@ kqueue_register(struct kqueue *kq, struc goto done; } atomic_inc_uint(&kfilter->refcnt); + goto done_ev_add; } else { - /* - * The user may change some filter values after the - * initial EV_ADD, but doing so will not reset any - * filter which have already been triggered. - */ - kn->kn_sfflags = kev->fflags; - kn->kn_sdata = kev->data; - kn->kn_kevent.udata = kev->udata; + /* No matching knote and the EV_ADD flag is not set. */ + error = ENOENT; + goto doneunlock; } - /* - * We can get here if we are trying to attach - * an event to a file descriptor that does not - * support events, and the attach routine is - * broken and does not return an error. - */ - KASSERT(kn->kn_fop != NULL); - KASSERT(kn->kn_fop->f_event != NULL); + } + + if (kev->flags & EV_DELETE) { + /* knote_detach() drops fdp->fd_lock */ + knote_detach(kn, fdp, true); + goto done; + } + + /* + * The user may change some filter values after the + * initial EV_ADD, but doing so will not reset any + * filter which have already been triggered. 
+ */ + kn->kn_kevent.udata = kev->udata; + KASSERT(kn->kn_fop != NULL); + if (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL) { KERNEL_LOCK(1, NULL); /* XXXSMP */ - rv = (*kn->kn_fop->f_event)(kn, 0); + (*kn->kn_fop->f_touch)(kn, kev, EVENT_REGISTER); KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */ - if (rv) - knote_activate(kn); } else { - if (kn == NULL) { - error = ENOENT; - goto doneunlock; - } - if (kev->flags & EV_DELETE) { - /* knote_detach() drops fdp->fd_lock */ - knote_detach(kn, fdp, true); - goto done; - } + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; } + /* + * We can get here if we are trying to attach + * an event to a file descriptor that does not + * support events, and the attach routine is + * broken and does not return an error. + */ +done_ev_add: + KASSERT(kn->kn_fop != NULL); + KASSERT(kn->kn_fop->f_event != NULL); + KERNEL_LOCK(1, NULL); /* XXXSMP */ + rv = (*kn->kn_fop->f_event)(kn, 0); + KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */ + if (rv) + knote_activate(kn); + /* disable knote */ if ((kev->flags & EV_DISABLE)) { mutex_spin_enter(&kq->kq_lock); @@ -1271,7 +1396,7 @@ kqueue_scan(file_t *fp, size_t maxevents struct timespec ats, sleepts; struct knote *kn, *marker, morker; size_t count, nkev, nevents; - int timeout, error, rv; + int timeout, error, touch, rv; filedesc_t *fdp; fdp = curlwp->l_fd; @@ -1382,8 +1507,20 @@ kqueue_scan(file_t *fp, size_t maxevents continue; } } + KASSERT(kn->kn_fop != NULL); + touch = (!kn->kn_fop->f_isfd && + kn->kn_fop->f_touch != NULL); /* XXXAD should be got from f_event if !oneshot. 
*/ - *kevp++ = kn->kn_kevent; + if (touch) { + mutex_spin_exit(&kq->kq_lock); + KERNEL_LOCK(1, NULL); /* XXXSMP */ + (*kn->kn_fop->f_touch)(kn, kevp, EVENT_PROCESS); + KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */ + mutex_spin_enter(&kq->kq_lock); + } else { + *kevp = kn->kn_kevent; + } + kevp++; nkev++; if (kn->kn_flags & EV_ONESHOT) { /* delete ONESHOT events after retrieval */ @@ -1396,6 +1533,14 @@ kqueue_scan(file_t *fp, size_t maxevents /* clear state after retrieval */ kn->kn_data = 0; kn->kn_fflags = 0; + /* + * Manually clear knotes who weren't + * 'touch'ed. + */ + if (touch == 0) { + kn->kn_data = 0; + kn->kn_fflags = 0; + } kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_BUSY); } else if (kn->kn_flags & EV_DISPATCH) { kn->kn_status |= KN_DISABLED; Index: src/sys/sys/event.h diff -u src/sys/sys/event.h:1.38 src/sys/sys/event.h:1.39 --- src/sys/sys/event.h:1.38 Thu Oct 3 18:16:52 2019 +++ src/sys/sys/event.h Fri Oct 30 21:08:32 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: event.h,v 1.38 2019/10/03 22:16:52 kamil Exp $ */ +/* $NetBSD: event.h,v 1.39 2020/10/31 01:08:32 christos Exp $ */ /*- * Copyright (c) 1999,2000,2001 Jonathan Lemon <jle...@freebsd.org> @@ -44,7 +44,8 @@ #define EVFILT_SIGNAL 5U /* attached to struct proc */ #define EVFILT_TIMER 6U /* arbitrary timer (in ms) */ #define EVFILT_FS 7U /* filesystem events */ -#define EVFILT_SYSCOUNT 8U /* number of filters */ +#define EVFILT_USER 8U /* user events */ +#define EVFILT_SYSCOUNT 9U /* number of filters */ struct kevent { uintptr_t ident; /* identifier for this event */ @@ -91,6 +92,25 @@ _EV_SET(struct kevent *_kevp, uintptr_t #define EV_ERROR 0x4000U /* error, data contains errno */ /* + * data/hint flags/masks for EVFILT_USER, shared with userspace + * + * On input, the top two bits of fflags specifies how the lower twenty four + * bits should be applied to the stored value of fflags. 
+ * + * On output, the top two bits will always be set to NOTE_FFNOP and the + * remaining twenty four bits will contain the stored fflags value. + */ +#define NOTE_FFNOP 0x00000000U /* ignore input fflags */ +#define NOTE_FFAND 0x40000000U /* AND fflags */ +#define NOTE_FFOR 0x80000000U /* OR fflags */ +#define NOTE_FFCOPY 0xc0000000U /* copy fflags */ + +#define NOTE_FFCTRLMASK 0xc0000000U /* masks for operations */ +#define NOTE_FFLAGSMASK 0x00ffffffU + +#define NOTE_TRIGGER 0x01000000U /* Cause the event to be + triggered for output. */ +/* * hint flag for in-kernel use - must not equal any existing note */ #ifdef _KERNEL @@ -162,6 +182,16 @@ struct kfilter_mapping { #define NOTE_SIGNAL 0x08000000U /* + * Hint values for the optional f_touch event filter. If f_touch is not set + * to NULL and f_isfd is zero the f_touch filter will be called with the type + * argument set to EVENT_REGISTER during a kevent() system call. It is also + * called under the same conditions with the type argument set to EVENT_PROCESS + * when the event has been triggered. + */ +#define EVENT_REGISTER 1 +#define EVENT_PROCESS 2 + +/* * Callback methods for each filter type. 
*/ struct filterops { @@ -172,6 +202,7 @@ struct filterops { /* called when knote is DELETEd */ int (*f_event) (struct knote *, long); /* called when event is triggered */ + void (*f_touch) (struct knote *, struct kevent *, long); }; /* @@ -196,6 +227,7 @@ struct knote { const struct filterops *kn_fop; struct kfilter *kn_kfilter; void *kn_hook; + int kn_hookid; #define KN_ACTIVE 0x01U /* event has been triggered */ #define KN_QUEUED 0x02U /* event is on queue */ Index: src/tests/lib/libc/sys/t_kevent.c diff -u src/tests/lib/libc/sys/t_kevent.c:1.8 src/tests/lib/libc/sys/t_kevent.c:1.9 --- src/tests/lib/libc/sys/t_kevent.c:1.8 Thu Jun 25 07:12:03 2020 +++ src/tests/lib/libc/sys/t_kevent.c Fri Oct 30 21:08:32 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: t_kevent.c,v 1.8 2020/06/25 11:12:03 jruoho Exp $ */ +/* $NetBSD: t_kevent.c,v 1.9 2020/10/31 01:08:32 christos Exp $ */ /*- * Copyright (c) 2011 The NetBSD Foundation, Inc. @@ -29,7 +29,7 @@ * POSSIBILITY OF SUCH DAMAGE. */ #include <sys/cdefs.h> -__RCSID("$NetBSD: t_kevent.c,v 1.8 2020/06/25 11:12:03 jruoho Exp $"); +__RCSID("$NetBSD: t_kevent.c,v 1.9 2020/10/31 01:08:32 christos Exp $"); #include <sys/types.h> #include <sys/event.h> @@ -172,8 +172,40 @@ ATF_TC_BODY(kqueue_unsupported_fd, tc) ATF_REQUIRE_ERRNO(EOPNOTSUPP, true); (void)close(fd); + (void)close(kq); } +ATF_TC(kqueue_EVFILT_USER); +ATF_TC_HEAD(kqueue_EVFILT_USER, tc) +{ + atf_tc_set_md_var(tc, "descr", "Checks usability of EVFILT_USER"); +} + +ATF_TC_BODY(kqueue_EVFILT_USER, tc) +{ + /* mqueue and semaphore use fnullop_kqueue also */ + int kq; + struct kevent ev, rev; + + ATF_REQUIRE((kq = kqueue()) != -1); + + EV_SET(&ev, 666, EVFILT_USER, EV_ADD | EV_ENABLE, 0, 0, 0); + ATF_REQUIRE(kevent(kq, &ev, 1, NULL, 0, NULL) == 0); + EV_SET(&ev, 666, EVFILT_USER, 0, NOTE_FFCOPY | NOTE_TRIGGER | 8, 0, 0); + ATF_REQUIRE(kevent(kq, &ev, 1, NULL, 0, NULL) == 0); + const struct timespec timeout = { + .tv_sec = 1, + .tv_nsec = 0, + }; + + ATF_REQUIRE(kevent(kq, NULL, 0, &rev, 
1, &timeout) == 1); + ATF_REQUIRE(rev.ident == 666); + ATF_REQUIRE(rev.filter == EVFILT_USER); + ATF_REQUIRE(rev.fflags == 8); + (void)close(kq); +} + + ATF_TP_ADD_TCS(tp) { @@ -181,6 +213,7 @@ ATF_TP_ADD_TCS(tp) ATF_TP_ADD_TC(tp, kevent_zerotimer); ATF_TP_ADD_TC(tp, kqueue_desc_passing); ATF_TP_ADD_TC(tp, kqueue_unsupported_fd); + ATF_TP_ADD_TC(tp, kqueue_EVFILT_USER); return atf_no_error(); }