The branch stable/15 has been updated by jamie: URL: https://cgit.FreeBSD.org/src/commit/?id=9d7f89ef26073bb56e6ec9c3370089067a71babc
commit 9d7f89ef26073bb56e6ec9c3370089067a71babc Author: Jamie Gritton <ja...@freebsd.org> AuthorDate: 2025-09-12 18:33:19 +0000 Commit: Jamie Gritton <ja...@freebsd.org> CommitDate: 2025-09-15 03:33:48 +0000 jaildesc: add kevent support Give jail descriptors the same kevent flags as jails. Also fix the event reporting in jails, where it was including data for events the user didn't ask for. (cherry picked from commit 66d8ffe3046ded1eb3f78599c6af8eb965482ef5) --- lib/libsys/kqueue.2 | 15 +++++- sys/kern/kern_event.c | 18 ++++--- sys/kern/kern_jail.c | 1 + sys/kern/kern_jaildesc.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++- sys/sys/event.h | 5 +- sys/sys/jaildesc.h | 4 ++ 6 files changed, 168 insertions(+), 13 deletions(-) diff --git a/lib/libsys/kqueue.2 b/lib/libsys/kqueue.2 index aafb5317c5e0..96c9b0222a37 100644 --- a/lib/libsys/kqueue.2 +++ b/lib/libsys/kqueue.2 @@ -22,7 +22,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd September 11, 2025 +.Dd September 12, 2025 .Dt KQUEUE 2 .Os .Sh NAME @@ -638,6 +638,19 @@ or .Dv NOTE_JAIL_CHILD event has been received since the last call to .Fn kevent . +.It Dv EVFILT_JAILDESC +Takes a jail descriptor returned by +.Xr jail_set 2 +or +.Xr jail_get 2 +as the identifier and the events to watch for in +.Va fflags , +and returns when the jail performs one or more of the requested events. +The events to monitor and the resulting +.Va fflags +are the same as those listed in +.Dv EVFILT_JAIL , +above. .It Dv EVFILT_TIMER Establishes an arbitrary timer identified by .Va ident . diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index 8d1ff313735b..57cbfb8a0361 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -51,6 +51,7 @@ #include <sys/filio.h> #include <sys/fcntl.h> #include <sys/jail.h> +#include <sys/jaildesc.h> #include <sys/kthread.h> #include <sys/selinfo.h> #include <sys/queue.h> @@ -376,6 +377,7 @@ static struct { [~EVFILT_SENDFILE] = { &null_filtops }, [~EVFILT_EMPTY] = { &file_filtops, 1 }, [~EVFILT_JAIL] = { &jail_filtops, 1 }, + [~EVFILT_JAILDESC] = { &file_filtops, 1 }, }; /* @@ -682,15 +684,15 @@ filt_jail(struct knote *kn, long hint) (u_int)hint & NOTE_JAIL_CTRLMASK; /* If the user is interested in this event, record it. */ - if (kn->kn_sfflags & event) + if (kn->kn_sfflags & event) { kn->kn_fflags |= event; - - /* Report the created jail id or attached process id. */ - if (event == NOTE_JAIL_CHILD || event == NOTE_JAIL_ATTACH) { - if (kn->kn_data != 0) - kn->kn_fflags |= NOTE_JAIL_MULTI; - kn->kn_data = (kn->kn_fflags & NOTE_JAIL_MULTI) ? 0U : - (u_int)hint & ~event; + /* Report the created jail id or attached process id. */ + if (event == NOTE_JAIL_CHILD || event == NOTE_JAIL_ATTACH) { + if (kn->kn_data != 0) + kn->kn_fflags |= NOTE_JAIL_MULTI; + kn->kn_data = (kn->kn_fflags & NOTE_JAIL_MULTI) ? 0U : + (u_int)hint & ~event; + } } /* Prison is gone, so flag the event as finished. */ diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index d90ccf4a04c8..43035dc009b3 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -5371,6 +5371,7 @@ prison_knote(struct prison *pr, long hint) if (!locked) mtx_lock(&pr->pr_mtx); KNOTE_LOCKED(pr->pr_klist, hint); + jaildesc_knote(pr, hint); if (!locked) mtx_unlock(&pr->pr_mtx); } diff --git a/sys/kern/kern_jaildesc.c b/sys/kern/kern_jaildesc.c index c9e80f5d8941..3f322b271400 100644 --- a/sys/kern/kern_jaildesc.c +++ b/sys/kern/kern_jaildesc.c @@ -36,6 +36,7 @@ #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mutex.h> +#include <sys/poll.h> #include <sys/priv.h> #include <sys/stat.h> #include <sys/sysproto.h> @@ -46,6 +47,8 @@ MALLOC_DEFINE(M_JAILDESC, "jaildesc", "jail descriptors"); +static fo_poll_t jaildesc_poll; +static fo_kqfilter_t jaildesc_kqfilter; static fo_stat_t jaildesc_stat; static fo_close_t jaildesc_close; static fo_fill_kinfo_t jaildesc_fill_kinfo; @@ -56,8 +59,8 @@ static struct fileops jaildesc_ops = { .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, - .fo_poll = invfo_poll, - .fo_kqfilter = invfo_kqfilter, + .fo_poll = jaildesc_poll, + .fo_kqfilter = jaildesc_kqfilter, .fo_stat = jaildesc_stat, .fo_close = jaildesc_close, .fo_chmod = invfo_chmod, @@ -135,6 +138,7 @@ jaildesc_alloc(struct thread *td, struct file **fpp, int *fdp, int owning) finit(fp, priv_check_cred(fp->f_cred, PRIV_JAIL_SET) == 0 ? FREAD | FWRITE : FREAD, DTYPE_JAILDESC, jd, &jaildesc_ops); JAILDESC_LOCK_INIT(jd); + knlist_init_mtx(&jd->jd_selinfo.si_note, &jd->jd_lock); if (owning) jd->jd_flags |= JDF_OWNING; *fpp = fp; @@ -176,6 +180,36 @@ jaildesc_prison_cleanup(struct prison *pr) } } +/* + * Pass a note to all listening kqueues. + */ +void +jaildesc_knote(struct prison *pr, long hint) +{ + struct jaildesc *jd; + int prison_locked; + + if (!LIST_EMPTY(&pr->pr_descs)) { + prison_locked = mtx_owned(&pr->pr_mtx); + if (!prison_locked) + prison_lock(pr); + LIST_FOREACH(jd, &pr->pr_descs, jd_list) { + JAILDESC_LOCK(jd); + if (hint == NOTE_JAIL_REMOVE) { + jd->jd_flags |= JDF_REMOVED; + if (jd->jd_flags & JDF_SELECTED) { + jd->jd_flags &= ~JDF_SELECTED; + selwakeup(&jd->jd_selinfo); + } + } + KNOTE_LOCKED(&jd->jd_selinfo.si_note, hint); + JAILDESC_UNLOCK(jd); + } + if (!prison_locked) + prison_unlock(pr); + } +} + static int jaildesc_close(struct file *fp, struct thread *td) { @@ -223,12 +257,112 @@ jaildesc_close(struct file *fp, struct thread *td) } prison_free(pr); } + knlist_destroy(&jd->jd_selinfo.si_note); JAILDESC_LOCK_DESTROY(jd); free(jd, M_JAILDESC); } return (0); } +static int +jaildesc_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + struct jaildesc *jd; + int revents; + + revents = 0; + jd = fp->f_data; + JAILDESC_LOCK(jd); + if (jd->jd_flags & JDF_REMOVED) + revents |= POLLHUP; + if (revents == 0) { + selrecord(td, &jd->jd_selinfo); + jd->jd_flags |= JDF_SELECTED; + } + JAILDESC_UNLOCK(jd); + return (revents); +} + +static void +jaildesc_kqops_detach(struct knote *kn) +{ + struct jaildesc *jd; + + jd = kn->kn_fp->f_data; + knlist_remove(&jd->jd_selinfo.si_note, kn, 0); +} + +static int +jaildesc_kqops_event(struct knote *kn, long hint) +{ + struct jaildesc *jd; + u_int event; + + jd = kn->kn_fp->f_data; + if (hint == 0) { + /* + * Initial test after registration. Generate a + * NOTE_JAIL_REMOVE in case the prison already died + * before registration. + */ + event = jd->jd_flags & JDF_REMOVED ? NOTE_JAIL_REMOVE : 0; + } else { + /* + * Mask off extra data. In the NOTE_JAIL_CHILD case, + * that's everything except the NOTE_JAIL_CHILD bit + * itself, since a JID is any positive integer. + */ + event = ((u_int)hint & NOTE_JAIL_CHILD) ? NOTE_JAIL_CHILD : + (u_int)hint & NOTE_JAIL_CTRLMASK; + } + + /* If the user is interested in this event, record it. */ + if (kn->kn_sfflags & event) { + kn->kn_fflags |= event; + /* Report the created jail id or attached process id. */ + if (event == NOTE_JAIL_CHILD || event == NOTE_JAIL_ATTACH) { + if (kn->kn_data != 0) + kn->kn_fflags |= NOTE_JAIL_MULTI; + kn->kn_data = (kn->kn_fflags & NOTE_JAIL_MULTI) ? 0U : + (u_int)hint & ~event; + } + } + + /* Prison is gone, so flag the event as finished. */ + if (event == NOTE_JAIL_REMOVE) { + kn->kn_flags |= EV_EOF | EV_ONESHOT; + if (kn->kn_fflags == 0) + kn->kn_flags |= EV_DROP; + return (1); + } + + return (kn->kn_fflags != 0); +} + +static const struct filterops jaildesc_kqops = { + .f_isfd = 1, + .f_detach = jaildesc_kqops_detach, + .f_event = jaildesc_kqops_event, +}; + +static int +jaildesc_kqfilter(struct file *fp, struct knote *kn) +{ + struct jaildesc *jd; + + jd = fp->f_data; + switch (kn->kn_filter) { + case EVFILT_JAILDESC: + kn->kn_fop = &jaildesc_kqops; + kn->kn_flags |= EV_CLEAR; + knlist_add(&jd->jd_selinfo.si_note, kn, 0); + return (0); + default: + return (EINVAL); + } +} + static int jaildesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { diff --git a/sys/sys/event.h b/sys/sys/event.h index 91fbaa4834f7..084eaafcbdc0 100644 --- a/sys/sys/event.h +++ b/sys/sys/event.h @@ -46,7 +46,8 @@ #define EVFILT_SENDFILE (-12) /* attached to sendfile requests */ #define EVFILT_EMPTY (-13) /* empty send socket buf */ #define EVFILT_JAIL (-14) /* attached to struct prison */ -#define EVFILT_SYSCOUNT 14 +#define EVFILT_JAILDESC (-15) /* attached to jail descriptors */ +#define EVFILT_SYSCOUNT 15 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L #define EV_SET(kevp_, a, b, c, d, e, f) do { \ @@ -210,7 +211,7 @@ struct freebsd11_kevent32 { #define NOTE_TRACKERR 0x00000002 /* could not track child */ #define NOTE_CHILD 0x00000004 /* am a child process */ -/* data/hint flags for EVFILT_JAIL */ +/* data/hint flags for EVFILT_JAIL and EVFILT_JAILDESC */ #define NOTE_JAIL_CHILD 0x80000000 /* child jail was created */ #define NOTE_JAIL_SET 0x40000000 /* jail was modified */ #define NOTE_JAIL_ATTACH 0x20000000 /* jail was attached to */ diff --git a/sys/sys/jaildesc.h b/sys/sys/jaildesc.h index 2451b04f7302..fda270d62e70 100644 --- a/sys/sys/jaildesc.h +++ b/sys/sys/jaildesc.h @@ -35,6 +35,7 @@ #ifdef _KERNEL #include <sys/queue.h> +#include <sys/selinfo.h> #include <sys/_lock.h> #include <sys/_mutex.h> #include <sys/_types.h> @@ -54,6 +55,7 @@ struct jaildesc { LIST_ENTRY(jaildesc) jd_list; /* (d,p) this prison's descs */ struct prison *jd_prison; /* (d) the prison */ struct mtx jd_lock; + struct selinfo jd_selinfo; /* (d) event notification */ unsigned jd_flags; /* (d) JDF_* flags */ }; @@ -69,6 +71,7 @@ struct jaildesc { /* * Flags for the jd_flags field */ +#define JDF_SELECTED 0x00000001 /* issue selwakeup() */ #define JDF_REMOVED 0x00000002 /* jail was removed */ #define JDF_OWNING 0x00000004 /* closing descriptor removes jail */ @@ -77,6 +80,7 @@ int jaildesc_find(struct thread *td, int fd, struct prison **prp, int jaildesc_alloc(struct thread *td, struct file **fpp, int *fdp, int owning); void jaildesc_set_prison(struct file *jd, struct prison *pr); void jaildesc_prison_cleanup(struct prison *pr); +void jaildesc_knote(struct prison *pr, long hint); #endif /* _KERNEL */