The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=b11289f87123f8ae06fc70bc70d26a25d4356a65

commit b11289f87123f8ae06fc70bc70d26a25d4356a65
Author:     Konstantin Belousov <[email protected]>
AuthorDate: 2025-08-19 04:34:04 +0000
Commit:     Konstantin Belousov <[email protected]>
CommitDate: 2025-10-18 05:12:36 +0000

    kqueuex(2): add KQUEUE_CPONFORK
    
    The created kqueue is copied on fork, together with the registered
    events. This means that a new kqueue is created at the same fd index
    as the parent's kqueue, and all registered events are copied into the
    new kqueue (when possible). The current active events list is also
    duplicated.
    
    Reviewed by:    markj
    Tested by:      pho
    Sponsored by:   The FreeBSD Foundation
    MFC after:      2 weeks
    Differential revision:  https://reviews.freebsd.org/D52045
---
 sys/compat/linux/linux_event.c |   2 +-
 sys/kern/kern_event.c          | 174 ++++++++++++++++++++++++++++++++++++++---
 sys/sys/event.h                |   2 +
 sys/sys/eventvar.h             |   2 +
 sys/sys/syscallsubr.h          |   3 +-
 5 files changed, 168 insertions(+), 15 deletions(-)

diff --git a/sys/compat/linux/linux_event.c b/sys/compat/linux/linux_event.c
index e88791659f1f..fc3ef7c3e90a 100644
--- a/sys/compat/linux/linux_event.c
+++ b/sys/compat/linux/linux_event.c
@@ -104,7 +104,7 @@ static int
 epoll_create_common(struct thread *td, int flags)
 {
 
-       return (kern_kqueue(td, flags, NULL));
+       return (kern_kqueue(td, flags, false, NULL));
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index 80c289f7d802..1f3030995ec6 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -134,6 +134,7 @@ static fo_kqfilter_t        kqueue_kqfilter;
 static fo_stat_t       kqueue_stat;
 static fo_close_t      kqueue_close;
 static fo_fill_kinfo_t kqueue_fill_kinfo;
+static fo_fork_t       kqueue_fork;
 
 static const struct fileops kqueueops = {
        .fo_read = invfo_rdwr,
@@ -148,7 +149,9 @@ static const struct fileops kqueueops = {
        .fo_chown = invfo_chown,
        .fo_sendfile = invfo_sendfile,
        .fo_cmp = file_kcmp_generic,
+       .fo_fork = kqueue_fork,
        .fo_fill_kinfo = kqueue_fill_kinfo,
+       .fo_flags = DFLAG_FORK,
 };
 
 static int     knote_attach(struct knote *kn, struct kqueue *kq);
@@ -1151,7 +1154,7 @@ int
 sys_kqueue(struct thread *td, struct kqueue_args *uap)
 {
 
-       return (kern_kqueue(td, 0, NULL));
+       return (kern_kqueue(td, 0, false, NULL));
 }
 
 int
@@ -1159,27 +1162,30 @@ sys_kqueuex(struct thread *td, struct kqueuex_args *uap)
 {
        int flags;
 
-       if ((uap->flags & ~(KQUEUE_CLOEXEC)) != 0)
+       if ((uap->flags & ~(KQUEUE_CLOEXEC | KQUEUE_CPONFORK)) != 0)
                return (EINVAL);
        flags = 0;
        if ((uap->flags & KQUEUE_CLOEXEC) != 0)
                flags |= O_CLOEXEC;
-       return (kern_kqueue(td, flags, NULL));
+       return (kern_kqueue(td, flags, (uap->flags & KQUEUE_CPONFORK) != 0,
+           NULL));
 }
 
 static void
-kqueue_init(struct kqueue *kq)
+kqueue_init(struct kqueue *kq, bool cponfork)
 {
 
        mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
        TAILQ_INIT(&kq->kq_head);
        knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
        TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
+       if (cponfork)
+               kq->kq_state |= KQ_CPONFORK;
 }
 
 static int
 kern_kqueue_alloc(struct thread *td, struct filedesc *fdp, int *fdip,
-    struct file **fpp, int flags, struct filecaps *fcaps,
+    struct file **fpp, int flags, struct filecaps *fcaps, bool cponfork,
     struct kqueue **kqp)
 {
        struct ucred *cred;
@@ -1191,7 +1197,7 @@ kern_kqueue_alloc(struct thread *td, struct filedesc *fdp, int *fdip,
                return (ENOMEM);
 
        error = fdip != NULL ? falloc_caps(td, fpp, fdip, flags, fcaps) :
-           _falloc_noinstall(td, fpp, 2);
+           _falloc_noinstall(td, fpp, 1);
        if (error != 0) {
                chgkqcnt(cred->cr_ruidinfo, -1, 0);
                return (error);
@@ -1199,31 +1205,33 @@ kern_kqueue_alloc(struct thread *td, struct filedesc *fdp, int *fdip,
 
        /* An extra reference on `fp' has been held for us by falloc(). */
        kq = malloc(sizeof(*kq), M_KQUEUE, M_WAITOK | M_ZERO);
-       kqueue_init(kq);
+       kqueue_init(kq, cponfork);
        kq->kq_fdp = fdp;
        kq->kq_cred = crhold(cred);
 
-       FILEDESC_XLOCK(fdp);
+       if (fdip != NULL)
+               FILEDESC_XLOCK(fdp);
        TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
-       FILEDESC_XUNLOCK(fdp);
+       if (fdip != NULL)
+               FILEDESC_XUNLOCK(fdp);
 
+       finit(*fpp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
        *kqp = kq;
        return (0);
 }
 
 int
-kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
+kern_kqueue(struct thread *td, int flags, bool cponfork, struct filecaps *fcaps)
 {
        struct kqueue *kq;
        struct file *fp;
        int fd, error;
 
        error = kern_kqueue_alloc(td, td->td_proc->p_fd, &fd, &fp, flags,
-           fcaps, &kq);
+           fcaps, cponfork, &kq);
        if (error != 0)
                return (error);
 
-       finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
        fdrop(fp, td);
 
        td->td_retval[0] = fd;
@@ -1504,7 +1512,7 @@ kern_kevent_anonymous(struct thread *td, int nevents,
        struct kqueue kq = {};
        int error;
 
-       kqueue_init(&kq);
+       kqueue_init(&kq, false);
        kq.kq_refcnt = 1;
        error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
        kqueue_drain(&kq, td);
@@ -2958,6 +2966,146 @@ noacquire:
        return (error);
 }
 
+static int
+kqueue_fork_alloc(struct filedesc *fdp, struct file *fp, struct file **fp1,
+    struct thread *td)
+{
+       struct kqueue *kq, *kq1;
+       int error;
+
+       MPASS(fp->f_type == DTYPE_KQUEUE);
+       kq = fp->f_data;
+       if ((kq->kq_state & KQ_CPONFORK) == 0)
+               return (EOPNOTSUPP);
+       error = kqueue_acquire_ref(kq);
+       if (error != 0)
+               return (error);
+       error = kern_kqueue_alloc(td, fdp, NULL, fp1, 0, NULL, true, &kq1);
+       if (error == 0) {
+               kq1->kq_forksrc = kq;
+               (*fp1)->f_flag = fp->f_flag & (FREAD | FWRITE | FEXEC |
+                   O_CLOEXEC | O_CLOFORK);
+       } else {
+               kqueue_release(kq, 0);
+       }
+       return (error);
+}
+
+static void
+kqueue_fork_copy_knote(struct kqueue *kq1, struct knote *kn, struct proc *p1,
+    struct filedesc *fdp)
+{
+       struct knote *kn1;
+       const struct filterops *fop;
+       int error;
+
+       fop = kn->kn_fop;
+       if (fop->f_copy == NULL || (fop->f_isfd &&
+           fdp->fd_files->fdt_ofiles[kn->kn_kevent.ident].fde_file == NULL))
+               return;
+       error = kqueue_expand(kq1, fop, kn->kn_kevent.ident, M_WAITOK);
+       if (error != 0)
+               return;
+
+       kn1 = knote_alloc(M_WAITOK);
+       *kn1 = *kn;
+       kn1->kn_status |= KN_DETACHED;
+       kn1->kn_status &= ~KN_QUEUED;
+       kn1->kn_kq = kq1;
+       error = fop->f_copy(kn1, p1);
+       if (error != 0) {
+               knote_free(kn1);
+               return;
+       }
+       (void)kqueue_fo_find(kn->kn_kevent.filter);
+       if (fop->f_isfd && !fhold(kn1->kn_fp)) {
+               fop->f_detach(kn1);
+               kqueue_fo_release(kn->kn_kevent.filter);
+               knote_free(kn1);
+               return;
+       }
+       if (kn->kn_knlist != NULL)
+               knlist_add(kn->kn_knlist, kn1, 0);
+       KQ_LOCK(kq1);
+       knote_attach(kn1, kq1);
+       kn1->kn_influx = 0;
+       if ((kn->kn_status & KN_QUEUED) != 0)
+               knote_enqueue(kn1);
+       KQ_UNLOCK(kq1);
+}
+
+static void
+kqueue_fork_copy_list(struct klist *knlist, struct knote *marker,
+    struct kqueue *kq, struct kqueue *kq1, struct proc *p1,
+    struct filedesc *fdp)
+{
+       struct knote *kn;
+
+       KQ_OWNED(kq);
+       kn = SLIST_FIRST(knlist);
+       while (kn != NULL) {
+               if ((kn->kn_status & KN_DETACHED) != 0 ||
+                   (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0)) {
+                       kn = SLIST_NEXT(kn, kn_link);
+                       continue;
+               }
+               kn_enter_flux(kn);
+               SLIST_INSERT_AFTER(kn, marker, kn_link);
+               KQ_UNLOCK(kq);
+               kqueue_fork_copy_knote(kq1, kn, p1, fdp);
+               KQ_LOCK(kq);
+               kn_leave_flux(kn);
+               kn = SLIST_NEXT(marker, kn_link);
+               /* XXXKIB switch kn_link to LIST? */
+               SLIST_REMOVE(knlist, marker, knote, kn_link);
+       }
+}
+
+static int
+kqueue_fork_copy(struct filedesc *fdp, struct file *fp, struct file *fp1,
+    struct proc *p1, struct thread *td)
+{
+       struct kqueue *kq, *kq1;
+       struct knote *marker;
+       int error, i;
+
+       error = 0;
+       MPASS(fp == NULL);
+       MPASS(fp1->f_type == DTYPE_KQUEUE);
+
+       kq1 = fp1->f_data;
+       kq = kq1->kq_forksrc;
+       marker = knote_alloc(M_WAITOK);
+       marker->kn_status = KN_MARKER;
+
+       KQ_LOCK(kq);
+       for (i = 0; i < kq->kq_knlistsize; i++) {
+               kqueue_fork_copy_list(&kq->kq_knlist[i], marker, kq, kq1,
+                   p1, fdp);
+       }
+       if (kq->kq_knhashmask != 0) {
+               for (i = 0; i <= kq->kq_knhashmask; i++) {
+                       kqueue_fork_copy_list(&kq->kq_knhash[i], marker, kq,
+                           kq1, p1, fdp);
+               }
+       }
+       kqueue_release(kq, 1);
+       kq1->kq_forksrc = NULL;
+       KQ_UNLOCK(kq);
+
+       knote_free(marker);
+       return (error);
+}
+
+static int
+kqueue_fork(struct filedesc *fdp, struct file *fp, struct file **fp1,
+    struct proc *p1, struct thread *td)
+{
+       if (*fp1 == NULL)
+               return (kqueue_fork_alloc(fdp, fp, fp1, td));
+       return (kqueue_fork_copy(fdp, fp, *fp1, p1, td));
+}
+
 struct knote_status_export_bit {
        int kn_status_bit;
        int knt_status_bit;
diff --git a/sys/sys/event.h b/sys/sys/event.h
index 084eaafcbdc0..6e71445f03b0 100644
--- a/sys/sys/event.h
+++ b/sys/sys/event.h
@@ -228,6 +228,7 @@ struct freebsd11_kevent32 {
 
 /* Flags for kqueuex(2) */
 #define        KQUEUE_CLOEXEC  0x00000001      /* close on exec */
+#define        KQUEUE_CPONFORK 0x00000002      /* copy on fork */
 
 struct knote;
 SLIST_HEAD(klist, knote);
@@ -283,6 +284,7 @@ struct filterops {
        void    (*f_touch)(struct knote *kn, struct kevent *kev, u_long type);
        int     (*f_userdump)(struct proc *p, struct knote *kn,
                    struct kinfo_knote *kin);
+       int     (*f_copy)(struct knote *kn, struct proc *p1);
 };
 
 /*
diff --git a/sys/sys/eventvar.h b/sys/sys/eventvar.h
index 7fec444447f9..7cb3269f1fdf 100644
--- a/sys/sys/eventvar.h
+++ b/sys/sys/eventvar.h
@@ -55,12 +55,14 @@ struct kqueue {
 #define KQ_CLOSING     0x10
 #define        KQ_TASKSCHED    0x20                    /* task scheduled */
 #define        KQ_TASKDRAIN    0x40                    /* waiting for task to drain */
+#define        KQ_CPONFORK     0x80
        int             kq_knlistsize;          /* size of knlist */
        struct          klist *kq_knlist;       /* list of knotes */
        u_long          kq_knhashmask;          /* size of knhash */
        struct          klist *kq_knhash;       /* hash table for knotes */
        struct          task kq_task;
        struct          ucred *kq_cred;
+       struct          kqueue *kq_forksrc;
 };
 
 #endif /* !_SYS_EVENTVAR_H_ */
diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h
index 8237165b84ce..d32690634059 100644
--- a/sys/sys/syscallsubr.h
+++ b/sys/sys/syscallsubr.h
@@ -211,7 +211,8 @@ int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges,
            int nevents, struct kevent_copyops *k_ops,
            const struct timespec *timeout);
 int    kern_kill(struct thread *td, pid_t pid, int signum);
-int    kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps);
+int    kern_kqueue(struct thread *td, int flags, bool cponfork,
+           struct filecaps *fcaps);
 int    kern_kldload(struct thread *td, const char *file, int *fileid);
 int    kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat);
 int    kern_kldunload(struct thread *td, int fileid, int flags);

Reply via email to