On Thu, Aug 24, 2023 at 08:29:48PM +0000, Warner Losh wrote:
> The branch main has been updated by imp:
> 
> URL: 
> https://cgit.FreeBSD.org/src/commit/?id=af93fea710385b2b11f0cabd377e7ed6f3d97c34
> 
> commit af93fea710385b2b11f0cabd377e7ed6f3d97c34
> Author:     Jake Freeland <jf...@freebsd.org>
> AuthorDate: 2023-08-24 04:39:54 +0000
> Commit:     Warner Losh <i...@freebsd.org>
> CommitDate: 2023-08-24 20:28:56 +0000
> 
>     timerfd: Move implementation from linux compat to sys/kern
>     
>     Move the timerfd implementation from linux compat code to sys/kern. Use
>     it to implement the new system calls for timerfd. Add a hook to kern_tc
>     to allow timerfd to know when the system time has stepped. Add kqueue
>     support to timerfd. Adjust a few names to be less Linux-centric.
>     
>     RelNotes: YES
>     Reviewed by: markj (on irc), imp, kib (with reservations), jhb (slack)
>     Differential Revision: https://reviews.freebsd.org/D38459
> ---
>  lib/libc/sys/Symbol.map                        |   3 +
>  sys/bsm/audit_kevents.h                        |   1 +
>  sys/compat/freebsd32/freebsd32_proto.h         |  14 +
>  sys/compat/freebsd32/freebsd32_syscall.h       |   5 +-
>  sys/compat/freebsd32/freebsd32_syscalls.c      |   3 +
>  sys/compat/freebsd32/freebsd32_sysent.c        |   3 +
>  sys/compat/freebsd32/freebsd32_systrace_args.c |  86 ++++
>  sys/compat/linux/linux_event.c                 | 443 ++---------------
>  sys/compat/linux/linux_event.h                 |  11 -
>  sys/conf/files                                 |   1 +
>  sys/kern/init_sysent.c                         |   3 +
>  sys/kern/kern_descrip.c                        |   4 +-
>  sys/kern/kern_tc.c                             |   2 +
>  sys/kern/sys_timerfd.c                         | 632 
> +++++++++++++++++++++++++
>  sys/kern/syscalls.c                            |   3 +
>  sys/kern/syscalls.master                       |  20 +
>  sys/kern/systrace_args.c                       |  86 ++++
>  sys/sys/file.h                                 |   2 +-
>  sys/sys/syscall.h                              |   5 +-
>  sys/sys/syscall.mk                             |   5 +-
>  sys/sys/sysproto.h                             |  20 +
>  sys/sys/timerfd.h                              |  66 +++
>  sys/sys/user.h                                 |   6 +
>  23 files changed, 999 insertions(+), 425 deletions(-)
> 
> diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
> index 9a07bb457eb8..7937661e3787 100644
> --- a/lib/libc/sys/Symbol.map
> +++ b/lib/libc/sys/Symbol.map
> @@ -421,6 +421,9 @@ FBSD_1.7 {
>       kqueuex;
>       membarrier;
>       swapoff;
> +     timerfd_create;
> +     timerfd_gettime;
> +     timerfd_settime;
>  };
>  
>  FBSDprivate_1.0 {
> diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h
> index a6b50a67ee6a..d06381837aad 100644
> --- a/sys/bsm/audit_kevents.h
> +++ b/sys/bsm/audit_kevents.h
> @@ -661,6 +661,7 @@
>  #define      AUE_AIO_WRITEV          43267   /* FreeBSD-specific. */
>  #define      AUE_AIO_READV           43268   /* FreeBSD-specific. */
>  #define      AUE_FSPACECTL           43269   /* FreeBSD-specific. */
> +#define      AUE_TIMERFD             43270   /* FreeBSD/Linux. */
>  
>  /*
>   * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the
> diff --git a/sys/compat/freebsd32/freebsd32_proto.h 
> b/sys/compat/freebsd32/freebsd32_proto.h
> index bb333e0321a0..50448b6dce16 100644
> --- a/sys/compat/freebsd32/freebsd32_proto.h
> +++ b/sys/compat/freebsd32/freebsd32_proto.h
> @@ -684,6 +684,16 @@ struct freebsd32_aio_writev_args {
>  struct freebsd32_aio_readv_args {
>       char aiocbp_l_[PADL_(struct aiocb32 *)]; struct aiocb32 * aiocbp; char 
> aiocbp_r_[PADR_(struct aiocb32 *)];
>  };
> +struct freebsd32_timerfd_gettime_args {
> +     char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
> +     char curr_value_l_[PADL_(struct itimerspec32 *)]; struct itimerspec32 * 
> curr_value; char curr_value_r_[PADR_(struct itimerspec32 *)];
> +};
> +struct freebsd32_timerfd_settime_args {
> +     char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
> +     char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)];
> +     char new_value_l_[PADL_(const struct itimerspec32 *)]; const struct 
> itimerspec32 * new_value; char new_value_r_[PADR_(const struct itimerspec32 
> *)];
> +     char old_value_l_[PADL_(struct itimerspec32 *)]; struct itimerspec32 * 
> old_value; char old_value_r_[PADR_(struct itimerspec32 *)];
> +};
>  int  freebsd32_wait4(struct thread *, struct freebsd32_wait4_args *);
>  int  freebsd32_ptrace(struct thread *, struct freebsd32_ptrace_args *);
>  int  freebsd32_recvmsg(struct thread *, struct freebsd32_recvmsg_args *);
> @@ -799,6 +809,8 @@ int       freebsd32_cpuset_setdomain(struct thread *, 
> struct freebsd32_cpuset_setdomai
>  int  freebsd32___sysctlbyname(struct thread *, struct 
> freebsd32___sysctlbyname_args *);
>  int  freebsd32_aio_writev(struct thread *, struct freebsd32_aio_writev_args 
> *);
>  int  freebsd32_aio_readv(struct thread *, struct freebsd32_aio_readv_args *);
> +int  freebsd32_timerfd_gettime(struct thread *, struct 
> freebsd32_timerfd_gettime_args *);
> +int  freebsd32_timerfd_settime(struct thread *, struct 
> freebsd32_timerfd_settime_args *);
>  
>  #ifdef COMPAT_43
>  
> @@ -1292,6 +1304,8 @@ int     freebsd11_freebsd32_fstatat(struct thread *, 
> struct freebsd11_freebsd32_fsta
>  #define      FREEBSD32_SYS_AUE_freebsd32___sysctlbyname      AUE_SYSCTL
>  #define      FREEBSD32_SYS_AUE_freebsd32_aio_writev  AUE_AIO_WRITEV
>  #define      FREEBSD32_SYS_AUE_freebsd32_aio_readv   AUE_AIO_READV
> +#define      FREEBSD32_SYS_AUE_freebsd32_timerfd_gettime     AUE_TIMERFD
> +#define      FREEBSD32_SYS_AUE_freebsd32_timerfd_settime     AUE_TIMERFD
>  
>  #undef PAD_
>  #undef PADL_
> diff --git a/sys/compat/freebsd32/freebsd32_syscall.h 
> b/sys/compat/freebsd32/freebsd32_syscall.h
> index c3d8617abf4b..e3777730be1c 100644
> --- a/sys/compat/freebsd32/freebsd32_syscall.h
> +++ b/sys/compat/freebsd32/freebsd32_syscall.h
> @@ -502,4 +502,7 @@
>  #define      FREEBSD32_SYS_swapoff   582
>  #define      FREEBSD32_SYS_kqueuex   583
>  #define      FREEBSD32_SYS_membarrier        584
> -#define      FREEBSD32_SYS_MAXSYSCALL        585
> +#define      FREEBSD32_SYS_timerfd_create    585
> +#define      FREEBSD32_SYS_freebsd32_timerfd_gettime 586
> +#define      FREEBSD32_SYS_freebsd32_timerfd_settime 587
> +#define      FREEBSD32_SYS_MAXSYSCALL        588
> diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c 
> b/sys/compat/freebsd32/freebsd32_syscalls.c
> index 19d454743c55..ccc910ee5ca9 100644
> --- a/sys/compat/freebsd32/freebsd32_syscalls.c
> +++ b/sys/compat/freebsd32/freebsd32_syscalls.c
> @@ -590,4 +590,7 @@ const char *freebsd32_syscallnames[] = {
>       "swapoff",                      /* 582 = swapoff */
>       "kqueuex",                      /* 583 = kqueuex */
>       "membarrier",                   /* 584 = membarrier */
> +     "timerfd_create",                       /* 585 = timerfd_create */
> +     "freebsd32_timerfd_gettime",                    /* 586 = 
> freebsd32_timerfd_gettime */
> +     "freebsd32_timerfd_settime",                    /* 587 = 
> freebsd32_timerfd_settime */
>  };
> diff --git a/sys/compat/freebsd32/freebsd32_sysent.c 
> b/sys/compat/freebsd32/freebsd32_sysent.c
> index 971f06a643c5..fec6f4a47bd6 100644
> --- a/sys/compat/freebsd32/freebsd32_sysent.c
> +++ b/sys/compat/freebsd32/freebsd32_sysent.c
> @@ -646,4 +646,7 @@ struct sysent freebsd32_sysent[] = {
>       { .sy_narg = AS(swapoff_args), .sy_call = (sy_call_t *)sys_swapoff, 
> .sy_auevent = AUE_SWAPOFF, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC },     
> /* 582 = swapoff */
>       { .sy_narg = AS(kqueuex_args), .sy_call = (sy_call_t *)sys_kqueuex, 
> .sy_auevent = AUE_KQUEUE, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = 
> SY_THR_STATIC }, /* 583 = kqueuex */
>       { .sy_narg = AS(membarrier_args), .sy_call = (sy_call_t 
> *)sys_membarrier, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, 
> .sy_thrcnt = SY_THR_STATIC },     /* 584 = membarrier */
> +     { .sy_narg = AS(timerfd_create_args), .sy_call = (sy_call_t 
> *)sys_timerfd_create, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, 
> .sy_thrcnt = SY_THR_STATIC },  /* 585 = timerfd_create */
> +     { .sy_narg = AS(freebsd32_timerfd_gettime_args), .sy_call = (sy_call_t 
> *)freebsd32_timerfd_gettime, .sy_auevent = AUE_TIMERFD, .sy_flags = 
> SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },        /* 586 = 
> freebsd32_timerfd_gettime */
> +     { .sy_narg = AS(freebsd32_timerfd_settime_args), .sy_call = (sy_call_t 
> *)freebsd32_timerfd_settime, .sy_auevent = AUE_TIMERFD, .sy_flags = 
> SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },        /* 587 = 
> freebsd32_timerfd_settime */
>  };
> diff --git a/sys/compat/freebsd32/freebsd32_systrace_args.c 
> b/sys/compat/freebsd32/freebsd32_systrace_args.c
> index 5dfc82c30b7b..2c26a0ddab2f 100644
> --- a/sys/compat/freebsd32/freebsd32_systrace_args.c
> +++ b/sys/compat/freebsd32/freebsd32_systrace_args.c
> @@ -3336,6 +3336,32 @@ systrace_args(int sysnum, void *params, uint64_t 
> *uarg, int *n_args)
>               *n_args = 3;
>               break;
>       }
> +     /* timerfd_create */
> +     case 585: {
> +             struct timerfd_create_args *p = params;
> +             iarg[a++] = p->clockid; /* int */
> +             iarg[a++] = p->flags; /* int */
> +             *n_args = 2;
> +             break;
> +     }
> +     /* freebsd32_timerfd_gettime */
> +     case 586: {
> +             struct freebsd32_timerfd_gettime_args *p = params;
> +             iarg[a++] = p->fd; /* int */
> +             uarg[a++] = (intptr_t)p->curr_value; /* struct itimerspec32 * */
> +             *n_args = 2;
> +             break;
> +     }
> +     /* freebsd32_timerfd_settime */
> +     case 587: {
> +             struct freebsd32_timerfd_settime_args *p = params;
> +             iarg[a++] = p->fd; /* int */
> +             iarg[a++] = p->flags; /* int */
> +             uarg[a++] = (intptr_t)p->new_value; /* const struct 
> itimerspec32 * */
> +             uarg[a++] = (intptr_t)p->old_value; /* struct itimerspec32 * */
> +             *n_args = 4;
> +             break;
> +     }
>       default:
>               *n_args = 0;
>               break;
> @@ -9005,6 +9031,51 @@ systrace_entry_setargdesc(int sysnum, int ndx, char 
> *desc, size_t descsz)
>                       break;
>               };
>               break;
> +     /* timerfd_create */
> +     case 585:
> +             switch (ndx) {
> +             case 0:
> +                     p = "int";
> +                     break;
> +             case 1:
> +                     p = "int";
> +                     break;
> +             default:
> +                     break;
> +             };
> +             break;
> +     /* freebsd32_timerfd_gettime */
> +     case 586:
> +             switch (ndx) {
> +             case 0:
> +                     p = "int";
> +                     break;
> +             case 1:
> +                     p = "userland struct itimerspec32 *";
> +                     break;
> +             default:
> +                     break;
> +             };
> +             break;
> +     /* freebsd32_timerfd_settime */
> +     case 587:
> +             switch (ndx) {
> +             case 0:
> +                     p = "int";
> +                     break;
> +             case 1:
> +                     p = "int";
> +                     break;
> +             case 2:
> +                     p = "userland const struct itimerspec32 *";
> +                     break;
> +             case 3:
> +                     p = "userland struct itimerspec32 *";
> +                     break;
> +             default:
> +                     break;
> +             };
> +             break;
>       default:
>               break;
>       };
> @@ -10873,6 +10944,21 @@ systrace_return_setargdesc(int sysnum, int ndx, char 
> *desc, size_t descsz)
>               if (ndx == 0 || ndx == 1)
>                       p = "int";
>               break;
> +     /* timerfd_create */
> +     case 585:
> +             if (ndx == 0 || ndx == 1)
> +                     p = "int";
> +             break;
> +     /* freebsd32_timerfd_gettime */
> +     case 586:
> +             if (ndx == 0 || ndx == 1)
> +                     p = "int";
> +             break;
> +     /* freebsd32_timerfd_settime */
> +     case 587:
> +             if (ndx == 0 || ndx == 1)
> +                     p = "int";
> +             break;
>       default:
>               break;
>       };
> diff --git a/sys/compat/linux/linux_event.c b/sys/compat/linux/linux_event.c
> index a7db8516e5f0..816c68a90f1d 100644
> --- a/sys/compat/linux/linux_event.c
> +++ b/sys/compat/linux/linux_event.c
> @@ -44,6 +44,7 @@
>  #include <sys/specialfd.h>
>  #include <sys/sx.h>
>  #include <sys/syscallsubr.h>
> +#include <sys/timerfd.h>
>  #include <sys/timespec.h>
>  #include <sys/user.h>
>  
> @@ -99,55 +100,6 @@ struct epoll_copyout_args {
>       int                     error;
>  };
>  
> -/* timerfd */
> -typedef uint64_t     timerfd_t;
> -
> -static fo_rdwr_t     timerfd_read;
> -static fo_ioctl_t    timerfd_ioctl;
> -static fo_poll_t     timerfd_poll;
> -static fo_kqfilter_t timerfd_kqfilter;
> -static fo_stat_t     timerfd_stat;
> -static fo_close_t    timerfd_close;
> -static fo_fill_kinfo_t       timerfd_fill_kinfo;
> -
> -static struct fileops timerfdops = {
> -     .fo_read = timerfd_read,
> -     .fo_write = invfo_rdwr,
> -     .fo_truncate = invfo_truncate,
> -     .fo_ioctl = timerfd_ioctl,
> -     .fo_poll = timerfd_poll,
> -     .fo_kqfilter = timerfd_kqfilter,
> -     .fo_stat = timerfd_stat,
> -     .fo_close = timerfd_close,
> -     .fo_chmod = invfo_chmod,
> -     .fo_chown = invfo_chown,
> -     .fo_sendfile = invfo_sendfile,
> -     .fo_fill_kinfo = timerfd_fill_kinfo,
> -     .fo_flags = DFLAG_PASSABLE
> -};
> -
> -static void  filt_timerfddetach(struct knote *kn);
> -static int   filt_timerfdread(struct knote *kn, long hint);
> -
> -static struct filterops timerfd_rfiltops = {
> -     .f_isfd = 1,
> -     .f_detach = filt_timerfddetach,
> -     .f_event = filt_timerfdread
> -};
> -
> -struct timerfd {
> -     clockid_t       tfd_clockid;
> -     struct itimerspec tfd_time;
> -     struct callout  tfd_callout;
> -     timerfd_t       tfd_count;
> -     bool            tfd_canceled;
> -     struct selinfo  tfd_sel;
> -     struct mtx      tfd_lock;
> -};
> -
> -static void  linux_timerfd_expire(void *);
> -static void  linux_timerfd_curval(struct timerfd *, struct itimerspec *);
> -
>  static int
>  epoll_create_common(struct thread *td, int flags)
>  {
> @@ -658,255 +610,14 @@ linux_eventfd2(struct thread *td, struct 
> linux_eventfd2_args *args)
>  int
>  linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args 
> *args)
>  {
> -     struct timerfd *tfd;
> -     struct file *fp;
>       clockid_t clockid;
> -     int fflags, fd, error;
> -
> -     if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
> -             return (EINVAL);
> -
> -     error = linux_to_native_clockid(&clockid, args->clockid);
> -     if (error != 0)
> -             return (error);
> -     if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
> -             return (EINVAL);
> -
> -     fflags = 0;
> -     if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
> -             fflags |= O_CLOEXEC;
> -
> -     error = falloc(td, &fp, &fd, fflags);
> -     if (error != 0)
> -             return (error);
> -
> -     tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
> -     tfd->tfd_clockid = clockid;
> -     mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
> -
> -     callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
> -     knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
> -
> -     fflags = FREAD;
> -     if ((args->flags & LINUX_O_NONBLOCK) != 0)
> -             fflags |= FNONBLOCK;
> -
> -     finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
> -     fdrop(fp, td);
> -
> -     td->td_retval[0] = fd;
> -     return (error);
> -}
> -
> -static int
> -timerfd_close(struct file *fp, struct thread *td)
> -{
> -     struct timerfd *tfd;
> -
> -     tfd = fp->f_data;
> -     if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
> -             return (EINVAL);
> -
> -     timespecclear(&tfd->tfd_time.it_value);
> -     timespecclear(&tfd->tfd_time.it_interval);
> -
> -     callout_drain(&tfd->tfd_callout);
> -
> -     seldrain(&tfd->tfd_sel);
> -     knlist_destroy(&tfd->tfd_sel.si_note);
> -
> -     fp->f_ops = &badfileops;
> -     mtx_destroy(&tfd->tfd_lock);
> -     free(tfd, M_EPOLL);
> -
> -     return (0);
> -}
> -
> -static int
> -timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
> -    int flags, struct thread *td)
> -{
> -     struct timerfd *tfd;
> -     timerfd_t count;
> -     int error;
> -
> -     tfd = fp->f_data;
> -     if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
> -             return (EINVAL);
> -
> -     if (uio->uio_resid < sizeof(timerfd_t))
> -             return (EINVAL);
> -
> -     error = 0;
> -     mtx_lock(&tfd->tfd_lock);
> -retry:
> -     if (tfd->tfd_canceled) {
> -             tfd->tfd_count = 0;
> -             mtx_unlock(&tfd->tfd_lock);
> -             return (ECANCELED);
> -     }
> -     if (tfd->tfd_count == 0) {
> -             if ((fp->f_flag & FNONBLOCK) != 0) {
> -                     mtx_unlock(&tfd->tfd_lock);
> -                     return (EAGAIN);
> -             }
> -             error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, 
> "ltfdrd", 0);
> -             if (error == 0)
> -                     goto retry;
> -     }
> -     if (error == 0) {
> -             count = tfd->tfd_count;
> -             tfd->tfd_count = 0;
> -             mtx_unlock(&tfd->tfd_lock);
> -             error = uiomove(&count, sizeof(timerfd_t), uio);
> -     } else
> -             mtx_unlock(&tfd->tfd_lock);
> -
> -     return (error);
> -}
> -
> -static int
> -timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
> -    struct thread *td)
> -{
> -     struct timerfd *tfd;
> -     int revents = 0;
> -
> -     tfd = fp->f_data;
> -     if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
> -             return (POLLERR);
> -
> -     mtx_lock(&tfd->tfd_lock);
> -     if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
> -             revents |= events & (POLLIN|POLLRDNORM);
> -     if (revents == 0)
> -             selrecord(td, &tfd->tfd_sel);
> -     mtx_unlock(&tfd->tfd_lock);
> -
> -     return (revents);
> -}
> -
> -static int
> -timerfd_kqfilter(struct file *fp, struct knote *kn)
> -{
> -     struct timerfd *tfd;
> -
> -     tfd = fp->f_data;
> -     if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
> -             return (EINVAL);
> -
> -     if (kn->kn_filter == EVFILT_READ)
> -             kn->kn_fop = &timerfd_rfiltops;
> -     else
> -             return (EINVAL);
> -
> -     kn->kn_hook = tfd;
> -     knlist_add(&tfd->tfd_sel.si_note, kn, 0);
> -
> -     return (0);
> -}
> -
> -static void
> -filt_timerfddetach(struct knote *kn)
> -{
> -     struct timerfd *tfd = kn->kn_hook;
> -
> -     mtx_lock(&tfd->tfd_lock);
> -     knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
> -     mtx_unlock(&tfd->tfd_lock);
> -}
> -
> -static int
> -filt_timerfdread(struct knote *kn, long hint)
> -{
> -     struct timerfd *tfd = kn->kn_hook;
> -
> -     return (tfd->tfd_count > 0);
> -}
> -
> -static int
> -timerfd_ioctl(struct file *fp, u_long cmd, void *data,
> -    struct ucred *active_cred, struct thread *td)
> -{
> -
> -     if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
> -             return (EINVAL);
> -
> -     switch (cmd) {
> -     case FIONBIO:
> -     case FIOASYNC:
> -             return (0);
> -     }
> -
> -     return (ENOTTY);
> -}
> -
> -static int
> -timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
> -{
> -
> -     return (ENXIO);
> -}
> -
> -static int
> -timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc 
> *fdp)
> -{
> -
> -     kif->kf_type = KF_TYPE_UNKNOWN;
> -     return (0);
> -}
> -
> -static void
> -linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
> -{
> -
> -     if (tfd->tfd_clockid == CLOCK_REALTIME)
> -             getnanotime(ts);
> -     else    /* CLOCK_MONOTONIC */
> -             getnanouptime(ts);
> -}
> -
> -static void
> -linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
> -{
> -     struct timespec cts;
> -
> -     linux_timerfd_clocktime(tfd, &cts);
> -     *ots = tfd->tfd_time;
> -     if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
> -             timespecsub(&ots->it_value, &cts, &ots->it_value);
> -             if (ots->it_value.tv_sec < 0 ||
> -                 (ots->it_value.tv_sec == 0 &&
> -                  ots->it_value.tv_nsec == 0)) {
> -                     ots->it_value.tv_sec  = 0;
> -                     ots->it_value.tv_nsec = 1;
> -             }
> -     }
> -}
> -
> -static int
> -linux_timerfd_gettime_common(struct thread *td, int fd, struct itimerspec 
> *ots)
> -{
> -     struct timerfd *tfd;
> -     struct file *fp;
>       int error;
>  
> -     error = fget(td, fd, &cap_read_rights, &fp);
> +     error = linux_to_native_clockid(&clockid, args->clockid);
>       if (error != 0)
>               return (error);
> -     tfd = fp->f_data;
> -     if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
> -             error = EINVAL;
> -             goto out;
> -     }
> -
> -     mtx_lock(&tfd->tfd_lock);
> -     linux_timerfd_curval(tfd, ots);
> -     mtx_unlock(&tfd->tfd_lock);
>  
> -out:
> -     fdrop(fp, td);
> -     return (error);
> +     return (kern_timerfd_create(td, clockid, args->flags));
>  }
>  
>  int
> @@ -916,84 +627,14 @@ linux_timerfd_gettime(struct thread *td, struct 
> linux_timerfd_gettime_args *args
>       struct itimerspec ots;
>       int error;
>  
> -     error = linux_timerfd_gettime_common(td, args->fd, &ots);
> +     error = kern_timerfd_gettime(td, args->fd, &ots);
>       if (error != 0)
>               return (error);
> -     error = native_to_linux_itimerspec(&lots, &ots);
> -     if (error == 0)
> -             error = copyout(&lots, args->old_value, sizeof(lots));
> -     return (error);
> -}
> -
> -#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
> -int
> -linux_timerfd_gettime64(struct thread *td, struct 
> linux_timerfd_gettime64_args *args)
> -{
> -     struct l_itimerspec64 lots;
> -     struct itimerspec ots;
> -     int error;
>  
> -     error = linux_timerfd_gettime_common(td, args->fd, &ots);
> -     if (error != 0)
> -             return (error);
> -     error = native_to_linux_itimerspec64(&lots, &ots);
> +     error = native_to_linux_itimerspec(&lots, &ots);
>       if (error == 0)
>               error = copyout(&lots, args->old_value, sizeof(lots));
> -     return (error);
> -}
> -#endif
> -
> -static int
> -linux_timerfd_settime_common(struct thread *td, int fd, int flags,
> -    struct itimerspec *nts, struct itimerspec *oval)
> -{
> -     struct timespec cts, ts;
> -     struct timerfd *tfd;
> -     struct timeval tv;
> -     struct file *fp;
> -     int error;
> -
> -     if ((flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
> -             return (EINVAL);
> -
> -     error = fget(td, fd, &cap_write_rights, &fp);
> -     if (error != 0)
> -             return (error);
> -     tfd = fp->f_data;
> -     if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
> -             error = EINVAL;
> -             goto out;
> -     }
> -
> -     mtx_lock(&tfd->tfd_lock);
> -     if (!timespecisset(&nts->it_value))
> -             timespecclear(&nts->it_interval);
> -     if (oval != NULL)
> -             linux_timerfd_curval(tfd, oval);
> -
> -     bcopy(nts, &tfd->tfd_time, sizeof(*nts));
> -     tfd->tfd_count = 0;
> -     if (timespecisset(&nts->it_value)) {
> -             linux_timerfd_clocktime(tfd, &cts);
> -             ts = nts->it_value;
> -             if ((flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
> -                     timespecadd(&tfd->tfd_time.it_value, &cts,
> -                             &tfd->tfd_time.it_value);
> -             } else {
> -                     timespecsub(&ts, &cts, &ts);
> -             }
> -             TIMESPEC_TO_TIMEVAL(&tv, &ts);
> -             callout_reset(&tfd->tfd_callout, tvtohz(&tv),
> -                     linux_timerfd_expire, tfd);
> -             tfd->tfd_canceled = false;
> -     } else {
> -             tfd->tfd_canceled = true;
> -             callout_stop(&tfd->tfd_callout);
> -     }
> -     mtx_unlock(&tfd->tfd_lock);
>  
> -out:
> -     fdrop(fp, td);
>       return (error);
>  }
>  
> @@ -1001,7 +642,7 @@ int
>  linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args 
> *args)
>  {
>       struct l_itimerspec lots;
> -     struct itimerspec nts, ots, *pots;
> +     struct itimerspec nts, ots;
>       int error;
>  
>       error = copyin(args->new_value, &lots, sizeof(lots));
> @@ -1010,23 +651,43 @@ linux_timerfd_settime(struct thread *td, struct 
> linux_timerfd_settime_args *args
>       error = linux_to_native_itimerspec(&nts, &lots);
>       if (error != 0)
>               return (error);
> -     pots = (args->old_value != NULL ? &ots : NULL);
> -     error = linux_timerfd_settime_common(td, args->fd, args->flags,
> -         &nts, pots);
> +     if (args->old_value == NULL)
> +             error = kern_timerfd_settime(td, args->fd, args->flags, &nts, 
> NULL);
> +     else
> +             error = kern_timerfd_settime(td, args->fd, args->flags, &nts, 
> &ots);
>       if (error == 0 && args->old_value != NULL) {
>               error = native_to_linux_itimerspec(&lots, &ots);
>               if (error == 0)
>                       error = copyout(&lots, args->old_value, sizeof(lots));
>       }
> +
>       return (error);
>  }
>  
>  #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
> +int
> +linux_timerfd_gettime64(struct thread *td, struct 
> linux_timerfd_gettime64_args *args)
> +{
> +     struct l_itimerspec64 lots;
> +     struct itimerspec ots;
> +     int error;
> +
> +     error = kern_timerfd_gettime(td, args->fd, &ots);
> +     if (error != 0)
> +             return (error);
> +
> +     error = native_to_linux_itimerspec64(&lots, &ots);
> +     if (error == 0)
> +             error = copyout(&lots, args->old_value, sizeof(lots));
> +
> +     return (error);
> +}
> +
>  int
>  linux_timerfd_settime64(struct thread *td, struct 
> linux_timerfd_settime64_args *args)
>  {
>       struct l_itimerspec64 lots;
> -     struct itimerspec nts, ots, *pots;
> +     struct itimerspec nts, ots;
>       int error;
>  
>       error = copyin(args->new_value, &lots, sizeof(lots));
> @@ -1035,50 +696,16 @@ linux_timerfd_settime64(struct thread *td, struct 
> linux_timerfd_settime64_args *
>       error = linux_to_native_itimerspec64(&nts, &lots);
>       if (error != 0)
>               return (error);
> -     pots = (args->old_value != NULL ? &ots : NULL);
> -     error = linux_timerfd_settime_common(td, args->fd, args->flags,
> -         &nts, pots);
> +     if (args->old_value == NULL)
> +             error = kern_timerfd_settime(td, args->fd, args->flags, &nts, 
> NULL);
> +     else
> +             error = kern_timerfd_settime(td, args->fd, args->flags, &nts, 
> &ots);
>       if (error == 0 && args->old_value != NULL) {
>               error = native_to_linux_itimerspec64(&lots, &ots);
>               if (error == 0)
>                       error = copyout(&lots, args->old_value, sizeof(lots));
>       }
> +
>       return (error);
>  }
>  #endif
> -
> -static void
> -linux_timerfd_expire(void *arg)
> -{
> -     struct timespec cts, ts;
> -     struct timeval tv;
> -     struct timerfd *tfd;
> -
> -     tfd = (struct timerfd *)arg;
> -
> -     linux_timerfd_clocktime(tfd, &cts);
> -     if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
> -             if (timespecisset(&tfd->tfd_time.it_interval))
> -                     timespecadd(&tfd->tfd_time.it_value,
> -                                 &tfd->tfd_time.it_interval,
> -                                 &tfd->tfd_time.it_value);
> -             else
> -                     /* single shot timer */
> -                     timespecclear(&tfd->tfd_time.it_value);
> -             if (timespecisset(&tfd->tfd_time.it_value)) {
> -                     timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
> -                     TIMESPEC_TO_TIMEVAL(&tv, &ts);
> -                     callout_reset(&tfd->tfd_callout, tvtohz(&tv),
> -                             linux_timerfd_expire, tfd);
> -             }
> -             tfd->tfd_count++;
> -             KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
> -             selwakeup(&tfd->tfd_sel);
> -             wakeup(&tfd->tfd_count);
> -     } else if (timespecisset(&tfd->tfd_time.it_value)) {
> -             timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
> -             TIMESPEC_TO_TIMEVAL(&tv, &ts);
> -             callout_reset(&tfd->tfd_callout, tvtohz(&tv),
> -                 linux_timerfd_expire, tfd);
> -     }
> -}
> diff --git a/sys/compat/linux/linux_event.h b/sys/compat/linux/linux_event.h
> index 32269b0070bc..fa63371b5170 100644
> --- a/sys/compat/linux/linux_event.h
> +++ b/sys/compat/linux/linux_event.h
> @@ -54,15 +54,4 @@
>  
>  #define      LINUX_EFD_SEMAPHORE     (1 << 0)
>  
> -#define      LINUX_TFD_TIMER_ABSTIME (1 << 0)
> -#define      LINUX_TFD_TIMER_CANCEL_ON_SET   (1 << 1)
> -#define      LINUX_TFD_CLOEXEC       LINUX_O_CLOEXEC
> -#define      LINUX_TFD_NONBLOCK      LINUX_O_NONBLOCK
> -
> -#define      LINUX_TFD_SHARED_FCNTL_FLAGS    (LINUX_TFD_CLOEXEC              
> \
> -             |LINUX_TFD_NONBLOCK)
> -#define      LINUX_TFD_CREATE_FLAGS  LINUX_TFD_SHARED_FCNTL_FLAGS
> -#define      LINUX_TFD_SETTIME_FLAGS (LINUX_TFD_TIMER_ABSTIME                
> \
> -             |LINUX_TFD_TIMER_CANCEL_ON_SET)
> -
>  #endif       /* !_LINUX_EVENT_H_ */
> diff --git a/sys/conf/files b/sys/conf/files
> index 3f79ce752c80..8d38b9cc8a2e 100644
> --- a/sys/conf/files
> +++ b/sys/conf/files
> @@ -3908,6 +3908,7 @@ kern/sys_pipe.c                 standard
>  kern/sys_procdesc.c          standard
>  kern/sys_process.c           standard
>  kern/sys_socket.c            standard
> +kern/sys_timerfd.c           standard
>  kern/syscalls.c                      standard
>  kern/sysv_ipc.c                      standard
>  kern/sysv_msg.c                      optional sysvmsg
> diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
> index 1e62c46b8be0..d44fec54fcd7 100644
> --- a/sys/kern/init_sysent.c
> +++ b/sys/kern/init_sysent.c
> @@ -645,4 +645,7 @@ struct sysent sysent[] = {
>       { .sy_narg = AS(swapoff_args), .sy_call = (sy_call_t *)sys_swapoff, 
> .sy_auevent = AUE_SWAPOFF, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC },     
> /* 582 = swapoff */
>       { .sy_narg = AS(kqueuex_args), .sy_call = (sy_call_t *)sys_kqueuex, 
> .sy_auevent = AUE_KQUEUE, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = 
> SY_THR_STATIC }, /* 583 = kqueuex */
>       { .sy_narg = AS(membarrier_args), .sy_call = (sy_call_t 
> *)sys_membarrier, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, 
> .sy_thrcnt = SY_THR_STATIC },     /* 584 = membarrier */
> +     { .sy_narg = AS(timerfd_create_args), .sy_call = (sy_call_t 
> *)sys_timerfd_create, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, 
> .sy_thrcnt = SY_THR_STATIC },  /* 585 = timerfd_create */
> +     { .sy_narg = AS(timerfd_gettime_args), .sy_call = (sy_call_t 
> *)sys_timerfd_gettime, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, 
> .sy_thrcnt = SY_THR_STATIC },        /* 586 = timerfd_gettime */
> +     { .sy_narg = AS(timerfd_settime_args), .sy_call = (sy_call_t 
> *)sys_timerfd_settime, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, 
> .sy_thrcnt = SY_THR_STATIC },        /* 587 = timerfd_settime */
>  };
> diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
> index c5226288afc5..35046c856d54 100644
> --- a/sys/kern/kern_descrip.c
> +++ b/sys/kern/kern_descrip.c
> @@ -5001,8 +5001,8 @@ file_type_to_name(short type)
>               return ("proc");
>       case DTYPE_EVENTFD:
>               return ("eventfd");
> -     case DTYPE_LINUXTFD:
> -             return ("ltimer");
> +     case DTYPE_TIMERFD:
> +             return ("timerfd");
>       default:
>               return ("unkn");
>       }
> diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c
> index 170f35830923..26f09cb60260 100644
> --- a/sys/kern/kern_tc.c
> +++ b/sys/kern/kern_tc.c
> @@ -34,6 +34,7 @@
>  #include <sys/systm.h>
>  #include <sys/timeffc.h>
>  #include <sys/timepps.h>
> +#include <sys/timerfd.h>
>  #include <sys/timetc.h>
>  #include <sys/timex.h>
>  #include <sys/vdso.h>
> @@ -1305,6 +1306,7 @@ tc_setclock(struct timespec *ts)
>  
>       /* Avoid rtc_generation == 0, since td_rtcgen == 0 is special. */
>       atomic_add_rel_int(&rtc_generation, 2);
> +     timerfd_jumped();
>       sleepq_chains_remove_matching(sleeping_on_old_rtc);
>       if (timestepwarnings) {
>               nanotime(&taft);
> diff --git a/sys/kern/sys_timerfd.c b/sys/kern/sys_timerfd.c
> new file mode 100644
> index 000000000000..6948fa059b8c
> --- /dev/null
> +++ b/sys/kern/sys_timerfd.c
> @@ -0,0 +1,632 @@
> +/*-
> + * SPDX-License-Identifier: BSD-2-Clause
> + *
> + * Copyright (c) 2014 Dmitry Chagin <dcha...@freebsd.org>
> + * Copyright (c) 2023 Jake Freeland <jf...@freebsd.org>
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + */
> +
> +#include <sys/param.h>
> +#include <sys/systm.h>
> +#include <sys/callout.h>
> +#include <sys/fcntl.h>
> +#include <sys/file.h>
> +#include <sys/filedesc.h>
> +#include <sys/filio.h>
> +#include <sys/kernel.h>
> +#include <sys/lock.h>
> +#include <sys/malloc.h>
> +#include <sys/mount.h>
> +#include <sys/mutex.h>
> +#include <sys/poll.h>
> +#include <sys/proc.h>
> +#include <sys/queue.h>
> +#include <sys/selinfo.h>
> +#include <sys/stat.h>
> +#include <sys/sysctl.h>
> +#include <sys/sysent.h>
> +#include <sys/sysproto.h>
> +#include <sys/timerfd.h>
> +#include <sys/timespec.h>
> +#include <sys/uio.h>
> +#include <sys/user.h>
> +
> +#include <security/audit/audit.h>
> +
> +#ifdef COMPAT_FREEBSD32
> +#include <compat/freebsd32/freebsd32.h>
> +#include <compat/freebsd32/freebsd32_proto.h>
> +#endif
> +
> +static MALLOC_DEFINE(M_TIMERFD, "timerfd", "timerfd structures");
> +static LIST_HEAD(, timerfd) timerfd_head;
> +static struct unrhdr64 tfdino_unr;
> +
> +#define      TFD_NOJUMP      0       /* Realtime clock has not jumped. */
> +#define      TFD_READ        1       /* Jumped, tfd has been read since. */
> +#define      TFD_ZREAD       2       /* Jumped backwards, 
> CANCEL_ON_SET=false. */
> +#define      TFD_CANCELED    4       /* Jumped, CANCEL_ON_SET=true. */
> +#define      TFD_JUMPED      (TFD_ZREAD | TFD_CANCELED)
> +
> +struct timerfd {
> +     /* User specified. */
> +     struct itimerspec tfd_time;     /* tfd timer */
> +     clockid_t       tfd_clockid;    /* timing base */
> +     int             tfd_flags;      /* creation flags */
> +     int             tfd_timflags;   /* timer flags */
> +
> +     /* Used internally. */
> +     timerfd_t       tfd_count;      /* expiration count since last read */
> +     bool            tfd_expired;    /* true upon initial expiration */
> +     struct mtx      tfd_lock;       /* mtx lock */
> +     struct callout  tfd_callout;    /* expiration notification */
> +     struct selinfo  tfd_sel;        /* I/O alerts */
> +     struct timespec tfd_boottim;    /* cached boottime */
> +     int             tfd_jumped;     /* timer jump status */
> +     LIST_ENTRY(timerfd) entry;      /* entry in list */
> +
> +     /* For stat(2). */
> +     ino_t           tfd_ino;        /* inode number */
> +     struct timespec tfd_atim;       /* time of last read */
> +     struct timespec tfd_mtim;       /* time of last settime */
> +     struct timespec tfd_birthtim;   /* creation time */
> +};
> +
> +static void
> +timerfd_init(void *data)
> +{
> +     new_unrhdr64(&tfdino_unr, 1);
> +}
> +
> +SYSINIT(timerfd, SI_SUB_VFS, SI_ORDER_ANY, timerfd_init, NULL);
> +
> +static inline void
> +timerfd_getboottime(struct timespec *ts)
> +{
> +     struct timeval tv;
> +     getboottime(&tv);
> +     TIMEVAL_TO_TIMESPEC(&tv, ts);
> +}
> +
> +/*
> + * Call when a discontinuous jump has occured in CLOCK_REALTIME and
> + * update timerfd's cached boottime. A jump can be triggered using
> + * functions like clock_settime(2) or settimeofday(2).
> + *
> + * Timer is marked TFD_CANCELED if TFD_TIMER_CANCEL_ON_SET is set
> *** 850 LINES SKIPPED ***

I did a very quick look over the added code.

I do not see any protection for the timerfd_head list manipulation.

It is not clear what tfd->tfd_lock protects: e.g. in timerfd_stat() it
covers reads of fields whose writes are not protected by the mutex
(all of them except tfd_atim).
There are no annotations in the timerfd structure describing the locking regime.

The stat st_ctim is always zero, which is somewhat strange.

The
        tfd = fp->f_data;
        if (tfd == NULL || fp->f_type != DTYPE_TIMERFD) {
triggers UB when f_type is not DTYPE_TIMERFD.

The compat32 code was put into sys/kern instead of sys/compat/freebsd32.
sys/timerfd.h pollutes userspace with sys/proc.h.

Why were the regenerated files put in the same commit as the (probably)
human-written files?

Reply via email to