Re: kqueue(2) and close-on-exec

2023-08-14 Thread Visa Hankala
On Sun, Aug 13, 2023 at 05:58:01PM -0700, Philip Guenther wrote:
> I think that changing the behavior of the existing API in a way that
> gratuitously increases the differences between BSDs is unwise.  IMHO, we
> should follow NetBSD on this and add kqueue1(), that being obviously
> consistent with the previous 'add flags argument' extensions: pipe2(),
> dup3(), and accept4().

I agree that changing the behavior is not good.

Below is an implementation of the kqueue1() syscall and the libc stub.

NetBSD's kqueue1() accepts flags that do not really affect kqueue
behavior, namely O_NONBLOCK and O_NOSIGPIPE (this is specific to
NetBSD). For compatibility, the new syscall code passes O_NONBLOCK to
file flags but otherwise ignores it. I have omitted this flag from the
syscall manual page because it does not make sense with kqueue.
However, I can add a note about the flag if it is deemed necessary.

Index: sys/kern/kern_event.c
===
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.197
diff -u -p -r1.197 kern_event.c
--- sys/kern/kern_event.c   13 Aug 2023 08:29:28 -  1.197
+++ sys/kern/kern_event.c   14 Aug 2023 14:52:44 -
@@ -60,6 +60,7 @@
 #define KLIST_ASSERT_LOCKED(kl)((void)(kl))
 #endif
 
+intdokqueue(struct proc *, int, register_t *);
 struct kqueue *kqueue_alloc(struct filedesc *);
 void   kqueue_terminate(struct proc *p, struct kqueue *);
 void   KQREF(struct kqueue *);
@@ -912,12 +913,14 @@ kqueue_alloc(struct filedesc *fdp)
 }
 
 int
-sys_kqueue(struct proc *p, void *v, register_t *retval)
+dokqueue(struct proc *p, int flags, register_t *retval)
 {
struct filedesc *fdp = p->p_fd;
struct kqueue *kq;
struct file *fp;
-   int fd, error;
+   int cloexec, error, fd;
+
+   cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0;
 
kq = kqueue_alloc(fdp);
 
@@ -925,14 +928,14 @@ sys_kqueue(struct proc *p, void *v, regi
error = falloc(p, , );
if (error)
goto out;
-   fp->f_flag = FREAD | FWRITE;
+   fp->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
fp->f_type = DTYPE_KQUEUE;
fp->f_ops = 
fp->f_data = kq;
*retval = fd;
LIST_INSERT_HEAD(>fd_kqlist, kq, kq_next);
kq = NULL;
-   fdinsert(fdp, fd, 0, fp);
+   fdinsert(fdp, fd, cloexec, fp);
FRELE(fp, p);
 out:
fdpunlock(fdp);
@@ -942,6 +945,24 @@ out:
 }
 
 int
+sys_kqueue(struct proc *p, void *v, register_t *retval)
+{
+   return (dokqueue(p, 0, retval));
+}
+
+int
+sys_kqueue1(struct proc *p, void *v, register_t *retval)
+{
+   struct sys_kqueue1_args /* {
+   syscallarg(int) flags;
+   } */ *uap = v;
+
+   if (SCARG(uap, flags) & ~(O_CLOEXEC | O_NONBLOCK))
+   return (EINVAL);
+   return (dokqueue(p, SCARG(uap, flags), retval));
+}
+
+int
 sys_kevent(struct proc *p, void *v, register_t *retval)
 {
struct kqueue_scan_state scan;
Index: sys/kern/syscalls.master
===
RCS file: src/sys/kern/syscalls.master,v
retrieving revision 1.249
diff -u -p -r1.249 syscalls.master
--- sys/kern/syscalls.master24 Jul 2023 19:32:23 -  1.249
+++ sys/kern/syscalls.master14 Aug 2023 14:52:44 -
@@ -462,7 +462,7 @@
 267OBSOL   pad_preadv
 268OBSOL   pad_pwritev
 269STD NOLOCK  { int sys_kqueue(void); }
-270OBSOL   t32_kevent
+270STD NOLOCK  { int sys_kqueue1(int flags); }
 271STD { int sys_mlockall(int flags); }
 272STD { int sys_munlockall(void); }
 273UNIMPL  sys_getpeereid
Index: sys/sys/event.h
===
RCS file: src/sys/sys/event.h,v
retrieving revision 1.70
diff -u -p -r1.70 event.h
--- sys/sys/event.h 13 Aug 2023 08:29:28 -  1.70
+++ sys/sys/event.h 14 Aug 2023 14:52:44 -
@@ -369,6 +369,7 @@ struct timespec;
 
 __BEGIN_DECLS
 intkqueue(void);
+intkqueue1(int flags);
 intkevent(int kq, const struct kevent *changelist, int nchanges,
struct kevent *eventlist, int nevents,
const struct timespec *timeout);
Index: lib/libc/Symbols.list
===
RCS file: src/lib/libc/Symbols.list,v
retrieving revision 1.81
diff -u -p -r1.81 Symbols.list
--- lib/libc/Symbols.list   11 Feb 2023 23:07:28 -  1.81
+++ lib/libc/Symbols.list   14 Aug 2023 14:52:42 -
@@ -121,6 +121,7 @@ _thread_sys_issetugid
 _thread_sys_kevent
 _thread_sys_kill
 _thread_sys_kqueue
+_thread_sys_kqueue1
 _thread_sys_ktrace
 _thread_sys_lchown
 _thread_sys_link
@@ -322,6 +323,7 @@ issetugid
 kevent
 kill
 kqueue
+kqueue1
 ktrace
 lchown
 link
Index: lib/libc/shlib_version
===
RCS 

Re: kqueue(2) and close-on-exec

2023-08-13 Thread Philip Guenther
I think that changing the behavior of the existing API in a way that
gratuitously increases the differences between BSDs is unwise.  IMHO, we
should follow NetBSD on this and add kqueue1(), that being obviously
consistent with the previous 'add flags argument' extensions: pipe2(),
dup3(), and accept4().

(As you know, close-on-fork is imposed because the "take a descriptor
number" filters are specified in a way tied to a struct filedesc and
there's no obvious extension to the kqueue behavior for sharing it across
processes.  Oh, and as a result trying to do so would result in
use-after-frees and/or assertion failures in the kernel.  Using a kqueue
after exec() is weird, but not logically undefined in that way.)

Philip

On Sun, Aug 13, 2023 at 4:55 AM Visa Hankala  wrote:

> FreeBSD and NetBSD have variants of the kqueue(2) system call that
> allow setting the close-on-exec flag on the returned file descriptor.
>
> In general, I think it is good that the flag can be set atomically
> for new descriptors. However, it seems to me that it is almost surely
> a mistake if a kqueue descriptor is passed over an exec.
>
> Instead of adding a new system call, maybe close-on-exec should be
> enabled automatically by kqueue(2). Today it feels backwards that
> close-on-exec is off by default.
>
> Note that kqueue cannot be inherited by accident in fork-then-exec
> situations because fork(2) closes kqueue descriptors for the child
> process.
>
> Index: sys/kern/kern_event.c
> ===
> RCS file: src/sys/kern/kern_event.c,v
> retrieving revision 1.197
> diff -u -p -r1.197 kern_event.c
> --- sys/kern/kern_event.c   13 Aug 2023 08:29:28 -  1.197
> +++ sys/kern/kern_event.c   13 Aug 2023 10:42:45 -
> @@ -932,7 +932,7 @@ sys_kqueue(struct proc *p, void *v, regi
> *retval = fd;
> LIST_INSERT_HEAD(>fd_kqlist, kq, kq_next);
> kq = NULL;
> -   fdinsert(fdp, fd, 0, fp);
> +   fdinsert(fdp, fd, UF_EXCLOSE, fp);
> FRELE(fp, p);
>  out:
> fdpunlock(fdp);
> Index: lib/libc/sys/kqueue.2
> ===
> RCS file: src/lib/libc/sys/kqueue.2,v
> retrieving revision 1.48
> diff -u -p -r1.48 kqueue.2
> --- lib/libc/sys/kqueue.2   13 Aug 2023 08:29:28 -  1.48
> +++ lib/libc/sys/kqueue.2   13 Aug 2023 10:42:45 -
> @@ -74,6 +74,7 @@ on a file descriptor will remove any kev
>  .Pp
>  .Fn kqueue
>  creates a new kernel event queue and returns a descriptor.
> +The new descriptor has close-on-exec flag set.
>  The queue is not inherited by a child created with
>  .Xr fork 2 .
>  Similarly, kqueues cannot be passed across UNIX-domain sockets.
>
>