Diff below is the last of the serie to remove the KERNEL_LOCK() from sendto(2) and sendmsg(2) for sockets protected by the NET_LOCK().
As explained previously [0] this diff uses a simple non-intrusive approached based on a single mutex. I'd like to get this in before doing any refactoring of this layer to get real test coverages. [0] https://marc.info/?l=openbsd-tech&m=152699644924165&w=2 Tests? Comments? Oks? Index: kern/kern_descrip.c =================================================================== RCS file: /cvs/src/sys/kern/kern_descrip.c,v retrieving revision 1.166 diff -u -p -r1.166 kern_descrip.c --- kern/kern_descrip.c 18 Jun 2018 09:15:05 -0000 1.166 +++ kern/kern_descrip.c 18 Jun 2018 09:16:20 -0000 @@ -66,7 +66,11 @@ /* * Descriptor management. + * + * We need to block interrupts as long as `fhdlk' is being taken + * with and without the KERNEL_LOCK(). */ +struct mutex fhdlk = MUTEX_INITIALIZER(IPL_MPFLOOR); struct filelist filehead; /* head of list of open files */ int numfiles; /* actual number of open files */ @@ -195,16 +199,18 @@ fd_iterfile(struct file *fp, struct proc { struct file *nfp; + mtx_enter(&fhdlk); if (fp == NULL) nfp = LIST_FIRST(&filehead); else nfp = LIST_NEXT(fp, f_list); - /* don't FREF when f_count == 0 to avoid race in fdrop() */ + /* don't refcount when f_count == 0 to avoid race in fdrop() */ while (nfp != NULL && nfp->f_count == 0) nfp = LIST_NEXT(nfp, f_list); if (nfp != NULL) - FREF(nfp); + nfp->f_count++; + mtx_leave(&fhdlk); if (fp != NULL) FRELE(fp, p); @@ -217,10 +223,17 @@ fd_getfile(struct filedesc *fdp, int fd) { struct file *fp; - if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) + vfs_stall_barrier(); + + if ((u_int)fd >= fdp->fd_nfiles) return (NULL); - FREF(fp); + mtx_enter(&fhdlk); + fp = fdp->fd_ofiles[fd]; + if (fp != NULL) + fp->f_count++; + mtx_leave(&fhdlk); + return (fp); } @@ -671,6 +684,8 @@ fdinsert(struct filedesc *fdp, int fd, i struct file *fq; fdpassertlocked(fdp); + + mtx_enter(&fhdlk); if ((fq = fdp->fd_ofiles[0]) != NULL) { LIST_INSERT_AFTER(fq, fp, f_list); } else { @@ -680,6 +695,7 @@ fdinsert(struct filedesc *fdp, int fd, i fdp->fd_ofiles[fd] = fp; fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE); fp->f_iflags |= FIF_INSERTED; + mtx_leave(&fhdlk); } void @@ -978,7 +994,11 @@ restart: crhold(fp->f_cred); *resultfp = fp; *resultfd = i; - FREF(fp); + + mtx_enter(&fhdlk); + fp->f_count++; + mtx_leave(&fhdlk); + return (0); } @@ -1069,6 +1089,7 @@ fdcopy(struct process *pr) newfdp->fd_flags = fdp->fd_flags; newfdp->fd_cmask = fdp->fd_cmask; + mtx_enter(&fhdlk); for (i = 0; i <= fdp->fd_lastfile; i++) { struct file *fp = fdp->fd_ofiles[i]; @@ -1085,12 +1106,13 @@ fdcopy(struct process *pr) fp->f_type == DTYPE_KQUEUE) continue; - FREF(fp); + fp->f_count++; newfdp->fd_ofiles[i] = fp; newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i]; fd_used(newfdp, i); } } + mtx_leave(&fhdlk); fdpunlock(fdp); return (newfdp); @@ -1112,8 +1134,9 @@ fdfree(struct proc *p) for (i = fdp->fd_lastfile; i >= 0; i--, fpp++) { fp = *fpp; if (fp != NULL) { - FREF(fp); *fpp = NULL; + /* closef() expects a refcount of 2 */ + FREF(fp); (void) closef(fp, p); } } @@ -1149,11 +1172,11 @@ closef(struct file *fp, struct proc *p) if (fp == NULL) return (0); -#ifdef DIAGNOSTIC - if (fp->f_count < 2) - panic("closef: count (%ld) < 2", fp->f_count); -#endif + KASSERTMSG(fp->f_count >= 2, "count (%ld) < 2", fp->f_count); + + mtx_enter(&fhdlk); fp->f_count--; + mtx_leave(&fhdlk); /* * POSIX record locking dictates that any close releases ALL @@ -1185,18 +1208,19 @@ fdrop(struct file *fp, struct proc *p) { int error; -#ifdef DIAGNOSTIC - if (fp->f_count != 0) - panic("fdrop: count (%ld) != 0", fp->f_count); -#endif + MUTEX_ASSERT_LOCKED(&fhdlk); + + KASSERTMSG(fp->f_count == 0, "count (%ld) != 0", fp->f_count); + + if (fp->f_iflags & FIF_INSERTED) + LIST_REMOVE(fp, f_list); + mtx_leave(&fhdlk); if (fp->f_ops) error = (*fp->f_ops->fo_close)(fp, p); else error = 0; - if (fp->f_iflags & FIF_INSERTED) - LIST_REMOVE(fp, f_list); crfree(fp->f_cred); numfiles--; pool_put(&file_pool, fp); Index: kern/kern_ktrace.c =================================================================== RCS file: /cvs/src/sys/kern/kern_ktrace.c,v retrieving revision 1.97 diff -u -p -r1.97 kern_ktrace.c --- kern/kern_ktrace.c 27 May 2018 06:02:14 -0000 1.97 +++ kern/kern_ktrace.c 18 Jun 2018 09:16:21 -0000 @@ -225,7 +225,7 @@ ktrgenio(struct proc *p, int fd, enum ui struct ktr_header kth; struct ktr_genio ktp; caddr_t cp; - int count; + int count, error; int buflen; atomic_setbits_int(&p->p_flag, P_INKTR); @@ -254,7 +254,10 @@ ktrgenio(struct proc *p, int fd, enum ui if (copyin(iov->iov_base, cp, count)) break; - if (ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count) != 0) + KERNEL_LOCK(); + error = ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count); + KERNEL_UNLOCK(); + if (error != 0) break; iov->iov_len -= count; @@ -294,13 +297,14 @@ ktrstruct(struct proc *p, const char *na { struct ktr_header kth; - KERNEL_ASSERT_LOCKED(); atomic_setbits_int(&p->p_flag, P_INKTR); ktrinitheader(&kth, p, KTR_STRUCT); - + if (data == NULL) datalen = 0; + KERNEL_LOCK(); ktrwrite2(p, &kth, name, strlen(name) + 1, data, datalen); + KERNEL_UNLOCK(); atomic_clearbits_int(&p->p_flag, P_INKTR); } @@ -386,7 +390,9 @@ ktrpledge(struct proc *p, int error, uin kp.code = code; kp.syscall = syscall; + KERNEL_LOCK(); ktrwrite(p, &kth, &kp, sizeof(kp)); + KERNEL_UNLOCK(); atomic_clearbits_int(&p->p_flag, P_INKTR); } @@ -622,6 +628,8 @@ ktrwriteraw(struct proc *curp, struct vn struct iovec aiov[3]; struct process *pr; int error; + + KERNEL_ASSERT_LOCKED(); auio.uio_iov = &aiov[0]; auio.uio_offset = 0; Index: kern/kern_pledge.c =================================================================== RCS file: /cvs/src/sys/kern/kern_pledge.c,v retrieving revision 1.232 diff -u -p -r1.232 kern_pledge.c --- kern/kern_pledge.c 16 Jun 2018 15:37:00 -0000 1.232 +++ kern/kern_pledge.c 18 Jun 2018 09:16:21 -0000 @@ -524,6 +524,7 @@ pledge_fail(struct proc *p, int error, u if (p->p_p->ps_pledge & PLEDGE_ERROR) return (ENOSYS); + KERNEL_LOCK(); log(LOG_ERR, "%s[%d]: pledge \"%s\", syscall %d\n", p->p_p->ps_comm, p->p_p->ps_pid, codes, p->p_pledge_syscall); p->p_p->ps_acflag |= APLEDGE; @@ -536,6 +537,7 @@ pledge_fail(struct proc *p, int error, u psignal(p, SIGABRT); p->p_p->ps_pledge = 0; /* Disable all PLEDGE_ flags */ + KERNEL_UNLOCK(); return (error); } Index: kern/kern_sysctl.c =================================================================== RCS file: /cvs/src/sys/kern/kern_sysctl.c,v retrieving revision 1.341 diff -u -p -r1.341 kern_sysctl.c --- kern/kern_sysctl.c 2 Jun 2018 16:38:21 -0000 1.341 +++ kern/kern_sysctl.c 18 Jun 2018 09:16:21 -0000 @@ -1073,7 +1073,9 @@ fill_file(struct kinfo_file *kf, struct kf->f_flag = fp->f_flag; kf->f_iflags = fp->f_iflags; kf->f_type = fp->f_type; + mtx_enter(&fhdlk); kf->f_count = fp->f_count; + mtx_leave(&fhdlk); if (show_pointers) kf->f_ucred = PTRTOINT64(fp->f_cred); kf->f_uid = fp->f_cred->cr_uid; Index: kern/syscalls.master =================================================================== RCS file: /cvs/src/sys/kern/syscalls.master,v retrieving revision 1.181 diff -u -p -r1.181 syscalls.master --- kern/syscalls.master 28 May 2018 09:17:11 -0000 1.181 +++ kern/syscalls.master 18 Jun 2018 09:16:21 -0000 @@ -90,7 +90,7 @@ #endif 27 STD { ssize_t sys_recvmsg(int s, struct msghdr *msg, \ int flags); } -28 STD { ssize_t sys_sendmsg(int s, \ +28 STD NOLOCK { ssize_t sys_sendmsg(int s, \ const struct msghdr *msg, int flags); } 29 STD { ssize_t sys_recvfrom(int s, void *buf, size_t len, \ int flags, struct sockaddr *from, \ @@ -261,7 +261,7 @@ 130 OBSOL oftruncate 131 STD { int sys_flock(int fd, int how); } 132 STD { int sys_mkfifo(const char *path, mode_t mode); } -133 STD { ssize_t sys_sendto(int s, const void *buf, \ +133 STD NOLOCK { ssize_t sys_sendto(int s, const void *buf, \ size_t len, int flags, const struct sockaddr *to, \ socklen_t tolen); } 134 STD { int sys_shutdown(int s, int how); } Index: kern/uipc_syscalls.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v retrieving revision 1.176 diff -u -p -r1.176 uipc_syscalls.c --- kern/uipc_syscalls.c 18 Jun 2018 09:15:05 -0000 1.176 +++ kern/uipc_syscalls.c 18 Jun 2018 09:16:21 -0000 @@ -691,13 +691,16 @@ sendit(struct proc *p, int s, struct msg } #endif len = auio.uio_resid; - error = sosend(fp->f_data, to, &auio, NULL, control, flags); + error = sosend(so, to, &auio, NULL, control, flags); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; - if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0) + if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0) { + KERNEL_LOCK(); ptsignal(p, SIGPIPE, STHREAD); + KERNEL_UNLOCK(); + } } if (error == 0) { *retsize = len - auio.uio_resid; @@ -1176,7 +1179,8 @@ getsock(struct proc *p, int fdes, struct { struct file *fp; - if ((fp = fd_getfile(p->p_fd, fdes)) == NULL) + fp = fd_getfile(p->p_fd, fdes); + if (fp == NULL) return (EBADF); if (fp->f_type != DTYPE_SOCKET) { FRELE(fp, p); Index: kern/uipc_usrreq.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v retrieving revision 1.129 diff -u -p -r1.129 uipc_usrreq.c --- kern/uipc_usrreq.c 11 Jun 2018 08:57:35 -0000 1.129 +++ kern/uipc_usrreq.c 18 Jun 2018 09:16:22 -0000 @@ -906,6 +906,7 @@ unp_gc(void *arg __unused) fp = defer->ud_fp[i].fp; if (fp == NULL) continue; + /* closef() expects a refcount of 2 */ FREF(fp); if ((unp = fptounp(fp)) != NULL) unp->unp_msgcount--; @@ -922,6 +923,8 @@ unp_gc(void *arg __unused) do { nunref = 0; LIST_FOREACH(unp, &unp_head, unp_link) { + mtx_enter(&fhdlk); + fp = unp->unp_file; if (unp->unp_flags & UNP_GCDEFER) { /* * This socket is referenced by another @@ -932,8 +935,9 @@ unp_gc(void *arg __unused) unp_defer--; } else if (unp->unp_flags & UNP_GCMARK) { /* marked as live in previous pass */ + mtx_leave(&fhdlk); continue; - } else if ((fp = unp->unp_file) == NULL) { + } else if (fp == NULL) { /* not being passed, so can't be in loop */ } else if (fp->f_count == 0) { /* @@ -950,9 +954,11 @@ unp_gc(void *arg __unused) if (fp->f_count == unp->unp_msgcount) { nunref++; unp->unp_flags |= UNP_GCDEAD; + mtx_leave(&fhdlk); continue; } } + mtx_leave(&fhdlk); /* * This is the first time we've seen this socket on Index: sys/file.h =================================================================== RCS file: /cvs/src/sys/sys/file.h,v retrieving revision 1.48 diff -u -p -r1.48 file.h --- sys/file.h 18 Jun 2018 09:15:05 -0000 1.48 +++ sys/file.h 18 Jun 2018 09:16:22 -0000 @@ -65,6 +65,7 @@ struct fileops { * * Locks used to protect struct members in this file: * I immutable after creation + * F global `fhdlk' mutex * f per file `f_mtx' * k kernel lock */ @@ -77,7 +78,7 @@ struct file { #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_KQUEUE 4 /* event queue */ short f_type; /* [I] descriptor type */ - long f_count; /* [k] reference count */ + long f_count; /* [F] reference count */ struct ucred *f_cred; /* [I] credentials associated with descriptor */ struct fileops *f_ops; /* [I] file operation pointers */ off_t f_offset; /* [k] */ @@ -97,12 +98,25 @@ struct file { do { \ extern void vfs_stall_barrier(void); \ vfs_stall_barrier(); \ + mtx_enter(&fhdlk); \ (fp)->f_count++; \ + mtx_leave(&fhdlk); \ } while (0) -#define FRELE(fp,p) (--(fp)->f_count == 0 ? fdrop(fp, p) : 0) + +#define FRELE(fp,p) \ +({ \ + int rv = 0; \ + mtx_enter(&fhdlk); \ + if (--(fp)->f_count == 0) \ + rv = fdrop(fp, p); \ + else \ + mtx_leave(&fhdlk); \ + rv; \ +}) int fdrop(struct file *, struct proc *); +extern struct mutex fhdlk; /* protects `filehead' and f_count */ LIST_HEAD(filelist, file); extern int maxfiles; /* kernel limit on number of open files */ extern int numfiles; /* actual number of open files */