The diff below contains the interesting bits needed to unlock most of the
network-related syscalls.  As previously explained [0], we know that
`f_data' is immutable for sockets, so we only have to protect `f_count'.
This is done with a global mutex: `fhdlk'.
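In other words, every `f_count' transition now happens with `fhdlk' held,
and the final release enters fdrop() with the mutex still held, so dropping
the last reference and unlinking the file from `filehead' stay atomic.
Here is a minimal sketch of that pattern, simplified from what FREF() and
FRELE() become in the diff; the helper names are made up for illustration
and the vfs_stall_barrier() dance is left out:

/* Sketch only, not part of the diff. */
void
file_take_ref(struct file *fp)
{
	mtx_enter(&fhdlk);
	fp->f_count++;
	mtx_leave(&fhdlk);
}

int
file_drop_ref(struct file *fp, struct proc *p)
{
	int error = 0;

	mtx_enter(&fhdlk);
	if (--fp->f_count == 0)
		error = fdrop(fp, p);	/* fdrop() releases `fhdlk' */
	else
		mtx_leave(&fhdlk);

	return (error);
}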
I'm aware that this is not the best solution, but my goal is to remove the
KERNEL_LOCK(), not to design the fine-grained locking that doesn't exist
yet.  That said, if you feel like turning this mutex into SRPs or atomic
ops, please go ahead!  A rough sketch of what the atomic variant could look
like is included after the diff.  I'll continue to reduce the scope of the
KERNEL_LOCK() and the NET_LOCK(), but I'll do my best to review your work.

This diff also adds some more KERNEL_LOCK() for ktrace(1) and psignal().
I'm quite confident in it, as it has been tested by many people as part of
the big unlocking diff, but more tests/reviews are welcome!

ok?

[0] https://marc.info/?l=openbsd-tech&m=152699644924165&w=2

Index: kern/kern_descrip.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_descrip.c,v
retrieving revision 1.163
diff -u -p -r1.163 kern_descrip.c
--- kern/kern_descrip.c	2 Jun 2018 12:42:18 -0000	1.163
+++ kern/kern_descrip.c	4 Jun 2018 08:46:52 -0000
@@ -66,7 +66,11 @@
 
 /*
  * Descriptor management.
+ *
+ * We need to block interrupts as long as `fhdlk' is being taken
+ * with and without the KERNEL_LOCK().
  */
+struct mutex fhdlk = MUTEX_INITIALIZER(IPL_MPFLOOR);
 struct filelist filehead;	/* head of list of open files */
 int numfiles;			/* actual number of open files */
 
@@ -195,16 +199,18 @@ fd_iterfile(struct file *fp, struct proc
 {
 	struct file *nfp;
 
+	mtx_enter(&fhdlk);
 	if (fp == NULL)
 		nfp = LIST_FIRST(&filehead);
 	else
 		nfp = LIST_NEXT(fp, f_list);
 
-	/* don't FREF when f_count == 0 to avoid race in fdrop() */
+	/* don't refcount when f_count == 0 to avoid race in fdrop() */
 	while (nfp != NULL && nfp->f_count == 0)
 		nfp = LIST_NEXT(nfp, f_list);
 	if (nfp != NULL)
-		FREF(nfp);
+		nfp->f_count++;
+	mtx_leave(&fhdlk);
 
 	if (fp != NULL)
 		FRELE(fp, p);
@@ -217,10 +223,17 @@ fd_getfile(struct filedesc *fdp, int fd)
 {
 	struct file *fp;
 
-	if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL)
+	vfs_stall_barrier();
+
+	if ((u_int)fd >= fdp->fd_nfiles)
 		return (NULL);
 
-	FREF(fp);
+	mtx_enter(&fhdlk);
+	fp = fdp->fd_ofiles[fd];
+	if (fp != NULL)
+		fp->f_count++;
+	mtx_leave(&fhdlk);
+
 	return (fp);
 }
 
@@ -672,6 +685,8 @@ fdinsert(struct filedesc *fdp, int fd, i
 	struct file *fq;
 
 	fdpassertlocked(fdp);
+
+	mtx_enter(&fhdlk);
 	if ((fq = fdp->fd_ofiles[0]) != NULL) {
 		LIST_INSERT_AFTER(fq, fp, f_list);
 	} else {
@@ -681,6 +696,7 @@ fdinsert(struct filedesc *fdp, int fd, i
 	fdp->fd_ofiles[fd] = fp;
 	fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE);
 	fp->f_iflags |= FIF_INSERTED;
+	mtx_leave(&fhdlk);
 }
 
 void
@@ -986,7 +1002,11 @@ restart:
 	crhold(fp->f_cred);
 	*resultfp = fp;
 	*resultfd = i;
-	FREF(fp);
+
+	mtx_enter(&fhdlk);
+	fp->f_count++;
+	mtx_leave(&fhdlk);
+
 	return (0);
 }
 
@@ -1078,6 +1098,7 @@ fdcopy(struct process *pr)
 	newfdp->fd_flags = fdp->fd_flags;
 	newfdp->fd_cmask = fdp->fd_cmask;
 
+	mtx_enter(&fhdlk);
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
 		struct file *fp = fdp->fd_ofiles[i];
 
@@ -1094,12 +1115,13 @@ fdcopy(struct process *pr)
 			    fp->f_type == DTYPE_KQUEUE)
 				continue;
 
-			FREF(fp);
+			fp->f_count++;
 			newfdp->fd_ofiles[i] = fp;
 			newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
 			fd_used(newfdp, i);
 		}
 	}
+	mtx_leave(&fhdlk);
 	fdpunlock(fdp);
 
 	return (newfdp);
@@ -1121,8 +1143,9 @@ fdfree(struct proc *p)
 	for (i = fdp->fd_lastfile; i >= 0; i--, fpp++) {
 		fp = *fpp;
 		if (fp != NULL) {
-			FREF(fp);
 			*fpp = NULL;
+			/* closef() expects a refcount of 2 */
+			FREF(fp);
 			(void) closef(fp, p);
 		}
 	}
@@ -1160,11 +1183,11 @@ closef(struct file *fp, struct proc *p)
 	if (fp == NULL)
 		return (0);
 
-#ifdef DIAGNOSTIC
-	if (fp->f_count < 2)
-		panic("closef: count (%ld) < 2", fp->f_count);
-#endif
+	KASSERTMSG(fp->f_count >= 2, "count (%ld) < 2", fp->f_count);
+
+	mtx_enter(&fhdlk);
 	fp->f_count--;
+	mtx_leave(&fhdlk);
 
 	/*
 	 * POSIX record locking dictates that any close releases ALL
@@ -1196,18 +1219,19 @@ fdrop(struct file *fp, struct proc *p)
 {
 	int error;
 
-#ifdef DIAGNOSTIC
-	if (fp->f_count != 0)
-		panic("fdrop: count (%ld) != 0", fp->f_count);
-#endif
+	MUTEX_ASSERT_LOCKED(&fhdlk);
+
+	KASSERTMSG(fp->f_count == 0, "count (%ld) != 0", fp->f_count);
+
+	if (fp->f_iflags & FIF_INSERTED)
+		LIST_REMOVE(fp, f_list);
+	mtx_leave(&fhdlk);
 
 	if (fp->f_ops)
 		error = (*fp->f_ops->fo_close)(fp, p);
 	else
 		error = 0;
 
-	if (fp->f_iflags & FIF_INSERTED)
-		LIST_REMOVE(fp, f_list);
 	crfree(fp->f_cred);
 	numfiles--;
 	pool_put(&file_pool, fp);
Index: kern/kern_ktrace.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_ktrace.c,v
retrieving revision 1.97
diff -u -p -r1.97 kern_ktrace.c
--- kern/kern_ktrace.c	27 May 2018 06:02:14 -0000	1.97
+++ kern/kern_ktrace.c	4 Jun 2018 08:46:52 -0000
@@ -225,7 +225,7 @@ ktrgenio(struct proc *p, int fd, enum ui
 	struct ktr_header kth;
 	struct ktr_genio ktp;
 	caddr_t cp;
-	int count;
+	int count, error;
 	int buflen;
 
 	atomic_setbits_int(&p->p_flag, P_INKTR);
@@ -254,7 +254,10 @@ ktrgenio(struct proc *p, int fd, enum ui
 		if (copyin(iov->iov_base, cp, count))
 			break;
 
-		if (ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count) != 0)
+		KERNEL_LOCK();
+		error = ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count);
+		KERNEL_UNLOCK();
+		if (error != 0)
 			break;
 
 		iov->iov_len -= count;
@@ -294,13 +297,14 @@ ktrstruct(struct proc *p, const char *na
 {
 	struct ktr_header kth;
 
-	KERNEL_ASSERT_LOCKED();
 	atomic_setbits_int(&p->p_flag, P_INKTR);
 	ktrinitheader(&kth, p, KTR_STRUCT);
-
+
 	if (data == NULL)
 		datalen = 0;
+	KERNEL_LOCK();
 	ktrwrite2(p, &kth, name, strlen(name) + 1, data, datalen);
+	KERNEL_UNLOCK();
 
 	atomic_clearbits_int(&p->p_flag, P_INKTR);
 }
@@ -386,7 +390,9 @@ ktrpledge(struct proc *p, int error, uin
 	kp.code = code;
 	kp.syscall = syscall;
 
+	KERNEL_LOCK();
 	ktrwrite(p, &kth, &kp, sizeof(kp));
+	KERNEL_UNLOCK();
 
 	atomic_clearbits_int(&p->p_flag, P_INKTR);
 }
@@ -622,6 +628,8 @@ ktrwriteraw(struct proc *curp, struct vn
 	struct iovec aiov[3];
 	struct process *pr;
 	int error;
+
+	KERNEL_ASSERT_LOCKED();
 
 	auio.uio_iov = &aiov[0];
 	auio.uio_offset = 0;
Index: kern/kern_pledge.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_pledge.c,v
retrieving revision 1.231
diff -u -p -r1.231 kern_pledge.c
--- kern/kern_pledge.c	3 Jun 2018 18:20:28 -0000	1.231
+++ kern/kern_pledge.c	4 Jun 2018 08:46:52 -0000
@@ -523,6 +523,7 @@ pledge_fail(struct proc *p, int error, u
 	if (p->p_p->ps_pledge & PLEDGE_ERROR)
 		return (ENOSYS);
 
+	KERNEL_LOCK();
 	log(LOG_ERR, "%s[%d]: pledge \"%s\", syscall %d\n",
 	    p->p_p->ps_comm, p->p_p->ps_pid, codes, p->p_pledge_syscall);
 	p->p_p->ps_acflag |= APLEDGE;
@@ -535,6 +536,7 @@ pledge_fail(struct proc *p, int error, u
 	psignal(p, SIGABRT);
 
 	p->p_p->ps_pledge = 0;		/* Disable all PLEDGE_ flags */
+	KERNEL_UNLOCK();
 	return (error);
 }
 
Index: kern/kern_sysctl.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.341
diff -u -p -r1.341 kern_sysctl.c
--- kern/kern_sysctl.c	2 Jun 2018 16:38:21 -0000	1.341
+++ kern/kern_sysctl.c	4 Jun 2018 08:46:52 -0000
@@ -1073,7 +1073,9 @@ fill_file(struct kinfo_file *kf, struct
 	kf->f_flag = fp->f_flag;
 	kf->f_iflags = fp->f_iflags;
 	kf->f_type = fp->f_type;
+	mtx_enter(&fhdlk);
 	kf->f_count = fp->f_count;
+	mtx_leave(&fhdlk);
 	if (show_pointers)
 		kf->f_ucred = PTRTOINT64(fp->f_cred);
 	kf->f_uid = fp->f_cred->cr_uid;
Index: kern/syscalls.master
===================================================================
RCS file: /cvs/src/sys/kern/syscalls.master,v
retrieving revision 1.181
diff -u -p -r1.181 syscalls.master
--- kern/syscalls.master	28 May 2018 09:17:11 -0000	1.181
+++ kern/syscalls.master	4 Jun 2018 08:46:52 -0000
@@ -90,7 +90,7 @@
 #endif
 27	STD		{ ssize_t sys_recvmsg(int s, struct msghdr *msg, \
 			    int flags); }
-28	STD		{ ssize_t sys_sendmsg(int s, \
+28	STD NOLOCK	{ ssize_t sys_sendmsg(int s, \
 			    const struct msghdr *msg, int flags); }
 29	STD		{ ssize_t sys_recvfrom(int s, void *buf, size_t len, \
 			    int flags, struct sockaddr *from, \
@@ -261,7 +261,7 @@
 130	OBSOL		oftruncate
 131	STD		{ int sys_flock(int fd, int how); }
 132	STD		{ int sys_mkfifo(const char *path, mode_t mode); }
-133	STD		{ ssize_t sys_sendto(int s, const void *buf, \
+133	STD NOLOCK	{ ssize_t sys_sendto(int s, const void *buf, \
 			    size_t len, int flags, const struct sockaddr *to, \
 			    socklen_t tolen); }
 134	STD		{ int sys_shutdown(int s, int how); }
Index: kern/uipc_syscalls.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v
retrieving revision 1.173
diff -u -p -r1.173 uipc_syscalls.c
--- kern/uipc_syscalls.c	2 Jun 2018 10:27:43 -0000	1.173
+++ kern/uipc_syscalls.c	4 Jun 2018 08:46:52 -0000
@@ -691,13 +691,16 @@ sendit(struct proc *p, int s, struct msg
 	}
 #endif
 	len = auio.uio_resid;
-	error = sosend(fp->f_data, to, &auio, NULL, control, flags);
+	error = sosend(so, to, &auio, NULL, control, flags);
 	if (error) {
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
-		if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0)
+		if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0) {
+			KERNEL_LOCK();
 			ptsignal(p, SIGPIPE, STHREAD);
+			KERNEL_UNLOCK();
+		}
 	}
 	if (error == 0) {
 		*retsize = len - auio.uio_resid;
@@ -1176,7 +1179,8 @@ getsock(struct proc *p, int fdes, struct
 {
 	struct file *fp;
 
-	if ((fp = fd_getfile(p->p_fd, fdes)) == NULL)
+	fp = fd_getfile(p->p_fd, fdes);
+	if (fp == NULL)
 		return (EBADF);
 	if (fp->f_type != DTYPE_SOCKET) {
 		FRELE(fp, p);
Index: kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.126
diff -u -p -r1.126 uipc_usrreq.c
--- kern/uipc_usrreq.c	28 Apr 2018 03:13:04 -0000	1.126
+++ kern/uipc_usrreq.c	4 Jun 2018 08:46:52 -0000
@@ -899,6 +899,7 @@ unp_gc(void *arg __unused)
 		fp = defer->ud_fp[i].fp;
 		if (fp == NULL)
 			continue;
+		/* closef() expects a refcount of 2 */
 		FREF(fp);
 		if ((unp = fptounp(fp)) != NULL)
 			unp->unp_msgcount--;
@@ -915,6 +916,8 @@ unp_gc(void *arg __unused)
 	do {
 		nunref = 0;
 		LIST_FOREACH(unp, &unp_head, unp_link) {
+			mtx_enter(&fhdlk);
+			fp = unp->unp_file;
 			if (unp->unp_flags & UNP_GCDEFER) {
 				/*
 				 * This socket is referenced by another
@@ -925,8 +928,9 @@ unp_gc(void *arg __unused)
 				unp_defer--;
 			} else if (unp->unp_flags & UNP_GCMARK) {
 				/* marked as live in previous pass */
+				mtx_leave(&fhdlk);
 				continue;
-			} else if ((fp = unp->unp_file) == NULL) {
+			} else if (fp == NULL) {
 				/* not being passed, so can't be in loop */
 			} else if (fp->f_count == 0) {
 				/*
@@ -943,9 +947,11 @@ unp_gc(void *arg __unused)
 				if (fp->f_count == unp->unp_msgcount) {
 					nunref++;
 					unp->unp_flags |= UNP_GCDEAD;
+					mtx_leave(&fhdlk);
 					continue;
 				}
 			}
+			mtx_leave(&fhdlk);
 
 			/*
 			 * This is the first time we've seen this socket on
Index: sys/file.h
===================================================================
RCS file: /cvs/src/sys/sys/file.h,v
retrieving revision 1.46
diff -u -p -r1.46 file.h
--- sys/file.h	2 Jun 2018 10:27:43 -0000	1.46
+++ sys/file.h	4 Jun 2018 08:46:52 -0000
@@ -65,6 +65,7 @@ struct fileops {
  *
  * Locks used to protect struct members in this file:
  *	I	immutable after creation
+ *	F	global `fhdlk' mutex
 *	f	per file `f_mtx'
 *	k	kernel lock
 */
@@ -77,7 +78,7 @@ struct file {
 #define	DTYPE_PIPE	3	/* pipe */
 #define	DTYPE_KQUEUE	4	/* event queue */
 	short	f_type;		/* [I] descriptor type */
-	long	f_count;	/* [k] reference count */
+	long	f_count;	/* [F] reference count */
 	struct	ucred *f_cred;	/* [I] credentials associated with descriptor */
 	struct	fileops *f_ops;	/* [I] file operation pointers */
 	off_t	f_offset;	/* [k] */
@@ -97,12 +98,25 @@ struct file {
 	do { \
 		extern void vfs_stall_barrier(void); \
 		vfs_stall_barrier(); \
+		mtx_enter(&fhdlk); \
 		(fp)->f_count++; \
+		mtx_leave(&fhdlk); \
 	} while (0)
 
-#define FRELE(fp,p) (--(fp)->f_count == 0 ? fdrop(fp, p) : 0)
+
+#define FRELE(fp,p) \
+({ \
+	int rv = 0; \
+	mtx_enter(&fhdlk); \
+	if (--(fp)->f_count == 0) \
+		rv = fdrop(fp, p); \
+	else \
+		mtx_leave(&fhdlk); \
+	rv; \
+})
 
 int	fdrop(struct file *, struct proc *);
+extern struct mutex fhdlk;	/* protects `filehead' and f_count */
 LIST_HEAD(filelist, file);
 extern int maxfiles;		/* kernel limit on number of open files */
 extern int numfiles;		/* actual number of open files */
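As for the atomic-op variant mentioned above, here is a rough, untested
sketch of what the increment side could look like.  It is only a starting
point: mixing lock-free and mutex-protected updates of the same counter
would not be safe, so a real conversion would have to switch every
`f_count' access at once and rethink the f_count == 0 checks in
fd_iterfile() and unp_gc().  The cast is needed because `f_count' is a
signed long while atomic_inc_long(9) takes an unsigned long pointer.

#include <sys/atomic.h>

/* Hypothetical lock-free replacement for the increment in FREF(). */
static inline void
file_ref_atomic(struct file *fp)
{
	atomic_inc_long((unsigned long *)&fp->f_count);
}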
