> Date: Mon, 18 Jun 2018 11:24:00 +0200 > From: Martin Pieuchot <m...@openbsd.org> > > Diff below is the last of the serie to remove the KERNEL_LOCK() from > sendto(2) and sendmsg(2) for sockets protected by the NET_LOCK(). > > As explained previously [0] this diff uses a simple non-intrusive > approached based on a single mutex. I'd like to get this in before > doing any refactoring of this layer to get real test coverages. > > [0] https://marc.info/?l=openbsd-tech&m=152699644924165&w=2 > > Tests? Comments? Oks?
At first sight this diff seems to mess up the locking quite a bit. I'm sure that is intentional, but still I'm not sure this is the right direction. In my mind we should at some point in the future be able to use atomic operations to manage the reference count on struct file (f_count). Using atomic operations has the huge benefit of not having lock ordering problems! But it looks as if this diff gets us further away from that goal. You'll still need a lock for the global list of course. It is easy to protect insertions and removals. The real problem is indeed fd_iterfile(). There you want to take a reference but that isn't really safe since you don't necessarily hold one already. That's why we have that f_count == 0 check there. And as long as you're holding the lock that protects the list you know the struct file is not going to disappear. So if f_count > 0 you can safely take a reference. But calling FREF while holding a mutex is problematic because of vfs_stall_barrier(). You're correct that vfs_stall_barrier() isn't required here. All you need to do is atomically increase the reference count. So this one is indeed a bit special. And using a mutex to protect the reference counts could still work here. What I don't understand is why you also protect the manipulation of the file descriptor tables with the mutex in some places. Those tables are already protected by a lock! All in all, I think that using atomic operations for the reference count would really help. > Index: kern/kern_descrip.c > =================================================================== > RCS file: /cvs/src/sys/kern/kern_descrip.c,v > retrieving revision 1.166 > diff -u -p -r1.166 kern_descrip.c > --- kern/kern_descrip.c 18 Jun 2018 09:15:05 -0000 1.166 > +++ kern/kern_descrip.c 18 Jun 2018 09:16:20 -0000 > @@ -66,7 +66,11 @@ > > /* > * Descriptor management. > + * > + * We need to block interrupts as long as `fhdlk' is being taken > + * with and without the KERNEL_LOCK(). > */ > +struct mutex fhdlk = MUTEX_INITIALIZER(IPL_MPFLOOR); > struct filelist filehead; /* head of list of open files */ > int numfiles; /* actual number of open files */ > > @@ -195,16 +199,18 @@ fd_iterfile(struct file *fp, struct proc > { > struct file *nfp; > > + mtx_enter(&fhdlk); > if (fp == NULL) > nfp = LIST_FIRST(&filehead); > else > nfp = LIST_NEXT(fp, f_list); > > - /* don't FREF when f_count == 0 to avoid race in fdrop() */ > + /* don't refcount when f_count == 0 to avoid race in fdrop() */ > while (nfp != NULL && nfp->f_count == 0) > nfp = LIST_NEXT(nfp, f_list); > if (nfp != NULL) > - FREF(nfp); > + nfp->f_count++; > + mtx_leave(&fhdlk); > > if (fp != NULL) > FRELE(fp, p); > @@ -217,10 +223,17 @@ fd_getfile(struct filedesc *fdp, int fd) > { > struct file *fp; > > - if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) > + vfs_stall_barrier(); > + > + if ((u_int)fd >= fdp->fd_nfiles) > return (NULL); > > - FREF(fp); > + mtx_enter(&fhdlk); > + fp = fdp->fd_ofiles[fd]; > + if (fp != NULL) > + fp->f_count++; > + mtx_leave(&fhdlk); > + > return (fp); > } > > @@ -671,6 +684,8 @@ fdinsert(struct filedesc *fdp, int fd, i > struct file *fq; > > fdpassertlocked(fdp); > + > + mtx_enter(&fhdlk); > if ((fq = fdp->fd_ofiles[0]) != NULL) { > LIST_INSERT_AFTER(fq, fp, f_list); > } else { > @@ -680,6 +695,7 @@ fdinsert(struct filedesc *fdp, int fd, i > fdp->fd_ofiles[fd] = fp; > fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE); > fp->f_iflags |= FIF_INSERTED; > + mtx_leave(&fhdlk); > } > > void > @@ -978,7 +994,11 @@ restart: > crhold(fp->f_cred); > *resultfp = fp; > *resultfd = i; > - FREF(fp); > + > + mtx_enter(&fhdlk); > + fp->f_count++; > + mtx_leave(&fhdlk); > + > return (0); > } > > @@ -1069,6 +1089,7 @@ fdcopy(struct process *pr) > newfdp->fd_flags = fdp->fd_flags; > newfdp->fd_cmask = fdp->fd_cmask; > > + mtx_enter(&fhdlk); > for (i = 0; i <= fdp->fd_lastfile; i++) { > struct file *fp = fdp->fd_ofiles[i]; > > @@ -1085,12 +1106,13 @@ fdcopy(struct process *pr) > fp->f_type == DTYPE_KQUEUE) > continue; > > - FREF(fp); > + fp->f_count++; > newfdp->fd_ofiles[i] = fp; > newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i]; > fd_used(newfdp, i); > } > } > + mtx_leave(&fhdlk); > fdpunlock(fdp); > > return (newfdp); > @@ -1112,8 +1134,9 @@ fdfree(struct proc *p) > for (i = fdp->fd_lastfile; i >= 0; i--, fpp++) { > fp = *fpp; > if (fp != NULL) { > - FREF(fp); > *fpp = NULL; > + /* closef() expects a refcount of 2 */ > + FREF(fp); > (void) closef(fp, p); > } > } > @@ -1149,11 +1172,11 @@ closef(struct file *fp, struct proc *p) > if (fp == NULL) > return (0); > > -#ifdef DIAGNOSTIC > - if (fp->f_count < 2) > - panic("closef: count (%ld) < 2", fp->f_count); > -#endif > + KASSERTMSG(fp->f_count >= 2, "count (%ld) < 2", fp->f_count); > + > + mtx_enter(&fhdlk); > fp->f_count--; > + mtx_leave(&fhdlk); > > /* > * POSIX record locking dictates that any close releases ALL > @@ -1185,18 +1208,19 @@ fdrop(struct file *fp, struct proc *p) > { > int error; > > -#ifdef DIAGNOSTIC > - if (fp->f_count != 0) > - panic("fdrop: count (%ld) != 0", fp->f_count); > -#endif > + MUTEX_ASSERT_LOCKED(&fhdlk); > + > + KASSERTMSG(fp->f_count == 0, "count (%ld) != 0", fp->f_count); > + > + if (fp->f_iflags & FIF_INSERTED) > + LIST_REMOVE(fp, f_list); > + mtx_leave(&fhdlk); > > if (fp->f_ops) > error = (*fp->f_ops->fo_close)(fp, p); > else > error = 0; > > - if (fp->f_iflags & FIF_INSERTED) > - LIST_REMOVE(fp, f_list); > crfree(fp->f_cred); > numfiles--; > pool_put(&file_pool, fp); > Index: kern/kern_ktrace.c > =================================================================== > RCS file: /cvs/src/sys/kern/kern_ktrace.c,v > retrieving revision 1.97 > diff -u -p -r1.97 kern_ktrace.c > --- kern/kern_ktrace.c 27 May 2018 06:02:14 -0000 1.97 > +++ kern/kern_ktrace.c 18 Jun 2018 09:16:21 -0000 > @@ -225,7 +225,7 @@ ktrgenio(struct proc *p, int fd, enum ui > struct ktr_header kth; > struct ktr_genio ktp; > caddr_t cp; > - int count; > + int count, error; > int buflen; > > atomic_setbits_int(&p->p_flag, P_INKTR); > @@ -254,7 +254,10 @@ ktrgenio(struct proc *p, int fd, enum ui > if (copyin(iov->iov_base, cp, count)) > break; > > - if (ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count) != 0) > + KERNEL_LOCK(); > + error = ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count); > + KERNEL_UNLOCK(); > + if (error != 0) > break; > > iov->iov_len -= count; > @@ -294,13 +297,14 @@ ktrstruct(struct proc *p, const char *na > { > struct ktr_header kth; > > - KERNEL_ASSERT_LOCKED(); > atomic_setbits_int(&p->p_flag, P_INKTR); > ktrinitheader(&kth, p, KTR_STRUCT); > - > + > if (data == NULL) > datalen = 0; > + KERNEL_LOCK(); > ktrwrite2(p, &kth, name, strlen(name) + 1, data, datalen); > + KERNEL_UNLOCK(); > atomic_clearbits_int(&p->p_flag, P_INKTR); > } > > @@ -386,7 +390,9 @@ ktrpledge(struct proc *p, int error, uin > kp.code = code; > kp.syscall = syscall; > > + KERNEL_LOCK(); > ktrwrite(p, &kth, &kp, sizeof(kp)); > + KERNEL_UNLOCK(); > atomic_clearbits_int(&p->p_flag, P_INKTR); > } > > @@ -622,6 +628,8 @@ ktrwriteraw(struct proc *curp, struct vn > struct iovec aiov[3]; > struct process *pr; > int error; > + > + KERNEL_ASSERT_LOCKED(); > > auio.uio_iov = &aiov[0]; > auio.uio_offset = 0; > Index: kern/kern_pledge.c > =================================================================== > RCS file: /cvs/src/sys/kern/kern_pledge.c,v > retrieving revision 1.232 > diff -u -p -r1.232 kern_pledge.c > --- kern/kern_pledge.c 16 Jun 2018 15:37:00 -0000 1.232 > +++ kern/kern_pledge.c 18 Jun 2018 09:16:21 -0000 > @@ -524,6 +524,7 @@ pledge_fail(struct proc *p, int error, u > if (p->p_p->ps_pledge & PLEDGE_ERROR) > return (ENOSYS); > > + KERNEL_LOCK(); > log(LOG_ERR, "%s[%d]: pledge \"%s\", syscall %d\n", > p->p_p->ps_comm, p->p_p->ps_pid, codes, p->p_pledge_syscall); > p->p_p->ps_acflag |= APLEDGE; > @@ -536,6 +537,7 @@ pledge_fail(struct proc *p, int error, u > psignal(p, SIGABRT); > > p->p_p->ps_pledge = 0; /* Disable all PLEDGE_ flags */ > + KERNEL_UNLOCK(); > return (error); > } > > Index: kern/kern_sysctl.c > =================================================================== > RCS file: /cvs/src/sys/kern/kern_sysctl.c,v > retrieving revision 1.341 > diff -u -p -r1.341 kern_sysctl.c > --- kern/kern_sysctl.c 2 Jun 2018 16:38:21 -0000 1.341 > +++ kern/kern_sysctl.c 18 Jun 2018 09:16:21 -0000 > @@ -1073,7 +1073,9 @@ fill_file(struct kinfo_file *kf, struct > kf->f_flag = fp->f_flag; > kf->f_iflags = fp->f_iflags; > kf->f_type = fp->f_type; > + mtx_enter(&fhdlk); > kf->f_count = fp->f_count; > + mtx_leave(&fhdlk); > if (show_pointers) > kf->f_ucred = PTRTOINT64(fp->f_cred); > kf->f_uid = fp->f_cred->cr_uid; > Index: kern/syscalls.master > =================================================================== > RCS file: /cvs/src/sys/kern/syscalls.master,v > retrieving revision 1.181 > diff -u -p -r1.181 syscalls.master > --- kern/syscalls.master 28 May 2018 09:17:11 -0000 1.181 > +++ kern/syscalls.master 18 Jun 2018 09:16:21 -0000 > @@ -90,7 +90,7 @@ > #endif > 27 STD { ssize_t sys_recvmsg(int s, struct msghdr *msg, \ > int flags); } > -28 STD { ssize_t sys_sendmsg(int s, \ > +28 STD NOLOCK { ssize_t sys_sendmsg(int s, \ > const struct msghdr *msg, int flags); } > 29 STD { ssize_t sys_recvfrom(int s, void *buf, size_t len, \ > int flags, struct sockaddr *from, \ > @@ -261,7 +261,7 @@ > 130 OBSOL oftruncate > 131 STD { int sys_flock(int fd, int how); } > 132 STD { int sys_mkfifo(const char *path, mode_t mode); } > -133 STD { ssize_t sys_sendto(int s, const void *buf, \ > +133 STD NOLOCK { ssize_t sys_sendto(int s, const void *buf, \ > size_t len, int flags, const struct sockaddr *to, \ > socklen_t tolen); } > 134 STD { int sys_shutdown(int s, int how); } > Index: kern/uipc_syscalls.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v > retrieving revision 1.176 > diff -u -p -r1.176 uipc_syscalls.c > --- kern/uipc_syscalls.c 18 Jun 2018 09:15:05 -0000 1.176 > +++ kern/uipc_syscalls.c 18 Jun 2018 09:16:21 -0000 > @@ -691,13 +691,16 @@ sendit(struct proc *p, int s, struct msg > } > #endif > len = auio.uio_resid; > - error = sosend(fp->f_data, to, &auio, NULL, control, flags); > + error = sosend(so, to, &auio, NULL, control, flags); > if (error) { > if (auio.uio_resid != len && (error == ERESTART || > error == EINTR || error == EWOULDBLOCK)) > error = 0; > - if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0) > + if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0) { > + KERNEL_LOCK(); > ptsignal(p, SIGPIPE, STHREAD); > + KERNEL_UNLOCK(); > + } > } > if (error == 0) { > *retsize = len - auio.uio_resid; > @@ -1176,7 +1179,8 @@ getsock(struct proc *p, int fdes, struct > { > struct file *fp; > > - if ((fp = fd_getfile(p->p_fd, fdes)) == NULL) > + fp = fd_getfile(p->p_fd, fdes); > + if (fp == NULL) > return (EBADF); > if (fp->f_type != DTYPE_SOCKET) { > FRELE(fp, p); > Index: kern/uipc_usrreq.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v > retrieving revision 1.129 > diff -u -p -r1.129 uipc_usrreq.c > --- kern/uipc_usrreq.c 11 Jun 2018 08:57:35 -0000 1.129 > +++ kern/uipc_usrreq.c 18 Jun 2018 09:16:22 -0000 > @@ -906,6 +906,7 @@ unp_gc(void *arg __unused) > fp = defer->ud_fp[i].fp; > if (fp == NULL) > continue; > + /* closef() expects a refcount of 2 */ > FREF(fp); > if ((unp = fptounp(fp)) != NULL) > unp->unp_msgcount--; > @@ -922,6 +923,8 @@ unp_gc(void *arg __unused) > do { > nunref = 0; > LIST_FOREACH(unp, &unp_head, unp_link) { > + mtx_enter(&fhdlk); > + fp = unp->unp_file; > if (unp->unp_flags & UNP_GCDEFER) { > /* > * This socket is referenced by another > @@ -932,8 +935,9 @@ unp_gc(void *arg __unused) > unp_defer--; > } else if (unp->unp_flags & UNP_GCMARK) { > /* marked as live in previous pass */ > + mtx_leave(&fhdlk); > continue; > - } else if ((fp = unp->unp_file) == NULL) { > + } else if (fp == NULL) { > /* not being passed, so can't be in loop */ > } else if (fp->f_count == 0) { > /* > @@ -950,9 +954,11 @@ unp_gc(void *arg __unused) > if (fp->f_count == unp->unp_msgcount) { > nunref++; > unp->unp_flags |= UNP_GCDEAD; > + mtx_leave(&fhdlk); > continue; > } > } > + mtx_leave(&fhdlk); > > /* > * This is the first time we've seen this socket on > Index: sys/file.h > =================================================================== > RCS file: /cvs/src/sys/sys/file.h,v > retrieving revision 1.48 > diff -u -p -r1.48 file.h > --- sys/file.h 18 Jun 2018 09:15:05 -0000 1.48 > +++ sys/file.h 18 Jun 2018 09:16:22 -0000 > @@ -65,6 +65,7 @@ struct fileops { > * > * Locks used to protect struct members in this file: > * I immutable after creation > + * F global `fhdlk' mutex > * f per file `f_mtx' > * k kernel lock > */ > @@ -77,7 +78,7 @@ struct file { > #define DTYPE_PIPE 3 /* pipe */ > #define DTYPE_KQUEUE 4 /* event queue */ > short f_type; /* [I] descriptor type */ > - long f_count; /* [k] reference count */ > + long f_count; /* [F] reference count */ > struct ucred *f_cred; /* [I] credentials associated with descriptor */ > struct fileops *f_ops; /* [I] file operation pointers */ > off_t f_offset; /* [k] */ > @@ -97,12 +98,25 @@ struct file { > do { \ > extern void vfs_stall_barrier(void); \ > vfs_stall_barrier(); \ > + mtx_enter(&fhdlk); \ > (fp)->f_count++; \ > + mtx_leave(&fhdlk); \ > } while (0) > -#define FRELE(fp,p) (--(fp)->f_count == 0 ? fdrop(fp, p) : 0) > + > +#define FRELE(fp,p) \ > +({ \ > + int rv = 0; \ > + mtx_enter(&fhdlk); \ > + if (--(fp)->f_count == 0) \ > + rv = fdrop(fp, p); \ > + else \ > + mtx_leave(&fhdlk); \ > + rv; \ > +}) > > int fdrop(struct file *, struct proc *); > > +extern struct mutex fhdlk; /* protects `filehead' and f_count */ > LIST_HEAD(filelist, file); > extern int maxfiles; /* kernel limit on number of open files > */ > extern int numfiles; /* actual number of open files */ > >