> Date: Mon, 18 Jun 2018 11:24:00 +0200
> From: Martin Pieuchot <m...@openbsd.org>
> 
> Diff below is the last of the serie to remove the KERNEL_LOCK() from
> sendto(2) and sendmsg(2) for sockets protected by the NET_LOCK().
> 
> As explained previously [0] this diff uses a simple non-intrusive
> approached based on a single mutex.  I'd like to get this in before
> doing any refactoring of this layer to get real test coverages.
> 
> [0] https://marc.info/?l=openbsd-tech&m=152699644924165&w=2
> 
> Tests?  Comments?  Oks?

At first sight this diff seems to mess up the locking quite a bit.
I'm sure that is intentional, but still I'm not sure this is the right
direction.  In my mind we should at some point in the future be able
to use atomic operations to manage the reference count on struct file
(f_count).  Using atomic operations has the huge benefit of not having
lock ordering problems!  But it looks as if this diff gets us further
away from that goal.

You'll still need a lock for the global list of course.  It is easy to
protect insertions and removals.  The real problem is indeed
fd_iterfile().  There you want to take a reference but that isn't
really safe since you don't necessarily hold one already.  That's why
we have that f_count == 0 check there.  And as long as you're holding
the lock that protects the list you know the struct file is not going
to disappear.  So if f_count > 0 you can safely take a reference.  But
calling FREF while holding a mutex is problematic because of
vfs_stall_barrier().  You're correct that vfs_stall_barrier() isn't
required here.  All you need to do is atomically increase the
reference count.  So this one is indeed a bit special.  And using a
mutex to protect the reference counts could still work here.

What I don't understand is why you also protect the manipulation of
the file descriptor tables with the mutex in some places.  Those
tables are already protected by a lock!

All in all, I think that using atomic operations for the reference
count would really help.

> Index: kern/kern_descrip.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_descrip.c,v
> retrieving revision 1.166
> diff -u -p -r1.166 kern_descrip.c
> --- kern/kern_descrip.c       18 Jun 2018 09:15:05 -0000      1.166
> +++ kern/kern_descrip.c       18 Jun 2018 09:16:20 -0000
> @@ -66,7 +66,11 @@
>  
>  /*
>   * Descriptor management.
> + *
> + * We need to block interrupts as long as `fhdlk' is being taken
> + * with and without the KERNEL_LOCK().
>   */
> +struct mutex fhdlk = MUTEX_INITIALIZER(IPL_MPFLOOR);
>  struct filelist filehead;    /* head of list of open files */
>  int numfiles;                        /* actual number of open files */
>  
> @@ -195,16 +199,18 @@ fd_iterfile(struct file *fp, struct proc
>  {
>       struct file *nfp;
>  
> +     mtx_enter(&fhdlk);
>       if (fp == NULL)
>               nfp = LIST_FIRST(&filehead);
>       else
>               nfp = LIST_NEXT(fp, f_list);
>  
> -     /* don't FREF when f_count == 0 to avoid race in fdrop() */
> +     /* don't refcount when f_count == 0 to avoid race in fdrop() */
>       while (nfp != NULL && nfp->f_count == 0)
>               nfp = LIST_NEXT(nfp, f_list);
>       if (nfp != NULL)
> -             FREF(nfp);
> +             nfp->f_count++;
> +     mtx_leave(&fhdlk);
>  
>       if (fp != NULL)
>               FRELE(fp, p);
> @@ -217,10 +223,17 @@ fd_getfile(struct filedesc *fdp, int fd)
>  {
>       struct file *fp;
>  
> -     if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL)
> +     vfs_stall_barrier();
> +
> +     if ((u_int)fd >= fdp->fd_nfiles)
>               return (NULL);
>  
> -     FREF(fp);
> +     mtx_enter(&fhdlk);
> +     fp = fdp->fd_ofiles[fd];
> +     if (fp != NULL)
> +             fp->f_count++;
> +     mtx_leave(&fhdlk);
> +
>       return (fp);
>  }
>  
> @@ -671,6 +684,8 @@ fdinsert(struct filedesc *fdp, int fd, i
>       struct file *fq;
>  
>       fdpassertlocked(fdp);
> +
> +     mtx_enter(&fhdlk);
>       if ((fq = fdp->fd_ofiles[0]) != NULL) {
>               LIST_INSERT_AFTER(fq, fp, f_list);
>       } else {
> @@ -680,6 +695,7 @@ fdinsert(struct filedesc *fdp, int fd, i
>       fdp->fd_ofiles[fd] = fp;
>       fdp->fd_ofileflags[fd] |= (flags & UF_EXCLOSE);
>       fp->f_iflags |= FIF_INSERTED;
> +     mtx_leave(&fhdlk);
>  }
>  
>  void
> @@ -978,7 +994,11 @@ restart:
>       crhold(fp->f_cred);
>       *resultfp = fp;
>       *resultfd = i;
> -     FREF(fp);
> +
> +     mtx_enter(&fhdlk);
> +     fp->f_count++;
> +     mtx_leave(&fhdlk);
> +
>       return (0);
>  }
>  
> @@ -1069,6 +1089,7 @@ fdcopy(struct process *pr)
>       newfdp->fd_flags = fdp->fd_flags;
>       newfdp->fd_cmask = fdp->fd_cmask;
>  
> +     mtx_enter(&fhdlk);
>       for (i = 0; i <= fdp->fd_lastfile; i++) {
>               struct file *fp = fdp->fd_ofiles[i];
>  
> @@ -1085,12 +1106,13 @@ fdcopy(struct process *pr)
>                           fp->f_type == DTYPE_KQUEUE)
>                               continue;
>  
> -                     FREF(fp);
> +                     fp->f_count++;
>                       newfdp->fd_ofiles[i] = fp;
>                       newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
>                       fd_used(newfdp, i);
>               }
>       }
> +     mtx_leave(&fhdlk);
>       fdpunlock(fdp);
>  
>       return (newfdp);
> @@ -1112,8 +1134,9 @@ fdfree(struct proc *p)
>       for (i = fdp->fd_lastfile; i >= 0; i--, fpp++) {
>               fp = *fpp;
>               if (fp != NULL) {
> -                     FREF(fp);
>                       *fpp = NULL;
> +                      /* closef() expects a refcount of 2 */
> +                     FREF(fp);
>                       (void) closef(fp, p);
>               }
>       }
> @@ -1149,11 +1172,11 @@ closef(struct file *fp, struct proc *p)
>       if (fp == NULL)
>               return (0);
>  
> -#ifdef DIAGNOSTIC
> -     if (fp->f_count < 2)
> -             panic("closef: count (%ld) < 2", fp->f_count);
> -#endif
> +     KASSERTMSG(fp->f_count >= 2, "count (%ld) < 2", fp->f_count);
> +
> +     mtx_enter(&fhdlk);
>       fp->f_count--;
> +     mtx_leave(&fhdlk);
>  
>       /*
>        * POSIX record locking dictates that any close releases ALL
> @@ -1185,18 +1208,19 @@ fdrop(struct file *fp, struct proc *p)
>  {
>       int error;
>  
> -#ifdef DIAGNOSTIC
> -     if (fp->f_count != 0)
> -             panic("fdrop: count (%ld) != 0", fp->f_count);
> -#endif
> +     MUTEX_ASSERT_LOCKED(&fhdlk);
> +
> +     KASSERTMSG(fp->f_count == 0, "count (%ld) != 0", fp->f_count);
> +
> +     if (fp->f_iflags & FIF_INSERTED)
> +             LIST_REMOVE(fp, f_list);
> +     mtx_leave(&fhdlk);
>  
>       if (fp->f_ops)
>               error = (*fp->f_ops->fo_close)(fp, p);
>       else
>               error = 0;
>  
> -     if (fp->f_iflags & FIF_INSERTED)
> -             LIST_REMOVE(fp, f_list);
>       crfree(fp->f_cred);
>       numfiles--;
>       pool_put(&file_pool, fp);
> Index: kern/kern_ktrace.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_ktrace.c,v
> retrieving revision 1.97
> diff -u -p -r1.97 kern_ktrace.c
> --- kern/kern_ktrace.c        27 May 2018 06:02:14 -0000      1.97
> +++ kern/kern_ktrace.c        18 Jun 2018 09:16:21 -0000
> @@ -225,7 +225,7 @@ ktrgenio(struct proc *p, int fd, enum ui
>       struct ktr_header kth;
>       struct ktr_genio ktp;
>       caddr_t cp;
> -     int count;
> +     int count, error;
>       int buflen;
>  
>       atomic_setbits_int(&p->p_flag, P_INKTR);
> @@ -254,7 +254,10 @@ ktrgenio(struct proc *p, int fd, enum ui
>               if (copyin(iov->iov_base, cp, count))
>                       break;
>  
> -             if (ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count) != 0)
> +             KERNEL_LOCK();
> +             error = ktrwrite2(p, &kth, &ktp, sizeof(ktp), cp, count);
> +             KERNEL_UNLOCK();
> +             if (error != 0)
>                       break;
>  
>               iov->iov_len -= count;
> @@ -294,13 +297,14 @@ ktrstruct(struct proc *p, const char *na
>  {
>       struct ktr_header kth;
>  
> -     KERNEL_ASSERT_LOCKED();
>       atomic_setbits_int(&p->p_flag, P_INKTR);
>       ktrinitheader(&kth, p, KTR_STRUCT);
> -     
> +
>       if (data == NULL)
>               datalen = 0;
> +     KERNEL_LOCK();
>       ktrwrite2(p, &kth, name, strlen(name) + 1, data, datalen);
> +     KERNEL_UNLOCK();
>       atomic_clearbits_int(&p->p_flag, P_INKTR);
>  }
>  
> @@ -386,7 +390,9 @@ ktrpledge(struct proc *p, int error, uin
>       kp.code = code;
>       kp.syscall = syscall;
>  
> +     KERNEL_LOCK();
>       ktrwrite(p, &kth, &kp, sizeof(kp));
> +     KERNEL_UNLOCK();
>       atomic_clearbits_int(&p->p_flag, P_INKTR);
>  }
>  
> @@ -622,6 +628,8 @@ ktrwriteraw(struct proc *curp, struct vn
>       struct iovec aiov[3];
>       struct process *pr;
>       int error;
> +
> +     KERNEL_ASSERT_LOCKED();
>  
>       auio.uio_iov = &aiov[0];
>       auio.uio_offset = 0;
> Index: kern/kern_pledge.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_pledge.c,v
> retrieving revision 1.232
> diff -u -p -r1.232 kern_pledge.c
> --- kern/kern_pledge.c        16 Jun 2018 15:37:00 -0000      1.232
> +++ kern/kern_pledge.c        18 Jun 2018 09:16:21 -0000
> @@ -524,6 +524,7 @@ pledge_fail(struct proc *p, int error, u
>       if (p->p_p->ps_pledge & PLEDGE_ERROR)
>               return (ENOSYS);
>  
> +     KERNEL_LOCK();
>       log(LOG_ERR, "%s[%d]: pledge \"%s\", syscall %d\n",
>           p->p_p->ps_comm, p->p_p->ps_pid, codes, p->p_pledge_syscall);
>       p->p_p->ps_acflag |= APLEDGE;
> @@ -536,6 +537,7 @@ pledge_fail(struct proc *p, int error, u
>       psignal(p, SIGABRT);
>  
>       p->p_p->ps_pledge = 0;          /* Disable all PLEDGE_ flags */
> +     KERNEL_UNLOCK();
>       return (error);
>  }
>  
> Index: kern/kern_sysctl.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
> retrieving revision 1.341
> diff -u -p -r1.341 kern_sysctl.c
> --- kern/kern_sysctl.c        2 Jun 2018 16:38:21 -0000       1.341
> +++ kern/kern_sysctl.c        18 Jun 2018 09:16:21 -0000
> @@ -1073,7 +1073,9 @@ fill_file(struct kinfo_file *kf, struct 
>               kf->f_flag = fp->f_flag;
>               kf->f_iflags = fp->f_iflags;
>               kf->f_type = fp->f_type;
> +             mtx_enter(&fhdlk);
>               kf->f_count = fp->f_count;
> +             mtx_leave(&fhdlk);
>               if (show_pointers)
>                       kf->f_ucred = PTRTOINT64(fp->f_cred);
>               kf->f_uid = fp->f_cred->cr_uid;
> Index: kern/syscalls.master
> ===================================================================
> RCS file: /cvs/src/sys/kern/syscalls.master,v
> retrieving revision 1.181
> diff -u -p -r1.181 syscalls.master
> --- kern/syscalls.master      28 May 2018 09:17:11 -0000      1.181
> +++ kern/syscalls.master      18 Jun 2018 09:16:21 -0000
> @@ -90,7 +90,7 @@
>  #endif
>  27   STD             { ssize_t sys_recvmsg(int s, struct msghdr *msg, \
>                           int flags); }
> -28   STD             { ssize_t sys_sendmsg(int s, \
> +28   STD NOLOCK      { ssize_t sys_sendmsg(int s, \
>                           const struct msghdr *msg, int flags); }
>  29   STD             { ssize_t sys_recvfrom(int s, void *buf, size_t len, \
>                           int flags, struct sockaddr *from, \
> @@ -261,7 +261,7 @@
>  130  OBSOL           oftruncate
>  131  STD             { int sys_flock(int fd, int how); }
>  132  STD             { int sys_mkfifo(const char *path, mode_t mode); }
> -133  STD             { ssize_t sys_sendto(int s, const void *buf, \
> +133  STD NOLOCK      { ssize_t sys_sendto(int s, const void *buf, \
>                           size_t len, int flags, const struct sockaddr *to, \
>                           socklen_t tolen); }
>  134  STD             { int sys_shutdown(int s, int how); }
> Index: kern/uipc_syscalls.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v
> retrieving revision 1.176
> diff -u -p -r1.176 uipc_syscalls.c
> --- kern/uipc_syscalls.c      18 Jun 2018 09:15:05 -0000      1.176
> +++ kern/uipc_syscalls.c      18 Jun 2018 09:16:21 -0000
> @@ -691,13 +691,16 @@ sendit(struct proc *p, int s, struct msg
>       }
>  #endif
>       len = auio.uio_resid;
> -     error = sosend(fp->f_data, to, &auio, NULL, control, flags);
> +     error = sosend(so, to, &auio, NULL, control, flags);
>       if (error) {
>               if (auio.uio_resid != len && (error == ERESTART ||
>                   error == EINTR || error == EWOULDBLOCK))
>                       error = 0;
> -             if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0)
> +             if (error == EPIPE && (flags & MSG_NOSIGNAL) == 0) {
> +                     KERNEL_LOCK();
>                       ptsignal(p, SIGPIPE, STHREAD);
> +                     KERNEL_UNLOCK();
> +             }
>       }
>       if (error == 0) {
>               *retsize = len - auio.uio_resid;
> @@ -1176,7 +1179,8 @@ getsock(struct proc *p, int fdes, struct
>  {
>       struct file *fp;
>  
> -     if ((fp = fd_getfile(p->p_fd, fdes)) == NULL)
> +     fp = fd_getfile(p->p_fd, fdes);
> +     if (fp == NULL)
>               return (EBADF);
>       if (fp->f_type != DTYPE_SOCKET) {
>               FRELE(fp, p);
> Index: kern/uipc_usrreq.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> retrieving revision 1.129
> diff -u -p -r1.129 uipc_usrreq.c
> --- kern/uipc_usrreq.c        11 Jun 2018 08:57:35 -0000      1.129
> +++ kern/uipc_usrreq.c        18 Jun 2018 09:16:22 -0000
> @@ -906,6 +906,7 @@ unp_gc(void *arg __unused)
>                       fp = defer->ud_fp[i].fp;
>                       if (fp == NULL)
>                               continue;
> +                      /* closef() expects a refcount of 2 */
>                       FREF(fp);
>                       if ((unp = fptounp(fp)) != NULL)
>                               unp->unp_msgcount--;
> @@ -922,6 +923,8 @@ unp_gc(void *arg __unused)
>       do {
>               nunref = 0;
>               LIST_FOREACH(unp, &unp_head, unp_link) {
> +                     mtx_enter(&fhdlk);
> +                     fp = unp->unp_file;
>                       if (unp->unp_flags & UNP_GCDEFER) {
>                               /*
>                                * This socket is referenced by another
> @@ -932,8 +935,9 @@ unp_gc(void *arg __unused)
>                               unp_defer--;
>                       } else if (unp->unp_flags & UNP_GCMARK) {
>                               /* marked as live in previous pass */
> +                             mtx_leave(&fhdlk);
>                               continue;
> -                     } else if ((fp = unp->unp_file) == NULL) {
> +                     } else if (fp == NULL) {
>                               /* not being passed, so can't be in loop */
>                       } else if (fp->f_count == 0) {
>                               /*
> @@ -950,9 +954,11 @@ unp_gc(void *arg __unused)
>                               if (fp->f_count == unp->unp_msgcount) {
>                                       nunref++;
>                                       unp->unp_flags |= UNP_GCDEAD;
> +                                     mtx_leave(&fhdlk);
>                                       continue;
>                               }
>                       }
> +                     mtx_leave(&fhdlk);
>  
>                       /*
>                        * This is the first time we've seen this socket on
> Index: sys/file.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/file.h,v
> retrieving revision 1.48
> diff -u -p -r1.48 file.h
> --- sys/file.h        18 Jun 2018 09:15:05 -0000      1.48
> +++ sys/file.h        18 Jun 2018 09:16:22 -0000
> @@ -65,6 +65,7 @@ struct      fileops {
>   *
>   *  Locks used to protect struct members in this file:
>   *   I       immutable after creation
> + *   F       global `fhdlk' mutex
>   *   f       per file `f_mtx'
>   *   k       kernel lock
>   */
> @@ -77,7 +78,7 @@ struct file {
>  #define      DTYPE_PIPE      3       /* pipe */
>  #define      DTYPE_KQUEUE    4       /* event queue */
>       short   f_type;         /* [I] descriptor type */
> -     long    f_count;        /* [k] reference count */
> +     long    f_count;        /* [F] reference count */
>       struct  ucred *f_cred;  /* [I] credentials associated with descriptor */
>       struct  fileops *f_ops; /* [I] file operation pointers */
>       off_t   f_offset;       /* [k] */
> @@ -97,12 +98,25 @@ struct file {
>       do { \
>               extern void vfs_stall_barrier(void); \
>               vfs_stall_barrier(); \
> +             mtx_enter(&fhdlk); \
>               (fp)->f_count++; \
> +             mtx_leave(&fhdlk); \
>       } while (0)
> -#define FRELE(fp,p)  (--(fp)->f_count == 0 ? fdrop(fp, p) : 0)
> +
> +#define FRELE(fp,p) \
> +({ \
> +     int rv = 0; \
> +     mtx_enter(&fhdlk); \
> +     if (--(fp)->f_count == 0) \
> +             rv = fdrop(fp, p); \
> +     else \
> +             mtx_leave(&fhdlk); \
> +     rv; \
> +})
>  
>  int  fdrop(struct file *, struct proc *);
>  
> +extern struct mutex fhdlk;           /* protects `filehead' and f_count */
>  LIST_HEAD(filelist, file);
>  extern int maxfiles;                 /* kernel limit on number of open files 
> */
>  extern int numfiles;                 /* actual number of open files */
> 
> 

Reply via email to