Author: chs
Date: Fri Mar  6 18:41:37 2020
New Revision: 358714
URL: https://svnweb.freebsd.org/changeset/base/358714

Log:
  Add a new "mntfs" pseudo file system which provides private device vnodes for
  file systems to safely access their disk devices, and adapt FFS to use it.
  Also add a new BO_NOBUFS flag to allow enforcing that file systems using
  mntfs vnodes do not accidentally use the original devfs vnode to create
  buffers.
  
  Reviewed by:  kib, mckusick
  Approved by:  imp (mentor)
  Sponsored by: Netflix
  Differential Revision:        https://reviews.freebsd.org/D23787

Added:
  head/sys/fs/mntfs/
  head/sys/fs/mntfs/mntfs_vnops.c   (contents, props changed)
Modified:
  head/sys/conf/files
  head/sys/kern/vfs_subr.c
  head/sys/sys/bufobj.h
  head/sys/sys/mount.h
  head/sys/ufs/ffs/ffs_alloc.c
  head/sys/ufs/ffs/ffs_vfsops.c
  head/sys/ufs/ufs/ufsmount.h

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files Fri Mar  6 17:24:51 2020        (r358713)
+++ head/sys/conf/files Fri Mar  6 18:41:37 2020        (r358714)
@@ -3479,6 +3479,7 @@ fs/fuse/fuse_main.c               optional fusefs
 fs/fuse/fuse_node.c            optional fusefs
 fs/fuse/fuse_vfsops.c          optional fusefs
 fs/fuse/fuse_vnops.c           optional fusefs
+fs/mntfs/mntfs_vnops.c         standard
 fs/msdosfs/msdosfs_conv.c      optional msdosfs
 fs/msdosfs/msdosfs_denode.c    optional msdosfs
 fs/msdosfs/msdosfs_fat.c       optional msdosfs

Added: head/sys/fs/mntfs/mntfs_vnops.c
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/sys/fs/mntfs/mntfs_vnops.c     Fri Mar  6 18:41:37 2020        (r358714)
@@ -0,0 +1,95 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Netflix, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+
+/*
+ * The "mntfs" VCHR vnodes implemented here provide a safe way for file systems
+ * to access their disk devices.  Using the normal devfs vnode has the problem
+ * that if the device disappears, the devfs vnode is vgone'd as part of
+ * removing it from the application-visible namespace, and some file systems
+ * (notably FFS with softdep) get very unhappy if their dirty buffers are
+ * invalidated out from under them.  By using a separate, private vnode,
+ * file systems are able to clean up their buffer state in a controlled fashion
+ * when the underlying device disappears.
+ */
+
+static int     /* vop_reclaim handler for mntfs vnodes; always returns 0 */
+mntfs_reclaim(struct vop_reclaim_args *ap)
+{
+       struct vnode *vp = ap->a_vp;    /* vnode passed to the reclaim op */
+
+       dev_rel(vp->v_rdev);    /* pairs with dev_ref() in mntfs_allocvp() */
+       return (0);
+}
+
+struct vop_vector mntfs_vnodeops = {    /* vnode ops for mntfs VCHR vnodes */
+       .vop_default =          &default_vnodeops,
+
+       .vop_fsync =            vop_stdfsync,
+       .vop_strategy =         VOP_PANIC,      /* FFS bypasses VOP_STRATEGY */
+       .vop_reclaim =          mntfs_reclaim,  /* drops the cdev reference */
+};
+VFS_VOP_VECTOR_REGISTER(mntfs_vnodeops);
+
+/*
+ * Allocate a private VCHR vnode for use by a mounted fs.
+ * The underlying device will be the same as for the given vnode.
+ * This mntfs vnode must be freed with mntfs_freevp() rather than just
+ * releasing the reference.
+ */
+struct vnode *
+mntfs_allocvp(struct mount *mp, struct vnode *ovp)
+{
+       struct vnode *vp;
+       struct cdev *dev;
+
+       ASSERT_VOP_ELOCKED(ovp, __func__);      /* ovp must be excl. locked */
+
+       dev = ovp->v_rdev;      /* new vnode shares ovp's device */
+
+       /* NOTE(review): getnewvnode() result is unchecked - confirm it cannot fail */
+       getnewvnode("mntfs", mp, &mntfs_vnodeops, &vp);
+       vp->v_type = VCHR;
+       vp->v_data = NULL;
+       dev_ref(dev);           /* released by dev_rel() in mntfs_reclaim() */
+       vp->v_rdev = dev;
+
+       return (vp);
+}
+
+void
+mntfs_freevp(struct vnode *vp)
+{
+
+       vgone(vp);      /* presumably invokes mntfs_reclaim() - TODO confirm */
+       vrele(vp);      /* then drop the reference held on the vnode */
+}

Modified: head/sys/kern/vfs_subr.c
==============================================================================
--- head/sys/kern/vfs_subr.c    Fri Mar  6 17:24:51 2020        (r358713)
+++ head/sys/kern/vfs_subr.c    Fri Mar  6 18:41:37 2020        (r358714)
@@ -2289,6 +2289,8 @@ buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xfl
        int error;
 
        ASSERT_BO_WLOCKED(bo);
+       KASSERT((bo->bo_flag & BO_NOBUFS) == 0,
+           ("buf_vlist_add: bo %p does not allow bufs", bo));
        KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
            ("dead bo %p", bo));
        KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,

Modified: head/sys/sys/bufobj.h
==============================================================================
--- head/sys/sys/bufobj.h       Fri Mar  6 17:24:51 2020        (r358713)
+++ head/sys/sys/bufobj.h       Fri Mar  6 18:41:37 2020        (r358714)
@@ -117,6 +117,7 @@ struct bufobj {
 #define        BO_ONWORKLST    (1 << 0)        /* On syncer work-list */
 #define        BO_WWAIT        (1 << 1)        /* Wait for output to complete */
 #define        BO_DEAD         (1 << 2)        /* Dead; only with INVARIANTS */
+#define        BO_NOBUFS       (1 << 3)        /* No bufs allowed */
 
 #define        BO_LOCKPTR(bo)          (&(bo)->bo_lock)
 #define        BO_LOCK(bo)             rw_wlock(BO_LOCKPTR((bo)))

Modified: head/sys/sys/mount.h
==============================================================================
--- head/sys/sys/mount.h        Fri Mar  6 17:24:51 2020        (r358713)
+++ head/sys/sys/mount.h        Fri Mar  6 18:41:37 2020        (r358714)
@@ -940,6 +940,8 @@ extern      struct sx vfsconf_sx;
 #define        vfsconf_unlock()        sx_xunlock(&vfsconf_sx)
 #define        vfsconf_slock()         sx_slock(&vfsconf_sx)
 #define        vfsconf_sunlock()       sx_sunlock(&vfsconf_sx)
+struct vnode *mntfs_allocvp(struct mount *, struct vnode *);
+void   mntfs_freevp(struct vnode *);
 
 /*
  * Declarations for these vfs default operations are located in

Modified: head/sys/ufs/ffs/ffs_alloc.c
==============================================================================
--- head/sys/ufs/ffs/ffs_alloc.c        Fri Mar  6 17:24:51 2020        (r358713)
+++ head/sys/ufs/ffs/ffs_alloc.c        Fri Mar  6 18:41:37 2020        (r358714)
@@ -3594,6 +3594,7 @@ buffered_write(fp, uio, active_cred, flags, td)
        struct inode *ip;
        struct buf *bp;
        struct fs *fs;
+       struct ufsmount *ump;
        struct filedesc *fdp;
        int error;
        daddr_t lbn;
@@ -3622,10 +3623,12 @@ buffered_write(fp, uio, active_cred, flags, td)
                return (EINVAL);
        }
        ip = VTOI(vp);
-       if (ITODEVVP(ip) != devvp) {
+       ump = ip->i_ump;
+       if (ump->um_odevvp != devvp) {
                vput(vp);
                return (EINVAL);
        }
+       devvp = ump->um_devvp;
        fs = ITOFS(ip);
        vput(vp);
        foffset_lock_uio(fp, uio, flags);

Modified: head/sys/ufs/ffs/ffs_vfsops.c
==============================================================================
--- head/sys/ufs/ffs/ffs_vfsops.c       Fri Mar  6 17:24:51 2020        (r358713)
+++ head/sys/ufs/ffs/ffs_vfsops.c       Fri Mar  6 18:41:37 2020        (r358714)
@@ -151,7 +151,7 @@ static const char *ffs_opts[] = { "acls", "async", "no
 static int
 ffs_mount(struct mount *mp)
 {
-       struct vnode *devvp;
+       struct vnode *devvp, *odevvp;
        struct thread *td;
        struct ufsmount *ump = NULL;
        struct fs *fs;
@@ -246,6 +246,7 @@ ffs_mount(struct mount *mp)
        if (mp->mnt_flag & MNT_UPDATE) {
                ump = VFSTOUFS(mp);
                fs = ump->um_fs;
+               odevvp = ump->um_odevvp;
                devvp = ump->um_devvp;
                if (fsckpid == -1 && ump->um_fsckpid > 0) {
                        if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 ||
@@ -337,16 +338,15 @@ ffs_mount(struct mount *mp)
                         * If upgrade to read-write by non-root, then verify
                         * that user has necessary permissions on the device.
                         */
-                       vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
-                       error = VOP_ACCESS(devvp, VREAD | VWRITE,
+                       vn_lock(odevvp, LK_EXCLUSIVE | LK_RETRY);
+                       error = VOP_ACCESS(odevvp, VREAD | VWRITE,
                            td->td_ucred, td);
                        if (error)
                                error = priv_check(td, PRIV_VFS_MOUNT_PERM);
+                       VOP_UNLOCK(odevvp);
                        if (error) {
-                               VOP_UNLOCK(devvp);
                                return (error);
                        }
-                       VOP_UNLOCK(devvp);
                        fs->fs_flags &= ~FS_UNCLEAN;
                        if (fs->fs_clean == 0) {
                                fs->fs_flags |= FS_UNCLEAN;
@@ -782,8 +782,8 @@ loop:
  * Common code for mount and mountroot
  */
 static int
-ffs_mountfs(devvp, mp, td)
-       struct vnode *devvp;
+ffs_mountfs(odevvp, mp, td)
+       struct vnode *odevvp;
        struct mount *mp;
        struct thread *td;
 {
@@ -794,6 +794,7 @@ ffs_mountfs(devvp, mp, td)
        struct ucred *cred;
        struct g_consumer *cp;
        struct mount *nmp;
+       struct vnode *devvp;
        int candelete, canspeedup;
        off_t loc;
 
@@ -802,11 +803,13 @@ ffs_mountfs(devvp, mp, td)
        cred = td ? td->td_ucred : NOCRED;
        ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 
+       devvp = mntfs_allocvp(mp, odevvp);
+       VOP_UNLOCK(odevvp);
        KASSERT(devvp->v_type == VCHR, ("reclaimed devvp"));
        dev = devvp->v_rdev;
        if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0,
            (uintptr_t)mp) == 0) {
-               VOP_UNLOCK(devvp);
+               mntfs_freevp(devvp);
                return (EBUSY);
        }
        g_topology_lock();
@@ -814,12 +817,14 @@ ffs_mountfs(devvp, mp, td)
        g_topology_unlock();
        if (error != 0) {
                atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
-               VOP_UNLOCK(devvp);
+               mntfs_freevp(devvp);
                return (error);
        }
        dev_ref(dev);
        devvp->v_bufobj.bo_ops = &ffs_ops;
-       VOP_UNLOCK(devvp);
+       BO_LOCK(&odevvp->v_bufobj);
+       odevvp->v_bufobj.bo_flag |= BO_NOBUFS;
+       BO_UNLOCK(&odevvp->v_bufobj);
        if (dev->si_iosize_max != 0)
                mp->mnt_iosize_max = dev->si_iosize_max;
        if (mp->mnt_iosize_max > MAXPHYS)
@@ -1020,6 +1025,7 @@ ffs_mountfs(devvp, mp, td)
        ump->um_mountp = mp;
        ump->um_dev = dev;
        ump->um_devvp = devvp;
+       ump->um_odevvp = odevvp;
        ump->um_nindir = fs->fs_nindir;
        ump->um_bptrtodb = fs->fs_fsbtodb;
        ump->um_seqinc = fs->fs_frag;
@@ -1099,7 +1105,11 @@ out:
                free(ump, M_UFSMNT);
                mp->mnt_data = NULL;
        }
+       BO_LOCK(&odevvp->v_bufobj);
+       odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS;
+       BO_UNLOCK(&odevvp->v_bufobj);
        atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
+       mntfs_freevp(devvp);
        dev_rel(dev);
        return (error);
 }
@@ -1304,8 +1314,12 @@ ffs_unmount(mp, mntflags)
        }
        g_vfs_close(ump->um_cp);
        g_topology_unlock();
+       BO_LOCK(&ump->um_odevvp->v_bufobj);
+       ump->um_odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS;
+       BO_UNLOCK(&ump->um_odevvp->v_bufobj);
        atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0);
-       vrele(ump->um_devvp);
+       mntfs_freevp(ump->um_devvp);
+       vrele(ump->um_odevvp);
        dev_rel(ump->um_dev);
        mtx_destroy(UFS_MTX(ump));
        if (mp->mnt_gjprovider != NULL) {
@@ -2293,7 +2307,19 @@ ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
        struct buf *tbp;
        int error, nocopy;
 
+       /*
+        * This is the bufobj strategy for the private VCHR vnodes
+        * used by FFS to access the underlying storage device.
+        * We override the default bufobj strategy and thus bypass
+        * VOP_STRATEGY() for these vnodes.
+        */
        vp = bo2vnode(bo);
+       KASSERT(bp->b_vp == NULL || bp->b_vp->v_type != VCHR ||
+           bp->b_vp->v_rdev == NULL ||
+           bp->b_vp->v_rdev->si_mountpt == NULL ||
+           VFSTOUFS(bp->b_vp->v_rdev->si_mountpt) == NULL ||
+           vp == VFSTOUFS(bp->b_vp->v_rdev->si_mountpt)->um_devvp,
+           ("ffs_geom_strategy() with wrong vp"));
        if (bp->b_iocmd == BIO_WRITE) {
                if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
                    bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&

Modified: head/sys/ufs/ufs/ufsmount.h
==============================================================================
--- head/sys/ufs/ufs/ufsmount.h Fri Mar  6 17:24:51 2020        (r358713)
+++ head/sys/ufs/ufs/ufsmount.h Fri Mar  6 18:41:37 2020        (r358714)
@@ -83,7 +83,8 @@ struct ufsmount {
        struct  cdev *um_dev;                   /* (r) device mounted */
        struct  g_consumer *um_cp;              /* (r) GEOM access point */
        struct  bufobj *um_bo;                  /* (r) Buffer cache object */
-       struct  vnode *um_devvp;                /* (r) blk dev mounted vnode */
+       struct  vnode *um_odevvp;               /* (r) devfs dev vnode */
+       struct  vnode *um_devvp;                /* (r) mntfs private vnode */
        u_long  um_fstype;                      /* (c) type of filesystem */
        struct  fs *um_fs;                      /* (r) pointer to superblock */
        struct  ufs_extattr_per_mount um_extattr; /* (c) extended attrs */
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to