The branch main has been updated by kib:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=ef6ea91593ebff73e2fc201efd9f848b71c5a125

commit ef6ea91593ebff73e2fc201efd9f848b71c5a125
Author:     Konstantin Belousov <k...@freebsd.org>
AuthorDate: 2025-06-02 07:05:06 +0000
Commit:     Konstantin Belousov <k...@freebsd.org>
CommitDate: 2025-07-04 15:23:42 +0000

    VOP_RENAME: add mp-global lock
    
    The lock is acquired before all vnode locks, but after vn_start_write().
    
    The lock prevents parallel rename operations on the same mount point,
    which should (in the near future) simplify a lot of code in VFS/fs that
    otherwise needs to cope with either the changing hierarchy, or with the
    lock order for vnodes due to the changed hierarchy.
    
    On renames, the lock is taken on the lowest stacked filesystem.
    Otherwise renames could still occur in parallel, by performing one of
    the operations on the lower fs.
    
    Proposed by:    mjg (long time ago)
    Reviewed by:    markj, olce
    Tested by:      pho
    Sponsored by:   The FreeBSD Foundation
    MFC after:      1 week
    Differential revision:  https://reviews.freebsd.org/D50648
---
 sys/kern/vfs_mount.c    |  2 ++
 sys/kern/vfs_subr.c     |  7 +++++++
 sys/kern/vfs_syscalls.c | 20 +++++++++++++++++++-
 sys/sys/mount.h         |  1 +
 4 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index cb18468d28bc..8e64a7fe966b 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -156,6 +156,7 @@ mount_init(void *mem, int size, int flags)
        mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
        mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF);
        lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
+       lockinit(&mp->mnt_renamelock, PVFS, "rename", 0, 0);
        mp->mnt_pcpu = uma_zalloc_pcpu(pcpu_zone_16, M_WAITOK | M_ZERO);
        mp->mnt_ref = 0;
        mp->mnt_vfs_ops = 1;
@@ -170,6 +171,7 @@ mount_fini(void *mem, int size)
 
        mp = (struct mount *)mem;
        uma_zfree_pcpu(pcpu_zone_16, mp->mnt_pcpu);
+       lockdestroy(&mp->mnt_renamelock);
        lockdestroy(&mp->mnt_explock);
        mtx_destroy(&mp->mnt_listmtx);
        mtx_destroy(&mp->mnt_mtx);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 877931721da4..918b256e6c59 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -5853,6 +5853,8 @@ vop_rename_pre(void *ap)
        struct vop_rename_args *a = ap;
 
 #ifdef DEBUG_VFS_LOCKS
+       struct mount *tmp;
+
        if (a->a_tvp)
                ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
        ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
@@ -5870,6 +5872,11 @@ vop_rename_pre(void *ap)
        if (a->a_tvp)
                ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
        ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
+
+       tmp = NULL;
+       VOP_GETWRITEMOUNT(a->a_tdvp, &tmp);
+       lockmgr_assert(&tmp->mnt_renamelock, KA_XLOCKED);
+       vfs_rel(tmp);
 #endif
        /*
         * It may be tempting to add vn_seqc_write_begin/end calls here and
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index c236f241bf20..d880733cbfe7 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -3766,7 +3766,7 @@ int
 kern_renameat(struct thread *td, int oldfd, const char *old, int newfd,
     const char *new, enum uio_seg pathseg)
 {
-       struct mount *mp = NULL;
+       struct mount *mp, *tmp;
        struct vnode *tvp, *fvp, *tdvp;
        struct nameidata fromnd, tond;
        uint64_t tondflags;
@@ -3774,6 +3774,7 @@ kern_renameat(struct thread *td, int oldfd, const char *old, int newfd,
        short irflag;
 
 again:
+       tmp = mp = NULL;
        bwillwrite();
 #ifdef MAC
        if (mac_vnode_check_rename_from_enabled()) {
@@ -3809,6 +3810,7 @@ again:
        tvp = tond.ni_vp;
        error = vn_start_write(fvp, &mp, V_NOWAIT);
        if (error != 0) {
+again1:
                NDFREE_PNBUF(&fromnd);
                NDFREE_PNBUF(&tond);
                if (tvp != NULL)
@@ -3819,11 +3821,25 @@ again:
                        vput(tdvp);
                vrele(fromnd.ni_dvp);
                vrele(fvp);
+               if (tmp != NULL) {
+                       lockmgr(&tmp->mnt_renamelock, LK_EXCLUSIVE, NULL);
+                       lockmgr(&tmp->mnt_renamelock, LK_RELEASE, NULL);
+                       vfs_rel(tmp);
+                       tmp = NULL;
+               }
                error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH);
                if (error != 0)
                        return (error);
                goto again;
        }
+       error = VOP_GETWRITEMOUNT(tdvp, &tmp);
+       if (error != 0 || tmp == NULL)
+               goto again1;
+       error = lockmgr(&tmp->mnt_renamelock, LK_EXCLUSIVE | LK_NOWAIT, NULL);
+       if (error != 0) {
+               vn_finished_write(mp);
+               goto again1;
+       }
        irflag = vn_irflag_read(fvp);
        if (((irflag & VIRF_NAMEDATTR) != 0 && tdvp != fromnd.ni_dvp) ||
            (irflag & VIRF_NAMEDDIR) != 0) {
@@ -3884,6 +3900,8 @@ out:
                vrele(fromnd.ni_dvp);
                vrele(fvp);
        }
+       lockmgr(&tmp->mnt_renamelock, LK_RELEASE, 0);
+       vfs_rel(tmp);
        vn_finished_write(mp);
 out1:
        if (error == ERESTART)
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index a6f858e02395..f6480b173a5c 100644
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@@ -267,6 +267,7 @@ struct mount {
        int             mnt_lazyvnodelistsize;  /* (l) # of lazy vnodes */
        int             mnt_upper_pending;      /* (i) # of pending ops on mnt_uppers */
        struct lock     mnt_explock;            /* vfs_export walkers lock */
+       struct lock     mnt_renamelock;         /* renames and O_RESOLVE_BENEATH */
        TAILQ_HEAD(, mount_upper_node) mnt_uppers; /* (i) upper mounts over us */
        TAILQ_HEAD(, mount_upper_node) mnt_notify; /* (i) upper mounts for notification */
        STAILQ_ENTRY(mount) mnt_taskqueue_link; /* (d) our place in deferred unmount list */

Reply via email to