uvm_swap: introduce uvm_swap_data_lock

Theo Buehler Thu, 30 Dec 2021 14:39:49 -0800

The diff below does two things: it adds a uvm_swap_data_lock mutex and
trades it for the KERNEL_LOCK in uvm_swapisfull() and uvm_swap_markbad()


The uvm_swap_data_lock protects all swap data structures, so needs to be
grabbed a few times, many of them already documented in the comments.

For review, I suggest comparing to what NetBSD did and also going
through the consumers (swaplist_insert, swaplist_find, swaplist_trim)
and check that they are properly locked when called, or that there is
the KERNEL_LOCK() in place when swap data structures are manipulated.

In swapmount() I introduced locking since that's needed to be able to
assert that the proper locks are held in swaplist_{insert,find,trim}.

Index: uvm/uvm_swap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
retrieving revision 1.152
diff -u -p -r1.152 uvm_swap.c
--- uvm/uvm_swap.c      12 Dec 2021 09:14:59 -0000      1.152
+++ uvm/uvm_swap.c      30 Dec 2021 15:47:20 -0000
@@ -44,6 +44,7 @@
 #include <sys/fcntl.h>
 #include <sys/extent.h>
 #include <sys/mount.h>
+#include <sys/mutex.h>
 #include <sys/pool.h>
 #include <sys/syscallargs.h>
 #include <sys/swap.h>
@@ -91,6 +92,9 @@
  *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
  *    system call and prevents the swap priority list from changing
  *    while we are in the middle of a system call (e.g. SWAP_STATS).
+ *  - uvm_swap_data_lock (mutex): this lock protects all swap data
+ *    structures including the priority list, the swapdev structures,
+ *    and the swapmap arena.
  *
  * each swap device has the following info:
  *  - swap device in use (could be disabled, preventing future use)
@@ -212,6 +216,7 @@ LIST_HEAD(swap_priority, swappri);
 struct swap_priority swap_priority;
 
 /* locks */
+struct mutex uvm_swap_data_lock = MUTEX_INITIALIZER(IPL_NONE);
 struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");
 
 /*
@@ -442,7 +447,7 @@ uvm_swap_finicrypt_all(void)
 /*
  * swaplist_insert: insert swap device "sdp" into the global list
  *
- * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
+ * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
  * => caller must provide a newly malloc'd swappri structure (we will
  *     FREE it if we don't need it... this it to prevent malloc blocking
  *     here while adding swap)
@@ -452,6 +457,9 @@ swaplist_insert(struct swapdev *sdp, str
 {
        struct swappri *spp, *pspp;
 
+       rw_assert_wrlock(&swap_syscall_lock);
+       MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
+
        /*
         * find entry at or after which to insert the new device.
         */
@@ -493,7 +501,7 @@ swaplist_insert(struct swapdev *sdp, str
  * swaplist_find: find and optionally remove a swap device from the
  *     global list.
  *
- * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
+ * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
  * => we return the swapdev we found (and removed)
  */
 struct swapdev *
@@ -502,6 +510,9 @@ swaplist_find(struct vnode *vp, boolean_
        struct swapdev *sdp;
        struct swappri *spp;
 
+       rw_assert_wrlock(&swap_syscall_lock);
+       MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
+
        /*
         * search the lists for the requested vp
         */
@@ -524,13 +535,16 @@ swaplist_find(struct vnode *vp, boolean_
  * swaplist_trim: scan priority list for empty priority entries and kill
  *     them.
  *
- * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
+ * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
  */
 void
 swaplist_trim(void)
 {
        struct swappri *spp, *nextspp;
 
+       rw_assert_wrlock(&swap_syscall_lock);
+       MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
+
        LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
                if (!TAILQ_EMPTY(&spp->spi_swapdev))
                        continue;
@@ -543,7 +557,7 @@ swaplist_trim(void)
  * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
  *
  * => caller must hold swap_syscall_lock
- * => uvm.swap_data_lock should be unlocked (we may sleep)
+ * => uvm_swap_data_lock should be unlocked (we may sleep)
  */
 void
 swapdrum_add(struct swapdev *sdp, int npages)
@@ -563,7 +577,7 @@ swapdrum_add(struct swapdev *sdp, int np
  *     to the "swapdev" that maps that section of the drum.
  *
  * => each swapdev takes one big contig chunk of the drum
- * => caller must hold uvm.swap_data_lock
+ * => caller must hold uvm_swap_data_lock
  */
 struct swapdev *
 swapdrum_getsdp(int pgno)
@@ -571,6 +585,8 @@ swapdrum_getsdp(int pgno)
        struct swapdev *sdp;
        struct swappri *spp;
 
+       MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
+
        LIST_FOREACH(spp, &swap_priority, spi_swappri) {
                TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
                        if (pgno >= sdp->swd_drumoffset &&
@@ -629,7 +645,7 @@ sys_swapctl(struct proc *p, void *v, reg
         *
         * note that the swap_priority list can't change as long
         * as we are holding the swap_syscall_lock.  we don't want
-        * to grab the uvm.swap_data_lock because we may fault&sleep during
+        * to grab the uvm_swap_data_lock because we may fault&sleep during
         * copyout() and we don't want to be holding that lock then!
         */
        if (SCARG(uap, cmd) == SWAP_STATS) {
@@ -701,12 +717,14 @@ sys_swapctl(struct proc *p, void *v, reg
                 */
                priority = SCARG(uap, misc);
                spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
+               mtx_enter(&uvm_swap_data_lock);
                if ((sdp = swaplist_find(vp, 1)) == NULL) {
                        error = ENOENT;
                } else {
                        swaplist_insert(sdp, spp, priority);
                        swaplist_trim();
                }
+               mtx_leave(&uvm_swap_data_lock);
                if (error)
                        free(spp, M_VMSWAP, sizeof(*spp));
                break;
@@ -727,11 +745,8 @@ sys_swapctl(struct proc *p, void *v, reg
                 * trying to enable this device while we are working on
                 * it.
                 */
+
                priority = SCARG(uap, misc);
-               if ((sdp = swaplist_find(vp, 0)) != NULL) {
-                       error = EBUSY;
-                       break;
-               }
                sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK|M_ZERO);
                spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
                sdp->swd_flags = SWF_FAKE;      /* placeholder only */
@@ -745,7 +760,19 @@ sys_swapctl(struct proc *p, void *v, reg
                        sdp->swd_cred = crdup(p->p_ucred);
                }
 
+               mtx_enter(&uvm_swap_data_lock);
+               if (swaplist_find(vp, 0) != NULL) {
+                       error = EBUSY;
+                       mtx_leave(&uvm_swap_data_lock);
+                       if (vp->v_type == VREG) {
+                               crfree(sdp->swd_cred);
+                       }
+                       free(sdp, M_VMSWAP, sizeof *sdp);
+                       free(spp, M_VMSWAP, sizeof *spp);
+                       break;
+               }
                swaplist_insert(sdp, spp, priority);
+               mtx_leave(&uvm_swap_data_lock);
 
                sdp->swd_pathlen = len;
                sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
@@ -759,8 +786,10 @@ sys_swapctl(struct proc *p, void *v, reg
                 */
 
                if ((error = swap_on(p, sdp)) != 0) {
+                       mtx_enter(&uvm_swap_data_lock);
                        (void) swaplist_find(vp, 1);  /* kill fake entry */
                        swaplist_trim();
+                       mtx_leave(&uvm_swap_data_lock);
                        if (vp->v_type == VREG) {
                                crfree(sdp->swd_cred);
                        }
@@ -770,7 +799,9 @@ sys_swapctl(struct proc *p, void *v, reg
                }
                break;
        case SWAP_OFF:
+               mtx_enter(&uvm_swap_data_lock);
                if ((sdp = swaplist_find(vp, 0)) == NULL) {
+                       mtx_leave(&uvm_swap_data_lock);
                        error = ENXIO;
                        break;
                }
@@ -780,6 +811,7 @@ sys_swapctl(struct proc *p, void *v, reg
                 * can't stop swapping from it (again).
                 */
                if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
+                       mtx_leave(&uvm_swap_data_lock);
                        error = EBUSY;
                        break;
                }
@@ -808,7 +840,7 @@ out:
  *     SWF_FAKE).
  *
  * => we avoid the start of the disk (to protect disk labels)
- * => caller should leave uvm.swap_data_lock unlocked, we may lock it
+ * => caller should leave uvm_swap_data_lock unlocked, we may lock it
  *     if needed.
  */
 int
@@ -969,13 +1001,18 @@ swap_on(struct proc *p, struct swapdev *
        /* now add the new swapdev to the drum and enable. */
        swapdrum_add(sdp, npages);
        sdp->swd_npages = size;
+       mtx_enter(&uvm_swap_data_lock);
        sdp->swd_flags &= ~SWF_FAKE;    /* going live */
        sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
        uvmexp.swpages += size;
+       mtx_leave(&uvm_swap_data_lock);
        return (0);
 
+       /*
+        * failure: clean up and return error.
+        */
+
 bad:
-       /* failure: close device if necessary and return error. */
        if (vp != rootvp)
                (void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
        return (error);
@@ -989,10 +1026,15 @@ bad:
 int
 swap_off(struct proc *p, struct swapdev *sdp)
 {
+       int npages = sdp->swd_npages;
        int error = 0;
 
+       rw_assert_wrlock(&swap_syscall_lock);
+       MUTEX_ASSERT_LOCKED(&uvm_swap_data_lock);
+
        /* disable the swap area being removed */
        sdp->swd_flags &= ~SWF_ENABLE;
+       mtx_leave(&uvm_swap_data_lock);
 
        /*
         * the idea is to find all the pages that are paged out to this
@@ -1005,15 +1047,16 @@ swap_off(struct proc *p, struct swapdev 
                         sdp->swd_drumoffset + sdp->swd_drumsize) ||
            amap_swap_off(sdp->swd_drumoffset,
                          sdp->swd_drumoffset + sdp->swd_drumsize)) {
-
                error = ENOMEM;
        } else if (sdp->swd_npginuse > sdp->swd_npgbad) {
                error = EBUSY;
        }
 
        if (error) {
+               mtx_enter(&uvm_swap_data_lock);
                sdp->swd_flags |= SWF_ENABLE;
-               return (error);
+               mtx_leave(&uvm_swap_data_lock);
+               return error;
        }
 
        /*
@@ -1029,11 +1072,13 @@ swap_off(struct proc *p, struct swapdev 
                (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
        }
 
-       uvmexp.swpages -= sdp->swd_npages;
+       mtx_enter(&uvm_swap_data_lock);
+       uvmexp.swpages -= npages;
 
        if (swaplist_find(sdp->swd_vp, 1) == NULL)
                panic("swap_off: swapdev not in list");
        swaplist_trim();
+       mtx_leave(&uvm_swap_data_lock);
 
        /*
         * free all resources!
@@ -1067,7 +1112,9 @@ swstrategy(struct buf *bp)
         * in it (i.e. the blocks we are doing I/O on).
         */
        pageno = dbtob((u_int64_t)bp->b_blkno) >> PAGE_SHIFT;
+       mtx_enter(&uvm_swap_data_lock);
        sdp = swapdrum_getsdp(pageno);
+       mtx_leave(&uvm_swap_data_lock);
        if (sdp == NULL) {
                bp->b_error = EINVAL;
                bp->b_flags |= B_ERROR;
@@ -1384,7 +1431,7 @@ sw_reg_iodone_internal(void *xvbp)
  *     allocate in a priority we "rotate" the tail queue.
  * => space can be freed with uvm_swap_free
  * => we return the page slot number in /dev/drum (0 == invalid slot)
- * => we lock uvm.swap_data_lock
+ * => we lock uvm_swap_data_lock
  * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
  */
 int
@@ -1404,6 +1451,8 @@ uvm_swap_alloc(int *nslots, boolean_t le
         * lock data lock, convert slots into blocks, and enter loop
         */
        KERNEL_ASSERT_LOCKED();
+       mtx_enter(&uvm_swap_data_lock);
+
 ReTry: /* XXXMRG */
        LIST_FOREACH(spp, &swap_priority, spi_swappri) {
                TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
@@ -1425,6 +1474,7 @@ ReTry:    /* XXXMRG */
                        TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
                        sdp->swd_npginuse += *nslots;
                        uvmexp.swpginuse += *nslots;
+                       mtx_leave(&uvm_swap_data_lock);
                        /* done!  return drum slot number */
                        return result + sdp->swd_drumoffset;
                }
@@ -1437,6 +1487,7 @@ ReTry:    /* XXXMRG */
        }
        /* XXXMRG: END HACK */
 
+       mtx_leave(&uvm_swap_data_lock);
        return 0;               /* failed */
 }
 
@@ -1449,10 +1500,10 @@ uvm_swapisfull(void)
 {
        int result;
 
-       KERNEL_LOCK();
+       mtx_enter(&uvm_swap_data_lock);
        KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
        result = (uvmexp.swpgonly == uvmexp.swpages);
-       KERNEL_UNLOCK();
+       mtx_leave(&uvm_swap_data_lock);
 
        return result;
 }
@@ -1460,14 +1511,14 @@ uvm_swapisfull(void)
 /*
  * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
  *
- * => we lock uvm.swap_data_lock
+ * => we lock uvm_swap_data_lock
  */
 void
 uvm_swap_markbad(int startslot, int nslots)
 {
        struct swapdev *sdp;
 
-       KERNEL_LOCK();
+       mtx_enter(&uvm_swap_data_lock);
        sdp = swapdrum_getsdp(startslot);
        if (sdp != NULL) {
                /*
@@ -1478,14 +1529,14 @@ uvm_swap_markbad(int startslot, int nslo
                 */
                sdp->swd_npgbad += nslots;
        }
-       KERNEL_UNLOCK();
+       mtx_leave(&uvm_swap_data_lock);
 }
 
 /*
  * uvm_swap_free: free swap slots
  *
  * => this can be all or part of an allocation made by uvm_swap_alloc
- * => we lock uvm.swap_data_lock
+ * => we lock uvm_swap_data_lock
  */
 void
 uvm_swap_free(int startslot, int nslots)
@@ -1506,6 +1557,7 @@ uvm_swap_free(int startslot, int nslots)
         * lookup and access the extent.
         */
        KERNEL_LOCK();
+       mtx_enter(&uvm_swap_data_lock);
        sdp = swapdrum_getsdp(startslot);
        KASSERT(uvmexp.nswapdev >= 1);
        KASSERT(sdp != NULL);
@@ -1518,6 +1570,8 @@ uvm_swap_free(int startslot, int nslots)
 
        sdp->swd_npginuse -= nslots;
        uvmexp.swpginuse -= nslots;
+       mtx_leave(&uvm_swap_data_lock);
+
 #ifdef UVM_SWAP_ENCRYPT
        {
                int i;
@@ -1647,7 +1701,9 @@ uvm_swap_io(struct vm_page **pps, int st
                 * XXX - does this information stay the same over the whole
                 * execution of this function?
                 */
+               mtx_enter(&uvm_swap_data_lock);
                sdp = swapdrum_getsdp(startslot);
+               mtx_leave(&uvm_swap_data_lock);
        }
 
        /*
@@ -1892,13 +1948,11 @@ swapmount(void)
        char *nam;
        char path[MNAMELEN + 1];
 
-       /*
-        * No locking here since we happen to know that we will just be called
-        * once before any other process has forked.
-        */
        if (swap_dev == NODEV)
                return;
 
+       rw_enter_write(&swap_syscall_lock);
+
 #if defined(NFSCLIENT)
        if (swap_dev == NETDEV) {
                extern struct nfs_diskless nfs_diskless;
@@ -1935,16 +1989,22 @@ gotit:
 
        sdp->swd_vp = vp;
 
+       mtx_enter(&uvm_swap_data_lock);
        swaplist_insert(sdp, spp, 0);
+       mtx_leave(&uvm_swap_data_lock);
 
        if (swap_on(curproc, sdp)) {
+               mtx_enter(&uvm_swap_data_lock);
                swaplist_find(vp, 1);
                swaplist_trim();
                vput(sdp->swd_vp);
+               mtx_leave(&uvm_swap_data_lock);
+               rw_exit_write(&swap_syscall_lock);
                free(sdp->swd_path, M_VMSWAP, sdp->swd_pathlen);
                free(sdp, M_VMSWAP, sizeof(*sdp));
                return;
        }
+       rw_exit_write(&swap_syscall_lock);
 }
 
 #ifdef HIBERNATE

uvm_swap: introduce uvm_swap_data_lock

Reply via email to