Hello

I took more time on fss.c. I still have some trouble to grasp the 
locking protocol in this file, but I have a proposal to address the
most offending issue: snapshot creation are serialized and the driver
locks fss_device_lock while they occur. On a big filesystem, 
the operation can takes dozens of seconds while fssconfig(1) 
commands get stuck in the kernel awaiting for fss_device_lock.

In the attached patch, I added a thread dedicated to snapshot
creation. This threads takes snapshot creation requests and 
execute them one by one. The caller does not have to wait on
fss_device_lock. 

While there, I added a few mutex_enter/mutex_exit for sc->sc_slock
where it looked inconsistent with other usages. I tried to avoid
holding the lock on long operation. One question pending: it is
okay to hold the lock during a copyin? 

Opinions welcome.

-- 
Emmanuel Dreyfus
m...@netbsd.org
Index: fss.c
===================================================================
RCS file: /cvsroot/src/sys/dev/fss.c,v
retrieving revision 1.114
diff -U4 -r1.114 fss.c
--- fss.c       22 Mar 2023 21:14:46 -0000      1.114
+++ fss.c       9 Aug 2024 16:41:30 -0000
@@ -54,8 +54,9 @@
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/uio.h>
 #include <sys/conf.h>
+#include <sys/queue.h>
 #include <sys/kthread.h>
 #include <sys/fstrans.h>
 #include <sys/vfs_syscalls.h>          /* For do_sys_unlink(). */
 
@@ -90,12 +91,23 @@
 static void fss_bs_thread(void *);
 static int fss_bs_io(struct fss_softc *, fss_io_type,
     u_int32_t, off_t, int, void *, size_t *);
 static u_int32_t *fss_bs_indir(struct fss_softc *, u_int32_t);
+static void fss_create_enqueue(struct fss_softc *, struct fss_set *);
+static void fss_create_thread(void *);
+
+SIMPLEQ_HEAD(fss_create_queue, fss_create_entry) fss_create_queue;
+struct fss_create_entry {
+       struct fss_softc *fce_sc;
+       struct fss_set fce_fss;
+       char fce_mount[MNAMELEN];
+       char fce_bstore[MNAMELEN];
+       SIMPLEQ_ENTRY(fss_create_entry) fce_link;
+};
 
 static kmutex_t fss_device_lock;       /* Protect all units. */
 static kcondvar_t fss_device_cv;       /* Serialize snapshot creation. */
-static bool fss_creating = false;      /* Currently creating a snapshot. */
+lwp_t *fss_create_lwp = NULL;          /* create thread lwp */
 static int fss_num_attached = 0;       /* Number of attached devices. */
 static struct vfs_hooks fss_vfs_hooks = {
        .vh_unmount = fss_unmount_hook
 };
@@ -135,13 +147,38 @@
 
 void
 fssattach(int num)
 {
+       int error; 
 
        mutex_init(&fss_device_lock, MUTEX_DEFAULT, IPL_NONE);
        cv_init(&fss_device_cv, "snapwait");
-       if (config_cfattach_attach(fss_cd.cd_name, &fss_ca))
+       SIMPLEQ_INIT(&fss_create_queue);
+
+       error = kthread_create(PRI_NONE, KTHREAD_MUSTJOIN, NULL,
+           fss_create_thread, NULL, &fss_create_lwp, "fss_create");
+       if (error) {
+               aprint_error("%s: kthread_create failed\n", fss_cd.cd_name);
+               goto out;
+       }
+
+       error = config_cfattach_attach(fss_cd.cd_name, &fss_ca);
+       if (error) {
+               aprint_error("%s: config_cfattach_attach failed\n", 
+                   fss_cd.cd_name);
+               goto out;
+       }
+
+out:
+       if (error) {
                aprint_error("%s: unable to register\n", fss_cd.cd_name);
+               if (fss_create_lwp)
+                       fss_create_thread_join();
+               cv_destroy(&fss_device_cv);
+               mutex_destroy(&fss_device_lock);
+       }
+       return;
+
 }
 
 static int
 fss_match(device_t self, cfdata_t cfdata, void *aux)
@@ -219,9 +256,12 @@
                if (sc == NULL) {
                        mutex_exit(&fss_device_lock);
                        return ENOMEM;
                }
+
+               mutex_enter(&sc->sc_slock);
                sc->sc_state = FSS_IDLE;
+               mutex_exit(&sc->sc_slock);
        }
 
        mutex_enter(&sc->sc_slock);
 
@@ -360,8 +400,9 @@
                if (error == 0 && sc->sc_state != FSS_IDLE) {
                        error = EBUSY;
                } else {
                        sc->sc_state = FSS_CREATING;
+                       /* XXX copyin with mutex held? */
                        copyinstr(fss->fss_mount, sc->sc_mntname,
                            sizeof(sc->sc_mntname), NULL);
                        memset(&sc->sc_time, 0, sizeof(sc->sc_time));
                        sc->sc_clshift = 0;
@@ -372,38 +413,9 @@
 
                /*
                 * Serialize snapshot creation.
                 */
-               mutex_enter(&fss_device_lock);
-               while (fss_creating) {
-                       error = cv_wait_sig(&fss_device_cv, &fss_device_lock);
-                       if (error) {
-                               mutex_enter(&sc->sc_slock);
-                               KASSERT(sc->sc_state == FSS_CREATING);
-                               sc->sc_state = FSS_IDLE;
-                               mutex_exit(&sc->sc_slock);
-                               mutex_exit(&fss_device_lock);
-                               break;
-                       }
-               }
-               fss_creating = true;
-               mutex_exit(&fss_device_lock);
-
-               error = fss_create_snapshot(sc, fss, l);
-               mutex_enter(&sc->sc_slock);
-               if (error == 0) {
-                       KASSERT(sc->sc_state == FSS_ACTIVE);
-                       sc->sc_uflags = fss->fss_flags;
-               } else {
-                       KASSERT(sc->sc_state == FSS_CREATING);
-                       sc->sc_state = FSS_IDLE;
-               }
-               mutex_exit(&sc->sc_slock);
-
-               mutex_enter(&fss_device_lock);
-               fss_creating = false;
-               cv_broadcast(&fss_device_cv);
-               mutex_exit(&fss_device_lock);
+               fss_create_enqueue(sc, fss);
 
                break;
 
        case FSSIOCCLR:
@@ -701,14 +713,14 @@
        struct timespec ts;
        /* distinguish lookup 1 from lookup 2 to reduce mistakes */
        struct pathbuf *pb2;
        struct vnode *vp, *vp2;
+       struct mount *mount;
 
        /*
         * Get the mounted file system.
         */
-
-       error = namei_simple_user(fss->fss_mount,
+       error = namei_simple_kernel(fss->fss_mount,
                                NSM_FOLLOW_NOEMULROOT, &vp);
        if (error != 0)
                return error;
 
@@ -716,50 +728,61 @@
                vrele(vp);
                return EINVAL;
        }
 
-       sc->sc_mount = vp->v_mount;
-       memcpy(sc->sc_mntname, sc->sc_mount->mnt_stat.f_mntonname, MNAMELEN);
+       mount = vp->v_mount;
+       mutex_enter(&sc->sc_slock);
+       sc->sc_mount = mount;
+       memcpy(sc->sc_mntname, mount->mnt_stat.f_mntonname, MNAMELEN);
+       mutex_exit(&sc->sc_slock);
 
        vrele(vp);
 
        /*
         * Check for file system internal snapshot.
         */
 
-       error = namei_simple_user(fss->fss_bstore,
+       error = namei_simple_kernel(fss->fss_bstore,
                                NSM_FOLLOW_NOEMULROOT, &vp);
        if (error != 0)
                return error;
 
-       if (vp->v_type == VREG && vp->v_mount == sc->sc_mount) {
+       mutex_enter(&sc->sc_slock);
+       if (vp->v_type == VREG && vp->v_mount == mount) {
                sc->sc_flags |= FSS_PERSISTENT;
                sc->sc_bs_vp = vp;
 
-               fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
+               fsbsize = mount->mnt_stat.f_iosize;
                bits = sizeof(sc->sc_bs_bshift)*NBBY;
+
                for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < bits;
                    sc->sc_bs_bshift++)
                        if (FSS_FSBSIZE(sc) == fsbsize)
                                break;
-               if (sc->sc_bs_bshift >= bits)
+
+               if (sc->sc_bs_bshift >= bits) {
+                       mutex_exit(&sc->sc_slock);
                        return EINVAL;
+               }
 
                sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
                sc->sc_clshift = 0;
+               mutex_exit(&sc->sc_slock);
 
                if ((fss->fss_flags & FSS_UNLINK_ON_CREATE) != 0) {
-                       error = do_sys_unlink(fss->fss_bstore, UIO_USERSPACE);
+                       error = do_sys_unlink(fss->fss_bstore, UIO_SYSSPACE);
                        if (error)
                                return error;
                }
                error = vn_lock(vp, LK_EXCLUSIVE);
                if (error != 0)
                        return error;
-               error = VFS_SNAPSHOT(sc->sc_mount, sc->sc_bs_vp, &ts);
+               error = VFS_SNAPSHOT(mount, vp, &ts);
+               mutex_enter(&sc->sc_slock);
                TIMESPEC_TO_TIMEVAL(&sc->sc_time, &ts);
+               mutex_exit(&sc->sc_slock);
 
-               VOP_UNLOCK(sc->sc_bs_vp);
+               VOP_UNLOCK(vp);
 
                return error;
        }
        vrele(vp);
@@ -767,12 +790,15 @@
        /*
         * Get the block device it is mounted on and its size.
         */
 
-       error = spec_node_lookup_by_mount(sc->sc_mount, &vp);
+       error = spec_node_lookup_by_mount(mount, &vp);
        if (error)
                return error;
+
+       mutex_enter(&sc->sc_slock);
        sc->sc_bdev = vp->v_rdev;
+       mutex_exit(&sc->sc_slock);
 
        error = getdisksize(vp, &numsec, &secsize);
        vrele(vp);
        if (error)
@@ -784,19 +810,21 @@
         * Get the backing store
         */
 
        error = pathbuf_copyin(fss->fss_bstore, &pb2);
-       if (error) {
+       if (error)
                return error;
-       }
+
        error = vn_open(NULL, pb2, 0, FREAD|FWRITE, 0, &vp2, NULL, NULL);
        if (error != 0) {
                pathbuf_destroy(pb2);
                return error;
        }
        VOP_UNLOCK(vp2);
 
+       mutex_enter(&sc->sc_slock);
        sc->sc_bs_vp = vp2;
+       mutex_exit(&sc->sc_slock);
 
        if (vp2->v_type != VREG && vp2->v_type != VCHR) {
                vrele(vp2);
                pathbuf_destroy(pb2);
@@ -804,27 +832,34 @@
        }
        pathbuf_destroy(pb2);
 
        if ((fss->fss_flags & FSS_UNLINK_ON_CREATE) != 0) {
-               error = do_sys_unlink(fss->fss_bstore, UIO_USERSPACE);
+               error = do_sys_unlink(fss->fss_bstore, UIO_SYSSPACE);
                if (error)
                        return error;
        }
+
+       mutex_enter(&sc->sc_slock);
        if (sc->sc_bs_vp->v_type == VREG) {
                fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
-               if (fsbsize & (fsbsize-1))      /* No power of two */
+               if (fsbsize & (fsbsize-1)) {    /* No power of two */
+                       mutex_exit(&sc->sc_slock);
                        return EINVAL;
+               }
                for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < 32;
                    sc->sc_bs_bshift++)
                        if (FSS_FSBSIZE(sc) == fsbsize)
                                break;
-               if (sc->sc_bs_bshift >= 32)
+               if (sc->sc_bs_bshift >= 32) {
+                       mutex_exit(&sc->sc_slock);
                        return EINVAL;
+               }
                sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
        } else {
                sc->sc_bs_bshift = DEV_BSHIFT;
                sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
        }
+       mutex_exit(&sc->sc_slock);
 
        return 0;
 }
 
@@ -833,8 +868,10 @@
  */
 static int
 fss_create_snapshot(struct fss_softc *sc, struct fss_set *fss, struct lwp *l)
 {
+       struct mount *mount = sc->sc_mount;
+       struct vnode *vp;
        int len, error;
        u_int32_t csize;
        off_t bsize;
 
@@ -845,15 +882,16 @@
         */
        if ((error = fss_create_files(sc, fss, &bsize, l)) != 0)
                goto bad;
 
+       mutex_enter(&sc->sc_slock);
        if (sc->sc_flags & FSS_PERSISTENT) {
-               fss_softc_alloc(sc);
-               mutex_enter(&sc->sc_slock);
                sc->sc_state = FSS_ACTIVE;
                mutex_exit(&sc->sc_slock);
+               fss_softc_alloc(sc);
                return 0;
        }
+       mutex_exit(&sc->sc_slock);
 
        /*
         * Set cluster size. Must be a power of two and
         * a multiple of backing store block size.
@@ -864,13 +902,15 @@
                csize = fss->fss_csize;
        if (bsize/csize > FSS_CLUSTER_MAX)
                csize = bsize/FSS_CLUSTER_MAX+1;
 
+       mutex_enter(&sc->sc_slock);
        for (sc->sc_clshift = sc->sc_bs_bshift; sc->sc_clshift < 32;
            sc->sc_clshift++)
                if (FSS_CLSIZE(sc) >= csize)
                        break;
        if (sc->sc_clshift >= 32) {
+               mutex_exit(&sc->sc_slock);
                error = EINVAL;
                goto bad;
        }
        sc->sc_clmask = FSS_CLSIZE(sc)-1;
@@ -898,52 +938,67 @@
        sc->sc_indir_size = FSS_BTOCL(sc, len)+1;
        sc->sc_clnext = sc->sc_indir_size;
        sc->sc_indir_cur = 0;
 
-       if ((error = fss_softc_alloc(sc)) != 0)
+       mutex_exit(&sc->sc_slock);
+
+       if ((error = fss_softc_alloc(sc)) != 0) {
                goto bad;
+       }
 
        /*
         * Activate the snapshot.
         */
 
-       if ((error = vfs_suspend(sc->sc_mount, 0)) != 0)
+       if ((error = vfs_suspend(mount, 0)) != 0)
                goto bad;
 
+       mutex_enter(&sc->sc_slock);
        microtime(&sc->sc_time);
+       mutex_exit(&sc->sc_slock);
 
-       vrele_flush(sc->sc_mount);
-       error = VFS_SYNC(sc->sc_mount, MNT_WAIT, curlwp->l_cred);
-       if (error == 0)
-               error = fscow_establish(sc->sc_mount, fss_copy_on_write, sc);
+       vrele_flush(mount);
+       error = VFS_SYNC(mount, MNT_WAIT, curlwp->l_cred);
+       if (error == 0) 
+               error = fscow_establish(mount, fss_copy_on_write, sc);
        if (error == 0) {
                mutex_enter(&sc->sc_slock);
                sc->sc_state = FSS_ACTIVE;
                mutex_exit(&sc->sc_slock);
        }
 
-       vfs_resume(sc->sc_mount);
+       vfs_resume(mount);
 
        if (error != 0)
                goto bad;
 
+       mutex_enter(&sc->sc_slock);
        aprint_debug_dev(sc->sc_dev, "%s snapshot active\n", sc->sc_mntname);
        aprint_debug_dev(sc->sc_dev,
            "%u clusters of %u, %u cache slots, %u indir clusters\n",
            sc->sc_clcount, FSS_CLSIZE(sc),
            sc->sc_cache_size, sc->sc_indir_size);
+       mutex_exit(&sc->sc_slock);
 
        return 0;
 
 bad:
        fss_softc_free(sc);
-       if (sc->sc_bs_vp != NULL) {
-               if (sc->sc_flags & FSS_PERSISTENT)
-                       vrele(sc->sc_bs_vp);
-               else
-                       vn_close(sc->sc_bs_vp, FREAD|FWRITE, l->l_cred);
+
+       mutex_enter(&sc->sc_slock);
+       if ((vp = sc->sc_bs_vp) != NULL) {
+               if (sc->sc_flags & FSS_PERSISTENT) {
+                       mutex_exit(&sc->sc_slock);
+                       vrele(vp);
+               } else {
+                       mutex_exit(&sc->sc_slock);
+                       vn_close(vp, FREAD|FWRITE, l->l_cred);
+               }
        }
+
+       mutex_enter(&sc->sc_slock);
        sc->sc_bs_vp = NULL;
+       mutex_exit(&sc->sc_slock);
 
        return error;
 }
 
@@ -1368,8 +1423,81 @@
                mutex_enter(&sc->sc_slock);
        }
 }
 
+static void
+fss_create_enqueue(struct fss_softc *sc, struct fss_set *fss)
+{
+       struct fss_create_entry *fce;
+       
+       fce = kmem_zalloc(sizeof(*fce), KM_SLEEP);
+       fce->fce_sc = sc;
+       if (fss) {
+               memcpy(&fce->fce_fss, fss, sizeof(*fss));
+
+               copyinstr(fss->fss_mount, fce->fce_mount,
+                   sizeof(fce->fce_mount), NULL);
+               fce->fce_fss.fss_mount = fce->fce_mount;
+
+               copyinstr(fss->fss_bstore, fce->fce_bstore,
+                   sizeof(fce->fce_bstore), NULL);
+               fce->fce_fss.fss_bstore = fce->fce_bstore;
+       }
+
+       mutex_enter(&fss_device_lock);
+       SIMPLEQ_INSERT_TAIL(&fss_create_queue, fce, fce_link);
+       mutex_exit(&fss_device_lock);
+
+       cv_signal(&fss_device_cv);
+
+       return;
+}
+
+/* 
+ * Snapshot serialization thread, for all fss devices
+ */
+static void
+fss_create_thread(void *arg)
+{
+       struct fss_create_entry *fce;
+
+       do {
+               struct fss_softc *sc;
+               struct fss_set *fss;
+               int error;
+
+               mutex_enter(&fss_device_lock);
+               while ((fce = SIMPLEQ_FIRST(&fss_create_queue)) == NULL)
+                       cv_wait(&fss_device_cv, &fss_device_lock);
+
+               SIMPLEQ_REMOVE_HEAD(&fss_create_queue, fce_link);
+
+               sc = fce->fce_sc;
+               fss = &fce->fce_fss;
+
+               if (sc == NULL) {
+                       mutex_exit(&fss_device_lock);
+                       kmem_free(fce, sizeof(*fce));
+                       kthread_exit(0);
+               }
+
+               mutex_exit(&fss_device_lock);
+
+               error = fss_create_snapshot(sc, fss, curlwp);
+               mutex_enter(&sc->sc_slock);
+               if (error == 0) {
+                       KASSERT(sc->sc_state == FSS_ACTIVE);
+                       sc->sc_uflags = fss->fss_flags;
+               } else {
+                       KASSERT(sc->sc_state == FSS_CREATING);
+                       sc->sc_state = FSS_IDLE;
+               }
+               mutex_exit(&sc->sc_slock);
+
+               kmem_free(fce, sizeof(*fce));
+       } while (1);
+}
+
 #ifdef _MODULE
 
 #include <sys/module.h>
 
@@ -1377,36 +1505,55 @@
 CFDRIVER_DECL(fss, DV_DISK, NULL);
 
 devmajor_t fss_bmajor = -1, fss_cmajor = -1;
 
+static void
+fss_create_thread_join(void) {
+       fss_create_enqueue(NULL, NULL);
+       if (fss_create_lwp != NULL)
+               kthread_join(fss_create_lwp);
+       fss_create_lwp = NULL;
+}
+
 static int
 fss_modcmd(modcmd_t cmd, void *arg)
 {
        int error = 0;
 
        switch (cmd) {
        case MODULE_CMD_INIT:
+               error = kthread_create(PRI_NONE, KTHREAD_MUSTJOIN, NULL,
+                   fss_create_thread, NULL, &fss_create_lwp, "fss_create");
+               if (error)
+                       break;
+
                mutex_init(&fss_device_lock, MUTEX_DEFAULT, IPL_NONE);
                cv_init(&fss_device_cv, "snapwait");
 
                error = devsw_attach(fss_cd.cd_name,
                    &fss_bdevsw, &fss_bmajor, &fss_cdevsw, &fss_cmajor);
                if (error) {
+                       fss_create_thread_join();
+                       cv_destroy(&fss_device_cv);
                        mutex_destroy(&fss_device_lock);
                        break;
                }
 
                error = config_cfdriver_attach(&fss_cd);
                if (error) {
                        devsw_detach(&fss_bdevsw, &fss_cdevsw);
+                       fss_create_thread_join();
+                       cv_destroy(&fss_device_cv);
                        mutex_destroy(&fss_device_lock);
                        break;
                }
 
                error = config_cfattach_attach(fss_cd.cd_name, &fss_ca);
                if (error) {
                        config_cfdriver_detach(&fss_cd);
                        devsw_detach(&fss_bdevsw, &fss_cdevsw);
+                       fss_create_thread_join();
+                       cv_destroy(&fss_device_cv);
                        mutex_destroy(&fss_device_lock);
                        break;
                }
 
@@ -1422,8 +1569,9 @@
                        config_cfattach_attach(fss_cd.cd_name, &fss_ca);
                        break;
                }
                devsw_detach(&fss_bdevsw, &fss_cdevsw);
+               fss_create_thread_join();
                cv_destroy(&fss_device_cv);
                mutex_destroy(&fss_device_lock);
                break;
 

Reply via email to