Hello, I took more time on fss.c. I still have some trouble grasping the locking protocol in this file, but I have a proposal to address the most offending issue: snapshot creations are serialized and the driver holds fss_device_lock while they occur. On a big filesystem the operation can take dozens of seconds, during which fssconfig(1) commands get stuck in the kernel waiting for fss_device_lock.
In the attached patch, I added a thread dedicated to snapshot creation. This thread takes snapshot creation requests and executes them one by one; the caller does not have to wait on fss_device_lock. While there, I added a few mutex_enter/mutex_exit pairs for sc->sc_slock where usage looked inconsistent with the rest of the file, and I tried to avoid holding the lock across long operations. One question is still pending: is it okay to hold the lock during a copyin? Opinions welcome.

-- 
Emmanuel Dreyfus
m...@netbsd.org
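For clarity, here is a minimal, self-contained sketch of the request-queue plus worker-thread pattern the patch follows. It is not the patch itself: the snapq_* names are invented for illustration, error handling and detach are omitted, and only the usual NetBSD kernel primitives (kmutex, kcondvar, SIMPLEQ, kthread, kmem) are assumed.

/*
 * Sketch only: callers queue a request and return immediately;
 * a single kernel thread drains the queue and runs the work.
 */
#include <sys/param.h>
#include <sys/condvar.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/queue.h>

struct snapq_req {
	void (*sr_func)(void *);	/* work to perform, NULL = shut down */
	void *sr_arg;
	SIMPLEQ_ENTRY(snapq_req) sr_link;
};

static SIMPLEQ_HEAD(, snapq_req) snapq = SIMPLEQ_HEAD_INITIALIZER(snapq);
static kmutex_t snapq_lock;
static kcondvar_t snapq_cv;
static lwp_t *snapq_lwp;

/* Producer side: short critical section, no waiting for the work. */
static void
snapq_enqueue(void (*func)(void *), void *arg)
{
	struct snapq_req *sr;

	sr = kmem_zalloc(sizeof(*sr), KM_SLEEP);
	sr->sr_func = func;
	sr->sr_arg = arg;

	mutex_enter(&snapq_lock);
	SIMPLEQ_INSERT_TAIL(&snapq, sr, sr_link);
	cv_signal(&snapq_cv);
	mutex_exit(&snapq_lock);
}

/* Consumer side: run each request outside the queue lock. */
static void
snapq_thread(void *arg)
{
	struct snapq_req *sr;

	for (;;) {
		mutex_enter(&snapq_lock);
		while ((sr = SIMPLEQ_FIRST(&snapq)) == NULL)
			cv_wait(&snapq_cv, &snapq_lock);
		SIMPLEQ_REMOVE_HEAD(&snapq, sr_link);
		mutex_exit(&snapq_lock);

		if (sr->sr_func == NULL) {	/* shutdown marker */
			kmem_free(sr, sizeof(*sr));
			kthread_exit(0);
		}
		(*sr->sr_func)(sr->sr_arg);	/* the long operation */
		kmem_free(sr, sizeof(*sr));
	}
}

static int
snapq_init(void)
{

	mutex_init(&snapq_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&snapq_cv, "snapq");
	return kthread_create(PRI_NONE, KTHREAD_MUSTJOIN, NULL,
	    snapq_thread, NULL, &snapq_lwp, "snapq");
}

The point of the pattern is that the caller only pays for the allocation and a short critical section around the queue insert; the long-running work happens in the worker thread, so the global lock never has to be held across it. The actual patch follows: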
Index: fss.c
===================================================================
RCS file: /cvsroot/src/sys/dev/fss.c,v
retrieving revision 1.114
diff -U4 -r1.114 fss.c
--- fss.c	22 Mar 2023 21:14:46 -0000	1.114
+++ fss.c	9 Aug 2024 16:41:30 -0000
@@ -54,8 +54,9 @@
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/uio.h>
 #include <sys/conf.h>
+#include <sys/queue.h>
 #include <sys/kthread.h>
 #include <sys/fstrans.h>
 #include <sys/vfs_syscalls.h>		/* For do_sys_unlink(). */
@@ -90,12 +91,23 @@
 static void fss_bs_thread(void *);
 static int fss_bs_io(struct fss_softc *, fss_io_type,
     u_int32_t, off_t, int, void *, size_t *);
 static u_int32_t *fss_bs_indir(struct fss_softc *, u_int32_t);
+static void fss_create_enqueue(struct fss_softc *, struct fss_set *);
+static void fss_create_thread(void *);
+
+SIMPLEQ_HEAD(fss_create_queue, fss_create_entry) fss_create_queue;
+struct fss_create_entry {
+	struct fss_softc *fce_sc;
+	struct fss_set fce_fss;
+	char fce_mount[MNAMELEN];
+	char fce_bstore[MNAMELEN];
+	SIMPLEQ_ENTRY(fss_create_entry) fce_link;
+};
 
 static kmutex_t fss_device_lock;	/* Protect all units. */
 static kcondvar_t fss_device_cv;	/* Serialize snapshot creation. */
-static bool fss_creating = false;	/* Currently creating a snapshot. */
+lwp_t *fss_create_lwp = NULL;		/* create thread lwp */
 static int fss_num_attached = 0;	/* Number of attached devices. */
 static struct vfs_hooks fss_vfs_hooks = {
 	.vh_unmount = fss_unmount_hook
 };
@@ -135,13 +147,38 @@
 void
 fssattach(int num)
 {
+	int error;
 
 	mutex_init(&fss_device_lock, MUTEX_DEFAULT, IPL_NONE);
 	cv_init(&fss_device_cv, "snapwait");
-	if (config_cfattach_attach(fss_cd.cd_name, &fss_ca))
+	SIMPLEQ_INIT(&fss_create_queue);
+
+	error = kthread_create(PRI_NONE, KTHREAD_MUSTJOIN, NULL,
+	    fss_create_thread, NULL, &fss_create_lwp, "fss_create");
+	if (error) {
+		aprint_error("%s: kthread_create failed\n", fss_cd.cd_name);
+		goto out;
+	}
+
+	error = config_cfattach_attach(fss_cd.cd_name, &fss_ca);
+	if (error) {
+		aprint_error("%s: config_cfattach_attach failed\n",
+		    fss_cd.cd_name);
+		goto out;
+	}
+
+out:
+	if (error) {
 		aprint_error("%s: unable to register\n", fss_cd.cd_name);
+		if (fss_create_lwp)
+			fss_create_thread_join();
+		cv_destroy(&fss_device_cv);
+		mutex_destroy(&fss_device_lock);
+	}
+	return;
+
 }
 
 static int
 fss_match(device_t self, cfdata_t cfdata, void *aux)
@@ -219,9 +256,12 @@
 		if (sc == NULL) {
 			mutex_exit(&fss_device_lock);
 			return ENOMEM;
 		}
+
+		mutex_enter(&sc->sc_slock);
 		sc->sc_state = FSS_IDLE;
+		mutex_exit(&sc->sc_slock);
 	}
 
 	mutex_enter(&sc->sc_slock);
@@ -360,8 +400,9 @@
 		if (error == 0 && sc->sc_state != FSS_IDLE) {
 			error = EBUSY;
 		} else {
 			sc->sc_state = FSS_CREATING;
+			/* XXX copyin with mutex held? */
 			copyinstr(fss->fss_mount, sc->sc_mntname,
 			    sizeof(sc->sc_mntname), NULL);
 			memset(&sc->sc_time, 0, sizeof(sc->sc_time));
 			sc->sc_clshift = 0;
@@ -372,38 +413,9 @@
 
 		/*
 		 * Serialize snapshot creation.
 		 */
-		mutex_enter(&fss_device_lock);
-		while (fss_creating) {
-			error = cv_wait_sig(&fss_device_cv, &fss_device_lock);
-			if (error) {
-				mutex_enter(&sc->sc_slock);
-				KASSERT(sc->sc_state == FSS_CREATING);
-				sc->sc_state = FSS_IDLE;
-				mutex_exit(&sc->sc_slock);
-				mutex_exit(&fss_device_lock);
-				break;
-			}
-		}
-		fss_creating = true;
-		mutex_exit(&fss_device_lock);
-
-		error = fss_create_snapshot(sc, fss, l);
-		mutex_enter(&sc->sc_slock);
-		if (error == 0) {
-			KASSERT(sc->sc_state == FSS_ACTIVE);
-			sc->sc_uflags = fss->fss_flags;
-		} else {
-			KASSERT(sc->sc_state == FSS_CREATING);
-			sc->sc_state = FSS_IDLE;
-		}
-		mutex_exit(&sc->sc_slock);
-
-		mutex_enter(&fss_device_lock);
-		fss_creating = false;
-		cv_broadcast(&fss_device_cv);
-		mutex_exit(&fss_device_lock);
+		fss_create_enqueue(sc, fss);
 
 		break;
 
 	case FSSIOCCLR:
@@ -701,14 +713,14 @@
 	struct timespec ts;
 	/* distinguish lookup 1 from lookup 2 to reduce mistakes */
 	struct pathbuf *pb2;
 	struct vnode *vp, *vp2;
+	struct mount *mount;
 
 	/*
 	 * Get the mounted file system.
 	 */
-
-	error = namei_simple_user(fss->fss_mount,
+	error = namei_simple_kernel(fss->fss_mount,
 	    NSM_FOLLOW_NOEMULROOT, &vp);
 	if (error != 0)
 		return error;
@@ -716,50 +728,61 @@
 		vrele(vp);
 		return EINVAL;
 	}
-	sc->sc_mount = vp->v_mount;
-	memcpy(sc->sc_mntname, sc->sc_mount->mnt_stat.f_mntonname, MNAMELEN);
+	mount = vp->v_mount;
+	mutex_enter(&sc->sc_slock);
+	sc->sc_mount = mount;
+	memcpy(sc->sc_mntname, mount->mnt_stat.f_mntonname, MNAMELEN);
+	mutex_exit(&sc->sc_slock);
 	vrele(vp);
 
 	/*
 	 * Check for file system internal snapshot.
 	 */
-	error = namei_simple_user(fss->fss_bstore,
+	error = namei_simple_kernel(fss->fss_bstore,
 	    NSM_FOLLOW_NOEMULROOT, &vp);
 	if (error != 0)
 		return error;
 
-	if (vp->v_type == VREG && vp->v_mount == sc->sc_mount) {
+	mutex_enter(&sc->sc_slock);
+	if (vp->v_type == VREG && vp->v_mount == mount) {
 		sc->sc_flags |= FSS_PERSISTENT;
 		sc->sc_bs_vp = vp;
-		fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
+		fsbsize = mount->mnt_stat.f_iosize;
 		bits = sizeof(sc->sc_bs_bshift)*NBBY;
+
 		for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < bits;
 		    sc->sc_bs_bshift++)
 			if (FSS_FSBSIZE(sc) == fsbsize)
 				break;
-		if (sc->sc_bs_bshift >= bits)
+
+		if (sc->sc_bs_bshift >= bits) {
+			mutex_exit(&sc->sc_slock);
 			return EINVAL;
+		}
 		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
 		sc->sc_clshift = 0;
+		mutex_exit(&sc->sc_slock);
 		if ((fss->fss_flags & FSS_UNLINK_ON_CREATE) != 0) {
-			error = do_sys_unlink(fss->fss_bstore, UIO_USERSPACE);
+			error = do_sys_unlink(fss->fss_bstore, UIO_SYSSPACE);
			if (error)
				return error;
 		}
 		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error != 0)
 			return error;
-		error = VFS_SNAPSHOT(sc->sc_mount, sc->sc_bs_vp, &ts);
+		error = VFS_SNAPSHOT(mount, vp, &ts);
+		mutex_enter(&sc->sc_slock);
 		TIMESPEC_TO_TIMEVAL(&sc->sc_time, &ts);
+		mutex_exit(&sc->sc_slock);
 
-		VOP_UNLOCK(sc->sc_bs_vp);
+		VOP_UNLOCK(vp);
 		return error;
 	}
 	vrele(vp);
@@ -767,12 +790,15 @@
 
 	/*
 	 * Get the block device it is mounted on and its size.
 	 */
-	error = spec_node_lookup_by_mount(sc->sc_mount, &vp);
+	error = spec_node_lookup_by_mount(mount, &vp);
 	if (error)
 		return error;
+
+	mutex_enter(&sc->sc_slock);
 	sc->sc_bdev = vp->v_rdev;
+	mutex_exit(&sc->sc_slock);
 	error = getdisksize(vp, &numsec, &secsize);
 	vrele(vp);
 	if (error)
@@ -784,19 +810,21 @@
 	 * Get the backing store
 	 */
 	error = pathbuf_copyin(fss->fss_bstore, &pb2);
-	if (error) {
+	if (error)
 		return error;
-	}
+
 	error = vn_open(NULL, pb2, 0, FREAD|FWRITE, 0, &vp2, NULL, NULL);
 	if (error != 0) {
 		pathbuf_destroy(pb2);
 		return error;
 	}
 	VOP_UNLOCK(vp2);
+	mutex_enter(&sc->sc_slock);
 	sc->sc_bs_vp = vp2;
+	mutex_exit(&sc->sc_slock);
 	if (vp2->v_type != VREG && vp2->v_type != VCHR) {
 		vrele(vp2);
 		pathbuf_destroy(pb2);
@@ -804,27 +832,34 @@
 	}
 	pathbuf_destroy(pb2);
 
 	if ((fss->fss_flags & FSS_UNLINK_ON_CREATE) != 0) {
-		error = do_sys_unlink(fss->fss_bstore, UIO_USERSPACE);
+		error = do_sys_unlink(fss->fss_bstore, UIO_SYSSPACE);
 		if (error)
 			return error;
 	}
+
+	mutex_enter(&sc->sc_slock);
 	if (sc->sc_bs_vp->v_type == VREG) {
 		fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
-		if (fsbsize & (fsbsize-1))	/* No power of two */
+		if (fsbsize & (fsbsize-1)) {	/* No power of two */
+			mutex_exit(&sc->sc_slock);
 			return EINVAL;
+		}
 		for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < 32;
 		    sc->sc_bs_bshift++)
 			if (FSS_FSBSIZE(sc) == fsbsize)
 				break;
-		if (sc->sc_bs_bshift >= 32)
+		if (sc->sc_bs_bshift >= 32) {
+			mutex_exit(&sc->sc_slock);
 			return EINVAL;
+		}
 		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
 	} else {
 		sc->sc_bs_bshift = DEV_BSHIFT;
 		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
 	}
+	mutex_exit(&sc->sc_slock);
 
 	return 0;
 }
@@ -833,8 +868,10 @@
  */
 static int
 fss_create_snapshot(struct fss_softc *sc, struct fss_set *fss, struct lwp *l)
 {
+	struct mount *mount = sc->sc_mount;
+	struct vnode *vp;
 	int len, error;
 	u_int32_t csize;
 	off_t bsize;
@@ -845,15 +882,16 @@
 	 */
 	if ((error = fss_create_files(sc, fss, &bsize, l)) != 0)
 		goto bad;
 
+	mutex_enter(&sc->sc_slock);
 	if (sc->sc_flags & FSS_PERSISTENT) {
-		fss_softc_alloc(sc);
-		mutex_enter(&sc->sc_slock);
 		sc->sc_state = FSS_ACTIVE;
 		mutex_exit(&sc->sc_slock);
+		fss_softc_alloc(sc);
 		return 0;
 	}
+	mutex_exit(&sc->sc_slock);
 
 	/*
 	 * Set cluster size. Must be a power of two and
 	 * a multiple of backing store block size.
 	 */
@@ -864,13 +902,15 @@
 	csize = fss->fss_csize;
 	if (bsize/csize > FSS_CLUSTER_MAX)
 		csize = bsize/FSS_CLUSTER_MAX+1;
 
+	mutex_enter(&sc->sc_slock);
 	for (sc->sc_clshift = sc->sc_bs_bshift; sc->sc_clshift < 32;
 	    sc->sc_clshift++)
 		if (FSS_CLSIZE(sc) >= csize)
 			break;
 	if (sc->sc_clshift >= 32) {
+		mutex_exit(&sc->sc_slock);
 		error = EINVAL;
 		goto bad;
 	}
 	sc->sc_clmask = FSS_CLSIZE(sc)-1;
@@ -898,52 +938,67 @@
 	sc->sc_indir_size = FSS_BTOCL(sc, len)+1;
 	sc->sc_clnext = sc->sc_indir_size;
 	sc->sc_indir_cur = 0;
-	if ((error = fss_softc_alloc(sc)) != 0)
+	mutex_exit(&sc->sc_slock);
+
+	if ((error = fss_softc_alloc(sc)) != 0) {
 		goto bad;
+	}
 
 	/*
 	 * Activate the snapshot.
 	 */
-	if ((error = vfs_suspend(sc->sc_mount, 0)) != 0)
+	if ((error = vfs_suspend(mount, 0)) != 0)
 		goto bad;
 
+	mutex_enter(&sc->sc_slock);
 	microtime(&sc->sc_time);
+	mutex_exit(&sc->sc_slock);
 
-	vrele_flush(sc->sc_mount);
-	error = VFS_SYNC(sc->sc_mount, MNT_WAIT, curlwp->l_cred);
-	if (error == 0)
-		error = fscow_establish(sc->sc_mount, fss_copy_on_write, sc);
+	vrele_flush(mount);
+	error = VFS_SYNC(mount, MNT_WAIT, curlwp->l_cred);
+	if (error == 0)
+		error = fscow_establish(mount, fss_copy_on_write, sc);
 	if (error == 0) {
 		mutex_enter(&sc->sc_slock);
 		sc->sc_state = FSS_ACTIVE;
 		mutex_exit(&sc->sc_slock);
 	}
 
-	vfs_resume(sc->sc_mount);
+	vfs_resume(mount);
 	if (error != 0)
 		goto bad;
 
+	mutex_enter(&sc->sc_slock);
 	aprint_debug_dev(sc->sc_dev, "%s snapshot active\n", sc->sc_mntname);
 	aprint_debug_dev(sc->sc_dev,
 	    "%u clusters of %u, %u cache slots, %u indir clusters\n",
 	    sc->sc_clcount, FSS_CLSIZE(sc), sc->sc_cache_size,
 	    sc->sc_indir_size);
+	mutex_exit(&sc->sc_slock);
 
 	return 0;
 
 bad:
 	fss_softc_free(sc);
-	if (sc->sc_bs_vp != NULL) {
-		if (sc->sc_flags & FSS_PERSISTENT)
-			vrele(sc->sc_bs_vp);
-		else
-			vn_close(sc->sc_bs_vp, FREAD|FWRITE, l->l_cred);
+
+	mutex_enter(&sc->sc_slock);
+	if ((vp = sc->sc_bs_vp) != NULL) {
+		if (sc->sc_flags & FSS_PERSISTENT) {
+			mutex_exit(&sc->sc_slock);
+			vrele(vp);
+		} else {
+			mutex_exit(&sc->sc_slock);
+			vn_close(vp, FREAD|FWRITE, l->l_cred);
+		}
 	}
+
+	mutex_enter(&sc->sc_slock);
 	sc->sc_bs_vp = NULL;
+	mutex_exit(&sc->sc_slock);
 
 	return error;
 }
@@ -1368,8 +1423,81 @@
 		mutex_enter(&sc->sc_slock);
 	}
 }
 
+static void
+fss_create_enqueue(struct fss_softc *sc, struct fss_set *fss)
+{
+	struct fss_create_entry *fce;
+
+	fce = kmem_zalloc(sizeof(*fce), KM_SLEEP);
+	fce->fce_sc = sc;
+	if (fss) {
+		memcpy(&fce->fce_fss, fss, sizeof(*fss));
+
+		copyinstr(fss->fss_mount, fce->fce_mount,
+		    sizeof(fce->fce_mount), NULL);
+		fce->fce_fss.fss_mount = fce->fce_mount;
+
+		copyinstr(fss->fss_bstore, fce->fce_bstore,
+		    sizeof(fce->fce_bstore), NULL);
+		fce->fce_fss.fss_bstore = fce->fce_bstore;
+	}
+
+	mutex_enter(&fss_device_lock);
+	SIMPLEQ_INSERT_TAIL(&fss_create_queue, fce, fce_link);
+	mutex_exit(&fss_device_lock);
+
+	cv_signal(&fss_device_cv);
+
+	return;
+}
+
+/*
+ * Snapshot serialization thread, for all fss devices
+ */
+static void
+fss_create_thread(void *arg)
+{
+	struct fss_create_entry *fce;
+
+	do {
+		struct fss_softc *sc;
+		struct fss_set *fss;
+		int error;
+
+		mutex_enter(&fss_device_lock);
+		while ((fce = SIMPLEQ_FIRST(&fss_create_queue)) == NULL)
+			cv_wait(&fss_device_cv, &fss_device_lock);
+
+		SIMPLEQ_REMOVE_HEAD(&fss_create_queue, fce_link);
+
+		sc = fce->fce_sc;
+		fss = &fce->fce_fss;
+
+		if (sc == NULL) {
+			mutex_exit(&fss_device_lock);
+			kmem_free(fce, sizeof(*fce));
+			kthread_exit(0);
+		}
+
+		mutex_exit(&fss_device_lock);
+
+		error = fss_create_snapshot(sc, fss, curlwp);
+		mutex_enter(&sc->sc_slock);
+		if (error == 0) {
+			KASSERT(sc->sc_state == FSS_ACTIVE);
+			sc->sc_uflags = fss->fss_flags;
+		} else {
+			KASSERT(sc->sc_state == FSS_CREATING);
+			sc->sc_state = FSS_IDLE;
+		}
+		mutex_exit(&sc->sc_slock);
+
+		kmem_free(fce, sizeof(*fce));
+	} while (1);
+}
+
 #ifdef _MODULE
 
 #include <sys/module.h>
@@ -1377,36 +1505,55 @@
 
 CFDRIVER_DECL(fss, DV_DISK, NULL);
 devmajor_t fss_bmajor = -1, fss_cmajor = -1;
 
+static void
+fss_create_thread_join(void) {
+	fss_create_enqueue(NULL, NULL);
+	if (fss_create_lwp != NULL)
+		kthread_join(fss_create_lwp);
+	fss_create_lwp = NULL;
+}
+
 static int
 fss_modcmd(modcmd_t cmd, void *arg)
 {
 	int error = 0;
 
 	switch (cmd) {
 	case MODULE_CMD_INIT:
+		error = kthread_create(PRI_NONE, KTHREAD_MUSTJOIN, NULL,
+		    fss_create_thread, NULL, &fss_create_lwp, "fss_create");
+		if (error)
+			break;
+
 		mutex_init(&fss_device_lock, MUTEX_DEFAULT, IPL_NONE);
 		cv_init(&fss_device_cv, "snapwait");
 		error = devsw_attach(fss_cd.cd_name, &fss_bdevsw, &fss_bmajor,
 		    &fss_cdevsw, &fss_cmajor);
 		if (error) {
+			fss_create_thread_join();
+			cv_destroy(&fss_device_cv);
 			mutex_destroy(&fss_device_lock);
 			break;
 		}
 		error = config_cfdriver_attach(&fss_cd);
 		if (error) {
 			devsw_detach(&fss_bdevsw, &fss_cdevsw);
+			fss_create_thread_join();
+			cv_destroy(&fss_device_cv);
 			mutex_destroy(&fss_device_lock);
 			break;
 		}
		error = config_cfattach_attach(fss_cd.cd_name, &fss_ca);
 		if (error) {
 			config_cfdriver_detach(&fss_cd);
 			devsw_detach(&fss_bdevsw, &fss_cdevsw);
+			fss_create_thread_join();
+			cv_destroy(&fss_device_cv);
 			mutex_destroy(&fss_device_lock);
 			break;
 		}
@@ -1422,8 +1569,9 @@
 			config_cfattach_attach(fss_cd.cd_name, &fss_ca);
 			break;
 		}
 		devsw_detach(&fss_bdevsw, &fss_cdevsw);
+		fss_create_thread_join();
 		cv_destroy(&fss_device_cv);
 		mutex_destroy(&fss_device_lock);
 		break;