This patch is another take at fixing the race between mount and umount resetting the blocksize and causing buffer errors, infinite loops in __getblk_slow, and possibly other undiscovered effects.
It adds possible flags to bd_claim such that the caller can request exclusive access and/or wait until the device becomes available. Since bd_claim already allows/denies access based on the holder, the BD_EXCL flag operates under the assumption that all callers of bd_claim for a particular holder will use the flag. This is currently true. BD_WAIT places the request on a wait queue until access can be granted. It uses a global wait queue, which isn't a contention point since bd_claim/bd_release both operate under the global bdev_lock anyway. Filesystems (via get_sb_bdev) now use BD_EXCL|BD_WAIT to ensure the previous mount has completely shut down and closed the device before re-opening it for a new mount. It's ugly, and I'm open to suggestions, but it seems to be the only way to stop this race reliably. Signed-off-by: Jeff Mahoney <[EMAIL PROTECTED]> diff -ruNpX dontdiff linux-2.6.11/fs/block_dev.c linux-2.6.11.bs/fs/block_dev.c --- linux-2.6.11/fs/block_dev.c 2005-03-14 21:25:20.000000000 -0500 +++ linux-2.6.11.bs/fs/block_dev.c 2005-03-14 22:17:16.000000000 -0500 @@ -238,6 +238,7 @@ static int block_fsync(struct file *filp */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); +static DECLARE_WAIT_QUEUE_HEAD(bdev_wq); static kmem_cache_t * bdev_cachep; static struct inode *bdev_alloc_inode(struct super_block *sb) @@ -443,20 +444,33 @@ void bd_forget(struct inode *inode) spin_unlock(&bdev_lock); } -int bd_claim(struct block_device *bdev, void *holder) +/* + * flags: + * BD_NONE: No special behavior. + * BD_EXCL: Must have sole access to device, even if holder is the same. + * This is really enforced by the holder always using BD_EXCL. + * BD_WAIT: Wait until access is available before returning. + */ +int __bd_claim(struct block_device *bdev, void *holder, int flags) { int res; + DEFINE_WAIT (wait); + +retry: spin_lock(&bdev_lock); + prepare_to_wait (&bdev_wq, &wait, TASK_UNINTERRUPTIBLE); /* first decide result */ - if (bdev->bd_holder == holder) + if (bdev->bd_holder == holder) { res = 0; /* already a holder */ - else if (bdev->bd_holder != NULL) + if (flags & BD_EXCL) + res = -EBUSY; + } else if (bdev->bd_holder != NULL) res = -EBUSY; /* held by someone else */ else if (bdev->bd_contains == bdev) res = 0; /* is a whole device which isn't held */ - else if (bdev->bd_contains->bd_holder == bd_claim) + else if (bdev->bd_contains->bd_holder == __bd_claim) res = 0; /* is a partition of a device that is being partitioned */ else if (bdev->bd_contains->bd_holder != NULL) res = -EBUSY; /* is a partition of a held device */ @@ -470,15 +484,21 @@ int bd_claim(struct block_device *bdev, * be set to bd_claim before being set to holder */ bdev->bd_contains->bd_holders ++; - bdev->bd_contains->bd_holder = bd_claim; + bdev->bd_contains->bd_holder = __bd_claim; bdev->bd_holders++; bdev->bd_holder = holder; + } else if (flags & BD_WAIT) { + spin_unlock (&bdev_lock); + schedule(); + goto retry; } + + finish_wait (&bdev_wq, &wait); spin_unlock(&bdev_lock); return res; } -EXPORT_SYMBOL(bd_claim); +EXPORT_SYMBOL(__bd_claim); void bd_release(struct block_device *bdev) { @@ -488,6 +508,7 @@ void bd_release(struct block_device *bde if (!--bdev->bd_holders) bdev->bd_holder = NULL; spin_unlock(&bdev_lock); + wake_up_all (&bdev_wq); } EXPORT_SYMBOL(bd_release); @@ -876,7 +897,8 @@ fail: * Open the blockdevice described by the special file at @path, claim it * for the @holder. */ -struct block_device *open_bdev_excl(const char *path, int flags, void *holder) +struct block_device *__open_bdev_excl(const char *path, int flags, + void *holder, int bdflags) { struct block_device *bdev; mode_t mode = FMODE_READ; @@ -894,7 +916,7 @@ struct block_device *open_bdev_excl(cons error = -EACCES; if (!(flags & MS_RDONLY) && bdev_read_only(bdev)) goto blkdev_put; - error = bd_claim(bdev, holder); + error = __bd_claim(bdev, holder, bdflags); if (error) goto blkdev_put; @@ -905,7 +927,7 @@ blkdev_put: return ERR_PTR(error); } -EXPORT_SYMBOL(open_bdev_excl); +EXPORT_SYMBOL(__open_bdev_excl); /** * close_bdev_excl - release a blockdevice openen by open_bdev_excl() diff -ruNpX dontdiff linux-2.6.11/fs/super.c linux-2.6.11.bs/fs/super.c --- linux-2.6.11/fs/super.c 2005-03-14 21:25:20.000000000 -0500 +++ linux-2.6.11.bs/fs/super.c 2005-03-14 21:38:22.000000000 -0500 @@ -677,7 +677,7 @@ struct super_block *get_sb_bdev(struct f struct super_block *s; int error = 0; - bdev = open_bdev_excl(dev_name, flags, fs_type); + bdev = __open_bdev_excl(dev_name, flags, fs_type, BD_EXCL|BD_WAIT); if (IS_ERR(bdev)) return (struct super_block *)bdev; diff -ruNpX dontdiff linux-2.6.11/include/linux/fs.h linux-2.6.11.bs/include/linux/fs.h --- linux-2.6.11/include/linux/fs.h 2005-03-14 21:25:20.000000000 -0500 +++ linux-2.6.11.bs/include/linux/fs.h 2005-03-14 21:45:21.000000000 -0500 @@ -1320,7 +1320,14 @@ extern int blkdev_ioctl(struct inode *, extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); extern int blkdev_get(struct block_device *, mode_t, unsigned); extern int blkdev_put(struct block_device *); -extern int bd_claim(struct block_device *, void *); +#define BD_NONE 0x0 +#define BD_EXCL 0x1 +#define BD_WAIT 0x2 +extern int __bd_claim(struct block_device *, void *, int); +static inline int bd_claim(struct block_device *bdev, void *holder) +{ + return __bd_claim(bdev, holder, 0); +} extern void bd_release(struct block_device *); /* fs/char_dev.c */ @@ -1337,6 +1344,11 @@ extern int chrdev_open(struct inode *, s extern const char *__bdevname(dev_t, char *buffer); extern const char *bdevname(struct block_device *bdev, char *buffer); extern struct block_device *lookup_bdev(const char *); +extern struct block_device *__open_bdev_excl(const char *, int, void *, int); +static inline struct block_device *open_bdev_excl(const char *path, int flags, void *holder) +{ + return __open_bdev_excl(path, flags, holder, BD_NONE); +} extern struct block_device *open_bdev_excl(const char *, int, void *); extern void close_bdev_excl(struct block_device *); -- Jeff Mahoney SuSE Labs - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/