Hi guys,

After many people commented on the deficiencies (well, serious SMP-races!)
of my earlier version of the forced umount support, I have rewritten it to
be safe, to the best of my knowledge.

So, here it is for your comments, testing, improvements etc.

There are no known problems with it so please report to me absolutely
anything suspicious you find.

There are, however, known limitations ;)

 o It doesn't catch SCM_RIGHTS datagrams in flight, i.e. passing fds
   between processes. This is not a bug but a limitation - umount will
   just fail in those cases.

 o There are possible timings (hopefully "rare" - in quotes because it
   depends how busy your apps are) whereby apps doing io on open files
   will still cause umount -f to fail. Again, this is not a bug but a
   limitation. Basically, there was a choice there between:

     a) guaranteed to succeed in almost all cases but be SMP-unsafe

     b) best effort service (since the above is not 100% guaranteed to
        succeed anyway) but be safe always.

Again, I repeat that by "succeed" I mean that umount will successfully
umount the filesystem. If it doesn't succeed it will just fail with EBUSY
but no data structures are corrupted - everything remains consistent. (If
you find otherwise - I would like to know!)

The patch is against 2.4.0-test7-pre3 (and also works on
2.4.0-test7-pre2).

Regards,
Tigran

http://www.moses.uklinux.net/patches/badfs-2.4.0-test7-p2.patch

diff -urN -X dontdiff linux/fs/Makefile badfs/fs/Makefile
--- linux/fs/Makefile   Thu Aug 10 06:51:11 2000
+++ badfs/fs/Makefile   Sat Aug 12 09:14:22 2000
@@ -18,9 +18,9 @@
 ALL_SUB_DIRS := coda minix ext2 fat msdos vfat proc isofs nfs umsdos ntfs \
                hpfs sysv smbfs ncpfs ufs efs affs romfs autofs hfs lockd \
                nfsd nls devpts devfs adfs partitions qnx4 udf bfs cramfs \
-               openpromfs autofs4 ramfs jffs
+               openpromfs autofs4 ramfs jffs badfs
 
-SUB_DIRS :=
+SUB_DIRS := badfs
 
 ifeq ($(CONFIG_QUOTA),y)
 O_OBJS += dquot.o
diff -urN -X dontdiff linux/fs/badfs/Makefile badfs/fs/badfs/Makefile
--- linux/fs/badfs/Makefile     Thu Jan  1 01:00:00 1970
+++ badfs/fs/badfs/Makefile     Sat Aug 12 09:14:36 2000
@@ -0,0 +1,9 @@
+#
+# Makefile for badfs filesystem.
+#
+
+O_TARGET := badfs.o
+O_OBJS   := inode.o
+M_OBJS   := $(O_TARGET)
+
+include $(TOPDIR)/Rules.make
diff -urN -X dontdiff linux/fs/badfs/inode.c badfs/fs/badfs/inode.c
--- linux/fs/badfs/inode.c      Thu Jan  1 01:00:00 1970
+++ badfs/fs/badfs/inode.c      Sat Aug 12 09:21:07 2000
@@ -0,0 +1,265 @@
+/*
+ *  badfs - the Bad Filesystem
+ *
+ *  Author - Tigran Aivazian <[EMAIL PROTECTED]>
+ *
+ *  Thanks to:
+ *     Manfred Spraul <[EMAIL PROTECTED]>, for useful comments.
+ *
+ *  This file is released under the GPL.
+ *
+ *  The badfs filesystem is used by forced umount ('umount -f' command) as
+ *  the place to move inodes that keep the filesystem being umounted busy.
+ *
+ *  The entry point into this module is via quiesce_filesystem() called
+ *  from fs/super.c:do_umount() if MNT_FORCE is passed.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+
+#define BADFS_MAGIC    0xBADF0001
+
+static struct super_block *badfs_read_super(struct super_block *,void *,int);
+
+#define FS_FLAGS_BADFS (FS_NOMOUNT | FS_SINGLE)
+static DECLARE_FSTYPE(badfs_fs_type,"badfs",badfs_read_super,FS_FLAGS_BADFS);
+
+static struct vfsmount *badfs_mnt;     /* returned by kern_mount() */
+static struct super_block *badfs_sb;   /* badfs_mnt->mnt_sb */
+static struct dentry *badfs_root;      /* badfs_sb->s_root */
+/* Module init: register badfs and mount it internally via kern_mount(). */
+static int __init init_badfs_fs(void)
+{
+       int err = register_filesystem(&badfs_fs_type);
+
+       if (!err) {
+               badfs_mnt = kern_mount(&badfs_fs_type);
+               if (IS_ERR(badfs_mnt)) {
+                       err = PTR_ERR(badfs_mnt);
+                       unregister_filesystem(&badfs_fs_type); /* undo */
+               } else {
+                       badfs_sb = badfs_mnt->mnt_sb; /* cache the sb */
+                       err = 0;
+               }
+       }
+       return err;     /* 0 on success, else the failing call's error */
+}
+
+module_init(init_badfs_fs);
+/* Get an empty inode, mark it bad, then attach it to sb with this mode. */
+static struct inode *badfs_get_inode(struct super_block *sb, int mode)
+{
+       struct inode *inode = get_empty_inode();
+
+       if (inode) {
+               make_bad_inode(inode);  /* called before the fields below */
+               inode->i_sb = sb;
+               inode->i_dev = sb->s_dev;
+               inode->i_mode = mode;
+               inode->i_nlink = 1;
+               inode->i_size = 0;
+               inode->i_blocks = 0;
+       }
+       return inode;   /* whatever get_empty_inode() gave us, maybe NULL */
+}
+
+/* VFS ->read_super() method: builds the root inode/dentry; NULL on failure */
+static struct super_block *badfs_read_super(struct super_block * sb, 
+               void * data, int silent)
+{
+       static struct super_operations badfs_ops = {};
+       struct inode * root = badfs_get_inode(sb, S_IFDIR|S_IRUSR|S_IWUSR);
+
+       if (!root)
+               return NULL;
+       sb->s_blocksize = 1024;
+       sb->s_blocksize_bits = 10;
+       sb->s_magic = BADFS_MAGIC;
+       sb->s_op = &badfs_ops;
+       badfs_root = sb->s_root = d_alloc(NULL, 
+                       &(const struct qstr){ "bad:", 4, 0}); /* len w/o NUL */
+       if (!badfs_root) {
+               iput(root);     /* drop the root inode allocated above */
+               return NULL;
+       }
+       sb->s_root->d_sb = sb;
+       sb->s_root->d_parent = sb->s_root;      /* root is its own parent */
+       d_instantiate(sb->s_root, root);
+       return sb;
+}
+/* Repoint fs's pwd at a new bad directory "dead_pwd" under the badfs root. */
+static void disable_pwd(struct fs_struct *fs)
+{
+       struct inode *inode;
+       struct dentry *dentry;
+
+       inode = badfs_get_inode(badfs_sb, S_IFDIR|0755);
+       if (!inode) {
+               printk(KERN_ERR "disable_pwd(): can't allocate inode\n");
+               return; /* fs is left untouched */
+       }
+       dentry = d_alloc(badfs_root, &(const struct qstr){"dead_pwd", 8, 0});
+       if (!dentry) {
+               iput(inode);    /* drop the inode allocated above */
+               printk(KERN_ERR "disable_pwd(): can't allocate dentry\n");
+               return; /* fs is left untouched */
+       }
+       d_instantiate(dentry, inode);
+       dget(dentry);   /* NOTE(review): confirm where this ref is dropped */
+       set_fs_pwd(fs, badfs_mnt, dentry);
+}
+/* Repoint fs's root at a new bad directory "dead_root" under badfs root. */
+static void disable_root(struct fs_struct *fs)
+{
+       struct inode *inode;
+       struct dentry *dentry;
+
+       inode = badfs_get_inode(badfs_sb, S_IFDIR|0755);
+       if (!inode) {
+               printk(KERN_ERR "disable_root(): can't allocate inode\n");
+               return; /* fs is left untouched */
+       }
+       dentry = d_alloc(badfs_root, &(const struct qstr){"dead_root", 9, 0});
+       if (!dentry) {
+               iput(inode);    /* drop the inode allocated above */
+               printk(KERN_ERR "disable_root(): can't allocate dentry\n");
+               return; /* fs is left untouched */
+       }
+       d_instantiate(dentry, inode);
+       dget(dentry);   /* NOTE(review): confirm where this ref is dropped */
+       set_fs_root(fs, badfs_mnt, dentry);
+}
+
+/* Forced umount helper, called from do_umount() if MNT_FORCE is specified */
+void quiesce_filesystem(struct vfsmount *mnt)
+{
+       struct task_struct *p;
+       struct file *file;
+       struct inode *inode;
+
+       /* we do three passes through the task list, examining:
+        *   1. p->fs{->pwd,root} that can keep this mnt busy
+        *   2. p->files, i.e. open files (we do_close them)
+        *   3. p->mm, i.e. mmaped files (we simply do_munmap them)
+        * A pass restarts after each change; its work may have grown by then.
+        */
+repeat1:
+       read_lock(&tasklist_lock);
+       for_each_task(p) {
+               struct fs_struct *fs;
+
+               /* get a reference to p->fs */
+               task_lock(p);
+               fs = p->fs;
+               if (!fs) {
+                       task_unlock(p);
+                       continue;
+               } else
+                       atomic_inc(&fs->count);
+               task_unlock(p);
+
+               if (fs->pwdmnt == mnt) {
+                       read_unlock(&tasklist_lock);
+                       disable_pwd(fs); /* may sleep */
+                       put_fs_struct(fs);
+                       goto repeat1;
+               }
+               if (fs->rootmnt == mnt) {
+                       read_unlock(&tasklist_lock);
+                       disable_root(fs); /* may sleep */
+                       put_fs_struct(fs);
+                       goto repeat1;
+               }
+               put_fs_struct(fs);
+       }
+       read_unlock(&tasklist_lock);
+
+repeat2:
+       read_lock(&tasklist_lock);
+       for_each_task(p) {
+               unsigned int fd, j = 0;
+               struct files_struct *files;
+
+               /* get a reference to p->files */
+               task_lock(p);
+               files = p->files;
+               if (!files) {
+                       task_unlock(p);
+                       continue;
+               } else
+                       atomic_inc(&files->count);
+               task_unlock(p);
+
+               /* check if this process has open files here */
+               while (1) {
+                       unsigned long set;
+
+                       fd = j * __NFDBITS;
+                       if (fd >= files->max_fdset || fd >= files->max_fds)
+                               break;
+                       set = files->open_fds->fds_bits[j++];
+                       while (set) {
+                               file = (set & 1) ? files->fd[fd] : NULL;
+                               if (file) { /* fd may be reserved only */
+                                       inode = file->f_dentry->d_inode;
+                                       if (inode && (file->f_vfsmnt==mnt)) {
+                                               read_unlock(&tasklist_lock);
+                                               do_close(files, fd);
+                                               put_files_struct(files);
+                                               goto repeat2;
+                                       }
+                               }
+                               fd++;
+                               set >>= 1;
+                       }
+               }
+               put_files_struct(files);
+       }
+       read_unlock(&tasklist_lock);
+
+repeat3:
+       read_lock(&tasklist_lock);
+       for_each_task(p) {
+               struct mm_struct *mm;
+               struct vm_area_struct *vma;
+               unsigned long start, len;
+
+               task_lock(p);   /* get a reference to p->mm */
+               mm = p->mm;
+               if (!mm) {
+                       task_unlock(p);
+                       continue;
+               } else
+                       atomic_inc(&mm->mm_users);
+               task_unlock(p);
+
+               /* check for mmap'd files and unmap them */
+               vmlist_modify_lock(mm);
+               for (vma = mm->mmap; vma; vma=vma->vm_next) {
+                       file = vma->vm_file;
+                       if (!file)
+                               continue;
+                       inode = file->f_dentry->d_inode;
+                       if (!inode || !inode->i_sb)
+                               continue;
+                       if (file->f_vfsmnt == mnt) {
+                               start = vma->vm_start;  /* copy now: vma */
+                               len = vma->vm_end - start; /* may be freed */
+                               vmlist_modify_unlock(mm);
+                               read_unlock(&tasklist_lock);
+                               down(&mm->mmap_sem);
+                               do_munmap(mm, start, len);
+                               up(&mm->mmap_sem);
+                               mmput(mm);
+                               goto repeat3;
+                       }
+               }
+               vmlist_modify_unlock(mm);
+               mmput(mm);
+       }
+       read_unlock(&tasklist_lock);
+}
diff -urN -X dontdiff linux/fs/open.c badfs/fs/open.c
--- linux/fs/open.c     Sat Aug 12 09:11:31 2000
+++ badfs/fs/open.c     Sat Aug 12 09:00:41 2000
@@ -800,10 +800,9 @@
  * releasing the fd. This ensures that one clone task can't release
  * an fd while another clone is opening it.
  */
-asmlinkage long sys_close(unsigned int fd)
+int do_close(struct files_struct *files, unsigned int fd)
 {
        struct file * filp;
-       struct files_struct *files = current->files;
 
        write_lock(&files->file_lock);
        if (fd >= files->max_fds)
@@ -821,6 +820,12 @@
        write_unlock(&files->file_lock);
        return -EBADF;
 }
+/* close(2) entry point: hands fd and current->files to do_close(). */
+asmlinkage long sys_close(unsigned int fd)
+{
+       return do_close(current->files, fd);
+}
+
 
 /*
  * This routine simulates a hangup on the tty, to arrange that users
diff -urN -X dontdiff linux/fs/super.c badfs/fs/super.c
--- linux/fs/super.c    Sat Aug 12 09:11:31 2000
+++ badfs/fs/super.c    Sat Aug 12 09:15:22 2000
@@ -1033,6 +1033,9 @@
                return retval;
        }
 
+       if (flags&MNT_FORCE)
+               quiesce_filesystem(mnt);
+
        spin_lock(&dcache_lock);
        if (atomic_read(&mnt->mnt_count) > 2) {
                spin_unlock(&dcache_lock);
diff -urN -X dontdiff linux/include/linux/fs.h badfs/include/linux/fs.h
--- linux/include/linux/fs.h    Sat Aug 12 09:11:31 2000
+++ badfs/include/linux/fs.h    Sat Aug 12 09:16:28 2000
@@ -890,6 +890,7 @@
 
 /* fs/open.c */
 
+extern int do_close(struct files_struct *, unsigned int);
 asmlinkage long sys_open(const char *, int, int);
 asmlinkage long sys_close(unsigned int);       /* yes, it's really unsigned */
 extern int do_truncate(struct dentry *, loff_t start);
@@ -1191,6 +1192,8 @@
 extern kdev_t ROOT_DEV;
 extern char root_device_name[];
 
+/* fs/badfs/inode.c - used by forced umount */
+extern void quiesce_filesystem(struct vfsmount *);
 
 extern void show_buffers(void);
 extern void mount_root(void);




Reply via email to