The patch titled
     r/o bind mounts: track numbers of writers to mounts
has been added to the -mm tree.  Its filename is
     r-o-bind-mounts-track-numbers-of-writers-to-mounts.patch

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/SubmitChecklist when testing your code ***

See http://www.zip.com.au/~akpm/linux/patches/stuff/added-to-mm.txt to find
out what to do about this

The current -mm tree may be found at http://userweb.kernel.org/~akpm/mmotm/

------------------------------------------------------
Subject: r/o bind mounts: track numbers of writers to mounts
From: Dave Hansen <[EMAIL PROTECTED]>

This is the real meat of the entire series.  It actually implements the
tracking of the number of writers to a mount.  However, it causes scalability
problems because there can be hundreds of cpus doing open()/close() on files
on the same mnt at the same time.  Even an atomic_t in the mnt has massive
scalaing problems because the cacheline gets so terribly contended.

This uses a statically-allocated percpu variable.  All want/drop operations
are local to a cpu as long that cpu operates on the same mount, and there are
no writer count imbalances.  Writer count imbalances happen when a write is
taken on one cpu, and released on another, like when an open/close pair is
performed on two

Upon a remount,ro request, all of the data from the percpu variables is
collected (expensive, but very rare) and we determine if there are any
outstanding writers to the mount.

I've written a little benchmark to sit in a loop for a couple of seconds in
several cpus in parallel doing open/write/close loops.

http://sr71.net/~dave/linux/openbench.c

The code in here is a a worst-possible case for this patch.  It does opens on
a _pair_ of files in two different mounts in parallel.  This should cause my
code to lose its "operate on the same mount" optimization completely.  This
worst-case scenario causes a 3% degredation in the benchmark.

I could probably get rid of even this 3%, but it would be more complex than
what I have here, and I think this is getting into acceptable territory.  In
practice, I expect writing more than 3 bytes to a file, as well as disk I/O to
mask any effects that this has.

(To get rid of that 3%, we could have an #defined number of mounts in the
percpu variable.  So, instead of a CPU getting operate only on percpu data
when it accesses only one mount, it could stay on percpu data when it only
accesses N or fewer mounts.)

Acked-by: Al Viro <[EMAIL PROTECTED]>
Signed-off-by: Christoph Hellwig <[EMAIL PROTECTED]>
Signed-off-by: Dave Hansen <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
---

 fs/namespace.c        |  240 +++++++++++++++++++++++++++++++++++++---
 include/linux/mount.h |    7 +
 2 files changed, 232 insertions(+), 15 deletions(-)

diff -puN fs/namespace.c~r-o-bind-mounts-track-numbers-of-writers-to-mounts 
fs/namespace.c
--- a/fs/namespace.c~r-o-bind-mounts-track-numbers-of-writers-to-mounts
+++ a/fs/namespace.c
@@ -17,6 +17,7 @@
 #include <linux/quotaops.h>
 #include <linux/acct.h>
 #include <linux/capability.h>
+#include <linux/cpumask.h>
 #include <linux/module.h>
 #include <linux/sysfs.h>
 #include <linux/seq_file.h>
@@ -55,6 +56,8 @@ static inline unsigned long hash(struct 
        return tmp & (HASH_SIZE - 1);
 }
 
+#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
+
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
        struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char
                INIT_LIST_HEAD(&mnt->mnt_share);
                INIT_LIST_HEAD(&mnt->mnt_slave_list);
                INIT_LIST_HEAD(&mnt->mnt_slave);
+               atomic_set(&mnt->__mnt_writers, 0);
                if (name) {
                        int size = strlen(name) + 1;
                        char *newname = kmalloc(size, GFP_KERNEL);
@@ -88,6 +92,86 @@ struct vfsmount *alloc_vfsmnt(const char
  * we can determine when writes are able to occur to
  * a filesystem.
  */
+/*
+ * __mnt_is_readonly: check whether a mount is read-only
+ * @mnt: the mount to check for its write status
+ *
+ * This shouldn't be used directly ouside of the VFS.
+ * It does not guarantee that the filesystem will stay
+ * r/w, just that it is right *now*.  This can not and
+ * should not be used in place of IS_RDONLY(inode).
+ * mnt_want/drop_write() will _keep_ the filesystem
+ * r/w.
+ */
+int __mnt_is_readonly(struct vfsmount *mnt)
+{
+       return (mnt->mnt_sb->s_flags & MS_RDONLY);
+}
+EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+
+struct mnt_writer {
+       /*
+        * If holding multiple instances of this lock, they
+        * must be ordered by cpu number.
+        */
+       spinlock_t lock;
+       struct lock_class_key lock_class; /* compiles out with !lockdep */
+       unsigned long count;
+       struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+
+static int __init init_mnt_writers(void)
+{
+       int cpu;
+       for_each_possible_cpu(cpu) {
+               struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+               spin_lock_init(&writer->lock);
+               lockdep_set_class(&writer->lock, &writer->lock_class);
+               writer->count = 0;
+       }
+       return 0;
+}
+fs_initcall(init_mnt_writers);
+
+static void unlock_mnt_writers(void)
+{
+       int cpu;
+       struct mnt_writer *cpu_writer;
+
+       for_each_possible_cpu(cpu) {
+               cpu_writer = &per_cpu(mnt_writers, cpu);
+               spin_unlock(&cpu_writer->lock);
+       }
+}
+
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+       if (!cpu_writer->mnt)
+               return;
+       atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+       cpu_writer->count = 0;
+}
+ /*
+ * must hold cpu_writer->lock
+ */
+static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
+                                         struct vfsmount *mnt)
+{
+       if (cpu_writer->mnt == mnt)
+               return;
+       __clear_mnt_count(cpu_writer);
+       cpu_writer->mnt = mnt;
+}
+
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
 /**
  * mnt_want_write - get write access to a mount
  * @mnt: the mount on which to take a write
@@ -100,12 +184,77 @@ struct vfsmount *alloc_vfsmnt(const char
  */
 int mnt_want_write(struct vfsmount *mnt)
 {
-       if (__mnt_is_readonly(mnt))
-               return -EROFS;
-       return 0;
+       int ret = 0;
+       struct mnt_writer *cpu_writer;
+
+       cpu_writer = &get_cpu_var(mnt_writers);
+       spin_lock(&cpu_writer->lock);
+       if (__mnt_is_readonly(mnt)) {
+               ret = -EROFS;
+               goto out;
+       }
+       use_cpu_writer_for_mount(cpu_writer, mnt);
+       cpu_writer->count++;
+out:
+       spin_unlock(&cpu_writer->lock);
+       put_cpu_var(mnt_writers);
+       return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
+static void lock_mnt_writers(void)
+{
+       int cpu;
+       struct mnt_writer *cpu_writer;
+
+       for_each_possible_cpu(cpu) {
+               cpu_writer = &per_cpu(mnt_writers, cpu);
+               spin_lock(&cpu_writer->lock);
+               __clear_mnt_count(cpu_writer);
+               cpu_writer->mnt = NULL;
+       }
+}
+
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count.  Make sure it
+ * does not get too far out of whack.
+ */
+static void handle_write_count_underflow(struct vfsmount *mnt)
+{
+       if (atomic_read(&mnt->__mnt_writers) >=
+           MNT_WRITER_UNDERFLOW_LIMIT)
+               return;
+       /*
+        * It isn't necessary to hold all of the locks
+        * at the same time, but doing it this way makes
+        * us share a lot more code.
+        */
+       lock_mnt_writers();
+       /*
+        * vfsmount_lock is for mnt_flags.
+        */
+       spin_lock(&vfsmount_lock);
+       /*
+        * If coalescing the per-cpu writer counts did not
+        * get us back to a positive writer count, we have
+        * a bug.
+        */
+       if ((atomic_read(&mnt->__mnt_writers) < 0) &&
+           !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
+               printk(KERN_DEBUG "leak detected on mount(%p) writers "
+                               "count: %d\n",
+                       mnt, atomic_read(&mnt->__mnt_writers));
+               WARN_ON(1);
+               /* use the flag to keep the dmesg spam down */
+               mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
+       }
+       spin_unlock(&vfsmount_lock);
+       unlock_mnt_writers();
+}
+
 /**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
@@ -116,23 +265,61 @@ EXPORT_SYMBOL_GPL(mnt_want_write);
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
+       int must_check_underflow = 0;
+       struct mnt_writer *cpu_writer;
+
+       cpu_writer = &get_cpu_var(mnt_writers);
+       spin_lock(&cpu_writer->lock);
+
+       use_cpu_writer_for_mount(cpu_writer, mnt);
+       if (cpu_writer->count > 0) {
+               cpu_writer->count--;
+       } else {
+               must_check_underflow = 1;
+               atomic_dec(&mnt->__mnt_writers);
+       }
+
+       spin_unlock(&cpu_writer->lock);
+       /*
+        * Logically, we could call this each time,
+        * but the __mnt_writers cacheline tends to
+        * be cold, and makes this expensive.
+        */
+       if (must_check_underflow)
+               handle_write_count_underflow(mnt);
+       /*
+        * This could be done right after the spinlock
+        * is taken because the spinlock keeps us on
+        * the cpu, and disables preemption.  However,
+        * putting it here bounds the amount that
+        * __mnt_writers can underflow.  Without it,
+        * we could theoretically wrap __mnt_writers.
+        */
+       put_cpu_var(mnt_writers);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
-/*
- * __mnt_is_readonly: check whether a mount is read-only
- * @mnt: the mount to check for its write status
- *
- * This shouldn't be used directly ouside of the VFS.
- * It does not guarantee that the filesystem will stay
- * r/w, just that it is right *now*.  This can not and
- * should not be used in place of IS_RDONLY(inode).
- */
-int __mnt_is_readonly(struct vfsmount *mnt)
+int mnt_make_readonly(struct vfsmount *mnt)
 {
-       return (mnt->mnt_sb->s_flags & MS_RDONLY);
+       int ret = 0;
+
+       lock_mnt_writers();
+       /*
+        * With all the locks held, this value is stable
+        */
+       if (atomic_read(&mnt->__mnt_writers) > 0) {
+               ret = -EBUSY;
+               goto out;
+       }
+       /*
+        * actually set mount's r/o flag here to make
+        * __mnt_is_readonly() true, which keeps anyone
+        * from doing a successful mnt_want_write().
+        */
+out:
+       unlock_mnt_writers();
+       return ret;
 }
-EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
@@ -327,7 +514,30 @@ static struct vfsmount *clone_mnt(struct
 
 static inline void __mntput(struct vfsmount *mnt)
 {
+       int cpu;
        struct super_block *sb = mnt->mnt_sb;
+       /*
+        * We don't have to hold all of the locks at the
+        * same time here because we know that we're the
+        * last reference to mnt and that no new writers
+        * can come in.
+        */
+       for_each_possible_cpu(cpu) {
+               struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
+               if (cpu_writer->mnt != mnt)
+                       continue;
+               spin_lock(&cpu_writer->lock);
+               atomic_add(cpu_writer->count, &mnt->__mnt_writers);
+               cpu_writer->count = 0;
+               spin_unlock(&cpu_writer->lock);
+       }
+       /*
+        * This probably indicates that somebody messed
+        * up a mnt_want/drop_write() pair.  If this
+        * happens, the filesystem was probably unable
+        * to make r/w->r/o transitions.
+        */
+       WARN_ON(atomic_read(&mnt->__mnt_writers));
        dput(mnt->mnt_root);
        free_vfsmnt(mnt);
        deactivate_super(sb);
diff -puN 
include/linux/mount.h~r-o-bind-mounts-track-numbers-of-writers-to-mounts 
include/linux/mount.h
--- a/include/linux/mount.h~r-o-bind-mounts-track-numbers-of-writers-to-mounts
+++ a/include/linux/mount.h
@@ -14,6 +14,7 @@
 
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/nodemask.h>
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
 
@@ -30,6 +31,7 @@ struct mnt_namespace;
 #define MNT_RELATIME   0x20
 
 #define MNT_SHRINKABLE 0x100
+#define MNT_IMBALANCED_WRITE_COUNT     0x200 /* just for debugging */
 
 #define MNT_SHARED     0x1000  /* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE 0x2000  /* if the vfsmount is a unbindable mount */
@@ -61,6 +63,11 @@ struct vfsmount {
        atomic_t mnt_count;
        int mnt_expiry_mark;            /* true if marked for expiry */
        int mnt_pinned;
+       /*
+        * This value is not stable unless all of the mnt_writers[] spinlocks
+        * are held, and all mnt_writer[]s on this mount have 0 as their ->count
+        */
+       atomic_t __mnt_writers;
 };
 
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
_

Patches currently in -mm which might be from [EMAIL PROTECTED] are

enable-hotplug-memory-remove-for-ppc64.patch
reiserfs-eliminate-private-use-of-struct-file-in-xattr.patch
hppfs-pass-vfsmount-to-dentry_open.patch
check-for-null-vfsmount-in-dentry_open.patch
fix-up-new-filp-allocators.patch
do-namei_flags-calculation-inside-open_namei.patch
merge-open_namei-and-do_filp_open.patch
r-o-bind-mounts-stub-functions.patch
r-o-bind-mounts-create-helper-to-drop-file-write-access.patch
r-o-bind-mounts-drop-write-during-emergency-remount.patch
r-o-bind-mounts-elevate-write-count-for-vfs_rmdir.patch
r-o-bind-mounts-elevate-write-count-for-callers-of-vfs_mkdir.patch
r-o-bind-mounts-elevate-mnt_writers-for-unlink-callers.patch
r-o-bind-mounts-elevate-write-count-for-xattr_permission-callers.patch
r-o-bind-mounts-elevate-write-count-for-ncp_ioctl.patch
r-o-bind-mounts-write-counts-for-time-functions.patch
r-o-bind-mounts-elevate-write-count-for-do_utimes.patch
r-o-bind-mounts-write-count-for-file_update_time.patch
r-o-bind-mounts-write-counts-for-link-symlink.patch
r-o-bind-mounts-elevate-write-count-for-ioctls.patch
r-o-bind-mounts-elevate-write-count-for-opens.patch
r-o-bind-mounts-get-write-access-for-vfs_rename-callers.patch
r-o-bind-mounts-elevate-write-count-for-chmod-chown-callers.patch
r-o-bind-mounts-write-counts-for-truncate.patch
r-o-bind-mounts-elevate-count-for-xfs-timestamp-updates.patch
r-o-bind-mounts-make-access-use-new-r-o-helper.patch
r-o-bind-mounts-check-mnt-instead-of-superblock-directly.patch
r-o-bind-mounts-get-callers-of-vfs_mknod-create.patch
r-o-bind-mounts-track-numbers-of-writers-to-mounts.patch
r-o-bind-mounts-honor-mount-writer-counts-at-remount.patch
r-o-bind-mounts-debugging-for-missed-calls.patch
reiser4.patch
page-owner-tracking-leak-detector.patch

-
To unsubscribe from this list: send the line "unsubscribe mm-commits" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to