Al Viro <v...@zeniv.linux.org.uk> writes:

> On Tue, Dec 19, 2017 at 07:40:43PM +0100, Giuseppe Scrivano wrote:
>> Giuseppe Scrivano <gscri...@redhat.com> writes:
>> 
>> > The only issue I've seen with my version is that if I do:
>> >
>> > # unshare -im /bin/sh
>> > # mount -t mqueue mqueue /dev/mqueue
>> > # touch /dev/mqueue/foo
>> > # umount /dev/mqueue
>> > # mount -t mqueue mqueue /dev/mqueue
>> >
>> > then /dev/mqueue/foo doesn't exist at this point.  Your patch does not
>> > have this problem and /dev/mqueue/foo is again accessible after the
>> > second mount.
>> 
>> although, how much of an issue is that?  Is there any other way to delay
>
> You do realize that with your patch you would end up with worse than that -
> mount/touch/umount and now you've got ->mq_sb non-NULL and pointing to freed
> super_block.  With really unpleasant effects when you quit that ipcns...
>
>> the cost of kern_mount_data()?  Most containers have /dev/mqueue mounted
>> but it is not really going to be used.
>
> _What_ cost?  At mount(2) time you are setting the superblock up anyway, so
> what would you be delaying?  kmem_cache_alloc() for struct mount and 
> assignments
> to its fields?  That's noise; if anything, I would expect the main cost with
> a plenty of containers to be in sget() scanning the list of mqueue 
> superblocks.
> And we can get rid of that, while we are at it - to hell with mount_ns(), with
> that approach we can just use mount_nodev() instead.  The logics in
> mq_internal_mount() will deal with multiple instances - if somebody has 
> already
> triggered creation of internal mount, all subsequent calls in that ipcns will
> end up avoiding kern_mount_data() entirely.  And if you have two callers
> racing - sure, you will get two superblocks.  Not for long, though - the first
> one to get to setting ->mq_mnt (serialized on mq_lock) wins, the second loses
> and promptly destroys his vfsmount and superblock.  I seriously suspect that
> variant below would cut down on the cost a whole lot more - as it is, we have
> the total of O(N^2) spent in the loop inside of sget_userns() when we create
> N ipcns and mount in each of those; this patch should cut that to
> O(N)...

If that is where the cost is, is there any point in delaying creating
the internal mount at all?

Eric

>
> diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> index 10b82338415b..e9870b0cda29 100644
> --- a/ipc/mqueue.c
> +++ b/ipc/mqueue.c
> @@ -325,8 +325,9 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
>  static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
>  {
>       struct inode *inode;
> -     struct ipc_namespace *ns = sb->s_fs_info;
> +     struct ipc_namespace *ns = data;
>  
> +     sb->s_fs_info = ns;
>       sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
>       sb->s_blocksize = PAGE_SIZE;
>       sb->s_blocksize_bits = PAGE_SHIFT;
> @@ -343,18 +344,44 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
>       return 0;
>  }
>  
> +static struct file_system_type mqueue_fs_type;
> +/*
> + * Return value is pinned only by reference in ->mq_mnt; it will
> + * live until ipcns dies.  Caller does not need to drop it.
> + */
> +static struct vfsmount *mq_internal_mount(void)
> +{
> +     struct ipc_namespace *ns = current->nsproxy->ipc_ns;
> +     struct vfsmount *m = ns->mq_mnt;
> +     if (m)
> +             return m;
> +     m = kern_mount_data(&mqueue_fs_type, ns);
> +     spin_lock(&mq_lock);
> +     if (unlikely(ns->mq_mnt)) {
> +             spin_unlock(&mq_lock);
> +             if (!IS_ERR(m))
> +                     kern_unmount(m);
> +             return ns->mq_mnt;
> +     }
> +     if (!IS_ERR(m))
> +             ns->mq_mnt = m;
> +     spin_unlock(&mq_lock);
> +     return m;
> +}
> +
>  static struct dentry *mqueue_mount(struct file_system_type *fs_type,
>                        int flags, const char *dev_name,
>                        void *data)
>  {
> -     struct ipc_namespace *ns;
> -     if (flags & MS_KERNMOUNT) {
> -             ns = data;
> -             data = NULL;
> -     } else {
> -             ns = current->nsproxy->ipc_ns;
> -     }
>       return mount_ns(fs_type, flags, data, ns, ns->user_ns, mqueue_fill_super);
> +     struct vfsmount *m;
> +     if (flags & MS_KERNMOUNT)
> +             return mount_nodev(fs_type, flags, data, mqueue_fill_super);
> +     m = mq_internal_mount();
> +     if (IS_ERR(m))
> +             return ERR_CAST(m);
> +     atomic_inc(&m->mnt_sb->s_active);
> +     down_write(&m->mnt_sb->s_umount);
> +     return dget(m->mnt_root);
>  }
>  
>  static void init_once(void *foo)
> @@ -743,13 +770,16 @@ static int prepare_open(struct dentry *dentry, int oflag, int ro,
>  static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
>                     struct mq_attr *attr)
>  {
> -     struct vfsmount *mnt = current->nsproxy->ipc_ns->mq_mnt;
> -     struct dentry *root = mnt->mnt_root;
> +     struct vfsmount *mnt = mq_internal_mount();
> +     struct dentry *root;
>       struct filename *name;
>       struct path path;
>       int fd, error;
>       int ro;
>  
> +     if (IS_ERR(mnt))
> +             return PTR_ERR(mnt);
> +
>       audit_mq_open(oflag, mode, attr);
>  
>       if (IS_ERR(name = getname(u_name)))
> @@ -760,6 +790,7 @@ static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
>               goto out_putname;
>  
>       ro = mnt_want_write(mnt);       /* we'll drop it in any case */
> +     root = mnt->mnt_root;
>       inode_lock(d_inode(root));
>       path.dentry = lookup_one_len(name->name, root, strlen(name->name));
>       if (IS_ERR(path.dentry)) {
> @@ -1534,28 +1565,26 @@ int mq_init_ns(struct ipc_namespace *ns)
>       ns->mq_msgsize_max   = DFLT_MSGSIZEMAX;
>       ns->mq_msg_default   = DFLT_MSG;
>       ns->mq_msgsize_default  = DFLT_MSGSIZE;
> +     ns->mq_mnt = NULL;
>  
> -     ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);
> -     if (IS_ERR(ns->mq_mnt)) {
> -             int err = PTR_ERR(ns->mq_mnt);
> -             ns->mq_mnt = NULL;
> -             return err;
> -     }
>       return 0;
>  }
>  
>  void mq_clear_sbinfo(struct ipc_namespace *ns)
>  {
> -     ns->mq_mnt->mnt_sb->s_fs_info = NULL;
> +     if (ns->mq_mnt)
> +             ns->mq_mnt->mnt_sb->s_fs_info = NULL;
>  }
>  
>  void mq_put_mnt(struct ipc_namespace *ns)
>  {
> -     kern_unmount(ns->mq_mnt);
> +     if (ns->mq_mnt)
> +             kern_unmount(ns->mq_mnt);
>  }
>  
>  static int __init init_mqueue_fs(void)
>  {
> +     struct vfsmount *m;
>       int error;
>  
>       mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
> @@ -1577,6 +1606,10 @@ static int __init init_mqueue_fs(void)
>       if (error)
>               goto out_filesystem;
>  
> +     m = kern_mount_data(&mqueue_fs_type, &init_ipc_ns);
> +     if (IS_ERR(m))
> +             goto out_filesystem;
> +     init_ipc_ns.mq_mnt = m;
>       return 0;
>  
>  out_filesystem:

Reply via email to