On Wed, 11 Aug 2021 15:41:39 +0300
Pavel Tikhomirov <ptikhomi...@virtuozzo.com> wrote:

> 
> 
> On 06.08.2021 17:29, Alexander Mikhalitsyn wrote:
> > After rebase to RHEL 8.4 binfmt_misc fs uses new fscontext API,
> > our implementation is incorrect. We have to use get_tree_keyed()
> > helper instead of get_tree_single() which allows only one
> > superblock per HN.
> > 
> > Fixes: 90fb0e274 ("ve/fs/binfmt: virtualization")
> > 
> > https://jira.sw.ru/browse/PSBM-132709
> > 
> > Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalit...@virtuozzo.com>
> > ---
> >   fs/binfmt_misc.c | 42 ++++++++++++++++++++++++++++++++++++------
> >   1 file changed, 36 insertions(+), 6 deletions(-)
> > 
> > diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
> > index c21d03fd3f15..ad1d9d603034 100644
> > --- a/fs/binfmt_misc.c
> > +++ b/fs/binfmt_misc.c
> > @@ -833,10 +833,8 @@ static const struct file_operations 
> > bm_status_operations = {
> >   static void bm_put_super(struct super_block *sb)
> >   {
> >     struct binfmt_misc *bm_data = BINFMT_MISC(sb);
> > -   struct ve_struct *ve = sb->s_fs_info;
> >   
> >     bm_data->enabled = 0;
> > -   put_ve(ve);
> >   }
> >   
> >   static const struct super_operations s_ops = {
> > @@ -875,8 +873,6 @@ static int bm_fill_super(struct super_block *sb, struct 
> > fs_context *fc)
> >     }
> >   
> >     sb->s_op = &s_ops;
> > -   sb->s_fs_info = ve;
> > -   get_ve(ve);
> >   
> >     bm_data->enabled = 1;
> >   
> > @@ -885,10 +881,36 @@ static int bm_fill_super(struct super_block *sb, 
> > struct fs_context *fc)
> >   
> >   static int bm_get_tree(struct fs_context *fc)
> >   {
> > -   return get_tree_single(fc, bm_fill_super);
> > +   struct ve_struct *ve = get_exec_env();
> > +
> > +   /*
> > +    * We need one binfmt_misc superblock per VE,
> > +    * use get_tree_keyed() helper to get vfs_tree.
> > +    *
> > +    * It allows us to find sb by key (in our case ve is the key),
> > +    * and if it doesn't exists creates new.
> > +    *
> > +    * Important: we take ve refcnt here. It will be put
> > +    * in one of two places:
> > +    * 1. bm_free_fc() on error path (wrong mnt opt provided for instance)
> > +    * 2. bm_umount() when sb refcnt becomes zero (last mount umounted)
> > +    */
> > +   return get_tree_keyed(fc, bm_fill_super, get_ve(ve));
> 
> I can imagine that we can get here as much times as we want (N), each 
> mount syscall would get us here. So in case of no error paths we take N 
> references on same VE but ->kill_sb( would be called only once and we 
> release only one reference. This way VE would likely become stuck forever...

Let see, if you already have superblock initialized, then in sget_fc() function:
        if (test) {
                hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) 
{
                        if (test(old, fc))
                                goto share_extant_sb; <--- get there
                }
then:
        s->s_fs_info = fc->s_fs_info;
        err = set(s, fc);
        if (err) {
                s->s_fs_info = NULL;
                spin_unlock(&sb_lock);
                destroy_unused_super(s);
                return ERR_PTR(err);
        }
        fc->s_fs_info = NULL; <--- here we NULLify s_fs_info on SUCCESS path 
when NEW superblock was created
        s->s_type = fc->fs_type;
        s->s_iflags |= fc->s_iflags;
        strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id));
        list_add_tail(&s->s_list, &super_blocks);
        hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
        spin_unlock(&sb_lock);
        get_filesystem(s->s_type);
        register_shrinker_prepared(&s->s_shrink);
        return s;

share_extant_sb: <--- here we NOT NULLify fc->s_fs_info and refcnt will be 
dropped
        /* PSBM-86208: we mount secondary ploop on host for
         * resize functionality so allow mount in init userns
         * if fs already mounted in non-init userns
         */
        if (user_ns != old->s_user_ns &&
            user_ns != &init_user_ns) {
                spin_unlock(&sb_lock);
                destroy_unused_super(s);
                return ERR_PTR(-EBUSY);
        }
        if (!grab_super(old))
                goto retry;
        destroy_unused_super(s);
        return old;

My comment not fully correct, because I mean that "success" is when new sb 
allocated, but when we
already found superblock existing is also success case. I will fix comment :)

Thanks! ;)

> 
> > +}
> > +
> > +static void bm_free_fc(struct fs_context *fc)
> > +{
> > +   /*
> > +    * fc->s_fs_info will be NULL if *no* error was occured
> > +    * see fs/super.c sget_fc() helper
> > +    */
> > +   if (fc->s_fs_info)
> > +           put_ve(fc->s_fs_info);
> >   }
> >   
> > +
> >   static const struct fs_context_operations bm_context_ops = {
> > +   .free           = bm_free_fc,
> >     .get_tree       = bm_get_tree,
> >   };
> >   
> > @@ -898,6 +920,14 @@ static int bm_init_fs_context(struct fs_context *fc)
> >     return 0;
> >   }
> >   
> > +static void bm_umount(struct super_block *sb)
> > +{
> > +   struct ve_struct *ve = sb->s_fs_info;
> > +
> > +   kill_litter_super(sb);
> > +   put_ve(ve);
> > +}
> > +
> >   static struct linux_binfmt misc_format = {
> >     .module = THIS_MODULE,
> >     .load_binary = load_misc_binary,
> > @@ -907,7 +937,7 @@ static struct file_system_type bm_fs_type = {
> >     .owner          = THIS_MODULE,
> >     .name           = "binfmt_misc",
> >     .init_fs_context = bm_init_fs_context,
> > -   .kill_sb        = kill_litter_super,
> > +   .kill_sb        = bm_umount,
> >     .fs_flags       = FS_VIRTUALIZED | FS_VE_MOUNT,
> >   };
> >   MODULE_ALIAS_FS("binfmt_misc");
> > 
> 
> -- 
> Best regards, Tikhomirov Pavel
> Software Developer, Virtuozzo.

Regards,
Alex
_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to