Paul Menage wrote:
> [RFC] Support multiply-bindable cgroup subsystems
> 
> This patch allows a cgroup subsystem to be marked as bindable on
> multiple cgroup hierarchies independently, when declared in
> cgroup_subsys.h via MULTI_SUBSYS() rather than SUBSYS().
> 
> The state for such subsystems cannot be accessed directly from a
> task->cgroups (since there's no unique mapping for a task) but instead
> must be accessed via a particular control group object.
> 
> Multiply-bound subsystems are useful in cases where there's no direct
> correspondence between the cgroup configuration and some property of
> the kernel outside of the cgroups subsystem.  So this would not be
> applicable to e.g. the CFS cgroup, since there has to a unique mapping
> from a task to its CFS run queue.
> 
> As an example, the "debug" subsystem is marked multiply-bindable,
> since it has no state outside the cgroups framework itself.
> 
> Example usage:
> 
> mount -t cgroup -o name=foo,debug,cpu cgroup /mnt1
> mount -t cgroup -o name=bar,debug,memory cgroup /mnt2
> 
> Open issues:
> 
> - what should /proc/cgroup show for multiply-bindable subsystems?
>   Currently it shows just one entry for each hierarchy the subsystem
>   is bound to, which means the subsystem doesn't show up when unbound,
>   and is indistinguishable from a regular (singleton) subsystem when
>   singly-bound.
> 
> - should the bind() callback be extended to support passing more
>   information to multiply-bindable subsystems? Or should we just scrap
>   the bind() callback since nothing appears to be using it.
> 

I agree to scrap it.

> - how to stop checkpatch.pl complaining about the way the
>   SUBSYS/MULTI_SUBSYS macros as used to define the cgroup_subsys_id
>   enum in cgroup.h?
> 

I think we can ignore it.

> Signed-off-by: Paul Menage <[email protected]>
> 
> ---
> 
>  include/linux/cgroup.h        |   37 ++++++++++-
>  include/linux/cgroup_subsys.h |    2 -
>  kernel/cgroup.c               |  137 
> ++++++++++++++++++++++++++---------------
>  3 files changed, 123 insertions(+), 53 deletions(-)
> 
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index adf6739..9b5999d 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -39,10 +39,37 @@ extern int cgroupstats_build(struct cgroupstats *stats,
>  
>  extern struct file_operations proc_cgroup_operations;
>  
> -/* Define the enumeration of all cgroup subsystems */
> -#define SUBSYS(_x) _x ## _subsys_id,
> +/* Define the enumeration of all cgroup subsystems. There are two

/*
 * xxx
 */

> + * kinds of subsystems:
> + *
> + * - regular (singleton) subsystems can be only mounted once; these
> + * generally correspond to some system that has substantial state
> + * outside of the cgroups framework, or where the state is being used
> + * to control some external behaviour (e.g. CPU scheduler).  Every
> + * task has an associated state pointer (accessed efficiently through
> + * task->cgroups) for each singleton subsystem.
> + *
> + * - multiply-bound subsystems may be mounted on zero or more
> + * hierarchies. Since there's no unique mapping from a task to a
> + * subsystem state pointer for multiply-bound subsystems, the state of
> + * these subsystems cannot be accessed rapidly from a task
> + * pointer. These correspond to subsystems where most or all of the
> + * state is maintained within the subsystem itself, and/or the
> + * subsystem is used for monitoring rather than control.
> + */
>  enum cgroup_subsys_id {
> +#define SUBSYS(_x) _x ## _subsys_id,
> +#define MULTI_SUBSYS(_x)
>  #include <linux/cgroup_subsys.h>
> +#undef SUBSYS
> +#undef MULTI_SUBSYS
> +     CGROUP_SINGLETON_SUBSYS_COUNT,
> +     CGROUP_DUMMY_ENUM_RESET = CGROUP_SINGLETON_SUBSYS_COUNT - 1,
> +#define SUBSYS(_x)
> +#define MULTI_SUBSYS(_x) _x ## _subsys_id,
> +#include <linux/cgroup_subsys.h>
> +#undef SUBSYS
> +#undef MULTI_SUBSYS
>       CGROUP_SUBSYS_COUNT
>  };
>  #undef SUBSYS
> @@ -231,7 +258,7 @@ struct css_set {
>        * time). Multi-subsystems don't have an entry in here since
>        * there's no unique state for a given task.
>        */
> -     struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
> +     struct cgroup_subsys_state *subsys[CGROUP_SINGLETON_SUBSYS_COUNT];
>  };
>  
>  /*
> @@ -419,8 +446,10 @@ struct cgroup_subsys {
>  };
>  
>  #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
> +#define MULTI_SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
>  #include <linux/cgroup_subsys.h>
>  #undef SUBSYS
> +#undef MULTI_SUBSYS
>  
>  static inline struct cgroup_subsys_state *cgroup_subsys_state(
>       struct cgroup *cgrp, int subsys_id)
> @@ -431,6 +460,8 @@ static inline struct cgroup_subsys_state 
> *cgroup_subsys_state(
>  static inline struct cgroup_subsys_state *task_subsys_state(
>       struct task_struct *task, int subsys_id)
>  {
> +     /* This check is optimized out for most callers */
> +     BUG_ON(subsys_id >= CGROUP_SINGLETON_SUBSYS_COUNT);
>       return rcu_dereference(task->cgroups->subsys[subsys_id]);
>  }
>  
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index 9c8d31b..f78605e 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -14,7 +14,7 @@ SUBSYS(cpuset)
>  /* */
>  
>  #ifdef CONFIG_CGROUP_DEBUG
> -SUBSYS(debug)
> +MULTI_SUBSYS(debug)
>  #endif
>  
>  /* */
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index fcb8181..942a951 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -52,10 +52,18 @@
>  static DEFINE_MUTEX(cgroup_mutex);
>  
>  /* Generate an array of cgroup subsystem pointers */
> -#define SUBSYS(_x) &_x ## _subsys,
>  
>  static struct cgroup_subsys *subsys[] = {
> +#define SUBSYS(_x) &_x ## _subsys,
> +#define MULTI_SUBSYS(_x)
> +#include <linux/cgroup_subsys.h>
> +#undef SUBSYS
> +#undef MULTI_SUBSYS
> +#define SUBSYS(_x)
> +#define MULTI_SUBSYS(_x) &_x ## _subsys,
>  #include <linux/cgroup_subsys.h>
> +#undef SUBSYS
> +#undef MULTI_SUBSYS
>  };
>  
>  /*
> @@ -265,7 +273,7 @@ static struct hlist_head *css_set_hash(struct 
> cgroup_subsys_state *css[])
>       int index;
>       unsigned long tmp = 0UL;
>  
> -     for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
> +     for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++)
>               tmp += (unsigned long)css[i];
>       tmp = (tmp >> 16) ^ tmp;
>  
> @@ -435,7 +443,7 @@ static struct css_set *find_existing_css_set(
>  
>       /* Build the set of subsystem state objects that we want to
>        * see in the new css_set */
> -     for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> +     for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) {
>               if (root->subsys_bits & (1UL << i)) {
>                       /* Subsystem is in this hierarchy. So we want
>                        * the subsystem state from the new
> @@ -529,7 +537,7 @@ static struct css_set *find_css_set(
>       struct css_set *oldcg, struct cgroup *cgrp)
>  {
>       struct css_set *res;
> -     struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
> +     struct cgroup_subsys_state *template[CGROUP_SINGLETON_SUBSYS_COUNT];
>  
>       struct list_head tmp_cg_links;
>  
> @@ -853,6 +861,20 @@ static void cgroup_wakeup_rmdir_waiters(const struct 
> cgroup *cgrp)
>               wake_up_all(&cgroup_rmdir_waitq);
>  }
>  
> +static void init_cgroup_css(struct cgroup_subsys_state *css,
> +                            struct cgroup_subsys *ss,
> +                            struct cgroup *cgrp)
> +{
> +     css->cgroup = cgrp;
> +     atomic_set(&css->refcnt, 1);
> +     css->flags = 0;
> +     css->id = NULL;
> +     if (cgrp == &cgrp->root->top_cgroup)
> +             set_bit(CSS_ROOT, &css->flags);
> +     BUG_ON(cgrp->subsys[ss->subsys_id]);
> +     cgrp->subsys[ss->subsys_id] = css;
> +}
> +
>  static int rebind_subsystems(struct cgroupfs_root *root,
>                            unsigned long final_bits)
>  {
> @@ -863,7 +885,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
>       removed_bits = root->subsys_bits & ~final_bits;
>       added_bits = final_bits & ~root->subsys_bits;
>       /* Check that any added subsystems are currently free */
> -     for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> +     for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) {
>               unsigned long bit = 1UL << i;
>               if (!(bit & added_bits))
>                       continue;
> @@ -883,33 +905,57 @@ static int rebind_subsystems(struct cgroupfs_root *root,
>       /* Process each subsystem */
>       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
>               struct cgroup_subsys *ss = subsys[i];
> +             bool singleton = i < CGROUP_SINGLETON_SUBSYS_COUNT;
>               unsigned long bit = 1UL << i;
>               if (bit & added_bits) {
> +                     struct cgroup_subsys_state *css;
>                       /* We're binding this subsystem to this hierarchy */
>                       BUG_ON(cgrp->subsys[i]);
> -                     BUG_ON(!dummytop->subsys[i]);
> -                     BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
> -                     BUG_ON(!(rootnode.subsys_bits & bit));
> +                     if (singleton) {
> +                             css = dummytop->subsys[i];
> +                             BUG_ON(!css);
> +
> +                             BUG_ON(css->cgroup != dummytop);
> +                             BUG_ON(!(rootnode.subsys_bits & bit));
> +                     } else {
> +                             BUG_ON(dummytop->subsys[i]);
> +                             BUG_ON(rootnode.subsys_bits & bit);
> +                             css = ss->create(ss, cgrp);
> +                             if (IS_ERR(css))
> +                                     return PTR_ERR(css);
> +                             init_cgroup_css(css, ss, cgrp);
> +                     }
>                       mutex_lock(&ss->hierarchy_mutex);
> -                     cgrp->subsys[i] = dummytop->subsys[i];
> -                     cgrp->subsys[i]->cgroup = cgrp;
> +                     cgrp->subsys[i] = css;
> +                     css->cgroup = cgrp;
>                       if (ss->bind)
>                               ss->bind(ss, cgrp);
> -                     rootnode.subsys_bits &= ~bit;
> +                     if (singleton)
> +                             rootnode.subsys_bits &= ~bit;
>                       root->subsys_bits |= bit;
>                       mutex_unlock(&ss->hierarchy_mutex);
>               } else if (bit & removed_bits) {
> +                     struct cgroup_subsys_state *css;
>                       /* We're removing this subsystem */
> -                     BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
> -                     BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
> -                     BUG_ON(rootnode.subsys_bits & bit);
> +                     css = cgrp->subsys[i];
> +                     BUG_ON(css->cgroup != cgrp);
> +                     if (singleton) {
> +                             BUG_ON(css != dummytop->subsys[i]);
> +                             BUG_ON(rootnode.subsys_bits & bit);
> +                     }
>                       mutex_lock(&ss->hierarchy_mutex);
>                       if (ss->bind)
>                               ss->bind(ss, dummytop);
> -                     dummytop->subsys[i]->cgroup = dummytop;
> +                     if (singleton) {
> +                             css->cgroup = dummytop;
> +                     } else {
> +                             /* Is this safe? */
> +                             ss->destroy(ss, cgrp);
> +                     }
>                       cgrp->subsys[i] = NULL;
>                       root->subsys_bits &= ~bit;
> -                     rootnode.subsys_bits |= bit;
> +                     if (singleton)
> +                             rootnode.subsys_bits |= bit;

initially we can see all subsystems in /proc/cgroups, but:

# mount -o cpu,debug xxx /mnt
# remount -o cpu xxx /mnt
# umount /mnt

now no root including rootnode contains debug_subsys, so we
won't see debug_subsys in /proc/cgroups.

I think this inconsistency is wrong.

>                       mutex_unlock(&ss->hierarchy_mutex);
>               } else if (bit & final_bits) {
>                       /* Subsystem state should already exist */
> @@ -974,7 +1020,7 @@ static int parse_cgroupfs_options(char *data,
>                       /* Add all non-disabled subsystems */
>                       int i;
>                       opts->subsys_bits = 0;
> -                     for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> +                     for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) {

I don't see why only singleton subsystems are binded when mount
with '-o all'.

>                               struct cgroup_subsys *ss = subsys[i];
>                               if (!ss->disabled)
>                                       opts->subsys_bits |= 1ul << i;
> @@ -1040,6 +1086,7 @@ static int cgroup_remount(struct super_block *sb, int 
> *flags, char *data)
>       struct cgroupfs_root *root = sb->s_fs_info;
>       struct cgroup *cgrp = &root->top_cgroup;
>       struct cgroup_sb_opts opts;
> +     unsigned long original_bits;
>  
>       mutex_lock(&cgrp->dentry->d_inode->i_mutex);
>       mutex_lock(&cgroup_mutex);
> @@ -1061,9 +1108,13 @@ static int cgroup_remount(struct super_block *sb, int 
> *flags, char *data)
>               goto out_unlock;
>       }
>  
> +     original_bits = root->subsys_bits;
>       ret = rebind_subsystems(root, opts.subsys_bits);
> -     if (ret)
> +     if (ret) {
> +             int tmp = rebind_subsystems(root, original_bits);
> +             BUG_ON(tmp);
>               goto out_unlock;
> +     }
>  
>       /* (re)populate subsystem files */
>       cgroup_populate_dir(cgrp);
> @@ -1265,16 +1316,15 @@ static int cgroup_get_sb(struct file_system_type 
> *fs_type,
>               }
>  
>               ret = rebind_subsystems(root, opts.subsys_bits);
> -             if (ret == -EBUSY) {
> +             if (ret) {
> +                     int tmp = rebind_subsystems(root, 0);
> +                     BUG_ON(tmp);
>                       mutex_unlock(&cgroup_mutex);
>                       mutex_unlock(&inode->i_mutex);
>                       free_cg_links(&tmp_cg_links);
>                       goto drop_new_super;
>               }
>  
> -             /* EBUSY should be the only error here */
> -             BUG_ON(ret);
> -
>               list_add(&root->root_list, &roots);
>               root_count++;
>  
> @@ -2591,20 +2641,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
>       return 0;
>  }
>  
> -static void init_cgroup_css(struct cgroup_subsys_state *css,
> -                            struct cgroup_subsys *ss,
> -                            struct cgroup *cgrp)
> -{
> -     css->cgroup = cgrp;
> -     atomic_set(&css->refcnt, 1);
> -     css->flags = 0;
> -     css->id = NULL;
> -     if (cgrp == dummytop)
> -             set_bit(CSS_ROOT, &css->flags);
> -     BUG_ON(cgrp->subsys[ss->subsys_id]);
> -     cgrp->subsys[ss->subsys_id] = css;
> -}
> -
>  static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
>  {
>       /* We need to take each hierarchy_mutex in a consistent order */
> @@ -2890,21 +2926,23 @@ again:
>  static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
>  {
>       struct cgroup_subsys_state *css;
> -
> +     bool singleton = ss->subsys_id < CGROUP_SINGLETON_SUBSYS_COUNT;
>       printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
>  
> -     /* Create the top cgroup state for this subsystem */
> -     css = ss->create(ss, dummytop);
> -     /* We don't handle early failures gracefully */
> -     BUG_ON(IS_ERR(css));
> -     init_cgroup_css(css, ss, dummytop);
> -
> -     /* Update the init_css_set to contain a subsys
> -      * pointer to this state - since the subsystem is
> -      * newly registered, all tasks and hence the
> -      * init_css_set is in the subsystem's top cgroup. */
> -     init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
> +     if (singleton) {
> +             /* Create the top cgroup state for this subsystem */
> +             css = ss->create(ss, dummytop);
> +             /* We don't handle early failures gracefully */
> +             BUG_ON(IS_ERR(css));
> +             init_cgroup_css(css, ss, dummytop);
>  
> +             /* Update the init_css_set to contain a subsys
> +              * pointer to this state - since the subsystem is
> +              * newly registered, all tasks and hence the
> +              * init_css_set is in the subsystem's top cgroup. */
> +             init_css_set.subsys[ss->subsys_id] = css;
> +             rootnode.subsys_bits |= 1ULL << ss->subsys_id;
> +     }
>       need_forkexit_callback |= ss->fork || ss->exit;
>  
>       /* At system boot, before all subsystems have been
> @@ -2916,7 +2954,6 @@ static void __init cgroup_init_subsys(struct 
> cgroup_subsys *ss)
>       lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
>       ss->active = 1;
>  

tailing blank line

> -     rootnode.subsys_bits |= 1ULL << ss->subsys_id;
>  }
>  
>  /**
> @@ -3285,6 +3322,8 @@ int cgroup_clone(struct task_struct *tsk, struct 
> cgroup_subsys *subsys,
>  
>       /* We shouldn't be called by an unregistered subsystem */
>       BUG_ON(!subsys->active);
> +     /* We can only support singleton subsystems */
> +     BUG_ON(subsys->subsys_id >= CGROUP_SINGLETON_SUBSYS_COUNT);
>  
>       /* First figure out what hierarchy and cgroup we're dealing
>        * with, and pin them so we can drop cgroup_mutex */
> 
> 





_______________________________________________
Containers mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
[email protected]
https://openvz.org/mailman/listinfo/devel

Reply via email to