Paul Menage wrote:
> [RFC] Allow cgroup hierarchies to be created with no bound subsystems
> 
> This patch removes the restriction that a cgroup hierarchy must have
> at least one bound subsystem. The mount option "none" is treated as an
> explicit request for no bound subsystems.
> 
> A hierarchy with no subsystems can be useful for plan task tracking,

s/plan/plain/ ;)

> and is also a step towards the support for multiply-bindable
> subsystems in a later patch in this series.
> 
> As part of this change, the hierarchy id is no longer calculated from
> the bitmask of subsystems in the hierarchy (since this is not
> guaranteed to be unique) but is allocated via an ida. Reference counts
> on cgroups from css_set objects are now taken explicitly one per
> hierarchy, rather than one per subsystem.
> 
> Example usage:
> 
> mount -t cgroup -o none,name=foo cgroup /mnt/cgroup
> 
> Based on the "no-op"/"none" subsystem concept proposed by
> [email protected]
> 
> Signed-off-by: Paul Menage <[email protected]>
> 
> ---
> 
>  kernel/cgroup.c |  158 
> ++++++++++++++++++++++++++++++++++++-------------------
>  1 files changed, 104 insertions(+), 54 deletions(-)
> 
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 9cdbbac..4a7ef2c 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -72,6 +72,9 @@ struct cgroupfs_root {
>        */
>       unsigned long subsys_bits;
>  
> +     /* Unique id for this hierarchy. */
> +     int hierarchy_id;
> +
>       /* The bitmask of subsystems currently attached to this hierarchy */
>       unsigned long actual_subsys_bits;
>  
> @@ -142,6 +145,10 @@ struct css_id {
>  static LIST_HEAD(roots);
>  static int root_count;
>  
> +static DEFINE_IDA(hierarchy_ida);
> +static int next_hierarchy_id;
> +static DEFINE_SPINLOCK(hierarchy_id_lock);
> +
>  /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
>  #define dummytop (&rootnode.top_cgroup)
>  
> @@ -259,42 +266,10 @@ static struct hlist_head *css_set_hash(struct 
> cgroup_subsys_state *css[])
>   * compiled into their kernel but not actually in use */
>  static int use_task_css_set_links __read_mostly;
>  
> -/* When we create or destroy a css_set, the operation simply
> - * takes/releases a reference count on all the cgroups referenced
> - * by subsystems in this css_set. This can end up multiple-counting
> - * some cgroups, but that's OK - the ref-count is just a
> - * busy/not-busy indicator; ensuring that we only count each cgroup
> - * once would require taking a global lock to ensure that no
> - * subsystems moved between hierarchies while we were doing so.
> - *
> - * Possible TODO: decide at boot time based on the number of
> - * registered subsystems and the number of CPUs or NUMA nodes whether
> - * it's better for performance to ref-count every subsystem, or to
> - * take a global lock and only add one ref count to each hierarchy.
> - */
> -
> -/*
> - * unlink a css_set from the list and free it
> - */
> -static void unlink_css_set(struct css_set *cg)
> +static void __put_css_set(struct css_set *cg, int taskexit)
>  {
>       struct cg_cgroup_link *link;
>       struct cg_cgroup_link *saved_link;
> -
> -     hlist_del(&cg->hlist);
> -     css_set_count--;
> -
> -     list_for_each_entry_safe(link, saved_link, &cg->cg_links,
> -                              cg_link_list) {
> -             list_del(&link->cg_link_list);
> -             list_del(&link->cgrp_link_list);
> -             kfree(link);
> -     }
> -}
> -
> -static void __put_css_set(struct css_set *cg, int taskexit)
> -{
> -     int i;
>       /*
>        * Ensure that the refcount doesn't hit zero while any readers
>        * can see it. Similar to atomic_dec_and_lock(), but for an
> @@ -307,20 +282,27 @@ static void __put_css_set(struct css_set *cg, int 
> taskexit)
>               write_unlock(&css_set_lock);
>               return;
>       }
> -     unlink_css_set(cg);
> -     write_unlock(&css_set_lock);
>  
> -     rcu_read_lock();
> -     for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> -             struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
> +     /* This css_set is dead. unlink it and release cgroup refcounts */
> +     hlist_del(&cg->hlist);
> +     css_set_count--;
> +
> +     list_for_each_entry_safe(link, saved_link, &cg->cg_links,
> +                              cg_link_list) {
> +             struct cgroup *cgrp = link->cgrp;
> +             list_del(&link->cg_link_list);
> +             list_del(&link->cgrp_link_list);
>               if (atomic_dec_and_test(&cgrp->count) &&
>                   notify_on_release(cgrp)) {
>                       if (taskexit)
>                               set_bit(CGRP_RELEASABLE, &cgrp->flags);
>                       check_for_release(cgrp);
>               }
> +
> +             kfree(link);
>       }
> -     rcu_read_unlock();
> +
> +     write_unlock(&css_set_lock);
>       kfree(cg);
>  }
>  
> @@ -513,6 +495,7 @@ static void link_css_set(struct list_head *tmp_cg_links,
>                               cgrp_link_list);
>       link->cg = cg;
>       link->cgrp = cgrp;
> +     atomic_inc(&cgrp->count);
>       list_move(&link->cgrp_link_list, &cgrp->css_sets);
>       /*
>        * Always add links to the tail of the list so that the list
> @@ -533,7 +516,6 @@ static struct css_set *find_css_set(
>  {
>       struct css_set *res;
>       struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
> -     int i;
>  
>       struct list_head tmp_cg_links;
>  
> @@ -572,10 +554,6 @@ static struct css_set *find_css_set(
>  
>       write_lock(&css_set_lock);
>       /* Add reference counts and links from the new css_set. */
> -     for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> -             struct cgroup *cgrp = res->subsys[i]->cgroup;
> -             atomic_inc(&cgrp->count);
> -     }
>       list_for_each_entry_safe(link, saved_link, &oldcg->cg_links,
>                                cg_link_list) {
>               struct cgroup *c = link->cgrp;
> @@ -955,6 +933,10 @@ struct cgroup_sb_opts {
>       unsigned long flags;
>       char *release_agent;
>       char *name;
> +
> +     /* User explicitly requested empty subsystem */
> +     bool none;
> +
>       /* A flag indicating that a root was created from this options block */
>       bool created_root;
>  };
> @@ -983,6 +965,9 @@ static int parse_cgroupfs_options(char *data,
>                               if (!ss->disabled)
>                                       opts->subsys_bits |= 1ul << i;
>                       }
> +             } else if (!strcmp(token, "none")) {
> +                     /* Explicitly have no subsystems */
> +                     opts->none = true;
>               } else if (!strcmp(token, "noprefix")) {
>                       set_bit(ROOT_NOPREFIX, &opts->flags);
>               } else if (!strncmp(token, "release_agent=", 14)) {
> @@ -1019,7 +1004,16 @@ static int parse_cgroupfs_options(char *data,
>               }
>       }
>  
> -     /* We can't have an empty hierarchy */
> +     /* Consistency checks */
> +
> +     /* Can't specify "none" and some subsystems */
> +     if (opts->subsys_bits && opts->none)
> +             return -EINVAL;
> +
> +     /*
> +      * We either have to specify by name or by subsystems. (So all
> +      * empty hierarchies must have a name).
> +      */
>       if (!opts->subsys_bits && !opts->name)
>               return -EINVAL;
>  
> @@ -1085,6 +1079,7 @@ static void init_cgroup_housekeeping(struct cgroup 
> *cgrp)
>       INIT_LIST_HEAD(&cgrp->release_list);
>       init_rwsem(&cgrp->pids_mutex);
>  }
> +
>  static void init_cgroup_root(struct cgroupfs_root *root)
>  {
>       struct cgroup *cgrp = &root->top_cgroup;
> @@ -1096,6 +1091,18 @@ static void init_cgroup_root(struct cgroupfs_root 
> *root)
>       init_cgroup_housekeeping(cgrp);
>  }
>  
> +static void init_root_id(struct cgroupfs_root *root)
> +{
> +     BUG_ON(!ida_pre_get(&hierarchy_ida, GFP_KERNEL));

we should return -errno, BUG_ON() on this is wrong

> +     spin_lock(&hierarchy_id_lock);
> +     if (ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
> +                           &root->hierarchy_id)) {
> +             BUG_ON(ida_get_new(&hierarchy_ida, &root->hierarchy_id));
> +     }

next_hierarchy_id is redundant, just use ida_get_new() instead of
ida_get_new_above()

and I think we should check EAGAIN:

/*
 * ida_get_new - allocate new ID
 ...
 * If memory is required, it will return -EAGAIN, you should unlock
 * and go back to the idr_pre_get() call. ...
 */

> +     next_hierarchy_id = root->hierarchy_id + 1;
> +     spin_unlock(&hierarchy_id_lock);
> +}
> +
>  static int cgroup_test_super(struct super_block *sb, void *data)
>  {
>       struct cgroup_sb_opts *new = data;
> @@ -1116,7 +1123,7 @@ static struct cgroupfs_root 
> *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
>  {
>       struct cgroupfs_root *root;
>  
> -     if (!opts->subsys_bits)
> +     if (!opts->subsys_bits && !opts->none)
>               return ERR_PTR(-EINVAL);
>  

Shouldn't we check this in parse_cgroupfs_options() instead?

>       root = kzalloc(sizeof(*root), GFP_KERNEL);
> @@ -1124,6 +1131,8 @@ static struct cgroupfs_root 
> *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
>               return ERR_PTR(-ENOMEM);
>  
>       init_cgroup_root(root);
> +     init_root_id(root);
> +
>       root->subsys_bits = opts->subsys_bits;
>       root->flags = opts->flags;
>       if (opts->release_agent)
> @@ -1134,6 +1143,15 @@ static struct cgroupfs_root 
> *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
>       return root;
>  }
>  
> +static void cgroup_drop_root(struct cgroupfs_root *root)
> +{
> +     BUG_ON(!root->hierarchy_id);

I think this BUG_ON() is redundant, if root->hierarchy_id == 0,
we are freeing the dummy root, and kfree(root) should trigger
kernel oops.

> +     spin_lock(&hierarchy_id_lock);
> +     ida_remove(&hierarchy_ida, root->hierarchy_id);
> +     spin_unlock(&hierarchy_id_lock);
> +     kfree(root);
> +}
> +
>  static int cgroup_set_super(struct super_block *sb, void *data)
>  {
>       int ret;
> @@ -1145,7 +1163,7 @@ static int cgroup_set_super(struct super_block *sb, 
> void *data)
>               return PTR_ERR(root);
>       ret = set_anon_super(sb, NULL);
>       if (ret) {
> -             kfree(root);
> +             cgroup_drop_root(root);
>               return ret;
>       }
>  
> @@ -1331,7 +1349,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
>       mutex_unlock(&cgroup_mutex);
>  
>       kill_litter_super(sb);
> -     kfree(root);
> +     cgroup_drop_root(root);
>  }
>  
>  static struct file_system_type cgroup_fs_type = {
> @@ -2975,7 +2993,7 @@ int __init cgroup_init(void)
>       /* Add init_css_set to the hash table */
>       hhead = css_set_hash(init_css_set.subsys);
>       hlist_add_head(&init_css_set.hlist, hhead);
> -
> +     init_root_id(&rootnode);
>       err = register_filesystem(&cgroup_fs_type);
>       if (err < 0)
>               goto out;
> @@ -3030,7 +3048,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
>               struct cgroup *cgrp;
>               int count = 0;
>  
> -             seq_printf(m, "%lu:", root->subsys_bits);
> +             seq_printf(m, "%d:", root->hierarchy_id);
>               for_each_subsys(root, ss)
>                       seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
>               if (strlen(root->name))
> @@ -3076,8 +3094,8 @@ static int proc_cgroupstats_show(struct seq_file *m, 
> void *v)
>       mutex_lock(&cgroup_mutex);
>       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
>               struct cgroup_subsys *ss = subsys[i];
> -             seq_printf(m, "%s\t%lu\t%d\t%d\n",
> -                        ss->name, ss->root->subsys_bits,
> +             seq_printf(m, "%s\t%d\t%d\t%d\n",
> +                        ss->name, ss->root->hierarchy_id,
>                          ss->root->number_of_cgroups, !ss->disabled);
>       }
>       mutex_unlock(&cgroup_mutex);
> @@ -3800,8 +3818,8 @@ static int current_css_set_cg_links_read(struct cgroup 
> *cont,
>                       name = c->dentry->d_name.name;
>               else
>                       name = "?";
> -             seq_printf(seq, "Root %lu group %s\n",
> -                        c->root->subsys_bits, name);
> +             seq_printf(seq, "Root %d group %s\n",
> +                        c->root->hierarchy_id, name);
>               rcu_read_unlock();
>       }
>       task_unlock(current);
> @@ -3809,6 +3827,33 @@ static int current_css_set_cg_links_read(struct cgroup 
> *cont,
>       return 0;
>  }
>  
> +#define MAX_TASKS_SHOWN_PER_CSS 25
> +static int cgroup_css_links_read(struct cgroup *cont,
> +                              struct cftype *cft,
> +                              struct seq_file *seq)
> +{
> +     struct cg_cgroup_link *link, *saved_link;
> +     write_lock_irq(&css_set_lock);
> +     list_for_each_entry_safe(link, saved_link, &cont->css_sets,
> +                              cgrp_link_list) {
> +             struct css_set *cg = link->cg;
> +             struct task_struct *task, *saved_task;
> +             int count = 0;

> +             seq_printf(seq, "css_set %p\n", cg);
> +             list_for_each_entry_safe(task, saved_task, &cg->tasks,
> +                                      cg_list) {
> +                     if (count++ > MAX_TASKS_SHOWN_PER_CSS) {

actually this prints at most 26 tasks but not 25

and what's the concern to not print out all the tasks? the buffer
limit of seqfile?

> +                             seq_puts(seq, "  ...\n");
> +                             break;
> +                     } else {
> +                             seq_printf(seq, "  task %d\n", task->pid);

I guess we should call task_pid_vnr(tsk) instead of accessing task->pid
directly.

> +                     }
> +             }
> +     }
> +     write_unlock_irq(&css_set_lock);
> +     return 0;
> +}
> +
>  static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
>  {
>       return test_bit(CGRP_RELEASABLE, &cgrp->flags);
> @@ -3840,6 +3885,11 @@ static struct cftype debug_files[] =  {
>       },
>  
>       {
> +             .name = "cgroup_css_links",
> +             .read_seq_string = cgroup_css_links_read,
> +     },
> +
> +     {
>               .name = "releasable",
>               .read_u64 = releasable_read,
>       },
> 
> 
> 




_______________________________________________
Containers mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
[email protected]
https://openvz.org/mailman/listinfo/devel

Reply via email to