On Thu, May 10, 2018 at 12:52:36PM +0300, Kirill Tkhai wrote:
> Imagine a big node with many cpus, memory cgroups and containers.
> Let we have 200 containers, every container has 10 mounts,
> and 10 cgroups. All container tasks don't touch foreign
> containers mounts. If there is intensive pages write,
> and global reclaim happens, a writing task has to iterate
> over all memcgs to shrink slab, before it's able to go
> to shrink_page_list().
> 
> Iteration over all the memcg slabs is very expensive:
> the task has to visit 200 * 10 = 2000 shrinkers
> for every memcg, and since there are 2000 memcgs,
> the total calls are 2000 * 2000 = 4000000.
> 
> So, the shrinker makes 4 million do_shrink_slab() calls
> just to try to isolate SWAP_CLUSTER_MAX pages in one
> of the actively writing memcg via shrink_page_list().
> I've observed a node spending almost 100% in kernel,
> making useless iteration over already shrinked slab.
> 
> This patch adds bitmap of memcg-aware shrinkers to memcg.
> The size of the bitmap depends on bitmap_nr_ids, and during
> memcg life it's maintained to be enough to fit bitmap_nr_ids
> shrinkers. Every bit in the map is related to corresponding
> shrinker id.
> 
> Next patches will maintain set bit only for really charged
> memcg. This will allow shrink_slab() to increase its
> performance in significant way. See the last patch for
> the numbers.
> 
> Signed-off-by: Kirill Tkhai <ktk...@virtuozzo.com>
> ---
>  include/linux/memcontrol.h |   21 ++++++++
>  mm/memcontrol.c            |  116 
> ++++++++++++++++++++++++++++++++++++++++++++
>  mm/vmscan.c                |   16 ++++++
>  3 files changed, 152 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 6cbea2f25a87..e5e7e0fc7158 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -105,6 +105,17 @@ struct lruvec_stat {
>       long count[NR_VM_NODE_STAT_ITEMS];
>  };
>  
> +#ifdef CONFIG_MEMCG_SHRINKER
> +/*
> + * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
> + * which have elements charged to this memcg.
> + */
> +struct memcg_shrinker_map {
> +     struct rcu_head rcu;
> +     unsigned long map[0];
> +};
> +#endif /* CONFIG_MEMCG_SHRINKER */
> +

AFAIR we don't normally ifdef structure definitions.

>  /*
>   * per-zone information in memory controller.
>   */
> @@ -118,6 +129,9 @@ struct mem_cgroup_per_node {
>  
>       struct mem_cgroup_reclaim_iter  iter[DEF_PRIORITY + 1];
>  
> +#ifdef CONFIG_MEMCG_SHRINKER
> +     struct memcg_shrinker_map __rcu *shrinker_map;
> +#endif
>       struct rb_node          tree_node;      /* RB tree node */
>       unsigned long           usage_in_excess;/* Set to the value by which */
>                                               /* the soft limit is exceeded*/
> @@ -1255,4 +1269,11 @@ static inline void memcg_put_cache_ids(void)
>  
>  #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
>  
> +#ifdef CONFIG_MEMCG_SHRINKER

> +#define MEMCG_SHRINKER_MAP(memcg, nid) (memcg->nodeinfo[nid]->shrinker_map)

I don't really like this helper macro. Accessing shrinker_map directly
looks cleaner IMO.

> +
> +extern int memcg_shrinker_nr_max;

As I've mentioned before, the capacity of shrinker map should be a
private business of memcontrol.c IMHO. We shouldn't use it in vmscan.c
as max shrinker id, instead we should introduce another variable for
this, private to vmscan.c.

> +extern int memcg_expand_shrinker_maps(int old_id, int id);

... Then this function would take just one argument, max id, and would
update shrinker_map capacity if necessary in memcontrol.c under the
corresponding mutex, which would look much more readable IMHO as all
shrinker_map related manipulations would be isolated in memcontrol.c.

> +#endif /* CONFIG_MEMCG_SHRINKER */
> +
>  #endif /* _LINUX_MEMCONTROL_H */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 3df3efa7ff40..18e0fdf302a9 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -322,6 +322,116 @@ struct workqueue_struct *memcg_kmem_cache_wq;
>  
>  #endif /* !CONFIG_SLOB */
>  
> +#ifdef CONFIG_MEMCG_SHRINKER
> +int memcg_shrinker_nr_max;

memcg_shrinker_map_capacity, may be?

> +static DEFINE_MUTEX(shrinkers_nr_max_mutex);

memcg_shrinker_map_mutex?

> +
> +static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
> +{
> +     kvfree(container_of(head, struct memcg_shrinker_map, rcu));
> +}
> +
> +static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
> +                                      int size, int old_size)

If you followed my advice and made the shrinker_map_capacity private to
memcontrol.c, you wouldn't need to pass old_size here either, just max
shrinker id.

> +{
> +     struct memcg_shrinker_map *new, *old;
> +     int nid;
> +
> +     lockdep_assert_held(&shrinkers_nr_max_mutex);
> +
> +     for_each_node(nid) {
> +             old = rcu_dereference_protected(MEMCG_SHRINKER_MAP(memcg, nid), 
> true);
> +             /* Not yet online memcg */
> +             if (old_size && !old)
> +                     return 0;
> +
> +             new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
> +             if (!new)
> +                     return -ENOMEM;
> +
> +             /* Set all old bits, clear all new bits */
> +             memset(new->map, (int)0xff, old_size);
> +             memset((void *)new->map + old_size, 0, size - old_size);
> +
> +             rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
> +             if (old)
> +                     call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
> +     }
> +
> +     return 0;
> +}
> +
> +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
> +{
> +     struct mem_cgroup_per_node *pn;
> +     struct memcg_shrinker_map *map;
> +     int nid;
> +
> +     if (memcg == root_mem_cgroup)
> +             return;

Nit: there's mem_cgroup_is_root() helper.

> +
> +     mutex_lock(&shrinkers_nr_max_mutex);

Why do you need to take the mutex here? You don't access shrinker map
capacity here AFAICS.

> +     for_each_node(nid) {
> +             pn = mem_cgroup_nodeinfo(memcg, nid);
> +             map = rcu_dereference_protected(pn->shrinker_map, true);
> +             if (map)
> +                     call_rcu(&map->rcu, memcg_free_shrinker_map_rcu);
> +             rcu_assign_pointer(pn->shrinker_map, NULL);
> +     }
> +     mutex_unlock(&shrinkers_nr_max_mutex);
> +}
> +
> +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
> +{
> +     int ret, size = memcg_shrinker_nr_max/BITS_PER_BYTE;
> +
> +     if (memcg == root_mem_cgroup)
> +             return 0;

Nit: mem_cgroup_is_root().

> +
> +     mutex_lock(&shrinkers_nr_max_mutex);

> +     ret = memcg_expand_one_shrinker_map(memcg, size, 0);

I don't think it's worth reusing the function designed for reallocating
shrinker maps for initial allocation. Please just fold the code here -
it will make both 'alloc' and 'expand' easier to follow IMHO.

> +     mutex_unlock(&shrinkers_nr_max_mutex);
> +
> +     if (ret)
> +             memcg_free_shrinker_maps(memcg);
> +
> +     return ret;
> +}
> +

> +static struct idr mem_cgroup_idr;

Stray change.

> +
> +int memcg_expand_shrinker_maps(int old_nr, int nr)
> +{
> +     int size, old_size, ret = 0;
> +     struct mem_cgroup *memcg;
> +
> +     old_size = old_nr / BITS_PER_BYTE;
> +     size = nr / BITS_PER_BYTE;
> +
> +     mutex_lock(&shrinkers_nr_max_mutex);
> +

> +     if (!root_mem_cgroup)
> +             goto unlock;

This wants a comment.

> +
> +     for_each_mem_cgroup(memcg) {
> +             if (memcg == root_mem_cgroup)

Nit: mem_cgroup_is_root().

> +                     continue;
> +             ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
> +             if (ret)
> +                     goto unlock;
> +     }
> +unlock:
> +     mutex_unlock(&shrinkers_nr_max_mutex);
> +     return ret;
> +}
> +#else /* CONFIG_MEMCG_SHRINKER */
> +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
> +{
> +     return 0;
> +}
> +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
> +#endif /* CONFIG_MEMCG_SHRINKER */
> +
>  /**
>   * mem_cgroup_css_from_page - css of the memcg associated with a page
>   * @page: page of interest
> @@ -4471,6 +4581,11 @@ static int mem_cgroup_css_online(struct 
> cgroup_subsys_state *css)
>  {
>       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
>  
> +     if (memcg_alloc_shrinker_maps(memcg)) {
> +             mem_cgroup_id_remove(memcg);
> +             return -ENOMEM;
> +     }
> +
>       /* Online state pins memcg ID, memcg ID pins CSS */
>       atomic_set(&memcg->id.ref, 1);
>       css_get(css);
> @@ -4522,6 +4637,7 @@ static void mem_cgroup_css_free(struct 
> cgroup_subsys_state *css)
>       vmpressure_cleanup(&memcg->vmpressure);
>       cancel_work_sync(&memcg->high_work);
>       mem_cgroup_remove_from_trees(memcg);
> +     memcg_free_shrinker_maps(memcg);
>       memcg_free_kmem(memcg);
>       mem_cgroup_free(memcg);
>  }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index d691beac1048..d8a2870710e0 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -174,12 +174,26 @@ static DEFINE_IDR(shrinker_idr);
>  
>  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
>  {
> -     int id, ret;
> +     int id, nr, ret;
>  
>       down_write(&shrinker_rwsem);
>       ret = id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
>       if (ret < 0)
>               goto unlock;
> +
> +     if (id >= memcg_shrinker_nr_max) {
> +             nr = memcg_shrinker_nr_max * 2;
> +             if (nr == 0)
> +                     nr = BITS_PER_BYTE;
> +             BUG_ON(id >= nr);

The logic defining shrinker map capacity growth should be private to
memcontrol.c IMHO.

> +
> +             if (memcg_expand_shrinker_maps(memcg_shrinker_nr_max, nr)) {
> +                     idr_remove(&shrinker_idr, id);
> +                     goto unlock;
> +             }
> +             memcg_shrinker_nr_max = nr;
> +     }
> +
>       shrinker->id = id;
>       ret = 0;
>  unlock:
> 

Reply via email to