Andrea Righi wrote:
> Dirty pages in the page cache can be processed asynchronously by kernel
> threads (pdflush) using a writeback policy. For this reason the real
> writes to the underlying block devices occur in a different IO context
> with respect to the task that originally generated the dirty pages
> involved in the IO operation. This makes the tracking and throttling of
> writeback IO more complicated than that of synchronous IO.
> 
> The page_cgroup infrastructure, currently available only for the memory
> cgroup controller, can be used to store the owner of each page and
> opportunely track the writeback IO. This information is encoded in
> page_cgroup->flags.

  You encode the id in page_cgroup->flags; if a cgroup gets removed, IMHO, you
  should remove the corresponding id from the flags.
  One more thing: if a task is moved from one cgroup to another, the id in
  the flags also needs to be changed.

> 
> An owner can be identified using a generic ID number, and the following
> interfaces are provided to store and retrieve this information:
> 
>   unsigned long page_cgroup_get_owner(struct page *page);
>   int page_cgroup_set_owner(struct page *page, unsigned long id);
>   int page_cgroup_copy_owner(struct page *npage, struct page *opage);
> 
> The io-throttle controller uses the cgroup css_id() as the owner's ID
> number.
> 
> A big part of this code is taken from Ryo and Hirokazu's bio-cgroup
> controller (http://people.valinux.co.jp/~ryov/bio-cgroup/).
> 
> Signed-off-by: Andrea Righi <[email protected]>
> Signed-off-by: Hirokazu Takahashi <[email protected]>
> Signed-off-by: Ryo Tsuruta <[email protected]>
> ---
>  include/linux/memcontrol.h  |    6 +++
>  include/linux/mmzone.h      |    4 +-
>  include/linux/page_cgroup.h |   33 +++++++++++++-
>  init/Kconfig                |    4 ++
>  mm/Makefile                 |    3 +-
>  mm/memcontrol.c             |    6 +++
>  mm/page_cgroup.c            |   95 
> ++++++++++++++++++++++++++++++++++++++-----
>  7 files changed, 135 insertions(+), 16 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 18146c9..f3e0e64 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -37,6 +37,8 @@ struct mm_struct;
>   * (Of course, if memcg does memory allocation in future, GFP_KERNEL is 
> sane.)
>   */
>  
> +extern void __init_mem_page_cgroup(struct page_cgroup *pc);
> +
>  extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
>                               gfp_t gfp_mask);
>  /* for swap handling */
> @@ -120,6 +122,10 @@ extern bool mem_cgroup_oom_called(struct task_struct 
> *task);
>  #else /* CONFIG_CGROUP_MEM_RES_CTLR */
>  struct mem_cgroup;
>  
> +static inline void __init_mem_page_cgroup(struct page_cgroup *pc)
> +{
> +}
> +
>  static inline int mem_cgroup_newpage_charge(struct page *page,
>                                       struct mm_struct *mm, gfp_t gfp_mask)
>  {
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 186ec6a..b178eb9 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -607,7 +607,7 @@ typedef struct pglist_data {
>       int nr_zones;
>  #ifdef CONFIG_FLAT_NODE_MEM_MAP      /* means !SPARSEMEM */
>       struct page *node_mem_map;
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +#ifdef CONFIG_PAGE_TRACKING
>       struct page_cgroup *node_page_cgroup;
>  #endif
>  #endif
> @@ -958,7 +958,7 @@ struct mem_section {
>  
>       /* See declaration of similar field in struct zone */
>       unsigned long *pageblock_flags;
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +#ifdef CONFIG_PAGE_TRACKING
>       /*
>        * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
>        * section. (see memcontrol.h/page_cgroup.h about this.)
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 7339c7b..f24d081 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -1,7 +1,7 @@
>  #ifndef __LINUX_PAGE_CGROUP_H
>  #define __LINUX_PAGE_CGROUP_H
>  
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +#ifdef CONFIG_PAGE_TRACKING
>  #include <linux/bit_spinlock.h>
>  /*
>   * Page Cgroup can be considered as an extended mem_map.
> @@ -12,11 +12,38 @@
>   */
>  struct page_cgroup {
>       unsigned long flags;
> -     struct mem_cgroup *mem_cgroup;
>       struct page *page;
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +     struct mem_cgroup *mem_cgroup;
>       struct list_head lru;           /* per cgroup LRU list */
> +#endif
>  };
>  
> +/*
> + * use lower 16 bits for flags and reserve the rest for the page tracking id
> + */
> +#define PAGE_TRACKING_ID_SHIFT       (16)
> +#define PAGE_TRACKING_ID_BITS \
> +             (8 * sizeof(unsigned long) - PAGE_TRACKING_ID_SHIFT)
> +
> +/* NOTE: must be called with page_cgroup() held */
> +static inline unsigned long page_cgroup_get_id(struct page_cgroup *pc)
> +{
> +     return pc->flags >> PAGE_TRACKING_ID_SHIFT;
> +}
> +
> +/* NOTE: must be called with page_cgroup() held */
> +static inline void page_cgroup_set_id(struct page_cgroup *pc, unsigned long 
> id)
> +{
> +     WARN_ON(id >= (1UL << PAGE_TRACKING_ID_BITS));
> +     pc->flags &= (1UL << PAGE_TRACKING_ID_SHIFT) - 1;
> +     pc->flags |= (unsigned long)(id << PAGE_TRACKING_ID_SHIFT);
> +}
> +
> +unsigned long page_cgroup_get_owner(struct page *page);
> +int page_cgroup_set_owner(struct page *page, unsigned long id);
> +int page_cgroup_copy_owner(struct page *npage, struct page *opage);
> +
>  void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
>  void __init page_cgroup_init(void);
>  struct page_cgroup *lookup_page_cgroup(struct page *page);
> @@ -71,7 +98,7 @@ static inline void unlock_page_cgroup(struct page_cgroup 
> *pc)
>       bit_spin_unlock(PCG_LOCK, &pc->flags);
>  }
>  
> -#else /* CONFIG_CGROUP_MEM_RES_CTLR */
> +#else /* CONFIG_PAGE_TRACKING */
>  struct page_cgroup;
>  
>  static inline void __meminit pgdat_page_cgroup_init(struct pglist_data 
> *pgdat)
> diff --git a/init/Kconfig b/init/Kconfig
> index 7be4d38..5428ac7 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -569,6 +569,7 @@ config CGROUP_MEM_RES_CTLR
>       bool "Memory Resource Controller for Control Groups"
>       depends on CGROUPS && RESOURCE_COUNTERS
>       select MM_OWNER
> +     select PAGE_TRACKING
>       help
>         Provides a memory resource controller that manages both anonymous
>         memory and page cache. (See Documentation/cgroups/memory.txt)
> @@ -611,6 +612,9 @@ endif # CGROUPS
>  config MM_OWNER
>       bool
>  
> +config PAGE_TRACKING
> +     bool
> +
>  config SYSFS_DEPRECATED
>       bool
>  
> diff --git a/mm/Makefile b/mm/Makefile
> index ec73c68..b94e074 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -37,4 +37,5 @@ else
>  obj-$(CONFIG_SMP) += allocpercpu.o
>  endif
>  obj-$(CONFIG_QUICKLIST) += quicklist.o
> -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
> +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
> +obj-$(CONFIG_PAGE_TRACKING) += page_cgroup.o
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e44fb0f..69d1c31 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2524,6 +2524,12 @@ struct cgroup_subsys mem_cgroup_subsys = {
>       .use_id = 1,
>  };
>  
> +void __meminit __init_mem_page_cgroup(struct page_cgroup *pc)
> +{
> +     pc->mem_cgroup = NULL;
> +     INIT_LIST_HEAD(&pc->lru);
> +}
> +
>  #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
>  
>  static int __init disable_swap_account(char *s)
> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> index 791905c..b3b394c 100644
> --- a/mm/page_cgroup.c
> +++ b/mm/page_cgroup.c
> @@ -3,6 +3,7 @@
>  #include <linux/bootmem.h>
>  #include <linux/bit_spinlock.h>
>  #include <linux/page_cgroup.h>
> +#include <linux/blk-io-throttle.h>
>  #include <linux/hash.h>
>  #include <linux/slab.h>
>  #include <linux/memory.h>
> @@ -14,9 +15,8 @@ static void __meminit
>  __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
>  {
>       pc->flags = 0;
> -     pc->mem_cgroup = NULL;
>       pc->page = pfn_to_page(pfn);
> -     INIT_LIST_HEAD(&pc->lru);
> +     __init_mem_page_cgroup(pc);
>  }
>  static unsigned long total_usage;
>  
> @@ -74,7 +74,7 @@ void __init page_cgroup_init(void)
>  
>       int nid, fail;
>  
> -     if (mem_cgroup_disabled())
> +     if (mem_cgroup_disabled() && iothrottle_disabled())
>               return;
>  
>       for_each_online_node(nid)  {
> @@ -83,12 +83,13 @@ void __init page_cgroup_init(void)
>                       goto fail;
>       }
>       printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
> -     printk(KERN_INFO "please try cgroup_disable=memory option if you"
> -     " don't want\n");
> +     printk(KERN_INFO
> +             "try cgroup_disable=memory,blockio option if you don't want\n");
>       return;
>  fail:
>       printk(KERN_CRIT "allocation of page_cgroup was failed.\n");
> -     printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
> +     printk(KERN_CRIT
> +             "try cgroup_disable=memory,blockio boot option\n");
>       panic("Out of memory");
>  }
>  
> @@ -243,12 +244,85 @@ static int __meminit page_cgroup_callback(struct 
> notifier_block *self,
>  
>  #endif
>  
> +/**
> + * page_cgroup_get_owner() - get the owner ID of a page
> + * @page:    the page we want to find the owner
> + *
> + * Returns the owner ID of the page, 0 means that the owner cannot be
> + * retrieved.
> + **/
> +unsigned long page_cgroup_get_owner(struct page *page)
> +{
> +     struct page_cgroup *pc;
> +     unsigned long ret;
> +
> +     pc = lookup_page_cgroup(page);
> +     if (unlikely(!pc))
> +             return 0;
> +
> +     lock_page_cgroup(pc);
> +     ret = page_cgroup_get_id(pc);
> +     unlock_page_cgroup(pc);
> +     return ret;
> +}
> +
> +/**
> + * page_cgroup_set_owner() - set the owner ID of a page
> + * @page:    the page we want to tag
> + * @id:              the ID number that will be associated to page
> + *
> + * Returns 0 if the owner is correctly associated to the page. Returns a
> + * negative value in case of failure.
> + **/
> +int page_cgroup_set_owner(struct page *page, unsigned long id)
> +{
> +     struct page_cgroup *pc;
> +
> +     pc = lookup_page_cgroup(page);
> +     if (unlikely(!pc))
> +             return -ENOENT;
> +
> +     lock_page_cgroup(pc);
> +     page_cgroup_set_id(pc, id);
> +     unlock_page_cgroup(pc);
> +     return 0;
> +}
> +
> +/**
> + * page_cgroup_copy_owner() - copy the owner ID of a page into another page
> + * @npage:   the page where we want to copy the owner
> + * @opage:   the page from which we want to copy the ID
> + *
> + * Returns 0 if the owner is correctly associated to npage. Returns a 
> negative
> + * value in case of failure.
> + **/
> +int page_cgroup_copy_owner(struct page *npage, struct page *opage)
> +{
> +     struct page_cgroup *npc, *opc;
> +     unsigned long id;
> +
> +     npc = lookup_page_cgroup(npage);
> +     if (unlikely(!npc))
> +             return -ENOENT;
> +     opc = lookup_page_cgroup(opage);
> +     if (unlikely(!opc))
> +             return -ENOENT;
> +     lock_page_cgroup(opc);
> +     lock_page_cgroup(npc);
> +     id = page_cgroup_get_id(opc);
> +     page_cgroup_set_id(npc, id);
> +     unlock_page_cgroup(npc);
> +     unlock_page_cgroup(opc);
> +
> +     return 0;
> +}
> +
>  void __init page_cgroup_init(void)
>  {
>       unsigned long pfn;
>       int fail = 0;
>  
> -     if (mem_cgroup_disabled())
> +     if (mem_cgroup_disabled() && iothrottle_disabled())
>               return;
>  
>       for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
> @@ -257,14 +331,15 @@ void __init page_cgroup_init(void)
>               fail = init_section_page_cgroup(pfn);
>       }
>       if (fail) {
> -             printk(KERN_CRIT "try cgroup_disable=memory boot option\n");
> +             printk(KERN_CRIT
> +                     "try cgroup_disable=memory,blockio boot option\n");
>               panic("Out of memory");
>       } else {
>               hotplug_memory_notifier(page_cgroup_callback, 0);
>       }
>       printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
> -     printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
> -     " want\n");
> +     printk(KERN_INFO
> +             "try cgroup_disable=memory,blockio option if you don't want\n");
>  }
>  
>  void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)

-- 
Regards
Gui Jianfeng

_______________________________________________
Containers mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
[email protected]
https://openvz.org/mailman/listinfo/devel

Reply via email to