[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-03-30 Thread Andrea Righi
On Sun, Feb 21, 2010 at 01:28:35PM -0800, David Rientjes wrote:

[snip]

  +static struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
  +{
  +   struct page_cgroup *pc;
  +   struct mem_cgroup *mem = NULL;
  +
  +   pc = lookup_page_cgroup(page);
  +   if (unlikely(!pc))
  +   return NULL;
  +   lock_page_cgroup(pc);
  +   if (PageCgroupUsed(pc)) {
  +   mem = pc->mem_cgroup;
  +   if (mem)
  +   css_get(&mem->css);
  +   }
  +   unlock_page_cgroup(pc);
  +   return mem;
  +}
 
 Is it possible to merge this with try_get_mem_cgroup_from_page()?

Agreed.

 
  +
  +void mem_cgroup_charge_dirty(struct page *page,
  +   enum zone_stat_item idx, int charge)
  +{
  +   struct mem_cgroup *mem;
  +   struct mem_cgroup_stat_cpu *cpustat;
  +   unsigned long flags;
  +   int cpu;
  +
  +   if (mem_cgroup_disabled())
  +   return;
  +   /* Translate the zone_stat_item into a mem_cgroup_stat_index */
  +   switch (idx) {
  +   case NR_FILE_DIRTY:
  +   idx = MEM_CGROUP_STAT_FILE_DIRTY;
  +   break;
  +   case NR_WRITEBACK:
  +   idx = MEM_CGROUP_STAT_WRITEBACK;
  +   break;
  +   case NR_WRITEBACK_TEMP:
  +   idx = MEM_CGROUP_STAT_WRITEBACK_TEMP;
  +   break;
  +   case NR_UNSTABLE_NFS:
  +   idx = MEM_CGROUP_STAT_UNSTABLE_NFS;
  +   break;
  +   default:
  +   return;
 
 WARN()?  We don't want to silently leak counters.

Agreed.

 
  +   }
  +   /* Charge the memory cgroup statistics */
  +   mem = get_mem_cgroup_from_page(page);
  +   if (!mem) {
  +   mem = root_mem_cgroup;
  +   css_get(&mem->css);
  +   }
 
 get_mem_cgroup_from_page() should probably handle the root_mem_cgroup case 
 and return a reference from it.

Right. But I'd prefer to use try_get_mem_cgroup_from_page() without
changing the behaviour of this function.

 
  +
  +   local_irq_save(flags);
  +   cpu = get_cpu();
  +   cpustat = &mem->stat.cpustat[cpu];
  +   __mem_cgroup_stat_add_safe(cpustat, idx, charge);
 
 get_cpu()?  Preemption is already disabled, just use smp_processor_id().

mmmh... actually, we can just copy the code from
mem_cgroup_charge_statistics(), so local_irq_save/restore are not
necessarily needed and we can just use get_cpu()/put_cpu().

  +   put_cpu();
  +   local_irq_restore(flags);
  +   css_put(&mem->css);
  +}
  +
   static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
  enum lru_list idx)
   {
  @@ -992,6 +1061,97 @@ static unsigned int get_swappiness(struct mem_cgroup 
  *memcg)
  return swappiness;
   }
   
  +static unsigned long get_dirty_bytes(struct mem_cgroup *memcg)
  +{
  +   struct cgroup *cgrp = memcg->css.cgroup;
  +   unsigned long dirty_bytes;
  +
  +   /* root ? */
  +   if (cgrp->parent == NULL)
  +   return vm_dirty_bytes;
  +
  +   spin_lock(&memcg->reclaim_param_lock);
  +   dirty_bytes = memcg->dirty_bytes;
  +   spin_unlock(&memcg->reclaim_param_lock);
  +
  +   return dirty_bytes;
  +}
  +
  +unsigned long mem_cgroup_dirty_bytes(void)
  +{
  +   struct mem_cgroup *memcg;
  +   unsigned long dirty_bytes;
  +
  +   if (mem_cgroup_disabled())
  +   return vm_dirty_bytes;
  +
  +   rcu_read_lock();
  +   memcg = mem_cgroup_from_task(current);
  +   if (memcg == NULL)
  +   dirty_bytes = vm_dirty_bytes;
  +   else
  +   dirty_bytes = get_dirty_bytes(memcg);
  +   rcu_read_unlock();
 
 The rcu_read_lock() isn't protecting anything here.

Right!

 
  +
  +   return dirty_bytes;
  +}
  +
  +u64 mem_cgroup_page_state(enum memcg_page_stat_item item)
  +{
  +   struct mem_cgroup *memcg;
  +   struct cgroup *cgrp;
  +   u64 ret = 0;
  +
  +   if (mem_cgroup_disabled())
  +   return 0;
  +
  +   rcu_read_lock();
 
 Again, this isn't necessary.

OK. I'll apply your changes to the next version of this patch.

Thanks for reviewing!

-Andrea
___
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

___
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel


[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-03-30 Thread Andrea Righi
On Mon, Feb 22, 2010 at 09:44:42PM +0530, Balbir Singh wrote:
[snip]
  +void mem_cgroup_charge_dirty(struct page *page,
  +   enum zone_stat_item idx, int charge)
  +{
  +   struct mem_cgroup *mem;
  +   struct mem_cgroup_stat_cpu *cpustat;
  +   unsigned long flags;
  +   int cpu;
  +
  +   if (mem_cgroup_disabled())
  +   return;
  +   /* Translate the zone_stat_item into a mem_cgroup_stat_index */
  +   switch (idx) {
  +   case NR_FILE_DIRTY:
  +   idx = MEM_CGROUP_STAT_FILE_DIRTY;
  +   break;
  +   case NR_WRITEBACK:
  +   idx = MEM_CGROUP_STAT_WRITEBACK;
  +   break;
  +   case NR_WRITEBACK_TEMP:
  +   idx = MEM_CGROUP_STAT_WRITEBACK_TEMP;
  +   break;
  +   case NR_UNSTABLE_NFS:
  +   idx = MEM_CGROUP_STAT_UNSTABLE_NFS;
  +   break;
  +   default:
  +   return;
  +   }
  +   /* Charge the memory cgroup statistics */
  +   mem = get_mem_cgroup_from_page(page);
  +   if (!mem) {
  +   mem = root_mem_cgroup;
  +   css_get(&mem->css);
  +   }
  +
  +   local_irq_save(flags);
  +   cpu = get_cpu();
 
 Kamezawa is in the process of changing these, so you might want to
 look at and integrate with those patches when they are ready.

OK, I'll rebase the patch to -mm. Are those changes already included in
mmotm?

Thanks,
-Andrea
___
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

___
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel


[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-25 Thread Minchan Kim
Hi

On Tue, Feb 23, 2010 at 8:58 PM, Andrea Righi ari...@develer.com wrote:
 On Mon, Feb 22, 2010 at 01:07:32PM -0500, Vivek Goyal wrote:
+unsigned long mem_cgroup_dirty_bytes(void)
+{
+       struct mem_cgroup *memcg;
+       unsigned long dirty_bytes;
+
+       if (mem_cgroup_disabled())
+               return vm_dirty_bytes;
+
+       rcu_read_lock();
+       memcg = mem_cgroup_from_task(current);
+       if (memcg == NULL)
+               dirty_bytes = vm_dirty_bytes;
+       else
+               dirty_bytes = get_dirty_bytes(memcg);
+       rcu_read_unlock();
  
   The rcu_read_lock() isn't protecting anything here.
 
  Right!

 Are we not protecting memcg pointer using rcu here?

 Vivek, you are right:

  mem_cgroup_from_task() -> task_subsys_state() -> rcu_dereference()

 So, this *must* be RCU protected.

So, Doesn't mem_cgroup_from_task in mem_cgroup_can_attach need RCU, too?


-- 
Kind regards,
Minchan Kim
___
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

___
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel


[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-25 Thread KAMEZAWA Hiroyuki
On Fri, 26 Feb 2010 00:36:15 +0900
Minchan Kim minchan@gmail.com wrote:

 Hi
 
 On Tue, Feb 23, 2010 at 8:58 PM, Andrea Righi ari...@develer.com wrote:
  On Mon, Feb 22, 2010 at 01:07:32PM -0500, Vivek Goyal wrote:
 +unsigned long mem_cgroup_dirty_bytes(void)
 +{
 +       struct mem_cgroup *memcg;
 +       unsigned long dirty_bytes;
 +
 +       if (mem_cgroup_disabled())
 +               return vm_dirty_bytes;
 +
 +       rcu_read_lock();
 +       memcg = mem_cgroup_from_task(current);
 +       if (memcg == NULL)
 +               dirty_bytes = vm_dirty_bytes;
 +       else
 +               dirty_bytes = get_dirty_bytes(memcg);
 +       rcu_read_unlock();
   
The rcu_read_lock() isn't protecting anything here.
  
   Right!
 
  Are we not protecting memcg pointer using rcu here?
 
  Vivek, you are right:
 
   mem_cgroup_from_task() - task_subsys_state() - rcu_dereference()
 
  So, this *must* be RCU protected.
 
 So, Doesn't mem_cgroup_from_task in mem_cgroup_can_attach need RCU, too?
 
Hm ? I don't read the whole thread but can_attach() is called under
cgroup_mutex(). So, it doesn't need to use RCU.

Thanks,
-Kame


___
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

___
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel


[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-25 Thread Minchan Kim
Hi, Kame.

On Fri, Feb 26, 2010 at 9:23 AM, KAMEZAWA Hiroyuki
kamezawa.hir...@jp.fujitsu.com wrote:
 On Fri, 26 Feb 2010 00:36:15 +0900
 Minchan Kim minchan@gmail.com wrote:

 Hi

 On Tue, Feb 23, 2010 at 8:58 PM, Andrea Righi ari...@develer.com wrote:
  On Mon, Feb 22, 2010 at 01:07:32PM -0500, Vivek Goyal wrote:
 +unsigned long mem_cgroup_dirty_bytes(void)
 +{
 +       struct mem_cgroup *memcg;
 +       unsigned long dirty_bytes;
 +
 +       if (mem_cgroup_disabled())
 +               return vm_dirty_bytes;
 +
 +       rcu_read_lock();
 +       memcg = mem_cgroup_from_task(current);
 +       if (memcg == NULL)
 +               dirty_bytes = vm_dirty_bytes;
 +       else
 +               dirty_bytes = get_dirty_bytes(memcg);
 +       rcu_read_unlock();
   
The rcu_read_lock() isn't protecting anything here.
  
   Right!
 
  Are we not protecting memcg pointer using rcu here?
 
  Vivek, you are right:
 
   mem_cgroup_from_task() - task_subsys_state() - rcu_dereference()
 
  So, this *must* be RCU protected.

 So, Doesn't mem_cgroup_from_task in mem_cgroup_can_attach need RCU, too?

 Hm ? I don't read the whole thread but can_attach() is called under
 cgroup_mutex(). So, it doesn't need to use RCU.

Vivek mentioned memcg is protected by RCU if I understand his intention right.
So I commented that without enough knowledge of memcg.
After your comment, I dive into the code.

Just out of curiosity.

Really, memcg is protected by RCU?
I think most of RCU around memcg is for protecting task_struct and
cgroup_subsys_state.
The memcg is protected by cgroup_mutex as you mentioned.
Am I missing something?

 Thanks,
 -Kame






-- 
Kind regards,
Minchan Kim
___
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

___
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel


[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-25 Thread KAMEZAWA Hiroyuki
Hi,

On Fri, 26 Feb 2010 13:50:04 +0900
Minchan Kim minchan@gmail.com wrote:

  Hm ? I don't read the whole thread but can_attach() is called under
  cgroup_mutex(). So, it doesn't need to use RCU.
 
 Vivek mentioned memcg is protected by RCU if I understand his intention right.
 So I commented that without enough knowledge of memcg.
 After your comment, I dive into the code.
 
 Just out of curiosity.
 
 Really, memcg is protected by RCU?
yes. All cgroup subsystem is protected by RCU.

 I think most of RCU around memcg is for protecting task_struct and
 cgroup_subsys_state.
 The memcg is protected by cgroup_mutex as you mentioned.
 Am I missing something?

There are several levels of protections.

cgroup subsystem's ->destroy() callback is finally called by

As this.

 768 synchronize_rcu();
 769 
 770 mutex_lock(&cgroup_mutex);
 771 /*
 772  * Release the subsystem state objects.
 773  */
 774 for_each_subsys(cgrp->root, ss)
 775 ss->destroy(ss, cgrp);
 776 
 777 cgrp->root->number_of_cgroups--;
 778 mutex_unlock(&cgroup_mutex);

Before here, 
- there are no tasks under this cgroup (cgroup's refcnt is 0)
   cgroup is marked as REMOVED.

Then, this access
rcu_read_lock();
mem = mem_cgroup_from_task(task);
if (css_tryget(&mem->css))   <=== checks cgroup refcnt

rcu_read_unlock()
is O.K.

And, it's guaranteed that we don't have to do complicated fine-grain check
if cgroup_mutex() is held.

Because cgroup_mutex() is system-wide heavy lock, this refcnt+RCU trick is
used and works quite well.

Thanks,
-Kame

___
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

___
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel


[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-25 Thread Minchan Kim
On Fri, Feb 26, 2010 at 2:01 PM, KAMEZAWA Hiroyuki
kamezawa.hir...@jp.fujitsu.com wrote:
 Hi,

 On Fri, 26 Feb 2010 13:50:04 +0900
 Minchan Kim minchan@gmail.com wrote:

  Hm ? I don't read the whole thread but can_attach() is called under
  cgroup_mutex(). So, it doesn't need to use RCU.

 Vivek mentioned memcg is protected by RCU if I understand his intention 
 right.
 So I commented that without enough knowledge of memcg.
 After your comment, I dive into the code.

 Just out of curiosity.

 Really, memcg is protected by RCU?
 yes. All cgroup subsystem is protected by RCU.

 I think most of RCU around memcg is for protecting task_struct and
 cgroup_subsys_state.
 The memcg is protected by cgroup_mutex as you mentioned.
 Am I missing something?

 There are several levels of protections.

 cgroup subsystem's -destroy() call back is finally called by

 As this.

  768                 synchronize_rcu();
  769
  770                 mutex_lock(cgroup_mutex);
  771                 /*
  772                  * Release the subsystem state objects.
  773                  */
  774                 for_each_subsys(cgrp-root, ss)
  775                         ss-destroy(ss, cgrp);
  776
  777                 cgrp-root-number_of_cgroups--;
  778                 mutex_unlock(cgroup_mutex);

 Before here,
        - there are no tasks under this cgroup (cgroup's refcnt is 0)
           cgroup is marked as REMOVED.

 Then, this access
        rcu_read_lock();
        mem = mem_cgroup_from_task(task);
        if (css_tryget(&mem->css))   <=== checks cgroup refcnt

If it is, do we always need css_tryget after mem_cgroup_from_task
without cgroup_mutex to make sure css is valid?

But I found several cases that don't call css_tryget

1. mm_match_cgroup
It's used by page_referenced_xxx. so we I think we don't grab
cgroup_mutex at that time.

2. mem_cgroup_oom_called
I think in here we don't grab cgroup_mutex, too.

I guess some design would cover that problems.
Could you tell me if you don't mind?
Sorry for bothering you.

Thanks, Kame.



-- 
Kind regards,
Minchan Kim
___
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

___
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel


[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-25 Thread KAMEZAWA Hiroyuki
On Fri, 26 Feb 2010 14:53:39 +0900
Minchan Kim minchan@gmail.com wrote:

 On Fri, Feb 26, 2010 at 2:01 PM, KAMEZAWA Hiroyuki
 kamezawa.hir...@jp.fujitsu.com wrote:
  Hi,
 
  On Fri, 26 Feb 2010 13:50:04 +0900
  Minchan Kim minchan@gmail.com wrote:
 
   Hm ? I don't read the whole thread but can_attach() is called under
   cgroup_mutex(). So, it doesn't need to use RCU.
 
  Vivek mentioned memcg is protected by RCU if I understand his intention 
  right.
  So I commented that without enough knowledge of memcg.
  After your comment, I dive into the code.
 
  Just out of curiosity.
 
  Really, memcg is protected by RCU?
  yes. All cgroup subsystem is protected by RCU.
 
  I think most of RCU around memcg is for protecting task_struct and
  cgroup_subsys_state.
  The memcg is protected by cgroup_mutex as you mentioned.
  Am I missing something?
 
  There are several levels of protections.
 
  cgroup subsystem's -destroy() call back is finally called by
 
  As this.
 
   768                 synchronize_rcu();
   769
   770                 mutex_lock(cgroup_mutex);
   771                 /*
   772                  * Release the subsystem state objects.
   773                  */
   774                 for_each_subsys(cgrp-root, ss)
   775                         ss-destroy(ss, cgrp);
   776
   777                 cgrp-root-number_of_cgroups--;
   778                 mutex_unlock(cgroup_mutex);
 
  Before here,
         - there are no tasks under this cgroup (cgroup's refcnt is 0)
            cgroup is marked as REMOVED.
 
  Then, this access
         rcu_read_lock();
         mem = mem_cgroup_from_task(task);
         if (css_tryget(&mem->css))   <=== checks cgroup refcnt
 
 If it is, do we always need css_tryget after mem_cgroup_from_task
 without cgroup_mutex to make sure css is valid?
 
On a case by cases. 

 But I found several cases that don't call css_tryget
 
 1. mm_match_cgroup
 It's used by page_referenced_xxx. so we I think we don't grab
 cgroup_mutex at that time.
 
yes. but all check are done under RCU. And this function never
access contents of memory which pointer holds.
And, please consider the whole context.

mem_cgroup_try_charge()
mem_cout =. (refcnt +1)

try_to_free_mem_cgroup_pages(mem_cont)

shrink_xxx_list()

page_referenced_anon(page, mem_cont)
mm_match_cgroup(mm, mem_cont)

Then, this mem_cont (2nd argument to mm_match_cgroup) is always valid.
rcu_read_lock();
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
rcu_read_unlock();
return memcg != mem_cont;

Here,
1. mem_cont is never reused. (because refcnt+1)
2. we don't access memcg's contents.

I think even rcu_read_lock()/unlock() is unnecessary.



 2. mem_cgroup_oom_called
 I think in here we don't grab cgroup_mutex, too.
 
In OOM-killer context, memcg which causes OOM has refcnt +1.
Then, not necessary. 


 I guess some design would cover that problems.
Maybe.

 Could you tell me if you don't mind?
 Sorry for bothering you.
 

In my point of view, the most terrible problem is the heavy cost of
css_tryget() when you run a multi-threaded heavy program.
So, I want to see some innovation, but haven't found it yet.

I admit this RCU+refcnt tends to be hard to review. But it's costly
operation to take refcnt and it's worth to be handled in the best
usage of each logics, on a case by cases for now.

Thanks,
-Kame


___
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

___
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel


[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-25 Thread Minchan Kim
On Fri, Feb 26, 2010 at 3:15 PM, KAMEZAWA Hiroyuki
kamezawa.hir...@jp.fujitsu.com wrote:
 On Fri, 26 Feb 2010 14:53:39 +0900
 Minchan Kim minchan@gmail.com wrote:

 On Fri, Feb 26, 2010 at 2:01 PM, KAMEZAWA Hiroyuki
 kamezawa.hir...@jp.fujitsu.com wrote:
  Hi,
 
  On Fri, 26 Feb 2010 13:50:04 +0900
  Minchan Kim minchan@gmail.com wrote:
 
   Hm ? I don't read the whole thread but can_attach() is called under
   cgroup_mutex(). So, it doesn't need to use RCU.
 
  Vivek mentioned memcg is protected by RCU if I understand his intention 
  right.
  So I commented that without enough knowledge of memcg.
  After your comment, I dive into the code.
 
  Just out of curiosity.
 
  Really, memcg is protected by RCU?
  yes. All cgroup subsystem is protected by RCU.
 
  I think most of RCU around memcg is for protecting task_struct and
  cgroup_subsys_state.
  The memcg is protected by cgroup_mutex as you mentioned.
  Am I missing something?
 
  There are several levels of protections.
 
  cgroup subsystem's -destroy() call back is finally called by
 
  As this.
 
   768                 synchronize_rcu();
   769
   770                 mutex_lock(cgroup_mutex);
   771                 /*
   772                  * Release the subsystem state objects.
   773                  */
   774                 for_each_subsys(cgrp-root, ss)
   775                         ss-destroy(ss, cgrp);
   776
   777                 cgrp-root-number_of_cgroups--;
   778                 mutex_unlock(cgroup_mutex);
 
  Before here,
         - there are no tasks under this cgroup (cgroup's refcnt is 0)
            cgroup is marked as REMOVED.
 
  Then, this access
         rcu_read_lock();
         mem = mem_cgroup_from_task(task);
         if (css_tryget(&mem->css))   <=== checks cgroup refcnt

 If it is, do we always need css_tryget after mem_cgroup_from_task
 without cgroup_mutex to make sure css is valid?

 On a case by cases.

 But I found several cases that don't call css_tryget

 1. mm_match_cgroup
 It's used by page_referenced_xxx. so we I think we don't grab
 cgroup_mutex at that time.

 yes. but all check are done under RCU. And this function never
 access contents of memory which pointer holds.
 And, please conider the whole context.

        mem_cgroup_try_charge()
                mem_cout =. (refcnt +1)
                
                try_to_free_mem_cgroup_pages(mem_cont)
                
                shrink_xxx_list()
                
                        page_referenced_anon(page, mem_cont)
                                mm_match_cgroup(mm, mem_cont)

 Then, this mem_cont (2nd argument to mm_match_cgroup) is always valid.
        rcu_read_lock();
        memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        rcu_read_unlock();
        return memcg != mem_cont;

 Here,
        1. mem_cont is never reused. (because refcnt+1)
        2. we don't access memcg's contents.

 I think even rcu_read_lock()/unlock() is unnecessary.



 2. mem_cgroup_oom_called
 I think in here we don't grab cgroup_mutex, too.

 In OOM-killer context, memcg which causes OOM has refcnt +1.
 Then, not necessary.


 I guess some design would cover that problems.
 Maybe.

 Could you tell me if you don't mind?
 Sorry for bothering you.


 In my point of view, the most terrible problem is the heavy cost of
 css_tryget() when you run a multi-threaded heavy program.
 So, I want to see some innovation, but haven't found it yet.

 I admit this RCU+refcnt is tend to be hard to review. But it's costly
 operation to take refcnt and it's worth to be handled in the best
 usage of each logics, on a case by cases for now.


Thanks for always kind explanation, Kame.

 Thanks,
 -Kame






-- 
Kind regards,
Minchan Kim
___
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

___
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel


[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-23 Thread KAMEZAWA Hiroyuki
On Tue, 23 Feb 2010 10:28:53 +0100
Andrea Righi ari...@develer.com wrote:

 On Mon, Feb 22, 2010 at 09:44:42PM +0530, Balbir Singh wrote:
 [snip]
   +void mem_cgroup_charge_dirty(struct page *page,
   + enum zone_stat_item idx, int charge)
   +{
   + struct mem_cgroup *mem;
   + struct mem_cgroup_stat_cpu *cpustat;
   + unsigned long flags;
   + int cpu;
   +
   + if (mem_cgroup_disabled())
   + return;
   + /* Translate the zone_stat_item into a mem_cgroup_stat_index */
   + switch (idx) {
   + case NR_FILE_DIRTY:
   + idx = MEM_CGROUP_STAT_FILE_DIRTY;
   + break;
   + case NR_WRITEBACK:
   + idx = MEM_CGROUP_STAT_WRITEBACK;
   + break;
   + case NR_WRITEBACK_TEMP:
   + idx = MEM_CGROUP_STAT_WRITEBACK_TEMP;
   + break;
   + case NR_UNSTABLE_NFS:
   + idx = MEM_CGROUP_STAT_UNSTABLE_NFS;
   + break;
   + default:
   + return;
   + }
   + /* Charge the memory cgroup statistics */
   + mem = get_mem_cgroup_from_page(page);
   + if (!mem) {
   + mem = root_mem_cgroup;
   + css_get(mem-css);
   + }
   +
   + local_irq_save(flags);
   + cpu = get_cpu();
  
  Kamezawa is in the process of changing these, so you might want to
  look at and integrate with those patches when they are ready.
 
 OK, I'll rebase the patch to -mm. Are those changes already included in
 mmotm?
 
yes.

-Kame

 Thanks,
 -Andrea

___
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

___
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel


[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-22 Thread Vivek Goyal
On Sun, Feb 21, 2010 at 04:18:44PM +0100, Andrea Righi wrote:
 Infrastructure to account dirty pages per cgroup + add memory.dirty_bytes 
 limit
 in cgroupfs.
 
 Signed-off-by: Andrea Righi ari...@develer.com
 ---
  include/linux/memcontrol.h |   31 ++
  mm/memcontrol.c|  218 
 +++-
  2 files changed, 248 insertions(+), 1 deletions(-)
 
 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
 index 1f9b119..ba3fe0d 100644
 --- a/include/linux/memcontrol.h
 +++ b/include/linux/memcontrol.h
 @@ -25,6 +25,16 @@ struct page_cgroup;
  struct page;
  struct mm_struct;
  
 +/* Cgroup memory statistics items exported to the kernel */
 +enum memcg_page_stat_item {
 + MEMCG_NR_FREE_PAGES,
 + MEMCG_NR_RECLAIMABLE_PAGES,
 + MEMCG_NR_FILE_DIRTY,
 + MEMCG_NR_WRITEBACK,
 + MEMCG_NR_WRITEBACK_TEMP,
 + MEMCG_NR_UNSTABLE_NFS,
 +};
 +
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
  /*
   * All charge functions with gfp_mask should use GFP_KERNEL or
 @@ -48,6 +58,8 @@ extern void mem_cgroup_cancel_charge_swapin(struct 
 mem_cgroup *ptr);
  
  extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
   gfp_t gfp_mask);
 +extern void mem_cgroup_charge_dirty(struct page *page,
 + enum zone_stat_item idx, int charge);
  extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru);
  extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru);
  extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
 @@ -117,6 +129,10 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup 
 *memcg,
  extern int do_swap_account;
  #endif
  
 +extern unsigned long mem_cgroup_dirty_bytes(void);
 +
 +extern u64 mem_cgroup_page_state(enum memcg_page_stat_item item);
 +
  static inline bool mem_cgroup_disabled(void)
  {
   if (mem_cgroup_subsys.disabled)
 @@ -144,6 +160,11 @@ static inline int mem_cgroup_cache_charge(struct page 
 *page,
   return 0;
  }
  
 +static inline void mem_cgroup_charge_dirty(struct page *page,
 + enum zone_stat_item idx, int charge)
 +{
 +}
 +
  static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
   struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr)
  {
 @@ -312,6 +333,16 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone 
 *zone, int order,
   return 0;
  }
  
 +static inline unsigned long mem_cgroup_dirty_bytes(void)
 +{
 + return vm_dirty_bytes;
 +}
 +
 +static inline u64 mem_cgroup_page_state(enum memcg_page_stat_item item)
 +{
 + return 0;
 +}
 +
  #endif /* CONFIG_CGROUP_MEM_CONT */
  
  #endif /* _LINUX_MEMCONTROL_H */
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 954032b..288b9a4 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -64,13 +64,18 @@ enum mem_cgroup_stat_index {
   /*
* For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
*/
 - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
 + MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
   MEM_CGROUP_STAT_RSS,   /* # of pages charged as anon rss */
   MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
   MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
   MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
   MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
   MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
 + MEM_CGROUP_STAT_FILE_DIRTY,   /* # of dirty pages in page cache */
 + MEM_CGROUP_STAT_WRITEBACK,   /* # of pages under writeback */
 + MEM_CGROUP_STAT_WRITEBACK_TEMP,   /* # of pages under writeback using
 + temporary buffers */
 + MEM_CGROUP_STAT_UNSTABLE_NFS,   /* # of NFS unstable pages */
  
   MEM_CGROUP_STAT_NSTATS,
  };
 @@ -225,6 +230,9 @@ struct mem_cgroup {
   /* set when res.limit == memsw.limit */
   boolmemsw_is_minimum;
  
 + /* control memory cgroup dirty pages */
 + unsigned long dirty_bytes;
 +
   /*
* statistics. This must be placed at the end of memcg.
*/
 @@ -519,6 +527,67 @@ static void mem_cgroup_charge_statistics(struct 
 mem_cgroup *mem,
   put_cpu();
  }
  
 +static struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
 +{
 + struct page_cgroup *pc;
 + struct mem_cgroup *mem = NULL;
 +
 + pc = lookup_page_cgroup(page);
 + if (unlikely(!pc))
 + return NULL;
 + lock_page_cgroup(pc);
 + if (PageCgroupUsed(pc)) {
  + mem = pc->mem_cgroup;
  + if (mem)
  + css_get(&mem->css);
 + }
 + unlock_page_cgroup(pc);
 + return mem;
 +}
 +
 +void mem_cgroup_charge_dirty(struct page *page,
 + enum zone_stat_item idx, int charge)
 +{
 + struct mem_cgroup *mem;
 + struct 

[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-22 Thread Balbir Singh
* Andrea Righi ari...@develer.com [2010-02-21 16:18:44]:

 Infrastructure to account dirty pages per cgroup + add memory.dirty_bytes 
 limit
 in cgroupfs.
 
 Signed-off-by: Andrea Righi ari...@develer.com
 ---
  include/linux/memcontrol.h |   31 ++
  mm/memcontrol.c|  218 
 +++-
  2 files changed, 248 insertions(+), 1 deletions(-)
 
 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
 index 1f9b119..ba3fe0d 100644
 --- a/include/linux/memcontrol.h
 +++ b/include/linux/memcontrol.h
 @@ -25,6 +25,16 @@ struct page_cgroup;
  struct page;
  struct mm_struct;
 
 +/* Cgroup memory statistics items exported to the kernel */
 +enum memcg_page_stat_item {
 + MEMCG_NR_FREE_PAGES,
 + MEMCG_NR_RECLAIMABLE_PAGES,
 + MEMCG_NR_FILE_DIRTY,
 + MEMCG_NR_WRITEBACK,
 + MEMCG_NR_WRITEBACK_TEMP,
 + MEMCG_NR_UNSTABLE_NFS,
 +};
 +
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
  /*
   * All charge functions with gfp_mask should use GFP_KERNEL or
 @@ -48,6 +58,8 @@ extern void mem_cgroup_cancel_charge_swapin(struct 
 mem_cgroup *ptr);
 
  extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
   gfp_t gfp_mask);
 +extern void mem_cgroup_charge_dirty(struct page *page,
 + enum zone_stat_item idx, int charge);
  extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru);
  extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru);
  extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
 @@ -117,6 +129,10 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup 
 *memcg,
  extern int do_swap_account;
  #endif
 
 +extern unsigned long mem_cgroup_dirty_bytes(void);
 +
 +extern u64 mem_cgroup_page_state(enum memcg_page_stat_item item);
 +
  static inline bool mem_cgroup_disabled(void)
  {
   if (mem_cgroup_subsys.disabled)
 @@ -144,6 +160,11 @@ static inline int mem_cgroup_cache_charge(struct page 
 *page,
   return 0;
  }
 
 +static inline void mem_cgroup_charge_dirty(struct page *page,
 + enum zone_stat_item idx, int charge)
 +{
 +}
 +
  static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
   struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr)
  {
 @@ -312,6 +333,16 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone 
 *zone, int order,
   return 0;
  }
 
 +static inline unsigned long mem_cgroup_dirty_bytes(void)
 +{
 + return vm_dirty_bytes;
 +}
 +
 +static inline u64 mem_cgroup_page_state(enum memcg_page_stat_item item)
 +{
 + return 0;
 +}
 +
  #endif /* CONFIG_CGROUP_MEM_CONT */
 
  #endif /* _LINUX_MEMCONTROL_H */
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 954032b..288b9a4 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -64,13 +64,18 @@ enum mem_cgroup_stat_index {
   /*
* For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
*/
 - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
 + MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
   MEM_CGROUP_STAT_RSS,   /* # of pages charged as anon rss */
   MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
   MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
   MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
   MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
   MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
 + MEM_CGROUP_STAT_FILE_DIRTY,   /* # of dirty pages in page cache */
 + MEM_CGROUP_STAT_WRITEBACK,   /* # of pages under writeback */
 + MEM_CGROUP_STAT_WRITEBACK_TEMP,   /* # of pages under writeback using
 + temporary buffers */
 + MEM_CGROUP_STAT_UNSTABLE_NFS,   /* # of NFS unstable pages */
 
   MEM_CGROUP_STAT_NSTATS,
  };
 @@ -225,6 +230,9 @@ struct mem_cgroup {
   /* set when res.limit == memsw.limit */
   boolmemsw_is_minimum;
 
 + /* control memory cgroup dirty pages */
 + unsigned long dirty_bytes;
 +
   /*
* statistics. This must be placed at the end of memcg.
*/
 @@ -519,6 +527,67 @@ static void mem_cgroup_charge_statistics(struct 
 mem_cgroup *mem,
   put_cpu();
  }
 
 +static struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
 +{
 + struct page_cgroup *pc;
 + struct mem_cgroup *mem = NULL;
 +
 + pc = lookup_page_cgroup(page);
 + if (unlikely(!pc))
 + return NULL;
 + lock_page_cgroup(pc);
 + if (PageCgroupUsed(pc)) {
  + mem = pc->mem_cgroup;
  + if (mem)
  + css_get(&mem->css);
 + }
 + unlock_page_cgroup(pc);
 + return mem;
 +}

Sounds like a reusable routine, have I seen it before?

 +
 +void mem_cgroup_charge_dirty(struct page *page,
 + enum zone_stat_item idx, int charge)
 +{
 + 

[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-22 Thread Balbir Singh
* Vivek Goyal vgo...@redhat.com [2010-02-22 10:58:40]:

 
 We seem to be doing the same operation as the existing
 mem_cgroup_update_file_mapped function does to update some stats. Can we
 just reuse that? We probably can create one core function which takes the
 index of the stat to update, and update_file_mapped and other variants
 for the memcg dirty ratio can make use of it.


Good Point, it can be easily extended to be generic
 
 In fact, a single function charge_dirty() accounting for WRITEBACK,
 as well as other states like UNSTABLE_NFS, is not very intuitive.
 Maybe we can have individual functions.
 
 mem_cgroup_update_dirty()
 mem_cgroup_update_writeback()
 mem_cgroup_update_unstable_nfs() etc.


Hmm.. probably yes. 

-- 
Three Cheers,
Balbir
___
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

___
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel


[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-22 Thread Vivek Goyal
On Sun, Feb 21, 2010 at 11:17:01PM +0100, Andrea Righi wrote:
 On Sun, Feb 21, 2010 at 01:28:35PM -0800, David Rientjes wrote:
 
 [snip]
 
   +static struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
   +{
   + struct page_cgroup *pc;
   + struct mem_cgroup *mem = NULL;
   +
   + pc = lookup_page_cgroup(page);
   + if (unlikely(!pc))
   + return NULL;
   + lock_page_cgroup(pc);
   + if (PageCgroupUsed(pc)) {
    + mem = pc->mem_cgroup;
    + if (mem)
    + css_get(&mem->css);
   + }
   + unlock_page_cgroup(pc);
   + return mem;
   +}
  
  Is it possible to merge this with try_get_mem_cgroup_from_page()?
 
 Agreed.
 
  
   +
   +void mem_cgroup_charge_dirty(struct page *page,
   + enum zone_stat_item idx, int charge)
   +{
   + struct mem_cgroup *mem;
   + struct mem_cgroup_stat_cpu *cpustat;
   + unsigned long flags;
   + int cpu;
   +
   + if (mem_cgroup_disabled())
   + return;
   + /* Translate the zone_stat_item into a mem_cgroup_stat_index */
   + switch (idx) {
   + case NR_FILE_DIRTY:
   + idx = MEM_CGROUP_STAT_FILE_DIRTY;
   + break;
   + case NR_WRITEBACK:
   + idx = MEM_CGROUP_STAT_WRITEBACK;
   + break;
   + case NR_WRITEBACK_TEMP:
   + idx = MEM_CGROUP_STAT_WRITEBACK_TEMP;
   + break;
   + case NR_UNSTABLE_NFS:
   + idx = MEM_CGROUP_STAT_UNSTABLE_NFS;
   + break;
   + default:
   + return;
  
  WARN()?  We don't want to silently leak counters.
 
 Agreed.
 
  
   + }
   + /* Charge the memory cgroup statistics */
   + mem = get_mem_cgroup_from_page(page);
   + if (!mem) {
   + mem = root_mem_cgroup;
    + css_get(&mem->css);
   + }
  
  get_mem_cgroup_from_page() should probably handle the root_mem_cgroup case 
  and return a reference from it.
 
 Right. But I'd prefer to use try_get_mem_cgroup_from_page() without
 changing the behaviour of this function.
 
  
   +
   + local_irq_save(flags);
   + cpu = get_cpu();
    + cpustat = &mem->stat.cpustat[cpu];
   + __mem_cgroup_stat_add_safe(cpustat, idx, charge);
  
  get_cpu()?  Preemption is already disabled, just use smp_processor_id().
 
 mmmh... actually, we can just copy the code from
 mem_cgroup_charge_statistics(), so local_irq_save/restore are not
 necessarily needed and we can just use get_cpu()/put_cpu().
 
   + put_cpu();
   + local_irq_restore(flags);
    + css_put(&mem->css);
   +}
   +
static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup 
   *mem,
 enum lru_list idx)
{
   @@ -992,6 +1061,97 @@ static unsigned int get_swappiness(struct 
   mem_cgroup *memcg)
 return swappiness;
}

   +static unsigned long get_dirty_bytes(struct mem_cgroup *memcg)
   +{
    + struct cgroup *cgrp = memcg->css.cgroup;
    + unsigned long dirty_bytes;
    +
    + /* root ? */
    + if (cgrp->parent == NULL)
    + return vm_dirty_bytes;
    +
    + spin_lock(&memcg->reclaim_param_lock);
    + dirty_bytes = memcg->dirty_bytes;
    + spin_unlock(&memcg->reclaim_param_lock);
   +
   + return dirty_bytes;
   +}
   +
   +unsigned long mem_cgroup_dirty_bytes(void)
   +{
   + struct mem_cgroup *memcg;
   + unsigned long dirty_bytes;
   +
   + if (mem_cgroup_disabled())
   + return vm_dirty_bytes;
   +
   + rcu_read_lock();
   + memcg = mem_cgroup_from_task(current);
   + if (memcg == NULL)
   + dirty_bytes = vm_dirty_bytes;
   + else
   + dirty_bytes = get_dirty_bytes(memcg);
   + rcu_read_unlock();
  
  The rcu_read_lock() isn't protecting anything here.
 
 Right!

Are we not protecting the memcg pointer using RCU here?

Thanks
Vivek
___
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

___
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel


[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-22 Thread Vivek Goyal
On Mon, Feb 22, 2010 at 09:22:42AM +0900, KAMEZAWA Hiroyuki wrote:

[..]
  +static int mem_cgroup_dirty_bytes_write(struct cgroup *cgrp, struct cftype 
  *cft,
  +  u64 val)
  +{
  +   struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
  +   struct mem_cgroup *parent;
  +
   +   if (cgrp->parent == NULL)
   +   return -EINVAL;
   +
   +   parent = mem_cgroup_from_cont(cgrp->parent);
   +
   +   cgroup_lock();
   +
   +   /* If under hierarchy, only empty-root can set this value */
   +   if ((parent->use_hierarchy) ||
   +   (memcg->use_hierarchy && !list_empty(cgrp->children))) {
  +   cgroup_unlock();
  +   return -EINVAL;
  +   }
 
  Okay, then, only the hierarchy root can set the value.
  Please describe this kind of important thing in the patch description.
 

Hi Andrea,

Why can only the root of the hierarchy set the dirty_bytes value? In this
case, a child cgroup's amount of dirty pages will be controlled by
dirty_ratio?

Thanks
Vivek
___
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

___
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel


[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-22 Thread David Rientjes
On Mon, 22 Feb 2010, Andrea Righi wrote:

  Hmm...do we need spinlock ? You use unsigned long, then, read-write
  is always atomic if not read-modify-write.
 
 I think I simply copied and pasted the memcg->swappiness case. But I agree,
 read-write should be atomic.
 

We don't need memcg->reclaim_param_lock in get_swappiness() or
mem_cgroup_get_reclaim_priority().
___
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

___
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel


[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-21 Thread David Rientjes
On Sun, 21 Feb 2010, Andrea Righi wrote:

 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
 index 1f9b119..ba3fe0d 100644
 --- a/include/linux/memcontrol.h
 +++ b/include/linux/memcontrol.h
 @@ -25,6 +25,16 @@ struct page_cgroup;
  struct page;
  struct mm_struct;
  
 +/* Cgroup memory statistics items exported to the kernel */
 +enum memcg_page_stat_item {
 + MEMCG_NR_FREE_PAGES,
 + MEMCG_NR_RECLAIMABLE_PAGES,
 + MEMCG_NR_FILE_DIRTY,
 + MEMCG_NR_WRITEBACK,
 + MEMCG_NR_WRITEBACK_TEMP,
 + MEMCG_NR_UNSTABLE_NFS,
 +};
 +
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
  /*
   * All charge functions with gfp_mask should use GFP_KERNEL or
 @@ -48,6 +58,8 @@ extern void mem_cgroup_cancel_charge_swapin(struct 
 mem_cgroup *ptr);
  
  extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
   gfp_t gfp_mask);
 +extern void mem_cgroup_charge_dirty(struct page *page,
 + enum zone_stat_item idx, int charge);
  extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru);
  extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru);
  extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
 @@ -117,6 +129,10 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup 
 *memcg,
  extern int do_swap_account;
  #endif
  
 +extern unsigned long mem_cgroup_dirty_bytes(void);
 +
 +extern u64 mem_cgroup_page_state(enum memcg_page_stat_item item);
 +
  static inline bool mem_cgroup_disabled(void)
  {
   if (mem_cgroup_subsys.disabled)
 @@ -144,6 +160,11 @@ static inline int mem_cgroup_cache_charge(struct page 
 *page,
   return 0;
  }
  
 +static inline void mem_cgroup_charge_dirty(struct page *page,
 + enum zone_stat_item idx, int charge)
 +{
 +}
 +
  static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
   struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr)
  {
 @@ -312,6 +333,16 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone 
 *zone, int order,
   return 0;
  }
  
 +static inline unsigned long mem_cgroup_dirty_bytes(void)
 +{
 + return vm_dirty_bytes;
 +}
 +
 +static inline u64 mem_cgroup_page_state(enum memcg_page_stat_item item)
 +{
 + return 0;
 +}
 +
  #endif /* CONFIG_CGROUP_MEM_CONT */
  
  #endif /* _LINUX_MEMCONTROL_H */
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 954032b..288b9a4 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -64,13 +64,18 @@ enum mem_cgroup_stat_index {
   /*
* For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
*/
 - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
 + MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
   MEM_CGROUP_STAT_RSS,   /* # of pages charged as anon rss */
   MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
   MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
   MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
   MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
   MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
 + MEM_CGROUP_STAT_FILE_DIRTY,   /* # of dirty pages in page cache */
 + MEM_CGROUP_STAT_WRITEBACK,   /* # of pages under writeback */
 + MEM_CGROUP_STAT_WRITEBACK_TEMP,   /* # of pages under writeback using
 + temporary buffers */
 + MEM_CGROUP_STAT_UNSTABLE_NFS,   /* # of NFS unstable pages */
  
   MEM_CGROUP_STAT_NSTATS,
  };
 @@ -225,6 +230,9 @@ struct mem_cgroup {
   /* set when res.limit == memsw.limit */
   boolmemsw_is_minimum;
  
 + /* control memory cgroup dirty pages */
 + unsigned long dirty_bytes;
 +
   /*
* statistics. This must be placed at the end of memcg.
*/
 @@ -519,6 +527,67 @@ static void mem_cgroup_charge_statistics(struct 
 mem_cgroup *mem,
   put_cpu();
  }
  
 +static struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
 +{
 + struct page_cgroup *pc;
 + struct mem_cgroup *mem = NULL;
 +
 + pc = lookup_page_cgroup(page);
 + if (unlikely(!pc))
 + return NULL;
 + lock_page_cgroup(pc);
 + if (PageCgroupUsed(pc)) {
  + mem = pc->mem_cgroup;
  + if (mem)
  + css_get(&mem->css);
 + }
 + unlock_page_cgroup(pc);
 + return mem;
 +}

Is it possible to merge this with try_get_mem_cgroup_from_page()?

 +
 +void mem_cgroup_charge_dirty(struct page *page,
 + enum zone_stat_item idx, int charge)
 +{
 + struct mem_cgroup *mem;
 + struct mem_cgroup_stat_cpu *cpustat;
 + unsigned long flags;
 + int cpu;
 +
 + if (mem_cgroup_disabled())
 + return;
 + /* Translate the zone_stat_item into a mem_cgroup_stat_index */
 + switch (idx) {
 + case NR_FILE_DIRTY:
 + idx = 

[Devel] Re: [PATCH 1/2] memcg: dirty pages accounting and limiting infrastructure

2010-02-21 Thread KAMEZAWA Hiroyuki
On Sun, 21 Feb 2010 16:18:44 +0100
Andrea Righi ari...@develer.com wrote:

 Infrastructure to account dirty pages per cgroup + add memory.dirty_bytes 
 limit
 in cgroupfs.
 
 Signed-off-by: Andrea Righi ari...@develer.com

Looks clean in general. But there are some conflicts with memcg in mmotm.
And please CC: to linux...@kvack.org when you modify memcg.


 ---
  include/linux/memcontrol.h |   31 ++
  mm/memcontrol.c|  218 
 +++-
  2 files changed, 248 insertions(+), 1 deletions(-)
 
 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
 index 1f9b119..ba3fe0d 100644
 --- a/include/linux/memcontrol.h
 +++ b/include/linux/memcontrol.h
 @@ -25,6 +25,16 @@ struct page_cgroup;
  struct page;
  struct mm_struct;
  
 +/* Cgroup memory statistics items exported to the kernel */
 +enum memcg_page_stat_item {
 + MEMCG_NR_FREE_PAGES,
 + MEMCG_NR_RECLAIMABLE_PAGES,
 + MEMCG_NR_FILE_DIRTY,
 + MEMCG_NR_WRITEBACK,
 + MEMCG_NR_WRITEBACK_TEMP,
 + MEMCG_NR_UNSTABLE_NFS,
 +};
 +
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
  /*
   * All charge functions with gfp_mask should use GFP_KERNEL or
 @@ -48,6 +58,8 @@ extern void mem_cgroup_cancel_charge_swapin(struct 
 mem_cgroup *ptr);
  
  extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
   gfp_t gfp_mask);
 +extern void mem_cgroup_charge_dirty(struct page *page,
 + enum zone_stat_item idx, int charge);
  extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru);
  extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru);
  extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
 @@ -117,6 +129,10 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup 
 *memcg,
  extern int do_swap_account;
  #endif
  
 +extern unsigned long mem_cgroup_dirty_bytes(void);
 +
 +extern u64 mem_cgroup_page_state(enum memcg_page_stat_item item);
 +
  static inline bool mem_cgroup_disabled(void)
  {
   if (mem_cgroup_subsys.disabled)
 @@ -144,6 +160,11 @@ static inline int mem_cgroup_cache_charge(struct page 
 *page,
   return 0;
  }
  
 +static inline void mem_cgroup_charge_dirty(struct page *page,
 + enum zone_stat_item idx, int charge)
 +{
 +}
 +
  static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
   struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr)
  {
 @@ -312,6 +333,16 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone 
 *zone, int order,
   return 0;
  }
  
 +static inline unsigned long mem_cgroup_dirty_bytes(void)
 +{
 + return vm_dirty_bytes;
 +}
 +
 +static inline u64 mem_cgroup_page_state(enum memcg_page_stat_item item)
 +{
 + return 0;
 +}
 +
  #endif /* CONFIG_CGROUP_MEM_CONT */
  
  #endif /* _LINUX_MEMCONTROL_H */
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 954032b..288b9a4 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -64,13 +64,18 @@ enum mem_cgroup_stat_index {
   /*
* For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
*/
 - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
 + MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
?

   MEM_CGROUP_STAT_RSS,   /* # of pages charged as anon rss */
   MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
   MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
   MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
   MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
   MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
 + MEM_CGROUP_STAT_FILE_DIRTY,   /* # of dirty pages in page cache */
 + MEM_CGROUP_STAT_WRITEBACK,   /* # of pages under writeback */
 + MEM_CGROUP_STAT_WRITEBACK_TEMP,   /* # of pages under writeback using
 + temporary buffers */
 + MEM_CGROUP_STAT_UNSTABLE_NFS,   /* # of NFS unstable pages */
  
   MEM_CGROUP_STAT_NSTATS,
  };
 @@ -225,6 +230,9 @@ struct mem_cgroup {
   /* set when res.limit == memsw.limit */
   boolmemsw_is_minimum;
  
 + /* control memory cgroup dirty pages */
 + unsigned long dirty_bytes;
 +
   /*
* statistics. This must be placed at the end of memcg.
*/
 @@ -519,6 +527,67 @@ static void mem_cgroup_charge_statistics(struct 
 mem_cgroup *mem,
   put_cpu();
  }
  
 +static struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
 +{
 + struct page_cgroup *pc;
 + struct mem_cgroup *mem = NULL;
 +
 + pc = lookup_page_cgroup(page);
 + if (unlikely(!pc))
 + return NULL;
 + lock_page_cgroup(pc);
 + if (PageCgroupUsed(pc)) {
  + mem = pc->mem_cgroup;
  + if (mem)
  + css_get(&mem->css);
 + }
 + unlock_page_cgroup(pc);
 + return mem;
 +}
 +
Hmm. do we need