[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Sun, Feb 21, 2010 at 01:38:28PM -0800, David Rientjes wrote:
 On Sun, 21 Feb 2010, Andrea Righi wrote:
 
  diff --git a/mm/page-writeback.c b/mm/page-writeback.c
  index 0b19943..c9ff1cd 100644
  --- a/mm/page-writeback.c
  +++ b/mm/page-writeback.c
  @@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
*/
   static int calc_period_shift(void)
   {
  -   unsigned long dirty_total;
  +   unsigned long dirty_total, dirty_bytes;
   
  -   if (vm_dirty_bytes)
  -   dirty_total = vm_dirty_bytes / PAGE_SIZE;
  +   dirty_bytes = mem_cgroup_dirty_bytes();
  +   if (dirty_bytes)
  +   dirty_total = dirty_bytes / PAGE_SIZE;
  else
  dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
  100;
 
 This needs a comment since mem_cgroup_dirty_bytes() doesn't imply that it 
 is responsible for returning the global vm_dirty_bytes when that's 
 actually what it does (both for CONFIG_CGROUP_MEM_RES_CTLR=n and the root 
 cgroup).

Fair enough.

Thanks,
-Andrea
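
For reference, a minimal sketch of the kind of comment being requested here, assuming mem_cgroup_dirty_bytes() keeps the fallback behaviour David describes (returning the global vm_dirty_bytes for the root cgroup or when the memory controller is disabled); the trailing return is the existing mainline code:

static int calc_period_shift(void)
{
        unsigned long dirty_total, dirty_bytes;

        /*
         * mem_cgroup_dirty_bytes() returns the dirty_bytes limit of the
         * current task's memcg; for the root cgroup, or when the memory
         * controller is not compiled in, it simply returns the global
         * vm_dirty_bytes.
         */
        dirty_bytes = mem_cgroup_dirty_bytes();
        if (dirty_bytes)
                dirty_total = dirty_bytes / PAGE_SIZE;
        else
                dirty_total = (vm_dirty_ratio *
                                determine_dirtyable_memory()) / 100;
        return 2 + ilog2(dirty_total - 1);
}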


[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Mon, Feb 22, 2010 at 09:32:21AM +0900, KAMEZAWA Hiroyuki wrote:
  -   if (vm_dirty_bytes)
  -   dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
  +   dirty_bytes = mem_cgroup_dirty_bytes();
  +   if (dirty_bytes)
  +   dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
  else {
  int dirty_ratio;
 
 you use local value. But, if hierarchical accounting is used, memcg->dirty_bytes
 should be taken from the root-of-hierarchy memcg.
 
 I have no objection if you add a pointer as
   memcg->subhierarchy_root
 to get the root of hierarchical accounting. But please check the problem of 
 hierarchy, again.

Right, it won't work with hierarchy. I'll fix also considering the
hierarchy case.

Thanks for your review.

-Andrea
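
Kame's point, sketched: with hierarchical accounting enabled, the limit has to come from the root of the accounted hierarchy rather than from the task's own memcg. The memcg_parent() helper and the dirty_bytes field below are hypothetical stand-ins for this discussion, not the actual memcg API:

/* Sketch: walk up to the root of the hierarchical accounting domain. */
static unsigned long memcg_hierarchy_dirty_bytes(struct mem_cgroup *memcg)
{
        struct mem_cgroup *root = memcg;

        while (memcg_parent(root) && memcg_parent(root)->use_hierarchy)
                root = memcg_parent(root);

        return root->dirty_bytes;       /* hypothetical per-memcg field */
}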


[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Mon, Feb 22, 2010 at 11:52:15AM -0500, Vivek Goyal wrote:
   unsigned long determine_dirtyable_memory(void)
   {
  -   unsigned long x;
  -
  -   x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
  -
  +   unsigned long memcg_memory, memory;
  +
  +   memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
  +   memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
  +   if (memcg_memory > 0) {
 
 it could be just 
 
   if (memcg_memory) {

Agreed.

   }
 
  +   memcg_memory +=
  +   mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
  +   if (memcg_memory < memory)
  +   return memcg_memory;
  +   }
  if (!vm_highmem_is_dirtyable)
  -   x -= highmem_dirtyable_memory(x);
  +   memory -= highmem_dirtyable_memory(memory);
   
 
  If vm_highmem_is_dirtyable=0, in that case, we can still return with
  memcg_memory which can be more than memory.  IOW, highmem is not
  dirtyable system wide but still we can potentially return back saying
  for this cgroup we can dirty more pages which can potentially actually
  be more than what is allowed system wide?
 
 Because you have modified dirtyable_memory() and made it per cgroup, I
 think it automatically takes care of the cases of per cgroup dirty ratio,
 I mentioned in my previous mail. So we will use system wide dirty ratio
 to calculate the allowed dirty pages in this cgroup (dirty_ratio *
 available_memory()) and if this cgroup wrote too many pages start
 writeout? 

OK, if I've understood well, you're proposing to use per-cgroup
dirty_ratio interface and do something like:

unsigned long determine_dirtyable_memory(void)
{
unsigned long memcg_memory, memory;

memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
if (!vm_highmem_is_dirtyable)
memory -= highmem_dirtyable_memory(memory);

memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
if (!memcg_memory)
return memory + 1;  /* Ensure that we never return 0 */
memcg_memory += mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
if (!vm_highmem_is_dirtyable)
 memcg_memory -= highmem_dirtyable_memory(memory) *
mem_cgroup_dirty_ratio() / 100;
if (memcg_memory < memory)
return memcg_memory;
}


 
  -   return x + 1;   /* Ensure that we never return 0 */
  +   return memory + 1;  /* Ensure that we never return 0 */
   }
   
   void
  @@ -421,12 +428,13 @@ get_dirty_limits(unsigned long *pbackground, unsigned 
  long *pdirty,
   unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
   {
  unsigned long background;
  -   unsigned long dirty;
  +   unsigned long dirty, dirty_bytes;
  unsigned long available_memory = determine_dirtyable_memory();
  struct task_struct *tsk;
   
  -   if (vm_dirty_bytes)
  -   dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
  +   dirty_bytes = mem_cgroup_dirty_bytes();
  +   if (dirty_bytes)
  +   dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
  else {
  int dirty_ratio;
   
  @@ -505,9 +513,17 @@ static void balance_dirty_pages(struct address_space 
  *mapping,
   get_dirty_limits(&background_thresh, &dirty_thresh,
   &bdi_thresh, bdi);
   
  -   nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
  +   nr_reclaimable = mem_cgroup_page_state(MEMCG_NR_FILE_DIRTY);
  +   if (nr_reclaimable == 0) {
  +   nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
  global_page_state(NR_UNSTABLE_NFS);
  -   nr_writeback = global_page_state(NR_WRITEBACK);
  +   nr_writeback = global_page_state(NR_WRITEBACK);
  +   } else {
  +   nr_reclaimable +=
  +   mem_cgroup_page_state(MEMCG_NR_UNSTABLE_NFS);
  +   nr_writeback =
  +   mem_cgroup_page_state(MEMCG_NR_WRITEBACK);
  +   }
   
  bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
  bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
  @@ -660,6 +676,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
  unsigned long dirty_thresh;
   
   for ( ; ; ) {
  +   unsigned long dirty;
  +
   get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
   
   /*
  @@ -668,10 +686,15 @@ void throttle_vm_writeout(gfp_t gfp_mask)
*/
   dirty_thresh += dirty_thresh / 10;  /* wh... */
   
  -if (global_page_state(NR_UNSTABLE_NFS) +
  -   global_page_state(NR_WRITEBACK) <= dirty_thresh)
  -   break;
  -congestion_wait(BLK_RW_ASYNC, HZ/10);
  +   dirty = mem_cgroup_page_state(MEMCG_NR_WRITEBACK);
  +   

[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Tue, Feb 23, 2010 at 10:40:40AM +0100, Andrea Righi wrote:
   If vm_highmem_is_dirtyable=0, in that case, we can still return with
   memcg_memory which can be more than memory.  IOW, highmem is not
   dirtyable system wide but still we can potentially return back saying
   for this cgroup we can dirty more pages which can potentially actually
   be more than what is allowed system wide?
  
  Because you have modified dirtyable_memory() and made it per cgroup, I
  think it automatically takes care of the cases of per cgroup dirty ratio,
  I mentioned in my previous mail. So we will use system wide dirty ratio
  to calculate the allowed dirty pages in this cgroup (dirty_ratio *
  available_memory()) and if this cgroup wrote too many pages start
  writeout? 
 
 OK, if I've understood well, you're proposing to use per-cgroup
 dirty_ratio interface and do something like:
 
 unsigned long determine_dirtyable_memory(void)
 {
   unsigned long memcg_memory, memory;
 
   memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
   if (!vm_highmem_is_dirtyable)
   memory -= highmem_dirtyable_memory(memory);
 
   memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
   if (!memcg_memory)
   return memory + 1;  /* Ensure that we never return 0 */
   memcg_memory += mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
   if (!vm_highmem_is_dirtyable)
memcg_memory -= highmem_dirtyable_memory(memory) *
   mem_cgroup_dirty_ratio() / 100;

ok, this is wrong:

   if (memcg_memory < memory)
   return memcg_memory;
 }

return min(memcg_memory, memory);

-Andrea
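
Folding the correction back in, the proposed helper would read roughly as follows; mem_cgroup_page_state() and the MEMCG_NR_* indexes come from patch 1/2 of this series, the dirty-ratio scaling of highmem is the approximation discussed above, and the trailing +1 just preserves the existing "never return 0" guarantee:

unsigned long determine_dirtyable_memory(void)
{
        unsigned long memcg_memory, memory;

        memory = global_page_state(NR_FREE_PAGES) +
                        global_reclaimable_pages();
        if (!vm_highmem_is_dirtyable)
                memory -= highmem_dirtyable_memory(memory);

        memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
        if (!memcg_memory)
                return memory + 1;      /* Ensure that we never return 0 */
        memcg_memory += mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
        if (!vm_highmem_is_dirtyable)
                memcg_memory -= highmem_dirtyable_memory(memory) *
                                        mem_cgroup_dirty_ratio() / 100;

        return min(memcg_memory, memory) + 1;
}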


[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Tue, Feb 23, 2010 at 02:22:12PM -0800, David Rientjes wrote:
 On Tue, 23 Feb 2010, Vivek Goyal wrote:
 
Because you have modified dirtyable_memory() and made it per cgroup, I
think it automatically takes care of the cases of per cgroup dirty 
ratio,
I mentioned in my previous mail. So we will use system wide dirty ratio
to calculate the allowed dirty pages in this cgroup (dirty_ratio *
available_memory()) and if this cgroup wrote too many pages start
writeout? 
   
   OK, if I've understood well, you're proposing to use per-cgroup
   dirty_ratio interface and do something like:
  
  I think we can use system wide dirty_ratio for per cgroup (instead of
  providing configurable dirty_ratio for each cgroup where each memory
  cgroup can have different dirty ratio. Can't think of a use case
  immediately).
 
 I think each memcg should have both dirty_bytes and dirty_ratio, 
 dirty_bytes defaults to 0 (disabled) while dirty_ratio is inherited from 
 the global vm_dirty_ratio.  Changing vm_dirty_ratio would not change 
 memcgs already using their own dirty_ratio, but new memcgs would get the 
 new value by default.  The ratio would act over the amount of available 
 memory to the cgroup as though it were its own virtual system operating 
 with a subset of the system's RAM and the same global ratio.

Agreed.

-Andrea
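
A minimal sketch of the semantics David describes; the structure and the init hook are illustrative only, not the series' actual interface:

/* Hypothetical per-memcg dirty limits, mirroring the global knobs. */
struct memcg_dirty_param {
        unsigned long dirty_bytes;      /* 0 == disabled, use dirty_ratio */
        unsigned int dirty_ratio;       /* % of the memcg's own memory */
};

/*
 * At cgroup creation: dirty_bytes starts disabled and dirty_ratio is
 * copied from the global value.  Changing vm_dirty_ratio afterwards
 * affects only memcgs created later, not existing ones.
 */
static void memcg_dirty_param_init(struct memcg_dirty_param *p)
{
        p->dirty_bytes = 0;
        p->dirty_ratio = vm_dirty_ratio;
}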


[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Tue, Feb 23, 2010 at 04:29:43PM -0500, Vivek Goyal wrote:
 On Sun, Feb 21, 2010 at 04:18:45PM +0100, Andrea Righi wrote:
 
 [..]
  diff --git a/mm/page-writeback.c b/mm/page-writeback.c
  index 0b19943..c9ff1cd 100644
  --- a/mm/page-writeback.c
  +++ b/mm/page-writeback.c
  @@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
*/
   static int calc_period_shift(void)
   {
  -   unsigned long dirty_total;
  +   unsigned long dirty_total, dirty_bytes;
   
  -   if (vm_dirty_bytes)
  -   dirty_total = vm_dirty_bytes / PAGE_SIZE;
  +   dirty_bytes = mem_cgroup_dirty_bytes();
  +   if (dirty_bytes)
  +   dirty_total = dirty_bytes / PAGE_SIZE;
  else
  dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
  100;
 
 Ok, I don't understand this so I better ask. Can you explain a bit how memory
 cgroup dirty ratio is going to play with per BDI dirty proportion thing.
 
 Currently we seem to be calculating per BDI proportion (based on recently
 completed events), of system wide dirty ratio and decide whether a process
 should be throttled or not.
 
 Because throttling decision is also based on BDI and its proportion, how
 are we going to fit it with mem cgroup? Is it going to be BDI proportion
 of dirty memory with-in memory cgroup (and not system wide)?

IMHO we need to calculate the BDI dirty threshold as a function of the
cgroup's dirty memory, and keep BDI statistics system wide.

So, if a task is generating some writes, the threshold to start itself
the writeback will be calculated as a function of the cgroup's dirty
memory. If the BDI dirty memory is greater than this threshold, the task
must start to writeback dirty pages until it reaches the expected dirty
limit.

OK, in this way a cgroup with a small dirty limit may be forced to
writeback a lot of pages dirtied by other cgroups on the same device.
But this is always related to the fact that tasks are forced to
writeback dirty inodes randomly, and not the inodes they've actually
dirtied.

-Andrea
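
In rough pseudo-kernel-C, the rule described above amounts to something like the following; bdi_stat() is the existing per-BDI counter interface, while memcg_dirty_threshold() and writeback_some_pages() are assumed names:

/*
 * Sketch: the dirty threshold comes from the writer's cgroup, while the
 * BDI dirty/writeback counters stay system wide.
 */
static void memcg_throttle_writer(struct backing_dev_info *bdi)
{
        unsigned long memcg_thresh;     /* from the cgroup's dirty limit */
        unsigned long bdi_dirty;

        memcg_thresh = memcg_dirty_threshold(current);          /* assumed */
        bdi_dirty = bdi_stat(bdi, BDI_RECLAIMABLE) +
                        bdi_stat(bdi, BDI_WRITEBACK);

        while (bdi_dirty > memcg_thresh) {
                writeback_some_pages(bdi);                       /* assumed */
                bdi_dirty = bdi_stat(bdi, BDI_RECLAIMABLE) +
                                bdi_stat(bdi, BDI_WRITEBACK);
        }
}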


[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Fri, Feb 26, 2010 at 04:48:11PM -0500, Vivek Goyal wrote:
 On Thu, Feb 25, 2010 at 04:12:11PM +0100, Andrea Righi wrote:
  On Tue, Feb 23, 2010 at 04:29:43PM -0500, Vivek Goyal wrote:
   On Sun, Feb 21, 2010 at 04:18:45PM +0100, Andrea Righi wrote:
   
   [..]
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943..c9ff1cd 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
  */
 static int calc_period_shift(void)
 {
-   unsigned long dirty_total;
+   unsigned long dirty_total, dirty_bytes;
 
-   if (vm_dirty_bytes)
-   dirty_total = vm_dirty_bytes / PAGE_SIZE;
+   dirty_bytes = mem_cgroup_dirty_bytes();
+   if (dirty_bytes)
+   dirty_total = dirty_bytes / PAGE_SIZE;
else
dirty_total = (vm_dirty_ratio * 
determine_dirtyable_memory()) /
100;
   
   Ok, I don't understand this so I better ask. Can you explain a bit how 
   memory
   cgroup dirty ratio is going to play with per BDI dirty proportion thing.
   
   Currently we seem to be calculating per BDI proportion (based on recently
   completed events), of system wide dirty ratio and decide whether a process
   should be throttled or not.
   
   Because throttling decision is also based on BDI and its proportion, how
   are we going to fit it with mem cgroup? Is it going to be BDI proportion
   of dirty memory with-in memory cgroup (and not system wide)?
  
  IMHO we need to calculate the BDI dirty threshold as a function of the
  cgroup's dirty memory, and keep BDI statistics system wide.
  
  So, if a task is generating some writes, the threshold to start itself
  the writeback will be calculated as a function of the cgroup's dirty
  memory. If the BDI dirty memory is greater than this threshold, the task
  must start to writeback dirty pages until it reaches the expected dirty
  limit.
  
 
 Ok, so calculate dirty per cgroup and calculate BDI's proportion from
 cgroup dirty? So will you be keeping track of vm_completion events per
 cgroup or will rely on existing system wide and per BDI completion events
 to calculate BDI proportion?
 
 BDI proportion are more of an indication of device speed and faster device
 gets higher share of dirty, so may be we don't have to keep track of
 completion events per cgroup and can rely on system wide completion events
 for calculating the proportion of a BDI.
 
  OK, in this way a cgroup with a small dirty limit may be forced to
  writeback a lot of pages dirtied by other cgroups on the same device.
  But this is always related to the fact that tasks are forced to
  writeback dirty inodes randomly, and not the inodes they've actually
  dirtied.
 
 So we are left with following two issues.
 
 - Should we rely on global BDI stats for BDI_RECLAIMABLE and BDI_WRITEBACK
   or we need to make these per cgroup to determine actually how many pages
   have been dirtied by a cgroup and force writeouts accordingly?
 
 - Once we decide to throttle a cgroup, it should write its inodes and
   should not be serialized behind other cgroup's inodes.  

We could try to save who made the inode dirty
(inode->cgroup_that_made_inode_dirty) so that during the active
writeback each cgroup can be forced to write only its own inodes.

-Andrea
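
A rough sketch of that idea; the i_dirty_memcg field and both helpers are hypothetical:

/* Remember which memcg first dirtied the inode. */
static void inode_set_dirty_memcg(struct inode *inode,
                                  struct mem_cgroup *memcg)
{
        if (!inode->i_dirty_memcg)              /* hypothetical field */
                inode->i_dirty_memcg = memcg;
}

/* During per-cgroup writeback, only pick inodes this cgroup dirtied. */
static bool inode_dirtied_by(struct inode *inode, struct mem_cgroup *memcg)
{
        return inode->i_dirty_memcg == memcg;
}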


[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-02-28 Thread KAMEZAWA Hiroyuki
On Fri, 26 Feb 2010 16:48:11 -0500
Vivek Goyal vgo...@redhat.com wrote:

 On Thu, Feb 25, 2010 at 04:12:11PM +0100, Andrea Righi wrote:
  On Tue, Feb 23, 2010 at 04:29:43PM -0500, Vivek Goyal wrote:
   On Sun, Feb 21, 2010 at 04:18:45PM +0100, Andrea Righi wrote:

 Because bdi_thresh calculation will be based on per cgroup dirty and
 bdi_nr_reclaimable and bdi_nr_writeback will be system wide, we will be
 doing much more aggressive writeouts.
 
 But we will not achieve parallel writeback paths so probably will not help IO
 controller a lot.
 
 Kame-san, is it a problem, with current memory cgroups where writeback is
 not happening that actively, and you run into situation where there are too
 many dirty pages in a cgroup and reclaim can take long time?
 
Hmm, not the same situation as global memory management, but we have something similar.

In memcg, we just count the user's pages, so a hard-to-reclaim situation doesn't happen.
But "reclaim is slower than expected" is a usual problem.

When you try 
% dd if=/dev/zero of=./tmpfile .
under a proper memcg limitation, you'll find dd is very slow.
We know background writeback helps this situation. We need to kick background
write-back.

Thanks,
-Kame



[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-02-26 Thread Vivek Goyal
On Thu, Feb 25, 2010 at 04:12:11PM +0100, Andrea Righi wrote:
 On Tue, Feb 23, 2010 at 04:29:43PM -0500, Vivek Goyal wrote:
  On Sun, Feb 21, 2010 at 04:18:45PM +0100, Andrea Righi wrote:
  
  [..]
   diff --git a/mm/page-writeback.c b/mm/page-writeback.c
   index 0b19943..c9ff1cd 100644
   --- a/mm/page-writeback.c
   +++ b/mm/page-writeback.c
   @@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
 */
static int calc_period_shift(void)
{
   - unsigned long dirty_total;
   + unsigned long dirty_total, dirty_bytes;

   - if (vm_dirty_bytes)
   - dirty_total = vm_dirty_bytes / PAGE_SIZE;
   + dirty_bytes = mem_cgroup_dirty_bytes();
   + if (dirty_bytes)
   + dirty_total = dirty_bytes / PAGE_SIZE;
 else
 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
 100;
  
  Ok, I don't understand this so I better ask. Can you explain a bit how 
  memory
  cgroup dirty ratio is going to play with per BDI dirty proportion thing.
  
  Currently we seem to be calculating per BDI proportion (based on recently
  completed events), of system wide dirty ratio and decide whether a process
  should be throttled or not.
  
  Because throttling decision is also based on BDI and its proportion, how
  are we going to fit it with mem cgroup? Is it going to be BDI proportion
  of dirty memory with-in memory cgroup (and not system wide)?
 
 IMHO we need to calculate the BDI dirty threshold as a function of the
 cgroup's dirty memory, and keep BDI statistics system wide.
 
 So, if a task is generating some writes, the threshold to start itself
 the writeback will be calculated as a function of the cgroup's dirty
 memory. If the BDI dirty memory is greater than this threshold, the task
 must start to writeback dirty pages until it reaches the expected dirty
 limit.
 

Ok, so calculate dirty per cgroup and calculate BDI's proportion from
cgroup dirty? So will you be keeping track of vm_completion events per
cgroup or will rely on existing system wide and per BDI completion events
to calculate BDI proportion?

BDI proportion are more of an indication of device speed and faster device
gets higher share of dirty, so may be we don't have to keep track of
completion events per cgroup and can rely on system wide completion events
for calculating the proportion of a BDI.

 OK, in this way a cgroup with a small dirty limit may be forced to
 writeback a lot of pages dirtied by other cgroups on the same device.
 But this is always related to the fact that tasks are forced to
 writeback dirty inodes randomly, and not the inodes they've actually
 dirtied.

So we are left with following two issues.

- Should we rely on global BDI stats for BDI_RECLAIMABLE and BDI_WRITEBACK
  or we need to make these per cgroup to determine actually how many pages
  have been dirtied by a cgroup and force writeouts accordingly?

- Once we decide to throttle a cgroup, it should write its inodes and
  should not be serialized behind other cgroup's inodes.  

If we don't tackle the above two issues, I am not sure what problem will be solved
by the patch set. The only thing I can see is that we will be doing write-outs
much more aggressively when we have got some memory cgroups created. (Smaller
dirty per cgroup will lead to smaller per BDI dirty and when compared with
overall BDI stat, it should lead to more writeouts).

if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
break;

Because bdi_thresh calculation will be based on per cgroup dirty and
bdi_nr_reclaimable and bdi_nr_writeback will be system wide, we will be
doing much more aggressive writeouts.

But we will not achieve parallel writeback paths so probably will not help IO
controller a lot.

Kame-san, is it a problem, with current memory cgroups where writeback is
not happening that actively, and you run into situation where there are too
many dirty pages in a cgroup and reclaim can take long time?

Thanks
Vivek


[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-02-26 Thread Vivek Goyal
On Fri, Feb 26, 2010 at 11:21:21PM +0100, Andrea Righi wrote:
 On Fri, Feb 26, 2010 at 04:48:11PM -0500, Vivek Goyal wrote:
  On Thu, Feb 25, 2010 at 04:12:11PM +0100, Andrea Righi wrote:
   On Tue, Feb 23, 2010 at 04:29:43PM -0500, Vivek Goyal wrote:
On Sun, Feb 21, 2010 at 04:18:45PM +0100, Andrea Righi wrote:

[..]
 diff --git a/mm/page-writeback.c b/mm/page-writeback.c
 index 0b19943..c9ff1cd 100644
 --- a/mm/page-writeback.c
 +++ b/mm/page-writeback.c
 @@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
   */
  static int calc_period_shift(void)
  {
 - unsigned long dirty_total;
 + unsigned long dirty_total, dirty_bytes;
  
 - if (vm_dirty_bytes)
 - dirty_total = vm_dirty_bytes / PAGE_SIZE;
 + dirty_bytes = mem_cgroup_dirty_bytes();
 + if (dirty_bytes)
 + dirty_total = dirty_bytes / PAGE_SIZE;
   else
   dirty_total = (vm_dirty_ratio * 
 determine_dirtyable_memory()) /
   100;

Ok, I don't understand this so I better ask. Can you explain a bit how 
memory
cgroup dirty ratio is going to play with per BDI dirty proportion thing.

Currently we seem to be calculating per BDI proportion (based on 
recently
completed events), of system wide dirty ratio and decide whether a 
process
should be throttled or not.

Because throttling decision is also based on BDI and its proportion, how
are we going to fit it with mem cgroup? Is it going to be BDI proportion
of dirty memory with-in memory cgroup (and not system wide)?
   
   IMHO we need to calculate the BDI dirty threshold as a function of the
   cgroup's dirty memory, and keep BDI statistics system wide.
   
   So, if a task is generating some writes, the threshold to start itself
   the writeback will be calculated as a function of the cgroup's dirty
   memory. If the BDI dirty memory is greater than this threshold, the task
   must start to writeback dirty pages until it reaches the expected dirty
   limit.
   
  
  Ok, so calculate dirty per cgroup and calculate BDI's proportion from
  cgroup dirty? So will you be keeping track of vm_completion events per
  cgroup or will rely on existing system wide and per BDI completion events
  to calculate BDI proportion?
  
  BDI proportion are more of an indication of device speed and faster device
  gets higher share of dirty, so may be we don't have to keep track of
  completion events per cgroup and can rely on system wide completion events
  for calculating the proportion of a BDI.
  
   OK, in this way a cgroup with a small dirty limit may be forced to
   writeback a lot of pages dirtied by other cgroups on the same device.
   But this is always related to the fact that tasks are forced to
   writeback dirty inodes randomly, and not the inodes they've actually
   dirtied.
  
  So we are left with following two issues.
  
  - Should we rely on global BDI stats for BDI_RECLAIMABLE and BDI_WRITEBACK
or we need to make these per cgroup to determine actually how many pages
have been dirtied by a cgroup and force writeouts accordingly?
  
  - Once we decide to throttle a cgroup, it should write its inodes and
should not be serialized behind other cgroup's inodes.  
 
 We could try to save who made the inode dirty
 (inode->cgroup_that_made_inode_dirty) so that during the active
 writeback each cgroup can be forced to write only its own inodes.

Yes, but that will require to store a reference to memcg and will become
little complicated.

I was thinking of just matching the cgroup of task being throttled and
memcg of first dirty page in the inode. So we can possibly implement
something like in memcontroller.

bool memcg_task_inode_cgroup_match(inode)

and this function will retrieve first dirty page and compare the cgroup of
that with task memory cgroup. No hassle of storing a pointer hence
reference to memcg.

Well, we could store css_id, and no need to keep a reference to the
memcg. But I guess not storing anything in inode will be simpler.

Thanks
Vivek
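
A sketch of the match-on-first-dirty-page variant; memcg_task_inode_cgroup_match() is the interface named above, while find_first_dirty_page() and page_memcg() are hypothetical placeholders for the page-cache and page_cgroup lookups:

/*
 * Sketch: should the throttled @task be made to write back @inode?
 * Compare the task's memcg with the memcg owning the inode's first
 * dirty page, so nothing has to be stored in the inode itself.
 */
bool memcg_task_inode_cgroup_match(struct task_struct *task,
                                   struct inode *inode)
{
        struct page *page;

        page = find_first_dirty_page(inode->i_mapping);  /* hypothetical */
        if (!page)
                return true;    /* nothing dirty, nothing to skip */

        return mem_cgroup_from_task(task) == page_memcg(page);
}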


[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-02-25 Thread KAMEZAWA Hiroyuki
On Thu, 25 Feb 2010 15:34:44 +0100
Andrea Righi ari...@develer.com wrote:

 On Tue, Feb 23, 2010 at 02:22:12PM -0800, David Rientjes wrote:
  On Tue, 23 Feb 2010, Vivek Goyal wrote:
  
 Because you have modified dirtyable_memory() and made it per cgroup, I
 think it automatically takes care of the cases of per cgroup dirty 
 ratio,
 I mentioned in my previous mail. So we will use system wide dirty 
 ratio
 to calculate the allowed dirty pages in this cgroup (dirty_ratio *
 available_memory()) and if this cgroup wrote too many pages start
 writeout? 

OK, if I've understood well, you're proposing to use per-cgroup
dirty_ratio interface and do something like:
   
   I think we can use system wide dirty_ratio for per cgroup (instead of
   providing configurable dirty_ratio for each cgroup where each memory
   cgroup can have different dirty ratio. Can't think of a use case
   immediately).
  
  I think each memcg should have both dirty_bytes and dirty_ratio, 
  dirty_bytes defaults to 0 (disabled) while dirty_ratio is inherited from 
  the global vm_dirty_ratio.  Changing vm_dirty_ratio would not change 
  memcgs already using their own dirty_ratio, but new memcgs would get the 
  new value by default.  The ratio would act over the amount of available 
  memory to the cgroup as though it were its own virtual system operating 
  with a subset of the system's RAM and the same global ratio.
 
 Agreed.
 
BTW, please add background_dirty_ratio in the same series of patches.
(or something other to kick background-writeback in proper manner.)

If not, we can't kick background write-back until we're caught by dirty_ratio.

Thanks,
-Kame







[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-02-23 Thread Vivek Goyal
On Tue, Feb 23, 2010 at 10:40:40AM +0100, Andrea Righi wrote:
 On Mon, Feb 22, 2010 at 11:52:15AM -0500, Vivek Goyal wrote:
unsigned long determine_dirtyable_memory(void)
{
   - unsigned long x;
   -
   - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
   -
   + unsigned long memcg_memory, memory;
   +
   + memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
   + memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
    + if (memcg_memory > 0) {
  
  it could be just 
  
  if (memcg_memory) {
 
 Agreed.
 
  }
  
   + memcg_memory +=
   + mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
    + if (memcg_memory < memory)
   + return memcg_memory;
   + }
 if (!vm_highmem_is_dirtyable)
   - x -= highmem_dirtyable_memory(x);
   + memory -= highmem_dirtyable_memory(memory);

  
   If vm_highmem_is_dirtyable=0, in that case, we can still return with
   memcg_memory which can be more than memory.  IOW, highmem is not
   dirtyable system wide but still we can potentially return back saying
   for this cgroup we can dirty more pages which can potentially actually
   be more than what is allowed system wide?
  
  Because you have modified dirtyable_memory() and made it per cgroup, I
  think it automatically takes care of the cases of per cgroup dirty ratio,
  I mentioned in my previous mail. So we will use system wide dirty ratio
  to calculate the allowed dirty pages in this cgroup (dirty_ratio *
  available_memory()) and if this cgroup wrote too many pages start
  writeout? 
 
 OK, if I've understood well, you're proposing to use per-cgroup
 dirty_ratio interface and do something like:

I think we can use system wide dirty_ratio for per cgroup (instead of
providing configurable dirty_ratio for each cgroup where each memory
cgroup can have different dirty ratio. Can't think of a use case
immediately).
 
 unsigned long determine_dirtyable_memory(void)
 {
   unsigned long memcg_memory, memory;
 
   memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
   if (!vm_highmem_is_dirtyable)
   memory -= highmem_dirtyable_memory(memory);
 
   memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
   if (!memcg_memory)
   return memory + 1;  /* Ensure that we never return 0 */
   memcg_memory += mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
   if (!vm_highmem_is_dirtyable)
memcg_memory -= highmem_dirtyable_memory(memory) *
   mem_cgroup_dirty_ratio() / 100;
   if (memcg_memory < memory)
   return memcg_memory;
 }
 

This one is tricky and I don't have good answers. I have concerns though.

- While calculating system wide dirtyable memory, we rely on actual memory
  available. (NR_FREE_PAGES + reclaimable_pages). In case of per memory
  cgroup available free pages, we are relying on not necessarily on
  actually available dirtyable memory but based on a user configurable
  limit (LIMIT - USAGE = cgroup_dirtyable_memory).

  This is good as long as total sum of limits of all cgroups is not more
  than available memory. But if somebody sets the limit to a high value,
  we will allow lots of write from that cgroup without being throttled.

  So if memory cgroups were not configured right so that limit total
  represents the actual memory in system, then we might end up having lot
  more dirty pages in the system.

- Subtracting high memory pages from dirtyable memory is tricky. Because
  how to account it in per cgroup calculation. May be we can just do
  following.

calculate_memcg_memory;
memory = memory - highmem_dirtyable_memory();
if (memcg_memory < memory)
return memcg_memory;

 Not sure. This is very crude and leaves the scope of more pages being
 dirty than otherwise would have been. Ideas?
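
The crude variant above, written out as a sketch (same assumptions as before about the MEMCG_NR_* statistics from patch 1/2):

unsigned long determine_dirtyable_memory(void)
{
        unsigned long memcg_memory, memory;

        memory = global_page_state(NR_FREE_PAGES) +
                        global_reclaimable_pages();
        if (!vm_highmem_is_dirtyable)
                memory -= highmem_dirtyable_memory(memory);

        memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
        if (memcg_memory) {
                memcg_memory +=
                        mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
                /* Crude: a cgroup can never dirty more than the system. */
                if (memcg_memory < memory)
                        return memcg_memory + 1;
        }
        return memory + 1;      /* Ensure that we never return 0 */
}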

Vivek

 
  
   - return x + 1;   /* Ensure that we never return 0 */
   + return memory + 1;  /* Ensure that we never return 0 */
}

void
   @@ -421,12 +428,13 @@ get_dirty_limits(unsigned long *pbackground, 
   unsigned long *pdirty,
  unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
{
 unsigned long background;
   - unsigned long dirty;
   + unsigned long dirty, dirty_bytes;
 unsigned long available_memory = determine_dirtyable_memory();
 struct task_struct *tsk;

   - if (vm_dirty_bytes)
   - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
   + dirty_bytes = mem_cgroup_dirty_bytes();
   + if (dirty_bytes)
   + dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
 else {
 int dirty_ratio;

   @@ -505,9 +513,17 @@ static void balance_dirty_pages(struct address_space 
   *mapping,
  get_dirty_limits(&background_thresh, &dirty_thresh,
  &bdi_thresh, bdi);

   - nr_reclaimable = 

[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-02-23 Thread Vivek Goyal
On Sun, Feb 21, 2010 at 04:18:45PM +0100, Andrea Righi wrote:

[..]
 diff --git a/mm/page-writeback.c b/mm/page-writeback.c
 index 0b19943..c9ff1cd 100644
 --- a/mm/page-writeback.c
 +++ b/mm/page-writeback.c
 @@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
   */
  static int calc_period_shift(void)
  {
 - unsigned long dirty_total;
 + unsigned long dirty_total, dirty_bytes;
  
 - if (vm_dirty_bytes)
 - dirty_total = vm_dirty_bytes / PAGE_SIZE;
 + dirty_bytes = mem_cgroup_dirty_bytes();
 + if (dirty_bytes)
 + dirty_total = dirty_bytes / PAGE_SIZE;
   else
   dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
   100;

Ok, I don't understand this so I better ask. Can you explain a bit how memory
cgroup dirty ratio is going to play with per BDI dirty proportion thing.

Currently we seem to be calculating per BDI proportion (based on recently
completed events), of system wide dirty ratio and decide whether a process
should be throttled or not.

Because throttling decision is also based on BDI and its proportion, how
are we going to fit it with mem cgroup? Is it going to be BDI proportion
of dirty memory with-in memory cgroup (and not system wide)?

Thanks
Vivek


[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-02-23 Thread David Rientjes
On Tue, 23 Feb 2010, Vivek Goyal wrote:

   Because you have modified dirtyable_memory() and made it per cgroup, I
   think it automatically takes care of the cases of per cgroup dirty ratio,
   I mentioned in my previous mail. So we will use system wide dirty ratio
   to calculate the allowed dirty pages in this cgroup (dirty_ratio *
   available_memory()) and if this cgroup wrote too many pages start
   writeout? 
  
  OK, if I've understood well, you're proposing to use per-cgroup
  dirty_ratio interface and do something like:
 
 I think we can use system wide dirty_ratio for per cgroup (instead of
 providing configurable dirty_ratio for each cgroup where each memory
 cgroup can have different dirty ratio. Can't think of a use case
 immediately).

I think each memcg should have both dirty_bytes and dirty_ratio, 
dirty_bytes defaults to 0 (disabled) while dirty_ratio is inherited from 
the global vm_dirty_ratio.  Changing vm_dirty_ratio would not change 
memcgs already using their own dirty_ratio, but new memcgs would get the 
new value by default.  The ratio would act over the amount of available 
memory to the cgroup as though it were its own virtual system operating 
with a subset of the system's RAM and the same global ratio.


[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-02-22 Thread Vivek Goyal
On Sun, Feb 21, 2010 at 04:18:45PM +0100, Andrea Righi wrote:
 Apply the cgroup dirty pages accounting and limiting infrastructure to
 the opportune kernel functions.
 
 Signed-off-by: Andrea Righi ari...@develer.com
 ---
  fs/fuse/file.c  |3 ++
  fs/nfs/write.c  |3 ++
  fs/nilfs2/segment.c |8 -
  mm/filemap.c|1 +
  mm/page-writeback.c |   69 --
  mm/truncate.c   |1 +
  6 files changed, 63 insertions(+), 22 deletions(-)
 
 diff --git a/fs/fuse/file.c b/fs/fuse/file.c
 index a9f5e13..357632a 100644
 --- a/fs/fuse/file.c
 +++ b/fs/fuse/file.c
 @@ -11,6 +11,7 @@
  #include <linux/pagemap.h>
  #include <linux/slab.h>
  #include <linux/kernel.h>
 +#include <linux/memcontrol.h>
  #include <linux/sched.h>
  #include <linux/module.h>
  
 @@ -1129,6 +1130,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, 
 struct fuse_req *req)
  
   list_del(&req->writepages_entry);
   dec_bdi_stat(bdi, BDI_WRITEBACK);
 + mem_cgroup_charge_dirty(req->pages[0], NR_WRITEBACK_TEMP, -1);
   dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
   bdi_writeout_inc(bdi);
   wake_up(&fi->page_waitq);
 @@ -1240,6 +1242,7 @@ static int fuse_writepage_locked(struct page *page)
   req->inode = inode;
  
   inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
 + mem_cgroup_charge_dirty(tmp_page, NR_WRITEBACK_TEMP, 1);
   inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
   end_page_writeback(page);
  
 diff --git a/fs/nfs/write.c b/fs/nfs/write.c
 index d63d964..3d9de01 100644
 --- a/fs/nfs/write.c
 +++ b/fs/nfs/write.c
 @@ -439,6 +439,7 @@ nfs_mark_request_commit(struct nfs_page *req)
   req->wb_index,
   NFS_PAGE_TAG_COMMIT);
   spin_unlock(&inode->i_lock);
 + mem_cgroup_charge_dirty(req->wb_page, NR_UNSTABLE_NFS, 1);
   inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
   inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
   __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
  @@ -450,6 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req)
   struct page *page = req->wb_page;
  
   if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
 + mem_cgroup_charge_dirty(page, NR_UNSTABLE_NFS, -1);
   dec_zone_page_state(page, NR_UNSTABLE_NFS);
   dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
   return 1;
  @@ -1320,6 +1322,7 @@ nfs_commit_list(struct inode *inode, struct list_head 
  *head, int how)
   req = nfs_list_entry(head->next);
   nfs_list_remove_request(req);
   nfs_mark_request_commit(req);
 + mem_cgroup_charge_dirty(req->wb_page, NR_UNSTABLE_NFS, -1);
   dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
   dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
   BDI_RECLAIMABLE);
 diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
 index 105b508..b9ffac5 100644
 --- a/fs/nilfs2/segment.c
 +++ b/fs/nilfs2/segment.c
 @@ -1660,8 +1660,10 @@ nilfs_copy_replace_page_buffers(struct page *page, 
 struct list_head *out)
   } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
   kunmap_atomic(kaddr, KM_USER0);
  
 - if (!TestSetPageWriteback(clone_page))
 + if (!TestSetPageWriteback(clone_page)) {
 + mem_cgroup_charge_dirty(clone_page, NR_WRITEBACK, 1);
   inc_zone_page_state(clone_page, NR_WRITEBACK);
 + }
   unlock_page(clone_page);
  
   return 0;
 @@ -1788,8 +1790,10 @@ static void __nilfs_end_page_io(struct page *page, int 
 err)
   }
  
   if (buffer_nilfs_allocated(page_buffers(page))) {
 - if (TestClearPageWriteback(page))
 + if (TestClearPageWriteback(page)) {
 + mem_cgroup_charge_dirty(clone_page, NR_WRITEBACK, -1);
   dec_zone_page_state(page, NR_WRITEBACK);
 + }
   } else
   end_page_writeback(page);
  }
 diff --git a/mm/filemap.c b/mm/filemap.c
 index 698ea80..c19d809 100644
 --- a/mm/filemap.c
 +++ b/mm/filemap.c
 @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page)
* having removed the page entirely.
*/
   if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
 + mem_cgroup_charge_dirty(page, NR_FILE_DIRTY, -1);
   dec_zone_page_state(page, NR_FILE_DIRTY);
   dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
   }
 diff --git a/mm/page-writeback.c b/mm/page-writeback.c
 index 0b19943..c9ff1cd 100644
 --- a/mm/page-writeback.c
 +++ b/mm/page-writeback.c
 @@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
   */
  static int calc_period_shift(void)
  {
 - unsigned long dirty_total;
 + unsigned long dirty_total, dirty_bytes;
  
 - if (vm_dirty_bytes)
 - dirty_total = vm_dirty_bytes / 

[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-02-22 Thread Peter Zijlstra
On Sun, 2010-02-21 at 16:18 +0100, Andrea Righi wrote:
 @@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
   */
  static int calc_period_shift(void)
  {
 -   unsigned long dirty_total;
 +   unsigned long dirty_total, dirty_bytes;
  
 -   if (vm_dirty_bytes)
 -   dirty_total = vm_dirty_bytes / PAGE_SIZE;
 +   dirty_bytes = mem_cgroup_dirty_bytes();
 +   if (dirty_bytes)
 +   dirty_total = dirty_bytes / PAGE_SIZE;
 else
 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) 
 /
 100;
 @@ -406,14 +407,20 @@ static unsigned long highmem_dirtyable_memory(unsigned 
 long total)
   */
  unsigned long determine_dirtyable_memory(void)
  {
 -   unsigned long x;
 -
 -   x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
 -
 +   unsigned long memcg_memory, memory;
 +
 +   memory = global_page_state(NR_FREE_PAGES) + 
 global_reclaimable_pages();
 +   memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
  +   if (memcg_memory > 0) {
 +   memcg_memory +=
 +   mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
  +   if (memcg_memory < memory)
 +   return memcg_memory;
 +   }
 if (!vm_highmem_is_dirtyable)
 -   x -= highmem_dirtyable_memory(x);
 +   memory -= highmem_dirtyable_memory(memory);
  
 -   return x + 1;   /* Ensure that we never return 0 */
 +   return memory + 1;  /* Ensure that we never return 0 */
  }
  
  void
 @@ -421,12 +428,13 @@ get_dirty_limits(unsigned long *pbackground, unsigned 
 long *pdirty,
  unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
  {
 unsigned long background;
 -   unsigned long dirty;
 +   unsigned long dirty, dirty_bytes;
 unsigned long available_memory = determine_dirtyable_memory();
 struct task_struct *tsk;
  
 -   if (vm_dirty_bytes)
 -   dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
 +   dirty_bytes = mem_cgroup_dirty_bytes();
 +   if (dirty_bytes)
 +   dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
 else {
 int dirty_ratio;
  
 @@ -505,9 +513,17 @@ static void balance_dirty_pages(struct address_space 
 *mapping,
  get_dirty_limits(&background_thresh, &dirty_thresh,
  &bdi_thresh, bdi);
  
 -   nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
 +   nr_reclaimable = mem_cgroup_page_state(MEMCG_NR_FILE_DIRTY);
 +   if (nr_reclaimable == 0) {
 +   nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
 global_page_state(NR_UNSTABLE_NFS);
 -   nr_writeback = global_page_state(NR_WRITEBACK);
 +   nr_writeback = global_page_state(NR_WRITEBACK);
 +   } else {
 +   nr_reclaimable +=
 +   mem_cgroup_page_state(MEMCG_NR_UNSTABLE_NFS);
 +   nr_writeback =
 +   mem_cgroup_page_state(MEMCG_NR_WRITEBACK);
 +   }
  
 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 @@ -660,6 +676,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 unsigned long dirty_thresh;
  
  for ( ; ; ) {
 +   unsigned long dirty;
 +
  get_dirty_limits(&background_thresh, &dirty_thresh, NULL, 
  NULL);
  
  /*
 @@ -668,10 +686,15 @@ void throttle_vm_writeout(gfp_t gfp_mask)
   */
  dirty_thresh += dirty_thresh / 10;  /* wh... */
  
 -if (global_page_state(NR_UNSTABLE_NFS) +
  -   global_page_state(NR_WRITEBACK) <= dirty_thresh)
 -   break;
 -congestion_wait(BLK_RW_ASYNC, HZ/10);
 +   dirty = mem_cgroup_page_state(MEMCG_NR_WRITEBACK);
  +   if (dirty < 0)
 +   dirty = global_page_state(NR_UNSTABLE_NFS) +
 +   global_page_state(NR_WRITEBACK);
 +   else
 +   dirty += mem_cgroup_page_state(MEMCG_NR_UNSTABLE_NFS);
  +   if (dirty <= dirty_thresh)
 +   break;
 +   congestion_wait(BLK_RW_ASYNC, HZ/10);
  
 /*
  * The caller might hold locks which can prevent IO 
 completion 


This stuff looks really rather horrible, 

Relying on these cgroup functions returning 0 seems fragile, some of
them can really be 0. Also sprinkling all that if cgroup foo all over
the place leads to these ugly indentation problems you have.

How about pulling all these things into separate functions, and using a
proper mem_cgroup_has_dirty() function to 

[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-02-21 Thread David Rientjes
On Sun, 21 Feb 2010, Andrea Righi wrote:

 diff --git a/mm/page-writeback.c b/mm/page-writeback.c
 index 0b19943..c9ff1cd 100644
 --- a/mm/page-writeback.c
 +++ b/mm/page-writeback.c
 @@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
   */
  static int calc_period_shift(void)
  {
 - unsigned long dirty_total;
 + unsigned long dirty_total, dirty_bytes;
  
 - if (vm_dirty_bytes)
 - dirty_total = vm_dirty_bytes / PAGE_SIZE;
 + dirty_bytes = mem_cgroup_dirty_bytes();
 + if (dirty_bytes)
 + dirty_total = dirty_bytes / PAGE_SIZE;
   else
   dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
   100;

This needs a comment since mem_cgroup_dirty_bytes() doesn't imply that it 
is responsible for returning the global vm_dirty_bytes when that's 
actually what it does (both for CONFIG_CGROUP_MEM_RES_CTLR=n and the root 
cgroup).


[Devel] Re: [PATCH 2/2] memcg: dirty pages instrumentation

2010-02-21 Thread KAMEZAWA Hiroyuki
On Sun, 21 Feb 2010 16:18:45 +0100
Andrea Righi ari...@develer.com wrote:

 Apply the cgroup dirty pages accounting and limiting infrastructure to
 the opportune kernel functions.
 
 Signed-off-by: Andrea Righi ari...@develer.com

I think there is a design confusion with the 1st patch.


 ---
  fs/fuse/file.c  |3 ++
  fs/nfs/write.c  |3 ++
  fs/nilfs2/segment.c |8 -
  mm/filemap.c|1 +
  mm/page-writeback.c |   69 --
  mm/truncate.c   |1 +
  6 files changed, 63 insertions(+), 22 deletions(-)
 
 diff --git a/fs/fuse/file.c b/fs/fuse/file.c
 index a9f5e13..357632a 100644
 --- a/fs/fuse/file.c
 +++ b/fs/fuse/file.c
 @@ -11,6 +11,7 @@
   #include <linux/pagemap.h>
   #include <linux/slab.h>
   #include <linux/kernel.h>
  +#include <linux/memcontrol.h>
   #include <linux/sched.h>
   #include <linux/module.h>
  
 @@ -1129,6 +1130,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, 
 struct fuse_req *req)
  
    list_del(&req->writepages_entry);
    dec_bdi_stat(bdi, BDI_WRITEBACK);
  + mem_cgroup_charge_dirty(req->pages[0], NR_WRITEBACK_TEMP, -1);

Here, you account dirty pages to the memcg which the page_cgroup belongs to,
not to the root cgroup of hierarchical accounting.


    dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
    bdi_writeout_inc(bdi);
    wake_up(&fi->page_waitq);
 @@ -1240,6 +1242,7 @@ static int fuse_writepage_locked(struct page *page)
    req->inode = inode;
   
    inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
 + mem_cgroup_charge_dirty(tmp_page, NR_WRITEBACK_TEMP, 1);

here too

   inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
   end_page_writeback(page);
  
 diff --git a/fs/nfs/write.c b/fs/nfs/write.c
 index d63d964..3d9de01 100644
 --- a/fs/nfs/write.c
 +++ b/fs/nfs/write.c
 @@ -439,6 +439,7 @@ nfs_mark_request_commit(struct nfs_page *req)
    req->wb_index,
    NFS_PAGE_TAG_COMMIT);
    spin_unlock(&inode->i_lock);
  + mem_cgroup_charge_dirty(req->wb_page, NR_UNSTABLE_NFS, 1);
    inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
    inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
    __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
  @@ -450,6 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req)
    struct page *page = req->wb_page;
   
    if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
  + mem_cgroup_charge_dirty(page, NR_UNSTABLE_NFS, -1);
    dec_zone_page_state(page, NR_UNSTABLE_NFS);
    dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
    return 1;
  @@ -1320,6 +1322,7 @@ nfs_commit_list(struct inode *inode, struct list_head 
  *head, int how)
    req = nfs_list_entry(head->next);
    nfs_list_remove_request(req);
    nfs_mark_request_commit(req);
  + mem_cgroup_charge_dirty(req->wb_page, NR_UNSTABLE_NFS, -1);
    dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
    dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
    BDI_RECLAIMABLE);
 diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
 index 105b508..b9ffac5 100644
 --- a/fs/nilfs2/segment.c
 +++ b/fs/nilfs2/segment.c
 @@ -1660,8 +1660,10 @@ nilfs_copy_replace_page_buffers(struct page *page, 
 struct list_head *out)
    } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
   kunmap_atomic(kaddr, KM_USER0);
  
 - if (!TestSetPageWriteback(clone_page))
 + if (!TestSetPageWriteback(clone_page)) {
 + mem_cgroup_charge_dirty(clone_page, NR_WRITEBACK, 1);
   inc_zone_page_state(clone_page, NR_WRITEBACK);
 + }
   unlock_page(clone_page);
  
   return 0;
 @@ -1788,8 +1790,10 @@ static void __nilfs_end_page_io(struct page *page, int 
 err)
   }
  
   if (buffer_nilfs_allocated(page_buffers(page))) {
 - if (TestClearPageWriteback(page))
 + if (TestClearPageWriteback(page)) {
 + mem_cgroup_charge_dirty(clone_page, NR_WRITEBACK, -1);
   dec_zone_page_state(page, NR_WRITEBACK);
 + }
   } else
   end_page_writeback(page);
  }
 diff --git a/mm/filemap.c b/mm/filemap.c
 index 698ea80..c19d809 100644
 --- a/mm/filemap.c
 +++ b/mm/filemap.c
 @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page)
* having removed the page entirely.
*/
    if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
  + mem_cgroup_charge_dirty(page, NR_FILE_DIRTY, -1);
    dec_zone_page_state(page, NR_FILE_DIRTY);
    dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
   }
 diff --git a/mm/page-writeback.c b/mm/page-writeback.c
 index 0b19943..c9ff1cd 100644
 --- a/mm/page-writeback.c
 +++ b/mm/page-writeback.c
 @@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;