mm: migrate_pages using

2007-03-11 Thread Michal Hocko
Hello.

I have some problems understanding migrate_pages. Here is my
situation:
I need to change a virtual page mapping (everything referring to the page)
to a new physical location. The page is present for the process and the new
page is already allocated (and not mapped anywhere). I thought that the
direct migration code should help me, but there are some issues. Maybe I am
missing something.

Here is my simple wrapper around migrate_pages() (and related functions):

struct page *return_target(struct page *page, unsigned long private,
			   int **result)
{
	return (struct page *)private;
}

/* I am keeping the non-exclusive mmap_sem when entering and leaving */
int migrate_pfn_page(struct page *original, struct page *target)
{
	LIST_HEAD(pagelist);
	int ret;

	if ((ret = migrate_prep()))
		goto done;
	get_page(original);			/* hold a ref across isolation */
	ret = isolate_lru_page(original, &pagelist);
	put_page(original);
	if (ret)
		goto done;
	ret = migrate_pages(&pagelist, return_target, (unsigned long)target);
done:
	return ret;
}

I think that I am losing pages this way (the reference count is not
decreased properly and so the original page never gets to the free list).
As an example (from printk output):
* before function starts.
original: flags:0x0060 mapping:dece8fa1 mapcount:1 count:1
target: flags:0x mapping: mapcount:0 count:1
* before migrate_pages is called
original flags:0x0040 mapping:dece8fa1 mapcount:1 count:2
target flags:0x mapping: mapcount:0 count:1
* migrate_pages returns with 0 and
original flags:0x mapping: mapcount:0 count:1
target flags:0x0010 mapping:dece8fa1 mapcount:1 count:2
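
(The dumps above come from a trivial printk helper along these lines;
just a sketch, field accessors as in 2.6.18:)

static void dump_page_state(const char *tag, struct page *page)
{
	printk(KERN_DEBUG "%s: flags:%#lx mapping:%p mapcount:%d count:%d\n",
	       tag, page->flags, page->mapping,
	       page_mapcount(page), page_count(page));
}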

When I try to put_page(original) (because I don't want it at the
moment) I hit the bad_page path and have to reboot...

What am I missing? Am I using migrate_pages correctly?

Thanks for any suggestions.

(please add me to Cc: because I am not a list member)

P.S.
I am using vanilla 2.6.18 source tree.

Best regards
-- 
Michal Hocko



Re: mm: migrate_pages using

2007-03-12 Thread Michal Hocko
On Mon, Mar 12, 2007 at 04:32:55PM +0900, KAMEZAWA Hiroyuki wrote:
 
  When I try to put_page(original) (because I don't want it at the
  moment) I hit the bad_page path and have to reboot...
  
  What am I missing? Am I using migrate_pages correctly?
  
 
 ==
 migrate_pages()               | refcnt = 2
   -> unmap_and_move()         | refcnt = 2
     -> try_to_unmap()         | refcnt = 1
     -> move_to_lru()          | refcnt = 1
       -> lru_cache_add()      | refcnt = 1
         -> page_cache_get()   | refcnt = 2
         -> pagevec_add()      | refcnt = 2
   -> page_cache_put()         | refcnt = 1
 ==

Thanks for the clarification.

 see pagevec codes and release_pages() in swap.c
 per-cpu pagevec caches freed pages for a while. 
 It will be pushed back to the free list when pagevec goes full.

I am not sure about that. When I looked inside lru_cache_add, it calls
__pagevec_lru_add once the per-cpu pagevec fills up, and that function
adds all of the pages to the zone specific inactive list.
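
For reference, lru_cache_add in 2.6.18 looks roughly like this (quoted
from memory, so take it as a sketch rather than the exact source):

void fastcall lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);

	page_cache_get(page);
	/* pagevec_add returns 0 once the pagevec is full */
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add(pvec);
	put_cpu_var(lru_add_pvecs);
}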

I want to prevent that because I need to use the original page for other
purposes and so it can't be on the inactive list. So I have made a small
change to the unmap_and_move function so that the original page is added
back to the LRU only if required. migrate_pages uses the variant with
put_lru=1 and I am using __migrate_pfn_page which uses put_lru=0. See the
attached patch.
What do you think about that? Is this approach correct?

(Please add me to CC, because I am not a list member)

Best regards
-- 
Michal Hocko
--- /usr/src/linux-vanilla/mm/migrate.c 2006-12-02 15:08:23.0 +0100
+++ /usr/src/linux/mm/migrate.c 2007-03-12 18:44:16.0 +0100
 
@@ -587,7 +595,7 @@ static int move_to_new_page(struct page 
  * to the newly allocated page in newpage.
  */
 static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-   struct page *page, int force)
+   struct page *page, int force, int put_lru)
 {
int rc = 0;
int *result = NULL;
@@ -631,10 +639,11 @@ unlock:
 	 * A page that has been migrated has all references
 	 * removed and will be freed. A page that has not been
 	 * migrated will have kept its references and be
-	 * restored.
+	 * restored. Don't push page to LRU unless wanted.
 */
 	list_del(&page->lru);
-	move_to_lru(page);
+	if (put_lru)
+		move_to_lru(page);
}
 
 move_newpage:
@@ -687,7 +696,7 @@ int migrate_pages(struct list_head *from
cond_resched();
 
 	rc = unmap_and_move(get_new_page, private,
-			page, pass > 2);
+			page, pass > 2, 1);
 
switch(rc) {
case -ENOMEM:
@@ -717,6 +726,111 @@ out:
return nr_failed + retry;
 }
 
+/* target page is stored in private parameter when used from
+ * migrate_pfn_page
+ */
+static struct page *return_target(struct page *page, unsigned long private,
+				  int **result)
+{
+	return (struct page *)private;
+}
+
+int __migrate_pfn_page(struct page *page, new_page_t get_new_page, unsigned long private)
+{
+	int rc = 0;
+	int pass;
+	int retry = 1;
+
+	for (pass = 0; pass < 10 && retry; pass++) {
+		retry = 0;
+		cond_resched();
+		rc = unmap_and_move(get_new_page, private, page, pass > 2, 0);
+		switch (rc) {
+		case -ENOMEM:
+			goto out;
+		case -EAGAIN:
+			retry = 1;
+			break;
+		case 0:
+			break;
+		default:
+			return 1;
+		}
+	}
+	if (rc)
+		move_to_lru(page);
+out:
+	return rc;
+}
+
+int migrate_pfn_page(struct page *original, struct page *target)
+{
+	LIST_HEAD(pagelist);
+	int ret;
+	int swapwrite = current->flags & PF_SWAPWRITE;
+
+	if (!swapwrite)
+		current->flags |= PF_SWAPWRITE;
+
+	/* this will fail if migration is not supported */
+	if ((ret = migrate_prep()))
+		goto done;
+	get_page(original);
+	ret = isolate_lru_page(original, &pagelist);
+	put_page(original);
+	if (ret)
+		goto done;
+	ret = __migrate_pfn_page(original, return_target, (unsigned long)target);
+	if (!ret)
+		page_cache_release(original);
+done:
+	if (!swapwrite)
+		current->flags &= ~PF_SWAPWRITE;
+	return ret;
+}
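
For context, the intended caller looks roughly like this (hypothetical
sketch only; "old" is the mapped page I want to replace):

	struct page *new = alloc_page(GFP_HIGHUSER);
	int err;

	if (!new)
		return -ENOMEM;
	err = migrate_pfn_page(old, new);
	if (err)
		__free_page(new);	/* migration failed, new page unused */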


Re: [V5 PATCH 08/26] memcontrol: use N_MEMORY instead N_HIGH_MEMORY

2012-10-29 Thread Michal Hocko
On Mon 29-10-12 13:40:39, David Rientjes wrote:
 On Mon, 29 Oct 2012, Michal Hocko wrote:
 
   N_HIGH_MEMORY stands for the nodes that have normal or high memory.
   N_MEMORY stands for the nodes that have any memory.
  
  What is the difference between those two?
  
 
 Patch 5 in the series 

Strange, I do not see that one at the mailing list.

 introduces it to be equal to N_HIGH_MEMORY, so 

So this is just a rename? If so, it would be much easier if it was
mentioned in the patch description.

 accepting this patch would be an implicit ack of the direction taken 
 there.

-- 
Michal Hocko
SUSE Labs


Re: [V5 PATCH 08/26] memcontrol: use N_MEMORY instead N_HIGH_MEMORY

2012-10-29 Thread Michal Hocko
On Mon 29-10-12 14:08:05, David Rientjes wrote:
 On Mon, 29 Oct 2012, Michal Hocko wrote:
 
 N_HIGH_MEMORY stands for the nodes that have normal or high memory.
 N_MEMORY stands for the nodes that have any memory.

What is the difference between those two?

   
   Patch 5 in the series 
  
  Strange, I do not see that one at the mailing list.
  
 
 http://marc.info/?l=linux-kernelm=135152595827692

Thanks!

   introduces it to be equal to N_HIGH_MEMORY, so 
  
  So this is just a rename? If so, it would be much easier if it was
  mentioned in the patch description.
  
 
 It's not even a rename even though it should be, it's adding yet another 
 node_states that is equal to N_HIGH_MEMORY since that state already 
 includes all memory.  

Which is really strange because I do not see any reason for yet another
alias if the follow-up patches rename all of them (I didn't try to apply
the whole series to check that so I might be wrong here).

 It's just a matter of taste but I think we should be renaming it
 instead of aliasing it (unless you actually want to make N_HIGH_MEMORY
 only include nodes with highmem, but nothing depends on that).

Agreed, I've always considered N_HIGH_MEMORY misleading and confusing so
renaming it would really make a lot of sense to me.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 3/6] memcg: Simplify mem_cgroup_force_empty_list error handling

2012-10-30 Thread Michal Hocko
On Mon 29-10-12 15:00:22, Andrew Morton wrote:
 On Mon, 29 Oct 2012 17:58:45 +0400
 Glauber Costa glom...@parallels.com wrote:
 
   + * move charges to its parent or the root cgroup if the group has no
   + * parent (aka use_hierarchy==0).
   + * Although this might fail (get_page_unless_zero, isolate_lru_page or
   + * mem_cgroup_move_account fails) the failure is always temporary and
   + * it signals a race with a page removal/uncharge or migration. In the
   + * first case the page is on the way out and it will vanish from the LRU
   + * on the next attempt and the call should be retried later.
   + * Isolation from the LRU fails only if page has been isolated from
   + * the LRU since we looked at it and that usually means either global
   + * reclaim or migration going on. The page will either get back to the
   + * LRU or vanish.
  
  I just wonder how long this can go on in the worst case?
 
 If the kernel is uniprocessor and the caller is SCHED_FIFO: ad infinitum!

You are right, if the rmdir (resp. echo > force_empty) running at SCHED_FIFO
races with a put_page (on a shared page) which gets preempted after
put_page_testzero and before __page_cache_release then we are screwed:

task A					task B (SCHED_FIFO)
put_page(page)
  put_page_testzero
  [preempted, page still on LRU]
					mem_cgroup_force_empty_list
					  page = list_entry(list->prev, struct page, lru);
					  mem_cgroup_move_parent(page)
					    get_page_unless_zero fails
					  cond_resched()
  [task A never scheduled again on UP]

The race window is really small but it is definitely possible. I am not
happy about this state and it should probably be mentioned in the
patch description, but I do not see any way around it (except for hacks
like sched_setscheduler for the current task which is, ehm...) and still
keep the do_not_fail contract here.
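
For clarity, the retry in question has roughly this shape (paraphrased;
try_move_to_parent is a stand-in for the real helpers, not a kernel
function):

	while (!list_empty(list)) {
		struct page *page = list_entry(list->prev, struct page, lru);

		/* fails while the racing put_page holds the last reference */
		if (!try_move_to_parent(page))
			/* never yields to the preempted task for a
			 * SCHED_FIFO caller on a UP machine */
			cond_resched();
	}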

Can we consider this a corner case (it is much easier to kill a
machine with SCHED_FIFO than this anyway) or is the concern strong
enough that we should come up with a solution before this can get merged?

-- 
Michal Hocko
SUSE Labs


Re: memcg/cgroup: do not fail fail on pre_destroy callbacks

2012-10-30 Thread Michal Hocko
On Mon 29-10-12 16:26:02, Tejun Heo wrote:
 Hello, Michal.
 
  Tejun is planning to build on top of that and make some more cleanups
  in the cgroup core (namely get rid of the whole retry code in
  cgroup_rmdir).
 
 I applied 1-3 to the following branch which is based on top of v3.6.
 
   git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
 cgroup-destroy-updates

Ok, Andrew dropped all the patches from his tree and I set up this
branch for automerging into the -mm git tree.

 I'll follow up with updates to the destroy path which will replace #4.
 #5 and #6 should be stackable on top.

Could you take care of them and apply those two on top of the first one
which guarantees that css_tryget fails and no new task can appear in the
group (aka #4 without the follow-up cleanups)? So that Andrew doesn't have
to care about them later.

Thanks!
-- 
Michal Hocko
SUSE Labs


Re: [V5 PATCH 08/26] memcontrol: use N_MEMORY instead N_HIGH_MEMORY

2012-10-31 Thread Michal Hocko
On Wed 31-10-12 15:03:36, Wen Congyang wrote:
 At 10/30/2012 04:46 AM, David Rientjes wrote:
  On Mon, 29 Oct 2012, Lai Jiangshan wrote:
[...]
  In one word, we need a N_MEMORY. We just introduce it as an alias to
  N_HIGH_MEMORY and fix all improper usages of N_HIGH_MEMORY in later
  patches.
 
  
  If this is really that problematic (and it appears it's not given that 
  there are many use cases of it and people tend to get it right), then why 
  not simply rename N_HIGH_MEMORY instead of introducing yet another 
  nodemask to the equation?
 
 The reason is that we need a node which only contains movable memory. This
 feature is very important for node hotplug. So we will add a new nodemask
 for movable memory. N_MEMORY contains movable memory but N_HIGH_MEMORY
 doesn't contain it.

OK, so the N_MOVABLE_MEMORY (or whatever you will call it) requires that
all the allocations on such a node be migratable?
How do you want to achieve that with the page_cgroup descriptors? (see
below)

On Mon 29-10-12 23:20:58, Lai Jiangshan wrote:
[...]
 diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
 index 5ddad0c..c1054ad 100644
 --- a/mm/page_cgroup.c
 +++ b/mm/page_cgroup.c
 @@ -271,7 +271,7 @@ void __init page_cgroup_init(void)
   if (mem_cgroup_disabled())
   return;
  
 - for_each_node_state(nid, N_HIGH_MEMORY) {
 + for_each_node_state(nid, N_MEMORY) {
   unsigned long start_pfn, end_pfn;
  
   start_pfn = node_start_pfn(nid);

This will call init_section_page_cgroup(pfn, nid) later which allocates
page_cgroup descriptors which are not movable. Or is there any code in
your patchset that handles this?
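
To make the concern concrete, the init path effectively does something
like this (paraphrased; the exact allocation helpers differ between
versions, so treat this as a sketch):

	int nid;

	for_each_node_state(nid, N_MEMORY) {
		size_t table_size = sizeof(struct page_cgroup) *
					node_spanned_pages(nid);
		/* a regular kernel allocation: not MIGRATE_MOVABLE, so it
		 * pins the node and defeats hot-removability */
		void *base = alloc_pages_exact_nid(nid, table_size,
						   GFP_KERNEL);
		if (!base)
			panic("page_cgroup allocation failed\n");
	}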
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 1/8] cgroup: kill cgroup_subsys-__DEPRECATED_clear_css_refs

2012-10-31 Thread Michal Hocko
On Tue 30-10-12 21:22:38, Tejun Heo wrote:
 2ef37d3fe4 (memcg: Simplify mem_cgroup_force_empty_list error
 handling) removed the last user of __DEPRECATED_clear_css_refs.  This
 patch removes __DEPRECATED_clear_css_refs and mechanisms to support
 it.
 
 * Conditionals dependent on __DEPRECATED_clear_css_refs removed.
 
 * ->pre_destroy() now can only fail if a new task is attached or a child
   cgroup is created while ->pre_destroy()s are being called.  As the
   condition is checked again after re-acquiring cgroup_mutex
   afterwards, we don't need to take any immediate action on
   ->pre_destroy() failures.  This reduces cgroup_call_pre_destroy() to
   a simple loop surrounding ->pre_destroy().  Remove
   cgroup_call_pre_destroy() and open-code the loop into
   cgroup_rmdir().
 
 * cgroup_clear_css_refs() can no longer fail.  All that needs to be
   done are deactivating refcnts, setting CSS_REMOVED and putting the
   base reference on each css.  Remove cgroup_clear_css_refs() and the
   failure path, and open-code the loops into cgroup_rmdir().
 
 Note that cgroup_rmdir() will see more cleanup soon.
 
 Signed-off-by: Tejun Heo t...@kernel.org

Looks good to me and the diffstat is really encouraging.
Reviewed-by: Michal Hocko mho...@suse.cz

with a minor note below
 ---
  include/linux/cgroup.h |  12 
  kernel/cgroup.c        | 159 -
  2 files changed, 38 insertions(+), 133 deletions(-)
[...]
 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
 index 7981850..033bf4b 100644
 --- a/kernel/cgroup.c
 +++ b/kernel/cgroup.c
[...]
 @@ -4168,11 +4075,9 @@ again:
 	 * Call pre_destroy handlers of subsys. Notify subsystems
 	 * that rmdir() request comes.
 	 */
 -	ret = cgroup_call_pre_destroy(cgrp);
 -	if (ret) {
 -		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 -		return ret;
 -	}
 +	for_each_subsys(cgrp->root, ss)
 +		if (ss->pre_destroy)
 +			WARN_ON_ONCE(ss->pre_destroy(cgrp));

Hmm, I am not sure I like this WARN_ON_ONCE. First, it can happen for
more than one controller and second, we could just clear CGRP_WAIT_ON_RMDIR
and return with EBUSY. The only possible failure at the moment is when a
new task or a child group appears.
I know it is not a big deal because it will disappear later in the
series but it would be more readable IMO.
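
Something along these lines is what I have in mind (sketch only):

	for_each_subsys(cgrp->root, ss)
		if (ss->pre_destroy) {
			int ret = ss->pre_destroy(cgrp);

			if (ret) {
				clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
				return ret;	/* typically -EBUSY */
			}
		}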

Thanks!
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 2/8] cgroup: kill CSS_REMOVED

2012-10-31 Thread Michal Hocko
On Tue 30-10-12 21:22:39, Tejun Heo wrote:
 CSS_REMOVED is one of the several contortions which were necessary to
 support css reference draining on cgroup removal.  All css->refcnts
 which need draining should be deactivated and verified to equal zero
 atomically w.r.t. css_tryget().  If any one isn't zero, all refcnts
 needed to be re-activated and css_tryget() shouldn't fail in the
 process.
 
 This was achieved by letting css_tryget() busy-loop until either the
 refcnt is reactivated (failed removal attempt) or CSS_REMOVED is set
 (committing to removal).
 
 Now that css refcnt draining is no longer used, there's no need for
 the atomic rollback mechanism.  css_tryget() can simply look at the
 reference count and fail if it's deactivated - it's never getting
 re-activated.
 
 This patch removes CSS_REMOVED and updates __css_tryget() to fail if
 the refcnt is deactivated.
 
 Note that this removes css_is_removed() whose only user is VM_BUG_ON()
 in memcontrol.c.  We can replace it with a check on the refcnt but
 given that the only use case is a debug assert, I think it's better to
 simply unexport it.
 
 Signed-off-by: Tejun Heo t...@kernel.org
 Cc: Johannes Weiner han...@cmpxchg.org
 Cc: Michal Hocko mho...@suse.cz
 Cc: Balbir Singh bsinghar...@gmail.com
 Cc: KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com

Few questions/suggestions below but it looks good in general to me.
Reviewed-by: Michal Hocko mho...@suse.cz

 ---
  include/linux/cgroup.h |  6 --
  kernel/cgroup.c| 31 ---
  mm/memcontrol.c|  7 +++
  3 files changed, 15 insertions(+), 29 deletions(-)
 
[...]
 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
 index 033bf4b..a49cdbc 100644
 --- a/kernel/cgroup.c
 +++ b/kernel/cgroup.c
 @@ -170,8 +170,8 @@ struct css_id {
* The css to which this ID points. This pointer is set to valid value
* after cgroup is populated. If cgroup is removed, this will be NULL.
* This pointer is expected to be RCU-safe because destroy()
 -  * is called after synchronize_rcu(). But for safe use, css_is_removed()
 -  * css_tryget() should be used for avoiding race.
 +  * is called after synchronize_rcu(). But for safe use, css_tryget()
 +  * should be used for avoiding race.
*/
   struct cgroup_subsys_state __rcu *css;
   /*
 @@ -4088,8 +4088,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
   }
 	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
  
 - local_irq_disable();
 -

OK, so new charges shouldn't come from IRQ context, so we cannot
race with css_tryget, but why did we need this in the first place?
A separate patch which removes this with an explanation would be nice.

 	/* block new css_tryget() by deactivating refcnt */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
[...]
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 5a1d584..6f8bd9d 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -2343,7 +2343,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
  again:
 	if (*ptr) { /* css should be a valid one */
 		memcg = *ptr;
 -		VM_BUG_ON(css_is_removed(&memcg->css));

All the callers seem to be fine but this was a safety net that something
didn't leak out. Can we keep it and test that the reference counter has
been disabled already (css_refcnt(&memcg->css) < 0 - I do not care
whether open coded or wrapped inside css_is_removed, albeit a helper
sounds nicer)?
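
Something like this is what I mean (sketch; it relies on the raw counter
going negative once CSS_DEACT_BIAS has been applied):

	VM_BUG_ON(atomic_read(&memcg->css.refcnt) < 0);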

 	if (mem_cgroup_is_root(memcg))
 		goto done;
 	if (nr_pages == 1 && consume_stock(memcg))
 @@ -2483,9 +2482,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
  
  /*
   * A helper function to get mem_cgroup from ID. must be called under
 - * rcu_read_lock(). The caller must check css_is_removed() or some if
 - * it's concern. (dropping refcnt from swap can be called against removed
 - * memcg.)
 + * rcu_read_lock(). The caller is responsible for verifying the returned
 + * memcg is still alive if necessary. (dropping refcnt from swap can be
 + * called against removed memcg.)

I think that something like the following would be more instructive:

+ * rcu_read_lock(). The caller is responsible for calling css_tryget
+ * if the mem_cgroup is used for charging. (dropping refcnt from swap can be
+ * called against removed memcg.)

Thanks!
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 3/8] cgroup: use cgroup_lock_live_group(parent) in cgroup_create()

2012-10-31 Thread Michal Hocko
On Tue 30-10-12 21:22:40, Tejun Heo wrote:
 This patch makes cgroup_create() fail if @parent is marked removed.
 This is to prepare for further updates to cgroup_rmdir() path.
 
 Note that this change isn't strictly necessary.  cgroup can only be
 created via mkdir and the removed marking and dentry removal happen
 without releasing cgroup_mutex, so cgroup_create() can never race with
 cgroup_rmdir().  Even after the scheduled updates to cgroup_rmdir(),
 cgroup_mkdir() and cgroup_rmdir() are synchronized by i_mutex
 rendering the added liveliness check unnecessary.
 
 Do it anyway such that locking is contained inside cgroup proper and
 we don't get nasty surprises if we ever grow another caller of
 cgroup_create().
 
 Signed-off-by: Tejun Heo t...@kernel.org

Looks good. Just a nit below
Reviewed-by: Michal Hocko mho...@suse.cz

 ---
  kernel/cgroup.c | 16 +---
  1 file changed, 13 insertions(+), 3 deletions(-)
 
 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
 index a49cdbc..b3010ae 100644
 --- a/kernel/cgroup.c
 +++ b/kernel/cgroup.c
 @@ -3906,6 +3906,18 @@ static long cgroup_create(struct cgroup *parent, 
 struct dentry *dentry,
   if (!cgrp)
   return -ENOMEM;
  
 + /*
 +  * Only live parents can have children.  Note that the liveliness
 +  * check isn't strictly necessary because cgroup_mkdir() and
 +  * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
 +  * anyway so that locking is contained inside cgroup proper and we
 +  * don't get nasty surprises if we ever grow another caller.
 +  */
 + if (!cgroup_lock_live_group(parent)) {
 + err = -ENODEV;
 + goto err_free;
 + }
 +

I think this should be moved up before we try to allocate any memory.
Or is your motivation to keep the cgroup_lock held for a shorter time?
I could agree with that but a small comment would be helpful.

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 4/8] cgroup: deactivate CSS's and mark cgroup dead before invoking -pre_destroy()

2012-10-31 Thread Michal Hocko
On Tue 30-10-12 21:22:41, Tejun Heo wrote:
 Because ->pre_destroy() could fail and can't be called under
 cgroup_mutex, cgroup destruction did something very ugly.

You are referring to a commit in the comment but I would rather see it
here.

   1. Grab cgroup_mutex and verify it can be destroyed; fail otherwise.
 
   2. Release cgroup_mutex and call ->pre_destroy().
 
   3. Re-grab cgroup_mutex and verify it can still be destroyed; fail
  otherwise.
 
   4. Continue destroying.
 
 In addition to being ugly, it has always been broken in various ways.
 For example, memcg ->pre_destroy() expects the cgroup to be inactive
 after it's done but tasks can be attached and detached between #2 and
 #3 and the conditions that memcg verified in ->pre_destroy() might no
 longer hold by the time control reaches #3.
 
 Now that ->pre_destroy() is no longer allowed to fail, we can switch
 to the following.
 
   1. Grab cgroup_mutex and fail if it can't be destroyed; fail
  otherwise.

the other fail is superfluous and too negative ;)

   2. Deactivate CSS's and mark the cgroup removed thus preventing any
  further operations which can invalidate the verification from #1.
 
   3. Release cgroup_mutex and call ->pre_destroy().
 
   4. Re-grab cgroup_mutex and continue destroying.
 
 After this change, controllers can safely assume that ->pre_destroy()
 will only be called once for a given cgroup and, once
 ->pre_destroy() is called, the cgroup will stay dormant till it's
 destroyed.
 
 Signed-off-by: Tejun Heo t...@kernel.org

Reviewed-by: Michal Hocko mho...@suse.cz

 ---
  kernel/cgroup.c | 41 +++--
  1 file changed, 19 insertions(+), 22 deletions(-)
 
 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
 index b3010ae..66204a6 100644
 --- a/kernel/cgroup.c
 +++ b/kernel/cgroup.c
 @@ -4058,18 +4058,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cgroup_event *event, *tmp;
 	struct cgroup_subsys *ss;
 
 -	/* the vfs holds both inode->i_mutex already */
 -	mutex_lock(&cgroup_mutex);
 -	if (atomic_read(&cgrp->count) != 0) {
 -		mutex_unlock(&cgroup_mutex);
 -		return -EBUSY;
 -	}
 -	if (!list_empty(&cgrp->children)) {
 -		mutex_unlock(&cgroup_mutex);
 -		return -EBUSY;
 -	}
 -	mutex_unlock(&cgroup_mutex);
 -
 	/*
 	 * In general, subsystem has no css->refcnt after pre_destroy(). But
 	 * in racy cases, subsystem may have to get css->refcnt after
 @@ -4081,14 +4069,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	 */
 	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 
 -	/*
 -	 * Call pre_destroy handlers of subsys. Notify subsystems
 -	 * that rmdir() request comes.
 -	 */
 -	for_each_subsys(cgrp->root, ss)
 -		if (ss->pre_destroy)
 -			WARN_ON_ONCE(ss->pre_destroy(cgrp));
 -
 +	/* the vfs holds both inode->i_mutex already */
 	mutex_lock(&cgroup_mutex);
 	parent = cgrp->parent;
 	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
 @@ -4098,13 +4079,30 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	}
 	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
 
 -	/* block new css_tryget() by deactivating refcnt */
 +	/*
 +	 * Block new css_tryget() by deactivating refcnt and mark @cgrp
 +	 * removed.  This makes future css_tryget() and child creation
 +	 * attempts fail thus maintaining the removal conditions verified
 +	 * above.
 +	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 
 		WARN_ON(atomic_read(&css->refcnt) < 0);
 		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
 	}
 +	set_bit(CGRP_REMOVED, &cgrp->flags);
 +
 +	/*
 +	 * Tell subsystems to initiate destruction.  pre_destroy() should be
 +	 * called with cgroup_mutex unlocked.  See 3fa59dfbc3 (cgroup: fix
 +	 * potential deadlock in pre_destroy) for details.
 +	 */
 +	mutex_unlock(&cgroup_mutex);
 +	for_each_subsys(cgrp->root, ss)
 +		if (ss->pre_destroy)
 +			WARN_ON_ONCE(ss->pre_destroy(cgrp));
 +	mutex_lock(&cgroup_mutex);
 
 	/*
 	 * Put all the base refs.  Each css holds an extra reference to the
 @@ -4120,7 +4118,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 
 	raw_spin_lock(&release_list_lock);
 -	set_bit(CGRP_REMOVED, &cgrp->flags);
 	if (!list_empty(&cgrp->release_list))
 		list_del_init(&cgrp->release_list);
 	raw_spin_unlock(&release_list_lock);
 -- 
 1.7.11.7
 

-- 
Michal Hocko
SUSE Labs

Re: [PATCH 5/8] cgroup: remove CGRP_WAIT_ON_RMDIR, cgroup_exclude_rmdir() and cgroup_release_and_wakeup_rmdir()

2012-10-31 Thread Michal Hocko
On Tue 30-10-12 21:22:42, Tejun Heo wrote:
 CGRP_WAIT_ON_RMDIR is another kludge which was added to make cgroup
 destruction rollback somewhat working.  cgroup_rmdir() used to drain
 CSS references and CGRP_WAIT_ON_RMDIR and the associated waitqueue and
 helpers were used to allow the task performing rmdir to wait for the
 next relevant event.
 
 Unfortunately, the wait is visible to controllers too and the
 mechanism got exposed to memcg by 887032670d (cgroup avoid permanent
 sleep at rmdir).
 
 Now that the draining and retries are gone, CGRP_WAIT_ON_RMDIR is
 unnecessary.  Remove it and all the mechanisms supporting it.  Note
 that memcontrol.c changes are essentially revert of 887032670d
 (cgroup avoid permanent sleep at rmdir).
 
 Signed-off-by: Tejun Heo t...@kernel.org
 Cc: Michal Hocko mho...@suse.cz
 Cc: Balbir Singh bsinghar...@gmail.com
 Cc: KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com

Reviewed-by: Michal Hocko mho...@suse.cz

 ---
  include/linux/cgroup.h | 21 -
  kernel/cgroup.c        | 51 --
  mm/memcontrol.c        | 24 +---
  3 files changed, 1 insertion(+), 95 deletions(-)

/me likes this

 
 diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
 index a309804..47868a8 100644
 --- a/include/linux/cgroup.h
 +++ b/include/linux/cgroup.h
 @@ -145,10 +145,6 @@ enum {
   /* Control Group requires release notifications to userspace */
   CGRP_NOTIFY_ON_RELEASE,
   /*
 -  * A thread in rmdir() is waiting for this cgroup.
 -  */
 - CGRP_WAIT_ON_RMDIR,
 - /*
* Clone cgroup values when creating a new child cgroup
*/
   CGRP_CLONE_CHILDREN,
 @@ -412,23 +408,6 @@ int cgroup_task_count(const struct cgroup *cgrp);
  int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct 
 *task);
  
  /*
 - * When the subsys has to access css and may add permanent refcnt to css,
 - * it should take care of racy conditions with rmdir(). Following set of
 - * functions, is for stop/restart rmdir if necessary.
 - * Because these will call css_get/put, css should be alive css.
 - *
 - *  cgroup_exclude_rmdir();
 - *  ...do some jobs which may access arbitrary empty cgroup
 - *  cgroup_release_and_wakeup_rmdir();
 - *
 - *  When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
 - *  it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up.
 - */
 -
 -void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
 -void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
 -
 -/*
   * Control Group taskset, used to pass around set of tasks to cgroup_subsys
   * methods.
   */
 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
 index 66204a6..c5f6fb2 100644
 --- a/kernel/cgroup.c
 +++ b/kernel/cgroup.c
 @@ -966,33 +966,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
  }
  
  /*
 - * A queue for waiters to do rmdir() cgroup. A task will sleep when
 - * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
 - * reference to css->refcnt. In general, this refcnt is expected to go down
 - * to zero, soon.
 - *
 - * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
 - */
 -static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
 -
 -static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
 -{
 -	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
 -		wake_up_all(&cgroup_rmdir_waitq);
 -}
 -
 -void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
 -{
 -	css_get(css);
 -}
 -
 -void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
 -{
 -	cgroup_wakeup_rmdir_waiter(css->cgroup);
 -	css_put(css);
 -}
 -
 -/*
   * Call with cgroup_mutex held. Drops reference counts on modules, including
   * any duplicate ones that parse_cgroupfs_options took. If this function
   * returns an error, no reference counts are touched.
 @@ -1963,12 +1936,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
   }
  
   synchronize_rcu();
 -
 -	/*
 -	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
 -	 * is no longer empty.
 -	 */
 -	cgroup_wakeup_rmdir_waiter(cgrp);
  out:
   if (retval) {
   for_each_subsys(root, ss) {
 @@ -2138,7 +2105,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
* step 5: success! and cleanup
*/
   synchronize_rcu();
 - cgroup_wakeup_rmdir_waiter(cgrp);
   retval = 0;
  out_put_css_set_refs:
   if (retval) {
 @@ -4058,26 +4024,13 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
   struct cgroup_event *event, *tmp;
   struct cgroup_subsys *ss;
  
 - /*
 -  * In general, subsystem has no css->refcnt after pre_destroy(). But
 -  * in racy cases, subsystem may have to get css->refcnt after
 -  * pre_destroy() and it makes rmdir return with -EBUSY

Re: [PATCH 8/8] cgroup: make -pre_destroy() return void

2012-10-31 Thread Michal Hocko
On Tue 30-10-12 21:22:45, Tejun Heo wrote:
 All ->pre_destroy() implementations return 0 now, which is the only
 allowed return value.  Make it return void.
 
 Signed-off-by: Tejun Heo t...@kernel.org
 Cc: Michal Hocko mho...@suse.cz
 Cc: Balbir Singh bsinghar...@gmail.com
 Cc: KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com
 Cc: Vivek Goyal vgo...@redhat.com

Acked-by: Michal Hocko mho...@suse.cz

 ---
  block/blk-cgroup.c     | 3 +--
  include/linux/cgroup.h | 2 +-
  kernel/cgroup.c        | 2 +-
  mm/hugetlb_cgroup.c    | 4 +---
  mm/memcontrol.c        | 3 +--
  5 files changed, 5 insertions(+), 9 deletions(-)
 
 diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
 index f3b44a6..a7816f3 100644
 --- a/block/blk-cgroup.c
 +++ b/block/blk-cgroup.c
 @@ -600,7 +600,7 @@ struct cftype blkcg_files[] = {
   *
   * This is the blkcg counterpart of ioc_release_fn().
   */
 -static int blkcg_pre_destroy(struct cgroup *cgroup)
 +static void blkcg_pre_destroy(struct cgroup *cgroup)
  {
   struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
  
 @@ -622,7 +622,6 @@ static int blkcg_pre_destroy(struct cgroup *cgroup)
   }
  
 	spin_unlock_irq(&blkcg->lock);
 - return 0;
  }
  
  static void blkcg_destroy(struct cgroup *cgroup)
 diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
 index 47868a8..adb2adc 100644
 --- a/include/linux/cgroup.h
 +++ b/include/linux/cgroup.h
 @@ -436,7 +436,7 @@ int cgroup_taskset_size(struct cgroup_taskset *tset);
  
  struct cgroup_subsys {
   struct cgroup_subsys_state *(*create)(struct cgroup *cgrp);
 - int (*pre_destroy)(struct cgroup *cgrp);
 + void (*pre_destroy)(struct cgroup *cgrp);
   void (*destroy)(struct cgroup *cgrp);
   int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
   void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
 index c5f6fb2..83cd7d0 100644
 --- a/kernel/cgroup.c
 +++ b/kernel/cgroup.c
 @@ -4054,7 +4054,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	mutex_unlock(&cgroup_mutex);
 	for_each_subsys(cgrp->root, ss)
 		if (ss->pre_destroy)
 -			WARN_ON_ONCE(ss->pre_destroy(cgrp));
 +			ss->pre_destroy(cgrp);
 	mutex_lock(&cgroup_mutex);
  
   /*
 diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
 index dc595c6..0d3a1a3 100644
 --- a/mm/hugetlb_cgroup.c
 +++ b/mm/hugetlb_cgroup.c
 @@ -155,7 +155,7 @@ out:
   * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
   * the parent cgroup.
   */
 -static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
 +static void hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
  {
   struct hstate *h;
   struct page *page;
 @@ -172,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
   }
   cond_resched();
   } while (hugetlb_cgroup_have_usage(cgroup));
 -
 - return 0;
  }
  
  int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 47c4680..af05a60 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -5002,12 +5002,11 @@ free_out:
   return ERR_PTR(error);
  }
  
 -static int mem_cgroup_pre_destroy(struct cgroup *cont)
 +static void mem_cgroup_pre_destroy(struct cgroup *cont)
  {
   struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
  
   mem_cgroup_reparent_charges(memcg);
 - return 0;
  }
  
  static void mem_cgroup_destroy(struct cgroup *cont)
 -- 
 1.7.11.7
 

-- 
Michal Hocko
SUSE Labs


Re: [PATCHSET] cgroup: simplify cgroup removal path

2012-10-31 Thread Michal Hocko
Thanks for this cleanup. The code looks much better now and:
$ git diff --stat mmotm...tj-cgroups/review-cgroup-rmdir-updates
 block/blk-cgroup.c     |    3 +-
 include/linux/cgroup.h |   41 +---
 kernel/cgroup.c        |  256 
 mm/hugetlb_cgroup.c    |   11 +--
 mm/memcontrol.c        |   51 ++
 5 files changed, 75 insertions(+), 287 deletions(-)

speaks for itself.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 1/8] cgroup: kill cgroup_subsys-__DEPRECATED_clear_css_refs

2012-10-31 Thread Michal Hocko
On Wed 31-10-12 09:41:23, Tejun Heo wrote:
 Hey, Michal.
 
 On Wed, Oct 31, 2012 at 03:37:51PM +0100, Michal Hocko wrote:
   + for_each_subsys(cgrp->root, ss)
   + 	if (ss->pre_destroy)
   + 		WARN_ON_ONCE(ss->pre_destroy(cgrp));
  
  Hmm, I am not sure I like this WARN_ON_ONCE. First it can happen for
  more than one controller and second we can just clear CGRP_WAIT_ON_RMDIR
  and return with EBUSY. The only possible failure at the moment is when a
  new task or a child group appear.
  I know it is not a big deal because it will disappear later in the
  series but it would be more readable IMO.
 
 The WARN_ON_ONCE() is just moved from the original
 cgroup_call_pre_destroy().  We can add an error out there but that
 makes future changes difficult.  It's a chicken and egg problem.  You
 gotta break the loop somewhere.

I do not think this is that hard. You can simply replace the return path
on pre_destroy error with a BUG_ON in "cgroup: deactivate CSS's and mark
cgroup dead before invoking ->pre_destroy()". There is no chicken-and-egg
problem here because once the group is marked dead and the css frozen,
the existing callbacks cannot possibly fail.
Or am I missing your point?
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 1/8] cgroup: kill cgroup_subsys-__DEPRECATED_clear_css_refs

2012-10-31 Thread Michal Hocko
On Wed 31-10-12 11:16:24, Tejun Heo wrote:
 2ef37d3fe4 (memcg: Simplify mem_cgroup_force_empty_list error
 handling) removed the last user of __DEPRECATED_clear_css_refs.  This
 patch removes __DEPRECATED_clear_css_refs and mechanisms to support
 it.
 
 * Conditionals dependent on __DEPRECATED_clear_css_refs removed.
 
  * ->pre_destroy() now can only fail if a new task is attached or child
    cgroup is created while ->pre_destroy()s are being called.  As the
    condition is checked again after re-acquiring cgroup_mutex
    afterwards, we don't need to take any immediate action on
    ->pre_destroy() failures.  

Well, this is racy because a task can exit before we reach the
re-check. As a result there might still be some pages left on the memcg
LRUs. As I said in the previous version, I do not see any reason why we
shouldn't just return EBUSY here. I would even skip the WARN_ON_ONCE
because it doesn't give us any valuable information. One can trigger it
easily as well.

[...]
 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
 index 7981850..033bf4b 100644
 --- a/kernel/cgroup.c
 +++ b/kernel/cgroup.c
[...]
 @@ -4168,11 +4075,9 @@ again:
 	 * Call pre_destroy handlers of subsys. Notify subsystems
 	 * that rmdir() request comes.
 	 */
 -	ret = cgroup_call_pre_destroy(cgrp);
 -	if (ret) {
 -		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 -		return ret;
 -	}
 +	for_each_subsys(cgrp->root, ss)
 +		if (ss->pre_destroy)
 +			WARN_ON_ONCE(ss->pre_destroy(cgrp));

	for_each_subsys(cgrp->root, ss)
		if (ss->pre_destroy) {
			int ret = ss->pre_destroy(cgrp);

			if (ret) {
				clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
				return ret;
			}
		}
 
 	mutex_lock(&cgroup_mutex);
 	parent = cgrp->parent;
[...]
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 2/8] cgroup: kill CSS_REMOVED

2012-10-31 Thread Michal Hocko
On Wed 31-10-12 09:57:39, Tejun Heo wrote:
 Hey, Michal.
 
 On Wed, Oct 31, 2012 at 04:39:26PM +0100, Michal Hocko wrote:
 	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);

   -	local_irq_disable();
   -
  
  OK, so the new charges shouldn't come from the IRQ context so we cannot
  race with css_tryget but why did we need this in the first place?
  A separate patch which removes this with an explanation would be nice.
 
  The change is actually tied to this one.  Because css_tryget() busy
  loops on DEACT_BIAS && !CSS_REMOVED and css_tryget() may happen from
  an IRQ context, we need to disable IRQ while deactivating refcnts and
  setting CSS_REMOVED.  I'll mention it in the commit message.

OK

   @@ -2343,7 +2343,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
    again:
   	if (*ptr) { /* css should be a valid one */
   		memcg = *ptr;
   -		VM_BUG_ON(css_is_removed(&memcg->css));
  
  All the callers seem to be fine but this was a safety net that something
  didn't leak out. Can we keep it and test that the reference counter has
  been disabled already (css_refcnt(&memcg->css) < 0 - I do not care
  whether open coded or wrapped inside css_is_removed, albeit a helper
  sounds nicer)?
 
 I don't think that's a good idea.  In general, I think too much of
 cgroup internals are exposed to controllers.  People try to implement
 weird behaviors and expose cgroup internals for that, which in turn
 attracts more weirdness, and there seems to be a pattern - cgroup core
 is unnecessarily coupled with VFS locking like controllers are
 unnecessarily coupled with cgroup internal locking.  I really wanna
 move away from such pattern.  I mean, you can't even know
 css_is_removed() isn't gonna change while the function is in progress.

Sure, it cannot detect races but that wasn't the intention of the check.
The more important part is that the memcg can outlive its css and we want
to catch bugs where we try to charge an already dead memcg.

[...]
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 1/8] cgroup: kill cgroup_subsys-__DEPRECATED_clear_css_refs

2012-10-31 Thread Michal Hocko
On Wed 31-10-12 12:44:03, Tejun Heo wrote:
[...]
 local_irq_save/restore() from cgroup_clear_css_refs() are replaced
 with local_irq_disable/enable() for simplicity.  This is safe as
 cgroup_rmdir() is always called with IRQ enabled.  Note that this IRQ
 switching is necessary to make CSS deactivation and CSS_REMOVED
 assertion atomic against css_tryget() and will be removed by future
 changes.

local_irq_disable doesn't guarantee atomicity, because other CPUs will
see the change in steps, so this is a bit misleading. The real reason
AFAICS is that we do not want to hang in css_tryget from IRQ context
(does this really happen btw.?) which could interrupt cgroup_rmdir
between the point we add CSS_DEACT_BIAS and the point the group is marked
CGRP_REMOVED. Or am I still missing the point?
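
For reference, the busy loop I mean is in __css_tryget; paraphrased from
memory, so take it as a sketch:

	while (!atomic_inc_not_zero(&css->refcnt)) {
		if (test_bit(CSS_REMOVED, &css->flags))
			return false;	/* removal committed */
		/* spins forever if the task between CSS_DEACT_BIAS and
		 * CSS_REMOVED never runs again on this CPU */
		cpu_relax();
	}
	return true;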
[...]
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 1/8] cgroup: kill cgroup_subsys-__DEPRECATED_clear_css_refs

2012-10-31 Thread Michal Hocko
On Wed 31-10-12 12:44:03, Tejun Heo wrote:
 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
 index 7981850..8c605e2 100644
 --- a/kernel/cgroup.c
 +++ b/kernel/cgroup.c
 @@ -865,11 +865,8 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 			continue;
 
 		ret = ss->pre_destroy(cgrp);
 -		if (ret) {
 -			/* ->pre_destroy() failure is being deprecated */
 -			WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
 +		if (WARN_ON_ONCE(ret))
 			break;
 -		}
 	}
 
 	return ret;

And sorry for being so annoying about this WARN_ON_ONCE but I really
don't see any reason for it. A pre_destroy failure can still happen and
now it is even reduced to a reasonable condition (somebody showed up). So
I would put it out of the way as it doesn't give us anything and, as I've
already mentioned, one can trigger it really easily.

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 1/8] cgroup: kill cgroup_subsys-__DEPRECATED_clear_css_refs

2012-10-31 Thread Michal Hocko
On Wed 31-10-12 13:11:02, Tejun Heo wrote:
 Hello,
 
 On Wed, Oct 31, 2012 at 1:08 PM, Michal Hocko mho...@suse.cz wrote:
  local_irq_disable doesn't guarantee atomicity, because other CPUs will
 
 Maybe it should say atomicity on the local CPU.

That would be more clear but being more verbose doesn't hurt either :P

  see the change in steps, so this is a bit misleading. The real reason
  AFAICS is that we do not want to hang in css_tryget from IRQ context
  (does this really happen btw.?) which could interrupt cgroup_rmdir
  between the point we add CSS_DEACT_BIAS and the point the group is
  marked CGRP_REMOVED. Or am I still missing the point?
 
 Yeah, that's the correct one. We don't want tryget happening between
 DEACT_BIAS and REMOVED as it can hang forever there.

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 1/8] cgroup: kill cgroup_subsys-__DEPRECATED_clear_css_refs

2012-10-31 Thread Michal Hocko
On Wed 31-10-12 13:24:00, Tejun Heo wrote:
[...]
 local_irq_save/restore() from cgroup_clear_css_refs() are replaced
 with local_irq_disable/enable() for simplicity.  This is safe as
 cgroup_rmdir() is always called with IRQ enabled.  Note that this IRQ
 switching is necessary to ensure that css_tryget() isn't called from
 IRQ context on the same CPU while lower context is between CSS
 deactivation and setting CSS_REMOVED as css_tryget() would hang
 forever in such cases waiting for CSS to be re-activated or
 CSS_REMOVED set.  This will go away soon.

Much better. Thanks a lot!
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 1/8] cgroup: kill cgroup_subsys-__DEPRECATED_clear_css_refs

2012-10-31 Thread Michal Hocko
On Wed 31-10-12 13:14:07, Tejun Heo wrote:
 Hey, Michal.
 
 On Wed, Oct 31, 2012 at 1:12 PM, Michal Hocko mho...@suse.cz wrote:
  And sorry for being so annoying about this WARN_ON_ONCE but I really
  don't see any reason for it. A pre_destroy failure can still happen and
  now it is even reduced to a reasonable condition (somebody showed up).
  So I would put it out of the way as it doesn't give us anything and, as
  I've already mentioned, one can trigger it really easily.
 
 It was there before and goes away in several commits.  I have to
 explain all that to remove it in this patch.  Why do that when the
 whole thing is just gonna disappear anyway?

I would consider it nicer but if you feel it is not worth it then don't
worry. It is not a big deal. The important part is that we do not ignore
the return value now.

Thanks
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 8/8] cgroup: make -pre_destroy() return void

2012-10-31 Thread Michal Hocko
On Wed 31-10-12 12:44:10, Tejun Heo wrote:
 All ->pre_destroy() implementations return 0 now, which is the only
 allowed return value.  Make it return void.
 
 Signed-off-by: Tejun Heo t...@kernel.org
 Cc: Michal Hocko mho...@suse.cz
 Cc: Balbir Singh bsinghar...@gmail.com
 Cc: KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com
 Cc: Vivek Goyal vgo...@redhat.com

I thought I already gave my r-b but anyway
Reviewed-by: Michal Hocko mho...@suse.cz

 ---
  block/blk-cgroup.c     | 3 +--
  include/linux/cgroup.h | 2 +-
  kernel/cgroup.c        | 2 +-
  mm/hugetlb_cgroup.c    | 4 +---
  mm/memcontrol.c        | 3 +--
  5 files changed, 5 insertions(+), 9 deletions(-)
 
 diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
 index f3b44a6..a7816f3 100644
 --- a/block/blk-cgroup.c
 +++ b/block/blk-cgroup.c
 @@ -600,7 +600,7 @@ struct cftype blkcg_files[] = {
   *
   * This is the blkcg counterpart of ioc_release_fn().
   */
 -static int blkcg_pre_destroy(struct cgroup *cgroup)
 +static void blkcg_pre_destroy(struct cgroup *cgroup)
  {
   struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
  
 @@ -622,7 +622,6 @@ static int blkcg_pre_destroy(struct cgroup *cgroup)
   }
  
 	spin_unlock_irq(&blkcg->lock);
 - return 0;
  }
  
  static void blkcg_destroy(struct cgroup *cgroup)
 diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
 index 47868a8..adb2adc 100644
 --- a/include/linux/cgroup.h
 +++ b/include/linux/cgroup.h
 @@ -436,7 +436,7 @@ int cgroup_taskset_size(struct cgroup_taskset *tset);
  
  struct cgroup_subsys {
   struct cgroup_subsys_state *(*create)(struct cgroup *cgrp);
 - int (*pre_destroy)(struct cgroup *cgrp);
 + void (*pre_destroy)(struct cgroup *cgrp);
   void (*destroy)(struct cgroup *cgrp);
   int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
   void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
 index c5f6fb2..83cd7d0 100644
 --- a/kernel/cgroup.c
 +++ b/kernel/cgroup.c
 @@ -4054,7 +4054,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	mutex_unlock(&cgroup_mutex);
 	for_each_subsys(cgrp->root, ss)
 		if (ss->pre_destroy)
 -			WARN_ON_ONCE(ss->pre_destroy(cgrp));
 +			ss->pre_destroy(cgrp);
 	mutex_lock(&cgroup_mutex);
  
   /*
 diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
 index dc595c6..0d3a1a3 100644
 --- a/mm/hugetlb_cgroup.c
 +++ b/mm/hugetlb_cgroup.c
 @@ -155,7 +155,7 @@ out:
   * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
   * the parent cgroup.
   */
 -static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
 +static void hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
  {
   struct hstate *h;
   struct page *page;
 @@ -172,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
   }
   cond_resched();
   } while (hugetlb_cgroup_have_usage(cgroup));
 -
 - return 0;
  }
  
  int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 6678f99..a1811ce 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -5002,12 +5002,11 @@ free_out:
   return ERR_PTR(error);
  }
  
 -static int mem_cgroup_pre_destroy(struct cgroup *cont)
 +static void mem_cgroup_pre_destroy(struct cgroup *cont)
  {
   struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
  
   mem_cgroup_reparent_charges(memcg);
 - return 0;
  }
  
  static void mem_cgroup_destroy(struct cgroup *cont)
 -- 
 1.7.11.7
 

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 4/8] cgroup: deactivate CSS's and mark cgroup dead before invoking -pre_destroy()

2012-10-31 Thread Michal Hocko
On Wed 31-10-12 12:44:06, Tejun Heo wrote:
[...]
 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
 index f22e3cd..66204a6 100644
 --- a/kernel/cgroup.c
 +++ b/kernel/cgroup.c
[...]
 @@ -4122,13 +4079,30 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	}
 	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
 
 -	/* block new css_tryget() by deactivating refcnt */
 +	/*
 +	 * Block new css_tryget() by deactivating refcnt and mark @cgrp
 +	 * removed.  This makes future css_tryget() and child creation
 +	 * attempts fail thus maintaining the removal conditions verified
 +	 * above.
 +	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 
 		WARN_ON(atomic_read(&css->refcnt) < 0);
 		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
 	}
 +	set_bit(CGRP_REMOVED, &cgrp->flags);
 +
 +	/*
 +	 * Tell subsystems to initiate destruction.  pre_destroy() should be
 +	 * called with cgroup_mutex unlocked.  See 3fa59dfbc3 (cgroup: fix
 +	 * potential deadlock in pre_destroy) for details.
 +	 */
 +	mutex_unlock(&cgroup_mutex);
 +	for_each_subsys(cgrp->root, ss)
 +		if (ss->pre_destroy)
 +			WARN_ON_ONCE(ss->pre_destroy(cgrp));

Do you think that BUG_ON would be too harsh?

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 4/8] cgroup: deactivate CSS's and mark cgroup dead before invoking -pre_destroy()

2012-11-01 Thread Michal Hocko
On Wed 31-10-12 14:27:25, Tejun Heo wrote:
 Hey, Michal.
 
 On Wed, Oct 31, 2012 at 10:23:59PM +0100, Michal Hocko wrote:
   + for_each_subsys(cgrp->root, ss)
   + 	if (ss->pre_destroy)
   + 		WARN_ON_ONCE(ss->pre_destroy(cgrp));
  
  Do you think that BUG_ON would be too harsh?
 
 Yeah, I do think so.  In general, I think the consensus now is to
 prefer WARN_ON[_ONCE]() over BUG_ON() whenever possible.  It's not
 like we can get more information from BUG_ON()s (more likely to get
 less reporting actually by taking down the machine).  Limping machines
 are better than dead ones and there just isn't much to gain here by
 killing it.

Fair enough

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 3/8] cgroup: use cgroup_lock_live_group(parent) in cgroup_create()

2012-11-01 Thread Michal Hocko
On Wed 31-10-12 10:04:31, Tejun Heo wrote:
 Hey, Michal.
 
 On Wed, Oct 31, 2012 at 04:55:14PM +0100, Michal Hocko wrote:
   + /*
   +  * Only live parents can have children.  Note that the liveliness
   +  * check isn't strictly necessary because cgroup_mkdir() and
   +  * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
   +  * anyway so that locking is contained inside cgroup proper and we
   +  * don't get nasty surprises if we ever grow another caller.
   +  */
   + if (!cgroup_lock_live_group(parent)) {
   + err = -ENODEV;
   + goto err_free;
   + }
   +
  
  I think this should be moved up before we try to allocate any memory.
  Or is your motivation to keep cgroup_lock held for shorter time?
  I could agree with that but a small comment would be helpful.
 
 Then I have to change the error out path more and I'm not sure I wanna
 call deactivate_super() under cgroup_mutex.  It's just simpler this
 way.

I am not sure I understand. What does deactivate_super have to do with
the above suggestion? cgroup_lock_live_group will take the cgroup_mutex
on success; on failure we just free the previously allocated (and unused)
memory. The only thing I was suggesting is to call cgroup_lock_live_group
first and allocate the cgroup only if it doesn't fail.

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 3/8] cgroup: use cgroup_lock_live_group(parent) in cgroup_create()

2012-11-01 Thread Michal Hocko
On Thu 01-11-12 07:52:24, Tejun Heo wrote:
 Hey, Michal.
 
 On Thu, Nov 1, 2012 at 2:16 AM, Michal Hocko mho...@suse.cz wrote:
  I am not sure I understand. What does deactivate_super have to do with
  the above suggestion? cgroup_lock_live_group will take the cgroup_mutex
  on success; otherwise we just free the previously allocated but unused
  memory. The only thing I was suggesting is to do cgroup_lock_live_group
  first and allocate the cgroup only if it doesn't fail.
 
 It complicates updates to the error exit path.

Still don't get it, sorry. What prevents us from doing:
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
			  umode_t mode)
{
	struct cgroup *cgrp;
	struct cgroupfs_root *root = parent->root;
	int err = 0;
	struct cgroup_subsys *ss;
	struct super_block *sb = root->sb;

	if (!cgroup_lock_live_group(parent))
		return -ENODEV;

	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
	if (!cgrp)
		return -ENOMEM;

 You end up having to update a lot more and it's not like grabbing lock
 first is substantially better in any way, so why bother?

Yes, the allocation can sleep if we are short on memory so this can
potentially take long, which is not entirely nice, but a pointless
allocation is not nice either. Anyway, I am asking because I am trying to
understand the motivation behind this, and your explanation about the
error exit path doesn't make much sense to me. So I am either missing
something or we are talking about two different things.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 3/8] cgroup: use cgroup_lock_live_group(parent) in cgroup_create()

2012-11-01 Thread Michal Hocko
On Thu 01-11-12 16:05:32, Michal Hocko wrote:
 On Thu 01-11-12 07:52:24, Tejun Heo wrote:
  Hey, Michal.
  
  On Thu, Nov 1, 2012 at 2:16 AM, Michal Hocko mho...@suse.cz wrote:
   I am not sure I understand. What does deactivate_super have to do with
   the above suggestion? cgroup_lock_live_group will take the cgroup_mutex
   on success; otherwise we just free the previously allocated but unused
   memory. The only thing I was suggesting is to do cgroup_lock_live_group
   first and allocate the cgroup only if it doesn't fail.
  
  It complicates updates to the error exit path.
 
 Still don't get it, sorry. What prevents us from doing:
 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			  umode_t mode)
 {
 	struct cgroup *cgrp;
 	struct cgroupfs_root *root = parent->root;
 	int err = 0;
 	struct cgroup_subsys *ss;
 	struct super_block *sb = root->sb;
 
 	if (!cgroup_lock_live_group(parent))
 		return -ENODEV;
 
 	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
 	if (!cgrp)
 		return -ENOMEM;

this needs to drop the lock of course but it doesn't make it much more
complicated...
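
Something like this is all I mean (a sketch; only the unlock placement is
the point):

	if (!cgroup_lock_live_group(parent))
		return -ENODEV;

	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
	if (!cgrp) {
		/* the single extra step: drop the mutex before failing */
		mutex_unlock(&cgroup_mutex);
		return -ENOMEM;
	}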
 
  You end up having to update a lot more and it's not like grabbing lock
  first is substantially better in any way, so why bother?
 
 Yes, the allocation can sleep if we are short on memory so this can
 potentially take long, which is not entirely nice, but a pointless
 allocation is not nice either. Anyway, I am asking because I am trying to
 understand the motivation behind this, and your explanation about the
 error exit path doesn't make much sense to me. So I am either missing
 something or we are talking about two different things.

-- 
Michal Hocko
SUSE Labs


Re: [PATCH] memcg: fix hotplugged memory zone oops

2012-11-02 Thread Michal Hocko
On Thu 01-11-12 18:28:02, Hugh Dickins wrote:
 When MEMCG is configured on (even when it's disabled by boot option),
 when adding or removing a page to/from its lru list, the zone pointer
 used for stats updates is nowadays taken from the struct lruvec.
 (On many configurations, calculating zone from page is slower.)
 
 But we have no code to update all the lruvecs (per zone, per memcg)
 when a memory node is hotadded.  Here's an extract from the oops which
 results when running numactl to bind a program to a newly onlined node:
 
 BUG: unable to handle kernel NULL pointer dereference at 0f60
 IP: [811870b9] __mod_zone_page_state+0x9/0x60
 PGD 0 
 Oops:  [#1] SMP 
 CPU 2 
 Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs
 Process numactl (pid: 1219, threadinfo 880039abc000, task 
 8800383c4ce0)
 Stack:
  880039abdaf8 8117390f 880039abdaf8 8167c601
  81174162 88003a480f00 0001 8800395e
  88003dbd0e80 0282 880039abdb48 81174181
 Call Trace:
  [8117390f] __pagevec_lru_add_fn+0xdf/0x140
  [81174181] pagevec_lru_move_fn+0xb1/0x100
  [811741ec] __pagevec_lru_add+0x1c/0x30
  [81174383] lru_add_drain_cpu+0xa3/0x130
  [8117443f] lru_add_drain+0x2f/0x40
  ...
 
 The natural solution might be to use a memcg callback whenever memory
 is hotadded; but that solution has not been scoped out, and it happens
 that we do have an easy location at which to update lruvec-zone.  The
 lruvec pointer is discovered either by mem_cgroup_zone_lruvec() or by
 mem_cgroup_page_lruvec(), and both of those do know the right zone.
 
 So check and set lruvec-zone in those; and remove the inadequate
 attempt to set lruvec-zone from lruvec_init(), which is called
 before NODE_DATA(node) has been allocated in such cases.
 
 Ah, there was one exception.  For no particularly good reason,
 mem_cgroup_force_empty_list() has its own code for deciding lruvec.
 Change it to use the standard mem_cgroup_zone_lruvec() and
 mem_cgroup_get_lru_size() too.  In fact it was already safe against
 such an oops (the lru lists in danger could only be empty),
 but we're better proofed against future changes this way.
 
 Reported-by: Tang Chen tangc...@cn.fujitsu.com
 Signed-off-by: Hugh Dickins hu...@google.com
 Acked-by: Johannes Weiner han...@cmpxchg.org
 Acked-by: KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com
 Cc: Konstantin Khlebnikov khlebni...@openvz.org
 Cc: Wen Congyang we...@cn.fujitsu.com
 Cc: sta...@vger.kernel.org

OK, it adds an overhead also when there is no hotadd going on, but this
is just one additional mem access/cmp/je so it shouldn't be noticeable
(lruvec->zone is used most of the time later so it is not a pointless
load).
It is also easier to backport for stable. But is there any reason to fix
it properly in the hotadd hook later?

Anyway
Acked-by: Michal Hocko mho...@suse.cz

Thanks!
 ---
 I've marked this for stable (3.6) since we introduced the problem
 in 3.5 (now closed to stable); but I have no idea if this is the
 only fix needed to get memory hotadd working with memcg in 3.6,
 and received no answer when I enquired twice before.
 
  include/linux/mmzone.h |2 -
  mm/memcontrol.c|   46 +--
  mm/mmzone.c|6 -
  mm/page_alloc.c|2 -
  4 files changed, 38 insertions(+), 18 deletions(-)
 
 --- 3.7-rc3/include/linux/mmzone.h	2012-10-14 16:16:57.665308933 -0700
 +++ linux/include/linux/mmzone.h	2012-11-01 14:31:04.284185741 -0700
 @@ -752,7 +752,7 @@ extern int init_currently_empty_zone(str
 				     unsigned long size,
 				     enum memmap_context context);
  
 -extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
 +extern void lruvec_init(struct lruvec *lruvec);
  
  static inline struct zone *lruvec_zone(struct lruvec *lruvec)
  {
 --- 3.7-rc3/mm/memcontrol.c	2012-10-14 16:16:58.341309118 -0700
 +++ linux/mm/memcontrol.c	2012-11-01 14:31:04.284185741 -0700
 @@ -1055,12 +1055,24 @@ struct lruvec *mem_cgroup_zone_lruvec(st
 				      struct mem_cgroup *memcg)
  {
  	struct mem_cgroup_per_zone *mz;
 +	struct lruvec *lruvec;
  
 -	if (mem_cgroup_disabled())
 -		return &zone->lruvec;
 +	if (mem_cgroup_disabled()) {
 +		lruvec = &zone->lruvec;
 +		goto out;
 +	}
  
  	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
 -	return &mz->lruvec;
 +	lruvec = &mz->lruvec;
 +out:
 +	/*
 +	 * Since a node can be onlined after the mem_cgroup was created,
 +	 * we have to be prepared to initialize lruvec->zone here;
 +	 * and if offlined then reonlined, we need to reinitialize it.
 +	 */
 +	if (unlikely(lruvec->zone != zone))
 +		lruvec->zone = zone;
 +	return lruvec;
  }
  
  /*
 @@ -1087,9 +1099,12

Re: [PATCH] memcg: fix hotplugged memory zone oops

2012-11-02 Thread Michal Hocko
On Fri 02-11-12 11:21:59, Michal Hocko wrote:
 On Thu 01-11-12 18:28:02, Hugh Dickins wrote:
[...]

And I forgot to mention that the following hunk will clash with
memcg: Simplify mem_cgroup_force_empty_list error handling which is in
linux-next already (via Tejun's tree). 
Would it be easier to split the patch into the real fix and the hunk
below? That one doesn't have to go into stable anyway and we would save
some merging conflicts. The updated fix on top of the -mm tree is below
for your convenience.

   /**
  @@ -3688,17 +3712,17 @@ unsigned long mem_cgroup_soft_limit_recl
   static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
  				int node, int zid, enum lru_list lru)
   {
  -	struct mem_cgroup_per_zone *mz;
  +	struct lruvec *lruvec;
  	unsigned long flags, loop;
  	struct list_head *list;
  	struct page *busy;
  	struct zone *zone;
   
  	zone = &NODE_DATA(node)->node_zones[zid];
  -	mz = mem_cgroup_zoneinfo(memcg, node, zid);
  -	list = &mz->lruvec.lists[lru];
  +	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
  +	list = &lruvec->lists[lru];
   
  -	loop = mz->lru_size[lru];
  +	loop = mem_cgroup_get_lru_size(lruvec, lru);
  	/* give some margin against EBUSY etc...*/
  	loop += 256;
  	busy = NULL;

---
From 0e55c305a050502a4b2f5167efed58bb6585520b Mon Sep 17 00:00:00 2001
From: Hugh Dickins hu...@google.com
Date: Fri, 2 Nov 2012 11:47:48 +0100
Subject: [PATCH] memcg: use mem_cgroup_zone_lruvec in
 mem_cgroup_force_empty_list

For no particularly good reason, mem_cgroup_force_empty_list() has
its own code for deciding lruvec. Change it to use the standard
mem_cgroup_zone_lruvec() and mem_cgroup_get_lru_size() too.
In fact it was already safe against oops after memory hotadd (see
memcg: fix hotplugged memory zone oops for more details) when lruvec
is not initialized yet (the lru lists in danger could only be empty),
but we're better proofed against future changes this way.

Reported-by: Tang Chen tangc...@cn.fujitsu.com
Signed-off-by: Hugh Dickins hu...@google.com
Acked-by: Johannes Weiner han...@cmpxchg.org
Acked-by: KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com
Acked-by: Michal Hocko mho...@suse.cz
Cc: Konstantin Khlebnikov khlebni...@openvz.org
Cc: Wen Congyang we...@cn.fujitsu.com
---
 mm/memcontrol.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6c72d65..32f0ecf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3710,15 +3710,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 				int node, int zid, enum lru_list lru)
 {
-	struct mem_cgroup_per_zone *mz;
+	struct lruvec *lruvec;
 	unsigned long flags;
 	struct list_head *list;
 	struct page *busy;
 	struct zone *zone;
 
 	zone = &NODE_DATA(node)->node_zones[zid];
-	mz = mem_cgroup_zoneinfo(memcg, node, zid);
-	list = &mz->lruvec.lists[lru];
+	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+	list = &lruvec->lists[lru];
 
 	busy = NULL;
 	do {
-- 
1.7.10.4

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v6 23/29] memcg: destroy memcg caches

2012-11-02 Thread Michal Hocko
On Fri 02-11-12 11:46:42, Glauber Costa wrote:
 On 11/02/2012 04:05 AM, Andrew Morton wrote:
  On Thu,  1 Nov 2012 16:07:39 +0400
  Glauber Costa glom...@parallels.com wrote:
  
  This patch implements destruction of memcg caches. Right now,
  only caches where our reference counter is the last remaining are
  deleted. If there are any other reference counters around, we just
  leave the caches lying around until they go away.
 
  When that happens, a destruction function is called from the cache
  code. Caches are only destroyed in process context, so we queue them
  up for later processing in the general case.
 
 
  ...
 
  @@ -5950,6 +6012,7 @@ static int mem_cgroup_pre_destroy(struct cgroup 
  *cont)
   {
 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
   
  +  mem_cgroup_destroy_all_caches(memcg);
 return mem_cgroup_force_empty(memcg, false);
   }
   
  
  Conflicts with linux-next cgroup changes.  Looks pretty simple:
  
  
  static int mem_cgroup_pre_destroy(struct cgroup *cont)
  {
  struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
  int ret;
  
  css_get(&memcg->css);
  ret = mem_cgroup_reparent_charges(memcg);
  mem_cgroup_destroy_all_caches(memcg);
  css_put(&memcg->css);
  
  return ret;
  }
  
 
 There is one significant difference between the code I had and the code
 after your fix up.
 
 In my patch, caches were destroyed before the call to
 mem_cgroup_force_empty. In the final version, they are destroyed after it.
 
 I am here thinking, but I am not sure if this has any significant
 impact... If we run mem_cgroup_destroy_all_caches() before reparenting,
 we'll have shrunk a lot of the pending caches, and we will have less
 pages to reparent. But we only reparent pages in the lru anyway, and
 then expect kmem and remaining umem to match. So *in theory* it should
 be fine.
 
 Where can I grab your final tree so I can test it and make sure it is
 all good ?

Everything is in the -mm git tree (I tend to take mmots trees if they
compile).

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 04/13] kmem accounting basic infrastructure

2012-09-26 Thread Michal Hocko
On Tue 18-09-12 18:04:01, Glauber Costa wrote:
 This patch adds the basic infrastructure for the accounting of the slab
 caches. To control that, the following files are created:
 
  * memory.kmem.usage_in_bytes
  * memory.kmem.limit_in_bytes
  * memory.kmem.failcnt
  * memory.kmem.max_usage_in_bytes
 
  They have the same meaning as their user memory counterparts. They
  reflect the state of the kmem res_counter.

 The code is not enabled until a limit is set. 

Per cgroup slab memory accounting is not enabled until a limit is set
for the group. Once the limit is set the accounting cannot be disabled
for such a group.

Better?

 This can be tested by the flag kmem_accounted.

Sounds as if it could be done from userspace (because you were talking
about a user interface), which it cannot, and we do not see it in this
patch because it is not used anywhere. So please be more specific.

 This means that after the patch is applied, no behavioral changes
 exists for whoever is still using memcg to control their memory usage.
 
 We always account to both user and kernel resource_counters. 

This is in contradiction with your claim that there is no behavioral
change for memcg users. Please clarify when we use u and when u+k
accounting.

There is no behavioral change for memcg users if the kmem accounting is
turned off, but when kmem.limit_in_bytes is set then
memory.usage_in_bytes will include both user and kmem memory.
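
Roughly, once accounting is enabled every kmem charge hits both
res_counters, which is why usage becomes u+k. A simplified sketch of the
charging step in this series (hypothetical helper name, error semantics
condensed):

static int kmem_charge_sketch(struct mem_cgroup *memcg, unsigned long size)
{
	struct res_counter *fail_res;
	int ret;

	ret = res_counter_charge(&memcg->kmem, size, &fail_res);  /* k */
	if (ret)
		return ret;
	/* also charged to the user counter, hence u+k in usage_in_bytes */
	ret = res_counter_charge(&memcg->res, size, &fail_res);
	if (ret)
		res_counter_uncharge(&memcg->kmem, size);
	return ret;
}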


 This
 effectively means that an independent kernel limit is in place when the
 limit is set to a lower value than the user memory. A equal or higher
 value means that the user limit will always hit first, meaning that kmem
 is effectively unlimited.
 
 People who want to track kernel memory but not limit it can set this
 limit to a very high number (like RESOURCE_MAX - 1 page - that no one
 will ever hit, or equal to the user memory)
 
 Signed-off-by: Glauber Costa glom...@parallels.com
 CC: Michal Hocko mho...@suse.cz
 CC: Johannes Weiner han...@cmpxchg.org
 Acked-by: Kamezawa Hiroyuki kamezawa.hir...@jp.fujitsu.com
 ---
  mm/memcontrol.c | 64 
 -
  1 file changed, 63 insertions(+), 1 deletion(-)
 
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index d6ad138..f3fd354 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -265,6 +265,10 @@ struct mem_cgroup {
   };
  
   /*
 +  * the counter to account for kernel memory usage.
 +  */
 + struct res_counter kmem;
 + /*
* Per cgroup active and inactive list, similar to the
* per zone LRU lists.
*/
 @@ -279,6 +283,7 @@ struct mem_cgroup {
* Should the accounting and control be hierarchical, per subtree?
*/
   bool use_hierarchy;
 + bool kmem_accounted;
  
   booloom_lock;
   atomic_tunder_oom;
 @@ -389,6 +394,7 @@ enum res_type {
   _MEM,
   _MEMSWAP,
   _OOM_TYPE,
 + _KMEM,
  };
  
 #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
 @@ -1439,6 +1445,10 @@ done:
 	res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
 	res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
 	res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
 +	printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
 +		res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
 +		res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
 +		res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
  }
 
  /*
 @@ -3946,6 +3956,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
 	else
 		val = res_counter_read_u64(&memcg->memsw, name);
 	break;
 +	case _KMEM:
 +		val = res_counter_read_u64(&memcg->kmem, name);
 +		break;
 	default:
 		BUG();
 	}
 @@ -3984,8 +3997,18 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 		break;
 	if (type == _MEM)
 		ret = mem_cgroup_resize_limit(memcg, val);
 -	else
 +	else if (type == _MEMSWAP)
 		ret = mem_cgroup_resize_memsw_limit(memcg, val);
 +	else if (type == _KMEM) {
 +		ret = res_counter_set_limit(&memcg->kmem, val);
 +		if (ret)
 +			break;
 +
 +		/* For simplicity, we won't allow this to be disabled */
 +		if (!memcg->kmem_accounted && val != RESOURCE_MAX)
 +			memcg->kmem_accounted = true;
 +	} else
 +		return -EINVAL;
 	break;
 	case RES_SOFT_LIMIT:
 		ret = res_counter_memparse_write_strategy(buffer, &val);
 @@ -4051,12 +4074,16 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 	case RES_MAX_USAGE:
 		if (type == _MEM

Re: [PATCH v3 06/13] memcg: kmem controller infrastructure

2012-09-26 Thread Michal Hocko
On Tue 18-09-12 18:04:03, Glauber Costa wrote:
 This patch introduces infrastructure for tracking kernel memory pages to
 a given memcg. This will happen whenever the caller includes the
 __GFP_KMEMCG flag, and the task belongs to a memcg other than the root.
 
 In memcontrol.h those functions are wrapped in inline accessors.  The
 idea is to later on patch those with static branches, so we don't incur
 any overhead when no mem cgroups with limited kmem are being used.

Could you describe the API a bit here, please? I guess the
kernel user is supposed to call memcg_kmem_newpage_charge and
memcg_kmem_commit_charge resp. memcg_kmem_uncharge_page.
All other kmem functions here are just helpers, right?
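
In other words, I read the intended call sequence on the allocator side
like this (a sketch; alloc_something_pages is a placeholder, not a real
function):

	struct mem_cgroup *memcg = NULL;
	struct page *page;

	/* may refuse up front if the memcg is over its kmem limit */
	if (!memcg_kmem_newpage_charge(gfp, &memcg, order))
		return NULL;

	page = alloc_something_pages(gfp, order);	/* placeholder */

	/* bind the charge to the page, or revert it when page == NULL */
	memcg_kmem_commit_charge(page, memcg, order);

	/* ... and later, on the free path: */
	memcg_kmem_uncharge_page(page, order);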
 
 [ v2: improved comments and standardized function names ]
 [ v3: handle no longer opaque, functions not exported,
   even more comments ]
 [ v4: reworked Used bit handling and surroundings for more clarity ]
 
 Signed-off-by: Glauber Costa glom...@parallels.com
 CC: Christoph Lameter c...@linux.com
 CC: Pekka Enberg penb...@cs.helsinki.fi
 CC: Michal Hocko mho...@suse.cz
 CC: Kamezawa Hiroyuki kamezawa.hir...@jp.fujitsu.com
 CC: Johannes Weiner han...@cmpxchg.org
 ---
  include/linux/memcontrol.h |  97 +
  mm/memcontrol.c| 177 
 +
  2 files changed, 274 insertions(+)
 
 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
 index 8d9489f..82ede9a 100644
 --- a/include/linux/memcontrol.h
 +++ b/include/linux/memcontrol.h
 @@ -21,6 +21,7 @@
  #define _LINUX_MEMCONTROL_H
  #include <linux/cgroup.h>
  #include <linux/vm_event_item.h>
 +#include <linux/hardirq.h>

  struct mem_cgroup;
  struct page_cgroup;
 @@ -399,6 +400,17 @@ struct sock;
  #ifdef CONFIG_MEMCG_KMEM
  void sock_update_memcg(struct sock *sk);
  void sock_release_memcg(struct sock *sk);
 +
 +static inline bool memcg_kmem_enabled(void)
 +{
 + return true;
 +}
 +
 +extern bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg,
 + int order);
 +extern void __memcg_kmem_commit_charge(struct page *page,
 +struct mem_cgroup *memcg, int order);
 +extern void __memcg_kmem_uncharge_page(struct page *page, int order);
  #else
  static inline void sock_update_memcg(struct sock *sk)
  {
 @@ -406,6 +418,91 @@ static inline void sock_update_memcg(struct sock *sk)
  static inline void sock_release_memcg(struct sock *sk)
  {
  }
 +
 +static inline bool memcg_kmem_enabled(void)
 +{
 + return false;
 +}
 +
 +static inline bool
 +__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
 +{
 + return false;
 +}
 +
 +static inline void  __memcg_kmem_uncharge_page(struct page *page, int order)
 +{
 +}
 +
 +static inline void
 +__memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int 
 order)
 +{
 +}

I think we shouldn't care about these for !MEMCG_KMEM. It should be
sufficient to define the main three functions below as return true
resp. NOOP. This would reduce the code churn a bit and also make it
more maintainable.
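
I.e. something along these lines for the !CONFIG_MEMCG_KMEM branch
(a sketch of the suggested simplification, not the patch itself):

#else
static inline bool
memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
{
	return true;	/* nothing is accounted, allocation always allowed */
}

static inline void
memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
{
}

static inline void memcg_kmem_uncharge_page(struct page *page, int order)
{
}
#endif /* CONFIG_MEMCG_KMEM */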

  #endif /* CONFIG_MEMCG_KMEM */
 +
 +/**
 + * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
 + * @gfp: the gfp allocation flags.
 + * @memcg: a pointer to the memcg this was charged against.
 + * @order: allocation order.
 + *
 + * returns true if the memcg where the current task belongs can hold this
 + * allocation.
 + *
 + * We return true automatically if this allocation is not to be accounted to
 + * any memcg.
 + */
 +static __always_inline bool
 +memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
 +{
 + if (!memcg_kmem_enabled())
 + return true;
 +
 + /*
 +  * __GFP_NOFAIL allocations will move on even if charging is not
 +  * possible. Therefore we don't even try, and have this allocation
 +  * unaccounted. We could in theory charge it with
 +  * res_counter_charge_nofail, but we hope those allocations are rare,
 +  * and won't be worth the trouble.
 +  */
 +	if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL))
 +		return true;
 +	if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
 +		return true;
 +	return __memcg_kmem_newpage_charge(gfp, memcg, order);
 +}
 +
 +/**
 + * memcg_kmem_uncharge_page: uncharge pages from memcg
 + * @page: pointer to struct page being freed
 + * @order: allocation order.
 + *
 + * there is no need to specify memcg here, since it is embedded in 
 page_cgroup
 + */
 +static __always_inline void
 +memcg_kmem_uncharge_page(struct page *page, int order)
 +{
 + if (memcg_kmem_enabled())
 + __memcg_kmem_uncharge_page(page, order);
 +}
 +
 +/**
 + * memcg_kmem_commit_charge: embeds correct memcg in a page
 + * @memcg: a pointer to the memcg this was charged against.
  ^^^
remove this one?

 + * @page: pointer to struct page recently

Re: [PATCH v3 04/13] kmem accounting basic infrastructure

2012-09-26 Thread Michal Hocko
On Wed 26-09-12 18:33:10, Glauber Costa wrote:
 On 09/26/2012 06:03 PM, Michal Hocko wrote:
  On Tue 18-09-12 18:04:01, Glauber Costa wrote:
[...]
  @@ -4961,6 +5015,12 @@ mem_cgroup_create(struct cgroup *cont)
 int cpu;
 enable_swap_cgroup();
 parent = NULL;
  +
  +#ifdef CONFIG_MEMCG_KMEM
  +	WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys,
  +				   kmem_cgroup_files));
  +#endif
  +
  	if (mem_cgroup_soft_limit_tree_init())
  		goto free_out;
  	root_mem_cgroup = memcg;
  @@ -4979,6 +5039,7 @@ mem_cgroup_create(struct cgroup *cont)
  	if (parent && parent->use_hierarchy) {
  		res_counter_init(&memcg->res, &parent->res);
  		res_counter_init(&memcg->memsw, &parent->memsw);
  +		res_counter_init(&memcg->kmem, &parent->kmem);
  
  Haven't we already discussed that a new memcg should inherit kmem_accounted
  from its parent for use_hierarchy?
  Say we have
  root
  |
  A (kmem_accounted = 1, use_hierarchy = 1)
   \
B (kmem_accounted = 0)
 \
  C (kmem_accounted = 1)
  
  B finds itself in an awkward situation because it doesn't want to
  account u+k but it ends up doing so because of C.
  
 
 Ok, I haven't updated it here. But that should be taken care of in the
 lifecycle patch.

I am not sure which patch you are thinking about but I would prefer to
have it here because it is safe wrt. races and it is more obvious as
well.
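
Concretely, a one-liner next to the res_counter initialization would do
(a sketch, untested):

	if (parent && parent->use_hierarchy) {
		res_counter_init(&memcg->res, &parent->res);
		res_counter_init(&memcg->memsw, &parent->memsw);
		res_counter_init(&memcg->kmem, &parent->kmem);
		/* inherit the accounting mode so B above cannot escape u+k */
		memcg->kmem_accounted = parent->kmem_accounted;
	}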

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 04/13] kmem accounting basic infrastructure

2012-09-27 Thread Michal Hocko
On Thu 27-09-12 01:24:40, Glauber Costa wrote:
[...]
 About use cases, I've already responded: my containers use case is kmem
 limited. There are people like Michal that specifically asked for
 user-only semantics to be preserved. 

Yes, because we have many users (basically almost all) who care only
about the user memory because that's what occupies the vast majority of
the memory. They usually want to isolate workload which would disrupt
the global memory otherwise (e.g. backup process vs. database). You
really do not want to pay an additional overhead for kmem accounting
here.

 So your question for global vs local switch (that again, doesn't
 exist; only a local *limit* exists) should really be posed in the
 following way:  Can two different use cases with different needs be
 hosted in the same box?

I think this is a good and a relevant question. I think this boils down
to whether you want to have trusted and untrusted workloads at the same
machine.
Trusted loads usually only need user memory accounting because kmem
consumption should be really negligible (unless kernel is doing
something really stupid and no kmem limit will help here). 
On the other hand, untrusted workloads can do nasty things that the
administrator has a hard time mitigating, and setting a kmem limit can
help significantly.

IMHO such different loads exist on a single machine quite often (a web
server and a backup process as the most simplistic example). The per
hierarchy accounting, therefore, sounds like a good idea without too
much added complexity (actually the only added complexity is in the
proper kmem.limit_in_bytes handling which is a single place).

So I would rather go with per-hierarchy thing.

  Michal, Johannes, Kamezawa, what are your thoughts?
 
 waiting! =)

Well, you guys generated a lot of discussion that one has to read
through, didn't you :P

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 04/13] kmem accounting basic infrastructure

2012-09-27 Thread Michal Hocko
On Wed 26-09-12 16:33:34, Tejun Heo wrote:
[...]
   So, this seems properly crazy to me at the similar level of
   use_hierarchy fiasco.  I'm gonna NACK on this.
  
  As I said: all use cases I particularly care about are covered by a
  global switch.
  
  I am laying down my views because I really believe they make more sense.
  But at some point, of course, I'll shut up if I believe I am a lone voice.
  
  I believe it should still be good to hear from mhocko and kame, but from
  your point of view, would all the rest, plus the introduction of a
  global switch make it acceptable to you?
 
 The only thing I'm whining about is per-node switch + silently
 ignoring past accounting, so if those two are solved, I think I'm
 pretty happy with the rest.

I think that the per-group switch is not nice either, but if we make it
hierarchy specific (which I have been proposing for quite some time) and
do not allow enabling accounting for a group with tasks, then we get both
flexibility and reasonable semantics. A global switch sounds too coarse
to me and it is really not necessary.

Would this work with you?
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 04/13] kmem accounting basic infrastructure

2012-09-27 Thread Michal Hocko
On Thu 27-09-12 16:20:55, Glauber Costa wrote:
 On 09/27/2012 04:15 PM, Michal Hocko wrote:
  On Wed 26-09-12 16:33:34, Tejun Heo wrote:
  [...]
  So, this seems properly crazy to me at the similar level of
  use_hierarchy fiasco.  I'm gonna NACK on this.
 
  As I said: all use cases I particularly care about are covered by a
  global switch.
 
  I am laying down my views because I really believe they make more sense.
  But at some point, of course, I'll shut up if I believe I am a lone voice.
 
  I believe it should still be good to hear from mhocko and kame, but from
  your point of view, would all the rest, plus the introduction of a
  global switch make it acceptable to you?
 
  The only thing I'm whining about is per-node switch + silently
  ignoring past accounting, so if those two are solved, I think I'm
  pretty happy with the rest.
  
   I think that the per-group switch is not nice either, but if we make it
   hierarchy specific (which I have been proposing for quite some time) and
   do not allow enabling accounting for a group with tasks, then we get both
   flexibility and reasonable semantics. A global switch sounds too coarse
   to me and it is really not necessary.
  
  Would this work with you?
  
 
 How exactly would that work? AFAIK, we have a single memcg root, we
 can't have multiple memcg hierarchies in a system. Am I missing something?

Well root is so different that we could consider the first level as the
real roots for hierarchies.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 04/13] kmem accounting basic infrastructure

2012-09-27 Thread Michal Hocko
On Thu 27-09-12 16:40:03, Glauber Costa wrote:
 On 09/27/2012 04:40 PM, Michal Hocko wrote:
  On Thu 27-09-12 16:20:55, Glauber Costa wrote:
  On 09/27/2012 04:15 PM, Michal Hocko wrote:
  On Wed 26-09-12 16:33:34, Tejun Heo wrote:
  [...]
  So, this seems properly crazy to me at the similar level of
  use_hierarchy fiasco.  I'm gonna NACK on this.
 
  As I said: all use cases I particularly care about are covered by a
  global switch.
 
  I am laying down my views because I really believe they make more sense.
  But at some point, of course, I'll shut up if I believe I am a lone 
  voice.
 
  I believe it should still be good to hear from mhocko and kame, but from
  your point of view, would all the rest, plus the introduction of a
  global switch make it acceptable to you?
 
  The only thing I'm whining about is per-node switch + silently
  ignoring past accounting, so if those two are solved, I think I'm
  pretty happy with the rest.
 
   I think that the per-group switch is not nice either, but if we make it
   hierarchy specific (which I have been proposing for quite some time) and
   do not allow enabling accounting for a group with tasks, then we get both
   flexibility and reasonable semantics. A global switch sounds too coarse
   to me and it is really not necessary.
 
  Would this work with you?
 
 
  How exactly would that work? AFAIK, we have a single memcg root, we
  can't have multiple memcg hierarchies in a system. Am I missing something?
  
  Well root is so different that we could consider the first level as the
  real roots for hierarchies.
  
 So let's favor clarity: What you are proposing is that the first level
 can have a switch for that, and the first level only. Is that right?

I do not want any more switches. I am fine with your "set the limit and
start accounting" approach and then inheriting the _internal_ flag down
the hierarchy.
If you are in a child and want to set the limit then you can do that
only if your parent is accounted already (so that you can have your own
limit). We will need the same thing for oom_control and swappiness.
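
To spell out the limit-setter side of that (a sketch; the helper name is
illustrative, not from the series):

static int kmem_set_limit_sketch(struct mem_cgroup *memcg, u64 val)
{
	struct mem_cgroup *parent = parent_mem_cgroup(memcg);

	/* a child may get its own limit only under an accounted parent */
	if (parent && parent->use_hierarchy && !parent->kmem_accounted)
		return -EINVAL;

	memcg->kmem_accounted = true;	/* one way; no disabling later */
	return res_counter_set_limit(&memcg->kmem, val);
}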
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 06/13] memcg: kmem controller infrastructure

2012-09-27 Thread Michal Hocko
On Thu 27-09-12 15:31:57, Glauber Costa wrote:
 On 09/26/2012 07:51 PM, Michal Hocko wrote:
  On Tue 18-09-12 18:04:03, Glauber Costa wrote:
[...]
  +	*_memcg = NULL;
  +	rcu_read_lock();
  +	p = rcu_dereference(current->mm->owner);
  +	memcg = mem_cgroup_from_task(p);
  
  mem_cgroup_from_task says it can return NULL. Do we care here? If not
  then please put VM_BUG_ON(!memcg) here.
  
  +  rcu_read_unlock();
  +
  +  if (!memcg_can_account_kmem(memcg))
  +  return true;
  +
  +  mem_cgroup_get(memcg);
  
  I am confused. Why do we take a reference to memcg rather than css_get
  here? Ahh it is because we keep the reference while the page is
  allocated, right? Comment please.
 ok.
 
  
  I am still not sure whether we need css_get here as well. How do you
  know that the current is not moved in parallel and it is a last task in
  a group which then can go away?
 
 the reference count acquired by mem_cgroup_get will still prevent the
 memcg from going away, no?

Yes, but you are outside of the RCU section now and we usually do css_get
before we rcu_unlock. mem_cgroup_get just makes sure the group doesn't
get deallocated but it could be gone before you call it. Or I am just
confused - these two levels of ref counting are really not nice.

Anyway, I have just noticed that __mem_cgroup_try_charge does
VM_BUG_ON(css_is_removed(&memcg->css)) on a given memcg so you should
keep the css ref count up as well.
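
To be explicit about the ordering I mean (a sketch):

	rcu_read_lock();
	p = rcu_dereference(current->mm->owner);
	memcg = mem_cgroup_from_task(p);
	/* pin the css while still inside the RCU section ... */
	css_get(&memcg->css);
	/* ... and keep the longer-lived reference for the page's lifetime */
	mem_cgroup_get(memcg);
	rcu_read_unlock();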

  +	/* The page allocation failed. Revert */
  +	if (!page) {
  +		memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
  +		return;
  +	}
  +
  +	pc = lookup_page_cgroup(page);
  +	lock_page_cgroup(pc);
  +	pc->mem_cgroup = memcg;
  +	SetPageCgroupUsed(pc);
  +	unlock_page_cgroup(pc);
  +}
  +
  +void __memcg_kmem_uncharge_page(struct page *page, int order)
  +{
  +	struct mem_cgroup *memcg = NULL;
  +	struct page_cgroup *pc;
  +
  +
  +	pc = lookup_page_cgroup(page);
  +	/*
  +	 * Fast unlocked return. Theoretically might have changed, have to
  +	 * check again after locking.
  +	 */
  +	if (!PageCgroupUsed(pc))
  +		return;
  +
  +	lock_page_cgroup(pc);
  +	if (PageCgroupUsed(pc)) {
  +		memcg = pc->mem_cgroup;
  +		ClearPageCgroupUsed(pc);
  +	}
  +	unlock_page_cgroup(pc);
  +
  +  /*
  +   * Checking if kmem accounted is enabled won't work for uncharge, since
  +   * it is possible that the user enabled kmem tracking, allocated, and
  +   * then disabled it again.
  
  disabling cannot happen, right?
  
 not anymore, right. I can update the comment, 

yes, it is confusing

 but I still believe it is a lot saner to trust information in
 page_cgroup.

I have no objections against that. PageCgroupUsed test and using
pc-mem_cgroup is fine.

  +#ifdef CONFIG_MEMCG_KMEM
  +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
  +{
  +	struct res_counter *fail_res;
  +	struct mem_cgroup *_memcg;
  +	int ret;
  +	bool may_oom;
  +	bool nofail = false;
  +
  +	may_oom = (gfp & __GFP_WAIT) && (gfp & __GFP_FS) &&
  +		  !(gfp & __GFP_NORETRY);
  
  A comment please? Why is __GFP_IO not considered, for example?
  
  
 
 Actually, I believe testing for GFP_WAIT and !GFP_NORETRY would be enough.
 
 The rationale here is, of course, under which circumstance would it be
 valid to call the oom killer? Which is, if the allocation can wait, and
 can retry.

Yes, __GFP_WAIT is clear because a memcg OOM can wait for an arbitrary
amount of time (wait for userspace action on oom_control). __GFP_NORETRY
couldn't get to the OOM before because OOM was excluded explicitly for THP
and migration didn't go through the charging path to reach the OOM.
But I do agree that __GFP_NORETRY allocations shouldn't cause the OOM
because we should rather fail the allocation from the kernel than
shoot something.
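
So the check would boil down to something like:

	/* OOM only for allocations which may wait and which will retry */
	may_oom = (gfp & __GFP_WAIT) && !(gfp & __GFP_NORETRY);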

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 07/13] mm: Allocate kernel pages to the right memcg

2012-09-27 Thread Michal Hocko
On Tue 18-09-12 18:04:04, Glauber Costa wrote:
 When a process tries to allocate a page with the __GFP_KMEMCG flag, the
 page allocator will call the corresponding memcg functions to validate
 the allocation. Tasks in the root memcg can always proceed.
 
 To avoid adding markers to the page - and a kmem flag that would
 necessarily follow, as much as doing page_cgroup lookups for no reason,
 whoever is marking its allocations with __GFP_KMEMCG flag is responsible
 for telling the page allocator that this is such an allocation at
 free_pages() time. This is done by the invocation of
 __free_accounted_pages() and free_accounted_pages().
 
 [ v2: inverted test order to avoid a memcg_get leak,
   free_accounted_pages simplification ]
 
 Signed-off-by: Glauber Costa glom...@parallels.com
 CC: Christoph Lameter c...@linux.com
 CC: Pekka Enberg penb...@cs.helsinki.fi
 CC: Michal Hocko mho...@suse.cz
 CC: Johannes Weiner han...@cmpxchg.org
 CC: Suleiman Souhlal sulei...@google.com
 CC: Mel Gorman mgor...@suse.de
 Acked-by: Kamezawa Hiroyuki kamezawa.hir...@jp.fujitsu.com

Acked-by: Michal Hocko mho...@suse.cz

Thanks!

 ---
  include/linux/gfp.h |  3 +++
  mm/page_alloc.c | 35 +++
  2 files changed, 38 insertions(+)
 
 diff --git a/include/linux/gfp.h b/include/linux/gfp.h
 index d8eae4d..029570f 100644
 --- a/include/linux/gfp.h
 +++ b/include/linux/gfp.h
 @@ -370,6 +370,9 @@ extern void free_pages(unsigned long addr, unsigned int order);
  extern void free_hot_cold_page(struct page *page, int cold);
  extern void free_hot_cold_page_list(struct list_head *list, int cold);
  
 +extern void __free_accounted_pages(struct page *page, unsigned int order);
 +extern void free_accounted_pages(unsigned long addr, unsigned int order);
 +
  #define __free_page(page) __free_pages((page), 0)
  #define free_page(addr) free_pages((addr), 0)
  
 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
 index b0c5a52..897d8e2 100644
 --- a/mm/page_alloc.c
 +++ b/mm/page_alloc.c
 @@ -2573,6 +2573,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct page *page = NULL;
 	int migratetype = allocflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
 +	struct mem_cgroup *memcg = NULL;
  
 	gfp_mask &= gfp_allowed_mask;
  
 @@ -2591,6 +2592,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (unlikely(!zonelist->_zonerefs->zone))
 		return NULL;
  
 +	/*
 +	 * Will only have any effect when __GFP_KMEMCG is set.  This is
 +	 * verified in the (always inline) callee
 +	 */
 +	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
 +		return NULL;
 +
  retry_cpuset:
 	cpuset_mems_cookie = get_mems_allowed();
  
 @@ -2624,6 +2632,8 @@ out:
 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
 		goto retry_cpuset;
  
 +	memcg_kmem_commit_charge(page, memcg, order);
 +
 	return page;
  }
  EXPORT_SYMBOL(__alloc_pages_nodemask);
 @@ -2676,6 +2686,31 @@ void free_pages(unsigned long addr, unsigned int order)
  
  EXPORT_SYMBOL(free_pages);
  
 +/*
 + * __free_accounted_pages and free_accounted_pages will free pages allocated
 + * with __GFP_KMEMCG.
 + *
 + * Those pages are accounted to a particular memcg, embedded in the
 + * corresponding page_cgroup. To avoid adding a hit in the allocator to search
 + * for that information only to find out that it is NULL for users who have no
 + * interest in that whatsoever, we provide these functions.
 + *
 + * The caller knows better which flags it relies on.
 + */
 +void __free_accounted_pages(struct page *page, unsigned int order)
 +{
 +	memcg_kmem_uncharge_page(page, order);
 +	__free_pages(page, order);
 +}
 +
 +void free_accounted_pages(unsigned long addr, unsigned int order)
 +{
 +	if (addr != 0) {
 +		VM_BUG_ON(!virt_addr_valid((void *)addr));
 +		__free_accounted_pages(virt_to_page((void *)addr), order);
 +	}
 +}
 +
 +
  static void *make_alloc_exact(unsigned long addr, unsigned order, size_t 
 size)
  {
   if (addr) {
 -- 
 1.7.11.4
 

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 04/13] kmem accounting basic infrastructure

2012-09-27 Thread Michal Hocko
On Thu 27-09-12 07:33:00, Tejun Heo wrote:
 Hello, Michal.
 
 On Thu, Sep 27, 2012 at 02:08:06PM +0200, Michal Hocko wrote:
  Yes, because we have many users (basically almost all) who care only
  about the user memory because that's what occupies the vast majority of
  the memory. They usually want to isolate workload which would disrupt
  the global memory otherwise (e.g. backup process vs. database). You
  really do not want to pay an additional overhead for kmem accounting
  here.
 
 I'm not too convinced.  First of all, the overhead added by kmemcg
 isn't big. 

You are probably talking about the memory overhead which is indeed not
that big (except for possible side effects such as fragmentation which you
mention below). But the runtime overhead is present, as far as I
understand from what Glauber said. But, on the other hand, it is fair to
say that those who _want_ to use the feature should pay for it.

 The hot path overhead is quite minimal - it doesn't do much more than
 indirecting one more time.  In terms of memory usage, it sure could
 lead to a bit more fragmentation but even if it gets to several megs
 per cgroup, I don't think that's something excessive.  So, there is
 overhead but I don't believe it to be prohibitive.

Remember that users do not want to pay even something minimal when the
feature is not needed.

   So your question for global vs local switch (that again, doesn't
   exist; only a local *limit* exists) should really be posed in the
   following way:  Can two different use cases with different needs be
   hosted in the same box?
  
  I think this is a good and a relevant question. I think this boils down
  to whether you want to have trusted and untrusted workloads at the same
  machine.
  Trusted loads usually only need user memory accounting because kmem
  consumption should be really negligible (unless kernel is doing
  something really stupid and no kmem limit will help here). 
  On the other hand, untrusted workloads can do nasty things that the
  administrator has a hard time mitigating, and setting a kmem limit can
  help significantly.
  
  IMHO such different loads exist on a single machine quite often (a web
  server and a backup process as the most simplistic example). The per
  hierarchy accounting, therefore, sounds like a good idea without too
  much added complexity (actually the only added complexity is in the
  proper kmem.limit_in_bytes handling which is a single place).
 
 The distinction between trusted and untrusted is something
 artificially created due to the assumed deficiency of kmemcg
 implementation.  

Not really. It doesn't have anything to do with the overhead (be it
memory or runtime). It really boils down to "do I need/want it at all".
Why would I want to think about how much kernel memory is in use in the
first place? Or do you think that user memory accounting should be
deprecated?

 Making things like this visible to userland is a bad
 idea because it locks us into a place where we can't or don't need to
 improve the said deficiencies and end up pushing the difficult
 problems to somewhere else where it will likely be implemented in a
 shabbier way.  There sure are cases when such approach simply cannot
 be avoided, but I really don't think that's the case here - the
 overhead already seems to be at an acceptable level and we're not
 taking away the escape switch.
 
 This is userland visible API.  

I am not sure which API-visible part you have in mind but
kmem.limit_in_bytes will be there whether we go with a global knob or the
no-limit-no-accounting approach.

 We better err on the side of being conservative than going overboard
 with flexibility.  Even if we eventually need to make this switching
 fullly hierarchical, we really should be doing,
 
 1. Implement simple global switching and look for problem cases.
 
 2. Analyze them and see whether the problem case can't be solved in a
better, more intelligent way.
 
 3. If the problem is something structurally inherent or reasonably too
difficult to solve any other way, consider dumping the problem as
config parameters to userland.
 
 We can always expand the flexibility.  Let's do the simple thing
 first.  As an added bonus, it would enable using static_keys for
 accounting branches too.

While I do agree with you in general, and being careful is warranted in
this area as time has shown several times, this seems to be too
restrictive in this particular case.
We would save almost no code with the global knob so I am not sure
what we are actually saving here. A global knob will just give us an
all-or-nothing semantic without making the whole thing simpler. You will
stick with static branches and checks whether the group is accountable
anyway, right?
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 04/13] kmem accounting basic infrastructure

2012-09-27 Thread Michal Hocko
On Thu 27-09-12 10:46:05, Tejun Heo wrote:
[...]
   The part I nacked is enabling kmemcg on a populated cgroup and then
   starting accounting from then without any apparent indication that any
   past allocation hasn't been considered.  You end up with numbers which
   nobody can't tell what they really mean and there's no mechanism to
   guarantee any kind of ordering between populating the cgroup and
   configuring it and there's *no* way to find out what happened
   afterwards neither.  This is properly crazy and definitely deserves a
   nack.
   
  
  Mel's suggestion of not allowing this to happen once the cgroup has
  tasks takes care of this, and is something I thought of myself.
 
 You mean Michal's?  It should also disallow switching if there are
 children cgroups, right?

Right.
 
-- 
Michal Hocko
SUSE Labs


Re: [patch 1/2] mm: memcontrol: handle potential crash when rmap races with task exit

2012-10-04 Thread Michal Hocko
On Thu 04-10-12 14:09:16, Johannes Weiner wrote:
 page_referenced() counts only references of mm's that are associated
 with the memcg hierarchy that is being reclaimed.  However, if it
 races with the owner of the mm exiting, mm-owner may be NULL.  Don't
 crash, just ignore the reference.

This seems to be fixed by Hugh's patch 3a981f48 ("memcg: fix use_hierarchy
css_is_ancestor oops regression") which seems to be merged already.

 
 Signed-off-by: Johannes Weiner han...@cmpxchg.org
 Cc: sta...@kernel.org [3.5]
 ---
  include/linux/memcontrol.h | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
 index 8d9489f..8686294 100644
 --- a/include/linux/memcontrol.h
 +++ b/include/linux/memcontrol.h
 @@ -91,7 +91,7 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
  
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(rcu_dereference((mm)->owner));
 -	match = __mem_cgroup_same_or_subtree(cgroup, memcg);
 +	match = memcg && __mem_cgroup_same_or_subtree(cgroup, memcg);
 	rcu_read_unlock();
 	return match;
  }
 -- 
 1.7.11.4
 

-- 
Michal Hocko
SUSE Labs


Re: [patch 1/2] mm: memcontrol: handle potential crash when rmap races with task exit

2012-10-05 Thread Michal Hocko
On Thu 04-10-12 16:19:08, Johannes Weiner wrote:
 On Thu, Oct 04, 2012 at 08:49:58PM +0200, Michal Hocko wrote:
  On Thu 04-10-12 14:09:16, Johannes Weiner wrote:
   page_referenced() counts only references of mm's that are associated
   with the memcg hierarchy that is being reclaimed.  However, if it
   races with the owner of the mm exiting, mm-owner may be NULL.  Don't
   crash, just ignore the reference.
  
  This seems to be fixed by Hugh's patch 3a981f48 ("memcg: fix use_hierarchy
  css_is_ancestor oops regression") which seems to be merged already.
 
 And look who acked the patch.  I'll show myself out...

My memory is a bit fuzzy but I remember we had two alternatives and
Hugh's won.

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 0/8] THP support for Sparc64

2012-10-05 Thread Michal Hocko
On Thu 04-10-12 14:11:36, David Miller wrote:
 From: Andrea Arcangeli aarca...@redhat.com
 Date: Thu, 4 Oct 2012 12:35:48 +0200
 
  Hi Dave,
  
  On Wed, Oct 03, 2012 at 10:00:27PM -0400, David Miller wrote:
  From: Andrew Morton a...@linux-foundation.org
  Date: Tue, 2 Oct 2012 15:55:44 -0700
  
   I had a shot at integrating all this onto the pending stuff in 
   linux-next. 
   mm: Add and use update_mmu_cache_pmd() in transparent huge page code.
   needed minor massaging in huge_memory.c.  But as Andrea mentioned, we
   ran aground on Gerald's
   http://ozlabs.org/~akpm/mmotm/broken-out/thp-remove-assumptions-on-pgtable_t-type.patch,
   part of the thp-for-s390 work.
  
  While working on a rebase relative to this work, I noticed that the
  s390 patches don't even compile.
  
  It's because of that pmd_pgprot() change from Peter Z. which arrives
  asynchonously via the linux-next tree.  It makes THP start using
  pmd_pgprot() (a new interface) which the s390 patches don't provide.
  
  My suggestion would be to ignore linux-next and port it to -mm only
  and re-send to Andrew. schednuma is by mistake in linux-next, and
  it's not going to get merged as far as I can tell.
 
 Sorry Andrea, that simply is impractical.
 
 The first thing Andrew's patch series does is include linux-next,
 therefore every THP and MM patch in his series is against linux-next.

FWIW there is also a pure -mm (non-rebased) git tree at
http://git.kernel.org/?p=linux/kernel/git/mhocko/mm.git;a=summary
since-3.6 branch. It is based on top of 3.6 with mm patches from
Andrew's tree.

HTH
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 04/13] kmem accounting basic infrastructure

2012-10-05 Thread Michal Hocko
On Thu 04-10-12 07:43:16, Tejun Heo wrote:
[...]
  That is right but I think that the current discussion shows that mixed
  (kmem-disabled and kmem-enabled hierarchies) workloads are far from
  being theoretical and a global knob is just too coarse. I am afraid we
 
 I'm not sure there's much evidence in this thread.  The strongest upto
 this point seems to be performance overhead / difficulty of general
 enough implementation.  As for trusted workload, what are the
 inherent benefits of trusting if you don't have to?

One advantage is that you do _not have_ to consider kernel memory
allocations (which are inherently bound to the kernel version) so the
sizing is much easier and version independent. If you set a limit to XY
because you know what you are doing you certainly do not want to regress
(e.g. because of unnecessary reclaim) just because a certain kernel
allocation got bigger, right?

  will see requests for that per hierarchy shortly and that would
  just add new confusion where a global knob would complicate it
  considerably (do we really want an on/off/per_hierarchy global knob?).
 
 Hmmm?  The global knob is just the same per_hierarchy knob at the
 root.  It's hierarchical after all.

When you said global knob I imagined a mount or boot option. If you want
to have yet another memory.enable_kmem then IMHO it is much easier to
use the set-the-limit-enables-accounting semantic (which is hierarchical
as well).

 Anyways, as long as the "we silently ignore what happened before being
 enabled" behavior is gone, I won't fight this anymore.  It isn't broken
 after all.

OK, it is good that we settled this.

 But, please think about making things simpler in general, cgroup
 is riddled with mis-designed complexities and memcg seems to be
 leading the charge at times.

Yes this is an evolution and it seems that we are slowly getting there.

 
 Thanks.

-- 
Michal Hocko
SUSE Labs


Re: [PATCH] memcg: fix hotplugged memory zone oops

2012-11-03 Thread Michal Hocko
On Fri 02-11-12 16:37:37, Hugh Dickins wrote:
 On Fri, 2 Nov 2012, Michal Hocko wrote:
  On Fri 02-11-12 11:21:59, Michal Hocko wrote:
   On Thu 01-11-12 18:28:02, Hugh Dickins wrote:
  [...]
  
  And I forgot to mention that the following hunk will clash with
  memcg: Simplify mem_cgroup_force_empty_list error handling which is in
  linux-next already (via Tejun's tree). 
 
 Oh, via Tejun's tree.  Right, when I checked mmotm there was no problem.

Yeah, whole that thing goes through Tejun's tree because there are many
follow up clean ups depending on that change.

   Would it be easier to split the patch into the real fix and the hunk
   below? That one doesn't have to go into stable anyway and we would save
   some merging conflicts. The updated fix on top of the -mm tree is below
   for your convenience.
 
 I'd prefer to leave it as one patch, so even the future proof part
 of the fix goes into 3.7 and stable.  But your point is that you have
 already seen the future, and it forks in a slightly different direction!
 
 Well, I don't want to be obstructive, but it doesn't look difficult
 to resolve.  

True.

 Perhaps if I hold off on splitting them, and see if akpm barks at me
 or not :)
 
 Hugh
 
  /**
 @@ -3688,17 +3712,17 @@ unsigned long mem_cgroup_soft_limit_recl
  static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 				int node, int zid, enum lru_list lru)
  {
 -	struct mem_cgroup_per_zone *mz;
 +	struct lruvec *lruvec;
 	unsigned long flags, loop;
 	struct list_head *list;
 	struct page *busy;
 	struct zone *zone;
  
 	zone = &NODE_DATA(node)->node_zones[zid];
 -	mz = mem_cgroup_zoneinfo(memcg, node, zid);
 -	list = &mz->lruvec.lists[lru];
 +	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 +	list = &lruvec->lists[lru];
  
 -	loop = mz->lru_size[lru];
 +	loop = mem_cgroup_get_lru_size(lruvec, lru);
 	/* give some margin against EBUSY etc...*/
 	loop += 256;
 	busy = NULL;

-- 
Michal Hocko
SUSE Labs


Re: [PATCHSET RESEND v2] cgroup: simplify cgroup removal path

2012-11-05 Thread Michal Hocko
On Mon 05-11-12 09:30:24, Tejun Heo wrote:
 Applied to cgroup/cgroup-rmdir-updates and then pulled into
 cgroup/for-3.8 w/ Li's acks added.

OK, I have merged it into -mm git tree.

Thanks
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v6 19/29] memcg: infrastructure to match an allocation to the right cache

2012-11-06 Thread Michal Hocko
On Mon 05-11-12 16:28:37, Andrew Morton wrote:
 On Thu,  1 Nov 2012 16:07:35 +0400
 Glauber Costa glom...@parallels.com wrote:
 
  +static __always_inline struct kmem_cache *
  +memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
 
 I still don't understand why this code uses __always_inline so much.

AFAIU, __always_inline (resp. __attribute__((always_inline))) is the
same thing as inline if optimizations are enabled
(http://ohse.de/uwe/articles/gcc-attributes.html#func-always_inline),
which is the case for the kernel. I have always wondered why we have
this __always_inline thingy.
It has been introduced back in 2004 by Andi but the commit log doesn't
say much:

[PATCH] gcc-3.5 fixes

Trivial gcc-3.5 build fixes.

Andi what was the original motivation for this attribute?
 
 I don't recall seeing the compiler producing out-of-line versions of
 static inline functions

and if it ever decided to, then __always_inline would not help, right?
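
For reference, the kernel's own definition (from
include/linux/compiler-gcc.h, IIRC) and a toy example of the difference
it makes when optimizations are off:

	#define __always_inline	inline __attribute__((always_inline))

	static __always_inline int add1(int x)	/* inlined even at -O0 */
	{
		return x + 1;
	}

	static inline int add2(int x)	/* may stay out of line at -O0 */
	{
		return x + 2;
	}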

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 1/2] mm: Export vm_committed_as

2012-11-06 Thread Michal Hocko
On Mon 05-11-12 22:12:25, KY Srinivasan wrote:
 
 
  -Original Message-
  From: Andrew Morton [mailto:a...@linux-foundation.org]
  Sent: Monday, November 05, 2012 4:45 PM
  To: KY Srinivasan
  Cc: Greg KH; o...@aepfle.de; linux-kernel@vger.kernel.org; 
  a...@firstfloor.org;
  a...@canonical.com; de...@linuxdriverproject.org; linux...@kvack.org;
  Hiroyuki Kamezawa; Michal Hocko; Johannes Weiner; Ying Han
  Subject: Re: [PATCH 1/2] mm: Export vm_committed_as
  
  On Sat, 3 Nov 2012 14:09:38 +
  KY Srinivasan k...@microsoft.com wrote:
  
  
  

 Ok, but you're going to have to get the -mm developers to agree that
 this is ok before I can accept it.
   
Well I guess it won't kill us.
  
   Andrew,
  
   I presumed this was an Ack from you with regards to exporting the
   symbol. Looks like Greg is waiting to hear from you before he can check
   these patches in. Could you provide an explicit Ack.
  
  
  Well, I do have some qualms about exporting vm_committed_as to modules.
  
  vm_committed_as is a global thing and only really makes sense in a
  non-containerised system.  If the application is running within a
  memory cgroup then vm_enough_memory() and the global overcommit policy
  are at best irrelevant and misleading.
  
  If use of vm_committed_as is indeed a bad thing, then exporting it to
  modules might increase the amount of badness in the kernel.
  
  
  I don't think these qualms are serious enough to stand in the way of
  this patch, but I'd be interested in hearing the memcg developers'
  thoughts on the matter?
  
  
  Perhaps you could provide a detailed description of why your module
  actually needs this?  Precisely what information is it looking for
  and why?  If we know that then perhaps a more comfortable alternative
  can be found.
 
 The Hyper-V host has a policy engine for managing available physical
 memory across competing virtual machines. This policy decision
 is based on a number of parameters including the memory pressure
 reported by the guest. Currently, the pressure calculation is based
 on the memory commitment made by the guest. From what I can tell, the
 ratio of currently allocated physical memory to the current memory
 commitment made by the guest (vm_committed_as) is used as one of the
 parameters in making the memory balancing decision on the host. This
 is what Windows guests report to the host. So, I need some measure of
 memory commitments made by the Linux guest. This is the reason I want
 export vm_committed_as.

So IIUC it will be the guest who reports the value, and the guest runs in
ring-0 so it is not in any user process context, right?
If that is correct then memcg doesn't play any role here.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v6 11/29] memcg: allow a memcg with kmem charges to be destructed.

2012-11-06 Thread Michal Hocko
On Thu 01-11-12 17:05:39, Andrew Morton wrote:
 On Thu,  1 Nov 2012 16:07:27 +0400
 Glauber Costa glom...@parallels.com wrote:
 
  Because the ultimate goal of the kmem tracking in memcg is to track slab
  pages as well, we can't guarantee that we'll always be able to point a
  page to a particular process, and migrate the charges along with it -
  since in the common case, a page will contain data belonging to multiple
  processes.
  
  Because of that, when we destroy a memcg, we only make sure the
  destruction will succeed by discounting the kmem charges from the user
  charges when we try to empty the cgroup.
 
 There was a significant conflict with the sched/numa changes in
 linux-next,

Just for the record: the conflict was introduced by 2ef37d3f ("memcg:
Simplify mem_cgroup_force_empty_list error handling") which came in via
Tejun's tree.
Your resolution looks good to me. Sorry about the trouble.

 which I resolved as below.  Please check it.
 
  static int mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
  {
 	struct cgroup *cgrp = memcg->css.cgroup;
 	int node, zid;
 	u64 usage;
 
 	do {
 		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
 			return -EBUSY;
 		/* This is for making all *used* pages to be on LRU. */
 		lru_add_drain_all();
 		drain_all_stock_sync(memcg);
 		mem_cgroup_start_move(memcg);
 		for_each_node_state(node, N_HIGH_MEMORY) {
 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 				enum lru_list lru;
 				for_each_lru(lru) {
 					mem_cgroup_force_empty_list(memcg,
 							node, zid, lru);
 				}
 			}
 		}
 		mem_cgroup_end_move(memcg);
 		memcg_oom_recover(memcg);
 		cond_resched();
 
 		/*
 		 * Kernel memory may not necessarily be trackable to a specific
 		 * process. So they are not migrated, and therefore we can't
 		 * expect their value to drop to 0 here.
 		 * Having res filled up with kmem only is enough.
 		 *
 		 * This is a safety check because mem_cgroup_force_empty_list
 		 * could have raced with mem_cgroup_replace_page_cache callers
 		 * so the lru seemed empty but the page could have been added
 		 * right after the check. RES_USAGE should be safe as we always
 		 * charge before adding to the LRU.
 		 */
 		usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
 			res_counter_read_u64(&memcg->kmem, RES_USAGE);
 	} while (usage > 0);
 
 	return 0;
  }
 

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 1/2] mm: Export vm_committed_as

2012-11-06 Thread Michal Hocko
On Mon 05-11-12 14:33:12, David Rientjes wrote:
 On Mon, 5 Nov 2012, KY Srinivasan wrote:
 
  The Hyper-V host has a policy engine for managing available physical
  memory across competing virtual machines. This policy decision is based
  on a number of parameters including the memory pressure reported by the
  guest. Currently, the pressure calculation is based on the memory
  commitment made by the guest. From what I can tell, the ratio of
  currently allocated physical memory to the current memory commitment
  made by the guest (vm_committed_as) is used as one of the parameters in
  making the memory balancing decision on the host. This is what Windows
  guests report to the host. So, I need some measure of memory commitments
  made by the Linux guest. This is the reason I want to export
  vm_committed_as.
  
 
 I don't think you should export the symbol itself to modules but rather a 
 helper function that returns s64 that just wraps 
 percpu_counter_read_positive() which your driver could use instead.

Agreed, we should rather make sure that nobody can manipulate the value
from modules.

 (And why percpu_counter_read_positive() returns a signed type is a 
 mystery.)

Strange indeed. The last commit changed it from long to s64 to support
values bigger than 2^31, but even the original long doesn't make much
sense to me.
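
Something like the following read-only accessor would match David's
suggestion (just a sketch; the helper name is made up):

	#include <linux/percpu_counter.h>
	#include <linux/export.h>

	extern struct percpu_counter vm_committed_as;

	/* modules can sample the commitment but cannot modify the counter */
	unsigned long vm_memory_committed(void)
	{
		return percpu_counter_read_positive(&vm_committed_as);
	}
	EXPORT_SYMBOL_GPL(vm_memory_committed);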

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 9/9] cgroup_freezer: implement proper hierarchy support

2012-11-07 Thread Michal Hocko
On Sat 03-11-12 01:38:35, Tejun Heo wrote:
[...]
 diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
 index 4f12d31..3262537 100644
 --- a/kernel/cgroup_freezer.c
 +++ b/kernel/cgroup_freezer.c
[...]
 @@ -320,14 +388,39 @@ static void freezer_apply_state(struct freezer 
 *freezer, bool freeze,
   * @freezer: freezer of interest
   * @freeze: whether to freeze or thaw
   *
 - * Freeze or thaw @cgroup according to @freeze.
 + * Freeze or thaw @freezer according to @freeze.  The operations are
 + * recursive - all descendants of @freezer will be affected.
   */
  static void freezer_change_state(struct freezer *freezer, bool freeze)
  {
 +	struct cgroup *pos;
 +
 	/* update @freezer */
 	spin_lock_irq(&freezer->lock);
 	freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
 	spin_unlock_irq(&freezer->lock);
 +
 +	/*
 +	 * Update all its descendants in pre-order traversal.  Each
 +	 * descendant will try to inherit its parent's FREEZING state as
 +	 * CGROUP_FREEZING_PARENT.
 +	 */
 +	rcu_read_lock();
 +	cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
 +		struct freezer *pos_f = cgroup_freezer(pos);
 +		struct freezer *parent = parent_freezer(freezer);

This doesn't seem right. Why are we interested in freezer->parent here
at all? We should either care about freezer->state & CGROUP_FREEZING or
use parent = parent_freezer(pos_f), right?
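
In other words something like this (a sketch):

	rcu_read_lock();
	cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
		struct freezer *pos_f = cgroup_freezer(pos);
		/* the parent of @pos, not of @freezer */
		struct freezer *parent = parent_freezer(pos_f);

		spin_lock_irq(&pos_f->lock);
		freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
				    CGROUP_FREEZING_PARENT);
		spin_unlock_irq(&pos_f->lock);
	}
	rcu_read_unlock();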

 +
 +		/*
 +		 * Our update to @parent->state is already visible which is
 +		 * all we need.  No need to lock @parent.  For more info on
 +		 * synchronization, see freezer_post_create().
 +		 */
 +		spin_lock_irq(&pos_f->lock);
 +		freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
 +				    CGROUP_FREEZING_PARENT);
 +		spin_unlock_irq(&pos_f->lock);
 +	}
 +	rcu_read_unlock();
  }
  
  static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 1/9] cgroup: add cgroup_subsys-post_create()

2012-11-07 Thread Michal Hocko
On Sat 03-11-12 01:38:27, Tejun Heo wrote:
 Currently, there's no way for a controller to find out whether a new
 cgroup finished all ->create() allocations successfully and is
 considered live by cgroup.
 
 This becomes a problem later when we add generic descendants walking
 to cgroup which can be used by controllers as controllers don't have a
 synchronization point where it can synchronize against new cgroups
 appearing in such walks.
 
 This patch adds ->post_create().  It's called after all ->create()
 succeeded and the cgroup is linked into the generic cgroup hierarchy.
 This plays the counterpart of ->pre_destroy().

Hmm, I had to look at cgroup_freezer: implement proper hierarchy
support to actually understand what is the callback good for. The above
sounds as if the callback is needed when a controller wants to use
the new iterators or when pre_destroy is defined.

I think it would be helpful if the changelog described that the callback
is needed when the controller keeps mutable shared state for the
hierarchy. For example, the memory controller doesn't have any such strict
requirement so we can safely use your new iterators without pre_destroy.

Anyway, I like this change because the shared state is now really easy
to implement.
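
For illustration, a controller with such shared hierarchical state could
do something along these lines (a sketch with made-up names, not actual
memcg code):

	static void my_post_create(struct cgroup *cgrp)
	{
		struct my_state *s = cgroup_my_state(cgrp);	/* hypothetical */
		struct my_state *parent;

		if (!cgrp->parent)
			return;
		parent = cgroup_my_state(cgrp->parent);

		/* inherit the mutable shared state under both locks */
		spin_lock(&parent->lock);
		spin_lock(&s->lock);
		s->config = parent->config;
		s->online = true;	/* now safe to visit in tree walks */
		spin_unlock(&s->lock);
		spin_unlock(&parent->lock);
	}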

 Signed-off-by: Tejun Heo t...@kernel.org
 Cc: Glauber Costa glom...@parallels.com

Acked-by: Michal Hocko mho...@suse.cz

 ---
  include/linux/cgroup.h |  1 +
  kernel/cgroup.c| 12 ++--
  2 files changed, 11 insertions(+), 2 deletions(-)
 
 diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
 index fe876a7..b442122 100644
 --- a/include/linux/cgroup.h
 +++ b/include/linux/cgroup.h
 @@ -438,6 +438,7 @@ int cgroup_taskset_size(struct cgroup_taskset *tset);
  
  struct cgroup_subsys {
   struct cgroup_subsys_state *(*create)(struct cgroup *cgrp);
 + void (*post_create)(struct cgroup *cgrp);
   void (*pre_destroy)(struct cgroup *cgrp);
   void (*destroy)(struct cgroup *cgrp);
   int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
 index e3045ad..f05d992 100644
 --- a/kernel/cgroup.c
 +++ b/kernel/cgroup.c
 @@ -4060,10 +4060,15 @@ static long cgroup_create(struct cgroup *parent, 
 struct dentry *dentry,
 	if (err < 0)
 		goto err_remove;
 
 -	/* each css holds a ref to the cgroup's dentry */
 -	for_each_subsys(root, ss)
 +	for_each_subsys(root, ss) {
 +		/* each css holds a ref to the cgroup's dentry */
 		dget(dentry);
 
 +		/* creation succeeded, notify subsystems */
 +		if (ss->post_create)
 +			ss->post_create(cgrp);
 +	}
 +
 	/* The cgroup directory was pre-locked for us */
 	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
 
 @@ -4281,6 +4286,9 @@ static void __init cgroup_init_subsys(struct 
 cgroup_subsys *ss)
 
 	ss->active = 1;
 
 +	if (ss->post_create)
 +		ss->post_create(&ss->root->top_cgroup);
 +
 	/* this function shouldn't be used with modular subsystems, since they
 	 * need to register a subsys_id, among other things */
 	BUG_ON(ss->module);
 -- 
 1.7.11.7
 

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 2/9] cgroup: Use rculist ops for cgroup-children

2012-11-07 Thread Michal Hocko
On Sat 03-11-12 01:38:28, Tejun Heo wrote:
 Use RCU safe list operations for cgroup->children.  This will be used
 to implement cgroup children / descendant walking which can be used by
 controllers.
 
 Note that cgroup_create() now puts a new cgroup at the end of the
 ->children list instead of head.  This isn't strictly necessary but is
 done so that the iteration order is more conventional.
 
 Signed-off-by: Tejun Heo t...@kernel.org

Reviewed-by: Michal Hocko mho...@suse.cz

 ---
  include/linux/cgroup.h | 1 +
  kernel/cgroup.c| 8 +++-
  2 files changed, 4 insertions(+), 5 deletions(-)
 
 diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
 index b442122..90c33eb 100644
 --- a/include/linux/cgroup.h
 +++ b/include/linux/cgroup.h
 @@ -12,6 +12,7 @@
  #include <linux/cpumask.h>
  #include <linux/nodemask.h>
  #include <linux/rcupdate.h>
 +#include <linux/rculist.h>
  #include <linux/cgroupstats.h>
  #include <linux/prio_heap.h>
  #include <linux/rwsem.h>
 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
 index f05d992..cc5d2a0 100644
 --- a/kernel/cgroup.c
 +++ b/kernel/cgroup.c
 @@ -1650,7 +1650,6 @@ static struct dentry *cgroup_mount(struct 
 file_system_type *fs_type,
  
 	free_cg_links(&tmp_cg_links);
 
 -	BUG_ON(!list_empty(&root_cgrp->sibling));
 	BUG_ON(!list_empty(&root_cgrp->children));
 	BUG_ON(root->number_of_cgroups != 1);
 
 @@ -1699,7 +1698,6 @@ static void cgroup_kill_sb(struct super_block *sb) {
 
 	BUG_ON(root->number_of_cgroups != 1);
 	BUG_ON(!list_empty(&cgrp->children));
 -	BUG_ON(!list_empty(&cgrp->sibling));
 
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
 @@ -4053,7 +4051,7 @@ static long cgroup_create(struct cgroup *parent, struct 
 dentry *dentry,
 		}
 	}
 
 -	list_add(&cgrp->sibling, &cgrp->parent->children);
 +	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
 	root->number_of_cgroups++;
 
 	err = cgroup_create_dir(cgrp, dentry, mode);
 @@ -4084,7 +4082,7 @@ static long cgroup_create(struct cgroup *parent, struct 
 dentry *dentry,
 
  err_remove:
 
 -	list_del(&cgrp->sibling);
 +	list_del_rcu(&cgrp->sibling);
 	root->number_of_cgroups--;
 
  err_destroy:
 @@ -4210,7 +4208,7 @@ static int cgroup_rmdir(struct inode *unused_dir, 
 struct dentry *dentry)
 	raw_spin_unlock(&release_list_lock);
 
 	/* delete this cgroup from parent->children */
 -	list_del_init(&cgrp->sibling);
 +	list_del_rcu(&cgrp->sibling);
 
 	list_del_init(&cgrp->allcg_node);
  
 -- 
 1.7.11.7
 

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 3/9] cgroup: implement generic child / descendant walk macros

2012-11-07 Thread Michal Hocko
On Tue 06-11-12 12:31:54, Tejun Heo wrote:
 On Sat, Nov 03, 2012 at 01:38:29AM -0700, Tejun Heo wrote:
  Currently, cgroup doesn't provide any generic helper for walking a
  given cgroup's children or descendants.  This patch adds the following
  three macros.
  
  * cgroup_for_each_child() - walk immediate children of a cgroup.
  
  * cgroup_for_each_descendant_pre() - visit all descendants of a cgroup
in pre-order tree traversal.
  
  * cgroup_for_each_descendant_post() - visit all descendants of a
cgroup in post-order tree traversal.
  
  All three only require the user to hold RCU read lock during
  traversal.  Verifying that each iterated cgroup is online is the
  responsibility of the user.  When used with proper synchronization,
  cgroup_for_each_descendant_pre() can be used to propagate config
  updates to descendants in reliable way.  See comments for details.
 
 Michal, Li, how does this look to you?  Would this be okay for memcg
 too?

Yes, definitely. We are currently iterating by css->id which is, ehm,
impractical. Having a deterministic tree walk is definitely a plus.

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 3/9] cgroup: implement generic child / descendant walk macros

2012-11-07 Thread Michal Hocko
On Sat 03-11-12 01:38:29, Tejun Heo wrote:
[...]
 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
 index cc5d2a0..8bd662c 100644
 --- a/kernel/cgroup.c
 +++ b/kernel/cgroup.c
 @@ -2985,6 +2985,92 @@ static void cgroup_enable_task_cg_lists(void)
 	write_unlock(&css_set_lock);
  }
  
 +/**
 + * cgroup_next_descendant_pre - find the next descendant for pre-order walk
 + * @pos: the current position (%NULL to initiate traversal)
 + * @cgroup: cgroup whose descendants to walk
 + *
 + * To be used by cgroup_for_each_descendant_pre().  Find the next
 + * descendant to visit for pre-order traversal of @cgroup's descendants.
 + */
 +struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 +					  struct cgroup *cgroup)
 +{
 +	struct cgroup *next;
 +
 +	WARN_ON_ONCE(!rcu_read_lock_held());
 +
 +	/* if first iteration, pretend we just visited @cgroup */
 +	if (!pos) {
 +		if (list_empty(&cgroup->children))
 +			return NULL;
 +		pos = cgroup;
 +	}

Is there any specific reason why the root of the tree is excluded?
This is a bit impractical because you have to special-case the root
in the code.

 +
 +	/* visit the first child if exists */
 +	next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
 +	if (next)
 +		return next;
 +
 +	/* no child, visit my or the closest ancestor's next sibling */
 +	do {
 +		next = list_entry_rcu(pos->sibling.next, struct cgroup,
 +				      sibling);
 +		if (&next->sibling != &pos->parent->children)
 +			return next;
 +
 +		pos = pos->parent;
 +	} while (pos != cgroup);
 +
 +	return NULL;
 +}
 +EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
[...]

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 1/9 v2] cgroup: add cgroup_subsys-post_create()

2012-11-07 Thread Michal Hocko
On Wed 07-11-12 09:15:08, Tejun Heo wrote:
[...]
 This patch adds ->post_create().  It's called after all ->create()
 succeeded and the cgroup is linked into the generic cgroup hierarchy.
 This plays the counterpart of ->pre_destroy().
 
 When used in combination with the to-be-added generic descendant
 iterators, ->post_create() can be used to implement reliable state
 inheritance.  It will be explained with the descendant iterators.

Thanks
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 3/9] cgroup: implement generic child / descendant walk macros

2012-11-07 Thread Michal Hocko
On Wed 07-11-12 09:01:18, Tejun Heo wrote:
 Hello, Michal.
 
 On Wed, Nov 07, 2012 at 05:54:57PM +0100, Michal Hocko wrote:
  +struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
  +					   struct cgroup *cgroup)
  +{
  +	struct cgroup *next;
  +
  +	WARN_ON_ONCE(!rcu_read_lock_held());
  +
  +	/* if first iteration, pretend we just visited @cgroup */
  +	if (!pos) {
  +		if (list_empty(&cgroup->children))
  +			return NULL;
  +		pos = cgroup;
  +	}
  
  Is there any specific reason why the root of the tree is excluded?
  This is a bit impractical because you have to special-case the root
  in the code.
 
 Yeah, thought about including it but decided against it for two
 reasons.
 
 * To be consistent with cgroup_for_each_children() - it's a bit weird
   for descendants to include self when children don't.
 
 * Iteration root is likely to require different treatment anyway.
   e.g. for cgroup_freezer, the root is updated to the specified config
   while all the descendants inherit config from its immediate parent.
   They are different.

OK, if there is code which handles the root differently then let's not
overcomplicate this. The naming should make it clear that the root needs
special treatment.
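
E.g. a config update then has to touch the iteration root explicitly
before walking the descendants, something like (a sketch; my_update_one
is made up):

	my_update_one(cgrp);

	rcu_read_lock();
	cgroup_for_each_descendant_pre(pos, cgrp)
		my_update_one(pos);
	rcu_read_unlock();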

I will continue with the review tomorrow.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 1/2] memcg, oom: provide more precise dump info while memcg oom happening

2012-11-07 Thread Michal Hocko
On Wed 07-11-12 16:41:36, Sha Zhengju wrote:
 From: Sha Zhengju handai@taobao.com
 
 Currently, when a memcg oom is happening, the oom dump messages still show
 global state and provide little useful info for users. This patch prints
 more pointed memcg page statistics for the memcg oom.
 
 Signed-off-by: Sha Zhengju handai@taobao.com
 Cc: Michal Hocko mho...@suse.cz
 Cc: KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com
 Cc: David Rientjes rient...@google.com
 Cc: Andrew Morton a...@linux-foundation.org
 ---
  mm/memcontrol.c |   71 
 ---
  mm/oom_kill.c   |6 +++-
  2 files changed, 66 insertions(+), 11 deletions(-)
 
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 0eab7d5..2df5e72 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
[...]
 @@ -1501,8 +1509,59 @@ static void move_unlock_mem_cgroup(struct mem_cgroup 
 *memcg,
 	spin_unlock_irqrestore(&memcg->move_lock, *flags);
  }
 
 +#define K(x) ((x) << (PAGE_SHIFT-10))
 +static void mem_cgroup_print_oom_stat(struct mem_cgroup *memcg)
 +{
 +	struct mem_cgroup *mi;
 +	unsigned int i;
 +
 +	if (!memcg->use_hierarchy && memcg != root_mem_cgroup) {
 +		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 +			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 +				continue;
 +			printk(KERN_CONT "%s:%ldKB ", mem_cgroup_stat_names[i],
 +				K(mem_cgroup_read_stat(memcg, i)));
 +		}
 +
 +		for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
 +			printk(KERN_CONT "%s:%lu ", mem_cgroup_events_names[i],
 +				mem_cgroup_read_events(memcg, i));
 +
 +		for (i = 0; i < NR_LRU_LISTS; i++)
 +			printk(KERN_CONT "%s:%luKB ", mem_cgroup_lru_names[i],
 +				K(mem_cgroup_nr_lru_pages(memcg, BIT(i))));
 +	} else {
 +
 +		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 +			long long val = 0;
 +
 +			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 +				continue;
 +			for_each_mem_cgroup_tree(mi, memcg)
 +				val += mem_cgroup_read_stat(mi, i);
 +			printk(KERN_CONT "%s:%lldKB ",
 +				mem_cgroup_stat_names[i], K(val));
 +		}
 +
 +		for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
 +			unsigned long long val = 0;
 +
 +			for_each_mem_cgroup_tree(mi, memcg)
 +				val += mem_cgroup_read_events(mi, i);
 +			printk(KERN_CONT "%s:%llu ",
 +				mem_cgroup_events_names[i], val);
 +		}
 +
 +		for (i = 0; i < NR_LRU_LISTS; i++) {
 +			unsigned long long val = 0;
 +
 +			for_each_mem_cgroup_tree(mi, memcg)
 +				val += mem_cgroup_nr_lru_pages(mi, BIT(i));
 +			printk(KERN_CONT "%s:%lluKB ",
 +				mem_cgroup_lru_names[i], K(val));
 +		}
 +	}

This is just plain ugly. for_each_mem_cgroup_tree is use_hierarchy aware
and there is no need for the if (use_hierarchy) part.
The memcg != root_mem_cgroup test doesn't make much sense either because
we call that a global oom killer ;)
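
I mean the whole function could collapse to a single set of loops along
these lines (a sketch):

	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
		long long val = 0;

		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
			continue;
		/* visits only memcg itself when !use_hierarchy */
		for_each_mem_cgroup_tree(mi, memcg)
			val += mem_cgroup_read_stat(mi, i);
		printk(KERN_CONT "%s:%lldKB ", mem_cgroup_stat_names[i], K(val));
	}

and analogously for the events and LRU counters.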
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 2/2] oom: rework dump_tasks to optimize memcg-oom situation

2012-11-07 Thread Michal Hocko
On Wed 07-11-12 16:41:59, Sha Zhengju wrote:
 From: Sha Zhengju handai@taobao.com
 
 If a memcg oom is happening, don't scan all system tasks to dump the
 memory state of eligible tasks; instead iterate only over the processes
 attached to the oom memcg and avoid the rcu lock.

you have replaced the rcu lock with css_set_lock which is, well, heavier
than rcu. Besides that, the patch is not correct because you have excluded
all tasks that are in subgroups - you iterate only through the top level
one.
I am not sure the whole optimization would be a win even if implemented
correctly. We do scan through more tasks currently and most of them are
not relevant, but then you would need to exclude task_in_mem_cgroup from
oom_unkillable_task and that would be more code churn than the win.

 Signed-off-by: Sha Zhengju handai@taobao.com
 Cc: Michal Hocko mho...@suse.cz
 Cc: KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com
 Cc: David Rientjes rient...@google.com
 Cc: Andrew Morton a...@linux-foundation.org
 ---
  include/linux/memcontrol.h |7 +
  include/linux/oom.h|2 +
  mm/memcontrol.c|   14 +++
  mm/oom_kill.c  |   55 ++-
  4 files changed, 56 insertions(+), 22 deletions(-)
 
[...]
 diff --git a/include/linux/oom.h b/include/linux/oom.h
 index 20b5c46..9ba3344 100644
 --- a/include/linux/oom.h
 +++ b/include/linux/oom.h
 @@ -57,6 +57,8 @@ extern enum oom_scan_t oom_scan_process_thread(struct 
 task_struct *task,
   unsigned long totalpages, const nodemask_t *nodemask,
   bool force_kill);
  
 +extern inline void dump_per_task(struct task_struct *p,
 + const nodemask_t *nodemask);
  extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
   int order, nodemask_t *mask, bool force_kill);
  extern int register_oom_notifier(struct notifier_block *nb);
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 2df5e72..fe648f8 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -1665,6 +1665,20 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup 
 *memcg)
   return min(limit, memsw);
  }
  
 +void dump_tasks_memcg(const struct mem_cgroup *memcg,
 +		      const nodemask_t *nodemask)
 +{
 +	struct cgroup_iter it;
 +	struct task_struct *task;
 +	struct cgroup *cgroup = memcg->css.cgroup;
 +
 +	cgroup_iter_start(cgroup, &it);
 +	while ((task = cgroup_iter_next(cgroup, &it))) {
 +		dump_per_task(task, nodemask);
 +	}
 +
 +	cgroup_iter_end(cgroup, &it);
 +}
 +
  static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t 
 gfp_mask,
int order)
  {
 diff --git a/mm/oom_kill.c b/mm/oom_kill.c
 index 4b8a6dd..aaf6237 100644
 --- a/mm/oom_kill.c
 +++ b/mm/oom_kill.c
 @@ -367,6 +367,32 @@ static struct task_struct *select_bad_process(unsigned 
 int *ppoints,
   return chosen;
  }
  
 +inline void dump_per_task(struct task_struct *p, const nodemask_t *nodemask)
 +{
 +	struct task_struct *task;
 +
 +	if (oom_unkillable_task(p, NULL, nodemask))
 +		return;
 +
 +	task = find_lock_task_mm(p);
 +	if (!task) {
 +		/*
 +		 * This is a kthread or all of p's threads have already
 +		 * detached their mm's.  There's no need to report
 +		 * them; they can't be oom killed anyway.
 +		 */
 +		return;
 +	}
 +
 +	pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
 +		task->pid, from_kuid(init_user_ns, task_uid(task)),
 +		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 +		task->mm->nr_ptes,
 +		get_mm_counter(task->mm, MM_SWAPENTS),
 +		task->signal->oom_score_adj, task->comm);
 +	task_unlock(task);
 +}
 +
  /**
   * dump_tasks - dump current memory state of all system tasks
   * @memcg: current's memory controller, if constrained
 @@ -381,32 +407,17 @@ static struct task_struct *select_bad_process(unsigned 
 int *ppoints,
  static void dump_tasks(const struct mem_cgroup *memcg,
 			const nodemask_t *nodemask)
  {
 	struct task_struct *p;
 -	struct task_struct *task;
 
 	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes swapents "
 		"oom_score_adj name\n");
 -	rcu_read_lock();
 -	for_each_process(p) {
 -		if (oom_unkillable_task(p, memcg, nodemask))
 -			continue;
 -
 -		task = find_lock_task_mm(p);
 -		if (!task) {
 -			/*
 -			 * This is a kthread or all of p's threads have already
 -			 * detached their mm's.  There's no need to report
 -			 * them; they can't be oom killed anyway.
 -			 */
 -			continue;
 -		}
 
 -		pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
 -			task->pid, from_kuid(init_user_ns

Re: [PATCH v2] memcg: oom: fix totalpages calculation for memory.swappiness==0

2012-11-07 Thread Michal Hocko
On Wed 07-11-12 14:10:25, Andrew Morton wrote:
 On Tue, 16 Oct 2012 00:04:08 +0200
 Michal Hocko mho...@suse.cz wrote:
 
  As Kosaki correctly pointed out, the global reclaim doesn't have this
  issue because we _do_ swap on swappiness==0 so the swap space has
  to be considered. So the v2 is just acks + changelog fix.
  
  Changes since v1
  - drop a note about global swappiness affected as well from the
changelog
  - stable needs 3.2+ rather than 3.5+ because the fe35004f has been
backported to stable
  ---
  From c2ae4849f09dbfda6b61472c6dd1fd8c2fe8ac81 Mon Sep 17 00:00:00 2001
  From: Michal Hocko mho...@suse.cz
  Date: Wed, 10 Oct 2012 15:46:54 +0200
  Subject: [PATCH] memcg: oom: fix totalpages calculation for
   memory.swappiness==0
  
  oom_badness takes totalpages argument which says how many pages are
  available and it uses it as a base for the score calculation. The value
  is calculated by mem_cgroup_get_limit which considers both limit and
  total_swap_pages (resp. memsw portion of it).
  
  This is usually correct but since fe35004f (mm: avoid swapping out
  with swappiness==0) we do not swap when swappiness is 0 which means
  that we cannot really use up all the totalpages pages. This in turn
  confuses the oom score calculation if the memcg limit is much smaller
  than the available swap, because the used memory (capped by the limit) is
  negligible compared to totalpages so the resulting score is too small
  if adj!=0 (typically a task with CAP_SYS_ADMIN or non-zero oom_score_adj).
  A wrong process might be selected as a result.
  
  The problem can be worked around by checking mem_cgroup_swappiness==0
  and not considering swap at all in such a case.
  
  Signed-off-by: Michal Hocko mho...@suse.cz
  Acked-by: David Rientjes rient...@google.com
  Acked-by: Johannes Weiner han...@cmpxchg.org
  Acked-by: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com
  Acked-by: KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com
  Cc: stable [3.2+]
 
 That's Cc: sta...@vger.kernel.org, please.

Will do next time.

 It's unobvious from the changelog that a -stable backport is really
 needed.  The bug looks pretty obscure and has been there for a long
 time.

Yes, but it is not _that_ long since fe35004f made it into the stable
trees (e.g. 3.2.29).
The reason why we probably do not see many reports is that people are
not yet used to swappiness==0 really working these days - especially
with memcg where it means _really_ no swapping.

 Realistically, is anyone likely to hurt from this?

The primary motivation for the fix was a real report by a customer.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v2] memcg: oom: fix totalpages calculation for memory.swappiness==0

2012-11-08 Thread Michal Hocko
On Wed 07-11-12 14:53:40, Andrew Morton wrote:
 On Wed, 7 Nov 2012 23:46:40 +0100
 Michal Hocko mho...@suse.cz wrote:
 
   Realistically, is anyone likely to hurt from this?
  
  The primary motivation for the fix was a real report by a customer.
 
 Describe it please and I'll copy it to the changelog.

The original issue (wrong tasks get killed in a small group with memcg
swappiness=0) has been reported on top of our 3.0 based kernel (with
fe35004f backported). I have tried to replicate it by the test case
mentioned at https://lkml.org/lkml/2012/10/10/223.

As David correctly pointed out (https://lkml.org/lkml/2012/10/10/418),
a significant role was played by the fact that all the processes in the
group have CAP_SYS_ADMIN, but oom_score_adj has a similar effect.
Say there is 2G of swap space, which is 524288 pages. If you add the
CAP_SYS_ADMIN bonus then you have a -15728 score bias. This means that
all tasks with less than ~60M get the minimum score and it is task
ordering which determines who gets killed as a result.
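
(The arithmetic: 2G of swap is 2*1024*1024/4 = 524288 pages, the
CAP_SYS_ADMIN bonus is 3% of totalpages, i.e. 524288*3/100 = 15728
points, and 15728 pages is roughly 60M worth of charges.)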

To summarize: users of small groups (relative to the swap size) with
CAP_SYS_ADMIN tasks resp. oom_score_adj are affected the most; others
might see an unexpected oom_badness calculation.
Whether this is a representative workload I don't know, but I think
it is worth fixing and pushing to stable as well.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 3/9] cgroup: implement generic child / descendant walk macros

2012-11-08 Thread Michal Hocko
On Sat 03-11-12 01:38:29, Tejun Heo wrote:
 Currently, cgroup doesn't provide any generic helper for walking a
 given cgroup's children or descendants.  This patch adds the following
 three macros.
 
 * cgroup_for_each_child() - walk immediate children of a cgroup.
 
 * cgroup_for_each_descendant_pre() - visit all descendants of a cgroup
   in pre-order tree traversal.
 
 * cgroup_for_each_descendant_post() - visit all descendants of a
   cgroup in post-order tree traversal.
 
 All three only require the user to hold RCU read lock during
 traversal.  Verifying that each iterated cgroup is online is the
 responsibility of the user.  When used with proper synchronization,
 cgroup_for_each_descendant_pre() can be used to propagate config
 updates to descendants in reliable way.  See comments for details.
 
 Signed-off-by: Tejun Heo t...@kernel.org

I will convert mem_cgroup_iter to use this rather than css_get_next
after this gets into the next tree so that it can fly via Andrew.

Reviewed-by: Michal Hocko mho...@suse.cz

Just a minor nit: you are talking about config propagation while I
would consider state propagation clearer and less confusing. Config
is usually stable enough that post_create is not necessary for
syncing (e.g. memcg.swappiness). It is the state which must be consistent
throughout the hierarchy that matters here.

Thanks!
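
The conversion should be roughly this shape (a sketch, not the actual
patch):

	struct cgroup *pos;

	rcu_read_lock();
	cgroup_for_each_descendant_pre(pos, root->css.cgroup) {
		struct mem_cgroup *memcg = mem_cgroup_from_cont(pos);

		if (!css_tryget(&memcg->css))
			continue;	/* racing with ->pre_destroy() */
		/* ... visit memcg ... */
		css_put(&memcg->css);
	}
	rcu_read_unlock();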

 ---
  include/linux/cgroup.h | 82 +++
  kernel/cgroup.c| 86 
 ++
  2 files changed, 168 insertions(+)
 
 diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
 index 90c33eb..0020329 100644
 --- a/include/linux/cgroup.h
 +++ b/include/linux/cgroup.h
 @@ -534,6 +534,88 @@ static inline struct cgroup* task_cgroup(struct 
 task_struct *task,
 	return task_subsys_state(task, subsys_id)->cgroup;
  }
  
 +/**
 + * cgroup_for_each_child - iterate through children of a cgroup
 + * @pos: the cgroup * to use as the loop cursor
 + * @cgroup: cgroup whose children to walk
 + *
 + * Walk @cgroup's children.  Must be called under rcu_read_lock().  A child
 + * cgroup which hasn't finished ->post_create() or already has finished
 + * ->pre_destroy() may show up during traversal and it's each subsystem's
 + * responsibility to verify that each @pos is alive.
 + *
 + * If a subsystem synchronizes against the parent in its ->post_create()
 + * and before starting iterating, a cgroup which finished ->post_create()
 + * is guaranteed to be visible in the future iterations.
 + */
 +#define cgroup_for_each_child(pos, cgroup)				\
 +	list_for_each_entry_rcu(pos, &(cgroup)->children, sibling)
 +
 +struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 +					  struct cgroup *cgroup);
 +
 +/**
 + * cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants
 + * @pos: the cgroup * to use as the loop cursor
 + * @cgroup: cgroup whose descendants to walk
 + *
 + * Walk @cgroup's descendants.  Must be called under rcu_read_lock().  A
 + * descendant cgroup which hasn't finished ->post_create() or already has
 + * finished ->pre_destroy() may show up during traversal and it's each
 + * subsystem's responsibility to verify that each @pos is alive.
 + *
 + * If a subsystem synchronizes against the parent in its ->post_create()
 + * and before starting iterating, and synchronizes against @pos on each
 + * iteration, any descendant cgroup which finished ->post_create() is
 + * guaranteed to be visible in the future iterations.
 + *
 + * In other words, the following guarantees that a descendant can't escape
 + * configuration of its ancestors.
 + *
 + * my_post_create(@cgrp)
 + * {
 + *	Lock @cgrp->parent and @cgrp;
 + *	Inherit config from @cgrp->parent;
 + *	Unlock both.
 + * }
 + *
 + * my_update_config(@cgrp)
 + * {
 + *	Lock @cgrp;
 + *	Update @cgrp's config;
 + *	Unlock @cgrp;
 + *
 + *	cgroup_for_each_descendant_pre(@pos, @cgrp) {
 + *		Lock @pos;
 + *		Verify @pos is alive and inherit config from @pos->parent;
 + *		Unlock @pos;
 + *	}
 + * }
 + *
 + * Alternatively, a subsystem may choose to use a single global lock to
 + * synchronize ->post_create() and ->pre_destroy() against tree-walking
 + * operations.
 + */
 +#define cgroup_for_each_descendant_pre(pos, cgroup)			\
 +	for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos);	\
 +	     pos = cgroup_next_descendant_pre((pos), (cgroup)))
 +
 +struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
 +					   struct cgroup *cgroup);
 +
 +/**
 + * cgroup_for_each_descendant_post - post-order walk of a cgroup's 
 descendants
 + * @pos: the cgroup * to use as the loop cursor
 + * @cgroup: cgroup whose descendants to walk
 + *
 + * Similar to cgroup_for_each_descendant_pre() but performs post-order
 + * traversal instead.  Note that the walk visibility

Re: [PATCH 4/9] cgroup_freezer: trivial cleanups

2012-11-08 Thread Michal Hocko
On Sat 03-11-12 01:38:30, Tejun Heo wrote:
 * Clean-up indentation and line-breaks.  Drop the invalid comment
   about freezer->lock.
 
 * Make all internal functions take @freezer instead of both @cgroup
   and @freezer.
 
 Signed-off-by: Tejun Heo t...@kernel.org

Looks reasonable
Reviewed-by: Michal Hocko mho...@suse.cz

 ---
  kernel/cgroup_freezer.c | 41 +++--
  1 file changed, 19 insertions(+), 22 deletions(-)
 
 diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
 index bedefd9..975b3d8 100644
 --- a/kernel/cgroup_freezer.c
 +++ b/kernel/cgroup_freezer.c
 @@ -29,17 +29,15 @@ enum freezer_state {
  };
  
  struct freezer {
 - struct cgroup_subsys_state css;
 - enum freezer_state state;
 - spinlock_t lock; /* protects _writes_ to state */
 + struct cgroup_subsys_state  css;
 + enum freezer_state  state;
 + spinlock_t  lock;
  };
  
 -static inline struct freezer *cgroup_freezer(
 - struct cgroup *cgroup)
 +static inline struct freezer *cgroup_freezer(struct cgroup *cgroup)
  {
 - return container_of(
 - cgroup_subsys_state(cgroup, freezer_subsys_id),
 - struct freezer, css);
 + return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id),
 + struct freezer, css);
  }
  
  static inline struct freezer *task_freezer(struct task_struct *task)
 @@ -180,8 +178,9 @@ out:
   * migrated into or out of @cgroup, so we can't verify task states against
   * @freezer state here.  See freezer_attach() for details.
   */
 -static void update_if_frozen(struct cgroup *cgroup, struct freezer *freezer)
 +static void update_if_frozen(struct freezer *freezer)
  {
 +	struct cgroup *cgroup = freezer->css.cgroup;
 	struct cgroup_iter it;
 	struct task_struct *task;
 
 @@ -211,12 +210,11 @@ notyet:
  static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
 			struct seq_file *m)
  {
 -	struct freezer *freezer;
 +	struct freezer *freezer = cgroup_freezer(cgroup);
 	enum freezer_state state;
 
 -	freezer = cgroup_freezer(cgroup);
 	spin_lock_irq(&freezer->lock);
 -	update_if_frozen(cgroup, freezer);
 +	update_if_frozen(freezer);
 	state = freezer->state;
 	spin_unlock_irq(&freezer->lock);
  
 @@ -225,8 +223,9 @@ static int freezer_read(struct cgroup *cgroup, struct 
 cftype *cft,
   return 0;
  }
  
 -static void freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
 +static void freeze_cgroup(struct freezer *freezer)
  {
 +	struct cgroup *cgroup = freezer->css.cgroup;
 	struct cgroup_iter it;
 	struct task_struct *task;
 
 @@ -236,8 +235,9 @@ static void freeze_cgroup(struct cgroup *cgroup, struct 
 freezer *freezer)
 	cgroup_iter_end(cgroup, &it);
  }
 
 -static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
 +static void unfreeze_cgroup(struct freezer *freezer)
  {
 +	struct cgroup *cgroup = freezer->css.cgroup;
 	struct cgroup_iter it;
 	struct task_struct *task;
 
 @@ -247,11 +247,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, 
 struct freezer *freezer)
 	cgroup_iter_end(cgroup, &it);
  }
  
 -static void freezer_change_state(struct cgroup *cgroup,
 +static void freezer_change_state(struct freezer *freezer,
 				 enum freezer_state goal_state)
  {
 -	struct freezer *freezer = cgroup_freezer(cgroup);
 -
 	/* also synchronizes against task migration, see freezer_attach() */
 	spin_lock_irq(&freezer->lock);
 
 @@ -260,13 +258,13 @@ static void freezer_change_state(struct cgroup *cgroup,
 		if (freezer->state != CGROUP_THAWED)
 			atomic_dec(&system_freezing_cnt);
 		freezer->state = CGROUP_THAWED;
 -		unfreeze_cgroup(cgroup, freezer);
 +		unfreeze_cgroup(freezer);
 		break;
 	case CGROUP_FROZEN:
 		if (freezer->state == CGROUP_THAWED)
 			atomic_inc(&system_freezing_cnt);
 		freezer->state = CGROUP_FREEZING;
 -		freeze_cgroup(cgroup, freezer);
 +		freeze_cgroup(freezer);
 		break;
 	default:
 		BUG();
 @@ -275,8 +273,7 @@ static void freezer_change_state(struct cgroup *cgroup,
 	spin_unlock_irq(&freezer->lock);
  }
  
 -static int freezer_write(struct cgroup *cgroup,
 -  struct cftype *cft,
 +static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
const char *buffer)
  {
   enum freezer_state goal_state;
 @@ -288,7 +285,7 @@ static int freezer_write(struct cgroup *cgroup,
   else
   return -EINVAL;
  
 - freezer_change_state(cgroup, goal_state);
 + freezer_change_state(cgroup_freezer(cgroup), goal_state);
   return 0;
  }
  
 -- 
 1.7.11.7
 

-- 
Michal Hocko
SUSE Labs

Re: [PATCH 5/9] cgroup_freezer: prepare freezer_change_state() for full hierarchy support

2012-11-08 Thread Michal Hocko
On Sat 03-11-12 01:38:31, Tejun Heo wrote:
 * Make freezer_change_state() take bool @freeze instead of enum
   freezer_state.
 
 * Separate out freezer_apply_state() out of freezer_change_state().
   This makes freezer_change_state() a rather silly thin wrapper.  It
   will be filled with hierarchy handling later on.
 
 This patch doesn't introduce any behavior change.
 
 Signed-off-by: Tejun Heo t...@kernel.org

Makes sense
Reviewed-by: Michal Hocko mho...@suse.cz

 ---
  kernel/cgroup_freezer.c | 48 ++--
  1 file changed, 30 insertions(+), 18 deletions(-)
 
 diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
 index 975b3d8..2690830 100644
 --- a/kernel/cgroup_freezer.c
 +++ b/kernel/cgroup_freezer.c
 @@ -247,45 +247,57 @@ static void unfreeze_cgroup(struct freezer *freezer)
   cgroup_iter_end(cgroup, it);
  }
  
 -static void freezer_change_state(struct freezer *freezer,
 -				 enum freezer_state goal_state)
 +/**
 + * freezer_apply_state - apply state change to a single cgroup_freezer
 + * @freezer: freezer to apply state change to
 + * @freeze: whether to freeze or unfreeze
 + */
 +static void freezer_apply_state(struct freezer *freezer, bool freeze)
  {
 	/* also synchronizes against task migration, see freezer_attach() */
 -	spin_lock_irq(&freezer->lock);
 +	lockdep_assert_held(&freezer->lock);
 
 -	switch (goal_state) {
 -	case CGROUP_THAWED:
 -		if (freezer->state != CGROUP_THAWED)
 -			atomic_dec(&system_freezing_cnt);
 -		freezer->state = CGROUP_THAWED;
 -		unfreeze_cgroup(freezer);
 -		break;
 -	case CGROUP_FROZEN:
 +	if (freeze) {
 		if (freezer->state == CGROUP_THAWED)
 			atomic_inc(&system_freezing_cnt);
 		freezer->state = CGROUP_FREEZING;
 		freeze_cgroup(freezer);
 -		break;
 -	default:
 -		BUG();
 +	} else {
 +		if (freezer->state != CGROUP_THAWED)
 +			atomic_dec(&system_freezing_cnt);
 +		freezer->state = CGROUP_THAWED;
 +		unfreeze_cgroup(freezer);
 	}
 +}
 
 +/**
 + * freezer_change_state - change the freezing state of a cgroup_freezer
 + * @freezer: freezer of interest
 + * @freeze: whether to freeze or thaw
 + *
 + * Freeze or thaw @cgroup according to @freeze.
 + */
 +static void freezer_change_state(struct freezer *freezer, bool freeze)
 +{
 +	/* update @freezer */
 +	spin_lock_irq(&freezer->lock);
 +	freezer_apply_state(freezer, freeze);
 	spin_unlock_irq(&freezer->lock);
  }
  
  static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
const char *buffer)
  {
 - enum freezer_state goal_state;
 + bool freeze;
  
   if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
 - goal_state = CGROUP_THAWED;
 + freeze = false;
   else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
 - goal_state = CGROUP_FROZEN;
 + freeze = true;
   else
   return -EINVAL;
  
 - freezer_change_state(cgroup_freezer(cgroup), goal_state);
 + freezer_change_state(cgroup_freezer(cgroup), freeze);
   return 0;
  }
  
 -- 
 1.7.11.7
 

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 6/9] cgroup_freezer: make freezer-state mask of flags

2012-11-08 Thread Michal Hocko
On Sat 03-11-12 01:38:32, Tejun Heo wrote:
 freezer-state was an enum value - one of THAWED, FREEZING and FROZEN.
 As the scheduled full hierarchy support requires more than one
 freezing condition, switch it to mask of flags.  If FREEZING is not
 set, it's thawed.  FREEZING is set if freezing or frozen.  If frozen,
 both FREEZING and FROZEN are set.  Now that tasks can be attached to
 an already frozen cgroup, this also makes freezing condition checks
 more natural.
 
 This patch doesn't introduce any behavior change.
 
 Signed-off-by: Tejun Heo t...@kernel.org

I think it would be nicer to use the freezer_state_flags enum rather than
unsigned int for the state. I would even expect gcc to complain about
that, but it looks like -fstrict-enums is C++ specific (so long, enum
safety).
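
I.e. I would find this more descriptive (a sketch; gcc will not enforce
it anyway, as noted above):

	struct freezer {
		struct cgroup_subsys_state	css;
		enum freezer_state_flags	state;	/* mask of the flags */
		spinlock_t			lock;
	};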

Anyway
Reviewed-by: Michal Hocko mho...@suse.cz

 ---
  kernel/cgroup_freezer.c | 60 
 ++---
  1 file changed, 27 insertions(+), 33 deletions(-)
 
 diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
 index 2690830..e76aa9f 100644
 --- a/kernel/cgroup_freezer.c
 +++ b/kernel/cgroup_freezer.c
 @@ -22,15 +22,14 @@
  #include <linux/freezer.h>
  #include <linux/seq_file.h>
 
 -enum freezer_state {
 -	CGROUP_THAWED = 0,
 -	CGROUP_FREEZING,
 -	CGROUP_FROZEN,
 +enum freezer_state_flags {
 +	CGROUP_FREEZING	= (1 << 1), /* this freezer is freezing */
 +	CGROUP_FROZEN	= (1 << 3), /* this and its descendants frozen */
  };
 
  struct freezer {
 	struct cgroup_subsys_state	css;
 -	enum freezer_state		state;
 +	unsigned int			state;
 	spinlock_t			lock;
  };
 
 @@ -48,12 +47,10 @@ static inline struct freezer *task_freezer(struct 
 task_struct *task)
 
  bool cgroup_freezing(struct task_struct *task)
  {
 -	enum freezer_state state;
 	bool ret;
 
 	rcu_read_lock();
 -	state = task_freezer(task)->state;
 -	ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
 +	ret = task_freezer(task)->state & CGROUP_FREEZING;
 	rcu_read_unlock();
 
 	return ret;
 @@ -63,10 +60,13 @@ bool cgroup_freezing(struct task_struct *task)
  * cgroups_write_string() limits the size of freezer state strings to
  * CGROUP_LOCAL_BUFFER_SIZE
  */
 -static const char *freezer_state_strs[] = {
 -	"THAWED",
 -	"FREEZING",
 -	"FROZEN",
 +static const char *freezer_state_strs(unsigned int state)
 +{
 +	if (state & CGROUP_FROZEN)
 +		return "FROZEN";
 +	if (state & CGROUP_FREEZING)
 +		return "FREEZING";
 +	return "THAWED";
  };
  
  /*
 @@ -91,7 +91,6 @@ static struct cgroup_subsys_state *freezer_create(struct 
 cgroup *cgroup)
 	return ERR_PTR(-ENOMEM);
 
 	spin_lock_init(&freezer->lock);
 -	freezer->state = CGROUP_THAWED;
 	return &freezer->css;
  }
 
 @@ -99,7 +98,7 @@ static void freezer_destroy(struct cgroup *cgroup)
  {
 	struct freezer *freezer = cgroup_freezer(cgroup);
 
 -	if (freezer->state != CGROUP_THAWED)
 +	if (freezer->state & CGROUP_FREEZING)
 		atomic_dec(&system_freezing_cnt);
 	kfree(freezer);
  }
 @@ -129,15 +128,13 @@ static void freezer_attach(struct cgroup *new_cgrp, 
 struct cgroup_taskset *tset)
 	 * Tasks in @tset are on @new_cgrp but may not conform to its
 	 * current state before executing the following - !frozen tasks may
 	 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
 -	 * This means that, to determine whether to freeze, one should test
 -	 * whether the state equals THAWED.
 	 */
 	cgroup_taskset_for_each(task, new_cgrp, tset) {
 -		if (freezer->state == CGROUP_THAWED) {
 +		if (!(freezer->state & CGROUP_FREEZING)) {
 			__thaw_task(task);
 		} else {
 			freeze_task(task);
 -			freezer->state = CGROUP_FREEZING;
 +			freezer->state &= ~CGROUP_FROZEN;
 		}
 	}
 
 @@ -159,11 +156,7 @@ static void freezer_fork(struct task_struct *task)
 		goto out;
 
 	spin_lock_irq(&freezer->lock);
 -	/*
 -	 * @task might have been just migrated into a FROZEN cgroup.  Test
 -	 * equality with THAWED.  Read the comment in freezer_attach().
 -	 */
 -	if (freezer->state != CGROUP_THAWED)
 +	if (freezer->state & CGROUP_FREEZING)
 		freeze_task(task);
 	spin_unlock_irq(&freezer->lock);
  out:
 @@ -184,7 +177,8 @@ static void update_if_frozen(struct freezer *freezer)
   struct cgroup_iter it;
   struct task_struct *task;
  
 -	if (freezer->state != CGROUP_FREEZING)
 +	if (!(freezer->state & CGROUP_FREEZING) ||
 +	    (freezer->state & CGROUP_FROZEN))
 		return;
 
 	cgroup_iter_start(cgroup, &it);
 @@ -202,7 +196,7 @@ static void update_if_frozen(struct freezer *freezer)
 	}
 	}
 
 -	freezer->state = CGROUP_FROZEN

Re: [PATCH v6 19/29] memcg: infrastructure to match an allocation to the right cache

2012-11-08 Thread Michal Hocko
On Tue 06-11-12 09:03:54, Michal Hocko wrote:
 On Mon 05-11-12 16:28:37, Andrew Morton wrote:
  On Thu,  1 Nov 2012 16:07:35 +0400
  Glauber Costa glom...@parallels.com wrote:
  
   +static __always_inline struct kmem_cache *
   +memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
  
  I still don't understand why this code uses __always_inline so much.
 
 AFAIU, __always_inline (resp. __attribute__((always_inline))) is the
 same thing as inline if optimizations are enabled
 (http://ohse.de/uwe/articles/gcc-attributes.html#func-always_inline).

And this doesn't tell the whole story, because there is -fearly-inlining
which is enabled by default and makes a difference when optimizations
are enabled, so __always_inline really does enforce inlining.

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 1/2] memcg, oom: provide more precise dump info while memcg oom happening

2012-11-08 Thread Michal Hocko
On Thu 08-11-12 20:37:45, Sha Zhengju wrote:
 On 11/08/2012 02:02 AM, David Rientjes wrote:
 On Wed, 7 Nov 2012, Sha Zhengju wrote:
[..]
 +   else
 +   show_mem(SHOW_MEM_FILTER_NODES);
 Well that's disappointing if memcg == root_mem_cgroup, we'd probably like
 to know the global memory state to determine what the problem is.
 
 
  I am really wondering if there is any case that can pass
  root_mem_cgroup down here.

No it cannot, because the root cgroup doesn't have any limit so we
cannot trigger the memcg oom killer.

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 7/9] cgroup_freezer: introduce CGROUP_FREEZING_[SELF|PARENT]

2012-11-08 Thread Michal Hocko
On Sat 03-11-12 01:38:33, Tejun Heo wrote:
 Introduce FREEZING_SELF and FREEZING_PARENT and make FREEZING OR of
 the two flags.  This is to prepare for full hierarchy support.
 
 freezer_apply_state() is updated such that it can handle setting and
 clearing of both flags.  The two flags are also exposed to userland
 via read-only files self_freezing and parent_freezing.
 
 Other than the added cgroupfs files, this patch doesn't introduce any
 behavior change.

OK, I can imagine that this might be useful to find the first parent
group that needs to be thawed to unfreeze the group I am interested
in. Could you also update Documentation/cgroups/freezer-subsystem.txt to
clarify the intended usage?
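
(E.g. if a group shows state FROZEN with self_freezing 0 and
parent_freezing 1, the admin knows the freeze was inherited and has to
walk up the hierarchy until self_freezing reads 1 - that ancestor is the
one which has to be thawed.)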

Minor nit: same as mentioned for the previous patch, freezer_apply_state
should get an enum freezer_state_flags state parameter.

 Signed-off-by: Tejun Heo t...@kernel.org

Reviewed-by: Michal Hocko mho...@suse.cz

 ---
  kernel/cgroup_freezer.c | 55 
 ++---
  1 file changed, 47 insertions(+), 8 deletions(-)
 
 diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
 index e76aa9f..b8ad93c 100644
 --- a/kernel/cgroup_freezer.c
 +++ b/kernel/cgroup_freezer.c
 @@ -23,8 +23,12 @@
  #include <linux/seq_file.h>
 
  enum freezer_state_flags {
 -	CGROUP_FREEZING		= (1 << 1), /* this freezer is freezing */
 +	CGROUP_FREEZING_SELF	= (1 << 1), /* this freezer is freezing */
 +	CGROUP_FREEZING_PARENT	= (1 << 2), /* the parent freezer is freezing */
 	CGROUP_FROZEN		= (1 << 3), /* this and its descendants frozen */
 +
 +	/* mask for all FREEZING flags */
 +	CGROUP_FREEZING		= CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
  };
  
  struct freezer {
 @@ -245,8 +249,13 @@ static void unfreeze_cgroup(struct freezer *freezer)
   * freezer_apply_state - apply state change to a single cgroup_freezer
   * @freezer: freezer to apply state change to
   * @freeze: whether to freeze or unfreeze
 + * @state: CGROUP_FREEZING_* flag to set or clear
 + *
 + * Set or clear @state on @cgroup according to @freeze, and perform
 + * freezing or thawing as necessary.
   */
 -static void freezer_apply_state(struct freezer *freezer, bool freeze)
 +static void freezer_apply_state(struct freezer *freezer, bool freeze,
 + unsigned int state)
  {
   /* also synchronizes against task migration, see freezer_attach() */
   lockdep_assert_held(freezer-lock);
 @@ -254,13 +263,19 @@ static void freezer_apply_state(struct freezer 
 *freezer, bool freeze)
   if (freeze) {
   if (!(freezer-state  CGROUP_FREEZING))
   atomic_inc(system_freezing_cnt);
 - freezer-state |= CGROUP_FREEZING;
 + freezer-state |= state;
   freeze_cgroup(freezer);
   } else {
 - if (freezer-state  CGROUP_FREEZING)
 - atomic_dec(system_freezing_cnt);
 - freezer-state = ~(CGROUP_FREEZING | CGROUP_FROZEN);
 - unfreeze_cgroup(freezer);
 + bool was_freezing = freezer-state  CGROUP_FREEZING;
 +
 + freezer-state = ~state;
 +
 + if (!(freezer-state  CGROUP_FREEZING)) {
 + if (was_freezing)
 + atomic_dec(system_freezing_cnt);
 + freezer-state = ~CGROUP_FROZEN;
 + unfreeze_cgroup(freezer);
 + }
   }
  }
  
 @@ -275,7 +290,7 @@ static void freezer_change_state(struct freezer *freezer, 
 bool freeze)
  {
   /* update @freezer */
   spin_lock_irq(freezer-lock);
 - freezer_apply_state(freezer, freeze);
 + freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
   spin_unlock_irq(freezer-lock);
  }
  
 @@ -295,6 +310,20 @@ static int freezer_write(struct cgroup *cgroup, struct 
 cftype *cft,
   return 0;
  }
  
 +static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype 
 *cft)
 +{
 + struct freezer *freezer = cgroup_freezer(cgroup);
 +
 + return (bool)(freezer-state  CGROUP_FREEZING_SELF);
 +}
 +
 +static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype 
 *cft)
 +{
 + struct freezer *freezer = cgroup_freezer(cgroup);
 +
 + return (bool)(freezer-state  CGROUP_FREEZING_PARENT);
 +}
 +
  static struct cftype files[] = {
   {
   .name = state,
 @@ -302,6 +331,16 @@ static struct cftype files[] = {
   .read_seq_string = freezer_read,
   .write_string = freezer_write,
   },
 + {
 + .name = self_freezing,
 + .flags = CFTYPE_NOT_ON_ROOT,
 + .read_u64 = freezer_self_freezing_read,
 + },
 + {
 + .name = parent_freezing,
 + .flags = CFTYPE_NOT_ON_ROOT,
 + .read_u64 = freezer_parent_freezing_read,
 + },
   { } /* terminate */
  };
  
 -- 
 1.7.11.7
 

-- 
Michal Hocko
SUSE Labs

Re: [PATCH 8/9] cgroup_freezer: add -post_create() and -pre_destroy() and track online state

2012-11-08 Thread Michal Hocko
On Sat 03-11-12 01:38:34, Tejun Heo wrote:
 A cgroup is online and visible to iteration between ->post_create()
 and ->pre_destroy().  This patch introduces CGROUP_FREEZER_ONLINE and
 toggles it from the newly added freezer_post_create() and
 freezer_pre_destroy() while holding freezer->lock such that a
 cgroup_freezer can be reliably distinguished to be online.  This will
 be used by full hierarchy support.

I am wondering whether freezer_pre_destroy is really needed. Once we
reach pre_destroy there are no tasks nor any children in the group, so
there is nobody to wake up if the group was frozen, and the destroy
callback is called after synchronize_rcu, so the traversal should be
safe.

 ONLINE test is added to freezer_apply_state() but it currently doesn't
 make any difference as freezer_write() can only be called for an
 online cgroup.
 
 Adjusting system_freezing_cnt on destruction is moved from
 freezer_destroy() to the new freezer_pre_destroy() for consistency.
 
 This patch doesn't introduce any noticeable behavior change.
 
 Signed-off-by: Tejun Heo t...@kernel.org

Reviewed-by: Michal Hocko mho...@suse.cz

 ---
  kernel/cgroup_freezer.c | 42 ++++++++++++++++++++++++++++++++++++++++--
  1 file changed, 40 insertions(+), 2 deletions(-)
 
 diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
 index b8ad93c..4f12d31 100644
 --- a/kernel/cgroup_freezer.c
 +++ b/kernel/cgroup_freezer.c
 @@ -23,6 +23,7 @@
  #include <linux/seq_file.h>
 
  enum freezer_state_flags {
 +	CGROUP_FREEZER_ONLINE	= (1 << 0), /* freezer is fully online */
  	CGROUP_FREEZING_SELF	= (1 << 1), /* this freezer is freezing */
  	CGROUP_FREEZING_PARENT	= (1 << 2), /* the parent freezer is freezing */
  	CGROUP_FROZEN		= (1 << 3), /* this and its descendants frozen */
 @@ -98,13 +99,45 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
  	return &freezer->css;
  }
 
 -static void freezer_destroy(struct cgroup *cgroup)
 +/**
 + * freezer_post_create - commit creation of a freezer cgroup
 + * @cgroup: cgroup being created
 + *
 + * We're committing to creation of @cgroup.  Mark it online.
 + */
 +static void freezer_post_create(struct cgroup *cgroup)
  {
  	struct freezer *freezer = cgroup_freezer(cgroup);
 
 +	spin_lock_irq(&freezer->lock);
 +	freezer->state |= CGROUP_FREEZER_ONLINE;
 +	spin_unlock_irq(&freezer->lock);
 +}
 +
 +/**
 + * freezer_pre_destroy - initiate destruction of @cgroup
 + * @cgroup: cgroup being destroyed
 + *
 + * @cgroup is going away.  Mark it dead and decrement system_freezing_count
 + * if it was holding one.
 + */
 +static void freezer_pre_destroy(struct cgroup *cgroup)
 +{
 +	struct freezer *freezer = cgroup_freezer(cgroup);
 +
 +	spin_lock_irq(&freezer->lock);
 +
  	if (freezer->state & CGROUP_FREEZING)
  		atomic_dec(&system_freezing_cnt);
 -	kfree(freezer);
 +
 +	freezer->state = 0;
 +
 +	spin_unlock_irq(&freezer->lock);
 +}
 +
 +static void freezer_destroy(struct cgroup *cgroup)
 +{
 +	kfree(cgroup_freezer(cgroup));
  }
 
  /*
 @@ -260,6 +293,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
  	/* also synchronizes against task migration, see freezer_attach() */
  	lockdep_assert_held(&freezer->lock);
 
 +	if (!(freezer->state & CGROUP_FREEZER_ONLINE))
 +		return;
 +
  	if (freeze) {
  		if (!(freezer->state & CGROUP_FREEZING))
  			atomic_inc(&system_freezing_cnt);
 @@ -347,6 +383,8 @@ static struct cftype files[] = {
  struct cgroup_subsys freezer_subsys = {
  	.name		= "freezer",
  	.create		= freezer_create,
 +	.post_create	= freezer_post_create,
 +	.pre_destroy	= freezer_pre_destroy,
  	.destroy	= freezer_destroy,
  	.subsys_id	= freezer_subsys_id,
  	.attach		= freezer_attach,
 -- 
 1.7.11.7
 

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 9/9 v2] cgroup_freezer: implement proper hierarchy support

2012-11-08 Thread Michal Hocko
On Wed 07-11-12 08:39:19, Tejun Heo wrote:
[...]
 --- a/kernel/cgroup_freezer.c
 +++ b/kernel/cgroup_freezer.c
[...]
 @@ -320,14 +388,39 @@ static void freezer_apply_state(struct f
   * @freezer: freezer of interest
   * @freeze: whether to freeze or thaw
   *
 - * Freeze or thaw @cgroup according to @freeze.
 + * Freeze or thaw @freezer according to @freeze.  The operations are
 + * recursive - all descendants of @freezer will be affected.
   */
  static void freezer_change_state(struct freezer *freezer, bool freeze)
  {
 +	struct cgroup *pos;
 +
  	/* update @freezer */
  	spin_lock_irq(&freezer->lock);
  	freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
  	spin_unlock_irq(&freezer->lock);
 +
 +	/*
 +	 * Update all its descendants in pre-order traversal.  Each
 +	 * descendant will try to inherit its parent's FREEZING state as
 +	 * CGROUP_FREEZING_PARENT.
 +	 */
 +	rcu_read_lock();
 +	cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
 +		struct freezer *pos_f = cgroup_freezer(pos);
 +		struct freezer *parent = parent_freezer(pos_f);
 +
 +		/*
 +		 * Our update to @parent->state is already visible which is
 +		 * all we need.  No need to lock @parent.  For more info on
 +		 * synchronization, see freezer_post_create().
 +		 */
 +		spin_lock_irq(&pos_f->lock);
 +		freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
 +				    CGROUP_FREEZING_PARENT);
 +		spin_unlock_irq(&pos_f->lock);
 +	}
 +	rcu_read_unlock();
  }

This seems to be racy because parent->state access is not linearized.
Say we have parallel freezing and thawing on a tree like the following:
A
|
B
|
C

pre_order will visit them in B, C order.
CPU1					CPU2
freezer_apply_state(A, true)
A->state & FREEZING == true
					freezer_apply_state(A, false)
					A->state & FREEZING == false
					freezer_apply_state(B, false)
					B->state & FREEZING == false
freezer_apply_state(B, true)
B->state & FREEZING == true
freezer_apply_state(C, true)
					freezer_apply_state(C, false)

So A and C are thawed while B is frozen. Or am I missing something which
would prevent this kind of race?

The easy fix is to move spin_unlock_irq(&freezer->lock) after
rcu_read_unlock.
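
For illustration, a rough sketch of that fix (derived from the quoted
hunk; whether lockdep is happy with the nested freezer locks is an open
question):

	static void freezer_change_state(struct freezer *freezer, bool freeze)
	{
		struct cgroup *pos;

		/* hold the lock across the whole walk so two concurrent
		 * state changes cannot interleave their tree updates */
		spin_lock_irq(&freezer->lock);
		freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);

		rcu_read_lock();
		cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
			struct freezer *pos_f = cgroup_freezer(pos);
			struct freezer *parent = parent_freezer(pos_f);

			spin_lock(&pos_f->lock);	/* irqs already off */
			freezer_apply_state(pos_f,
					    parent->state & CGROUP_FREEZING,
					    CGROUP_FREEZING_PARENT);
			spin_unlock(&pos_f->lock);
		}
		rcu_read_unlock();
		spin_unlock_irq(&freezer->lock);
	}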
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v6 19/29] memcg: infrastructure to match an allocation to the right cache

2012-11-08 Thread Michal Hocko
On Thu 08-11-12 12:05:13, Michal Hocko wrote:
 On Tue 06-11-12 09:03:54, Michal Hocko wrote:
  On Mon 05-11-12 16:28:37, Andrew Morton wrote:
   On Thu,  1 Nov 2012 16:07:35 +0400
   Glauber Costa glom...@parallels.com wrote:
   
+static __always_inline struct kmem_cache *
+memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
   
   I still don't understand why this code uses __always_inline so much.
  
  AFAIU, __always_inline (resp. __attribute__((always_inline))) is the
  same thing as inline if optimizations are enabled
  (http://ohse.de/uwe/articles/gcc-attributes.html#func-always_inline).
 
 And this doesn't tell the whole story because there is -fearly-inlining
 which is enabled by default and makes a difference when optimizations
 are enabled, so __always_inline really enforces inlining.

and -fearly-inlining is another doc trap. I have tried -O2
-fno-early-inlining and the __always_inline code has been inlined with
gcc 4.3 and 4.7 while a simple inline is ignored, so it really seems
that __always_inline is always inlined; the man page is just a little
bit mean about telling us all the details.

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 6/9] cgroup_freezer: make freezer-state mask of flags

2012-11-08 Thread Michal Hocko
On Thu 08-11-12 06:39:52, Tejun Heo wrote:
 Hello, Michal.
 
 On Thu, Nov 08, 2012 at 11:39:28AM +0100, Michal Hocko wrote:
  On Sat 03-11-12 01:38:32, Tejun Heo wrote:
   freezer-state was an enum value - one of THAWED, FREEZING and FROZEN.
   As the scheduled full hierarchy support requires more than one
   freezing condition, switch it to mask of flags.  If FREEZING is not
   set, it's thawed.  FREEZING is set if freezing or frozen.  If frozen,
   both FREEZING and FROZEN are set.  Now that tasks can be attached to
   an already frozen cgroup, this also makes freezing condition checks
   more natural.
   
   This patch doesn't introduce any behavior change.
   
   Signed-off-by: Tejun Heo t...@kernel.org
  
  I think it would be nicer to use freezer_state_flags enum rather than
  unsigned int for the state. I would even expect gcc to complain about
  that but it looks like -fstrict-enums is C++ specific (so long, enum
  safety).
 
 But if you use it as flag bits, the resulting value is no longer
 inside the defined enum values.  Isn't that weird?

Ahh, there is always CGROUP_FREEZER_ONLINE which would overcomplicate
the possible and valid masks.
Please ignore this...

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 9/9 v2] cgroup_freezer: implement proper hierarchy support

2012-11-08 Thread Michal Hocko
On Thu 08-11-12 06:18:48, Tejun Heo wrote:
 Hello, Michal.
 
 On Thu, Nov 08, 2012 at 03:08:52PM +0100, Michal Hocko wrote:
  This seems to be racy because parent->state access is not linearized.
  Say we have parallel freezing and thawing on a tree like the following:
  A
  |
  B
  |
  C
  
  pre_order will visit them in B, C order.
  CPU1					CPU2
  freezer_apply_state(A, true)
  A->state & FREEZING == true
  					freezer_apply_state(A, false)
  					A->state & FREEZING == false
  					freezer_apply_state(B, false)
  					B->state & FREEZING == false
  freezer_apply_state(B, true)
  B->state & FREEZING == true
  freezer_apply_state(C, true)
  					freezer_apply_state(C, false)
  
  So A and C are thawed while B is frozen. Or am I missing something which
  would prevent this kind of race?
  So A, C are thawed while B is frozen. Or am I missing something which
  would prevent from this kind of race?
 
 The rule is that there will be at least one inheritance operation
 after a parent is updated.  The exact order of propagation doesn't
 matter as long as there's at least one inheritance event after the
 latest update to a parent.  This works because inheritance operations
 are atomic to each other.  If one inheritance operation saw an
 update to its parent, the next inheritance operation is guaranteed to
  see at least up to that update.
 
  So, in the above example in CPU2, (B->state & FREEZING) test and
 freezer_apply_state(C, false) can't be interleaved with the same
 inheritance operation from CPU1.  They either happen before or after.

I am not sure I understand what you mean by inheritance operation but
you are right that the race is not possible because spin_lock makes
sure that the Foo->state update is done after lock(Foo->child) and
spin_unlock then serves as a leave barrier so the other CPUs will see
the correctly updated value. The rest is handled by the fixed-order
tree walk. So there is really no interleaving going on here.

Sorry about the confusion!

Feel free to add my Reviewed-by.
-- 
Michal Hocko
SUSE Labs


Re: [patch 1/2] mm, oom: change type of oom_score_adj to short

2012-11-08 Thread Michal Hocko
On Thu 08-11-12 01:26:57, David Rientjes wrote:
 The maximum oom_score_adj is 1000 and the minimum oom_score_adj is -1000,
 so this range can be represented by the signed short type with no
 functional change.  The extra space this frees up in struct signal_struct
 will be used for per-thread oom kill flags in the next patch.
 
 Signed-off-by: David Rientjes rient...@google.com

Reviewed-by: Michal Hocko mho...@suse.cz

 ---
  drivers/staging/android/lowmemorykiller.c |   16 
  fs/proc/base.c|   10 +-
  include/linux/oom.h   |4 ++--
  include/linux/sched.h |6 +++---
  include/trace/events/oom.h|4 ++--
  include/trace/events/task.h   |8 
  mm/ksm.c  |2 +-
  mm/oom_kill.c |   10 +-
  mm/swapfile.c |2 +-
  9 files changed, 31 insertions(+), 31 deletions(-)
 
 diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
 --- a/drivers/staging/android/lowmemorykiller.c
 +++ b/drivers/staging/android/lowmemorykiller.c
 @@ -40,7 +40,7 @@
  #include <linux/notifier.h>
 
  static uint32_t lowmem_debug_level = 2;
 -static int lowmem_adj[6] = {
 +static short lowmem_adj[6] = {
  	0,
  	1,
  	6,
 @@ -70,9 +70,9 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
  	int rem = 0;
  	int tasksize;
  	int i;
 -	int min_score_adj = OOM_SCORE_ADJ_MAX + 1;
 +	short min_score_adj = OOM_SCORE_ADJ_MAX + 1;
  	int selected_tasksize = 0;
 -	int selected_oom_score_adj;
 +	short selected_oom_score_adj;
  	int array_size = ARRAY_SIZE(lowmem_adj);
  	int other_free = global_page_state(NR_FREE_PAGES);
  	int other_file = global_page_state(NR_FILE_PAGES) -
 @@ -90,7 +90,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
  		}
  	}
  	if (sc->nr_to_scan > 0)
 -		lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %d\n",
 +		lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %hd\n",
  				sc->nr_to_scan, sc->gfp_mask, other_free,
  				other_file, min_score_adj);
  	rem = global_page_state(NR_ACTIVE_ANON) +
 @@ -107,7 +107,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
  	rcu_read_lock();
  	for_each_process(tsk) {
  		struct task_struct *p;
 -		int oom_score_adj;
 +		short oom_score_adj;
 
  		if (tsk->flags & PF_KTHREAD)
  			continue;
 @@ -141,11 +141,11 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
  		selected = p;
  		selected_tasksize = tasksize;
  		selected_oom_score_adj = oom_score_adj;
 -		lowmem_print(2, "select %d (%s), adj %d, size %d, to kill\n",
 +		lowmem_print(2, "select %d (%s), adj %hd, size %d, to kill\n",
  			     p->pid, p->comm, oom_score_adj, tasksize);
  	}
  	if (selected) {
 -		lowmem_print(1, "send sigkill to %d (%s), adj %d, size %d\n",
 +		lowmem_print(1, "send sigkill to %d (%s), adj %hd, size %d\n",
  			     selected->pid, selected->comm,
  			     selected_oom_score_adj, selected_tasksize);
  		lowmem_deathpending_timeout = jiffies + HZ;
 @@ -176,7 +176,7 @@ static void __exit lowmem_exit(void)
  }
 
  module_param_named(cost, lowmem_shrinker.seeks, int, S_IRUGO | S_IWUSR);
 -module_param_array_named(adj, lowmem_adj, int, lowmem_adj_size,
 +module_param_array_named(adj, lowmem_adj, short, lowmem_adj_size,
  			 S_IRUGO | S_IWUSR);
  module_param_array_named(minfree, lowmem_minfree, uint, lowmem_minfree_size,
  			 S_IRUGO | S_IWUSR);
 diff --git a/fs/proc/base.c b/fs/proc/base.c
 --- a/fs/proc/base.c
 +++ b/fs/proc/base.c
 @@ -878,7 +878,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
  {
  	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
  	char buffer[PROC_NUMBUF];
 -	int oom_score_adj = OOM_SCORE_ADJ_MIN;
 +	short oom_score_adj = OOM_SCORE_ADJ_MIN;
  	unsigned long flags;
  	size_t len;
 
 @@ -889,7 +889,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
  		unlock_task_sighand(task, &flags);
  	}
  	put_task_struct(task);
 -	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
 +	len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
  	return simple_read_from_buffer(buf, count, ppos, buffer, len);
  }
 
 @@ -936,15 +936,15 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
  		goto err_task_lock

Re: [patch 2/2] mm, oom: fix race when specifying a thread as the oom origin

2012-11-08 Thread Michal Hocko
On Thu 08-11-12 01:27:00, David Rientjes wrote:
 test_set_oom_score_adj() and compare_swap_oom_score_adj() are used to
 specify that current should be killed first if an oom condition occurs in
 between the two calls.
 
 The usage is
 
   short oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
   ...
   compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
 
 to store the thread's oom_score_adj, temporarily change it to the maximum
 score possible, and then restore the old value if it is still the same.
 
 This happens to still be racy, however, if the user writes
 OOM_SCORE_ADJ_MAX to /proc/pid/oom_score_adj in between the two calls.
 The compare_swap_oom_score_adj() will then incorrectly reset the old
 value prior to the write of OOM_SCORE_ADJ_MAX.
 
 To fix this, introduce a new oom_flags_t member in struct signal_struct
 that will be used for per-thread oom killer flags.  KSM and swapoff can
 now use a bit in this member to specify that threads should be killed
 first in oom conditions without playing around with oom_score_adj.
 
 This also allows the correct oom_score_adj to always be shown when
 reading /proc/pid/oom_score.
 
 Signed-off-by: David Rientjes rient...@google.com

I didn't like the previous playing with the oom_score_adj and what you
propose looks much nicer.
Maybe s/oom_task_origin/task_oom_origin/ would be a better fit with
{set,clear}_current_oom_origin but I do not care much.
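
For reference, the pattern the new helpers enable (mirroring the ksm.c
hunk below) is simply:

	set_current_oom_origin();
	err = unmerge_and_remove_all_rmap_items(); /* work that may oom */
	clear_current_oom_origin();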

Reviewed-by: Michal Hocko mho...@suse.cz

Thanks!
 ---
  include/linux/oom.h   |   19 +++++++++++++++--
  include/linux/sched.h |    1 +
  include/linux/types.h |    1 +
  mm/ksm.c              |    7 ++-----
  mm/oom_kill.c         |   49 +++------------------------------------------
  mm/swapfile.c         |    5 ++---
  6 files changed, 30 insertions(+), 52 deletions(-)
 
 diff --git a/include/linux/oom.h b/include/linux/oom.h
 --- a/include/linux/oom.h
 +++ b/include/linux/oom.h
 @@ -29,8 +29,23 @@ enum oom_scan_t {
  	OOM_SCAN_SELECT,	/* always select this thread first */
  };
 
 -extern void compare_swap_oom_score_adj(short old_val, short new_val);
 -extern short test_set_oom_score_adj(short new_val);
 +/* Thread is the potential origin of an oom condition; kill first on oom */
 +#define OOM_FLAG_ORIGIN		((__force oom_flags_t)0x1)
 +
 +static inline void set_current_oom_origin(void)
 +{
 +	current->signal->oom_flags |= OOM_FLAG_ORIGIN;
 +}
 +
 +static inline void clear_current_oom_origin(void)
 +{
 +	current->signal->oom_flags &= ~OOM_FLAG_ORIGIN;
 +}
 +
 +static inline bool oom_task_origin(const struct task_struct *p)
 +{
 +	return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
 +}
 
  extern unsigned long oom_badness(struct task_struct *p,
  		struct mem_cgroup *memcg, const nodemask_t *nodemask,
 diff --git a/include/linux/sched.h b/include/linux/sched.h
 --- a/include/linux/sched.h
 +++ b/include/linux/sched.h
 @@ -631,6 +631,7 @@ struct signal_struct {
  	struct rw_semaphore group_rwsem;
  #endif
 
 +	oom_flags_t oom_flags;
  	short oom_score_adj;		/* OOM kill score adjustment */
  	short oom_score_adj_min;	/* OOM kill score adjustment min value.
  					 * Only settable by CAP_SYS_RESOURCE. */
 diff --git a/include/linux/types.h b/include/linux/types.h
 --- a/include/linux/types.h
 +++ b/include/linux/types.h
 @@ -156,6 +156,7 @@ typedef u32 dma_addr_t;
  #endif
  typedef unsigned __bitwise__ gfp_t;
  typedef unsigned __bitwise__ fmode_t;
 +typedef unsigned __bitwise__ oom_flags_t;
 
  #ifdef CONFIG_PHYS_ADDR_T_64BIT
  typedef u64 phys_addr_t;
 diff --git a/mm/ksm.c b/mm/ksm.c
 --- a/mm/ksm.c
 +++ b/mm/ksm.c
 @@ -1929,12 +1929,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
  	if (ksm_run != flags) {
  		ksm_run = flags;
  		if (flags & KSM_RUN_UNMERGE) {
 -			short oom_score_adj;
 -
 -			oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
 +			set_current_oom_origin();
  			err = unmerge_and_remove_all_rmap_items();
 -			compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX,
 -						   oom_score_adj);
 +			clear_current_oom_origin();
  			if (err) {
  				ksm_run = KSM_RUN_STOP;
  				count = err;
 diff --git a/mm/oom_kill.c b/mm/oom_kill.c
 --- a/mm/oom_kill.c
 +++ b/mm/oom_kill.c
 @@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task;
  int sysctl_oom_dump_tasks = 1;
  static DEFINE_SPINLOCK(zone_scan_lock);
 
 -/*
 - * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
 - * @old_val: old oom_score_adj for compare
 - * @new_val: new oom_score_adj for swap
 - *
 - * Sets the oom_score_adj value for current to @new_val iff its present value
Re: [PATCH 9/9 v2] cgroup_freezer: implement proper hierarchy support

2012-11-08 Thread Michal Hocko
On Thu 08-11-12 07:29:23, Tejun Heo wrote:
 Hey, Michal.
 
 On Thu, Nov 08, 2012 at 04:20:39PM +0100, Michal Hocko wrote:
   So, in the above example in CPU2, (B->state & FREEZING) test and
   freezer_apply_state(C, false) can't be interleaved with the same
   inheritance operation from CPU1.  They either happen before or after.
  
  I am not sure I understand what you mean by inheritance operation but
 
 The operation of looking at one's parent and inheriting the state.
 Here, checking parent->state and calling apply_state accordingly.
 
  you are right that the race is not possible because spin_lock makes
  sure that the Foo->state update is done after lock(Foo->child) and
  spin_unlock then serves as a leave barrier so the other CPUs will see
  the correctly updated value. The rest is handled by the fixed-order
  tree walk. So there is really no interleaving going on here.
 
 I'm worried that this is a bit too subtle.

Dunno, it looks obvious now. I just missed the entry/leave implicit
barriers of the spinlocks; again, sorry about the confusion.

 This will work fine with a single hierarchy mutex protecting hierarchy
 updates and state propagations through it and that should work for
 most controllers too.

But a single mutex is just ugly.

 I want to have at least one example of finer grained locking for
 future reference and cgroup_freezer happened to be the one I started
 working on.

I think this is a good example because it shows how to share the state
without too many implementation details.

 So, it is more complicated (not necessarily in written code but the
 sync rules) than necessary.  I'm still not sure whether to keep it or
 not.

I think the locking is fine and I would keep it this way rather than a
big lock.

 I'll add more documentation about synchronization in
 cgroup_for_each_descendant_pre() either way.

more doc cannot hurt ;)

Thanks
-- 
Michal Hocko
SUSE Labs


Re: [PATCH V3] memcg, oom: provide more precise dump info while memcg oom happening

2012-11-08 Thread Michal Hocko
On Thu 08-11-12 23:52:47, Sha Zhengju wrote:
[...]
 (2) After change
 [  269.225628] mal invoked oom-killer: gfp_mask=0xd0, order=0, oom_score_adj=0
 [  269.225633] mal cpuset=/ mems_allowed=0-1
 [  269.225636] Pid: 4616, comm: mal Not tainted 3.6.0+ #25
 [  269.225637] Call Trace:
 [  269.225647]  [8111b9c4] dump_header+0x84/0xd0
 [  269.225650]  [8111c691] oom_kill_process+0x331/0x350
 [  269.225710]  ...(call trace)
 [  269.225713]  [81517325] page_fault+0x25/0x30
 [  269.225716] Task in /1/2 killed as a result of limit of /1
 [  269.225718] memory: usage 511732kB, limit 512000kB, failcnt 5071
 [  269.225720] memory+swap: usage 563200kB, limit 563200kB, failcnt 57
 [  269.225721] kmem: usage 0kB, limit 9007199254740991kB, failcnt 0
 [  269.225722] Memory cgroup stats:cache:8KB rss:511724KB mapped_file:4KB 
 swap:51468KB inactive_anon:265864KB active_anon:245832KB inactive_file:0KB 
 active_file:0KB unevictable:0KB
 [  269.225741] [ pid ]   uid  tgid total_vm      rss nr_ptes swapents oom_score_adj name
 [  269.225757] [ 4554]     0  4554    16626      473      17       25             0 bash
 [  269.225759] [ 4611]     0  4611   103328    90231     208    12260             0 mal
 [  269.225762] [ 4616]     0  4616   103328    32799      88     7562             0 mal
 [  269.225764] Memory cgroup out of memory: Kill process 4611 (mal) score 699 
 or sacrifice child
 [  269.225766] Killed process 4611 (mal) total-vm:413312kB, 
 anon-rss:360632kB, file-rss:292kB
 
 This version provides more pointed info for memcg in the "Memory
 cgroup stats" section.

Looks much better!

 
 Change log:
 v3 --- v2
   1. fix towards hierarchy
   2. undo rework dump_tasks
 v2 --- v1
   1. some modification towards hierarchy
   2. rework dump_tasks
   3. rebased on Michal's mm tree since-3.6
 
 Signed-off-by: Sha Zhengju handai@taobao.com
 ---
  mm/memcontrol.c |   41 +++--
  mm/oom_kill.c   |6 --
  2 files changed, 35 insertions(+), 12 deletions(-)
 
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 0eab7d5..17317fa 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
[...]
 @@ -1501,8 +1509,8 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
  	spin_unlock_irqrestore(&memcg->move_lock, *flags);
  }
 
 +#define K(x) ((x) << (PAGE_SHIFT-10))
  /**
 - * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.

No need to remove this, just fix it:
 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.

   * @memcg: The memory cgroup that went over limit
   * @p: Task that is going to be killed
   *
 @@ -1520,8 +1528,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
  	 */
  	static char memcg_name[PATH_MAX];
  	int ret;
 +	struct mem_cgroup *mi;
 +	unsigned int i;
 
 -	if (!memcg || !p)
 +	if (!p)
  		return;
 
  	rcu_read_lock();
 @@ -1569,6 +1579,25 @@ done:
  		res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
  		res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
  		res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
 +
 +	printk(KERN_INFO "Memory cgroup stats:");

"Memory cgroup hierarchy stats" is probably a better fit with the
current implementation.

 +	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 +		long long val = 0;
 +		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 +			continue;
 +		for_each_mem_cgroup_tree(mi, memcg)
 +			val += mem_cgroup_read_stat(mi, i);
 +		printk(KERN_CONT "%s:%lldKB ", mem_cgroup_stat_names[i], K(val));
 +	}
 +
 +	for (i = 0; i < NR_LRU_LISTS; i++) {
 +		unsigned long long val = 0;
 +
 +		for_each_mem_cgroup_tree(mi, memcg)
 +			val += mem_cgroup_nr_lru_pages(mi, BIT(i));
 +		printk(KERN_CONT "%s:%lluKB ", mem_cgroup_lru_names[i], K(val));
 +	}
 +	printk(KERN_CONT "\n");

This is nice and simple, I am just thinking whether it is enough. Say
that you have a deeper hierarchy and there is a safety limit at its
root
A (limit)
   /|\
  B C D
  |\
  E F

and we trigger an OOM on the A's limit. Now we know that something blew
up but what it was we do not know. Wouldn't it be better to swap the for
and for_each_mem_cgroup_tree loops? Then we would see the whole
hierarchy and can potentially point at the group which doesn't behave.
Memory cgroup stats for A/: ...
Memory cgroup stats for A/B/: ...
Memory cgroup stats for A/C/: ...
Memory cgroup stats for A/D/: ...
Memory cgroup stats for A/D/E/: ...
Memory cgroup stats for A/D/F/: ...

Would it still fit in with your use case?
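
To make the suggestion concrete, a sketch of the swapped loops
(memcg_get_name() is a hypothetical path helper, not an existing
function; error handling is elided):

	struct mem_cgroup *mi;
	unsigned int i;

	for_each_mem_cgroup_tree(mi, memcg) {
		/* one line per group instead of one hierarchy-wide sum */
		printk(KERN_INFO "Memory cgroup stats for %s:",
		       memcg_get_name(mi));	/* hypothetical */
		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
				continue;
			printk(KERN_CONT " %s:%lldKB",
			       mem_cgroup_stat_names[i],
			       K((long long)mem_cgroup_read_stat(mi, i)));
		}
		for (i = 0; i < NR_LRU_LISTS; i++)
			printk(KERN_CONT " %s:%lluKB",
			       mem_cgroup_lru_names[i],
			       K(mem_cgroup_nr_lru_pages(mi, BIT(i))));
		printk(KERN_CONT "\n");
	}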
[...]
-- 
Michal Hocko
SUSE Labs

Re: [PATCH 9/9 v3] cgroup_freezer: implement proper hierarchy support

2012-11-08 Thread Michal Hocko
On Thu 08-11-12 09:57:50, Tejun Heo wrote:
 Up until now, cgroup_freezer didn't implement hierarchy properly.
 cgroups could be arranged in hierarchy but it didn't make any
 difference in how each cgroup_freezer behaved.  They all operated
 separately.
 
 This patch implements proper hierarchy support.  If a cgroup is
 frozen, all its descendants are frozen.  A cgroup is thawed iff it and
 all its ancestors are THAWED.  freezer.self_freezing shows the current
 freezing state for the cgroup itself.  freezer.parent_freezing shows
 whether the cgroup is freezing because any of its ancestors is
 freezing.
 
 freezer_post_create() locks the parent and new cgroup and inherits the
 parent's state and freezer_change_state() applies new state top-down
 using cgroup_for_each_descendant_pre() which guarantees that no child
 can escape its parent's state.  update_if_frozen() uses
 cgroup_for_each_descendant_post() to propagate frozen states
 bottom-up.
 
 Synchronization could be coarser and easier by using a single mutex to
 protect all hierarchy operations.  Finer grained approach was used
 because it wasn't too difficult for cgroup_freezer and I think it's
  beneficial to have an example implementation and cgroup_freezer is
  rather simple and can serve as a good one.
 
 As this makes cgroup_freezer properly hierarchical,
 freezer_subsys.broken_hierarchy marking is removed.
 
 Note that this patch changes userland visible behavior - freezing a
 cgroup now freezes all its descendants too.  This behavior change is
 intended and has been warned via .broken_hierarchy.
 
 v2: Michal spotted a bug in freezer_change_state() - descendants were
 inheriting from the wrong ancestor.  Fixed.
 
 v3: Documentation/cgroups/freezer-subsystem.txt updated.
 
 Signed-off-by: Tejun Heo t...@kernel.org
 Reviewed-by: Tejun Heo t...@kernel.org

You probably meant Reviewed-by: Michal Hocko  ;)
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 9/9 v3] cgroup_freezer: implement proper hierarchy support

2012-11-08 Thread Michal Hocko
On Thu 08-11-12 10:04:17, Tejun Heo wrote:
 On Thu, Nov 08, 2012 at 07:02:46PM +0100, Michal Hocko wrote:
  On Thu 08-11-12 09:57:50, Tejun Heo wrote:
   Signed-off-by: Tejun Heo t...@kernel.org
   Reviewed-by: Tejun Heo t...@kernel.org
  
  You probably meant Reviewed-by: Michal Hocko  ;)
 
  Hehehheheh... man, I'm too self-absorbed.  Thanks for noticing
  that. :)  Updated.

Heh, btw. the doc update looks good as well.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH] oom, memcg: handle sysctl oom_kill_allocating_task while memcg oom happening

2012-10-19 Thread Michal Hocko
On Fri 19-10-12 12:11:52, Sha Zhengju wrote:
 On 10/18/2012 11:32 PM, Michal Hocko wrote:
 On Thu 18-10-12 21:51:57, Sha Zhengju wrote:
 On 10/18/2012 07:56 PM, Michal Hocko wrote:
 On Wed 17-10-12 01:14:48, Sha Zhengju wrote:
 On Tuesday, October 16, 2012, Michal Hocko mho...@suse.cz wrote:
 [...]
 Could you be more specific about the motivation for this patch? Is it
 let's be consistent with the global oom or you have a real use case
 for this knob.
 
 In our environment (rhel6), we encounter a memcg oom 'deadlock'
 problem.  Simply speaking, suppose process A is selected to be killed
 by the memcg oom killer, but A is sleeping uninterruptibly on a page
 lock. What's worse, that page lock is held by another memcg
 process B which is trapped in mem_cgroup_oom_lock (proves to be a
 livelock).
 Hmm, this is strange. How can you get down that road with the page lock
 held? Is it possible this is related to the issue fixed by 1d65f86d
 (mm: preallocate page before lock_page() at filemap COW)?
 No, it has nothing to do with the cow page. By checking the stack of process A
 selected to be killed (uninterruptible sleeping), it was stuck at:
 __do_fault->filemap_fault->__lock_page_or_retry->wait_on_page_bit (D
 state).
 The process B holding that exact page lock is on the following path:
 __do_fault->filemap_fault->__do_page_cache_readahead->..->mpage_readpages
 ->add_to_page_cache_locked (in memcg oom and cannot exit)
 Hmm filemap_fault locks the page after the read ahead is triggered
 already so it doesn't call mpage_readpages with any page locked - the
 add_to_page_cache_lru is called without any page locked.

And I was probably blind yesterday because if I had looked inside
add_to_page_cache_lru I would have found out that we lock the page
before charging it. /me stupid. Sorry about the confusion.
That one is OK, though, because the page is brand new and not visible
when we charge it. This is not related to your problem, more on that
below.

 It's not the page being faulted in by filemap_fault that is causing the
 problem, but those pages handled by readahead. To clarify the point,
 the more detailed call stack is:
 filemap_fault->do_async/sync_mmap_readahead->ondemand_readahead->
 __do_page_cache_readahead->read_pages->ext3/4_readpages->*mpage_readpages*
 
 It is because mpage_readpages brings the risk:
 for each of the readahead pages:
 	(1) add_to_page_cache_lru (<-- *will lock the page and go through
 	    memcg charging*)
 	(2) add the page to a big bio
 and finally submit_bio (so those locked pages will be unlocked in
 end_bio once the read finishes)
 
 So if a page is being charged and cannot exit from memcg oom
 successfully (following I'll explain the reason) in step (1), it will
 cause the submit_bio to be indefinitely postponed while holding the
 PageLock of the previous pages.

OK I think I am seeing what you are trying to say, finally. But you
are wrong here. Previously locked & charged pages were already submitted
(every do_mpage_readpage submits the given page) so the IO will finish
eventually and those pages get unlocked.
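
To spell that out, the flow is roughly the following (a simplified
sketch of mpage_readpages(), not the exact code):

	struct bio *bio = NULL;

	list_for_each_entry_safe(page, tmp, pages, lru) {
		if (!add_to_page_cache_lru(page, mapping, page->index,
					   GFP_KERNEL))
			/* page is locked (and memcg charged) from here on;
			 * it is added to the shared bio, which is submitted
			 * once it cannot grow any further */
			bio = do_mpage_readpage(bio, page, ...);
		page_cache_release(page);
	}
	if (bio)
		mpage_bio_submit(READ, bio);	/* end_io unlocks the pages */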
 
[...]
 As I said, 37b23e05 has made the pagefault killable by changing
 uninterruptible sleeping to killable sleeping. So A can be woken up to
 exit successfully and free the memory, which can in turn help B pass
 the memcg charging period.
 
 (By the way, it seems commit 37b23e05 and 7d9fdac need to be
79dfdaccd1d5 you mean, right? That one just helps when there are too
many tasks thrashing the oom killer so it is not related to what you are
trying to achieve. Besides that, make sure you take 23751be0 if you
take it.
 
 
 Here is the reason why I said a process may go through memcg oom and cannot
 exit. It's just the phenomenon described in the commit log of 79dfdaccd:
 the old version of the memcg oom lock can lead to serious starvation and make
 many tasks thrash the oom killer while nothing useful gets done.

Yes, the thrashing on oom is certainly possible without that patch and it
seems that this is the culprit of the problem you are describing.

[...]
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 4/6] cgroups: forbid pre_destroy callback to fail

2012-10-19 Thread Michal Hocko
On Fri 19-10-12 17:33:18, Li Zefan wrote:
 On 2012/10/17 21:30, Michal Hocko wrote:
  Now that mem_cgroup_pre_destroy callback doesn't fail finally we can
  safely move on and forbid all the callbacks to fail. The last missing
  piece is moving cgroup_call_pre_destroy after cgroup_clear_css_refs so
  that css_tryget fails so no new charges for the memcg can happen.
 
  The callbacks are also called from within cgroup_lock to guarantee that
  no new tasks show up. 
 
 I'm afraid this won't work. See commit 
 3fa59dfbc3b223f02c26593be69ce6fc9a940405
 (cgroup: fix potential deadlock in pre_destroy)

Very good point. Thanks for pointing this out. So we should call
pre_destroy at the very end? What about the following?
Or should we rather drop the lock after check_for_release(parent), or
sooner, right after CGRP_REMOVED is set?
---
From 70ea8718aba1c1784b94bfb26aa2307195c07c0b Mon Sep 17 00:00:00 2001
From: Michal Hocko mho...@suse.cz
Date: Wed, 17 Oct 2012 13:42:06 +0200
Subject: [PATCH] cgroups: forbid pre_destroy callback to fail

Now that mem_cgroup_pre_destroy callback doesn't fail finally we can
safely move on and forbid all the callbacks to fail. The last missing
piece is moving cgroup_call_pre_destroy after cgroup_clear_css_refs so
that css_tryget fails so no new charges for the memcg can happen.
We cannot, however, move cgroup_call_pre_destroy right after because we
cannot call mem_cgroup_pre_destroy with the cgroup_lock held (see
3fa59dfb cgroup: fix potential deadlock in pre_destroy) so we have to
move it after the lock is released.

Changes since v1
- Li Zefan pointed out that mem_cgroup_pre_destroy cannot be called with
  cgroup_lock held

Signed-off-by: Michal Hocko mho...@suse.cz
---
 kernel/cgroup.c |   30 +-
 1 file changed, 9 insertions(+), 21 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b7d9606..4c6adbd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -855,7 +855,7 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
  * Call subsys's pre_destroy handler.
  * This is called before css refcnt check.
  */
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
+static void cgroup_call_pre_destroy(struct cgroup *cgrp)
 {
 	struct cgroup_subsys *ss;
 	int ret = 0;
@@ -864,15 +864,8 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 		if (!ss->pre_destroy)
 			continue;
 
-		ret = ss->pre_destroy(cgrp);
-		if (ret) {
-			/* ->pre_destroy() failure is being deprecated */
-			WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
-			break;
-		}
+		BUG_ON(ss->pre_destroy(cgrp));
 	}
-
-	return ret;
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -4161,7 +4154,6 @@ again:
 		mutex_unlock(&cgroup_mutex);
 		return -EBUSY;
 	}
-	mutex_unlock(&cgroup_mutex);
 
 	/*
 	 * In general, subsystem has no css->refcnt after pre_destroy(). But
@@ -4174,17 +4166,6 @@ again:
 	 */
 	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 
-	/*
-	 * Call pre_destroy handlers of subsys. Notify subsystems
-	 * that rmdir() request comes.
-	 */
-	ret = cgroup_call_pre_destroy(cgrp);
-	if (ret) {
-		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-		return ret;
-	}
-
-	mutex_lock(&cgroup_mutex);
 	parent = cgrp->parent;
 	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
 		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
@@ -4206,6 +4187,7 @@ again:
 			return -EINTR;
 		goto again;
 	}
+
 	/* NO css_tryget() can success after here. */
 	finish_wait(&cgroup_rmdir_waitq, &wait);
 	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
@@ -4244,6 +4226,12 @@ again:
 	spin_unlock(&cgrp->event_list_lock);
 
 	mutex_unlock(&cgroup_mutex);
+
+	/*
+	 * Call pre_destroy handlers of subsys. Notify subsystems
+	 * that rmdir() request comes.
+	 */
+	cgroup_call_pre_destroy(cgrp);
 	return 0;
 }
 
-- 
1.7.10.4


-- 
Michal Hocko
SUSE Labs


Re: [PATCH 3/6] memcg: Simplify mem_cgroup_force_empty_list error handling

2012-10-19 Thread Michal Hocko
On Thu 18-10-12 15:16:54, Tejun Heo wrote:
 Hello, Michal.
 
 On Wed, Oct 17, 2012 at 03:30:45PM +0200, Michal Hocko wrote:
  mem_cgroup_force_empty_list currently tries to remove all pages from
  the given LRU. To prevent from temporary failures (EBUSY returned by
  mem_cgroup_move_parent) it uses a margin to the current LRU pages and
  returns true if there are still some pages left on the list.
  
  If we consider that mem_cgroup_move_parent fails only when we are racing
  with somebody else removing the page (resp. uncharging it) or when the
  page is migrated then it is obvious that all those failures are only
  temporary and so we can safely retry later.
  Let's get rid of the safety margin and make the loop really wait for the
  empty LRU. The caller should still make sure that all charges have been
  removed from the res_counter because mem_cgroup_replace_page_cache might
  add a page to the LRU after the check (it doesn't touch res_counter
  though).
  This catches most of the cases except for shmem which might call
  mem_cgroup_replace_page_cache with a page which is not charged and on
  the LRU yet but this was the case also without this patch. In order to
  fix this we need a guarantee that try_get_mem_cgroup_from_page falls
  back to the current mm's cgroup so it needs css_tryget to fail. This
  will be fixed up in a later patch because it needs help from the cgroup
  core.
  
  Signed-off-by: Michal Hocko mho...@suse.cz
 
 In the sense that I looked at it and nothing seemed too scary.
 
  Reviewed-by: Tejun Heo t...@kernel.org

Thanks

 
 Some nitpicks below.
 
   /*
  - * move charges to its parent.
  + * move charges to its parent or the root cgroup if the group
  + * has no parent (aka use_hierarchy==0).
  + * Although this might fail the failure is always temporary and it
  + * signals a race with a page removal/uncharge or migration. In the
  + * first case the page will vanish from the LRU on the next attempt
  + * and the call should be retried later.
*/
  -
 
 Maybe convert to proper /** function comment while at it?  

these are internal functions and we usually do not create kerneldoc for
them. But I can surely change it - it would deserve a bigger clean up
then.

 I also think it would be helpful to actually comment on each possible
 failure case explaining why the failure condition is temporal.

What about:

 * Although this might fail (get_page_unless_zero, isolate_lru_page or
 * mem_cgroup_move_account fails) the failure is always temporary and
 * it signals a race with a page removal/uncharge or migration. In the
 * first case the page is on the way out and it will vanish from the LRU
 * on the next attempt and the call should be retried later.
 * Isolation from the LRU fails only if page has been isolated from
 * the LRU since we looked at it and that usually means either global
 * reclaim or migration going on. The page will either get back to the
 * LRU or vanish.
 * Finally mem_cgroup_move_account fails only if the page got uncharged
 * (!PageCgroupUsed) or moved to a different group. The page will
 * disappear in the next attempt.


Better? Or should it rather be in the changelog?

 
   /*
    * Traverse a specified page_cgroup list and try to drop them all.  This doesn't
  - * reclaim the pages themselves - it just removes the page_cgroups.
  - * Returns true if some page_cgroups were not freed, indicating that the caller
  - * must retry this operation.
  + * reclaim the pages themselves - pages are moved to the parent (or root)
  + * group.
    */
 
 Ditto.
 
  -static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
  +static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
  				int node, int zid, enum lru_list lru)
   {
  	struct mem_cgroup_per_zone *mz;
  -	unsigned long flags, loop;
  +	unsigned long flags;
  	struct list_head *list;
  	struct page *busy;
  	struct zone *zone;
  @@ -3696,11 +3701,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
  	mz = mem_cgroup_zoneinfo(memcg, node, zid);
  	list = &mz->lruvec.lists[lru];
 
  -	loop = mz->lru_size[lru];
  -	/* give some margin against EBUSY etc...*/
  -	loop += 256;
  	busy = NULL;
  -	while (loop--) {
  +	do {
  		struct page_cgroup *pc;
  		struct page *page;
 
  @@ -3726,8 +3728,7 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
  		cond_resched();
  	} else
  		busy = NULL;
  -	}
  -	return !list_empty(list);
  +	} while (!list_empty(list));
   }
 
 Is there anything which can keep failing until migration to another
 cgroup is complete?  

This is not about migration to another cgroup. Remember there are no
tasks in the group so we have no origin for the migration. I was talking
about migrate_pages.

 I think there is, e.g., if mmap_sem is busy or memcg is co-mounted
 with other

Re: [PATCH 4/6] cgroups: forbid pre_destroy callback to fail

2012-10-19 Thread Michal Hocko
On Thu 18-10-12 15:41:48, Tejun Heo wrote:
 Hello, Michal.
 
 On Wed, Oct 17, 2012 at 03:30:46PM +0200, Michal Hocko wrote:
  Now that mem_cgroup_pre_destroy callback doesn't fail finally we can
  safely move on and forbit all the callbacks to fail. The last missing
  piece is moving cgroup_call_pre_destroy after cgroup_clear_css_refs so
  that css_tryget fails so no new charges for the memcg can happen.
  The callbacks are also called from within cgroup_lock to guarantee that
  no new tasks show up. We could theoretically call them outside of the
  lock but then we have to move after CGRP_REMOVED flag is set.
  
  Signed-off-by: Michal Hocko mho...@suse.cz
 
 So, the plan is to do something like the following once memcg is
 ready.
 
   http://thread.gmane.org/gmane.linux.kernel.containers/22559/focus=75251
 
 Note that the patch is broken in a couple places but it does show the
 general direction.  I'd prefer if patch #3 simply makes pre_destroy()
 return 0 and drop __DEPRECATED_clear_css_refs from mem_cgroup_subsys.

We can still fail in #3 without this patch because there is no
guarantee that no new task gets attached to the group. And I wanted to keep
the memcg and generic cgroup parts separated.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 4/6] cgroups: forbid pre_destroy callback to fail

2012-10-19 Thread Michal Hocko
On Thu 18-10-12 15:46:06, Tejun Heo wrote:
 On Thu, Oct 18, 2012 at 03:41:48PM -0700, Tejun Heo wrote:
  Note that the patch is broken in a couple places but it does show the
  general direction.  I'd prefer if patch #3 simply makes pre_destroy()
  return 0 and drop __DEPRECATED_clear_css_refs from mem_cgroup_subsys.
  Then, I can pull the branch in and drop all the unnecessary cruft.
 
 But you need the locking change for further memcg cleanup.  To avoid
 interlocked pulls from both sides, I think it's okay to push this one
 with the rest of memcg changes.  I can do the cleanup on top of this
 whole series, but please do drop .__DEPRECATED_clear_css_refs from
 memcg.

OK I will drop that one.

  Acked-by: Tejun Heo t...@kernel.org

Do you still agree with the v2 based on Li's feedback?

Thanks
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 5/6] memcg: make mem_cgroup_reparent_charges non failing

2012-10-19 Thread Michal Hocko
This is an updated version of the patch. I have dropped
.__DEPRECATED_clear_css_refs in this one as it makes the best sense to
me. I didn't add Tejun's Reviewed-by because of this change. Could you
recheck, please?
---
From 6c1f2e76e254e7638ad8cc87f319e3492ac80c5b Mon Sep 17 00:00:00 2001
From: Michal Hocko mho...@suse.cz
Date: Wed, 17 Oct 2012 14:15:09 +0200
Subject: [PATCH] memcg: make mem_cgroup_reparent_charges non failing

Now that pre_destroy callbacks are called from within cgroup_lock and
the cgroup has been checked to be empty without any children then there
is no other way to fail.
mem_cgroup_pre_destroy doesn't have to take a reference to memcg's css
because all css' are marked dead already.

mem_cgroup_subsys.__DEPRECATED_clear_css_refs can be dropped as
mem_cgroup_pre_destroy cannot fail now.

Changes since v1
- drop __DEPRECATED_clear_css_refs

Signed-off-by: Michal Hocko mho...@suse.cz
---
 mm/memcontrol.c |   19 ++-
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f57ba4c..b4d854e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3738,14 +3738,12 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
  *
  * Caller is responsible for holding css reference on the memcg.
  */
-static int mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
+static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
 {
 	struct cgroup *cgrp = memcg->css.cgroup;
 	int node, zid;
 
 	do {
-		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
-			return -EBUSY;
 		/* This is for making all *used* pages to be on LRU. */
 		lru_add_drain_all();
 		drain_all_stock_sync(memcg);
@@ -3771,8 +3769,6 @@ static int mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
 		 * charge before adding to the LRU.
 		 */
 	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0);
-
-	return 0;
 }
 
 /*
@@ -3809,7 +3805,9 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 
 	}
 	lru_add_drain();
-	return mem_cgroup_reparent_charges(memcg);
+	mem_cgroup_reparent_charges(memcg);
+
+	return 0;
 }
 
 static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
@@ -5013,13 +5011,9 @@ free_out:
 static int mem_cgroup_pre_destroy(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
-	int ret;
 
-	css_get(&memcg->css);
-	ret = mem_cgroup_reparent_charges(memcg);
-	css_put(&memcg->css);
-
-	return ret;
+	mem_cgroup_reparent_charges(memcg);
+	return 0;
 }
 
 static void mem_cgroup_destroy(struct cgroup *cont)
@@ -5621,7 +5615,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
 	.base_cftypes = mem_cgroup_files,
 	.early_init = 0,
 	.use_id = 1,
-	.__DEPRECATED_clear_css_refs = true,
 };
 
 #ifdef CONFIG_MEMCG_SWAP
-- 
1.7.10.4

-- 
Michal Hocko
SUSE Labs


Re: process hangs on do_exit when oom happens

2012-10-19 Thread Michal Hocko
On Wed 17-10-12 18:23:34, gaoqiang wrote:
 I found nothing useful with google, so I'm here for help.
 
 When this happens: I use memcg to limit the memory use of a
 process, and when the memcg cgroup was out of memory,
 the process was oom-killed; however, it cannot really complete the
 exit. Here is some information:

How many tasks are in the group and what kind of memory do they use?
Is it possible that you were hit by the same issue as described in
79dfdacc ("memcg: make oom_lock 0 and 1 based rather than counter")?

 OS version: centos 6.2, kernel 2.6.32-220.7.1

Your kernel is quite old and you should probably be asking your
distribution to help you out. There have been many fixes since 2.6.32.
Are you able to reproduce the same issue with the current vanilla kernel?

 /proc/pid/stack
 ---
 
 [810597ca] __cond_resched+0x2a/0x40
 [81121569] unmap_vmas+0xb49/0xb70
 [8112822e] exit_mmap+0x7e/0x140
 [8105b078] mmput+0x58/0x110
 [81061aad] exit_mm+0x11d/0x160
 [81061c9d] do_exit+0x1ad/0x860
 [81062391] do_group_exit+0x41/0xb0
 [81077cd8] get_signal_to_deliver+0x1e8/0x430
 [8100a4c4] do_notify_resume+0xf4/0x8b0
 [8100b281] int_signal+0x12/0x17
 [] 0x

This looks strange because this is just the exit path, which shouldn't
deadlock or anything. Is this stack stable? Have you tried to check it
several times?

-- 
Michal Hocko
SUSE Labs


Re: [PATCH 4/6] cgroups: forbid pre_destroy callback to fail

2012-10-22 Thread Michal Hocko
On Fri 19-10-12 13:24:05, Tejun Heo wrote:
 Hello, Michal.
 
 On Fri, Oct 19, 2012 at 03:32:45PM +0200, Michal Hocko wrote:
  On Thu 18-10-12 15:41:48, Tejun Heo wrote:
   Hello, Michal.
   
   On Wed, Oct 17, 2012 at 03:30:46PM +0200, Michal Hocko wrote:
Now that mem_cgroup_pre_destroy callback doesn't fail finally we can
safely move on and forbid all the callbacks to fail. The last missing
piece is moving cgroup_call_pre_destroy after cgroup_clear_css_refs so
that css_tryget fails so no new charges for the memcg can happen.
The callbacks are also called from within cgroup_lock to guarantee that
no new tasks show up. We could theoretically call them outside of the
lock but then we have to move after CGRP_REMOVED flag is set.

Signed-off-by: Michal Hocko mho...@suse.cz
   
   So, the plan is to do something like the following once memcg is
   ready.
   
 http://thread.gmane.org/gmane.linux.kernel.containers/22559/focus=75251
   
   Note that the patch is broken in a couple places but it does show the
   general direction.  I'd prefer if patch #3 simply makes pre_destroy()
   return 0 and drop __DEPRECATED_clear_css_refs from mem_cgroup_subsys.
  
  We can still fail in #3 without this patch because there is no
  guarantee that no new task gets attached to the group. And I wanted to keep
  the memcg and generic cgroup parts separated.
 
 Yes, but all other controllers are broken that way too

It's just hugetlb and memcg that have pre_destroy.

 and the worst thing which will happen is triggering WARN_ON_ONCE().

The patch does BUG_ON(ss->pre_destroy(cgrp)). I am not sure WARN_ON_ONCE is
appropriate here because we would like to have at least a per-controller
warning. I do not see any reason to make this more complicated, but I am
open to suggestions.

 Let's note the failure in the commit and remove
 __DEPRECATED_clear_css_refs in the previous patch.  Then, I can pull
 from you, clean up the pre_destroy mess and then you can pull back for
 further cleanups.

Well this will get complicated as there are dependencies between the
memcg parts (based on Andrew's tree) and your tree. My tree is not
pullable as all the patches go via Andrew. I am not sure how to get out
of this. There is only one cgroup patch, so what about pushing all of
this via Andrew and doing the follow-up cleanups once they get merged?
We are not in a hurry, are we?

Anyway, does it really make sense to drop __DEPRECATED_clear_css_refs
already in the previous patch when it is _not_ guaranteed that
pre_destroy succeeds?
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v5 06/14] memcg: kmem controller infrastructure

2012-10-22 Thread Michal Hocko
[Sorry for the late reply]

On Mon 22-10-12 16:34:15, Glauber Costa wrote:
 On 10/20/2012 12:34 AM, David Rientjes wrote:
  On Fri, 19 Oct 2012, Glauber Costa wrote:
  
  What about gfp & __GFP_FS?
 
 
  Do you intend to prevent or allow OOM under that flag? I personally
  think that anything that accepts to be OOM-killed should have GFP_WAIT
  set, so that ought to be enough.
 
 
  The oom killer in the page allocator cannot trigger without __GFP_FS 
  because direct reclaim has little chance of being very successful and 
  thus we end up needlessly killing processes, and that tends to happen 
  quite a bit if we don't check for it.  Seems like this would also happen 
  with memcg if mem_cgroup_reclaim() has a large probability of failing?
 
 
  I can indeed see tests for GFP_FS in some key locations in mm/ before
  calling the OOM Killer.
 
  Should I test for GFP_IO as well?
  
  It's not really necessary; if __GFP_IO isn't set then it wouldn't make
  sense for __GFP_FS to be set.
  
  If the idea is preventing OOM from triggering for allocations that can
  write their pages back, how would you feel about the following test:
 
  may_oom = (gfp & GFP_KERNEL) && !(gfp & __GFP_NORETRY) ?
 
  
  I would simply copy the logic from the page allocator and only trigger oom 
  for __GFP_FS and !__GFP_NORETRY.
  
 
 That seems reasonable to me. Michal ?

Yes it makes sense to be consistent with the global case. While we are
at it, do we need to consider PF_DUMPCORE resp. !__GFP_NOFAIL?
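
To make the rule being converged on concrete, here is a minimal sketch
(the helper name is made up and the memcg charge path may wire it up
differently; it only mirrors the page allocator policy discussed above):

	/* sketch: OOM only for allocations which may trigger fs writeback
	 * and which have not opted out of retrying hard */
	static bool may_oom(gfp_t gfp)
	{
		if (!(gfp & __GFP_FS))
			return false;
		if (gfp & __GFP_NORETRY)
			return false;
		return true;
	}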
-- 
Michal Hocko
SUSE Labs


Re: process hangs on do_exit when oom happens

2012-10-22 Thread Michal Hocko
On Mon 22-10-12 10:16:43, Qiang Gao wrote:
 I don't know whether the process will exit finally, but this stack
 lasts for hours, which is obviously abnormal.
 The situation: we use a command called cglimit to fork-and-exec the
 worker process, and cglimit will set some limits on the worker via
 cgroups. For now, we limit the memory, and we also use the cpu cgroup,
 but with no limitation, so when the worker is running the cgroup
 directories look like the following:
 
 /cgroup/memory/worker : this directory limits the memory
 /cgroup/cpu/worker : with no limit, but the worker process is in it.
 
 for some reason (some other process we didn't consider), the worker
 process invoked the global oom-killer,

Are you sure that this is really a global oom? What was the limit for the
group?

 not the cgroup oom-killer. Then the worker process hangs there.
 
 Actually, if we didn't set the worker process into the cpu cgroup, this
 would never happen.

Strange, and it smells like a misconfiguration. Could you provide the
complete settings for both controllers?
grep . -r /cgroup/

 On Sat, Oct 20, 2012 at 12:04 AM, Michal Hocko mho...@suse.cz wrote:
 
  On Wed 17-10-12 18:23:34, gaoqiang wrote:
   I looked up nothing useful with google, so I'm here for help..
  
   when this happens: I use memcg to limit the memory use of a
   process, and when the memcg cgroup was out of memory,
   the process was oom-killed; however, it cannot really complete the
   exiting. Here is some information
 
  How many tasks are in the group and what kind of memory do they use?
  Is it possible that you were hit by the same issue as described in
  79dfdacc memcg: make oom_lock 0 and 1 based rather than counter.
 
   OS version:  centos6.22.6.32.220.7.1
 
  Your kernel is quite old and you should probably be asking your
  distribution to help you out. There have been many fixes since 2.6.32.
  Are you able to reproduce the same issue with a current vanilla kernel?
 
   /proc/pid/stack
   ---
  
   [810597ca] __cond_resched+0x2a/0x40
   [81121569] unmap_vmas+0xb49/0xb70
   [8112822e] exit_mmap+0x7e/0x140
   [8105b078] mmput+0x58/0x110
   [81061aad] exit_mm+0x11d/0x160
   [81061c9d] do_exit+0x1ad/0x860
   [81062391] do_group_exit+0x41/0xb0
   [81077cd8] get_signal_to_deliver+0x1e8/0x430
   [8100a4c4] do_notify_resume+0xf4/0x8b0
   [8100b281] int_signal+0x12/0x17
   [] 0x
 
  This looks strange because this is just the exit path, which shouldn't
  deadlock or anything. Is this stack stable? Have you tried to check it
  a few more times?
 
  --
  Michal Hocko
  SUSE Labs
 

-- 
Michal Hocko
SUSE Labs


Re: process hangs on do_exit when oom happens

2012-10-23 Thread Michal Hocko
On Tue 23-10-12 11:35:52, Qiang Gao wrote:
 I'm sure this is a global oom, not a cgroup oom. [the dmesg output is at the end]

Yes this is the global oom killer because:
 cglimit -M 700M ./tt 
 then after the global oom, the process hangs..

 179184 pages RAM

So you have ~700M of RAM (179184 pages * 4kB comes to roughly 700MB), so
the memcg limit is basically pointless as it cannot be reached...
-- 
Michal Hocko
SUSE Labs


[PATCH] mm: make zone_pcp_reset independent of MEMORY_HOTREMOVE

2012-10-23 Thread Michal Hocko
340175b7 (mm/hotplug: free zone->pageset when a zone becomes empty)
introduced zone_pcp_reset and hid it inside CONFIG_MEMORY_HOTREMOVE.
Since 506e5fb7 (memory-hotplug: allocate zone's pcp before onlining
pages) the function is also called from online_pages, which is called
outside CONFIG_MEMORY_HOTREMOVE, and that causes a linkage error.

The function, although not used outside of MEMORY_{HOTPLUG,HOTREMOVE},
seems universal enough, so let's keep it at its current location
and only remove the HOTREMOVE guard.

Signed-off-by: Michal Hocko mho...@suse.cz
Cc: David Rientjes rient...@google.com
Cc: Jiang Liu liu...@gmail.com
Cc: Len Brown len.br...@intel.com
Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
Cc: Paul Mackerras pau...@samba.org
Cc: Christoph Lameter c...@linux.com
Cc: Minchan Kim minchan@gmail.com
Cc: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com
Cc: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
Cc: Dave Hansen d...@linux.vnet.ibm.com
Cc: Mel Gorman m...@csn.ul.ie
---
 mm/page_alloc.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e29912e..30e359c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5981,7 +5981,6 @@ void __meminit zone_pcp_update(struct zone *zone)
 }
 #endif
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void zone_pcp_reset(struct zone *zone)
 {
unsigned long flags;
@@ -6001,6 +6000,7 @@ void zone_pcp_reset(struct zone *zone)
local_irq_restore(flags);
 }
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
 /*
  * All pages in the range must be isolated before calling this.
  */
-- 
1.7.10.4



Re: process hangs on do_exit when oom happens

2012-10-23 Thread Michal Hocko
On Tue 23-10-12 17:08:40, Qiang Gao wrote:
 this is just an example to show how to reproduce it. Actually, the first
 time I saw this situation was on a machine with 288G RAM with many tasks
 running, where we limit each to 30G. But finally, no one exceeded this
 limit and yet the system hit the global oom.

Yes, but mentioning the memory controller then might be misleading... It
seems that the only relevant factor in your load is the cpu controller.

And please stop top-posting. It makes the discussion messy.
-- 
Michal Hocko
SUSE Labs


Re: process hangs on do_exit when oom happens

2012-10-23 Thread Michal Hocko
On Tue 23-10-12 15:18:48, Qiang Gao wrote:
 This process was moved to the RT-priority queue when the global
 oom-killer happened, to boost the recovery of the system.

Who did that? The oom killer doesn't boost the priority (scheduling
class) AFAIK.

 but it wasn't properly dealt with. I still have no idea where the
 problem is..

Well, your configuration says that there is no runtime reserved for the
group (cpu.rt_runtime_us is 0), so a realtime task in it gets throttled
immediately and never runs.
Please refer to Documentation/scheduler/sched-rt-group.txt for more
information.
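
For example (an illustrative value only, assuming the v1 cpu hierarchy
is mounted at /cgroup as in your setup), reserving 0.95s of each 1s
period for the group would be:

	echo 950000 > /cgroup/cpu/worker/cpu.rt_runtime_us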

 On Tue, Oct 23, 2012 at 12:40 PM, Balbir Singh bsinghar...@gmail.com wrote:
  On Tue, Oct 23, 2012 at 9:05 AM, Qiang Gao gaoqiangs...@gmail.com wrote:
  information about the system is in the attached file information.txt
 
  I cannot reproduce it in the upstream 3.6.0 kernel..
 
  On Sat, Oct 20, 2012 at 12:04 AM, Michal Hocko mho...@suse.cz wrote:
  On Wed 17-10-12 18:23:34, gaoqiang wrote:
  I looked up nothing useful with google, so I'm here for help..
 
  when this happens: I use memcg to limit the memory use of a
  process, and when the memcg cgroup was out of memory,
  the process was oom-killed; however, it cannot really complete the
  exiting. Here is some information
 
  How many tasks are in the group and what kind of memory do they use?
  Is it possible that you were hit by the same issue as described in
  79dfdacc memcg: make oom_lock 0 and 1 based rather than counter.
 
  OS version:  centos6.22.6.32.220.7.1
 
  Your kernel is quite old and you should probably be asking your
  distribution to help you out. There have been many fixes since 2.6.32.
  Are you able to reproduce the same issue with a current vanilla kernel?
 
  /proc/pid/stack
  ---
 
  [810597ca] __cond_resched+0x2a/0x40
  [81121569] unmap_vmas+0xb49/0xb70
  [8112822e] exit_mmap+0x7e/0x140
  [8105b078] mmput+0x58/0x110
  [81061aad] exit_mm+0x11d/0x160
  [81061c9d] do_exit+0x1ad/0x860
  [81062391] do_group_exit+0x41/0xb0
  [81077cd8] get_signal_to_deliver+0x1e8/0x430
  [8100a4c4] do_notify_resume+0xf4/0x8b0
  [8100b281] int_signal+0x12/0x17
  [] 0x
 
  This looks strange because this is just the exit path, which shouldn't
  deadlock or anything. Is this stack stable? Have you tried to check it
  a few more times?
 
  Looking at information.txt, I found something interesting
 
  rt_rq[0]:/1314
.rt_nr_running : 1
.rt_throttled  : 1
.rt_time   : 0.856656
.rt_runtime: 0.00
 
 
  cfs_rq[0]:/1314
.exec_clock: 8738.133429
.MIN_vruntime  : 0.01
.min_vruntime  : 8739.371271
.max_vruntime  : 0.01
.spread: 0.00
.spread0   : -9792.24
.nr_spread_over: 1
.nr_running: 0
.load  : 0
.load_avg  : 7376.722880
.load_period   : 7.203830
.load_contrib  : 1023
.load_tg   : 1023
.se->exec_start: 282004.715064
.se->vruntime: 18435.664560
.se->sum_exec_runtime: 8738.133429
.se->wait_start: 0.00
.se->sleep_start: 0.00
.se->block_start: 0.00
.se->sleep_max: 0.00
.se->block_max: 0.00
.se->exec_max: 77.977054
.se->slice_max: 0.00
.se->wait_max: 2.664779
.se->wait_sum: 29.970575
.se->wait_count: 102
.se->load.weight: 2
 
  So 1314 is a real time process and
 
  cpu.rt_period_us:
  100
  --
  cpu.rt_runtime_us:
  0
 
  When did tt move to being a Real Time process (hint: see rt_nr_running
  and rt_throttled above)?
 
  Balbir

-- 
Michal Hocko
SUSE Labs


Re: process hangs on do_exit when oom happens

2012-10-23 Thread Michal Hocko
On Tue 23-10-12 18:10:33, Qiang Gao wrote:
 On Tue, Oct 23, 2012 at 5:50 PM, Michal Hocko mho...@suse.cz wrote:
  On Tue 23-10-12 15:18:48, Qiang Gao wrote:
  This process was moved to RT-priority queue when global oom-killer
  happened to boost the recovery of the system..
 
  Who did that? oom killer doesn't boost the priority (scheduling class)
  AFAIK.
 
  but it wasn't get properily dealt with. I still have no idea why where
  the problem is ..
 
  Well your configuration says that there is no runtime reserved for the
  group.
  Please refer to Documentation/scheduler/sched-rt-group.txt for more
  information.
 
[...]
 maybe this is not an upstream-kernel bug. The centos/redhat kernel
 boosts the process to RT prio when the process is selected by the
 oom-killer.

This still looks like your cpu controller is misconfigured, even if the
task is promoted to realtime.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH] mm: make zone_pcp_reset independent of MEMORY_HOTREMOVE

2012-10-23 Thread Michal Hocko
On Tue 23-10-12 18:14:28, Wen Congyang wrote:
 At 10/23/2012 05:37 PM, Michal Hocko Wrote:
  340175b7 (mm/hotplug: free zone->pageset when a zone becomes empty)
  introduced zone_pcp_reset and hid it inside CONFIG_MEMORY_HOTREMOVE.
  Since 506e5fb7 (memory-hotplug: allocate zone's pcp before onlining
  pages) the function is also called from online_pages, which
 
 This patch is still in the -mm tree, and I have received a report from
 Liu Yuanhan.

Yes you are right. I will resend it and ask Andrew to fold it into the
offending patch. Thanks for catching that!

  is called outside CONFIG_MEMORY_HOTREMOVE, and that causes a linkage
  error.
  
  The function, although not used outside of MEMORY_{HOTPLUG,HOTREMOVE},
  seems universal enough, so let's keep it at its current location
  and only remove the HOTREMOVE guard.
  
  Signed-off-by: Michal Hocko mho...@suse.cz
  Cc: David Rientjes rient...@google.com
  Cc: Jiang Liu liu...@gmail.com
  Cc: Len Brown len.br...@intel.com
  Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
  Cc: Paul Mackerras pau...@samba.org
  Cc: Christoph Lameter c...@linux.com
  Cc: Minchan Kim minchan@gmail.com
  Cc: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com
  Cc: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
  Cc: Dave Hansen d...@linux.vnet.ibm.com
  Cc: Mel Gorman m...@csn.ul.ie
  ---
   mm/page_alloc.c |2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)
  
  diff --git a/mm/page_alloc.c b/mm/page_alloc.c
  index e29912e..30e359c 100644
  --- a/mm/page_alloc.c
  +++ b/mm/page_alloc.c
  @@ -5981,7 +5981,6 @@ void __meminit zone_pcp_update(struct zone *zone)
   }
   #endif
   
  -#ifdef CONFIG_MEMORY_HOTREMOVE
   void zone_pcp_reset(struct zone *zone)
   {
  unsigned long flags;
  @@ -6001,6 +6000,7 @@ void zone_pcp_reset(struct zone *zone)
  local_irq_restore(flags);
   }
   
  +#ifdef CONFIG_MEMORY_HOTREMOVE
   /*
* All pages in the range must be isolated before calling this.
*/
 
 This patch looks fine to me.
 
 Reviewed-by: Wen Congyang we...@cn.fujitsu.com
 

-- 
Michal Hocko
SUSE Labs


Re: mmotm 2012-10-22-17-08 uploaded (memory_hotplug.c)

2012-10-23 Thread Michal Hocko
On Mon 22-10-12 21:40:09, Randy Dunlap wrote:
 On 10/22/2012 05:09 PM, a...@linux-foundation.org wrote:
 
  The mm-of-the-moment snapshot 2012-10-22-17-08 has been uploaded to
  
 http://www.ozlabs.org/~akpm/mmotm/
  
  mmotm-readme.txt says
  
  README for mm-of-the-moment:
  
  http://www.ozlabs.org/~akpm/mmotm/
  
  This is a snapshot of my -mm patch queue.  Uploaded at random hopefully
  more than once a week.
  
 
 
 
 on x86_64, when CONFIG_MEMORY_HOTREMOVE is not enabled:
 
 mm/built-in.o: In function `online_pages':
 (.ref.text+0x10e7): undefined reference to `zone_pcp_reset'

Caused by memory-hotplug-allocate-zones-pcp-before-onlining-pages.patch
and fixed by the patch below. Andrew, either fold this one into the
above one or keep it separate, whatever works better for you.
---
From e8d79e446b00e57c195c59570df0f2ec435ca39d Mon Sep 17 00:00:00 2001
From: Michal Hocko mho...@suse.cz
Date: Tue, 23 Oct 2012 11:07:11 +0200
Subject: [PATCH] mm: make zone_pcp_reset independent of MEMORY_HOTREMOVE

340175b7 (mm/hotplug: free zone->pageset when a zone becomes empty)
introduced zone_pcp_reset and hid it inside CONFIG_MEMORY_HOTREMOVE.
Since "memory-hotplug: allocate zone's pcp before onlining pages" the
function is also called from online_pages, which is defined outside
CONFIG_MEMORY_HOTREMOVE, and that causes a linkage error.

The function, although not used outside of MEMORY_{HOTPLUG,HOTREMOVE},
seems universal enough, so let's keep it at its current location
and only remove the HOTREMOVE guard.

Signed-off-by: Michal Hocko mho...@suse.cz
Reviewed-by: Wen Congyang we...@cn.fujitsu.com
Cc: David Rientjes rient...@google.com
Cc: Jiang Liu liu...@gmail.com
Cc: Len Brown len.br...@intel.com
Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
Cc: Paul Mackerras pau...@samba.org
Cc: Christoph Lameter c...@linux.com
Cc: Minchan Kim minchan@gmail.com
Cc: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com
Cc: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
Cc: Dave Hansen d...@linux.vnet.ibm.com
Cc: Mel Gorman m...@csn.ul.ie
---
 mm/page_alloc.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e29912e..30e359c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5981,7 +5981,6 @@ void __meminit zone_pcp_update(struct zone *zone)
 }
 #endif
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void zone_pcp_reset(struct zone *zone)
 {
unsigned long flags;
@@ -6001,6 +6000,7 @@ void zone_pcp_reset(struct zone *zone)
local_irq_restore(flags);
 }
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
 /*
  * All pages in the range must be isolated before calling this.
  */
-- 
1.7.10.4


-- 
Michal Hocko
SUSE Labs


Re: [PATCH] add some drop_caches documentation and info message

2012-10-24 Thread Michal Hocko
On Tue 23-10-12 16:45:46, Andrew Morton wrote:
 On Fri, 12 Oct 2012 14:57:08 +0200
 Michal Hocko mho...@suse.cz wrote:
 
  Hi,
  I would like to resurrect the following Dave's patch. The last time it
  has been posted was here https://lkml.org/lkml/2010/9/16/250 and there
  didn't seem to be any strong opposition. 
  Kosaki was worried about possible excessive logging when somebody drops
  caches too often (but then he claimed he didn't have a strong opinion
  on that) but I would say the opposite. If somebody does that then I would
  really like to know that from the log when supporting a system because
  it almost for sure means that there is something fishy going on. It is
  also worth mentioning that only root can write to drop_caches so this is
  not a flooding attack vector.
  I am bringing that up again because this can be really helpful when
  chasing strange performance issues which (surprise surprise) turn out to
  be related to artificially dropped caches done because the admin thinks
  this would help...
  
  I have just refreshed the original patch on top of the current mm tree
  but I could live with KERN_INFO as well if people think that KERN_NOTICE
  is too hysterical.
  ---
  From 1f4058be9b089bc9d43d71bc63989335d7637d8d Mon Sep 17 00:00:00 2001
  From: Dave Hansen d...@linux.vnet.ibm.com
  Date: Fri, 12 Oct 2012 14:30:54 +0200
  Subject: [PATCH] add some drop_caches documentation and info message
  
  There is plenty of anecdotal evidence and a load of blog posts
  suggesting that using drop_caches periodically keeps your system
  running in tip top shape.  Perhaps adding some kernel
  documentation will increase the amount of accurate data on its use.
  
  If we are not shrinking caches effectively, then we have real bugs.
  Using drop_caches will simply mask the bugs and make them harder
  to find, but certainly does not fix them, nor is it an appropriate
  workaround to limit the size of the caches.
  
  It's a great debugging tool, and is really handy for doing things
  like repeatable benchmark runs.  So, add a bit more documentation
  about it, and add a little KERN_NOTICE.  It should help developers
  who are chasing down reclaim-related bugs.
  
  ...
 
  +   printk(KERN_NOTICE "%s (%d): dropped kernel caches: %d\n",
  +   current->comm, task_pid_nr(current), sysctl_drop_caches);
 
 urgh.  Are we really sure we want to do this?  The system operators who
 are actually using this thing will hate us :(

I have no problems with lowering the priority (how about KERN_INFO?)
but shouldn't this message remind them that they are doing something
wrong? And if somebody uses it for benchmarking to have a clean table
before the start, is this really that invasive?
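
(For context, the interface in question is the documented proc knob,
e.g. sync; echo 3 > /proc/sys/vm/drop_caches drops both the page cache
and the slab caches.)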

 More friendly alternatives might be:
 
 - Taint the kernel.  But that will only become apparent with an oops
   trace or similar.
 
 - Add a drop_caches counter and make that available in /proc/vmstat,
   show_mem() output and perhaps other places.

We would lose the timing and the originating process name in both
cases, and those can be really helpful while debugging. It is fair to
say that we could deduce the timing if we are collecting /proc/meminfo
or /proc/vmstat already, and we often do, but this is not the case all
of the time and sometimes it is important to know _who_ is doing all
this.
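
For reference, a minimal sketch of the counter alternative (DROP_CACHES
is a hypothetical new vm_event_item entry, not an existing one; the
surrounding code is the write path of the drop_caches handler):

	if (write) {
		if (sysctl_drop_caches & 1)
			iterate_supers(drop_pagecache_sb, NULL);
		if (sysctl_drop_caches & 2)
			drop_slab();
		/* would show up as its own line in /proc/vmstat */
		count_vm_event(DROP_CACHES);
	}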
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v4 05/14] Add a __GFP_KMEMCG flag

2012-10-09 Thread Michal Hocko
On Mon 08-10-12 14:06:11, Glauber Costa wrote:
 This flag is used to indicate to the callees that this allocation is a
 kernel allocation in process context, and should be accounted to
 current's memcg. It takes the numerical place of the recently removed
 __GFP_NO_KSWAPD.
 
 [ v4: make flag unconditional, also declare it in trace code ]
 
 Signed-off-by: Glauber Costa glom...@parallels.com
 CC: Christoph Lameter c...@linux.com
 CC: Pekka Enberg penb...@cs.helsinki.fi
 CC: Michal Hocko mho...@suse.cz
 CC: Suleiman Souhlal sulei...@google.com
 Acked-by: Johannes Weiner han...@cmpxchg.org
 Acked-by: Rik van Riel r...@redhat.com
 Acked-by: Mel Gorman m...@csn.ul.ie
 Acked-by: Kamezawa Hiroyuki kamezawa.hir...@jp.fujitsu.com

Acked-by: Michal Hocko mho...@suse.cz

 ---
  include/linux/gfp.h | 3 ++-
  include/trace/events/gfpflags.h | 1 +
  2 files changed, 3 insertions(+), 1 deletion(-)
 
 diff --git a/include/linux/gfp.h b/include/linux/gfp.h
 index 02c1c97..9289d46 100644
 --- a/include/linux/gfp.h
 +++ b/include/linux/gfp.h
 @@ -31,6 +31,7 @@ struct vm_area_struct;
  #define ___GFP_THISNODE  0x40000u
  #define ___GFP_RECLAIMABLE   0x80000u
  #define ___GFP_NOTRACK   0x200000u
 +#define ___GFP_KMEMCG    0x400000u
  #define ___GFP_OTHER_NODE    0x800000u
  #define ___GFP_WRITE 0x1000000u
  
 @@ -87,7 +88,7 @@ struct vm_area_struct;
  
  #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
  #define __GFP_WRITE  ((__force gfp_t)___GFP_WRITE)   /* Allocator intends to dirty page */
 -
 +#define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */
  /*
   * This may seem redundant, but it's a way of annotating false positives vs.
   * allocations that simply cannot be supported (e.g. page tables).
 diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
 index 9391706..730df12 100644
 --- a/include/trace/events/gfpflags.h
 +++ b/include/trace/events/gfpflags.h
 @@ -36,6 +36,7 @@
   {(unsigned long)__GFP_RECLAIMABLE,  "GFP_RECLAIMABLE"}, \
   {(unsigned long)__GFP_MOVABLE,  "GFP_MOVABLE"}, \
   {(unsigned long)__GFP_NOTRACK,  "GFP_NOTRACK"}, \
 + {(unsigned long)__GFP_KMEMCG,   "GFP_KMEMCG"},  \
   {(unsigned long)__GFP_OTHER_NODE,   "GFP_OTHER_NODE"}   \
   ) : "GFP_NOWAIT"
  
 -- 
 1.7.11.4
 

-- 
Michal Hocko
SUSE Labs


  1   2   3   4   5   6   7   8   9   10   >