Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc

2014-08-14 Thread Li Zefan
On 2014/8/12 5:05, David Rientjes wrote:
> On Mon, 11 Aug 2014, Vladimir Davydov wrote:
> 
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -1963,7 +1963,7 @@ zonelist_scan:
>>>  
>>> /*
>>>  * Scan zonelist, looking for a zone with enough free.
>>> -* See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
>>> +* See __cpuset_node_allowed() comment in kernel/cpuset.c.
>>>  */
>>> for_each_zone_zonelist_nodemask(zone, z, zonelist,
>>> high_zoneidx, nodemask) {
>>> @@ -1974,7 +1974,7 @@ zonelist_scan:
>>> continue;
>>> if (cpusets_enabled() &&
>>> (alloc_flags & ALLOC_CPUSET) &&
>>> -   !cpuset_zone_allowed_softwall(zone, gfp_mask))
>>> +   !cpuset_zone_allowed(zone, gfp_mask))
>>> continue;
>>
>> So, this is get_page_from_freelist. It's called from
>> __alloc_pages_nodemask with alloc_flags always having ALLOC_CPUSET bit
>> set and from __alloc_pages_slowpath with alloc_flags having ALLOC_CPUSET
>> bit set only for __GFP_WAIT allocations. That said, w/o your patch we
>> try to respect cpusets for all allocations, including atomic, and only
>> ignore cpusets if tight on memory (freelist's empty) for !__GFP_WAIT
>> allocations, while with your patch we always ignore cpusets for
>> !__GFP_WAIT allocations. Not sure if it really matters though, because
>> usually one uses cpuset.mems in conjunction with cpuset.cpus and it
>> won't make any difference then. It also doesn't conflict with any cpuset
>> documentation.
>>
> 
> Yeah, that's why I'm asking Li, the cpuset maintainer, if we can do this.  

I'm not quite sure. That code has been there since before I got involved in cpuset.

> The only thing that we get by falling back to the page allocator slowpath 
> is that kswapd gets woken up before the allocation is attempted without 
> ALLOC_CPUSET.  It seems pointless to wakeup kswapd when the allocation can 
> succeed on any node.  Even with the patch, if the allocation fails because 
> all nodes are below their min watermark, then we still fallback to the 
> slowpath and wake up kswapd but there's nothing much else we can do 
> because it's !__GFP_WAIT.
> .

That said, I tend to agree with you. But if we want to do this, we should split this
change from the cleanup.

Regarding the cleanup, I found there used to be a single cpuset_node_allowed(),
and your cleanup is essentially a revert of that ancient commit:

commit 02a0e53d8227aff5e62e0433f82c12c1c2805fd6
Author: Paul Jackson 
Date:   Wed Dec 13 00:34:25 2006 -0800

[PATCH] cpuset: rework cpuset_zone_allowed api

It seems the major intention was to avoid accidental sleep-in-atomic bugs, because
callback_mutex might be held.

I don't see any reason callback_mutex can't be a spinlock. I thought
about this when Gu Zheng fixed the bug where callback_mutex was nested inside
rcu_read_lock().
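
To illustrate, a minimal sketch (not part of the diff below) of how the
blocking tail of __cpuset_node_allowed() could take the proposed
callback_lock instead of callback_mutex; task_cs() and
nearest_hardwall_ancestor() are the existing helpers in kernel/cpuset.c,
while the _sketch name and the elided checks are illustrative only:

/* Sketch only: with an irq-safe spinlock the ancestor walk no longer sleeps. */
static int __cpuset_node_allowed_sketch(int node, gfp_t gfp_mask)
{
	struct cpuset *cs;
	unsigned long flags;
	int allowed;

	/* ... fast-path checks on gfp_mask and current->mems_allowed ... */

	spin_lock_irqsave(&callback_lock, flags);
	rcu_read_lock();
	cs = nearest_hardwall_ancestor(task_cs(current));
	allowed = node_isset(node, cs->mems_allowed);
	rcu_read_unlock();
	spin_unlock_irqrestore(&callback_lock, flags);

	return allowed;
}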

--
 kernel/cpuset.c | 81 ++---
 1 file changed, 49 insertions(+), 32 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index baa155c..9d9e239 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
  */
 
 static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_MUTEX(callback_mutex);
+static DEFINE_SPINLOCK(callback_lock);
 
 /*
  * CPU / memory hotplug is handled asynchronously.
@@ -848,6 +848,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
  */
 static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
+   unsigned long flags;
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
bool need_rebuild_sched_domains = false;
@@ -875,9 +876,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
continue;
rcu_read_unlock();
 
-   mutex_lock(&callback_mutex);
+   spin_lock_irqsave(&callback_lock, flags);
cpumask_copy(cp->effective_cpus, new_cpus);
-   mutex_unlock(&callback_mutex);
+   spin_unlock_irqrestore(&callback_lock, flags);
 
WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -910,6 +911,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  const char *buf)
 {
+   unsigned long flags;
int retval;
 
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
@@ -942,9 +944,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
 
-   mutex_lock(&callback_mutex);
+   spin_lock_irqsave(&callback_lock, flags);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);

Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc

2014-08-11 Thread David Rientjes
On Mon, 11 Aug 2014, Vladimir Davydov wrote:

> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1963,7 +1963,7 @@ zonelist_scan:
> >  
> > /*
> >  * Scan zonelist, looking for a zone with enough free.
> > -* See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
> > +* See __cpuset_node_allowed() comment in kernel/cpuset.c.
> >  */
> > for_each_zone_zonelist_nodemask(zone, z, zonelist,
> > high_zoneidx, nodemask) {
> > @@ -1974,7 +1974,7 @@ zonelist_scan:
> > continue;
> > if (cpusets_enabled() &&
> > (alloc_flags & ALLOC_CPUSET) &&
> > -   !cpuset_zone_allowed_softwall(zone, gfp_mask))
> > +   !cpuset_zone_allowed(zone, gfp_mask))
> > continue;
> 
> So, this is get_page_from_freelist. It's called from
> __alloc_pages_nodemask with alloc_flags always having ALLOC_CPUSET bit
> set and from __alloc_pages_slowpath with alloc_flags having ALLOC_CPUSET
> bit set only for __GFP_WAIT allocations. That said, w/o your patch we
> try to respect cpusets for all allocations, including atomic, and only
> ignore cpusets if tight on memory (freelist's empty) for !__GFP_WAIT
> allocations, while with your patch we always ignore cpusets for
> !__GFP_WAIT allocations. Not sure if it really matters though, because
> usually one uses cpuset.mems in conjunction with cpuset.cpus and it
> won't make any difference then. It also doesn't conflict with any cpuset
> documentation.
> 

Yeah, that's why I'm asking Li, the cpuset maintainer, if we can do this.  
The only thing that we get by falling back to the page allocator slowpath 
is that kswapd gets woken up before the allocation is attempted without 
ALLOC_CPUSET.  It seems pointless to wakeup kswapd when the allocation can 
succeed on any node.  Even with the patch, if the allocation fails because 
all nodes are below their min watermark, then we still fallback to the 
slowpath and wake up kswapd but there's nothing much else we can do 
because it's !__GFP_WAIT.
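
For reference, a condensed sketch of the slowpath behaviour described in the
quoted text above (paraphrased, not copied from mm/page_alloc.c): the
slowpath keeps ALLOC_CPUSET only for __GFP_WAIT allocations, so cpusets are
ignored for atomic allocations only after kswapd has been woken:

/* Sketch: how alloc_flags for the cpuset check are chosen in the slowpath. */
static int gfp_to_alloc_flags_sketch(gfp_t gfp_mask)
{
	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

	if (!(gfp_mask & __GFP_WAIT)) {
		/* Atomic allocation: ignore cpuset mems rather than fail. */
		alloc_flags &= ~ALLOC_CPUSET;
	}
	return alloc_flags;
}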


Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc

2014-08-11 Thread Vladimir Davydov
On Mon, Aug 11, 2014 at 04:37:15AM -0700, David Rientjes wrote:
> On Mon, 11 Aug 2014, Vladimir Davydov wrote:
> 
> > > diff --git a/mm/slab.c b/mm/slab.c
> > > --- a/mm/slab.c
> > > +++ b/mm/slab.c
> > > @@ -3047,16 +3047,19 @@ retry:
> > >* from existing per node queues.
> > >*/
> > >   for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
> > > - nid = zone_to_nid(zone);
> > > + struct kmem_cache_node *n;
> > >  
> > > - if (cpuset_zone_allowed_hardwall(zone, flags) &&
> > > - get_node(cache, nid) &&
> > > - get_node(cache, nid)->free_objects) {
> > > - obj = cache_alloc_node(cache,
> > > - flags | GFP_THISNODE, nid);
> > > - if (obj)
> > > - break;
> > > - }
> > > + nid = zone_to_nid(zone);
> > > + if (!cpuset_zone_allowed(zone, flags | __GFP_HARDWALL))
> > 
> > We must use softwall check here, otherwise we will proceed to
> > alloc_pages even if there are lots of free slabs on other nodes.
> > alloc_pages, in turn, may allocate from other nodes in case
> > cpuset.mem_hardwall=0, because it uses softwall check, so it may add yet
> > another free slab to another node's list even if it isn't empty. As a
> > result, we may get free list bloating on other nodes. I've seen a
> > machine with one of its nodes almost completely filled with inactive
> > slabs for buffer_heads (dozens of GBs) w/o any chance to drop them. So,
> > this is a bug that must be fixed.
> > 
> 
> Right, I understand, and my patch makes no attempt to fix that issue, it's 
> simply collapsing the code down into a single cpuset_zone_allowed() 
> function and the context for the allocation is controlled by the gfp 
> flags (and hardwall is controlled by setting __GFP_HARDWALL) as it should 
> be.  I understand the issue you face, but I can't combine a cleanup with a 
> fix and I would prefer to have your patch keep your commit description.  

Sorry, I misunderstood you.

> The diffstat for my proposal removes many more lines than it adds and I 
> think it will avoid this type of issue in the future for new callers.  
> Your patch could then be based on the single cpuset_zone_allowed() 
> function where you would simply have to remove the __GFP_HARDWALL above.  
> Or, your patch could be merged first and then my cleanup on top, but it 
> seems like your one-liner would be more clear if it is based on mine.

Having one function instead of two doing a similar thing is usually better
IMO, but AFAIU your patch isn't a mere cleanup - it also slightly
changes the logic behind the !__GFP_WAIT vs cpusets interaction:

> @@ -2505,18 +2501,22 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
>   *   GFP_USER - only nodes in current tasks mems allowed ok.
>   *
>   * Rule:
> - *Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
> + *Don't call __cpuset_node_allowed if you can't sleep, unless you
>   *pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
>   *the code that might scan up ancestor cpusets and sleep.
>   */
> -int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
> +int __cpuset_node_allowed(int node, const gfp_t gfp_mask)
>  {
>   struct cpuset *cs;  /* current cpuset ancestors */
>   int allowed;/* is allocation in zone z allowed? */
>  
> - if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
> + if (in_interrupt())
>   return 1;
>   might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
> + if (gfp_mask & __GFP_THISNODE)
> + return 1;
> + if (!(gfp_mask & __GFP_WAIT))
> + return 1;

This means cpuset_zone_allowed will now always return true for
!__GFP_WAIT allocations.

>   if (node_isset(node, current->mems_allowed))
>   return 1;
>   /*
[...]
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1963,7 +1963,7 @@ zonelist_scan:
>  
>   /*
>* Scan zonelist, looking for a zone with enough free.
> -  * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
> +  * See __cpuset_node_allowed() comment in kernel/cpuset.c.
>*/
>   for_each_zone_zonelist_nodemask(zone, z, zonelist,
>   high_zoneidx, nodemask) {
> @@ -1974,7 +1974,7 @@ zonelist_scan:
>   continue;
>   if (cpusets_enabled() &&
>   (alloc_flags & ALLOC_CPUSET) &&
> - !cpuset_zone_allowed_softwall(zone, gfp_mask))
> + !cpuset_zone_allowed(zone, gfp_mask))
>   continue;

So, this is get_page_from_freelist. It's called from
__alloc_pages_nodemask with alloc_flags always having ALLOC_CPUSET bit
set and from __alloc_pages_slowpath with alloc_flags having ALLOC_CPUSET
bit set only for __GFP_WAIT allocations. That said, w/o your patch we
try to respect cpusets for all allocations, including atomic, and only
ignore cpusets if tight on memory (freelist's empty) for !__GFP_WAIT
allocations, while with your patch we always ignore cpusets for
!__GFP_WAIT allocations. Not sure if it really matters though, because
usually one uses cpuset.mems in conjunction with cpuset.cpus and it
won't make any difference then. It also doesn't conflict with any cpuset
documentation.

Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc

2014-08-11 Thread David Rientjes
On Mon, 11 Aug 2014, Vladimir Davydov wrote:

> > diff --git a/mm/slab.c b/mm/slab.c
> > --- a/mm/slab.c
> > +++ b/mm/slab.c
> > @@ -3047,16 +3047,19 @@ retry:
> >  * from existing per node queues.
> >  */
> > for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
> > -   nid = zone_to_nid(zone);
> > +   struct kmem_cache_node *n;
> >  
> > -   if (cpuset_zone_allowed_hardwall(zone, flags) &&
> > -   get_node(cache, nid) &&
> > -   get_node(cache, nid)->free_objects) {
> > -   obj = cache_alloc_node(cache,
> > -   flags | GFP_THISNODE, nid);
> > -   if (obj)
> > -   break;
> > -   }
> > +   nid = zone_to_nid(zone);
> > +   if (!cpuset_zone_allowed(zone, flags | __GFP_HARDWALL))
> 
> We must use softwall check here, otherwise we will proceed to
> alloc_pages even if there are lots of free slabs on other nodes.
> alloc_pages, in turn, may allocate from other nodes in case
> cpuset.mem_hardwall=0, because it uses softwall check, so it may add yet
> another free slab to another node's list even if it isn't empty. As a
> result, we may get free list bloating on other nodes. I've seen a
> machine with one of its nodes almost completely filled with inactive
> slabs for buffer_heads (dozens of GBs) w/o any chance to drop them. So,
> this is a bug that must be fixed.
> 

Right, I understand, and my patch makes no attempt to fix that issue, it's 
simply collapsing the code down into a single cpuset_zone_allowed() 
function and the context for the allocation is controlled by the gfp 
flags (and hardwall is controlled by setting __GFP_HARDWALL) as it should 
be.  I understand the issue you face, but I can't combine a cleanup with a 
fix and I would prefer to have your patch keep your commit description.  

The diffstat for my proposal removes many more lines than it adds and I 
think it will avoid this type of issue in the future for new callers.  
Your patch could then be based on the single cpuset_zone_allowed() 
function where you would simply have to remove the __GFP_HARDWALL above.  
Or, your patch could be merged first and then my cleanup on top, but it 
seems like your one-liner would be more clear if it is based on mine.
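
In caller terms, the single entry point being proposed boils down to picking
the wall via gfp bits; both snippets below are taken from the patches in this
thread:

	/* hardwall check, e.g. in slab's fallback_alloc(): */
	if (!cpuset_zone_allowed(zone, flags | __GFP_HARDWALL))
		continue;

	/* softwall check, e.g. in the page allocator's zonelist scan: */
	if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
	    !cpuset_zone_allowed(zone, gfp_mask))
		continue;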


Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc

2014-08-11 Thread Vladimir Davydov
On Sun, Aug 10, 2014 at 03:43:21PM -0700, David Rientjes wrote:
> On Sun, 10 Aug 2014, Vladimir Davydov wrote:
> 
> > fallback_alloc is called on kmalloc if the preferred node doesn't have
> > free or partial slabs and there are no pages on the node's free list
> > (GFP_THISNODE allocations fail). Before invoking the reclaimer it tries
> > to locate a free or partial slab on other allowed nodes' lists. While
> > iterating over the preferred node's zonelist it skips those zones which
> > cpuset_zone_allowed_hardwall returns false for. That means that for a
> > task bound to a specific node using cpusets fallback_alloc will always
> > ignore free slabs on other nodes and go directly to the reclaimer,
> > which, however, may allocate from other nodes if cpuset.mem_hardwall is
> > unset (default). As a result, we may get lists of free slabs growing
> > without bounds on other nodes, which is bad, because inactive slabs are
> > only evicted by cache_reap at a very slow rate and cannot be dropped
> > forcefully.
> > 
> > To reproduce the issue, run a process that will walk over a directory
> > tree with lots of files inside a cpuset bound to a node that constantly
> > experiences memory pressure. Look at num_slabs vs active_slabs growth as
> > reported by /proc/slabinfo.
> > 
> > We should use cpuset_zone_allowed_softwall in fallback_alloc. Since it
> > can sleep, we only call it on __GFP_WAIT allocations. For atomic
> > allocations we simply ignore cpusets, which is in agreement with the
> > cpuset documentation (see the comment to __cpuset_node_allowed_softwall).
> > 
> 
> If that rule were ever changed, nobody would think to modify the 
> fallback_alloc() behavior in the slab allocator.  Why can't 
> cpuset_zone_allowed_hardwall() just return 1 for !__GFP_WAIT?
> 
> I don't think this issue is restricted only to slab, it's for all callers 
> of cpuset_zone_allowed_softwall() that could possibly be atomic.  I think 
> it would be better to determine if cpuset_zone_allowed() should be 
> hardwall or softwall depending on the gfp flags.
> 
> Let's add Li, the cpuset maintainer.  Any reason we can't do this?
> ---
[...]
> diff --git a/mm/slab.c b/mm/slab.c
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -3047,16 +3047,19 @@ retry:
>* from existing per node queues.
>*/
>   for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
> - nid = zone_to_nid(zone);
> + struct kmem_cache_node *n;
>  
> - if (cpuset_zone_allowed_hardwall(zone, flags) &&
> - get_node(cache, nid) &&
> - get_node(cache, nid)->free_objects) {
> - obj = cache_alloc_node(cache,
> - flags | GFP_THISNODE, nid);
> - if (obj)
> - break;
> - }
> + nid = zone_to_nid(zone);
> + if (!cpuset_zone_allowed(zone, flags | __GFP_HARDWALL))

We must use softwall check here, otherwise we will proceed to
alloc_pages even if there are lots of free slabs on other nodes.
alloc_pages, in turn, may allocate from other nodes in case
cpuset.mem_hardwall=0, because it uses softwall check, so it may add yet
another free slab to another node's list even if it isn't empty. As a
result, we may get free list bloating on other nodes. I've seen a
machine with one of its nodes almost completely filled with inactive
slabs for buffer_heads (dozens of GBs) w/o any chance to drop them. So,
this is a bug that must be fixed.

Note, for SLUB, using the hardwall check in get_any_partial won't lead to
such a problem, because once added, a new slab is loaded onto a per-cpu
list, forcing any further users to allocate from it. Strictly speaking, we
should use the softwall check there as well, though.
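
For context, a rough sketch of the difference between the two checks relied
on in this argument, using the existing helpers in kernel/cpuset.c (locking,
TIF_MEMDIE and PF_EXITING handling elided; the _sketch names are
illustrative):

static bool hardwall_allows_sketch(int node)
{
	return node_isset(node, current->mems_allowed);
}

static bool softwall_allows_sketch(int node, gfp_t gfp_mask)
{
	if (node_isset(node, current->mems_allowed))
		return true;
	if (gfp_mask & __GFP_HARDWALL)
		return false;
	/* May sleep: walk up to the nearest mem_exclusive/mem_hardwall ancestor. */
	return node_isset(node,
			  nearest_hardwall_ancestor(task_cs(current))->mems_allowed);
}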

> + continue;
> + n = get_node(cache, nid);
> + if (!n)
> + continue;
> + if (!n->free_objects)
> + continue;
> + obj = cache_alloc_node(cache, flags | GFP_THISNODE, nid);
> + if (obj)
> + break;
>   }
>  
>   if (!obj) {
> diff --git a/mm/slub.c b/mm/slub.c
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1671,20 +1671,22 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
>   struct kmem_cache_node *n;
>  
>   n = get_node(s, zone_to_nid(zone));
> + if (!n)
> + continue;
> + if (!cpuset_zone_allowed(zone, flags | __GFP_HARDWALL))
> + continue;
> + if (n->nr_partial <= s->min_partial)
> + continue;
>  
> - if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
> - n->nr_partial > s->min_partial) {
> - object = get_partial_node(s, n, c, 

Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc

2014-08-10 Thread David Rientjes
On Sun, 10 Aug 2014, Vladimir Davydov wrote:

> fallback_alloc is called on kmalloc if the preferred node doesn't have
> free or partial slabs and there are no pages on the node's free list
> (GFP_THISNODE allocations fail). Before invoking the reclaimer it tries
> to locate a free or partial slab on other allowed nodes' lists. While
> iterating over the preferred node's zonelist it skips those zones which
> cpuset_zone_allowed_hardwall returns false for. That means that for a
> task bound to a specific node using cpusets fallback_alloc will always
> ignore free slabs on other nodes and go directly to the reclaimer,
> which, however, may allocate from other nodes if cpuset.mem_hardwall is
> unset (default). As a result, we may get lists of free slabs growing
> without bounds on other nodes, which is bad, because inactive slabs are
> only evicted by cache_reap at a very slow rate and cannot be dropped
> forcefully.
> 
> To reproduce the issue, run a process that will walk over a directory
> tree with lots of files inside a cpuset bound to a node that constantly
> experiences memory pressure. Look at num_slabs vs active_slabs growth as
> reported by /proc/slabinfo.
> 
> We should use cpuset_zone_allowed_softwall in fallback_alloc. Since it
> can sleep, we only call it on __GFP_WAIT allocations. For atomic
> allocations we simply ignore cpusets, which is in agreement with the
> cpuset documentation (see the comment to __cpuset_node_allowed_softwall).
> 

If that rule were ever changed, nobody would think to modify the 
fallback_alloc() behavior in the slab allocator.  Why can't 
cpuset_zone_allowed_hardwall() just return 1 for !__GFP_WAIT?

I don't think this issue is restricted only to slab, it's for all callers 
of cpuset_zone_allowed_softwall() that could possibly be atomic.  I think 
it would be better to determine if cpuset_zone_allowed() should be 
hardwall or softwall depending on the gfp flags.

Let's add Li, the cpuset maintainer.  Any reason we can't do this?
---
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -48,29 +48,16 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
-extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask);
-extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask);
+extern int __cpuset_node_allowed(int node, const gfp_t gfp_mask);
 
-static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
-   return nr_cpusets() <= 1 ||
-   __cpuset_node_allowed_softwall(node, gfp_mask);
+   return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask);
 }
 
-static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
+static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
-   return nr_cpusets() <= 1 ||
-   __cpuset_node_allowed_hardwall(node, gfp_mask);
-}
-
-static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
-{
-   return cpuset_node_allowed_softwall(zone_to_nid(z), gfp_mask);
-}
-
-static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
-{
-   return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask);
+   return cpuset_node_allowed(zone_to_nid(z), gfp_mask);
 }
 
 extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
@@ -178,22 +165,12 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
return 1;
 }
 
-static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
-{
-   return 1;
-}
-
-static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
-{
-   return 1;
-}
-
-static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
return 1;
 }
 
-static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
return 1;
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2449,7 +2449,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 }
 
 /**
- * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * __cpuset_node_allowed - Can we allocate on a memory node?
  * @node: is this an allowed node?
  * @gfp_mask: memory allocation flags
  *
@@ -2461,12 +2461,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  * flag, yes.
  * Otherwise, no.
  *
- * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
- * cpuset_node_allowed_hardwall().  Otherwise, cpuset_node_allowed_softwall()
- * might sleep, and might allow a node from an enclosing cpuset.
- *
- * 

[PATCH -mm] slab: fix cpuset check in fallback_alloc

2014-08-10 Thread Vladimir Davydov
fallback_alloc is called on kmalloc if the preferred node doesn't have
free or partial slabs and there are no pages on the node's free list
(GFP_THISNODE allocations fail). Before invoking the reclaimer it tries
to locate a free or partial slab on other allowed nodes' lists. While
iterating over the preferred node's zonelist it skips those zones which
cpuset_zone_allowed_hardwall returns false for. That means that for a
task bound to a specific node using cpusets fallback_alloc will always
ignore free slabs on other nodes and go directly to the reclaimer,
which, however, may allocate from other nodes if cpuset.mem_hardwall is
unset (default). As a result, we may get lists of free slabs growing
without bounds on other nodes, which is bad, because inactive slabs are
only evicted by cache_reap at a very slow rate and cannot be dropped
forcefully.

To reproduce the issue, run a process that will walk over a directory
tree with lots of files inside a cpuset bound to a node that constantly
experiences memory pressure. Look at num_slabs vs active_slabs growth as
reported by /proc/slabinfo.
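
For anyone reproducing this, a minimal user-space sketch of the tree walk
(the cpuset binding itself is configured separately through cpuset.mems of
the test cgroup; the program is illustrative and not part of the patch):

/* repro-walk.c: touch every entry of a big tree so dentry/inode/buffer_head
 * slabs get populated, then watch num_slabs vs active_slabs in
 * /proc/slabinfo.
 */
#define _XOPEN_SOURCE 500
#include <ftw.h>
#include <sys/stat.h>
#include <stdio.h>
#include <stdlib.h>

static int visit(const char *path, const struct stat *sb,
		 int typeflag, struct FTW *ftwbuf)
{
	(void)path; (void)sb; (void)typeflag; (void)ftwbuf;
	return 0;	/* nftw() already stat()s every entry */
}

int main(int argc, char **argv)
{
	const char *root = argc > 1 ? argv[1] : ".";

	if (nftw(root, visit, 64, FTW_PHYS) == -1) {
		perror("nftw");
		return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}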

We should use cpuset_zone_allowed_softwall in fallback_alloc. Since it
can sleep, we only call it on __GFP_WAIT allocations. For atomic
allocations we simply ignore cpusets, which is in agreement with the
cpuset documentation (see the comment to __cpuset_node_allowed_softwall).

Signed-off-by: Vladimir Davydov 
Cc: Christoph Lameter 
Cc: Pekka Enberg 
Cc: David Rientjes 
Cc: Joonsoo Kim 
---
 mm/slab.c |   23 ---
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 2e60bf3dedbb..1d77a4df7ee1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3049,14 +3049,23 @@ retry:
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
nid = zone_to_nid(zone);
 
-   if (cpuset_zone_allowed_hardwall(zone, flags) &&
-   get_node(cache, nid) &&
-   get_node(cache, nid)->free_objects) {
-   obj = cache_alloc_node(cache,
-   flags | GFP_THISNODE, nid);
-   if (obj)
-   break;
+   if (!get_node(cache, nid) ||
+   !get_node(cache, nid)->free_objects)
+   continue;
+
+   if (local_flags & __GFP_WAIT) {
+   bool allowed;
+
+   local_irq_enable();
+   allowed = cpuset_zone_allowed_softwall(zone, flags);
+   local_irq_disable();
+   if (!allowed)
+   continue;
}
+
+   obj = cache_alloc_node(cache, flags | GFP_THISNODE, nid);
+   if (obj)
+   break;
}
 
if (!obj) {
-- 
1.7.10.4


