Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc
On 2014/8/12 5:05, David Rientjes wrote: > On Mon, 11 Aug 2014, Vladimir Davydov wrote: > >>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c >>> --- a/mm/page_alloc.c >>> +++ b/mm/page_alloc.c >>> @@ -1963,7 +1963,7 @@ zonelist_scan: >>> >>> /* >>> * Scan zonelist, looking for a zone with enough free. >>> -* See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. >>> +* See __cpuset_node_allowed() comment in kernel/cpuset.c. >>> */ >>> for_each_zone_zonelist_nodemask(zone, z, zonelist, >>> high_zoneidx, nodemask) { >>> @@ -1974,7 +1974,7 @@ zonelist_scan: >>> continue; >>> if (cpusets_enabled() && >>> (alloc_flags & ALLOC_CPUSET) && >>> - !cpuset_zone_allowed_softwall(zone, gfp_mask)) >>> + !cpuset_zone_allowed(zone, gfp_mask)) >>> continue; >> >> So, this is get_page_from_freelist. It's called from >> __alloc_pages_nodemask with alloc_flags always having ALLOC_CPUSET bit >> set and from __alloc_pages_slowpath with alloc_flags having ALLOC_CPUSET >> bit set only for __GFP_WAIT allocations. That said, w/o your patch we >> try to respect cpusets for all allocations, including atomic, and only >> ignore cpusets if tight on memory (freelist's empty) for !__GFP_WAIT >> allocations, while with your patch we always ignore cpusets for >> !__GFP_WAIT allocations. Not sure if it really matters though, because >> usually one uses cpuset.mems in conjunction with cpuset.cpus and it >> won't make any difference then. It also doesn't conflict with any cpuset >> documentation. >> > > Yeah, that's why I'm asking Li, the cpuset maintainer, if we can do this. I'm not quite sure. That code has been there before I got involved in cpuset. > The only thing that we get by falling back to the page allocator slowpath > is that kswapd gets woken up before the allocation is attempted without > ALLOC_CPUSET. It seems pointless to wakeup kswapd when the allocation can > succeed on any node. 
Even with the patch, if the allocation fails because > all nodes are below their min watermark, then we still fallback to the > slowpath and wake up kswapd but there's nothing much else we can do > because it's !__GFP_WAIT. > . But I tend to agree with you. But if we want to do this, we should split this change from the cleanup. Regarding the cleanup, I found there used to be a single cpuset_node_allowed(), and your cleanup is exactly a revert of that ancient commit: commit 02a0e53d8227aff5e62e0433f82c12c1c2805fd6 Author: Paul Jackson Date: Wed Dec 13 00:34:25 2006 -0800 [PATCH] cpuset: rework cpuset_zone_allowed api Seems the major intention was to avoid accidental sleep-in-atomic bugs, because callback_mutex might be held. I don't see there's any reason callback_mutex can't be a spinlock. I thought about this when Gu Zhen fixed the bug that callback_mutex is nested inside rcu_read_lock(). -- kernel/cpuset.c | 81 ++--- 1 file changed, 49 insertions(+), 32 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index baa155c..9d9e239 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -284,7 +284,7 @@ static struct cpuset top_cpuset = { */ static DEFINE_MUTEX(cpuset_mutex); -static DEFINE_MUTEX(callback_mutex); +static DEFINE_SPINLOCK(callback_lock); /* * CPU / memory hotplug is handled asynchronously. 
@@ -848,6 +848,7 @@ static void update_tasks_cpumask(struct cpuset *cs) */ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) { + unsigned long flags; struct cpuset *cp; struct cgroup_subsys_state *pos_css; bool need_rebuild_sched_domains = false; @@ -875,9 +876,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) continue; rcu_read_unlock(); - mutex_lock(&callback_mutex); + spin_lock_irqsave(&callback_lock, flags); cpumask_copy(cp->effective_cpus, new_cpus); - mutex_unlock(&callback_mutex); + spin_unlock_irqrestore(&callback_lock, flags); WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); @@ -910,6 +911,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { + unsigned long flags; int retval; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ @@ -942,9 +944,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - mutex_lock(&callback_mutex); + spin_lock_irqsave(&callback_lock, flags); cpumask_copy(cs->cpus_allowed,
Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc
On 2014/8/12 5:05, David Rientjes wrote: On Mon, 11 Aug 2014, Vladimir Davydov wrote: diff --git a/mm/page_alloc.c b/mm/page_alloc.c --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1963,7 +1963,7 @@ zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. -* See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. +* See __cpuset_node_allowed() comment in kernel/cpuset.c. */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { @@ -1974,7 +1974,7 @@ zonelist_scan: continue; if (cpusets_enabled() (alloc_flags ALLOC_CPUSET) - !cpuset_zone_allowed_softwall(zone, gfp_mask)) + !cpuset_zone_allowed(zone, gfp_mask)) continue; So, this is get_page_from_freelist. It's called from __alloc_pages_nodemask with alloc_flags always having ALLOC_CPUSET bit set and from __alloc_pages_slowpath with alloc_flags having ALLOC_CPUSET bit set only for __GFP_WAIT allocations. That said, w/o your patch we try to respect cpusets for all allocations, including atomic, and only ignore cpusets if tight on memory (freelist's empty) for !__GFP_WAIT allocations, while with your patch we always ignore cpusets for !__GFP_WAIT allocations. Not sure if it really matters though, because usually one uses cpuset.mems in conjunction with cpuset.cpus and it won't make any difference then. It also doesn't conflict with any cpuset documentation. Yeah, that's why I'm asking Li, the cpuset maintainer, if we can do this. I'm not quite sure. That code has been there before I got involved in cpuset. The only thing that we get by falling back to the page allocator slowpath is that kswapd gets woken up before the allocation is attempted without ALLOC_CPUSET. It seems pointless to wakeup kswapd when the allocation can succeed on any node. Even with the patch, if the allocation fails because all nodes are below their min watermark, then we still fallback to the slowpath and wake up kswapd but there's nothing much else we can do because it's !__GFP_WAIT. . 
But I tend to agree with you. But if we want to do this, we should split this change from the cleanup. Regarding to the cleanup, I found there used to be a single cpuset_node_allowed(), and your cleanup is exactly a revert of that ancient commit: commit 02a0e53d8227aff5e62e0433f82c12c1c2805fd6 Author: Paul Jackson p...@sgi.com Date: Wed Dec 13 00:34:25 2006 -0800 [PATCH] cpuset: rework cpuset_zone_allowed api Seems the major intention was to avoid accident sleep-in-atomic bugs, because callback_mutex might be held. I don't see there's any reason callback_mutex can't be a spinlock. I thought about this when Gu Zhen fixed the bug that callback_mutex is nested inside rcu_read_lock(). -- kernel/cpuset.c | 81 ++--- 1 file changed, 49 insertions(+), 32 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index baa155c..9d9e239 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -284,7 +284,7 @@ static struct cpuset top_cpuset = { */ static DEFINE_MUTEX(cpuset_mutex); -static DEFINE_MUTEX(callback_mutex); +static DEFINE_SPINLOCK(callback_lock); /* * CPU / memory hotplug is handled asynchronously. 
@@ -848,6 +848,7 @@ static void update_tasks_cpumask(struct cpuset *cs) */ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) { + unsigned long flags; struct cpuset *cp; struct cgroup_subsys_state *pos_css; bool need_rebuild_sched_domains = false; @@ -875,9 +876,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) continue; rcu_read_unlock(); - mutex_lock(callback_mutex); + spin_lock_irqsave(callback_lock, flags); cpumask_copy(cp-effective_cpus, new_cpus); - mutex_unlock(callback_mutex); + spin_unlock_irqrestore(callback_lock, flags); WARN_ON(!cgroup_on_dfl(cp-css.cgroup) !cpumask_equal(cp-cpus_allowed, cp-effective_cpus)); @@ -910,6 +911,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { + unsigned long flags; int retval; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ @@ -942,9 +944,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval 0) return retval; - mutex_lock(callback_mutex); + spin_lock_irqsave(callback_lock, flags); cpumask_copy(cs-cpus_allowed, trialcs-cpus_allowed); - mutex_unlock(callback_mutex); +
Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc
On Mon, 11 Aug 2014, Vladimir Davydov wrote: > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > > --- a/mm/page_alloc.c > > +++ b/mm/page_alloc.c > > @@ -1963,7 +1963,7 @@ zonelist_scan: > > > > /* > > * Scan zonelist, looking for a zone with enough free. > > -* See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. > > +* See __cpuset_node_allowed() comment in kernel/cpuset.c. > > */ > > for_each_zone_zonelist_nodemask(zone, z, zonelist, > > high_zoneidx, nodemask) { > > @@ -1974,7 +1974,7 @@ zonelist_scan: > > continue; > > if (cpusets_enabled() && > > (alloc_flags & ALLOC_CPUSET) && > > - !cpuset_zone_allowed_softwall(zone, gfp_mask)) > > + !cpuset_zone_allowed(zone, gfp_mask)) > > continue; > > So, this is get_page_from_freelist. It's called from > __alloc_pages_nodemask with alloc_flags always having ALLOC_CPUSET bit > set and from __alloc_pages_slowpath with alloc_flags having ALLOC_CPUSET > bit set only for __GFP_WAIT allocations. That said, w/o your patch we > try to respect cpusets for all allocations, including atomic, and only > ignore cpusets if tight on memory (freelist's empty) for !__GFP_WAIT > allocations, while with your patch we always ignore cpusets for > !__GFP_WAIT allocations. Not sure if it really matters though, because > usually one uses cpuset.mems in conjunction with cpuset.cpus and it > won't make any difference then. It also doesn't conflict with any cpuset > documentation. > Yeah, that's why I'm asking Li, the cpuset maintainer, if we can do this. The only thing that we get by falling back to the page allocator slowpath is that kswapd gets woken up before the allocation is attempted without ALLOC_CPUSET. It seems pointless to wakeup kswapd when the allocation can succeed on any node. Even with the patch, if the allocation fails because all nodes are below their min watermark, then we still fallback to the slowpath and wake up kswapd but there's nothing much else we can do because it's !__GFP_WAIT. 
-- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc
On Mon, Aug 11, 2014 at 04:37:15AM -0700, David Rientjes wrote: > On Mon, 11 Aug 2014, Vladimir Davydov wrote: > > > > diff --git a/mm/slab.c b/mm/slab.c > > > --- a/mm/slab.c > > > +++ b/mm/slab.c > > > @@ -3047,16 +3047,19 @@ retry: > > >* from existing per node queues. > > >*/ > > > for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { > > > - nid = zone_to_nid(zone); > > > + struct kmem_cache_node *n; > > > > > > - if (cpuset_zone_allowed_hardwall(zone, flags) && > > > - get_node(cache, nid) && > > > - get_node(cache, nid)->free_objects) { > > > - obj = cache_alloc_node(cache, > > > - flags | GFP_THISNODE, nid); > > > - if (obj) > > > - break; > > > - } > > > + nid = zone_to_nid(zone); > > > + if (!cpuset_zone_allowed(zone, flags | __GFP_HARDWALL)) > > > > We must use softwall check here, otherwise we will proceed to > > alloc_pages even if there are lots of free slabs on other nodes. > > alloc_pages, in turn, may allocate from other nodes in case > > cpuset.mem_hardwall=0, because it uses softwall check, so it may add yet > > another free slab to another node's list even if it isn't empty. As a > > result, we may get free list bloating on other nodes. I've seen a > > machine with one of its nodes almost completely filled with inactive > > slabs for buffer_heads (dozens of GBs) w/o any chance to drop them. So, > > this is a bug that must be fixed. > > > > Right, I understand, and my patch makes no attempt to fix that issue, it's > simply collapsing the code down into a single cpuset_zone_allowed() > function and the context for the allocation is controlled by the gfp > flags (and hardwall is controlled by setting __GFP_HARDWALL) as it should > be. I understand the issue you face, but I can't combine a cleanup with a > fix and I would prefer to have your patch keep your commit description. Sorry, I misunderstood you. 
> The diffstat for my proposal removes many more lines than it adds and I > think it will avoid this type of issue in the future for new callers. > Your patch could then be based on the single cpuset_zone_allowed() > function where you would simply have to remove the __GFP_HARDWALL above. > Or, your patch could be merged first and then my cleanup on top, but it > seems like your one-liner would be more clear if it is based on mine. Having one function instead of two doing similar thing is usually better IMO, but AFAIU your patch isn't a mere cleanup - it also slightly changes the logic behind !__GFP_WAIT vs cpusets interaction: > @@ -2505,18 +2501,22 @@ static struct cpuset > *nearest_hardwall_ancestor(struct cpuset *cs) > * GFP_USER - only nodes in current tasks mems allowed ok. > * > * Rule: > - *Don't call cpuset_node_allowed_softwall if you can't sleep, unless you > + *Don't call __cpuset_node_allowed if you can't sleep, unless you > *pass in the __GFP_HARDWALL flag set in gfp_flag, which disables > *the code that might scan up ancestor cpusets and sleep. > */ > -int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) > +int __cpuset_node_allowed(int node, const gfp_t gfp_mask) > { > struct cpuset *cs; /* current cpuset ancestors */ > int allowed;/* is allocation in zone z allowed? */ > > - if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) > + if (in_interrupt()) > return 1; > might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); > + if (gfp_mask & __GFP_THISNODE) > + return 1; > + if (!(gfp_mask & __GFP_WAIT)) > + return 1; This means cpuset_zone_allowed will now always return true for !__GFP_WAIT allocations. > if (node_isset(node, current->mems_allowed)) > return 1; > /* [...] > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -1963,7 +1963,7 @@ zonelist_scan: > > /* >* Scan zonelist, looking for a zone with enough free. > - * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 
> + * See __cpuset_node_allowed() comment in kernel/cpuset.c. >*/ > for_each_zone_zonelist_nodemask(zone, z, zonelist, > high_zoneidx, nodemask) { > @@ -1974,7 +1974,7 @@ zonelist_scan: > continue; > if (cpusets_enabled() && > (alloc_flags & ALLOC_CPUSET) && > - !cpuset_zone_allowed_softwall(zone, gfp_mask)) > + !cpuset_zone_allowed(zone, gfp_mask)) > continue; So, this is get_page_from_freelist. It's called from __alloc_pages_nodemask with alloc_flags always having ALLOC_CPUSET bit set and from __alloc_pages_slowpath with
Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc
On Mon, 11 Aug 2014, Vladimir Davydov wrote: > > diff --git a/mm/slab.c b/mm/slab.c > > --- a/mm/slab.c > > +++ b/mm/slab.c > > @@ -3047,16 +3047,19 @@ retry: > > * from existing per node queues. > > */ > > for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { > > - nid = zone_to_nid(zone); > > + struct kmem_cache_node *n; > > > > - if (cpuset_zone_allowed_hardwall(zone, flags) && > > - get_node(cache, nid) && > > - get_node(cache, nid)->free_objects) { > > - obj = cache_alloc_node(cache, > > - flags | GFP_THISNODE, nid); > > - if (obj) > > - break; > > - } > > + nid = zone_to_nid(zone); > > + if (!cpuset_zone_allowed(zone, flags | __GFP_HARDWALL)) > > We must use softwall check here, otherwise we will proceed to > alloc_pages even if there are lots of free slabs on other nodes. > alloc_pages, in turn, may allocate from other nodes in case > cpuset.mem_hardwall=0, because it uses softwall check, so it may add yet > another free slab to another node's list even if it isn't empty. As a > result, we may get free list bloating on other nodes. I've seen a > machine with one of its nodes almost completely filled with inactive > slabs for buffer_heads (dozens of GBs) w/o any chance to drop them. So, > this is a bug that must be fixed. > Right, I understand, and my patch makes no attempt to fix that issue, it's simply collapsing the code down into a single cpuset_zone_allowed() function and the context for the allocation is controlled by the gfp flags (and hardwall is controlled by setting __GFP_HARDWALL) as it should be. I understand the issue you face, but I can't combine a cleanup with a fix and I would prefer to have your patch keep your commit description. The diffstat for my proposal removes many more lines than it adds and I think it will avoid this type of issue in the future for new callers. Your patch could then be based on the single cpuset_zone_allowed() function where you would simply have to remove the __GFP_HARDWALL above. 
Or, your patch could be merged first and then my cleanup on top, but it seems like your one-liner would be more clear if it is based on mine. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc
On Sun, Aug 10, 2014 at 03:43:21PM -0700, David Rientjes wrote: > On Sun, 10 Aug 2014, Vladimir Davydov wrote: > > > fallback_alloc is called on kmalloc if the preferred node doesn't have > > free or partial slabs and there's no pages on the node's free list > > (GFP_THISNODE allocations fail). Before invoking the reclaimer it tries > > to locate a free or partial slab on other allowed nodes' lists. While > > iterating over the preferred node's zonelist it skips those zones which > > cpuset_zone_allowed_hardwall returns false for. That means that for a > > task bound to a specific node using cpusets fallback_alloc will always > > ignore free slabs on other nodes and go directly to the reclaimer, > > which, however, may allocate from other nodes if cpuset.mem_hardwall is > > unset (default). As a result, we may get lists of free slabs grow > > without bounds on other nodes, which is bad, because inactive slabs are > > only evicted by cache_reap at a very slow rate and cannot be dropped > > forcefully. > > > > To reproduce the issue, run a process that will walk over a directory > > tree with lots of files inside a cpuset bound to a node that constantly > > experiences memory pressure. Look at num_slabs vs active_slabs growth as > > reported by /proc/slabinfo. > > > > We should use cpuset_zone_allowed_softwall in fallback_alloc. Since it > > can sleep, we only call it on __GFP_WAIT allocations. For atomic > > allocations we simply ignore cpusets, which is in agreement with the > > cpuset documentation (see the comment to __cpuset_node_allowed_softwall). > > > > If that rule were ever changed, nobody would think to modify the > fallback_alloc() behavior in the slab allocator. Why can't > cpuset_zone_allowed_hardwall() just return 1 for !__GFP_WAIT? > > I don't think this issue is restricted only to slab, it's for all callers > of cpuset_zone_allowed_softwall() that could possibly be atomic. 
I think > it would be better to determine if cpuset_zone_allowed() should be > hardwall or softwall depending on the gfp flags. > > Let's add Li, the cpuset maintainer. Any reason we can't do this? > --- [...] > diff --git a/mm/slab.c b/mm/slab.c > --- a/mm/slab.c > +++ b/mm/slab.c > @@ -3047,16 +3047,19 @@ retry: >* from existing per node queues. >*/ > for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { > - nid = zone_to_nid(zone); > + struct kmem_cache_node *n; > > - if (cpuset_zone_allowed_hardwall(zone, flags) && > - get_node(cache, nid) && > - get_node(cache, nid)->free_objects) { > - obj = cache_alloc_node(cache, > - flags | GFP_THISNODE, nid); > - if (obj) > - break; > - } > + nid = zone_to_nid(zone); > + if (!cpuset_zone_allowed(zone, flags | __GFP_HARDWALL)) We must use softwall check here, otherwise we will proceed to alloc_pages even if there are lots of free slabs on other nodes. alloc_pages, in turn, may allocate from other nodes in case cpuset.mem_hardwall=0, because it uses softwall check, so it may add yet another free slab to another node's list even if it isn't empty. As a result, we may get free list bloating on other nodes. I've seen a machine with one of its nodes almost completely filled with inactive slabs for buffer_heads (dozens of GBs) w/o any chance to drop them. So, this is a bug that must be fixed. Note, for SLUB using hardwall check in get_any_partial won't lead to such a problem, because once added a new slab is loaded to a per cpu list forcing any further user to allocate from it. Strictly speaking, we should use softwall check there either though. 
> + continue; + n = get_node(cache, nid); + if (!n) + continue; + if (!n->free_objects) + continue; + obj = cache_alloc_node(cache, flags | GFP_THISNODE, nid); + if (obj) + break; + } > > if (!obj) { > diff --git a/mm/slub.c b/mm/slub.c > --- a/mm/slub.c > +++ b/mm/slub.c > @@ -1671,20 +1671,22 @@ static void *get_any_partial(struct kmem_cache *s, > gfp_t flags, > struct kmem_cache_node *n; > > n = get_node(s, zone_to_nid(zone)); > + if (!n) > + continue; > + if (!cpuset_zone_allowed(zone, flags | __GFP_HARDWALL)) > + continue; > + if (n->nr_partial <= s->min_partial) > + continue; > > - if (n && cpuset_zone_allowed_hardwall(zone, flags) && > - n->nr_partial > s->min_partial) { > - object = get_partial_node(s, n, c,
Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc
On Mon, 11 Aug 2014, Vladimir Davydov wrote: diff --git a/mm/page_alloc.c b/mm/page_alloc.c --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1963,7 +1963,7 @@ zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. -* See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. +* See __cpuset_node_allowed() comment in kernel/cpuset.c. */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { @@ -1974,7 +1974,7 @@ zonelist_scan: continue; if (cpusets_enabled() (alloc_flags ALLOC_CPUSET) - !cpuset_zone_allowed_softwall(zone, gfp_mask)) + !cpuset_zone_allowed(zone, gfp_mask)) continue; So, this is get_page_from_freelist. It's called from __alloc_pages_nodemask with alloc_flags always having ALLOC_CPUSET bit set and from __alloc_pages_slowpath with alloc_flags having ALLOC_CPUSET bit set only for __GFP_WAIT allocations. That said, w/o your patch we try to respect cpusets for all allocations, including atomic, and only ignore cpusets if tight on memory (freelist's empty) for !__GFP_WAIT allocations, while with your patch we always ignore cpusets for !__GFP_WAIT allocations. Not sure if it really matters though, because usually one uses cpuset.mems in conjunction with cpuset.cpus and it won't make any difference then. It also doesn't conflict with any cpuset documentation. Yeah, that's why I'm asking Li, the cpuset maintainer, if we can do this. The only thing that we get by falling back to the page allocator slowpath is that kswapd gets woken up before the allocation is attempted without ALLOC_CPUSET. It seems pointless to wakeup kswapd when the allocation can succeed on any node. Even with the patch, if the allocation fails because all nodes are below their min watermark, then we still fallback to the slowpath and wake up kswapd but there's nothing much else we can do because it's !__GFP_WAIT. 
-- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc
On Sun, Aug 10, 2014 at 03:43:21PM -0700, David Rientjes wrote: On Sun, 10 Aug 2014, Vladimir Davydov wrote: fallback_alloc is called on kmalloc if the preferred node doesn't have free or partial slabs and there's no pages on the node's free list (GFP_THISNODE allocations fail). Before invoking the reclaimer it tries to locate a free or partial slab on other allowed nodes' lists. While iterating over the preferred node's zonelist it skips those zones which cpuset_zone_allowed_hardwall returns false for. That means that for a task bound to a specific node using cpusets fallback_alloc will always ignore free slabs on other nodes and go directly to the reclaimer, which, however, may allocate from other nodes if cpuset.mem_hardwall is unset (default). As a result, we may get lists of free slabs grow without bounds on other nodes, which is bad, because inactive slabs are only evicted by cache_reap at a very slow rate and cannot be dropped forcefully. To reproduce the issue, run a process that will walk over a directory tree with lots of files inside a cpuset bound to a node that constantly experiences memory pressure. Look at num_slabs vs active_slabs growth as reported by /proc/slabinfo. We should use cpuset_zone_allowed_softwall in fallback_alloc. Since it can sleep, we only call it on __GFP_WAIT allocations. For atomic allocations we simply ignore cpusets, which is in agreement with the cpuset documenation (see the comment to __cpuset_node_allowed_softwall). If that rule were ever changed, nobody would think to modify the fallback_alloc() behavior in the slab allocator. Why can't cpuset_zone_allowed_hardwall() just return 1 for !__GFP_WAIT? I don't think this issue is restricted only to slab, it's for all callers of cpuset_zone_allowed_softwall() that could possibly be atomic. I think it would be better to determine if cpuset_zone_allowed() should be hardwall or softwall depending on the gfp flags. Let's add Li, the cpuset maintainer. Any reason we can't do this? 
--- [...] diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c @@ -3047,16 +3047,19 @@ retry: * from existing per node queues. */ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - nid = zone_to_nid(zone); + struct kmem_cache_node *n; - if (cpuset_zone_allowed_hardwall(zone, flags) - get_node(cache, nid) - get_node(cache, nid)-free_objects) { - obj = cache_alloc_node(cache, - flags | GFP_THISNODE, nid); - if (obj) - break; - } + nid = zone_to_nid(zone); + if (!cpuset_zone_allowed(zone, flags | __GFP_HARDWALL)) We must use softwall check here, otherwise we will proceed to alloc_pages even if there are lots of free slabs on other nodes. alloc_pages, in turn, may allocate from other nodes in case cpuset.mem_hardwall=0, because it uses softwall check, so it may add yet another free slab to another node's list even if it isn't empty. As a result, we may get free list bloating on other nodes. I've seen a machine with one of its nodes almost completely filled with inactive slabs for buffer_heads (dozens of GBs) w/o any chance to drop them. So, this is a bug that must be fixed. Note, for SLUB using hardwall check in get_any_partial won't lead to such a problem, because once added a new slab is loaded to a per cpu list forcing any further user to allocate from it. Strictly speaking, we should use softwall check there either though. 
+ continue; + n = get_node(cache, nid); + if (!n) + continue; + if (!n->free_objects) + continue; + obj = cache_alloc_node(cache, flags | GFP_THISNODE, nid); + if (obj) + break; } if (!obj) { diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c @@ -1671,20 +1671,22 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, struct kmem_cache_node *n; n = get_node(s, zone_to_nid(zone)); + if (!n) + continue; + if (!cpuset_zone_allowed(zone, flags | __GFP_HARDWALL)) + continue; + if (n->nr_partial <= s->min_partial) + continue; - if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, c, flags); - if (object) { - /* -
Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc
On Mon, 11 Aug 2014, Vladimir Davydov wrote: diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c @@ -3047,16 +3047,19 @@ retry: * from existing per node queues. */ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - nid = zone_to_nid(zone); + struct kmem_cache_node *n; - if (cpuset_zone_allowed_hardwall(zone, flags) - get_node(cache, nid) - get_node(cache, nid)-free_objects) { - obj = cache_alloc_node(cache, - flags | GFP_THISNODE, nid); - if (obj) - break; - } + nid = zone_to_nid(zone); + if (!cpuset_zone_allowed(zone, flags | __GFP_HARDWALL)) We must use softwall check here, otherwise we will proceed to alloc_pages even if there are lots of free slabs on other nodes. alloc_pages, in turn, may allocate from other nodes in case cpuset.mem_hardwall=0, because it uses softwall check, so it may add yet another free slab to another node's list even if it isn't empty. As a result, we may get free list bloating on other nodes. I've seen a machine with one of its nodes almost completely filled with inactive slabs for buffer_heads (dozens of GBs) w/o any chance to drop them. So, this is a bug that must be fixed. Right, I understand, and my patch makes no attempt to fix that issue, it's simply collapsing the code down into a single cpuset_zone_allowed() function and the context for the allocation is controlled by the gfp flags (and hardwall is controlled by setting __GFP_HARDWALL) as it should be. I understand the issue you face, but I can't combine a cleanup with a fix and I would prefer to have your patch keep your commit description. The diffstat for my proposal removes many more lines than it adds and I think it will avoid this type of issue in the future for new callers. Your patch could then be based on the single cpuset_zone_allowed() function where you would simply have to remove the __GFP_HARDWALL above. 
Or, your patch could be merged first and then my cleanup on top, but it seems like your one-liner would be more clear if it is based on mine. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc
On Mon, Aug 11, 2014 at 04:37:15AM -0700, David Rientjes wrote: On Mon, 11 Aug 2014, Vladimir Davydov wrote: diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c @@ -3047,16 +3047,19 @@ retry: * from existing per node queues. */ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - nid = zone_to_nid(zone); + struct kmem_cache_node *n; - if (cpuset_zone_allowed_hardwall(zone, flags) - get_node(cache, nid) - get_node(cache, nid)-free_objects) { - obj = cache_alloc_node(cache, - flags | GFP_THISNODE, nid); - if (obj) - break; - } + nid = zone_to_nid(zone); + if (!cpuset_zone_allowed(zone, flags | __GFP_HARDWALL)) We must use softwall check here, otherwise we will proceed to alloc_pages even if there are lots of free slabs on other nodes. alloc_pages, in turn, may allocate from other nodes in case cpuset.mem_hardwall=0, because it uses softwall check, so it may add yet another free slab to another node's list even if it isn't empty. As a result, we may get free list bloating on other nodes. I've seen a machine with one of its nodes almost completely filled with inactive slabs for buffer_heads (dozens of GBs) w/o any chance to drop them. So, this is a bug that must be fixed. Right, I understand, and my patch makes no attempt to fix that issue, it's simply collapsing the code down into a single cpuset_zone_allowed() function and the context for the allocation is controlled by the gfp flags (and hardwall is controlled by setting __GFP_HARDWALL) as it should be. I understand the issue you face, but I can't combine a cleanup with a fix and I would prefer to have your patch keep your commit description. Sorry, I misunderstood you. The diffstat for my proposal removes many more lines than it adds and I think it will avoid this type of issue in the future for new callers. Your patch could then be based on the single cpuset_zone_allowed() function where you would simply have to remove the __GFP_HARDWALL above. 
Or, your patch could be merged first and then my cleanup on top, but it seems like your one-liner would be more clear if it is based on mine. Having one function instead of two doing similar thing is usually better IMO, but AFAIU your patch isn't a mere cleanup - it also slightly changes the logic behind !__GFP_WAIT vs cpusets interaction: @@ -2505,18 +2501,22 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_USER - only nodes in current tasks mems allowed ok. * * Rule: - *Don't call cpuset_node_allowed_softwall if you can't sleep, unless you + *Don't call __cpuset_node_allowed if you can't sleep, unless you *pass in the __GFP_HARDWALL flag set in gfp_flag, which disables *the code that might scan up ancestor cpusets and sleep. */ -int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) +int __cpuset_node_allowed(int node, const gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ int allowed;/* is allocation in zone z allowed? */ - if (in_interrupt() || (gfp_mask __GFP_THISNODE)) + if (in_interrupt()) return 1; might_sleep_if(!(gfp_mask __GFP_HARDWALL)); + if (gfp_mask __GFP_THISNODE) + return 1; + if (!(gfp_mask __GFP_WAIT)) + return 1; This means cpuset_zone_allowed will now always return true for !__GFP_WAIT allocations. if (node_isset(node, current-mems_allowed)) return 1; /* [...] diff --git a/mm/page_alloc.c b/mm/page_alloc.c --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1963,7 +1963,7 @@ zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. - * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. + * See __cpuset_node_allowed() comment in kernel/cpuset.c. */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { @@ -1974,7 +1974,7 @@ zonelist_scan: continue; if (cpusets_enabled() (alloc_flags ALLOC_CPUSET) - !cpuset_zone_allowed_softwall(zone, gfp_mask)) + !cpuset_zone_allowed(zone, gfp_mask)) continue; So, this is get_page_from_freelist. 
It's called from __alloc_pages_nodemask with alloc_flags always having ALLOC_CPUSET bit set and from __alloc_pages_slowpath with alloc_flags having ALLOC_CPUSET bit set only for __GFP_WAIT allocations. That said, w/o your patch we try to respect cpusets for all allocations, including
Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc
On Sun, 10 Aug 2014, Vladimir Davydov wrote: > fallback_alloc is called on kmalloc if the preferred node doesn't have > free or partial slabs and there's no pages on the node's free list > (GFP_THISNODE allocations fail). Before invoking the reclaimer it tries > to locate a free or partial slab on other allowed nodes' lists. While > iterating over the preferred node's zonelist it skips those zones which > cpuset_zone_allowed_hardwall returns false for. That means that for a > task bound to a specific node using cpusets fallback_alloc will always > ignore free slabs on other nodes and go directly to the reclaimer, > which, however, may allocate from other nodes if cpuset.mem_hardwall is > unset (default). As a result, we may get lists of free slabs grow > without bounds on other nodes, which is bad, because inactive slabs are > only evicted by cache_reap at a very slow rate and cannot be dropped > forcefully. > > To reproduce the issue, run a process that will walk over a directory > tree with lots of files inside a cpuset bound to a node that constantly > experiences memory pressure. Look at num_slabs vs active_slabs growth as > reported by /proc/slabinfo. > > We should use cpuset_zone_allowed_softwall in fallback_alloc. Since it > can sleep, we only call it on __GFP_WAIT allocations. For atomic > allocations we simply ignore cpusets, which is in agreement with the > cpuset documenation (see the comment to __cpuset_node_allowed_softwall). > If that rule were ever changed, nobody would think to modify the fallback_alloc() behavior in the slab allocator. Why can't cpuset_zone_allowed_hardwall() just return 1 for !__GFP_WAIT? I don't think this issue is restricted only to slab, it's for all callers of cpuset_zone_allowed_softwall() that could possibly be atomic. I think it would be better to determine if cpuset_zone_allowed() should be hardwall or softwall depending on the gfp flags. Let's add Li, the cpuset maintainer. Any reason we can't do this? 
--- diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -48,29 +48,16 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p); void cpuset_init_current_mems_allowed(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); -extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask); -extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask); +extern int __cpuset_node_allowed(int node, const gfp_t gfp_mask); -static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) +static inline int cpuset_node_allowed(int node, gfp_t gfp_mask) { - return nr_cpusets() <= 1 || - __cpuset_node_allowed_softwall(node, gfp_mask); + return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask); } -static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) +static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { - return nr_cpusets() <= 1 || - __cpuset_node_allowed_hardwall(node, gfp_mask); -} - -static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) -{ - return cpuset_node_allowed_softwall(zone_to_nid(z), gfp_mask); -} - -static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) -{ - return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask); + return cpuset_node_allowed(zone_to_nid(z), gfp_mask); } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, @@ -178,22 +165,12 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) return 1; } -static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) -{ - return 1; -} - -static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) -{ - return 1; -} - -static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) +static inline int cpuset_node_allowed(int node, gfp_t gfp_mask) { return 1; } -static inline int cpuset_zone_allowed_hardwall(struct zone *z, 
gfp_t gfp_mask) +static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return 1; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2449,7 +2449,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) } /** - * cpuset_node_allowed_softwall - Can we allocate on a memory node? + * __cpuset_node_allowed - Can we allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -2461,12 +2461,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * flag, yes. * Otherwise, no. * - * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to - * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall() - * might sleep, and might allow a node from an enclosing cpuset. - * - *
[PATCH -mm] slab: fix cpuset check in fallback_alloc
fallback_alloc is called on kmalloc if the preferred node doesn't have free or partial slabs and there are no pages on the node's free list (GFP_THISNODE allocations fail). Before invoking the reclaimer it tries to locate a free or partial slab on other allowed nodes' lists. While iterating over the preferred node's zonelist it skips those zones which cpuset_zone_allowed_hardwall returns false for. That means that for a task bound to a specific node using cpusets fallback_alloc will always ignore free slabs on other nodes and go directly to the reclaimer, which, however, may allocate from other nodes if cpuset.mem_hardwall is unset (default). As a result, we may get lists of free slabs growing without bounds on other nodes, which is bad, because inactive slabs are only evicted by cache_reap at a very slow rate and cannot be dropped forcefully. To reproduce the issue, run a process that will walk over a directory tree with lots of files inside a cpuset bound to a node that constantly experiences memory pressure. Look at num_slabs vs active_slabs growth as reported by /proc/slabinfo. We should use cpuset_zone_allowed_softwall in fallback_alloc. Since it can sleep, we only call it on __GFP_WAIT allocations. For atomic allocations we simply ignore cpusets, which is in agreement with the cpuset documentation (see the comment to __cpuset_node_allowed_softwall). 
Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim --- mm/slab.c | 23 --- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 2e60bf3dedbb..1d77a4df7ee1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3049,14 +3049,23 @@ retry: for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { nid = zone_to_nid(zone); - if (cpuset_zone_allowed_hardwall(zone, flags) && - get_node(cache, nid) && - get_node(cache, nid)->free_objects) { - obj = cache_alloc_node(cache, - flags | GFP_THISNODE, nid); - if (obj) - break; + if (!get_node(cache, nid) || + !get_node(cache, nid)->free_objects) + continue; + + if (local_flags & __GFP_WAIT) { + bool allowed; + + local_irq_enable(); + allowed = cpuset_zone_allowed_softwall(zone, flags); + local_irq_disable(); + if (!allowed) + continue; } + + obj = cache_alloc_node(cache, flags | GFP_THISNODE, nid); + if (obj) + break; } if (!obj) { -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH -mm] slab: fix cpuset check in fallback_alloc
fallback_alloc is called on kmalloc if the preferred node doesn't have free or partial slabs and there are no pages on the node's free list (GFP_THISNODE allocations fail). Before invoking the reclaimer it tries to locate a free or partial slab on other allowed nodes' lists. While iterating over the preferred node's zonelist it skips those zones which cpuset_zone_allowed_hardwall returns false for. That means that for a task bound to a specific node using cpusets fallback_alloc will always ignore free slabs on other nodes and go directly to the reclaimer, which, however, may allocate from other nodes if cpuset.mem_hardwall is unset (default). As a result, we may get lists of free slabs growing without bounds on other nodes, which is bad, because inactive slabs are only evicted by cache_reap at a very slow rate and cannot be dropped forcefully. To reproduce the issue, run a process that will walk over a directory tree with lots of files inside a cpuset bound to a node that constantly experiences memory pressure. Look at num_slabs vs active_slabs growth as reported by /proc/slabinfo. We should use cpuset_zone_allowed_softwall in fallback_alloc. Since it can sleep, we only call it on __GFP_WAIT allocations. For atomic allocations we simply ignore cpusets, which is in agreement with the cpuset documentation (see the comment to __cpuset_node_allowed_softwall). 
Signed-off-by: Vladimir Davydov vdavy...@parallels.com Cc: Christoph Lameter c...@linux.com Cc: Pekka Enberg penb...@kernel.org Cc: David Rientjes rient...@google.com Cc: Joonsoo Kim iamjoonsoo@lge.com --- mm/slab.c | 23 --- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 2e60bf3dedbb..1d77a4df7ee1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3049,14 +3049,23 @@ retry: for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { nid = zone_to_nid(zone); - if (cpuset_zone_allowed_hardwall(zone, flags) - get_node(cache, nid) - get_node(cache, nid)-free_objects) { - obj = cache_alloc_node(cache, - flags | GFP_THISNODE, nid); - if (obj) - break; + if (!get_node(cache, nid) || + !get_node(cache, nid)-free_objects) + continue; + + if (local_flags __GFP_WAIT) { + bool allowed; + + local_irq_enable(); + allowed = cpuset_zone_allowed_softwall(zone, flags); + local_irq_disable(); + if (!allowed) + continue; } + + obj = cache_alloc_node(cache, flags | GFP_THISNODE, nid); + if (obj) + break; } if (!obj) { -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc
On Sun, 10 Aug 2014, Vladimir Davydov wrote: fallback_alloc is called on kmalloc if the preferred node doesn't have free or partial slabs and there are no pages on the node's free list (GFP_THISNODE allocations fail). Before invoking the reclaimer it tries to locate a free or partial slab on other allowed nodes' lists. While iterating over the preferred node's zonelist it skips those zones which cpuset_zone_allowed_hardwall returns false for. That means that for a task bound to a specific node using cpusets fallback_alloc will always ignore free slabs on other nodes and go directly to the reclaimer, which, however, may allocate from other nodes if cpuset.mem_hardwall is unset (default). As a result, we may get lists of free slabs growing without bounds on other nodes, which is bad, because inactive slabs are only evicted by cache_reap at a very slow rate and cannot be dropped forcefully. To reproduce the issue, run a process that will walk over a directory tree with lots of files inside a cpuset bound to a node that constantly experiences memory pressure. Look at num_slabs vs active_slabs growth as reported by /proc/slabinfo. We should use cpuset_zone_allowed_softwall in fallback_alloc. Since it can sleep, we only call it on __GFP_WAIT allocations. For atomic allocations we simply ignore cpusets, which is in agreement with the cpuset documentation (see the comment to __cpuset_node_allowed_softwall). If that rule were ever changed, nobody would think to modify the fallback_alloc() behavior in the slab allocator. Why can't cpuset_zone_allowed_hardwall() just return 1 for !__GFP_WAIT? I don't think this issue is restricted only to slab, it's for all callers of cpuset_zone_allowed_softwall() that could possibly be atomic. I think it would be better to determine if cpuset_zone_allowed() should be hardwall or softwall depending on the gfp flags. Let's add Li, the cpuset maintainer. Any reason we can't do this? 
--- diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -48,29 +48,16 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p); void cpuset_init_current_mems_allowed(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); -extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask); -extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask); +extern int __cpuset_node_allowed(int node, const gfp_t gfp_mask); -static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) +static inline int cpuset_node_allowed(int node, gfp_t gfp_mask) { - return nr_cpusets() = 1 || - __cpuset_node_allowed_softwall(node, gfp_mask); + return nr_cpusets() = 1 || __cpuset_node_allowed(node, gfp_mask); } -static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) +static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { - return nr_cpusets() = 1 || - __cpuset_node_allowed_hardwall(node, gfp_mask); -} - -static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) -{ - return cpuset_node_allowed_softwall(zone_to_nid(z), gfp_mask); -} - -static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) -{ - return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask); + return cpuset_node_allowed(zone_to_nid(z), gfp_mask); } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, @@ -178,22 +165,12 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) return 1; } -static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) -{ - return 1; -} - -static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) -{ - return 1; -} - -static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) +static inline int cpuset_node_allowed(int node, gfp_t gfp_mask) { return 1; } -static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t 
gfp_mask) +static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return 1; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2449,7 +2449,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) } /** - * cpuset_node_allowed_softwall - Can we allocate on a memory node? + * __cpuset_node_allowed - Can we allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -2461,12 +2461,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * flag, yes. * Otherwise, no. * - * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to - * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall() - * might sleep, and might allow a node from an enclosing cpuset. - * - * cpuset_node_allowed_hardwall() only handles the