RE: [patch] mm, slab: faster active and free stats

2016-12-02 Thread
Hello, David.

There is a problem with my e-mail client, so I have to use another one.
Please excuse the broken reply style.

Yeah, I like this version very much. Can we do the slabs_free accounting
directly in get_first_slab() and get_valid_first_slab()? Passing
page_is_free isn't needed if we do it directly in those functions.

One nitpick is that if we don't replace the variable name num_slabs with
total_slabs, we will get less churn in the code. However, total_slabs looks
better than num_slabs.

Thanks.

-----Original Message-----
From: David Rientjes [mailto:rient...@google.com] 
Sent: Wednesday, November 30, 2016 9:57 AM
To: Joonsoo Kim <iamjoonsoo@lge.com>
Cc: Andrew Morton <a...@linux-foundation.org>; Greg Thelen 
<gthe...@google.com>; Aruna Ramakrishna <aruna.ramakris...@oracle.com>; 
Christoph Lameter <c...@linux.com>; linux-kernel@vger.kernel.org; 
linux...@kvack.org
Subject: Re: [patch] mm, slab: faster active and free stats

On Mon, 28 Nov 2016, Joonsoo Kim wrote:

> Hello,
> 
> Sorry for long delay.
> I agree that this improvement is needed. Could you try the approach 
> that maintains n->num_slabs and n->free_slabs? I guess that it would 
> be simpler than this patch so more maintainable.
> 

Ok, what do you think about the following?  I'm not sure it's that much
simpler.


mm, slab: track total number of slabs instead of active slabs

Rather than tracking the number of active slabs for each node, track the
total number of slabs.  This is a minor improvement that avoids active
slab tracking when a slab goes from free to partial or partial to free.

Suggested-by: Joonsoo Kim <iamjoonsoo@lge.com>
Signed-off-by: David Rientjes <rient...@google.com>
---
 mm/slab.c | 48 +---
 mm/slab.h |  4 ++--
 2 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -227,7 +227,7 @@ static void kmem_cache_node_init(struct kmem_cache_node 
*parent)
INIT_LIST_HEAD(&parent->slabs_full);
INIT_LIST_HEAD(&parent->slabs_partial);
INIT_LIST_HEAD(&parent->slabs_free);
-   parent->active_slabs = 0;
+   parent->total_slabs = 0;
parent->free_slabs = 0;
parent->shared = NULL;
parent->alien = NULL;
@@ -1381,20 +1381,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t 
gfpflags, int nodeid)
cachep->name, cachep->size, cachep->gfporder);
 
for_each_kmem_cache_node(cachep, node, n) {
-   unsigned long active_objs = 0, free_objs = 0;
-   unsigned long active_slabs, num_slabs;
+   unsigned long total_slabs, free_slabs, free_objs;
 
spin_lock_irqsave(&n->list_lock, flags);
-   active_slabs = n->active_slabs;
-   num_slabs = active_slabs + n->free_slabs;
-
-   active_objs += (num_slabs * cachep->num) - n->free_objects;
-   free_objs += n->free_objects;
+   total_slabs = n->total_slabs;
+   free_slabs = n->free_slabs;
+   free_objs = n->free_objects;
spin_unlock_irqrestore(&n->list_lock, flags);
 
-   pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
-   node, active_slabs, num_slabs, active_objs,
-   num_slabs * cachep->num, free_objs);
+   pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
+   node, total_slabs - free_slabs, total_slabs,
+   (total_slabs * cachep->num) - free_objs,
+   total_slabs * cachep->num);
}
 #endif
 }
@@ -2307,6 +2305,7 @@ static int drain_freelist(struct kmem_cache *cache,
page = list_entry(p, struct page, lru);
list_del(&page->lru);
n->free_slabs--;
+   n->total_slabs--;
/*
 * Safe to drop the lock. The slab is no longer linked
 * to the cache.
@@ -2741,13 +2740,12 @@ static void cache_grow_end(struct kmem_cache *cachep, 
struct page *page)
n = get_node(cachep, page_to_nid(page));
 
spin_lock(&n->list_lock);
+   n->total_slabs++;
if (!page->active) {
list_add_tail(&page->lru, &(n->slabs_free));
n->free_slabs++;
-   } else {
+   } else
fixup_slab_list(cachep, n, page, &list);
-   n->active_slabs++;
-   }
 
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
@@ -2935,10 +2933,8 @@ static struct page *get_first_slab(struct 
kmem_cache_node *n, bool pfmemalloc)
if (sk_memalloc_socks())
page = get_valid_first_slab(n, page, &page_is_free, pfmemalloc);
 
-   if (page && page_is_free) {
-  

RE: [patch] mm, slab: faster active and free stats

2016-12-02 Thread
Hello, David.

There is a problem with my e-mail client, so I have to use another one.
Please excuse the broken reply style.

Yeah, I like this version very much. Can we do the slabs_free accounting
directly in get_first_slab() and get_valid_first_slab()? Passing
page_is_free isn't needed if we do it directly in those functions.

One nitpick is that if we don't replace the variable name num_slabs with
total_slabs, we will get less churn in the code. However, total_slabs looks
better than num_slabs.

Thanks.

-----Original Message-----
From: David Rientjes [mailto:rient...@google.com] 
Sent: Wednesday, November 30, 2016 9:57 AM
To: Joonsoo Kim 
Cc: Andrew Morton ; Greg Thelen 
; Aruna Ramakrishna ; 
Christoph Lameter ; linux-kernel@vger.kernel.org; 
linux...@kvack.org
Subject: Re: [patch] mm, slab: faster active and free stats

On Mon, 28 Nov 2016, Joonsoo Kim wrote:

> Hello,
> 
> Sorry for long delay.
> I agree that this improvement is needed. Could you try the approach 
> that maintains n->num_slabs and n->free_slabs? I guess that it would 
> be simpler than this patch so more maintainable.
> 

Ok, what do you think about the following?  I'm not sure it's that much
simpler.


mm, slab: track total number of slabs instead of active slabs

Rather than tracking the number of active slabs for each node, track the
total number of slabs.  This is a minor improvement that avoids active
slab tracking when a slab goes from free to partial or partial to free.

Suggested-by: Joonsoo Kim 
Signed-off-by: David Rientjes 
---
 mm/slab.c | 48 +---
 mm/slab.h |  4 ++--
 2 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -227,7 +227,7 @@ static void kmem_cache_node_init(struct kmem_cache_node 
*parent)
INIT_LIST_HEAD(&parent->slabs_full);
INIT_LIST_HEAD(&parent->slabs_partial);
INIT_LIST_HEAD(&parent->slabs_free);
-   parent->active_slabs = 0;
+   parent->total_slabs = 0;
parent->free_slabs = 0;
parent->shared = NULL;
parent->alien = NULL;
@@ -1381,20 +1381,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t 
gfpflags, int nodeid)
cachep->name, cachep->size, cachep->gfporder);
 
for_each_kmem_cache_node(cachep, node, n) {
-   unsigned long active_objs = 0, free_objs = 0;
-   unsigned long active_slabs, num_slabs;
+   unsigned long total_slabs, free_slabs, free_objs;
 
spin_lock_irqsave(&n->list_lock, flags);
-   active_slabs = n->active_slabs;
-   num_slabs = active_slabs + n->free_slabs;
-
-   active_objs += (num_slabs * cachep->num) - n->free_objects;
-   free_objs += n->free_objects;
+   total_slabs = n->total_slabs;
+   free_slabs = n->free_slabs;
+   free_objs = n->free_objects;
spin_unlock_irqrestore(&n->list_lock, flags);
 
-   pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
-   node, active_slabs, num_slabs, active_objs,
-   num_slabs * cachep->num, free_objs);
+   pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
+   node, total_slabs - free_slabs, total_slabs,
+   (total_slabs * cachep->num) - free_objs,
+   total_slabs * cachep->num);
}
 #endif
 }
@@ -2307,6 +2305,7 @@ static int drain_freelist(struct kmem_cache *cache,
page = list_entry(p, struct page, lru);
list_del(&page->lru);
n->free_slabs--;
+   n->total_slabs--;
/*
 * Safe to drop the lock. The slab is no longer linked
 * to the cache.
@@ -2741,13 +2740,12 @@ static void cache_grow_end(struct kmem_cache *cachep, 
struct page *page)
n = get_node(cachep, page_to_nid(page));
 
spin_lock(&n->list_lock);
+   n->total_slabs++;
if (!page->active) {
list_add_tail(&page->lru, &(n->slabs_free));
n->free_slabs++;
-   } else {
+   } else
fixup_slab_list(cachep, n, page, &list);
-   n->active_slabs++;
-   }
 
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
@@ -2935,10 +2933,8 @@ static struct page *get_first_slab(struct 
kmem_cache_node *n, bool pfmemalloc)
if (sk_memalloc_socks())
page = get_valid_first_slab(n, page, &page_is_free, pfmemalloc);
 
-   if (page && page_is_free) {
-   n->active_slabs++;
+   if (page && page_is_free)
n->free_slabs--;
-   }
 
return page;
 }
@@ -3441,7 +3437,6 @@ static void free_block(struct kmem_cache 

Re: [patch] mm, slab: faster active and free stats

2016-11-29 Thread David Rientjes
On Mon, 28 Nov 2016, Joonsoo Kim wrote:

> Hello,
> 
> Sorry for long delay.
> I agree that this improvement is needed. Could you try the approach
> that maintains n->num_slabs and n->free_slabs? I guess that it would be
> simpler than this patch so more maintainable.
> 

Ok, what do you think about the following?  I'm not sure it's that much
simpler.


mm, slab: track total number of slabs instead of active slabs

Rather than tracking the number of active slabs for each node, track the
total number of slabs.  This is a minor improvement that avoids active
slab tracking when a slab goes from free to partial or partial to free.

Suggested-by: Joonsoo Kim 
Signed-off-by: David Rientjes 
---
 mm/slab.c | 48 +---
 mm/slab.h |  4 ++--
 2 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -227,7 +227,7 @@ static void kmem_cache_node_init(struct kmem_cache_node 
*parent)
INIT_LIST_HEAD(&parent->slabs_full);
INIT_LIST_HEAD(&parent->slabs_partial);
INIT_LIST_HEAD(&parent->slabs_free);
-   parent->active_slabs = 0;
+   parent->total_slabs = 0;
parent->free_slabs = 0;
parent->shared = NULL;
parent->alien = NULL;
@@ -1381,20 +1381,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t 
gfpflags, int nodeid)
cachep->name, cachep->size, cachep->gfporder);
 
for_each_kmem_cache_node(cachep, node, n) {
-   unsigned long active_objs = 0, free_objs = 0;
-   unsigned long active_slabs, num_slabs;
+   unsigned long total_slabs, free_slabs, free_objs;
 
spin_lock_irqsave(&n->list_lock, flags);
-   active_slabs = n->active_slabs;
-   num_slabs = active_slabs + n->free_slabs;
-
-   active_objs += (num_slabs * cachep->num) - n->free_objects;
-   free_objs += n->free_objects;
+   total_slabs = n->total_slabs;
+   free_slabs = n->free_slabs;
+   free_objs = n->free_objects;
spin_unlock_irqrestore(&n->list_lock, flags);
 
-   pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
-   node, active_slabs, num_slabs, active_objs,
-   num_slabs * cachep->num, free_objs);
+   pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
+   node, total_slabs - free_slabs, total_slabs,
+   (total_slabs * cachep->num) - free_objs,
+   total_slabs * cachep->num);
}
 #endif
 }
@@ -2307,6 +2305,7 @@ static int drain_freelist(struct kmem_cache *cache,
page = list_entry(p, struct page, lru);
list_del(&page->lru);
n->free_slabs--;
+   n->total_slabs--;
/*
 * Safe to drop the lock. The slab is no longer linked
 * to the cache.
@@ -2741,13 +2740,12 @@ static void cache_grow_end(struct kmem_cache *cachep, 
struct page *page)
n = get_node(cachep, page_to_nid(page));
 
spin_lock(&n->list_lock);
+   n->total_slabs++;
if (!page->active) {
list_add_tail(&page->lru, &(n->slabs_free));
n->free_slabs++;
-   } else {
+   } else
fixup_slab_list(cachep, n, page, &list);
-   n->active_slabs++;
-   }
 
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
@@ -2935,10 +2933,8 @@ static struct page *get_first_slab(struct 
kmem_cache_node *n, bool pfmemalloc)
if (sk_memalloc_socks())
page = get_valid_first_slab(n, page, &page_is_free, pfmemalloc);
 
-   if (page && page_is_free) {
-   n->active_slabs++;
+   if (page && page_is_free)
n->free_slabs--;
-   }
 
return page;
 }
@@ -3441,7 +3437,6 @@ static void free_block(struct kmem_cache *cachep, void 
**objpp,
if (page->active == 0) {
list_add(&page->lru, &n->slabs_free);
n->free_slabs++;
-   n->active_slabs--;
} else {
/* Unconditionally move a slab to the end of the
 * partial list on free - maximum time for the
@@ -3457,6 +3452,7 @@ static void free_block(struct kmem_cache *cachep, void 
**objpp,
page = list_last_entry(&n->slabs_free, struct page, lru);
list_move(&page->lru, list);
n->free_slabs--;
+   n->total_slabs--;
}
 }
 
@@ -4109,8 +4105,8 @@ static void cache_reap(struct work_struct *w)
 void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
 {
unsigned long active_objs, num_objs, active_slabs;
-   unsigned long num_slabs = 0, free_objs = 0, shared_avail = 0;
-   unsigned long num_slabs_free = 0;
+   

Re: [patch] mm, slab: faster active and free stats

2016-11-29 Thread David Rientjes
On Mon, 28 Nov 2016, Joonsoo Kim wrote:

> Hello,
> 
> Sorry for long delay.
> I agree that this improvement is needed. Could you try the approach
> that maintains n->num_slabs and n->free_slabs? I guess that it would be
> simpler than this patch so more maintainable.
> 

Ok, what do you think about the following?  I'm not sure it's that much
simpler.


mm, slab: track total number of slabs instead of active slabs

Rather than tracking the number of active slabs for each node, track the
total number of slabs.  This is a minor improvement that avoids active
slab tracking when a slab goes from free to partial or partial to free.

Suggested-by: Joonsoo Kim 
Signed-off-by: David Rientjes 
---
 mm/slab.c | 48 +---
 mm/slab.h |  4 ++--
 2 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -227,7 +227,7 @@ static void kmem_cache_node_init(struct kmem_cache_node 
*parent)
INIT_LIST_HEAD(&parent->slabs_full);
INIT_LIST_HEAD(&parent->slabs_partial);
INIT_LIST_HEAD(&parent->slabs_free);
-   parent->active_slabs = 0;
+   parent->total_slabs = 0;
parent->free_slabs = 0;
parent->shared = NULL;
parent->alien = NULL;
@@ -1381,20 +1381,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t 
gfpflags, int nodeid)
cachep->name, cachep->size, cachep->gfporder);
 
for_each_kmem_cache_node(cachep, node, n) {
-   unsigned long active_objs = 0, free_objs = 0;
-   unsigned long active_slabs, num_slabs;
+   unsigned long total_slabs, free_slabs, free_objs;
 
spin_lock_irqsave(&n->list_lock, flags);
-   active_slabs = n->active_slabs;
-   num_slabs = active_slabs + n->free_slabs;
-
-   active_objs += (num_slabs * cachep->num) - n->free_objects;
-   free_objs += n->free_objects;
+   total_slabs = n->total_slabs;
+   free_slabs = n->free_slabs;
+   free_objs = n->free_objects;
spin_unlock_irqrestore(&n->list_lock, flags);
 
-   pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
-   node, active_slabs, num_slabs, active_objs,
-   num_slabs * cachep->num, free_objs);
+   pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
+   node, total_slabs - free_slabs, total_slabs,
+   (total_slabs * cachep->num) - free_objs,
+   total_slabs * cachep->num);
}
 #endif
 }
@@ -2307,6 +2305,7 @@ static int drain_freelist(struct kmem_cache *cache,
page = list_entry(p, struct page, lru);
list_del(&page->lru);
n->free_slabs--;
+   n->total_slabs--;
/*
 * Safe to drop the lock. The slab is no longer linked
 * to the cache.
@@ -2741,13 +2740,12 @@ static void cache_grow_end(struct kmem_cache *cachep, 
struct page *page)
n = get_node(cachep, page_to_nid(page));
 
spin_lock(&n->list_lock);
+   n->total_slabs++;
if (!page->active) {
list_add_tail(&page->lru, &(n->slabs_free));
n->free_slabs++;
-   } else {
+   } else
fixup_slab_list(cachep, n, page, &list);
-   n->active_slabs++;
-   }
 
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
@@ -2935,10 +2933,8 @@ static struct page *get_first_slab(struct 
kmem_cache_node *n, bool pfmemalloc)
if (sk_memalloc_socks())
page = get_valid_first_slab(n, page, &page_is_free, pfmemalloc);
 
-   if (page && page_is_free) {
-   n->active_slabs++;
+   if (page && page_is_free)
n->free_slabs--;
-   }
 
return page;
 }
@@ -3441,7 +3437,6 @@ static void free_block(struct kmem_cache *cachep, void 
**objpp,
if (page->active == 0) {
list_add(&page->lru, &n->slabs_free);
n->free_slabs++;
-   n->active_slabs--;
} else {
/* Unconditionally move a slab to the end of the
 * partial list on free - maximum time for the
@@ -3457,6 +3452,7 @@ static void free_block(struct kmem_cache *cachep, void 
**objpp,
page = list_last_entry(&n->slabs_free, struct page, lru);
list_move(&page->lru, list);
n->free_slabs--;
+   n->total_slabs--;
}
 }
 
@@ -4109,8 +4105,8 @@ static void cache_reap(struct work_struct *w)
 void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
 {
unsigned long active_objs, num_objs, active_slabs;
-   unsigned long num_slabs = 0, free_objs = 0, shared_avail = 0;
-   unsigned long num_slabs_free = 0;
+   unsigned long total_slabs = 0, free_objs = 0, 

Re: [patch] mm, slab: faster active and free stats

2016-11-27 Thread Joonsoo Kim
On Fri, Nov 11, 2016 at 02:30:39AM -0800, David Rientjes wrote:
> On Fri, 11 Nov 2016, Joonsoo Kim wrote:
> 
> > Hello, David.
> > 
> > Maintaining acitve/free_slab counters looks so complex. And, I think
> > that we don't need to maintain these counters for faster slabinfo.
> > Key point is to remove iterating n->slabs_partial list.
> > 
> > We can calculate active slab/object by following equation as you did in
> > this patch.
> > 
> > active_slab(n) = n->num_slab - the number of free_slab
> > active_object(n) = n->num_slab * cachep->num - n->free_objects
> > 
> > To get the number of free_slab, we need to iterate n->slabs_free list
> > but I guess it would be small enough.
> > 
> > If you don't like to iterate n->slabs_free list in slabinfo, just
> > maintaining the number of slabs_free would be enough.
> > 
> 
> Hi Joonsoo,
> 
> It's a good point, although I don't think the patch has overly complex 
> logic to keep track of slab state.
> 
> We don't prefer to do any iteration in get_slabinfo() since users can 
> read /proc/slabinfo constantly; it's better to just settle the stats when 
> slab state changes instead of repeating an expensive operation over and 
> over if someone is running slabtop(1) or /proc/slabinfo is scraped 
> regularly for stats.
> 
> That said, I imagine there are more clever ways to arrive at the same 
> answer, and you bring up a good point about maintaining a n->num_slabs and 
> n->free_slabs rather than n->active_slabs and n->free_slabs.
> 
> I don't feel strongly about either approach, but I think some improvement, 
> such as what this patch provides, is needed to prevent how expensive 
> simply reading /proc/slabinfo can be.

Hello,

Sorry for the long delay.
I agree that this improvement is needed. Could you try the approach
that maintains n->num_slabs and n->free_slabs? I guess that it would be
simpler than this patch and so more maintainable.

Thanks.



Re: [patch] mm, slab: faster active and free stats

2016-11-27 Thread Joonsoo Kim
On Fri, Nov 11, 2016 at 02:30:39AM -0800, David Rientjes wrote:
> On Fri, 11 Nov 2016, Joonsoo Kim wrote:
> 
> > Hello, David.
> > 
> > Maintaining acitve/free_slab counters looks so complex. And, I think
> > that we don't need to maintain these counters for faster slabinfo.
> > Key point is to remove iterating n->slabs_partial list.
> > 
> > We can calculate active slab/object by following equation as you did in
> > this patch.
> > 
> > active_slab(n) = n->num_slab - the number of free_slab
> > active_object(n) = n->num_slab * cachep->num - n->free_objects
> > 
> > To get the number of free_slab, we need to iterate n->slabs_free list
> > but I guess it would be small enough.
> > 
> > If you don't like to iterate n->slabs_free list in slabinfo, just
> > maintaining the number of slabs_free would be enough.
> > 
> 
> Hi Joonsoo,
> 
> It's a good point, although I don't think the patch has overly complex 
> logic to keep track of slab state.
> 
> We don't prefer to do any iteration in get_slabinfo() since users can 
> read /proc/slabinfo constantly; it's better to just settle the stats when 
> slab state changes instead of repeating an expensive operation over and 
> over if someone is running slabtop(1) or /proc/slabinfo is scraped 
> regularly for stats.
> 
> That said, I imagine there are more clever ways to arrive at the same 
> answer, and you bring up a good point about maintaining a n->num_slabs and 
> n->free_slabs rather than n->active_slabs and n->free_slabs.
> 
> I don't feel strongly about either approach, but I think some improvement, 
> such as what this patch provides, is needed to prevent how expensive 
> simply reading /proc/slabinfo can be.

Hello,

Sorry for the long delay.
I agree that this improvement is needed. Could you try the approach
that maintains n->num_slabs and n->free_slabs? I guess that it would be
simpler than this patch and so more maintainable.

Thanks.



Re: [patch] mm, slab: faster active and free stats

2016-11-11 Thread David Rientjes
On Fri, 11 Nov 2016, Joonsoo Kim wrote:

> Hello, David.
> 
> Maintaining acitve/free_slab counters looks so complex. And, I think
> that we don't need to maintain these counters for faster slabinfo.
> Key point is to remove iterating n->slabs_partial list.
> 
> We can calculate active slab/object by following equation as you did in
> this patch.
> 
> active_slab(n) = n->num_slab - the number of free_slab
> active_object(n) = n->num_slab * cachep->num - n->free_objects
> 
> To get the number of free_slab, we need to iterate n->slabs_free list
> but I guess it would be small enough.
> 
> If you don't like to iterate n->slabs_free list in slabinfo, just
> maintaining the number of slabs_free would be enough.
> 

Hi Joonsoo,

It's a good point, although I don't think the patch has overly complex 
logic to keep track of slab state.

We prefer not to do any iteration in get_slabinfo() since users can 
read /proc/slabinfo constantly; it's better to just settle the stats when 
slab state changes instead of repeating an expensive operation over and 
over if someone is running slabtop(1) or /proc/slabinfo is scraped 
regularly for stats.

That said, I imagine there are more clever ways to arrive at the same 
answer, and you bring up a good point about maintaining a n->num_slabs and 
n->free_slabs rather than n->active_slabs and n->free_slabs.

I don't feel strongly about either approach, but I think some improvement, 
such as what this patch provides, is needed to reduce how expensive 
simply reading /proc/slabinfo can be.


Re: [patch] mm, slab: faster active and free stats

2016-11-11 Thread David Rientjes
On Fri, 11 Nov 2016, Joonsoo Kim wrote:

> Hello, David.
> 
> Maintaining acitve/free_slab counters looks so complex. And, I think
> that we don't need to maintain these counters for faster slabinfo.
> Key point is to remove iterating n->slabs_partial list.
> 
> We can calculate active slab/object by following equation as you did in
> this patch.
> 
> active_slab(n) = n->num_slab - the number of free_slab
> active_object(n) = n->num_slab * cachep->num - n->free_objects
> 
> To get the number of free_slab, we need to iterate n->slabs_free list
> but I guess it would be small enough.
> 
> If you don't like to iterate n->slabs_free list in slabinfo, just
> maintaining the number of slabs_free would be enough.
> 

Hi Joonsoo,

It's a good point, although I don't think the patch has overly complex 
logic to keep track of slab state.

We prefer not to do any iteration in get_slabinfo() since users can 
read /proc/slabinfo constantly; it's better to just settle the stats when 
slab state changes instead of repeating an expensive operation over and 
over if someone is running slabtop(1) or /proc/slabinfo is scraped 
regularly for stats.

That said, I imagine there are more clever ways to arrive at the same 
answer, and you bring up a good point about maintaining a n->num_slabs and 
n->free_slabs rather than n->active_slabs and n->free_slabs.

I don't feel strongly about either approach, but I think some improvement, 
such as what this patch provides, is needed to reduce how expensive 
simply reading /proc/slabinfo can be.


Re: [patch] mm, slab: faster active and free stats

2016-11-10 Thread Joonsoo Kim
On Wed, Nov 09, 2016 at 04:38:08PM -0800, David Rientjes wrote:
> On Tue, 8 Nov 2016, Andrew Morton wrote:
> 
> > > Reading /proc/slabinfo or monitoring slabtop(1) can become very expensive
> > > if there are many slab caches and if there are very lengthy per-node
> > > partial and/or free lists.
> > > 
> > > Commit 07a63c41fa1f ("mm/slab: improve performance of gathering slabinfo
> > > stats") addressed the per-node full lists which showed a significant
> > > improvement when no objects were freed.  This patch has the same
> > > motivation and optimizes the remainder of the usecases where there are
> > > very lengthy partial and free lists.
> > > 
> > > This patch maintains per-node active_slabs (full and partial) and
> > > free_slabs rather than iterating the lists at runtime when reading
> > > /proc/slabinfo.
> > 
> > Are there any nice numbers you can share?
> > 
> 
> Yes, please add this to the description:
> 
> 
> When allocating 100GB of slab from a test cache where every slab page is
> on the partial list, reading /proc/slabinfo (includes all other slab
> caches on the system) takes ~247ms on average with 48 samples.
> 
> As a result of this patch, the same read takes ~0.856ms on average.

Hello, David.

Maintaining active/free_slab counters looks so complex. And, I think
that we don't need to maintain these counters for faster slabinfo.
Key point is to remove iterating n->slabs_partial list.

We can calculate the active slabs/objects with the following equations, as
you did in this patch.

active_slab(n) = n->num_slab - the number of free_slab
active_object(n) = n->num_slab * cachep->num - n->free_objects

To get the number of free_slab, we need to iterate n->slabs_free list
but I guess it would be small enough.

If you don't like to iterate n->slabs_free list in slabinfo, just
maintaining the number of slabs_free would be enough.

Thanks.


Re: [patch] mm, slab: faster active and free stats

2016-11-10 Thread Joonsoo Kim
On Wed, Nov 09, 2016 at 04:38:08PM -0800, David Rientjes wrote:
> On Tue, 8 Nov 2016, Andrew Morton wrote:
> 
> > > Reading /proc/slabinfo or monitoring slabtop(1) can become very expensive
> > > if there are many slab caches and if there are very lengthy per-node
> > > partial and/or free lists.
> > > 
> > > Commit 07a63c41fa1f ("mm/slab: improve performance of gathering slabinfo
> > > stats") addressed the per-node full lists which showed a significant
> > > improvement when no objects were freed.  This patch has the same
> > > motivation and optimizes the remainder of the usecases where there are
> > > very lengthy partial and free lists.
> > > 
> > > This patch maintains per-node active_slabs (full and partial) and
> > > free_slabs rather than iterating the lists at runtime when reading
> > > /proc/slabinfo.
> > 
> > Are there any nice numbers you can share?
> > 
> 
> Yes, please add this to the description:
> 
> 
> When allocating 100GB of slab from a test cache where every slab page is
> on the partial list, reading /proc/slabinfo (includes all other slab
> caches on the system) takes ~247ms on average with 48 samples.
> 
> As a result of this patch, the same read takes ~0.856ms on average.

Hello, David.

Maintaining active/free_slab counters looks so complex. And, I think
that we don't need to maintain these counters for faster slabinfo.
Key point is to remove iterating n->slabs_partial list.

We can calculate the active slabs/objects with the following equations, as
you did in this patch.

active_slab(n) = n->num_slab - the number of free_slab
active_object(n) = n->num_slab * cachep->num - n->free_objects

To get the number of free_slab, we need to iterate n->slabs_free list
but I guess it would be small enough.

If you don't like to iterate n->slabs_free list in slabinfo, just
maintaining the number of slabs_free would be enough.

Thanks.


Re: [patch] mm, slab: faster active and free stats

2016-11-09 Thread David Rientjes
On Tue, 8 Nov 2016, Andrew Morton wrote:

> > Reading /proc/slabinfo or monitoring slabtop(1) can become very expensive
> > if there are many slab caches and if there are very lengthy per-node
> > partial and/or free lists.
> > 
> > Commit 07a63c41fa1f ("mm/slab: improve performance of gathering slabinfo
> > stats") addressed the per-node full lists which showed a significant
> > improvement when no objects were freed.  This patch has the same
> > motivation and optimizes the remainder of the usecases where there are
> > very lengthy partial and free lists.
> > 
> > This patch maintains per-node active_slabs (full and partial) and
> > free_slabs rather than iterating the lists at runtime when reading
> > /proc/slabinfo.
> 
> Are there any nice numbers you can share?
> 

Yes, please add this to the description:


When allocating 100GB of slab from a test cache where every slab page is
on the partial list, reading /proc/slabinfo (includes all other slab
caches on the system) takes ~247ms on average with 48 samples.

As a result of this patch, the same read takes ~0.856ms on average.


Re: [patch] mm, slab: faster active and free stats

2016-11-09 Thread David Rientjes
On Tue, 8 Nov 2016, Andrew Morton wrote:

> > Reading /proc/slabinfo or monitoring slabtop(1) can become very expensive
> > if there are many slab caches and if there are very lengthy per-node
> > partial and/or free lists.
> > 
> > Commit 07a63c41fa1f ("mm/slab: improve performance of gathering slabinfo
> > stats") addressed the per-node full lists which showed a significant
> > improvement when no objects were freed.  This patch has the same
> > motivation and optimizes the remainder of the usecases where there are
> > very lengthy partial and free lists.
> > 
> > This patch maintains per-node active_slabs (full and partial) and
> > free_slabs rather than iterating the lists at runtime when reading
> > /proc/slabinfo.
> 
> Are there any nice numbers you can share?
> 

Yes, please add this to the description:


When allocating 100GB of slab from a test cache where every slab page is
on the partial list, reading /proc/slabinfo (includes all other slab
caches on the system) takes ~247ms on average with 48 samples.

As a result of this patch, the same read takes ~0.856ms on average.


Re: [patch] mm, slab: faster active and free stats

2016-11-08 Thread Andrew Morton
On Tue, 8 Nov 2016 15:06:45 -0800 (PST) David Rientjes  
wrote:

> Reading /proc/slabinfo or monitoring slabtop(1) can become very expensive
> if there are many slab caches and if there are very lengthy per-node
> partial and/or free lists.
> 
> Commit 07a63c41fa1f ("mm/slab: improve performance of gathering slabinfo
> stats") addressed the per-node full lists which showed a significant
> improvement when no objects were freed.  This patch has the same
> motivation and optimizes the remainder of the usecases where there are
> very lengthy partial and free lists.
> 
> This patch maintains per-node active_slabs (full and partial) and
> free_slabs rather than iterating the lists at runtime when reading
> /proc/slabinfo.

Are there any nice numbers you can share?


Re: [patch] mm, slab: faster active and free stats

2016-11-08 Thread Andrew Morton
On Tue, 8 Nov 2016 15:06:45 -0800 (PST) David Rientjes  
wrote:

> Reading /proc/slabinfo or monitoring slabtop(1) can become very expensive
> if there are many slab caches and if there are very lengthy per-node
> partial and/or free lists.
> 
> Commit 07a63c41fa1f ("mm/slab: improve performance of gathering slabinfo
> stats") addressed the per-node full lists which showed a significant
> improvement when no objects were freed.  This patch has the same
> motivation and optimizes the remainder of the usecases where there are
> very lengthy partial and free lists.
> 
> This patch maintains per-node active_slabs (full and partial) and
> free_slabs rather than iterating the lists at runtime when reading
> /proc/slabinfo.

Are there any nice numbers you can share?


[patch] mm, slab: faster active and free stats

2016-11-08 Thread David Rientjes
From: Greg Thelen 

Reading /proc/slabinfo or monitoring slabtop(1) can become very expensive
if there are many slab caches and if there are very lengthy per-node
partial and/or free lists.

Commit 07a63c41fa1f ("mm/slab: improve performance of gathering slabinfo
stats") addressed the per-node full lists which showed a significant
improvement when no objects were freed.  This patch has the same
motivation and optimizes the remainder of the usecases where there are
very lengthy partial and free lists.

This patch maintains per-node active_slabs (full and partial) and
free_slabs rather than iterating the lists at runtime when reading
/proc/slabinfo.

[rient...@google.com: changelog]
Signed-off-by: Greg Thelen 
Signed-off-by: David Rientjes 
---
 mm/slab.c | 117 +-
 mm/slab.h |   3 +-
 2 files changed, 49 insertions(+), 71 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -227,13 +227,14 @@ static void kmem_cache_node_init(struct kmem_cache_node 
*parent)
INIT_LIST_HEAD(&parent->slabs_full);
INIT_LIST_HEAD(&parent->slabs_partial);
INIT_LIST_HEAD(&parent->slabs_free);
+   parent->active_slabs = 0;
+   parent->free_slabs = 0;
parent->shared = NULL;
parent->alien = NULL;
parent->colour_next = 0;
spin_lock_init(&parent->list_lock);
parent->free_objects = 0;
parent->free_touched = 0;
-   parent->num_slabs = 0;
 }
 
 #define MAKE_LIST(cachep, listp, slab, nodeid) \
@@ -1366,7 +1367,6 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t 
gfpflags, int nodeid)
 {
 #if DEBUG
struct kmem_cache_node *n;
-   struct page *page;
unsigned long flags;
int node;
static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -1381,32 +1381,20 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t 
gfpflags, int nodeid)
cachep->name, cachep->size, cachep->gfporder);
 
for_each_kmem_cache_node(cachep, node, n) {
-   unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
-   unsigned long active_slabs = 0, num_slabs = 0;
-   unsigned long num_slabs_partial = 0, num_slabs_free = 0;
-   unsigned long num_slabs_full;
+   unsigned long active_objs = 0, free_objs = 0;
+   unsigned long active_slabs, num_slabs;
 
spin_lock_irqsave(&n->list_lock, flags);
-   num_slabs = n->num_slabs;
-   list_for_each_entry(page, &n->slabs_partial, lru) {
-   active_objs += page->active;
-   num_slabs_partial++;
-   }
-   list_for_each_entry(page, &n->slabs_free, lru)
-   num_slabs_free++;
+   active_slabs = n->active_slabs;
+   num_slabs = active_slabs + n->free_slabs;
 
-   free_objects += n->free_objects;
+   active_objs += (num_slabs * cachep->num) - n->free_objects;
+   free_objs += n->free_objects;
spin_unlock_irqrestore(&n->list_lock, flags);
 
-   num_objs = num_slabs * cachep->num;
-   active_slabs = num_slabs - num_slabs_free;
-   num_slabs_full = num_slabs -
-   (num_slabs_partial + num_slabs_free);
-   active_objs += (num_slabs_full * cachep->num);
-
pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
-   node, active_slabs, num_slabs, active_objs, num_objs,
-   free_objects);
+   node, active_slabs, num_slabs, active_objs,
+   num_slabs * cachep->num, free_objs);
}
 #endif
 }
@@ -2318,7 +2306,7 @@ static int drain_freelist(struct kmem_cache *cache,
 
page = list_entry(p, struct page, lru);
list_del(&page->lru);
-   n->num_slabs--;
+   n->free_slabs--;
/*
 * Safe to drop the lock. The slab is no longer linked
 * to the cache.
@@ -2753,12 +2741,14 @@ static void cache_grow_end(struct kmem_cache *cachep, 
struct page *page)
n = get_node(cachep, page_to_nid(page));
 
spin_lock(&n->list_lock);
-   if (!page->active)
+   if (!page->active) {
list_add_tail(&page->lru, &(n->slabs_free));
-   else
+   n->free_slabs++;
+   } else {
fixup_slab_list(cachep, n, page, &list);
+   n->active_slabs++;
+   }
 
-   n->num_slabs++;
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
spin_unlock(&n->list_lock);
@@ -2884,7 +2874,7 @@ static inline void fixup_slab_list(struct kmem_cache 
*cachep,
 
 /* Try to find non-pfmemalloc slab if needed */
 static noinline struct page *get_valid_first_slab(struct 

[patch] mm, slab: faster active and free stats

2016-11-08 Thread David Rientjes
From: Greg Thelen <gthe...@google.com>

Reading /proc/slabinfo or monitoring slabtop(1) can become very expensive
if there are many slab caches and if there are very lengthy per-node
partial and/or free lists.

Commit 07a63c41fa1f ("mm/slab: improve performance of gathering slabinfo
stats") addressed the per-node full lists which showed a significant
improvement when no objects were freed.  This patch has the same
motivation and optimizes the remainder of the use cases where there are
very lengthy partial and free lists.

This patch maintains per-node active_slabs (full and partial) and
free_slabs rather than iterating the lists at runtime when reading
/proc/slabinfo.

[rient...@google.com: changelog]
Signed-off-by: Greg Thelen <gthe...@google.com>
Signed-off-by: David Rientjes <rient...@google.com>
---
 mm/slab.c | 117 +-
 mm/slab.h |   3 +-
 2 files changed, 49 insertions(+), 71 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -227,13 +227,14 @@ static void kmem_cache_node_init(struct kmem_cache_node 
*parent)
INIT_LIST_HEAD(&parent->slabs_full);
INIT_LIST_HEAD(&parent->slabs_partial);
INIT_LIST_HEAD(&parent->slabs_free);
+   parent->active_slabs = 0;
+   parent->free_slabs = 0;
parent->shared = NULL;
parent->alien = NULL;
parent->colour_next = 0;
spin_lock_init(&parent->list_lock);
parent->free_objects = 0;
parent->free_touched = 0;
-   parent->num_slabs = 0;
 }
 
 #define MAKE_LIST(cachep, listp, slab, nodeid) \
@@ -1366,7 +1367,6 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t 
gfpflags, int nodeid)
 {
 #if DEBUG
struct kmem_cache_node *n;
-   struct page *page;
unsigned long flags;
int node;
static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -1381,32 +1381,20 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t 
gfpflags, int nodeid)
cachep->name, cachep->size, cachep->gfporder);
 
for_each_kmem_cache_node(cachep, node, n) {
-   unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
-   unsigned long active_slabs = 0, num_slabs = 0;
-   unsigned long num_slabs_partial = 0, num_slabs_free = 0;
-   unsigned long num_slabs_full;
+   unsigned long active_objs = 0, free_objs = 0;
+   unsigned long active_slabs, num_slabs;
 
spin_lock_irqsave(&n->list_lock, flags);
-   num_slabs = n->num_slabs;
-   list_for_each_entry(page, &n->slabs_partial, lru) {
-   active_objs += page->active;
-   num_slabs_partial++;
-   }
-   list_for_each_entry(page, &n->slabs_free, lru)
-   num_slabs_free++;
+   active_slabs = n->active_slabs;
+   num_slabs = active_slabs + n->free_slabs;
 
-   free_objects += n->free_objects;
+   active_objs += (num_slabs * cachep->num) - n->free_objects;
+   free_objs += n->free_objects;
spin_unlock_irqrestore(&n->list_lock, flags);
 
-   num_objs = num_slabs * cachep->num;
-   active_slabs = num_slabs - num_slabs_free;
-   num_slabs_full = num_slabs -
-   (num_slabs_partial + num_slabs_free);
-   active_objs += (num_slabs_full * cachep->num);
-
pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
-   node, active_slabs, num_slabs, active_objs, num_objs,
-   free_objects);
+   node, active_slabs, num_slabs, active_objs,
+   num_slabs * cachep->num, free_objs);
}
 #endif
 }
@@ -2318,7 +2306,7 @@ static int drain_freelist(struct kmem_cache *cache,
 
page = list_entry(p, struct page, lru);
list_del(&page->lru);
-   n->num_slabs--;
+   n->free_slabs--;
/*
 * Safe to drop the lock. The slab is no longer linked
 * to the cache.
@@ -2753,12 +2741,14 @@ static void cache_grow_end(struct kmem_cache *cachep, 
struct page *page)
n = get_node(cachep, page_to_nid(page));
 
spin_lock(&n->list_lock);
-   if (!page->active)
+   if (!page->active) {
list_add_tail(&page->lru, &(n->slabs_free));
-   else
+   n->free_slabs++;
+   } else {
fixup_slab_list(cachep, n, page, &list);
+   n->active_slabs++;
+   }
 
-   n->num_slabs++;
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
spin_unlock(&n->list_lock);
@@ -2884,7 +2874,7 @@ static inline void fixup_slab_list(struct kmem_cache 
*cachep,
 
 /* Try to find non-pfmemalloc slab if needed */
 static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
-