Re: [PATCH 02/29] mm: slab allocation fairness

2007-02-21 Thread Pekka Enberg

On 2/21/07, Peter Zijlstra <[EMAIL PROTECTED]> wrote:

[AIM9 results go here]


Yes please. I would really like to know what we gain by making the
slab even more complex.


[PATCH 02/29] mm: slab allocation fairness

2007-02-21 Thread Peter Zijlstra
The slab allocator has some unfairness with respect to gfp flags: when the
slab cache is grown, the gfp flags are used to allocate more memory; however,
when there is slab cache available (in partial or free slabs, per-cpu caches
or otherwise), the gfp flags are ignored.

Thus it is possible for less critical slab allocations to succeed and gobble
up precious memory when under memory pressure.

This patch solves that by using the newly introduced page allocation rank.

Page allocation rank is a scalar quantity connecting ALLOC_ and gfp flags that
represents how deep we had to reach into our reserves when allocating a page.
Rank 0 is the deepest we can reach (ALLOC_NO_WATERMARKS) and 16 is the
shallowest allocation possible (ALLOC_WMARK_HIGH).
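
For reference, the rank mapping itself comes from the companion "page
allocation rank" patch, which is not part of this mail. The sketch below only
illustrates that mapping; the helper name and the ALLOC_* values shown are
assumptions reconstructed from the description above, not a verbatim copy of
that patch.

/* Illustrative only -- names and values are assumptions, see above. */
#define ALLOC_HARDER		0x01	/* try to allocate harder */
#define ALLOC_HIGH		0x02	/* __GFP_HIGH set */
#define ALLOC_WMARK_MIN		0x04	/* use pages_min watermark */
#define ALLOC_WMARK_LOW		0x08	/* use pages_low watermark */
#define ALLOC_WMARK_HIGH	0x10	/* use pages_high watermark */
#define ALLOC_NO_WATERMARKS	0x20	/* ignore watermarks entirely */

#define MAX_ALLOC_RANK		16	/* ALLOC_WMARK_HIGH, nothing subtracted */

static inline int alloc_flags_to_rank(int alloc_flags)
{
	if (alloc_flags & ALLOC_NO_WATERMARKS)
		return 0;		/* deepest possible reach */

	/*
	 * Start from the watermark that was honoured and subtract credit
	 * for flags that allow digging below it.
	 */
	return (alloc_flags & (ALLOC_WMARK_MIN|ALLOC_WMARK_LOW|ALLOC_WMARK_HIGH))
	     - (alloc_flags & (ALLOC_HARDER|ALLOC_HIGH));
}

With these illustrative values, an allocation that is only allowed to use the
pages_min watermark (as in the GFP_NOFS|__GFP_NOMEMALLOC example further down)
lands at rank 4, and an ALLOC_WMARK_HIGH allocation lands at rank 16, matching
the bounds given above.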

When the slab space is grown, the rank of the page allocation is stored. For
each slab allocation we test the given gfp flags against this rank, thereby
asking the question: would these flags have allowed the slab to grow?

If not, we need to test the current situation. This is done by forcing the
slab space to grow. (Just testing the free page limits will not work due to
direct reclaim.) If this forced growth fails, we fail the slab allocation.
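
In pseudo-C, the allocation-time check described above behaves roughly as
follows. This is a sketch of the logic, not the actual hunk (which is quoted,
and truncated, further down); gfp_to_rank(), cache_force_grow() and
take_cached_object() are hypothetical helpers used only for illustration.

void *sketch_slab_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	int rank = gfp_to_rank(flags);	/* rank these flags would allow */

	if (rank > cachep->rank) {
		/*
		 * These flags would not have been allowed to grow the slab
		 * as deep as it was last grown; probe the current memory
		 * pressure by forcing a grow with the given flags.
		 */
		if (!cache_force_grow(cachep, flags))
			return NULL;	/* preserve the cached objects for
					 * more critical allocations */
	}
	return take_cached_object(cachep, flags);
}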

Thus, if we grew the slab under great duress while PF_MEMALLOC was set and we
really did dip into the memalloc reserve, the rank would be set to 0. If the
next allocation to that slab were GFP_NOFS|__GFP_NOMEMALLOC (which ordinarily
maps to rank 4, and is always > 0), we would want to make sure that memory
pressure has decreased enough to allow an allocation with those gfp flags.

So in this case we try to force-grow the slab cache, and on failure we fail
the slab allocation, thus preserving the available slab cache for more
pressing allocations.

If this newly allocated slab is trimmed on the next kmem_cache_free (not
unlikely), that is no problem, since 1) it will free memory and 2) the sole
purpose of the allocation was to probe the allocation rank; we did not need
the space itself.

[AIM9 results go here]

Signed-off-by: Peter Zijlstra <[EMAIL PROTECTED]>
---
 mm/Kconfig |3 ++
 mm/slab.c  |   81 -
 2 files changed, 57 insertions(+), 27 deletions(-)

Index: linux-2.6/mm/slab.c
===
--- linux-2.6.orig/mm/slab.c
+++ linux-2.6/mm/slab.c
@@ -114,6 +114,7 @@
 #include   <asm/cacheflush.h>
 #include   <asm/tlbflush.h>
 #include   <asm/page.h>
+#include   "internal.h"
 
 /*
  * DEBUG   - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
@@ -380,6 +381,7 @@ static void kmem_list3_init(struct kmem_
 
 struct kmem_cache {
 /* 1) per-cpu data, touched during every alloc/free */
+   int rank;
struct array_cache *array[NR_CPUS];
 /* 2) Cache tunables. Protected by cache_chain_mutex */
unsigned int batchcount;
@@ -1023,21 +1025,21 @@ static inline int cache_free_alien(struc
 }
 
 static inline void *alternate_node_alloc(struct kmem_cache *cachep,
-   gfp_t flags)
+   gfp_t flags, int rank)
 {
return NULL;
 }
 
 static inline void *cache_alloc_node(struct kmem_cache *cachep,
-gfp_t flags, int nodeid)
+gfp_t flags, int nodeid, int rank)
 {
return NULL;
 }
 
 #else  /* CONFIG_NUMA */
 
-static void *cache_alloc_node(struct kmem_cache *, gfp_t, int);
-static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
+static void *cache_alloc_node(struct kmem_cache *, gfp_t, int, int);
+static void *alternate_node_alloc(struct kmem_cache *, gfp_t, int);
 
 static struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -1639,6 +1641,7 @@ static void *kmem_getpages(struct kmem_c
if (!page)
return NULL;
 
+   cachep->rank = page->index;
nr_pages = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
add_zone_page_state(page_zone(page),
@@ -2287,6 +2290,7 @@ kmem_cache_create (const char *name, siz
}
 #endif
 #endif
+   cachep->rank = MAX_ALLOC_RANK;
 
/*
 * Determine if the slab management is 'on' or 'off' slab.
@@ -2953,7 +2957,7 @@ bad:
 #define check_slabp(x,y) do { } while(0)
 #endif
 
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
+static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, int rank)
 {
int batchcount;
struct kmem_list3 *l3;
@@ -2965,6 +2969,8 @@ static void *cache_alloc_refill(struct k
check_irq_off();
ac = cpu_cache_get(cachep);
 retry:
+   if (unlikely(rank > cachep->rank))
+   goto force_grow;
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
/*
@@ -3020,14 +3026,16 @@ must_grow:
l3->free_objects -= ac->avail;
 alloc_done:
	spin_unlock(&l3->list_lock);
-
if (unlikely(!ac->avail)) {
int x;
+force_grow:
x = 
