[PATCH 01/29] mm: page allocation rank

2007-02-21 Thread Peter Zijlstra
Introduce page allocation rank.

This allocation rank is a measure of the 'hardness' of the page allocation.
Where hardness refers to how deep we have to reach (and thereby if reclaim 
was activated) to obtain the page.

It basically is a mapping from the ALLOC_/gfp flags into a scalar quantity,
which allows for comparisons of the kind: 
  'would this allocation have succeeded using these gfp flags'.

For the gfp -> alloc_flags mapping we use the 'hardest' possible, those
used by __alloc_pages() right before going into direct reclaim.

The alloc_flags -> rank mapping is given by: 2*2^wmark - harder - 2*high
where wmark = { min = 1, low, high } and harder, high are booleans.
This gives:
  0 is the hardest possible allocation - ALLOC_NO_WATERMARKS,
  1 is ALLOC_WMARK_MIN|ALLOC_HARDER|ALLOC_HIGH,
  ...
  15 is ALLOC_WMARK_HIGH|ALLOC_HARDER,
  16 is the softest allocation - ALLOC_WMARK_HIGH.

Rank <= 4 will have woken up kswapd and, when also > 0, might have run into
direct reclaim.

Rank > 8 rarely happens and means lots of memory free (due to parallel oom 
kill).

The allocation rank is stored in page->index for successful allocations.

'offline' testing of the rank is made impossible by direct reclaim and
fragmentation issues. That is, it is impossible to tell if a given allocation
will succeed without actually doing it.

The purpose of this measure is to introduce some fairness into the slab
allocator.

Signed-off-by: Peter Zijlstra <[EMAIL PROTECTED]>
---
 mm/internal.h   |   89 
 mm/page_alloc.c |   58 ++--
 2 files changed, 106 insertions(+), 41 deletions(-)

Index: linux-2.6-git/mm/internal.h
===
--- linux-2.6-git.orig/mm/internal.h2007-01-08 11:53:13.0 +0100
+++ linux-2.6-git/mm/internal.h 2007-01-09 11:29:18.0 +0100
@@ -12,6 +12,7 @@
 #define __MM_INTERNAL_H
 
 #include <linux/mm.h>
+#include <linux/hardirq.h>
 
 static inline void set_page_count(struct page *page, int v)
 {
@@ -37,4 +38,92 @@ static inline void __put_page(struct pag
 extern void fastcall __init __free_pages_bootmem(struct page *page,
unsigned int order);
 
+#define ALLOC_HARDER   0x01 /* try to alloc harder */
+#define ALLOC_HIGH 0x02 /* __GFP_HIGH set */
+#define ALLOC_WMARK_MIN 0x04 /* use pages_min watermark */
+#define ALLOC_WMARK_LOW 0x08 /* use pages_low watermark */
+#define ALLOC_WMARK_HIGH   0x10 /* use pages_high watermark */
+#define ALLOC_NO_WATERMARKS 0x20 /* don't check watermarks at all */
+#define ALLOC_CPUSET   0x40 /* check for correct cpuset */
+
+/*
+ * get the deepest reaching allocation flags for the given gfp_mask
+ */
+static int inline gfp_to_alloc_flags(gfp_t gfp_mask)
+{
+   struct task_struct *p = current;
+   int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+   const gfp_t wait = gfp_mask & __GFP_WAIT;
+
+   /*
+* The caller may dip into page reserves a bit more if the caller
+* cannot run direct reclaim, or if the caller has realtime scheduling
+* policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
+* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+*/
+   if (gfp_mask & __GFP_HIGH)
+   alloc_flags |= ALLOC_HIGH;
+
+   if (!wait) {
+   alloc_flags |= ALLOC_HARDER;
+   /*
+* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+*/
+   alloc_flags &= ~ALLOC_CPUSET;
+   } else if (unlikely(rt_task(p)) && !in_interrupt())
+   alloc_flags |= ALLOC_HARDER;
+
+   if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
+   if (!in_interrupt() &&
+   ((p->flags & PF_MEMALLOC) ||
+unlikely(test_thread_flag(TIF_MEMDIE))))
+   alloc_flags |= ALLOC_NO_WATERMARKS;
+   }
+
+   return alloc_flags;
+}
+
+#define MAX_ALLOC_RANK 16
+
+/*
+ * classify the allocation: 0 is hardest, 16 is easiest.
+ */
+static inline int alloc_flags_to_rank(int alloc_flags)
+{
+   int rank;
+
+   if (alloc_flags & ALLOC_NO_WATERMARKS)
+   return 0;
+
+   rank = alloc_flags & (ALLOC_WMARK_MIN|ALLOC_WMARK_LOW|ALLOC_WMARK_HIGH);
+   rank -= alloc_flags & (ALLOC_HARDER|ALLOC_HIGH);
+
+   return rank;
+}
+
+static inline int gfp_to_rank(gfp_t gfp_mask)
+{
+   /*
+* Although correct this full version takes a ~3% performance
+* hit on the network tests in aim9.
+*
+
+   return alloc_flags_to_rank(gfp_to_alloc_flags(gfp_mask));
+
+*
+* Just check the bare essential ALLOC_NO_WATERMARKS case this keeps
+* the aim9 results within the error margin.
+*/
+
+   if 

[PATCH 01/29] mm: page allocation rank

2007-02-21 Thread Peter Zijlstra
Introduce page allocation rank.

This allocation rank is a measure of the 'hardness' of the page allocation.
Where hardness refers to how deep we have to reach (and thereby if reclaim 
was activated) to obtain the page.

It basically is a mapping from the ALLOC_/gfp flags into a scalar quantity,
which allows for comparisons of the kind: 
  'would this allocation have succeeded using these gfp flags'.

For the gfp -> alloc_flags mapping we use the 'hardest' possible, those
used by __alloc_pages() right before going into direct reclaim.

The alloc_flags -> rank mapping is given by: 2*2^wmark - harder - 2*high
where wmark = { min = 1, low, high } and harder, high are booleans.
This gives:
  0 is the hardest possible allocation - ALLOC_NO_WATERMARKS,
  1 is ALLOC_WMARK_MIN|ALLOC_HARDER|ALLOC_HIGH,
  ...
  15 is ALLOC_WMARK_HIGH|ALLOC_HARDER,
  16 is the softest allocation - ALLOC_WMARK_HIGH.

Rank <= 4 will have woken up kswapd and, when also > 0, might have run into
direct reclaim.

Rank > 8 rarely happens and means lots of memory free (due to parallel oom 
kill).

The allocation rank is stored in page->index for successful allocations.

'offline' testing of the rank is made impossible by direct reclaim and
fragmentation issues. That is, it is impossible to tell if a given allocation
will succeed without actually doing it.

The purpose of this measure is to introduce some fairness into the slab
allocator.

Signed-off-by: Peter Zijlstra <[EMAIL PROTECTED]>
---
 mm/internal.h   |   89 
 mm/page_alloc.c |   58 ++--
 2 files changed, 106 insertions(+), 41 deletions(-)

Index: linux-2.6-git/mm/internal.h
===
--- linux-2.6-git.orig/mm/internal.h2007-01-08 11:53:13.0 +0100
+++ linux-2.6-git/mm/internal.h 2007-01-09 11:29:18.0 +0100
@@ -12,6 +12,7 @@
 #define __MM_INTERNAL_H
 
 #include <linux/mm.h>
+#include <linux/hardirq.h>
 
 static inline void set_page_count(struct page *page, int v)
 {
@@ -37,4 +38,92 @@ static inline void __put_page(struct pag
 extern void fastcall __init __free_pages_bootmem(struct page *page,
unsigned int order);
 
+#define ALLOC_HARDER   0x01 /* try to alloc harder */
+#define ALLOC_HIGH 0x02 /* __GFP_HIGH set */
+#define ALLOC_WMARK_MIN 0x04 /* use pages_min watermark */
+#define ALLOC_WMARK_LOW 0x08 /* use pages_low watermark */
+#define ALLOC_WMARK_HIGH   0x10 /* use pages_high watermark */
+#define ALLOC_NO_WATERMARKS 0x20 /* don't check watermarks at all */
+#define ALLOC_CPUSET   0x40 /* check for correct cpuset */
+
+/*
+ * get the deepest reaching allocation flags for the given gfp_mask
+ */
+static int inline gfp_to_alloc_flags(gfp_t gfp_mask)
+{
+   struct task_struct *p = current;
+   int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+   const gfp_t wait = gfp_mask & __GFP_WAIT;
+
+   /*
+* The caller may dip into page reserves a bit more if the caller
+* cannot run direct reclaim, or if the caller has realtime scheduling
+* policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
+* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+*/
+   if (gfp_mask & __GFP_HIGH)
+   alloc_flags |= ALLOC_HIGH;
+
+   if (!wait) {
+   alloc_flags |= ALLOC_HARDER;
+   /*
+* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+*/
+   alloc_flags &= ~ALLOC_CPUSET;
+   } else if (unlikely(rt_task(p)) && !in_interrupt())
+   alloc_flags |= ALLOC_HARDER;
+
+   if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
+   if (!in_interrupt() &&
+   ((p->flags & PF_MEMALLOC) ||
+unlikely(test_thread_flag(TIF_MEMDIE))))
+   alloc_flags |= ALLOC_NO_WATERMARKS;
+   }
+
+   return alloc_flags;
+}
+
+#define MAX_ALLOC_RANK 16
+
+/*
+ * classify the allocation: 0 is hardest, 16 is easiest.
+ */
+static inline int alloc_flags_to_rank(int alloc_flags)
+{
+   int rank;
+
+   if (alloc_flags & ALLOC_NO_WATERMARKS)
+   return 0;
+
+   rank = alloc_flags & (ALLOC_WMARK_MIN|ALLOC_WMARK_LOW|ALLOC_WMARK_HIGH);
+   rank -= alloc_flags & (ALLOC_HARDER|ALLOC_HIGH);
+
+   return rank;
+}
+
+static inline int gfp_to_rank(gfp_t gfp_mask)
+{
+   /*
+* Although correct this full version takes a ~3% performance
+* hit on the network tests in aim9.
+*
+
+   return alloc_flags_to_rank(gfp_to_alloc_flags(gfp_mask));
+
+*
+* Just check the bare essential ALLOC_NO_WATERMARKS case this keeps
+* the aim9 results within the error margin.
+*/
+
+   if