From:           "Hamid R. Jahanjou" <[EMAIL PROTECTED]>
To:             [EMAIL PROTECTED]
Subject:                [PATCH] VM: Implements the swap-out page-clustering 
technique
Date:           Thu, 04 Sep 2008 15:34:30 +0330
Message-ID:             <[EMAIL PROTECTED]>
Archive-link:           Article, Thread


From: Hamid R. Jahanjou

Implements the idea of swap-out page clustering from *BSD for
Linux. Each time a candidate page is to be swapped out,
virtually-nearby pages are scanned to find eligible pages to be
swapped out too as a cluster. This technique increases the likelihood of
bringing in related data on a page fault and decreases swap space
fragmentation in the long run. Currently, Linux searches only
physically-nearby pages which is not optimal since, over time, physically-
adjacent pages may become unrelated.

The code can be statically tuned. No benchmarks. I'm not sure whether
the added complexity is acceptable.

Signed-off-by: Hamid R. Jahanjou <[EMAIL PROTECTED]>
---

diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff
linux-2.6.26.1-vanilla/include/linux/pageout_clustering.h
linux-2.6.26.1HJ/include/linux/pageout_clustering.h
--- linux-2.6.26.1-vanilla/include/linux/pageout_clustering.h   1970-01-01
03:30:00.000000000 +0330
+++ linux-2.6.26.1HJ/include/linux/pageout_clustering.h 2008-08-28
20:19:52.000000000 +0330
@@ -0,0 +1,12 @@
+#ifndef PAGEOUT_CLUSTERING_H_
+#define PAGEOUT_CLUSTERING_H_
+
+#include<linux/list.h>
+#define INITIAL_CLUSTER_SCAN_RANGE 4
+#define HALF_PAGEOUT_CLUSTER_SIZE 8      /* default number of pages
to be clustered
+                                         * on either side of a given page */
+
+int try_to_cluster(struct page *, struct list_head *,
+                  struct list_head *, int, int);
+
+#endif
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff
linux-2.6.26.1-vanilla/include/linux/rmap.h
linux-2.6.26.1HJ/include/linux/rmap.h
--- linux-2.6.26.1-vanilla/include/linux/rmap.h 2008-08-02
02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/include/linux/rmap.h       2008-08-28 20:19:52.000000000 
+0330
@@ -102,6 +102,12 @@ pte_t *page_check_address(struct page *,
 unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);

 /*
+ * Used by pageout clustering
+ */
+struct anon_vma *page_lock_anon_vma(struct page *);
+void page_unlock_anon_vma(struct anon_vma *);
+
+/*
  * Cleans the PTEs of shared mappings.
  * (and since clean PTEs should also be readonly, write protects them too)
  *
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff
linux-2.6.26.1-vanilla/mm/Makefile linux-2.6.26.1HJ/mm/Makefile
--- linux-2.6.26.1-vanilla/mm/Makefile  2008-08-02 02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/mm/Makefile        2008-08-28 20:19:52.000000000 +0330
@@ -9,9 +9,9 @@ mmu-$(CONFIG_MMU)       := fremap.o highmem.o

 obj-y                  := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
                           maccess.o page_alloc.o page-writeback.o pdflush.o \
-                          readahead.o swap.o truncate.o vmscan.o \
-                          prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-                          page_isolation.o $(mmu-y)
+                          readahead.o swap.o truncate.o pageout_clustering.o \
+                          vmscan.o prio_tree.o util.o mmzone.o vmstat.o \
+                           backing-dev.o page_isolation.o $(mmu-y)

 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE)   += bounce.o
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff
linux-2.6.26.1-vanilla/mm/pageout_clustering.c
linux-2.6.26.1HJ/mm/pageout_clustering.c
--- linux-2.6.26.1-vanilla/mm/pageout_clustering.c      1970-01-01
03:30:00.000000000 +0330
+++ linux-2.6.26.1HJ/mm/pageout_clustering.c    2008-09-04 09:26:40.000000000 
+0330
@@ -0,0 +1,278 @@
+#include<linux/pageout_clustering.h>
+#include<linux/mm.h>
+#include<linux/pagemap.h>
+#include<linux/swap.h>
+#include<linux/swapops.h>
+#include<linux/slab.h>
+#include<linux/init.h>
+#include<linux/rmap.h>
+#include<linux/module.h>
+#include<linux/memcontrol.h>
+
+
+/*
+ * contains data related to pageout clustering
+ */
+struct clustering_info
+{
+
+    struct page *page;                 /* page descriptor of a page
used for sampling cluster pages */
+    struct vm_area_struct *vma;        /* VMA under consideration */
+    struct list_head *src;             /* source page list used by
shrink_list and friends */
+
+    union {
+       struct anon_vma *anon_vma;     /* anon_vma object we're using for
reverse mapping */
+       struct address_space *mapping; /* address_space object we're using
for reverse mapping */
+};
+
+    struct page **cluster;             /* our page cluster */
+    unsigned int cluster_size;         /* cluster size */
+    unsigned int page_index;           /* index of the sampling page
within the cluster */
+    unsigned int index;                /* slot in the cluster for the
next page */
+    unsigned int range;                /* maximum number of VMA's to scan */
+    unsigned int nr_collected;         /* number of pages collected
in the cluster */
+    unsigned int nr_sc;                /* current number of VMA's scanned */
+    int mode;                          /* page-isolation mode */
+    int anonym;                        /* indicates whether we are to
collect mapped pages (0)
+                                       * or anonymous pages (1) */
+    int (*continue_search)
+       (struct clustering_info *);    /* returns true if the search can go on 
*/
+};
+
+
+static inline int check_conditions(struct clustering_info *ci)
+{
+    return (ci->nr_sc < ci->range) && (ci->nr_collected < ci->cluster_size);
+}
+
+
+/* embodies page-selection-for-clustering policy */
+static inline int page_allowed_in_cluster(struct page *page, struct
clustering_info *ci)
+{
+    int zone_id = page_zone_id(ci->page);
+
+
+    /* scanning the original page ? */
+    if (unlikely(page == ci->page))
+       return 0;
+
+    /* skip pages belonging to other zones */
+    if (unlikely(page_zone_id(page) != zone_id))
+       return 0;
+
+    /* skip pages not in LRU lists */
+    if (!PageLRU(page))
+       return 0;
+
+    /* skip active pages */
+    if (page->flags & PG_active)
+       return 0;
+
+    /* skip non-dirty pages */
+    if (!(page->flags & PG_dirty))
+       return 0;
+
+    return 1;
+}
+
+
+
+/* embodies vma-selection-for-clustering policy */
+static inline int vma_allowed_in_cluster(struct vm_area_struct *vma,
struct clustering_info *ci)
+{
+    if ( (vma->vm_flags & VM_RESERVED) ||
+        (vma->vm_flags & VM_LOCKED) ||
+        (vma->vm_flags & VM_IO) ||
+        (vma->vm_flags & VM_NONLINEAR) )
+       return 0;
+
+    /* we ignore mapped VMA's when doing anonym clustering; */
+    /* similarly, we ignore anonym VMA's when doing mapped clustering */
+    if ( ((ci->anonym) && (vma->vm_file != NULL)) ||
+        (!(ci->anonym) && (vma->vm_file == NULL)) )
+       return 0;
+
+    return 1;
+}
+
+
+
+/*
+ * scans a process's vma structures, to find pages eligible for being
added to cluster
+ */
+static void try_to_cluster_vma(struct clustering_info *ci)
+{
+    struct vm_area_struct *cursor_vma = ci->vma;
+    struct page *cursor_page;
+    struct mm_struct *cursor_mm;
+    unsigned long vm_address;
+
+
+    cursor_mm = cursor_vma->vm_mm;
+    if (!down_read_trylock(&cursor_mm->mmap_sem))
+       return;
+
+    do
+    {
+       if (!vma_allowed_in_cluster(cursor_vma, ci))
+           goto next_vma;
+
+       ci->nr_sc++;
+
+       for(vm_address = cursor_vma->vm_start;
+           vm_address < cursor_vma->vm_end && ci->nr_collected < 
ci->cluster_size;
+           vm_address += PAGE_SIZE)
+       {
+           cursor_page = virt_to_page(vm_address);
+           if (!page_allowed_in_cluster(cursor_page, ci))
+               continue;
+
+           switch (__isolate_lru_page(cursor_page, ci->mode))
+           {
+           case 0:
+               ci->cluster[ci->index] = cursor_page;
+               ci->nr_collected++;
+               ci->index = (++ci->index) % (ci->cluster_size);
+               break;
+
+           case -EBUSY:
+               /* it is being freed else where */
+               list_move(&cursor_page->lru, ci->src);
+               break;
+
+           default:
+               break;
+           };
+       }
+
+next_vma:
+       cursor_vma = cursor_vma->vm_next;
+       /* begin from the first VMA if the end of list is reached */
+       if (cursor_vma == NULL)
+           cursor_vma = cursor_mm->mmap;
+
+    } while (cursor_vma != ci->vma && ci->continue_search(ci));
+
+    up_read(&cursor_mm->mmap_sem);
+    return;
+}
+
+
+
+/*
+ * helper function for try_to_cluster(); handles mapped pages
+ */
+static inline void try_to_cluster_mapped(struct clustering_info *ci)
+{
+    struct vm_area_struct *vma;
+    struct prio_tree_iter pst_itr;
+    const pgoff_t pgoff = ci->page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+
+    vma_prio_tree_foreach(vma, &pst_itr, &ci->mapping->i_mmap, pgoff, pgoff) {
+       if (!ci->continue_search(ci))
+           break;
+
+       ci->vma = vma;
+       try_to_cluster_vma(ci);
+    }
+    return;
+}
+
+
+
+/*
+ * helper function for try_to_cluster(); handles anonymous pages
+ */
+static inline void try_to_cluster_anon(struct clustering_info *ci)
+{
+    struct list_head *list_item;
+    struct vm_area_struct *vma;
+
+
+    for (list_item = ci->anon_vma->head.next;
+        (list_item != &ci->anon_vma->head) && ci->continue_search(ci);
+        list_item = list_item->next)
+    {
+       vma = list_entry(list_item, struct vm_area_struct, anon_vma_node);
+       ci->vma = vma;
+       try_to_cluster_vma(ci);
+    }
+
+    return;
+}
+
+
+
+/*
+ * Searches in the virtual address space of the process possessing the page
+ * to find other suitable pages to be swapped out as well. This is known as
+ * page clustering. Not to be confused with other usages of the word in Linux.
+ *
+ * We adjust the range of our search by the value of the allocation order.
+ *
+ * @page: an originally-selected page
+ * @half_cluster_size: maximum number of pages to cluster on each
side of the page
+ * @src: the list containing the page
+ * @dst: the list to insert pages to
+ * @order: allocation order
+ * @mode: page-isolation mode
+ *
+ * returns the number of pages added to the list, including the passed page.
+ */
+int try_to_cluster(struct page *page, struct list_head *src,
+                  struct list_head *dst, int order, int mode)
+{
+    struct clustering_info cluster_info;
+    struct address_space *mapping;
+    struct anon_vma *anon_vma;
+    unsigned int range = INITIAL_CLUSTER_SCAN_RANGE << order;
+#define def_cluster_size (2 * HALF_PAGEOUT_CLUSTER_SIZE + 1)
+    struct page *cluster[def_cluster_size] = {NULL};
+    int i;
+
+
+    cluster[HALF_PAGEOUT_CLUSTER_SIZE] = page;
+    /* initialize commom fields only; other fields are
+     * set as we go on */
+    cluster_info.page = page;
+    cluster_info.vma = NULL;
+    cluster_info.src = src;
+    cluster_info.cluster = cluster;
+    cluster_info.cluster_size = def_cluster_size;
+    cluster_info.page_index = HALF_PAGEOUT_CLUSTER_SIZE;
+    cluster_info.index = HALF_PAGEOUT_CLUSTER_SIZE + 1;
+    cluster_info.range = range;
+    cluster_info.nr_collected = 1;
+    cluster_info.nr_sc = 0;
+    cluster_info.mode = mode;
+    cluster_info.continue_search = check_conditions;
+
+    /* try to cluster anonymous pages ? */
+    anon_vma = page_lock_anon_vma(page);
+    if (anon_vma != NULL)
+    {
+       cluster_info.anon_vma = anon_vma;
+       cluster_info.anonym = 1;
+       try_to_cluster_anon(&cluster_info);
+       page_unlock_anon_vma(anon_vma);
+    }
+
+    /* try to cluster mapped pages ? */
+    else if (page_mapped(page))
+    {
+       mapping = page->mapping;
+       spin_lock(&mapping->i_mmap_lock);
+       cluster_info.mapping = mapping;
+       cluster_info.anonym = 0;
+       try_to_cluster_mapped(&cluster_info);
+       spin_unlock(&mapping->i_mmap_lock);
+    }
+
+    /* add to destination list */
+    for (i = 0; i < def_cluster_size; i++)
+       if (cluster[i] != NULL)
+           list_move(&cluster[i]->lru, dst);
+
+    return cluster_info.nr_collected;
+}
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff
linux-2.6.26.1-vanilla/mm/rmap.c linux-2.6.26.1HJ/mm/rmap.c
--- linux-2.6.26.1-vanilla/mm/rmap.c    2008-08-02 02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/mm/rmap.c  2008-08-28 20:19:52.000000000 +0330
@@ -156,7 +156,7 @@ void __init anon_vma_init(void)
  * Getting a lock on a stable anon_vma from a page off the LRU is
  * tricky: page_lock_anon_vma rely on RCU to guard against the races.
  */
-static struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma(struct page *page)
 {
        struct anon_vma *anon_vma;
        unsigned long anon_mapping;
@@ -176,7 +176,7 @@ out:
        return NULL;
 }

-static void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma(struct anon_vma *anon_vma)
 {
        spin_unlock(&anon_vma->lock);
        rcu_read_unlock();
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff
linux-2.6.26.1-vanilla/mm/vmscan.c linux-2.6.26.1HJ/mm/vmscan.c
--- linux-2.6.26.1-vanilla/mm/vmscan.c  2008-08-02 02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/mm/vmscan.c        2008-08-28 20:19:52.000000000 +0330
@@ -38,6 +38,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/memcontrol.h>
+#include <linux/pageout_clustering.h>

 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -700,10 +701,7 @@ static unsigned long isolate_lru_pages(u

        for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
                struct page *page;
-               unsigned long pfn;
-               unsigned long end_pfn;
-               unsigned long page_pfn;
-               int zone_id;
+               int nr_clustered;

                page = lru_to_page(src);
                prefetchw_prev_lru_page(page, src, flags);
@@ -712,8 +710,7 @@ static unsigned long isolate_lru_pages(u

                switch (__isolate_lru_page(page, mode)) {
                case 0:
-                       list_move(&page->lru, dst);
-                       nr_taken++;
+                       /* counted and added to dst in try_to_cluster() */
                        break;

                case -EBUSY:
@@ -725,51 +722,13 @@ static unsigned long isolate_lru_pages(u
                        BUG();
                }

-               if (!order)
-                       continue;
-
-               /*
-                * Attempt to take all pages in the order aligned region
-                * surrounding the tag page.  Only take those pages of
-                * the same active state as that tag page.  We may safely
-                * round the target page pfn down to the requested order
-                * as the mem_map is guarenteed valid out to MAX_ORDER,
-                * where that page is in a different zone we will detect
-                * it from its zone id and abort this block scan.
-                */
-               zone_id = page_zone_id(page);
-               page_pfn = page_to_pfn(page);
-               pfn = page_pfn & ~((1 << order) - 1);
-               end_pfn = pfn + (1 << order);
-               for (; pfn < end_pfn; pfn++) {
-                       struct page *cursor_page;
-
-                       /* The target page is in the block, ignore it. */
-                       if (unlikely(pfn == page_pfn))
-                               continue;
-
-                       /* Avoid holes within the zone. */
-                       if (unlikely(!pfn_valid_within(pfn)))
-                               break;
-
-                       cursor_page = pfn_to_page(pfn);
-                       /* Check that we have not crossed a zone boundary. */
-                       if (unlikely(page_zone_id(cursor_page) != zone_id))
-                               continue;
-                       switch (__isolate_lru_page(cursor_page, mode)) {
-                       case 0:
-                               list_move(&cursor_page->lru, dst);
-                               nr_taken++;
-                               scan++;
-                               break;
-
-                       case -EBUSY:
-                               /* else it is being freed elsewhere */
-                               list_move(&cursor_page->lru, src);
-                       default:
-                               break;
-                       }
-               }
+               /*
+                * scan virtually-nearby pages in process address space
+                * to find eligible pages to be swapped out as well
+                */
+               nr_clustered = try_to_cluster(page, src, dst, order, mode);
+               nr_taken += nr_clustered;
+               scan += nr_clustered - 1;
        }

        *scanned = scan;




-- 
Regards,
Peter Teoh

Reply via email to