We created a new function, __remove_swap_mapping_batch(), that
removes all pages belonging to the same swap partition from the
swap cache's mapping with a single acquisition of the mapping's
tree lock.  This reduces contention on the lock when multiple
threads are reclaiming memory by swapping to the same swap
partition.

The handle_pgout_batch() function is updated so that all pages
under the same swap partition are removed from the swap cache
together once they have been paged out.

Signed-off-by: Tim Chen <tim.c.c...@linux.intel.com>
---
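Note (not part of the commit message): the sketch below is a minimal,
self-contained user-space analogy of the locking pattern being
introduced -- take the lock once per batch of up to SWAP_BATCH items
instead of once per item.  A pthread mutex stands in for
mapping->tree_lock, the batch size and the helpers remove_each(),
remove_batch() and remove_one_locked() are made-up names for
illustration only, and none of this is kernel code.

        /* Build with: gcc -pthread batch_sketch.c */
        #include <pthread.h>
        #include <stdio.h>

        /* Illustrative batch size only, not the kernel's definition. */
        #define SWAP_BATCH 16

        static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

        /* Placeholder for the per-item work that must run under the lock. */
        static void remove_one_locked(int item)
        {
                printf("removed item %d\n", item);
        }

        /* Old pattern: one lock acquisition per item. */
        static void remove_each(int items[], int nr)
        {
                for (int i = 0; i < nr; i++) {
                        pthread_mutex_lock(&tree_lock);
                        remove_one_locked(items[i]);
                        pthread_mutex_unlock(&tree_lock);
                }
        }

        /* New pattern: one lock acquisition per batch of up to SWAP_BATCH items. */
        static void remove_batch(int items[], int nr)
        {
                while (nr) {
                        int batch = nr < SWAP_BATCH ? nr : SWAP_BATCH;

                        pthread_mutex_lock(&tree_lock);
                        for (int i = 0; i < batch; i++)
                                remove_one_locked(items[i]);
                        pthread_mutex_unlock(&tree_lock);

                        items += batch;
                        nr -= batch;
                }
        }

        int main(void)
        {
                int items[40];

                for (int i = 0; i < 40; i++)
                        items[i] = i;

                remove_each(items, 40);         /* 40 lock acquisitions */
                remove_batch(items, 40);        /* 3 lock acquisitions */
                return 0;
        }

The kernel code below does the same thing with spin_lock_irqsave() on
mapping->tree_lock, and additionally ends a batch early if the pages'
mapping changes.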
 mm/vmscan.c | 426 ++++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 286 insertions(+), 140 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9fc04e1..5e4b8ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -690,6 +690,103 @@ cannot_free:
        return 0;
 }
 
+/* use this only for swap mapped pages */
+static void __remove_swap_mapping_batch(struct page *pages[],
+                           bool reclaimed, short ret[], int nr)
+{
+       unsigned long flags;
+       struct page *page;
+       swp_entry_t swap[SWAP_BATCH];
+       struct address_space *mapping;
+
+       int i, batch_size;
+
+       if (nr <= 0)
+               return;
+
+       while (nr) {
+               mapping = page_mapping(pages[0]);
+               BUG_ON(!mapping);
+
+               batch_size = min(nr, SWAP_BATCH);
+
+               spin_lock_irqsave(&mapping->tree_lock, flags);
+               for (i = 0; i < batch_size; ++i) {
+                       page = pages[i];
+
+                       BUG_ON(!PageLocked(page));
+                       BUG_ON(!PageSwapCache(page));
+                       BUG_ON(mapping != page_mapping(page));
+
+                       /* stop batching if mapping changes */
+                       if (mapping != page_mapping(page)) {
+                               batch_size = i;
+                               break;
+                       }
+                       /*
+                        * The non racy check for a busy page.
+                        *
+                        * Must be careful with the order of the tests. When someone has
+                        * a ref to the page, it may be possible that they dirty it then
+                        * drop the reference. So if PageDirty is tested before page_count
+                        * here, then the following race may occur:
+                        *
+                        * get_user_pages(&page);
+                        * [user mapping goes away]
+                        * write_to(page);
+                        *                              !PageDirty(page)    [good]
+                        * SetPageDirty(page);
+                        * put_page(page);
+                        *                              !page_count(page)   [good, discard it]
+                        *
+                        * [oops, our write_to data is lost]
+                        *
+                        * Reversing the order of the tests ensures such a situation cannot
+                        * escape unnoticed. The smp_rmb is needed to ensure the page->flags
+                        * load is not satisfied before that of page->_count.
+                        *
+                        * Note that if SetPageDirty is always performed via set_page_dirty,
+                        * and thus under tree_lock, then this ordering is not required.
+                        */
+                       if (!page_ref_freeze(page, 2))
+                               goto cannot_free;
+                       /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+                       if (unlikely(PageDirty(page))) {
+                               page_ref_unfreeze(page, 2);
+                               goto cannot_free;
+                       }
+
+                       swap[i].val = page_private(page);
+                       __delete_from_swap_cache(page);
+
+                       ret[i] = 1;
+                       continue;
+
+cannot_free:
+                       ret[i] = 0;
+               }
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+
+               /* need to keep irq off for mem_cgroup accounting, don't restore flags yet */
+               local_irq_disable();
+               for (i = 0; i < batch_size; ++i) {
+                       if (ret[i]) {
+                               page = pages[i];
+                               mem_cgroup_swapout(page, swap[i]);
+                       }
+               }
+               local_irq_enable();
+
+               for (i = 0; i < batch_size; ++i) {
+                       if (ret[i])
+                               swapcache_free(swap[i]);
+               }
+               /* advance to next batch */
+               pages += batch_size;
+               ret += batch_size;
+               nr -= batch_size;
+       }
+}
 /*
  * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
  * someone else has a ref on the page, abort and return 0.  If it was
@@ -897,177 +994,226 @@ static void handle_pgout_batch(struct list_head *page_list,
        int nr)
 {
        struct address_space *mapping;
+       struct page *umap_pages[SWAP_BATCH];
        struct page *page;
-       int i;
-
-       for (i = 0; i < nr; ++i) {
-               page = pages[i];
-               mapping =  page_mapping(page);
+       int i, j, batch_size;
+       short umap_ret[SWAP_BATCH], idx[SWAP_BATCH];
+
+       while (nr) {
+               j = 0;
+               batch_size = min(nr, SWAP_BATCH);
+               mapping = NULL;
+
+               for (i = 0; i < batch_size; ++i) {
+                       page = pages[i];
+
+                       if (mapping) {
+                               if (mapping != page_mapping(page)) {
+                                       /* mapping change, stop batch here */
+                                       batch_size = i;
+                                       break;
+                               }
+                       } else
+                               mapping = page_mapping(page);
 
-               /* check outcome of cache addition */
-               if (!ret[i]) {
-                       ret[i] = PG_ACTIVATE_LOCKED;
-                       continue;
-               }
-               /*
-                * The page is mapped into the page tables of one or more
-                * processes. Try to unmap it here.
-                */
-               if (page_mapped(page) && mapping) {
-                       switch (swap_ret[i] = try_to_unmap(page, lazyfree ?
-                               (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
-                               (ttu_flags | TTU_BATCH_FLUSH))) {
-                       case SWAP_FAIL:
+                       /* check outcome of cache addition */
+                       if (!ret[i]) {
                                ret[i] = PG_ACTIVATE_LOCKED;
                                continue;
-                       case SWAP_AGAIN:
-                               ret[i] = PG_KEEP_LOCKED;
-                               continue;
-                       case SWAP_MLOCK:
-                               ret[i] = PG_MLOCKED;
-                               continue;
-                       case SWAP_LZFREE:
-                               goto lazyfree;
-                       case SWAP_SUCCESS:
-                               ; /* try to free the page below */
                        }
-               }
-
-               if (PageDirty(page)) {
                        /*
-                        * Only kswapd can writeback filesystem pages to
-                        * avoid risk of stack overflow but only writeback
-                        * if many dirty pages have been encountered.
+                        * The page is mapped into the page tables of one or more
+                        * processes. Try to unmap it here.
                         */
-                       if (page_is_file_cache(page) &&
-                                       (!current_is_kswapd() ||
-                                        !test_bit(ZONE_DIRTY, &zone->flags))) {
+                       if (page_mapped(page) && mapping) {
+                               switch (swap_ret[i] = try_to_unmap(page, lazyfree ?
+                                       (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
+                                       (ttu_flags | TTU_BATCH_FLUSH))) {
+                               case SWAP_FAIL:
+                                       ret[i] = PG_ACTIVATE_LOCKED;
+                                       continue;
+                               case SWAP_AGAIN:
+                                       ret[i] = PG_KEEP_LOCKED;
+                                       continue;
+                               case SWAP_MLOCK:
+                                       ret[i] = PG_MLOCKED;
+                                       continue;
+                               case SWAP_LZFREE:
+                                       goto lazyfree;
+                               case SWAP_SUCCESS:
+                                       ; /* try to free the page below */
+                               }
+                       }
+
+                       if (PageDirty(page)) {
                                /*
-                                * Immediately reclaim when written back.
-                                * Similar in principal to deactivate_page()
-                                * except we already have the page isolated
-                                * and know it's dirty
+                                * Only kswapd can writeback filesystem pages to
+                                * avoid risk of stack overflow but only writeback
+                                * if many dirty pages have been encountered.
                                 */
-                               inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
-                               SetPageReclaim(page);
-
-                               ret[i] = PG_KEEP_LOCKED;
-                               continue;
-                       }
+                               if (page_is_file_cache(page) &&
+                                               (!current_is_kswapd() ||
+                                                !test_bit(ZONE_DIRTY, &zone->flags))) {
+                                       /*
+                                        * Immediately reclaim when written back.
+                                        * Similar in principal to deactivate_page()
+                                        * except we already have the page isolated
+                                        * and know it's dirty
+                                        */
+                                       inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+                                       SetPageReclaim(page);
 
-                       if (references == PAGEREF_RECLAIM_CLEAN) {
-                               ret[i] = PG_KEEP_LOCKED;
-                               continue;
-                       }
-                       if (!may_enter_fs) {
-                               ret[i] = PG_KEEP_LOCKED;
-                               continue;
-                       }
-                       if (!sc->may_writepage) {
-                               ret[i] = PG_KEEP_LOCKED;
-                               continue;
-                       }
+                                       ret[i] = PG_KEEP_LOCKED;
+                                       continue;
+                               }
 
-                       /*
-                        * Page is dirty. Flush the TLB if a writable entry
-                        * potentially exists to avoid CPU writes after IO
-                        * starts and then write it out here.
-                        */
-                       try_to_unmap_flush_dirty();
-                       switch (pageout(page, mapping, sc)) {
-                       case PAGE_KEEP:
-                               ret[i] = PG_KEEP_LOCKED;
-                               continue;
-                       case PAGE_ACTIVATE:
-                               ret[i] = PG_ACTIVATE_LOCKED;
-                               continue;
-                       case PAGE_SUCCESS:
-                               if (PageWriteback(page)) {
-                                       ret[i] = PG_KEEP;
+                               if (references == PAGEREF_RECLAIM_CLEAN) {
+                                       ret[i] = PG_KEEP_LOCKED;
+                                       continue;
+                               }
+                               if (!may_enter_fs) {
+                                       ret[i] = PG_KEEP_LOCKED;
                                        continue;
                                }
-                               if (PageDirty(page)) {
-                                       ret[i] = PG_KEEP;
+                               if (!sc->may_writepage) {
+                                       ret[i] = PG_KEEP_LOCKED;
                                        continue;
                                }
 
                                /*
-                                * A synchronous write - probably a ramdisk.  Go
-                                * ahead and try to reclaim the page.
+                                * Page is dirty. Flush the TLB if a writable entry
+                                * potentially exists to avoid CPU writes after IO
+                                * starts and then write it out here.
                                 */
-                               if (!trylock_page(page)) {
-                                       ret[i] = PG_KEEP;
-                                       continue;
-                               }
-                               if (PageDirty(page) || PageWriteback(page)) {
+                               try_to_unmap_flush_dirty();
+                               switch (pageout(page, mapping, sc)) {
+                               case PAGE_KEEP:
                                        ret[i] = PG_KEEP_LOCKED;
                                        continue;
+                               case PAGE_ACTIVATE:
+                                       ret[i] = PG_ACTIVATE_LOCKED;
+                                       continue;
+                               case PAGE_SUCCESS:
+                                       if (PageWriteback(page)) {
+                                               ret[i] = PG_KEEP;
+                                               continue;
+                                       }
+                                       if (PageDirty(page)) {
+                                               ret[i] = PG_KEEP;
+                                               continue;
+                                       }
+
+                                       /*
+                                        * A synchronous write - probably a ramdisk.  Go
+                                        * ahead and try to reclaim the page.
+                                        */
+                                       if (!trylock_page(page)) {
+                                               ret[i] = PG_KEEP;
+                                               continue;
+                                       }
+                                       if (PageDirty(page) || PageWriteback(page)) {
+                                               ret[i] = PG_KEEP_LOCKED;
+                                               continue;
+                                       }
+                                       mapping = page_mapping(page);
+                               case PAGE_CLEAN:
+                                       ; /* try to free the page below */
                                }
-                               mapping = page_mapping(page);
-                       case PAGE_CLEAN:
-                               ; /* try to free the page below */
                        }
-               }
 
-               /*
-                * If the page has buffers, try to free the buffer mappings
-                * associated with this page. If we succeed we try to free
-                * the page as well.
-                *
-                * We do this even if the page is PageDirty().
-                * try_to_release_page() does not perform I/O, but it is
-                * possible for a page to have PageDirty set, but it is actually
-                * clean (all its buffers are clean).  This happens if the
-                * buffers were written out directly, with submit_bh(). ext3
-                * will do this, as well as the blockdev mapping.
-                * try_to_release_page() will discover that cleanness and will
-                * drop the buffers and mark the page clean - it can be freed.
-                *
-                * Rarely, pages can have buffers and no ->mapping.  These are
-                * the pages which were not successfully invalidated in
-                * truncate_complete_page().  We try to drop those buffers here
-                * and if that worked, and the page is no longer mapped into
-                * process address space (page_count == 1) it can be freed.
-                * Otherwise, leave the page on the LRU so it is swappable.
-                */
-               if (page_has_private(page)) {
-                       if (!try_to_release_page(page, sc->gfp_mask)) {
-                               ret[i] = PG_ACTIVATE_LOCKED;
+                       /*
+                        * If the page has buffers, try to free the buffer mappings
+                        * associated with this page. If we succeed we try to free
+                        * the page as well.
+                        *
+                        * We do this even if the page is PageDirty().
+                        * try_to_release_page() does not perform I/O, but it is
+                        * possible for a page to have PageDirty set, but it is actually
+                        * clean (all its buffers are clean).  This happens if the
+                        * buffers were written out directly, with submit_bh(). ext3
+                        * will do this, as well as the blockdev mapping.
+                        * try_to_release_page() will discover that cleanness and will
+                        * drop the buffers and mark the page clean - it can be freed.
+                        *
+                        * Rarely, pages can have buffers and no ->mapping.  These are
+                        * the pages which were not successfully invalidated in
+                        * truncate_complete_page().  We try to drop those buffers here
+                        * and if that worked, and the page is no longer mapped into
+                        * process address space (page_count == 1) it can be freed.
+                        * Otherwise, leave the page on the LRU so it is swappable.
+                        */
+                       if (page_has_private(page)) {
+                               if (!try_to_release_page(page, sc->gfp_mask)) {
+                                       ret[i] = PG_ACTIVATE_LOCKED;
+                                       continue;
+                               }
+                               if (!mapping && page_count(page) == 1) {
+                                       unlock_page(page);
+                                       if (put_page_testzero(page)) {
+                                               ret[i] = PG_FREE;
+                                               continue;
+                                       } else {
+                                               /*
+                                                * rare race with speculative reference.
+                                                * the speculative reference will free
+                                                * this page shortly, so we may
+                                                * increment nr_reclaimed (and
+                                                * leave it off the LRU).
+                                                */
+                                               ret[i] = PG_SPECULATIVE_REF;
+                                               continue;
+                                       }
+                               }
+                       }
+lazyfree:
+                       if (!mapping) {
+                               ret[i] = PG_KEEP_LOCKED;
                                continue;
                        }
-                       if (!mapping && page_count(page) == 1) {
-                               unlock_page(page);
-                               if (put_page_testzero(page)) {
-                                       ret[i] = PG_FREE;
-                                       continue;
-                               } else {
-                                       /*
-                                        * rare race with speculative reference.
-                                        * the speculative reference will free
-                                        * this page shortly, so we may
-                                        * increment nr_reclaimed (and
-                                        * leave it off the LRU).
-                                        */
-                                       ret[i] = PG_SPECULATIVE_REF;
+                       if (!PageSwapCache(page)) {
+                               if (!__remove_mapping(mapping, page, true)) {
+                                       ret[i] = PG_KEEP_LOCKED;
                                        continue;
                                }
+                               __ClearPageLocked(page);
+                               ret[i] = PG_FREE;
+                               continue;
                        }
+
+                       /* note pages to be unmapped */
+                       ret[i] = PG_UNKNOWN;
+                       idx[j] = i;
+                       umap_pages[j] = page;
+                       ++j;
                }
-lazyfree:
-               if (!mapping || !__remove_mapping(mapping, page, true)) {
-                       ret[i] = PG_KEEP_LOCKED;
-                       continue;
+
+               /* handle remaining pages that need to be unmapped */
+               __remove_swap_mapping_batch(umap_pages, true, umap_ret, j);
+
+               for (i = 0; i < j; ++i) {
+                       if (!umap_ret[i]) {
+                               /* unmap failed */
+                               ret[idx[i]] = PG_KEEP_LOCKED;
+                               continue;
+                       }
+
+                       page = umap_pages[i];
+                       /*
+                        * At this point, we have no other references and there is
+                        * no way to pick any more up (removed from LRU, removed
+                        * from pagecache). Can use non-atomic bitops now (and
+                        * we obviously don't have to worry about waking up a process
+                        * waiting on the page lock, because there are no references.
+                        */
+                       __ClearPageLocked(page);
+                       ret[idx[i]] = PG_FREE;
                }
 
-               /*
-                * At this point, we have no other references and there is
-                * no way to pick any more up (removed from LRU, removed
-                * from pagecache). Can use non-atomic bitops now (and
-                * we obviously don't have to worry about waking up a process
-                * waiting on the page lock, because there are no references.
-                */
-               __ClearPageLocked(page);
-               ret[i] = PG_FREE;
+               /* advance pointers to next batch and remaining page count */
+               nr = nr - batch_size;
+               pages += batch_size;
+               ret += batch_size;
+               swap_ret += batch_size;
        }
 }
 
-- 
2.5.5
