Changelog v4
1. Added documentation for max_unmapped_pages
2. Better #ifdef'ing of max_unmapped_pages and min_unmapped_pages

Changelog v2
1. Use a config option to enable the code (Andrew Morton)
2. Explain the magic tunables in the code or at-least attempt
   to explain them (General comment)
3. Hint uses of the boot parameter with unlikely (Andrew Morton)
4. Use better names (balanced is not a good naming convention)

Provide control using zone_reclaim() and a boot parameter. The
code reuses functionality from zone_reclaim() to isolate unmapped
pages and reclaim them as a priority, ahead of other mapped pages.

Signed-off-by: Balbir Singh <bal...@linux.vnet.ibm.com>
Reviewed-by: Christoph Lameter <c...@linux.com>
---
 Documentation/kernel-parameters.txt |    8 +++
 Documentation/sysctl/vm.txt         |   19 +++++++-
 include/linux/mmzone.h              |    7 +++
 include/linux/swap.h                |   25 ++++++++--
 init/Kconfig                        |   12 +++++
 kernel/sysctl.c                     |   13 +++++
 mm/page_alloc.c                     |   29 ++++++++++++
 mm/vmscan.c                         |   88 +++++++++++++++++++++++++++++++++++
 8 files changed, 194 insertions(+), 7 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index d4e67a5..f522c34 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2520,6 +2520,14 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
                        [X86]
                        Set unknown_nmi_panic=1 early on boot.
 
+       unmapped_page_control
+                       [KNL] Available if CONFIG_UNMAPPED_PAGECACHE_CONTROL
+                       is enabled. It controls the amount of unmapped memory
+                       that is present in the system. This boot option plus
+                       vm.min_unmapped_ratio (sysctl) provide granular control
+                       over how much unmapped page cache can exist in the 
system
+                       before kswapd starts reclaiming unmapped page cache 
pages.
+
        usbcore.autosuspend=
                        [USB] The autosuspend time delay (in seconds) used
                        for newly-detected USB devices (default 2).  This
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 30289fa..1c722f7 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -381,11 +381,14 @@ and may not be fast.
 
 min_unmapped_ratio:
 
-This is available only on NUMA kernels.
+This is available only on NUMA kernels or when unmapped page cache
+control is enabled.
 
 This is a percentage of the total pages in each zone. Zone reclaim will
 only occur if more than this percentage of pages are in a state that
-zone_reclaim_mode allows to be reclaimed.
+zone_reclaim_mode allows to be reclaimed. If unmapped page cache control
+is enabled, this is the minimum level to which the cache will be shrunk
+down to.
 
 If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared
 against all file-backed unmapped pages including swapcache pages and tmpfs
@@ -396,6 +399,18 @@ The default is 1 percent.
 
 ==============================================================
 
+max_unmapped_ratio:
+
+This is available only when unmapped page cache control is enabled.
+
+This is a percentage of the total pages in each zone. Zone reclaim will
+only occur if more than this percentage of pages are in a state and
+unmapped page cache control is enabled.
+
+The default is 16 percent.
+
+==============================================================
+
 mmap_min_addr
 
 This file indicates the amount of address space  which a user process will
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 59cbed0..caa29ad 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -309,7 +309,12 @@ struct zone {
        /*
         * zone reclaim becomes active if more unmapped pages exist.
         */
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) || defined(CONFIG_NUMA)
        unsigned long           min_unmapped_pages;
+#endif
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
+       unsigned long           max_unmapped_pages;
+#endif
 #ifdef CONFIG_NUMA
        int node;
        unsigned long           min_slab_pages;
@@ -776,6 +781,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct 
ctl_table *, int,
                                        void __user *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
                        void __user *, size_t *, loff_t *);
+int sysctl_max_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
+                       void __user *, size_t *, loff_t *);
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
                        void __user *, size_t *, loff_t *);
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index ce8f686..86cafc5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -264,19 +264,36 @@ extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern long vm_total_pages;
 
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) || defined(CONFIG_NUMA)
 extern int sysctl_min_unmapped_ratio;
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
+extern int sysctl_max_unmapped_ratio;
+#endif
+
 extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
-#ifdef CONFIG_NUMA
-extern int zone_reclaim_mode;
-extern int sysctl_min_slab_ratio;
 #else
-#define zone_reclaim_mode 0
 static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 {
        return 0;
 }
 #endif
 
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
+extern bool should_reclaim_unmapped_pages(struct zone *zone);
+#else
+static inline bool should_reclaim_unmapped_pages(struct zone *zone)
+{
+       return false;
+}
+#endif
+
+#ifdef CONFIG_NUMA
+extern int zone_reclaim_mode;
+extern int sysctl_min_slab_ratio;
+#else
+#define zone_reclaim_mode 0
+#endif
+
 extern int page_evictable(struct page *page, struct vm_area_struct *vma);
 extern void scan_mapping_unevictable_pages(struct address_space *);
 
diff --git a/init/Kconfig b/init/Kconfig
index 41b2431..222b3af 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -811,6 +811,18 @@ config SCHED_AUTOGROUP
 config MM_OWNER
        bool
 
+config UNMAPPED_PAGECACHE_CONTROL
+       bool "Provide control over unmapped page cache"
+       default n
+       help
+         This option adds support for controlling unmapped page cache
+         via a boot parameter (unmapped_page_control). The boot parameter
+         with sysctl (vm.min_unmapped_ratio) control the total number
+         of unmapped pages in the system. This feature is useful if
+         you want to limit the amount of unmapped page cache or want
+         to reduce page cache duplication in a virtualized environment.
+         If unsure say 'N'
+
 config SYSFS_DEPRECATED
        bool "Enable deprecated sysfs features to support old userspace tools"
        depends on SYSFS
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e3a8ce4..d9e77da 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1214,6 +1214,7 @@ static struct ctl_table vm_table[] = {
                .proc_handler   = proc_dointvec_unsigned,
        },
 #endif
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) || defined(CONFIG_NUMA)
        {
                .procname       = "min_unmapped_ratio",
                .data           = &sysctl_min_unmapped_ratio,
@@ -1223,6 +1224,18 @@ static struct ctl_table vm_table[] = {
                .extra1         = &zero,
                .extra2         = &one_hundred,
        },
+#endif
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
+       {
+               .procname       = "max_unmapped_ratio",
+               .data           = &sysctl_max_unmapped_ratio,
+               .maxlen         = sizeof(sysctl_max_unmapped_ratio),
+               .mode           = 0644,
+               .proc_handler   = sysctl_max_unmapped_ratio_sysctl_handler,
+               .extra1         = &zero,
+               .extra2         = &one_hundred,
+       },
+#endif
 #ifdef CONFIG_NUMA
        {
                .procname       = "zone_reclaim_mode",
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1d32865..5b89e5b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1669,6 +1669,9 @@ zonelist_scan:
                        unsigned long mark;
                        int ret;
 
+                       if (should_reclaim_unmapped_pages(zone))
+                               wakeup_kswapd(zone, order, classzone_idx);
+
                        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
                        if (zone_watermark_ok(zone, order, mark,
                                    classzone_idx, alloc_flags))
@@ -4249,8 +4252,14 @@ static void __paginginit free_area_init_core(struct 
pglist_data *pgdat,
 
                zone->spanned_pages = size;
                zone->present_pages = realsize;
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) || defined(CONFIG_NUMA)
                zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
                                                / 100;
+#endif
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
+               zone->max_unmapped_pages = (realsize*sysctl_max_unmapped_ratio)
+                                               / 100;
+#endif
 #ifdef CONFIG_NUMA
                zone->node = nid;
                zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
@@ -5157,6 +5166,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int 
write,
        return 0;
 }
 
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) || defined(CONFIG_NUMA)
 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
        void __user *buffer, size_t *length, loff_t *ppos)
 {
@@ -5173,6 +5183,25 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table 
*table, int write,
        return 0;
 }
 
+#endif
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
+int sysctl_max_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
+       void __user *buffer, size_t *length, loff_t *ppos)
+{
+       struct zone *zone;
+       int rc;
+
+       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+       if (rc)
+               return rc;
+
+       for_each_zone(zone)
+               zone->max_unmapped_pages = (zone->present_pages *
+                               sysctl_max_unmapped_ratio) / 100;
+       return 0;
+}
+#endif
+
 #ifdef CONFIG_NUMA
 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
        void __user *buffer, size_t *length, loff_t *ppos)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5b24e74..bb06710 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -158,6 +158,29 @@ static DECLARE_RWSEM(shrinker_rwsem);
 #define scanning_global_lru(sc)        (1)
 #endif
 
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
+static void reclaim_unmapped_pages(int priority, struct zone *zone,
+                                               struct scan_control *sc);
+static int unmapped_page_control __read_mostly;
+
+static int __init unmapped_page_control_parm(char *str)
+{
+       unmapped_page_control = 1;
+       /*
+        * XXX: Should we tweak swappiness here?
+        */
+       return 1;
+}
+__setup("unmapped_page_control", unmapped_page_control_parm);
+
+#else /* !CONFIG_UNMAPPED_PAGECACHE_CONTROL */
+static inline void reclaim_unmapped_pages(int priority,
+                               struct zone *zone, struct scan_control *sc)
+{
+       return 0;
+}
+#endif
+
 static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
                                                  struct scan_control *sc)
 {
@@ -2371,6 +2394,12 @@ loop_again:
                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
                                                        &sc, priority, 0);
 
+                       /*
+                        * We do unmapped page reclaim once here and once
+                        * below, so that we don't lose out
+                        */
+                       reclaim_unmapped_pages(priority, zone, &sc);
+
                        if (!zone_watermark_ok_safe(zone, order,
                                        high_wmark_pages(zone), 0, 0)) {
                                end_zone = i;
@@ -2408,6 +2437,11 @@ loop_again:
                                continue;
 
                        sc.nr_scanned = 0;
+                       /*
+                        * Reclaim unmapped pages upfront, this should be
+                        * really cheap
+                        */
+                       reclaim_unmapped_pages(priority, zone, &sc);
 
                        /*
                         * Call soft limit reclaim before calling shrink_zone.
@@ -2721,7 +2755,8 @@ void wakeup_kswapd(struct zone *zone, int order, enum 
zone_type classzone_idx)
        }
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
-       if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+       if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0) &&
+               !should_reclaim_unmapped_pages(zone))
                return;
 
        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
@@ -2874,6 +2909,7 @@ static int __init kswapd_init(void)
 
 module_init(kswapd_init)
 
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) || defined(CONFIG_NUMA)
 /*
  * Zone reclaim mode
  *
@@ -2900,6 +2936,10 @@ int zone_reclaim_mode __read_mostly;
  */
 int sysctl_min_unmapped_ratio = 1;
 
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
+int sysctl_max_unmapped_ratio = 16;
+#endif
+
 /*
  * If the number of slab pages in a zone grows beyond this percentage then
  * slab reclaim needs to occur.
@@ -3094,6 +3134,52 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, 
unsigned int order)
 
        return ret;
 }
+#endif
+
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
+/*
+ * Routine to reclaim unmapped pages, inspired from the code under
+ * CONFIG_NUMA that does unmapped page and slab page control by keeping
+ * min_unmapped_pages in the zone. We currently reclaim just unmapped
+ * pages, slab control will come in soon, at which point this routine
+ * should be called reclaim cached pages
+ */
+void reclaim_unmapped_pages(int priority, struct zone *zone,
+                                               struct scan_control *sc)
+{
+       if (unlikely(unmapped_page_control) &&
+               (zone_unmapped_file_pages(zone) > zone->min_unmapped_pages)) {
+               struct scan_control nsc;
+               unsigned long nr_pages;
+
+               nsc = *sc;
+
+               nsc.swappiness = 0;
+               nsc.may_writepage = 0;
+               nsc.may_unmap = 0;
+               nsc.nr_reclaimed = 0;
+
+               nr_pages = zone_unmapped_file_pages(zone) -
+                               zone->min_unmapped_pages;
+               /*
+                * We don't want to be too aggressive with our
+                * reclaim, it is our best effort to control
+                * unmapped pages
+                */
+               nr_pages >>= 3;
+
+               zone_reclaim_pages(zone, &nsc, nr_pages);
+       }
+}
+
+bool should_reclaim_unmapped_pages(struct zone *zone)
+{
+       if (unlikely(unmapped_page_control) &&
+               (zone_unmapped_file_pages(zone) > zone->max_unmapped_pages))
+               return true;
+       return false;
+}
+#endif
 
 /*
  * page_evictable - test whether a page is evictable

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to