+CC Khalid Aziz who proposed a different approach:
https://lore.kernel.org/linux-mm/[email protected]/T/#u

On 8/16/19 11:43 PM, Nitin Gupta wrote:
> For some applications we need to allocate almost all memory as
> hugepages. However, on a running system, higher-order allocations can
> fail if the memory is fragmented. The Linux kernel currently does
> on-demand compaction as we request more hugepages, but this style of
> compaction incurs very high latency. Experiments with one-time full
> memory compaction (followed by hugepage allocations) show that the
> kernel is able to restore a highly fragmented memory state to a fairly
> compacted state within <1 sec for a 32G system. Such data suggests
> that a more proactive compaction can help us allocate a large fraction
> of memory as hugepages while keeping allocation latencies low.
> 
> For a more proactive compaction, the approach taken here is to define
> per page-order external fragmentation thresholds and let kcompactd
> threads act on these thresholds.
> 
> The low and high thresholds are defined per page-order and exposed
> through sysfs:
> 
>   /sys/kernel/mm/compaction/order-[1..MAX_ORDER]/extfrag_{low,high}
> 
> A per-node kcompactd thread is woken up every few seconds to check if
> any zone on its node has extfrag above the extfrag_high threshold for
> any order, in which case the thread starts compaction in the background
> till all zones are below the extfrag_low level for all orders. By
> default both these thresholds are set to 100 for all orders, which
> essentially disables kcompactd.

Could you define what exactly extfrag is, in the changelog?
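(From the vmstat.c hunk at the end, extfrag for order N appears to be
the percentage of free memory that is currently unusable for an order-N
allocation:

    extfrag(N) = (free_pages - (free_blocks_suitable << N)) * 100
                                                       / free_pages

so, with made-up numbers, a zone with 1000 free base pages of which a
single 512-page order-9 block is contiguous would have extfrag(9) =
(1000 - 512) * 100 / 1000 = 48. If that reading is right, spelling it
out in the changelog would help.)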

> To avoid wasting CPU cycles when compaction cannot help, such as when
> memory is full, we check both extfrag > extfrag_high and
> compaction_suitable(zone). This allows the kcompactd thread to stay
> inactive even when the extfrag thresholds are exceeded.

How does it translate to e.g. the number of free pages of a given order?
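To check my understanding: the thresholds look relative rather than
absolute - extfrag_high = 30 for order 9 would mean "more than 30% of
whatever free memory the zone has is in chunks smaller than 2MB",
regardless of whether that free memory is 100MB or 10GB. The only
absolute gate that seems to remain after this patch is the order-0
watermark check kept in __compaction_suitable(). Is that the intention?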

> This patch is largely based on ideas from Michal Hocko posted here:
> https://lore.kernel.org/linux-mm/[email protected]/
> 
> Testing done (on x86):
>  - Set /sys/kernel/mm/compaction/order-9/extfrag_{low,high} = {25, 30}
>  respectively.
>  - Use a test program to fragment memory: the program allocates all memory
>  and then for each 2M aligned section, frees 3/4 of base pages using
>  munmap.
>  - kcompactd0 detects fragmentation for order-9 > extfrag_high and starts
>  compaction till extfrag < extfrag_low for order-9.
> 
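For reference, a fragmenter along the lines described above could look
roughly like the sketch below (my reconstruction to make sure I read
the test setup correctly, not the actual program used; it takes the
size to cover in GB as an argument rather than grabbing all memory, and
arbitrarily keeps the first 512KB quarter of each 2MB section):

/*
 * Fault in a large anonymous region, then in every 2MB-aligned virtual
 * section keep the first 512KB resident and munmap the remaining 1.5MB,
 * so the freed base pages leave physical memory fragmented.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL << 20)

int main(int argc, char **argv)
{
	unsigned long size = (argc > 1 ? strtoul(argv[1], NULL, 0) : 1UL) << 30;
	unsigned long start, off;
	char *p;

	/* Over-map by 2MB so we can round up to a 2MB-aligned start. */
	p = mmap(NULL, size + HPAGE_SIZE, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	start = ((unsigned long)p + HPAGE_SIZE - 1) & ~(HPAGE_SIZE - 1);

	/* Free 3/4 of the base pages in each 2MB section via munmap. */
	for (off = 0; off + HPAGE_SIZE <= size; off += HPAGE_SIZE)
		if (munmap((char *)(start + off) + HPAGE_SIZE / 4,
			   3 * HPAGE_SIZE / 4))
			perror("munmap");

	pause();	/* keep the remaining quarters mapped */
	return 0;
}

With that run against most of RAM, I assume /proc/buddyinfo (or the new
extfrag numbers) shows the order-9 free counts collapse, which is the
state kcompactd0 is then expected to recover from.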
> The patch has plenty of rough edges but I'm posting it early to see if
> I'm going in the right direction and to get some early feedback.

That's a lot of control knobs - how is an admin supposed to tune them to their
needs?

(keeping the rest for reference)

> Signed-off-by: Nitin Gupta <[email protected]>
> ---
>  include/linux/compaction.h |  12 ++
>  mm/compaction.c            | 250 ++++++++++++++++++++++++++++++-------
>  mm/vmstat.c                |  12 ++
>  3 files changed, 228 insertions(+), 46 deletions(-)
> 
> diff --git a/include/linux/compaction.h b/include/linux/compaction.h
> index 9569e7c786d3..26bfedbbc64b 100644
> --- a/include/linux/compaction.h
> +++ b/include/linux/compaction.h
> @@ -60,6 +60,17 @@ enum compact_result {
>  
>  struct alloc_context; /* in mm/internal.h */
>  
> +// "order-%d"
> +#define COMPACTION_ORDER_STATE_NAME_LEN 16
> +// Per-order compaction state
> +struct compaction_order_state {
> +     unsigned int order;
> +     unsigned int extfrag_low;
> +     unsigned int extfrag_high;
> +     unsigned int extfrag_curr;
> +     char name[COMPACTION_ORDER_STATE_NAME_LEN];
> +};
> +
>  /*
>   * Number of free order-0 pages that should be available above given watermark
>   * to make sure compaction has reasonable chance of not running out of free
> @@ -90,6 +101,7 @@ extern int sysctl_compaction_handler(struct ctl_table *table, int write,
>  extern int sysctl_extfrag_threshold;
>  extern int sysctl_compact_unevictable_allowed;
>  
> +extern int extfrag_for_order(struct zone *zone, unsigned int order);
>  extern int fragmentation_index(struct zone *zone, unsigned int order);
>  extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
>               unsigned int order, unsigned int alloc_flags,
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 952dc2fb24e5..21866b1ad249 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -25,6 +25,10 @@
>  #include <linux/psi.h>
>  #include "internal.h"
>  
> +#ifdef CONFIG_COMPACTION
> +struct compaction_order_state compaction_order_states[MAX_ORDER+1];
> +#endif
> +
>  #ifdef CONFIG_COMPACTION
>  static inline void count_compact_event(enum vm_event_item item)
>  {
> @@ -1846,6 +1850,49 @@ static inline bool is_via_compact_memory(int order)
>       return order == -1;
>  }
>  
> +static int extfrag_wmark_high(struct zone *zone)
> +{
> +     int order;
> +
> +     for (order = 1; order <= MAX_ORDER; order++) {
> +             int extfrag = extfrag_for_order(zone, order);
> +             int threshold = compaction_order_states[order].extfrag_high;
> +
> +             if (extfrag > threshold)
> +                     return order;
> +     }
> +     return 0;
> +}
> +
> +static bool node_should_compact(pg_data_t *pgdat)
> +{
> +     struct zone *zone;
> +
> +     for_each_populated_zone(zone) {
> +             int order = extfrag_wmark_high(zone);
> +
> +             if (order && compaction_suitable(zone, order,
> +                             0, zone_idx(zone)) == COMPACT_CONTINUE) {
> +                     return true;
> +             }
> +     }
> +     return false;
> +}
> +
> +static int extfrag_wmark_low(struct zone *zone)
> +{
> +     int order;
> +
> +     for (order = 1; order <= MAX_ORDER; order++) {
> +             int extfrag = extfrag_for_order(zone, order);
> +             int threshold = compaction_order_states[order].extfrag_low;
> +
> +             if (extfrag > threshold)
> +                     return order;
> +     }
> +     return 0;
> +}
> +
>  static enum compact_result __compact_finished(struct compact_control *cc)
>  {
>       unsigned int order;
> @@ -1872,7 +1919,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
>                       return COMPACT_PARTIAL_SKIPPED;
>       }
>  
> -     if (is_via_compact_memory(cc->order))
> +     if (extfrag_wmark_low(cc->zone))
>               return COMPACT_CONTINUE;
>  
>       /*
> @@ -1962,18 +2009,6 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
>  {
>       unsigned long watermark;
>  
> -     if (is_via_compact_memory(order))
> -             return COMPACT_CONTINUE;
> -
> -     watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
> -     /*
> -      * If watermarks for high-order allocation are already met, there
> -      * should be no need for compaction at all.
> -      */
> -     if (zone_watermark_ok(zone, order, watermark, classzone_idx,
> -                                                             alloc_flags))
> -             return COMPACT_SUCCESS;
> -
>       /*
>        * Watermarks for order-0 must be met for compaction to be able to
>        * isolate free pages for migration targets. This means that the
> @@ -2003,31 +2038,9 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
>                                       int classzone_idx)
>  {
>       enum compact_result ret;
> -     int fragindex;
>  
>       ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
>                                   zone_page_state(zone, NR_FREE_PAGES));
> -     /*
> -      * fragmentation index determines if allocation failures are due to
> -      * low memory or external fragmentation
> -      *
> -      * index of -1000 would imply allocations might succeed depending on
> -      * watermarks, but we already failed the high-order watermark check
> -      * index towards 0 implies failure is due to lack of memory
> -      * index towards 1000 implies failure is due to fragmentation
> -      *
> -      * Only compact if a failure would be due to fragmentation. Also
> -      * ignore fragindex for non-costly orders where the alternative to
> -      * a successful reclaim/compaction is OOM. Fragindex and the
> -      * vm.extfrag_threshold sysctl is meant as a heuristic to prevent
> -      * excessive compaction for costly orders, but it should not be at the
> -      * expense of system stability.
> -      */
> -     if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
> -             fragindex = fragmentation_index(zone, order);
> -             if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
> -                     ret = COMPACT_NOT_SUITABLE_ZONE;
> -     }
>  
>       trace_mm_compaction_suitable(zone, order, ret);
>       if (ret == COMPACT_NOT_SUITABLE_ZONE)
> @@ -2416,7 +2429,6 @@ static void compact_node(int nid)
>               .gfp_mask = GFP_KERNEL,
>       };
>  
> -
>       for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
>  
>               zone = &pgdat->node_zones[zoneid];
> @@ -2493,9 +2505,149 @@ void compaction_unregister_node(struct node *node)
>  }
>  #endif /* CONFIG_SYSFS && CONFIG_NUMA */
>  
> +#ifdef CONFIG_SYSFS
> +
> +#define COMPACTION_ATTR_RO(_name) \
> +     static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> +
> +#define COMPACTION_ATTR(_name) \
> +     static struct kobj_attribute _name##_attr = \
> +             __ATTR(_name, 0644, _name##_show, _name##_store)
> +
> +static struct kobject *compaction_kobj;
> +static struct kobject *compaction_order_kobjs[MAX_ORDER];
> +
> +static struct compaction_order_state *kobj_to_compaction_order_state(
> +                                             struct kobject *kobj)
> +{
> +     int i;
> +
> +     for (i = 1; i <= MAX_ORDER; i++) {
> +             if (compaction_order_kobjs[i] == kobj)
> +                     return &compaction_order_states[i];
> +     }
> +
> +     return NULL;
> +}
> +
> +static ssize_t extfrag_store_common(bool is_low, struct kobject *kobj,
> +             struct kobj_attribute *attr, const char *buf, size_t count)
> +{
> +     int err;
> +     unsigned long input;
> +     struct compaction_order_state *c = kobj_to_compaction_order_state(kobj);
> +
> +     err = kstrtoul(buf, 10, &input);
> +     if (err)
> +             return err;
> +     if (input > 100)
> +             return -EINVAL;
> +
> +     if (is_low)
> +             c->extfrag_low = input;
> +     else
> +             c->extfrag_high = input;
> +
> +     return count;
> +}
> +
> +static ssize_t extfrag_low_show(struct kobject *kobj,
> +             struct kobj_attribute *attr, char *buf)
> +{
> +     struct compaction_order_state *c = kobj_to_compaction_order_state(kobj);
> +
> +     return sprintf(buf, "%u\n", c->extfrag_low);
> +}
> +
> +static ssize_t extfrag_low_store(struct kobject *kobj,
> +             struct kobj_attribute *attr, const char *buf, size_t count)
> +{
> +     return extfrag_store_common(true, kobj, attr, buf, count);
> +}
> +COMPACTION_ATTR(extfrag_low);
> +
> +static ssize_t extfrag_high_show(struct kobject *kobj,
> +                                     struct kobj_attribute *attr, char *buf)
> +{
> +     struct compaction_order_state *c = kobj_to_compaction_order_state(kobj);
> +
> +     return sprintf(buf, "%u\n", c->extfrag_high);
> +}
> +
> +static ssize_t extfrag_high_store(struct kobject *kobj,
> +             struct kobj_attribute *attr, const char *buf, size_t count)
> +{
> +     return extfrag_store_common(false, kobj, attr, buf, count);
> +}
> +COMPACTION_ATTR(extfrag_high);
> +
> +static struct attribute *compaction_order_attrs[] = {
> +     &extfrag_low_attr.attr,
> +     &extfrag_high_attr.attr,
> +     NULL,
> +};
> +
> +static const struct attribute_group compaction_order_attr_group = {
> +     .attrs = compaction_order_attrs,
> +};
> +
> +static int compaction_sysfs_add_order(struct compaction_order_state *c,
> +     struct kobject *parent, struct kobject **compaction_order_kobjs,
> +     const struct attribute_group *compaction_order_attr_group)
> +{
> +     int retval;
> +
> +     compaction_order_kobjs[c->order] =
> +                     kobject_create_and_add(c->name, parent);
> +     if (!compaction_order_kobjs[c->order])
> +             return -ENOMEM;
> +
> +     retval = sysfs_create_group(compaction_order_kobjs[c->order],
> +                             compaction_order_attr_group);
> +     if (retval)
> +             kobject_put(compaction_order_kobjs[c->order]);
> +
> +     return retval;
> +}
> +
> +static void __init compaction_sysfs_init(void)
> +{
> +     struct compaction_order_state *c;
> +     int i, err;
> +
> +     compaction_kobj = kobject_create_and_add("compaction", mm_kobj);
> +     if (!compaction_kobj)
> +             return;
> +
> +     for (i = 1; i <= MAX_ORDER; i++) {
> +             c = &compaction_order_states[i];
> +             err = compaction_sysfs_add_order(c, compaction_kobj,
> +                                     compaction_order_kobjs,
> +                                     &compaction_order_attr_group);
> +             if (err)
> +                     pr_err("compaction: Unable to add state %s", c->name);
> +     }
> +}
> +
> +static void __init compaction_init_order_states(void)
> +{
> +     int i;
> +
> +     for (i = 0; i <= MAX_ORDER; i++) {
> +             struct compaction_order_state *c = &compaction_order_states[i];
> +
> +             c->order = i;
> +             c->extfrag_low = 100;
> +             c->extfrag_high = 100;
> +             snprintf(c->name, COMPACTION_ORDER_STATE_NAME_LEN,
> +                                             "order-%d", i);
> +     }
> +}
> +#endif
> +
>  static inline bool kcompactd_work_requested(pg_data_t *pgdat)
>  {
> -     return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
> +     return kthread_should_stop() || node_should_compact(pgdat);
>  }
>  
>  static bool kcompactd_node_suitable(pg_data_t *pgdat)
> @@ -2527,15 +2679,16 @@ static void kcompactd_do_work(pg_data_t *pgdat)
>       int zoneid;
>       struct zone *zone;
>       struct compact_control cc = {
> -             .order = pgdat->kcompactd_max_order,
> -             .search_order = pgdat->kcompactd_max_order,
> +             .order = -1,
>               .total_migrate_scanned = 0,
>               .total_free_scanned = 0,
> -             .classzone_idx = pgdat->kcompactd_classzone_idx,
> -             .mode = MIGRATE_SYNC_LIGHT,
> -             .ignore_skip_hint = false,
> +             .mode = MIGRATE_SYNC,
> +             .ignore_skip_hint = true,
> +             .whole_zone = false,
>               .gfp_mask = GFP_KERNEL,
> +             .classzone_idx = MAX_NR_ZONES - 1,
>       };
> +
>       trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
>                                                       cc.classzone_idx);
>       count_compact_event(KCOMPACTD_WAKE);
> @@ -2565,7 +2718,6 @@ static void kcompactd_do_work(pg_data_t *pgdat)
>               if (kthread_should_stop())
>                       return;
>               status = compact_zone(&cc, NULL);
> -
>               if (status == COMPACT_SUCCESS) {
>                       compaction_defer_reset(zone, cc.order, false);
>               } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
> @@ -2650,11 +2802,14 @@ static int kcompactd(void *p)
>       pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
>  
>       while (!kthread_should_stop()) {
> -             unsigned long pflags;
> +             unsigned long ret, pflags;
>  
>               trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
> -             wait_event_freezable(pgdat->kcompactd_wait,
> -                             kcompactd_work_requested(pgdat));
> +             ret = wait_event_freezable_timeout(pgdat->kcompactd_wait,
> +                             kcompactd_work_requested(pgdat),
> +                             msecs_to_jiffies(5000));
> +             if (!ret)
> +                     continue;
>  
>               psi_memstall_enter(&pflags);
>               kcompactd_do_work(pgdat);
> @@ -2735,6 +2890,9 @@ static int __init kcompactd_init(void)
>               return ret;
>       }
>  
> +     compaction_init_order_states();
> +     compaction_sysfs_init();
> +
>       for_each_node_state(nid, N_MEMORY)
>               kcompactd_run(nid);
>       return 0;
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index fd7e16ca6996..e9090a5595d1 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1074,6 +1074,18 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
>       return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
>  }
>  
> +int extfrag_for_order(struct zone *zone, unsigned int order)
> +{
> +     struct contig_page_info info;
> +
> +     fill_contig_page_info(zone, order, &info);
> +     if (info.free_pages == 0)
> +             return 0;
> +
> +     return (info.free_pages - (info.free_blocks_suitable << order)) * 100
> +                                                     / info.free_pages;
> +}
> +
>  /* Same as __fragmentation index but allocs contig_page_info on stack */
>  int fragmentation_index(struct zone *zone, unsigned int order)
>  {
> 
