[PATCH 10/10] mm/page_alloc: Embed per_cpu_pages locking within the per-cpu structure

2021-04-19 Thread Mel Gorman
struct per_cpu_pages is currently protected by the pagesets lock but that
lock can be embedded within struct per_cpu_pages itself at a minor cost. This
is possible because per-cpu lookups are based on offsets. Paraphrasing an
explanation from Peter Zijlstra

  The whole thing relies on:

    &per_cpu_ptr(msblk->stream, cpu)->lock == per_cpu_ptr(&msblk->stream->lock, cpu)

  Which is true because the lhs:

    (local_lock_t *)((zone->per_cpu_pages + per_cpu_offset(cpu)) + offsetof(struct per_cpu_pages, lock))

  and the rhs:

    (local_lock_t *)((zone->per_cpu_pages + offsetof(struct per_cpu_pages, lock)) + per_cpu_offset(cpu))

  are identical, because addition is associative.
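
As a quick self-contained illustration of that identity, the userspace
analogue below (the struct, array and stride-based offset are invented for
the example; the real per_cpu_offset() is not a simple array stride) shows
that locating the per-cpu copy first and then taking the field's address
yields the same pointer as offsetting the field's address in the base copy:

/* Userspace sketch of the per-cpu pointer identity; "cpus" are emulated
 * with a flat array and the offset is a plain stride -- only the
 * additivity of the two offsets matters here.
 */
#include <assert.h>
#include <stdio.h>

struct pcp {                    /* stand-in for struct per_cpu_pages */
    int lock;                   /* stand-in for local_lock_t */
    int count;
};

#define NR_CPUS 4
static struct pcp area[NR_CPUS];

static void *pcpu_ptr(void *base, int cpu)  /* stand-in for per_cpu_ptr() */
{
    return (char *)base + (long)cpu * sizeof(struct pcp);
}

int main(void)
{
    struct pcp *base = &area[0];

    for (int cpu = 0; cpu < NR_CPUS; cpu++) {
        /* lhs: locate the cpu's structure, then take ->lock */
        int *lhs = &((struct pcp *)pcpu_ptr(base, cpu))->lock;
        /* rhs: take ->lock in the base copy, then apply the offset */
        int *rhs = pcpu_ptr(&base->lock, cpu);

        assert(lhs == rhs);     /* addition is associative */
    }
    printf("identity holds for %d CPUs\n", NR_CPUS);
    return 0;
}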

More details are included in mmzone.h. This embedding is not completely
free for three reasons.

1. As local_lock does not return a per-cpu structure, the PCP has to
   be looked up twice -- first to acquire the lock and again to get the
   PCP pointer.

2. For PREEMPT_RT and CONFIG_DEBUG_LOCK_ALLOC, local_lock is potentially
   a spinlock or has lock-specific tracking. In both cases, it becomes
   necessary to release/acquire different locks when freeing a list of
   pages in free_unref_page_list (see the sketch after this list).

3. For most kernel configurations, local_lock_t is empty and no storage is
   required. By embedding the lock, the memory consumption on PREEMPT_RT
   and CONFIG_DEBUG_LOCK_ALLOC is higher.

Suggested-by: Peter Zijlstra 
Signed-off-by: Mel Gorman 
---
 include/linux/mmzone.h | 31 -
 mm/page_alloc.c| 78 +++---
 2 files changed, 81 insertions(+), 28 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d3b8889b76ab..a1f458f939db 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -338,8 +338,37 @@ enum zone_watermarks {
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
-/* Fields and list protected by pagesets local_lock in page_alloc.c */
+/*
+ * struct per_cpu_pages is a per-cpu structure protected by a lock
+ * embedded within the structure. This is subtle because ordinarily
+ * that would imply the PCP structure must first be located when
+ * the task is not yet pinned to the CPU e.g.
+ *
+ *   migrate_disable();
+ *   pcp = this_cpu_ptr(zone->per_cpu_pageset);
+ *   local_lock_irqsave(&pcp->lock, flags);
+ *
+ * However, because per-cpu addresses are calculated based on offsets,
+ * the following is true
+ *
+ *   &per_cpu_ptr(zone->per_cpu_pageset, cpu)->lock == per_cpu_ptr(&zone->per_cpu_pageset->lock, cpu)
+ *
+ * This is true because the LHS is
+ *
+ *   (local_lock_t *)((zone->per_cpu_pageset + per_cpu_offset(cpu)) + offsetof(struct per_cpu_pages, lock))
+ *
+ * while the RHS is
+ *
+ *   (local_lock_t *)((zone->per_cpu_pageset + offsetof(struct per_cpu_pages, lock)) + per_cpu_offset(cpu))
+ *
+ * local_lock will first disable preempt, migration or IRQs depending on
+ * the kernel configuration before the per_cpu_offset is calculated.
+ * Hence, the following is safe for both PREEMPT_RT and !PREEMPT_RT.
+ *
+ *   local_lock_irqsave(&zone->per_cpu_pageset->lock, flags);
+ */
 struct per_cpu_pages {
+   local_lock_t lock;
int count;  /* number of pages in the list */
int high;   /* high watermark, emptying needed */
int batch;  /* chunk size for buddy add/remove */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 44a5eb067b38..52ce688b6c73 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -112,13 +112,6 @@ typedef int __bitwise fpi_t;
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION   (8)
 
-struct pagesets {
-   local_lock_t lock;
-};
-static DEFINE_PER_CPU(struct pagesets, pagesets) = {
-   .lock = INIT_LOCAL_LOCK(lock),
-};
-
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -2985,12 +2978,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
unsigned long flags;
int to_drain, batch;
 
-   local_lock_irqsave(&pagesets.lock, flags);
+   local_lock_irqsave(&pcp->lock, flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
free_pcppages_bulk(zone, to_drain, pcp);
-   local_unlock_irqrestore(&pagesets.lock, flags);
+   local_unlock_irqrestore(&pcp->lock, flags);
 }
 #endif
 
@@ -3006,13 +2999,13 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
unsigned long flags;
struct per_cpu_pages *pcp;
 
-   local_lock_irqsave(&pagesets.lock, flags);
+   local_lock_irqsave(&zone->per_cpu_pageset->lock, flags);
 
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)
free_pcppages_bulk(zone, pcp->count, pcp);
 
-   local_unlock_irqrestore(&pagesets.lock, flags);
+   local_unlock_irqrestore(&zone->per_cpu_pageset->lock, flags);

[PATCH 09/10] mm/page_alloc: Update PGFREE outside the zone lock in __free_pages_ok

2021-04-19 Thread Mel Gorman
VM events do not need explicit protection by disabling IRQs so
update the counter with IRQs enabled in __free_pages_ok.

Signed-off-by: Mel Gorman 
Acked-by: Vlastimil Babka 
---
 mm/page_alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a9c1282d9c7b..44a5eb067b38 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1557,13 +1557,14 @@ static void __free_pages_ok(struct page *page, unsigned int order,
migratetype = get_pfnblock_migratetype(page, pfn);
 
spin_lock_irqsave(&zone->lock, flags);
-   __count_vm_events(PGFREE, 1 << order);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);
+
+   __count_vm_events(PGFREE, 1 << order);
 }
 
 void __free_pages_core(struct page *page, unsigned int order)
-- 
2.26.2



[PATCH 08/10] mm/page_alloc: Avoid conflating IRQs disabled with zone->lock

2021-04-19 Thread Mel Gorman
Historically when freeing pages, free_one_page() assumed that callers
had IRQs disabled and the zone->lock could be acquired with spin_lock().
This confuses the scope of what local_lock_irq is protecting and what
zone->lock is protecting in free_unref_page_list in particular.

This patch uses spin_lock_irqsave() for the zone->lock in
free_one_page() instead of relying on callers to have disabled
IRQs. free_unref_page_commit() is changed to only deal with PCP pages
protected by the local lock. free_unref_page_list() then first frees
isolated pages to the buddy lists with free_one_page() and frees the rest
of the pages to the PCP via free_unref_page_commit(). The end result
is that free_one_page() no longer depends on the side-effects of
local_lock to be correct.

Note that this may incur a performance penalty while memory hot-remove
is running but that is not a common operation.
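
A rough sketch of the resulting free_unref_page_list() flow follows
(illustrative only; details such as periodically dropping the lock for very
long lists are omitted): isolated pages are pulled out and freed under
zone->lock inside free_one_page(), everything else goes to the PCP under
the local lock.

    /* Pass 1: filter; isolated pages go straight to the buddy lists */
    list_for_each_entry_safe(page, next, list, lru) {
        pfn = page_to_pfn(page);
        if (!free_unref_page_prepare(page, pfn)) {
            list_del(&page->lru);
            continue;
        }

        migratetype = get_pcppage_migratetype(page);
        if (unlikely(is_migrate_isolate(migratetype))) {
            /* zone->lock is taken inside free_one_page() now */
            free_one_page(page_zone(page), page, pfn, 0,
                          migratetype, FPI_NONE);
            list_del(&page->lru);
        }
    }

    /* Pass 2: the remainder only touches PCP state under the local lock */
    local_lock_irqsave(&pagesets.lock, flags);
    list_for_each_entry_safe(page, next, list, lru)
        free_unref_page_commit(page, page_to_pfn(page),
                               get_pcppage_migratetype(page));
    local_unlock_irqrestore(&pagesets.lock, flags);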

Signed-off-by: Mel Gorman 
Acked-by: Vlastimil Babka 
---
 mm/page_alloc.c | 68 ++---
 1 file changed, 42 insertions(+), 26 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c6e8da942905..a9c1282d9c7b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1458,13 +1458,15 @@ static void free_one_page(struct zone *zone,
unsigned int order,
int migratetype, fpi_t fpi_flags)
 {
-   spin_lock(&zone->lock);
+   unsigned long flags;
+
+   spin_lock_irqsave(&zone->lock, flags);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
-   spin_unlock(&zone->lock);
+   spin_unlock_irqrestore(&zone->lock, flags);
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -3229,31 +3231,13 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
return true;
 }
 
-static void free_unref_page_commit(struct page *page, unsigned long pfn)
+static void free_unref_page_commit(struct page *page, unsigned long pfn,
+  int migratetype)
 {
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
-   int migratetype;
 
-   migratetype = get_pcppage_migratetype(page);
__count_vm_event(PGFREE);
-
-   /*
-* We only track unmovable, reclaimable and movable on pcp lists.
-* Free ISOLATE pages back to the allocator because they are being
-* offlined but treat HIGHATOMIC as movable pages so we can get those
-* areas back if necessary. Otherwise, we may have to free
-* excessively into the page allocator
-*/
-   if (migratetype >= MIGRATE_PCPTYPES) {
-   if (unlikely(is_migrate_isolate(migratetype))) {
-   free_one_page(zone, page, pfn, 0, migratetype,
- FPI_NONE);
-   return;
-   }
-   migratetype = MIGRATE_MOVABLE;
-   }
-
pcp = this_cpu_ptr(zone->per_cpu_pageset);
list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
@@ -3268,12 +3252,29 @@ void free_unref_page(struct page *page)
 {
unsigned long flags;
unsigned long pfn = page_to_pfn(page);
+   int migratetype;
 
if (!free_unref_page_prepare(page, pfn))
return;
 
+   /*
+* We only track unmovable, reclaimable and movable on pcp lists.
+* Place ISOLATE pages on the isolated list because they are being
+* offlined but treat HIGHATOMIC as movable pages so we can get those
+* areas back if necessary. Otherwise, we may have to free
+* excessively into the page allocator
+*/
+   migratetype = get_pcppage_migratetype(page);
+   if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
+   if (unlikely(is_migrate_isolate(migratetype))) {
+   free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
+   return;
+   }
+   migratetype = MIGRATE_MOVABLE;
+   }
+
local_lock_irqsave(&pagesets.lock, flags);
-   free_unref_page_commit(page, pfn);
+   free_unref_page_commit(page, pfn, migratetype);
local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
@@ -3285,22 +3286,37 @@ void free_unref_page_list(struct list_head *list)
struct page *page, *next;
unsigned long flags, pfn;
int batch_count = 0;
+   int migratetype;
 
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
pfn = page_to_pfn(page);
if (!free_unref_page_prepare(page, pfn))
list_del(&page->lru);
+
+   /*
+* Free isolated pages directly to the allocator, see
+  

[PATCH 07/10] mm/page_alloc: Explicitly acquire the zone lock in __free_pages_ok

2021-04-19 Thread Mel Gorman
__free_pages_ok() disables IRQs before calling a common helper
free_one_page() that acquires the zone lock. This is not safe according
to Documentation/locking/locktypes.rst and, in this context, IRQ disabling
is not protecting a per_cpu_pages structure either, otherwise a local_lock
would be used.

This patch explicitly acquires the lock with spin_lock_irqsave instead of
relying on a helper. This removes the last instance of local_irq_save()
in page_alloc.c.

Signed-off-by: Mel Gorman 
Acked-by: Vlastimil Babka 
---
 mm/page_alloc.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 295624fe293b..c6e8da942905 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1547,21 +1547,21 @@ static void __free_pages_ok(struct page *page, unsigned int order,
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
+   struct zone *zone = page_zone(page);
 
if (!free_pages_prepare(page, order, true))
return;
 
migratetype = get_pfnblock_migratetype(page, pfn);
 
-   /*
-* TODO FIX: Disable IRQs before acquiring IRQ-safe zone->lock
-* and protect vmstat updates.
-*/
-   local_irq_save(flags);
+   spin_lock_irqsave(&zone->lock, flags);
__count_vm_events(PGFREE, 1 << order);
-   free_one_page(page_zone(page), page, pfn, order, migratetype,
- fpi_flags);
-   local_irq_restore(flags);
+   if (unlikely(has_isolate_pageblock(zone) ||
+   is_migrate_isolate(migratetype))) {
+   migratetype = get_pfnblock_migratetype(page, pfn);
+   }
+   __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
+   spin_unlock_irqrestore(&zone->lock, flags);
 }
 
 void __free_pages_core(struct page *page, unsigned int order)
-- 
2.26.2



[PATCH 06/10] mm/page_alloc: Reduce duration that IRQs are disabled for VM counters

2021-04-19 Thread Mel Gorman
IRQs are left disabled for the zone and node VM event counters. This is
unnecessary as the affected counters are allowed to race with both
preemption and IRQs.

This patch reduces the scope of IRQs being disabled
via local_[lock|unlock]_irq on !PREEMPT_RT kernels. One
__mod_zone_freepage_state is still called with IRQs disabled. While this
could be moved out, it's not free on all architectures as some require
IRQs to be disabled for mod_zone_page_state on !PREEMPT_RT kernels.

Signed-off-by: Mel Gorman 
Acked-by: Vlastimil Babka 
---
 mm/page_alloc.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cff0f1c98b28..295624fe293b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3474,11 +3474,11 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
pcp = this_cpu_ptr(zone->per_cpu_pageset);
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
+   local_unlock_irqrestore(&pagesets.lock, flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone, 1);
}
-   local_unlock_irqrestore(&pagesets.lock, flags);
return page;
 }
 
@@ -3530,15 +3530,15 @@ struct page *rmqueue(struct zone *preferred_zone,
if (!page)
page = __rmqueue(zone, order, migratetype, alloc_flags);
} while (page && check_new_pages(page, order));
-   spin_unlock(&zone->lock);
if (!page)
goto failed;
+
__mod_zone_freepage_state(zone, -(1 << order),
  get_pcppage_migratetype(page));
+   spin_unlock_irqrestore(&zone->lock, flags);
 
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
-   local_irq_restore(flags);
 
 out:
/* Separate test+clear to avoid unnecessary atomics */
@@ -3551,7 +3551,7 @@ struct page *rmqueue(struct zone *preferred_zone,
return page;
 
 failed:
-   local_irq_restore(flags);
+   spin_unlock_irqrestore(&zone->lock, flags);
return NULL;
 }
 
@@ -5103,11 +5103,11 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
nr_populated++;
}
 
+   local_unlock_irqrestore(&pagesets.lock, flags);
+
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
 
-   local_unlock_irqrestore(&pagesets.lock, flags);
-
return nr_populated;
 
 failed_irq:
-- 
2.26.2



[PATCH 05/10] mm/page_alloc: Batch the accounting updates in the bulk allocator

2021-04-19 Thread Mel Gorman
Now that the zone_statistics are simple counters that do not require
special protection, the bulk allocator accounting updates can be batched
without adding much complexity and without needing protected RMW updates
or xchg.

Signed-off-by: Mel Gorman 
Acked-by: Vlastimil Babka 
---
 include/linux/vmstat.h |  8 
 mm/page_alloc.c| 30 +-
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index c1d2c316ce7d..9bf194d507e7 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -247,6 +247,14 @@ __count_numa_event(struct zone *zone, enum numa_stat_item item)
raw_cpu_inc(pzstats->vm_numa_event[item]);
 }
 
+static inline void
+__count_numa_events(struct zone *zone, enum numa_stat_item item, long delta)
+{
+   struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+
+   raw_cpu_add(pzstats->vm_numa_event[item], delta);
+}
+
 extern unsigned long sum_zone_node_page_state(int node,
  enum zone_stat_item item);
 extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item 
item);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9d0f047647e3..cff0f1c98b28 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3411,7 +3411,8 @@ void __putback_isolated_page(struct page *page, unsigned 
int order, int mt)
  *
  * Must be called with interrupts disabled.
  */
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+  long nr_account)
 {
 #ifdef CONFIG_NUMA
enum numa_stat_item local_stat = NUMA_LOCAL;
@@ -3424,12 +3425,12 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
local_stat = NUMA_OTHER;
 
if (zone_to_nid(z) == zone_to_nid(preferred_zone))
-   __count_numa_event(z, NUMA_HIT);
+   __count_numa_events(z, NUMA_HIT, nr_account);
else {
-   __count_numa_event(z, NUMA_MISS);
-   __count_numa_event(preferred_zone, NUMA_FOREIGN);
+   __count_numa_events(z, NUMA_MISS, nr_account);
+   __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
}
-   __count_numa_event(z, local_stat);
+   __count_numa_events(z, local_stat, nr_account);
 #endif
 }
 
@@ -3475,7 +3476,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
-   zone_statistics(preferred_zone, zone);
+   zone_statistics(preferred_zone, zone, 1);
}
local_unlock_irqrestore(&pagesets.lock, flags);
return page;
@@ -3536,7 +3537,7 @@ struct page *rmqueue(struct zone *preferred_zone,
  get_pcppage_migratetype(page));
 
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-   zone_statistics(preferred_zone, zone);
+   zone_statistics(preferred_zone, zone, 1);
local_irq_restore(flags);
 
 out:
@@ -5019,7 +5020,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
struct alloc_context ac;
gfp_t alloc_gfp;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
-   int nr_populated = 0;
+   int nr_populated = 0, nr_account = 0;
 
if (unlikely(nr_pages <= 0))
return 0;
@@ -5092,15 +5093,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
goto failed_irq;
break;
}
-
-   /*
-* Ideally this would be batched but the best way to do
-* that cheaply is to first convert zone_statistics to
-* be inaccurate per-cpu counter like vm_events to avoid
-* a RMW cycle then do the accounting with IRQs enabled.
-*/
-   __count_zid_vm_events(PGALLOC, zone_idx(zone), 1);
-   zone_statistics(ac.preferred_zoneref->zone, zone);
+   nr_account++;
 
prep_new_page(page, 0, gfp, 0);
if (page_list)
@@ -5110,6 +5103,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
nr_populated++;
}
 
+   __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
+   zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
+
local_unlock_irqrestore(&pagesets.lock, flags);
 
return nr_populated;
-- 
2.26.2



[PATCH 04/10] mm/vmstat: Inline NUMA event counter updates

2021-04-19 Thread Mel Gorman
__count_numa_event is small enough to be treated similarly to
__count_vm_event so inline it.

Signed-off-by: Mel Gorman 
Acked-by: Vlastimil Babka 
---
 include/linux/vmstat.h | 10 +-
 mm/vmstat.c|  9 -
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index b120c58cae82..c1d2c316ce7d 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -238,7 +238,15 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 }
 
 #ifdef CONFIG_NUMA
-extern void __count_numa_event(struct zone *zone, enum numa_stat_item item);
+/* See __count_vm_event comment on why raw_cpu_inc is used. */
+static inline void
+__count_numa_event(struct zone *zone, enum numa_stat_item item)
+{
+   struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+
+   raw_cpu_inc(pzstats->vm_numa_event[item]);
+}
+
 extern unsigned long sum_zone_node_page_state(int node,
  enum zone_stat_item item);
 extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item 
item);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 97558cb607ac..4e6c474a3cd6 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -935,15 +935,6 @@ void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
 #endif
 
 #ifdef CONFIG_NUMA
-/* See __count_vm_event comment on why raw_cpu_inc is used. */
-void __count_numa_event(struct zone *zone,
-enum numa_stat_item item)
-{
-   struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
-
-   raw_cpu_inc(pzstats->vm_numa_event[item]);
-}
-
 /*
  * Determine the per node value of a stat item. This function
  * is called frequently in a NUMA machine, so try to be as
-- 
2.26.2



[PATCH 03/10] mm/vmstat: Convert NUMA statistics to basic NUMA counters

2021-04-19 Thread Mel Gorman
NUMA statistics are maintained on the zone level for hits, misses, foreign
etc but nothing relies on them being perfectly accurate for functional
correctness. The counters are used by userspace to get a general overview
of a workload's NUMA behaviour but the page allocator incurs a high cost to
maintain perfect accuracy similar to what is required for a vmstat like
NR_FREE_PAGES. There is even a sysctl, vm.numa_stat, to allow userspace to
turn off the collection of NUMA statistics like NUMA_HIT.

This patch converts NUMA_HIT and friends to be NUMA events with similar
accuracy to VM events. There is a possibility that slight errors will be
introduced but the overall trend as seen by userspace will be similar.
The counters are no longer updated from vmstat_refresh context as it is
unnecessary overhead for counters that may never be read by userspace.
Note that counters could be maintained at the node level to save space
but it would have a user-visible impact due to /proc/zoneinfo.
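
The trade-off can be seen in a small self-contained analogue (plain C with
pthreads; all names here are invented for the illustration, this is not
kernel code): each "cpu" bumps its own counter with an ordinary increment
and a reader simply sums whatever it sees, so a concurrent read may be
slightly stale but the fast path needs no atomics or IRQ disabling.

#include <pthread.h>
#include <stdio.h>

#define NR_THREADS 4
#define ITERS      1000000UL

/* One counter per thread, padded so the hot increments never share a line */
static struct {
    unsigned long hit;
    char pad[64 - sizeof(unsigned long)];
} counters[NR_THREADS];

static void *worker(void *arg)
{
    long self = (long)arg;

    for (unsigned long i = 0; i < ITERS; i++)
        counters[self].hit++;   /* plain increment: no lock, no atomic */
    return NULL;
}

/* Fold on demand, like reading the stats file: sum whatever is there now */
static unsigned long fold_hits(void)
{
    unsigned long sum = 0;

    for (int i = 0; i < NR_THREADS; i++)
        sum += counters[i].hit;
    return sum;
}

int main(void)
{
    pthread_t tid[NR_THREADS];

    for (long i = 0; i < NR_THREADS; i++)
        pthread_create(&tid[i], NULL, worker, (void *)i);

    /* A read here races with the writers and may miss recent updates,
     * which is the accepted cost of the cheap update path. */
    printf("in flight: %lu\n", fold_hits());

    for (int i = 0; i < NR_THREADS; i++)
        pthread_join(tid[i], NULL);
    printf("final: %lu (expected %lu)\n", fold_hits(), NR_THREADS * ITERS);
    return 0;
}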

Signed-off-by: Mel Gorman 
---
 drivers/base/node.c|  18 ++--
 include/linux/mmzone.h |  13 ++-
 include/linux/vmstat.h |  43 +-
 mm/mempolicy.c |   2 +-
 mm/page_alloc.c|  12 +--
 mm/vmstat.c| 184 +++--
 6 files changed, 113 insertions(+), 159 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index f449dbb2c746..443a609db428 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -484,6 +484,7 @@ static DEVICE_ATTR(meminfo, 0444, node_read_meminfo, NULL);
 static ssize_t node_read_numastat(struct device *dev,
  struct device_attribute *attr, char *buf)
 {
+   fold_vm_numa_events();
return sysfs_emit(buf,
  "numa_hit %lu\n"
  "numa_miss %lu\n"
@@ -491,12 +492,12 @@ static ssize_t node_read_numastat(struct device *dev,
  "interleave_hit %lu\n"
  "local_node %lu\n"
  "other_node %lu\n",
- sum_zone_numa_state(dev->id, NUMA_HIT),
- sum_zone_numa_state(dev->id, NUMA_MISS),
- sum_zone_numa_state(dev->id, NUMA_FOREIGN),
- sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
- sum_zone_numa_state(dev->id, NUMA_LOCAL),
- sum_zone_numa_state(dev->id, NUMA_OTHER));
+ sum_zone_numa_event_state(dev->id, NUMA_HIT),
+ sum_zone_numa_event_state(dev->id, NUMA_MISS),
+ sum_zone_numa_event_state(dev->id, NUMA_FOREIGN),
+ sum_zone_numa_event_state(dev->id, NUMA_INTERLEAVE_HIT),
+ sum_zone_numa_event_state(dev->id, NUMA_LOCAL),
+ sum_zone_numa_event_state(dev->id, NUMA_OTHER));
 }
 static DEVICE_ATTR(numastat, 0444, node_read_numastat, NULL);
 
@@ -514,10 +515,11 @@ static ssize_t node_read_vmstat(struct device *dev,
 sum_zone_node_page_state(nid, i));
 
 #ifdef CONFIG_NUMA
-   for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+   fold_vm_numa_events();
+   for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
len += sysfs_emit_at(buf, len, "%s %lu\n",
 numa_stat_name(i),
-sum_zone_numa_state(nid, i));
+sum_zone_numa_event_state(nid, i));
 
 #endif
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 106da8fbc72a..d3b8889b76ab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -135,10 +135,10 @@ enum numa_stat_item {
NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
NUMA_LOCAL, /* allocation from local node */
NUMA_OTHER, /* allocation from other node */
-   NR_VM_NUMA_STAT_ITEMS
+   NR_VM_NUMA_EVENT_ITEMS
 };
 #else
-#define NR_VM_NUMA_STAT_ITEMS 0
+#define NR_VM_NUMA_EVENT_ITEMS 0
 #endif
 
 enum zone_stat_item {
@@ -357,7 +357,12 @@ struct per_cpu_zonestat {
s8 stat_threshold;
 #endif
 #ifdef CONFIG_NUMA
-   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
+   /*
+* Low priority inaccurate counters that are only folded
+* on demand. Use a large type to avoid the overhead of
+* folding during refresh_cpu_vm_stats.
+*/
+   unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
 #endif
 };
 
@@ -609,7 +614,7 @@ struct zone {
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t   vm_stat[NR_VM_ZONE_STAT_ITEMS];
-   atomic_long_t   vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
+   atomic_long_t   vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];

[PATCH 02/10] mm/page_alloc: Convert per-cpu list protection to local_lock

2021-04-19 Thread Mel Gorman
There is a lack of clarity of what exactly local_irq_save/local_irq_restore
protects in page_alloc.c. It conflates the protection of per-cpu page
allocation structures with per-cpu vmstat deltas.

This patch protects the PCP structure using local_lock which for most
configurations is identical to IRQ enabling/disabling. The scope of the
lock is still wider than it should be but this is decreased later.

It is possible for the local_lock to be embedded safely within struct
per_cpu_pages but it adds complexity to free_unref_page_list so it is
implemented as a separate patch later in the series.
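
For readers unfamiliar with local_lock, a simplified mental model of what
the operations used here resolve to follows (paraphrased behaviour only,
not the verbatim definitions from include/linux/local_lock.h):

/* Simplified mental model only; these are not the real definitions.
 *
 * !PREEMPT_RT and no lock debugging: the lock occupies no storage and
 * acquire/release reduce to IRQ control on the local CPU.
 */
typedef struct { } local_lock_t;
#define local_lock_irqsave(l, flags)        local_irq_save(flags)
#define local_unlock_irqrestore(l, flags)   local_irq_restore(flags)

/* PREEMPT_RT: local_lock_t carries a per-CPU spinlock.  Acquiring it
 * disables migration and takes the current CPU's lock while IRQs stay
 * enabled, so code protected this way must not rely on hard IRQ
 * disabling for correctness.
 */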

[l...@intel.com: Make pagesets static]
Signed-off-by: Mel Gorman 
---
 include/linux/mmzone.h |  2 ++
 mm/page_alloc.c| 50 +-
 2 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a4393ac27336..106da8fbc72a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /* Free memory management - zoned buddy allocator.  */
@@ -337,6 +338,7 @@ enum zone_watermarks {
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
+/* Fields and list protected by pagesets local_lock in page_alloc.c */
 struct per_cpu_pages {
int count;  /* number of pages in the list */
int high;   /* high watermark, emptying needed */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2d6283cab22d..4e92d43c25f6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -112,6 +112,13 @@ typedef int __bitwise fpi_t;
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION   (8)
 
+struct pagesets {
+   local_lock_t lock;
+};
+static DEFINE_PER_CPU(struct pagesets, pagesets) = {
+   .lock = INIT_LOCAL_LOCK(lock),
+};
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -1421,6 +1428,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
} while (--count && --batch_free && !list_empty(list));
}
 
+   /*
+* local_lock_irq held so equivalent to spin_lock_irqsave for
+* both PREEMPT_RT and non-PREEMPT_RT configurations.
+*/
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
 
@@ -1541,6 +1552,11 @@ static void __free_pages_ok(struct page *page, unsigned int order,
return;
 
migratetype = get_pfnblock_migratetype(page, pfn);
+
+   /*
+* TODO FIX: Disable IRQs before acquiring IRQ-safe zone->lock
+* and protect vmstat updates.
+*/
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, pfn, order, migratetype,
@@ -2910,6 +2926,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 {
int i, allocated = 0;
 
+   /*
+* local_lock_irq held so equivalent to spin_lock_irqsave for
+* both PREEMPT_RT and non-PREEMPT_RT configurations.
+*/
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
@@ -2962,12 +2982,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
unsigned long flags;
int to_drain, batch;
 
-   local_irq_save(flags);
+   local_lock_irqsave(&pagesets.lock, flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
free_pcppages_bulk(zone, to_drain, pcp);
-   local_irq_restore(flags);
+   local_unlock_irqrestore(&pagesets.lock, flags);
 }
 #endif
 
@@ -2983,13 +3003,13 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
unsigned long flags;
struct per_cpu_pages *pcp;
 
-   local_irq_save(flags);
+   local_lock_irqsave(&pagesets.lock, flags);
 
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)
free_pcppages_bulk(zone, pcp->count, pcp);
 
-   local_irq_restore(flags);
+   local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3252,9 +3272,9 @@ void free_unref_page(struct page *page)
if (!free_unref_page_prepare(page, pfn))
return;
 
-   local_irq_save(flags);
+   local_lock_irqsave(&pagesets.lock, flags);
free_unref_page_commit(page, pfn);
-   local_irq_restore(flags);
+   local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3274,7 +3294,7 @@ void free_unref_page_list(struct list_head *list)
set_page_private(page, pfn);
}
 
-   local_irq_save(flags);
+   local_lock_irqsave(&pagesets.lock, flags);
list_for_each_entry_safe(page, next, list, lru) {
unsigned long pfn = page_private(page);
 
@@ -3287

[PATCH 01/10] mm/page_alloc: Split per cpu page lists and zone stats

2021-04-19 Thread Mel Gorman
The per-cpu page allocator lists and the per-cpu vmstat deltas are stored
in the same struct per_cpu_pageset even though vmstats have no direct impact
on the per-cpu page lists. This is inconsistent because the vmstats for a
node are stored on a dedicated structure. The bigger issue is that the
per_cpu_pages structure is not cache-aligned and stat updates either
cache-conflict with the adjacent per-cpu lists, incurring a runtime cost,
or require padding, incurring a memory cost.

This patch splits the per-cpu pagelists and the vmstat deltas into separate
structures. It's mostly a mechanical conversion but some variable renaming
is done to clearly distinguish the per-cpu pages structure (pcp) from
the vmstats (pzstats).

Superficially, this appears to increase the size of the per_cpu_pages
structure but the movement of expire fills a structure hole so there is
no impact overall.

[l...@intel.com: Check struct per_cpu_zonestat has a non-zero size]
[vba...@suse.cz: Init zone->per_cpu_zonestats properly]
Signed-off-by: Mel Gorman 
---
 include/linux/mmzone.h | 18 
 include/linux/vmstat.h |  8 ++--
 mm/page_alloc.c| 85 -
 mm/vmstat.c| 96 ++
 4 files changed, 111 insertions(+), 96 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 47946cec7584..a4393ac27336 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -341,20 +341,21 @@ struct per_cpu_pages {
int count;  /* number of pages in the list */
int high;   /* high watermark, emptying needed */
int batch;  /* chunk size for buddy add/remove */
+#ifdef CONFIG_NUMA
+   int expire; /* When 0, remote pagesets are drained */
+#endif
 
/* Lists of pages, one per migrate type stored on the pcp-lists */
struct list_head lists[MIGRATE_PCPTYPES];
 };
 
-struct per_cpu_pageset {
-   struct per_cpu_pages pcp;
-#ifdef CONFIG_NUMA
-   s8 expire;
-   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
-#endif
+struct per_cpu_zonestat {
 #ifdef CONFIG_SMP
-   s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
+   s8 stat_threshold;
+#endif
+#ifdef CONFIG_NUMA
+   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
 #endif
 };
 
@@ -470,7 +471,8 @@ struct zone {
int node;
 #endif
struct pglist_data  *zone_pgdat;
-   struct per_cpu_pageset __percpu *pageset;
+   struct per_cpu_pages__percpu *per_cpu_pageset;
+   struct per_cpu_zonestat __percpu *per_cpu_zonestats;
/*
 * the high and batch values are copied to individual pagesets for
 * faster access
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 506d625163a1..1736ea9d24a7 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -163,7 +163,7 @@ static inline unsigned long zone_numa_state_snapshot(struct zone *zone,
int cpu;
 
for_each_online_cpu(cpu)
-   x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item];
+   x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item];
 
return x;
 }
@@ -236,7 +236,7 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 #ifdef CONFIG_SMP
int cpu;
for_each_online_cpu(cpu)
-   x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
+   x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_stat_diff[item];
 
if (x < 0)
x = 0;
@@ -291,7 +291,7 @@ struct ctl_table;
 int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp,
loff_t *ppos);
 
-void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
+void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *);
 
 int calculate_pressure_threshold(struct zone *zone);
 int calculate_normal_threshold(struct zone *zone);
@@ -399,7 +399,7 @@ static inline void cpu_vm_stats_fold(int cpu) { }
 static inline void quiet_vmstat(void) { }
 
 static inline void drain_zonestat(struct zone *zone,
-   struct per_cpu_pageset *pset) { }
+   struct per_cpu_zonestat *pzstats) { }
 #endif /* CONFIG_SMP */
 
 static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9bf0db982f14..2d6283cab22d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2981,15 +2981,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
unsigned long flags;
-   struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
 
local_irq_save(flags);
-   pset = per_cpu_ptr(zone->pageset, cpu);
 
-   pcp = &pset->pcp;
+   pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);

[PATCH 00/10 v4] Use local_lock for pcp protection and reduce stat overhead

2021-04-19 Thread Mel Gorman
Some Acks from RT people are still missing that I'd like to have before
trying to merge this via Andrew's tree and there is an open question as to
whether the last patch in this series is worthwhile. It embeds local_lock
within the per_cpu_pages structure to clarify the scope but it increases
complexity and storage costs that may not be worthwhile. I don't think
it is but it was asked whether the lock could be safely embedded or not.

Changelog since v3
o Preserve NUMA_* counters after CPU hotplug
o Drop "mm/page_alloc: Remove duplicate checks if migratetype should be isolated"
o Add micro-optimisation tracking PFN during free_unref_page_list
o Add Acks

Changelog since v2
o Fix zonestats initialisation
o Merged memory hotplug fix separately
o Embed local_lock within per_cpu_pages

This series requires patches in Andrew's tree so for convenience, it's
also available at

git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux.git mm-percpu-local_lock-v4r4

The PCP (per-cpu page allocator in page_alloc.c) shares locking
requirements with vmstat and the zone lock which is inconvenient and
causes some issues. For example, the PCP list and vmstat share the
same per-cpu space meaning that it's possible that vmstat updates dirty
cache lines holding per-cpu lists across CPUs unless padding is used.
Second, PREEMPT_RT does not want to disable IRQs for too long in the
page allocator.

This series splits the locking requirements and uses locks types more
suitable for PREEMPT_RT, reduces the time when special locking is required
for stats and reduces the time when IRQs need to be disabled on !PREEMPT_RT
kernels.

Why local_lock? PREEMPT_RT considers the following sequence to be unsafe
as documented in Documentation/locking/locktypes.rst

   local_irq_disable();
   spin_lock(&lock);

The pcp allocator has this sequence for rmqueue_pcplist (local_irq_save)
-> __rmqueue_pcplist -> rmqueue_bulk (spin_lock). While it's possible to
separate this out, it generally means there are points where we enable
IRQs and reenable them again immediately. To prevent a migration and the
per-cpu pointer going stale, migrate_disable is also needed. That is a
custom lock that is similar to, but worse than, local_lock. Furthermore,
on PREEMPT_RT, it's undesirable to leave IRQs disabled for too long.
By converting to local_lock which disables migration on PREEMPT_RT, the
locking requirements can be separated and start moving the protections
for PCP, stats and the zone lock to PREEMPT_RT-safe equivalent locking. As
a bonus, local_lock also means that PROVE_LOCKING does something useful.
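
As a sketch of what the conversion looks like at a typical PCP call-site
(shape only, not a literal hunk from the series):

    /* Before: IRQ disabling implicitly protects the PCP and also makes
     * the nested zone->lock acquisition "safe", which PREEMPT_RT rejects.
     */
    local_irq_save(flags);
    pcp = this_cpu_ptr(zone->per_cpu_pageset);
    list = &pcp->lists[migratetype];
    page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
    local_irq_restore(flags);

    /* After: the PCP is named as the thing being protected.  On
     * !PREEMPT_RT this still disables IRQs; on PREEMPT_RT it becomes a
     * per-CPU lock plus migrate_disable() and IRQs stay enabled.
     */
    local_lock_irqsave(&pagesets.lock, flags);
    pcp = this_cpu_ptr(zone->per_cpu_pageset);
    list = &pcp->lists[migratetype];
    page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
    local_unlock_irqrestore(&pagesets.lock, flags);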

After that, it's obvious that zone_statistics incurs too much overhead
and leaves IRQs disabled for longer than necessary on !PREEMPT_RT
kernels. zone_statistics uses perfectly accurate counters requiring IRQs
be disabled for parallel RMW sequences when inaccurate ones like vm_events
would do. The series makes the NUMA statistics (NUMA_HIT and friends)
inaccurate counters that then require no special protection on !PREEMPT_RT.

The bulk page allocator can then do stat updates in bulk with IRQs enabled
which should improve the efficiency.  Technically, this could have been
done without the local_lock and vmstat conversion work and the order
simply reflects the timing of when different series were implemented.

Finally, there are places where we conflate IRQs being disabled for the
PCP with the IRQ-safe zone spinlock. The remainder of the series reduces
the scope of what is protected by disabled IRQs on !PREEMPT_RT kernels.
By the end of the series, page_alloc.c does not call local_irq_save so
the locking scope is a bit clearer. The one exception is that modifying
NR_FREE_PAGES still happens in places where it's known the IRQs are
disabled as it's harmless for PREEMPT_RT and would be expensive to split
the locking there.

No performance data is included because despite the overhead of the stats,
it's within the noise for most workloads on !PREEMPT_RT. However, Jesper
Dangaard Brouer ran a page allocation microbenchmark on a E5-1650 v4 @
3.60GHz CPU on the first version of this series. Focusing on the array
variant of the bulk page allocator reveals the following.

(CPU: Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz)
ARRAY variant: time_bulk_page_alloc_free_array: step=bulk size

     Baseline   Patched
 1   56.383  54.225 (+3.83%)
 2   40.047  35.492 (+11.38%)
 3   37.339  32.643 (+12.58%)
 4   35.578  30.992 (+12.89%)
 8   33.592  29.606 (+11.87%)
 16  32.362  28.532 (+11.85%)
 32  31.476  27.728 (+11.91%)
 64  30.633  27.252 (+11.04%)
 128 30.596  27.090 (+11.46%)

While this is a positive outcome, the series is more likely to be
interesting to the RT people in terms of getting parts of the PREEMPT_RT
tree into mainline.

 drivers/base/node.c|  18 +--
 include/linux/mmzone.h |  60 +++--
 include/linux/vmstat.h |  65 ++
 mm/mempolicy.c 

[tip: sched/core] sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG

2021-04-16 Thread tip-bot2 for Mel Gorman
The following commit has been merged into the sched/core branch of tip:

Commit-ID: b7cc6ec744b307db59568c654a8904a5928aa855
Gitweb:
https://git.kernel.org/tip/b7cc6ec744b307db59568c654a8904a5928aa855
Author:Mel Gorman 
AuthorDate:Wed, 24 Mar 2021 13:39:16 
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 17:06:33 +02:00

sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG

The ability to enable/disable NUMA balancing is not a debugging feature
and should not depend on CONFIG_SCHED_DEBUG.  For example, machines within
a HPC cluster may disable NUMA balancing temporarily for some jobs and
re-enable it for other jobs without needing to reboot.

This patch removes the dependency on CONFIG_SCHED_DEBUG for
kernel.numa_balancing sysctl. The other numa balancing related sysctls
are left as-is because if they need to be tuned then it is more likely
that NUMA balancing needs to be fixed instead.

Signed-off-by: Mel Gorman 
Signed-off-by: Peter Zijlstra (Intel) 
Tested-by: Valentin Schneider 
Link: https://lkml.kernel.org/r/20210324133916.gq15...@suse.de
---
 kernel/sysctl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62fbd09..8042098 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1753,6 +1753,9 @@ static struct ctl_table kern_table[] = {
.proc_handler   = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
},
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
+#ifdef CONFIG_NUMA_BALANCING
{
.procname   = "numa_balancing",
.data   = NULL, /* filled in by handler */
@@ -1763,7 +1766,6 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
 #endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_SCHED_DEBUG */
{
.procname   = "sched_rt_period_us",
.data   = &sysctl_sched_rt_period,


Re: [PATCH 11/11] mm/page_alloc: Embed per_cpu_pages locking within the per-cpu structure

2021-04-15 Thread Mel Gorman
On Thu, Apr 15, 2021 at 04:53:46PM +0200, Vlastimil Babka wrote:
> On 4/14/21 3:39 PM, Mel Gorman wrote:
> > struct per_cpu_pages is protected by the pagesets lock but it can be
> > embedded within struct per_cpu_pages at a minor cost. This is possible
> > because per-cpu lookups are based on offsets. Paraphrasing an explanation
> > from Peter Ziljstra
> > 
> >   The whole thing relies on:
> > 
> > &per_cpu_ptr(msblk->stream, cpu)->lock == per_cpu_ptr(&msblk->stream->lock, cpu)
> > 
> >   Which is true because the lhs:
> > 
> > (local_lock_t *)((zone->per_cpu_pages + per_cpu_offset(cpu)) + 
> > offsetof(struct per_cpu_pages, lock))
> > 
> >   and the rhs:
> > 
> > (local_lock_t *)((zone->per_cpu_pages + offsetof(struct per_cpu_pages, 
> > lock)) + per_cpu_offset(cpu))
> > 
> >   are identical, because addition is associative.
> > 
> > More details are included in mmzone.h. This embedding is not completely
> > free for three reasons.
> > 
> > 1. As local_lock does not return a per-cpu structure, the PCP has to
> >be looked up twice -- first to acquire the lock and again to get the
> >PCP pointer.
> > 
> > 2. For PREEMPT_RT and CONFIG_DEBUG_LOCK_ALLOC, local_lock is potentially
> >a spinlock or has lock-specific tracking. In both cases, it becomes
> >necessary to release/acquire different locks when freeing a list of
> >pages in free_unref_page_list.
> 
> Looks like this pattern could benefit from a local_lock API helper that would 
> do
> the right thing? It probably couldn't optimize much the CONFIG_PREEMPT_RT case
> which would need to be unlock/lock in any case, but CONFIG_DEBUG_LOCK_ALLOC
> could perhaps just keep the IRQ's disabled and just note the change of what's
> acquired?
> 

A helper could potentially be used but right now, there is only one
call-site that needs this type of care so it may be overkill. A helper
was proposed that can lookup and lock a per-cpu structure which is
generally useful but does not suit the case where different locks need
to be acquired.
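
For illustration, such a helper might look roughly like the following
(hypothetical, not an existing kernel API; the name is invented). It
acquires the embedded lock and returns this CPU's structure in one step,
which suits the simple sites but not free_unref_page_list where the lock
that must be held changes as the list is walked:

/* Hypothetical helper, not an existing kernel API: take the local_lock
 * embedded at 'member' and hand back this CPU's copy of the structure.
 */
#define pcpu_lock_and_get(ptr, member, flags)       \
({                                                  \
    local_lock_irqsave(&(ptr)->member, flags);      \
    this_cpu_ptr(ptr);                              \
})

    /* usage sketch for the simple sites */
    pcp = pcpu_lock_and_get(zone->per_cpu_pageset, lock, flags);
    /* ... operate on pcp ... */
    local_unlock_irqrestore(&zone->per_cpu_pageset->lock, flags);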

> > 3. For most kernel configurations, local_lock_t is empty and no storage is
> >required. By embedding the lock, the memory consumption on PREEMPT_RT
> >and CONFIG_DEBUG_LOCK_ALLOC is higher.
> 
> But I wonder, is there really a benefit to this increased complexity? Before 
> the
> patch we had "pagesets" - a local_lock that protects all zones' pcplists. Now
> each zone's pcplists have own local_lock. On !PREEMPT_RT we will never take 
> the
> locks of multiple zones from the same CPU in parallel, because we use
> local_lock_irqsave(). Can that parallelism happen on PREEMPT_RT, because that
> could perhaps justify the change?
> 

I don't think PREEMPT_RT gets additional parallelism because it's still
a per-cpu structure that is being protected. The difference is whether
we are protecting the CPU-N index for all per_cpu_pages or just one.
The patch exists because it was asked why the lock was not embedded within
the structure it's protecting. I initially thought that was unsafe and
I was wrong as explained in the changelog. But now that I find it *can*
be done but it's a bit ugly so I put it at the end of the series so it
can be dropped if necessary.

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 09/11] mm/page_alloc: Avoid conflating IRQs disabled with zone->lock

2021-04-15 Thread Mel Gorman
On Thu, Apr 15, 2021 at 02:25:36PM +0200, Vlastimil Babka wrote:
> > @@ -3294,6 +3295,7 @@ void free_unref_page_list(struct list_head *list)
> > struct page *page, *next;
> > unsigned long flags, pfn;
> > int batch_count = 0;
> > +   int migratetype;
> >  
> > /* Prepare pages for freeing */
> > list_for_each_entry_safe(page, next, list, lru) {
> > @@ -3301,15 +3303,28 @@ void free_unref_page_list(struct list_head *list)
> > if (!free_unref_page_prepare(page, pfn))
> > list_del(&page->lru);
> > set_page_private(page, pfn);
> 
> Should probably move this below so we don't set private for pages that then go
> through free_one_page()? Doesn't seem to be a bug, just unneccessary.
> 

Sure.

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1d87ca364680..a9c1282d9c7b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3293,7 +3293,6 @@ void free_unref_page_list(struct list_head *list)
pfn = page_to_pfn(page);
if (!free_unref_page_prepare(page, pfn))
-   list_del(&page->lru);
-   set_page_private(page, pfn);
 
/*
 * Free isolated pages directly to the allocator, see
@@ -3307,6 +3306,8 @@ void free_unref_page_list(struct list_head *list)
list_del(>lru);
}
}
+
+       set_page_private(page, pfn);
}
 
local_lock_irqsave(&pagesets.lock, flags);

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 04/11] mm/vmstat: Convert NUMA statistics to basic NUMA counters

2021-04-15 Thread Mel Gorman
On Wed, Apr 14, 2021 at 05:56:53PM +0200, Vlastimil Babka wrote:
> On 4/14/21 5:18 PM, Mel Gorman wrote:
> > On Wed, Apr 14, 2021 at 02:56:45PM +0200, Vlastimil Babka wrote:
> >> So it seems that this intermediate assignment to zone counters (using
> >> atomic_long_set() even) is unnecessary and this could mimic 
> >> sum_vm_events() that
> >> just does the summation on a local array?
> >> 
> > 
> > The atomic is unnecessary for sure but using a local array is
> > problematic because of your next point.
> 
> IIUC vm_events seems to do fine without a centralized array and handling CPU 
> hot
> remove at the sime time ...
> 

The vm_events are more global in nature. They are not reported
to userspace on a per-zone (/proc/zoneinfo) basis or per-node
(/sys/devices/system/node/node*/numastat) basis so they are not equivalent.

> >> And probably a bit more serious is that vm_events have 
> >> vm_events_fold_cpu() to
> >> deal with a cpu going away, but after your patch the stats counted on a 
> >> cpu just
> >> disapepar from the sums as it goes offline as there's no such thing for 
> >> the numa
> >> counters.
> >> 
> > 
> > That is a problem I missed. Even if zonestats was preserved on
> > hot-remove, fold_vm_zone_numa_events would not be reading the CPU so
> > hotplug events jump all over the place.
> > 
> > So some periodic folding is necessary. I would still prefer not to do it
> > by time but it could be done only on overflow or when a file like
> > /proc/vmstat is read. I'll think about it a bit more and see what I come
> > up with.
> 
> ... because vm_events_fold_cpu() seems to simply move the stats from the CPU
> being offlined to the current one. So the same approach should be enough for
> NUMA stats?
> 

Yes, or at least very similar.

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 04/11] mm/vmstat: Inline NUMA event counter updates

2021-04-15 Thread Mel Gorman
On Wed, Apr 14, 2021 at 06:26:25PM +0200, Vlastimil Babka wrote:
> On 4/14/21 6:20 PM, Vlastimil Babka wrote:
> > On 4/14/21 3:39 PM, Mel Gorman wrote:
> >> __count_numa_event is small enough to be treated similarly to
> >> __count_vm_event so inline it.
> >> 
> >> Signed-off-by: Mel Gorman 
> > 
> > Acked-by: Vlastimil Babka 
> > 
> >> ---
> >>  include/linux/vmstat.h | 9 +
> >>  mm/vmstat.c| 9 -
> >>  2 files changed, 9 insertions(+), 9 deletions(-)
> >> 
> >> diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
> >> index fc14415223c5..dde4dec4e7dd 100644
> >> --- a/include/linux/vmstat.h
> >> +++ b/include/linux/vmstat.h
> >> @@ -237,6 +237,15 @@ static inline unsigned long 
> >> zone_page_state_snapshot(struct zone *zone,
> >>  }
> >>  
> >>  #ifdef CONFIG_NUMA
> >> +/* See __count_vm_event comment on why raw_cpu_inc is used. */
> >> +static inline void
> >> +__count_numa_event(struct zone *zone, enum numa_stat_item item)
> >> +{
> >> +  struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
> >> +
> >> +  raw_cpu_inc(pzstats->vm_numa_event[item]);
> >> +}
> >> +
> >>  extern void __count_numa_event(struct zone *zone, enum numa_stat_item 
> >> item);
> 
> Ah, but the line above should be removed.
> 

Yes it should!

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 07/11] mm/page_alloc: Remove duplicate checks if migratetype should be isolated

2021-04-15 Thread Mel Gorman
On Wed, Apr 14, 2021 at 07:21:42PM +0200, Vlastimil Babka wrote:
> On 4/14/21 3:39 PM, Mel Gorman wrote:
> > Both free_pcppages_bulk() and free_one_page() have very similar
> > checks about whether a page's migratetype has changed under the
> > zone lock. Use a common helper.
> > 
> > Signed-off-by: Mel Gorman 
> 
> Seems like for free_pcppages_bulk() this patch makes it check for each page on
> the pcplist
> - zone->nr_isolate_pageblock != 0 instead of local bool (the performance might
> be the same I guess on modern cpu though)
> - is_migrate_isolate(migratetype) for a migratetype obtained by
> get_pcppage_migratetype() which cannot be migrate_isolate so the check is 
> useless.
> 
> As such it doesn't seem a worthwhile cleanup to me considering all the other
> microoptimisations?
> 

The patch was a preparation patch for the rest of the series to avoid code
duplication and to consolidate checks together in one place to determine
if they are even correct.

Until zone_pcp_disable() came along, it was possible to have isolated PCP
pages in the lists even though zone->nr_isolate_pageblock could be 0 during
memory hot-remove so the split in free_pcppages_bulk was not necessarily
correct at all times.

The remaining problem is alloc_contig_pages, it does not disable
PCPs so both checks are necessary. If that also disabled PCPs
then check_migratetype_isolated could be deleted but the cost to
alloc_contig_pages might be too high.

I'll delete this patch for now because it's relatively minor and there
should be other ways of keeping the code duplication down.

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 04/11] mm/vmstat: Convert NUMA statistics to basic NUMA counters

2021-04-14 Thread Mel Gorman
On Wed, Apr 14, 2021 at 02:56:45PM +0200, Vlastimil Babka wrote:
> On 4/7/21 10:24 PM, Mel Gorman wrote:
> > NUMA statistics are maintained on the zone level for hits, misses, foreign
> > etc but nothing relies on them being perfectly accurate for functional
> > correctness. The counters are used by userspace to get a general overview
> > of a workloads NUMA behaviour but the page allocator incurs a high cost to
> > maintain perfect accuracy similar to what is required for a vmstat like
> > NR_FREE_PAGES. There even is a sysctl vm.numa_stat to allow userspace to
> > turn off the collection of NUMA statistics like NUMA_HIT.
> > 
> > This patch converts NUMA_HIT and friends to be NUMA events with similar
> > accuracy to VM events. There is a possibility that slight errors will be
> > introduced but the overall trend as seen by userspace will be similar.
> > Note that while these counters could be maintained at the node level that
> > it would have a user-visible impact.
> 
> I guess this kind of inaccuracy is fine. I just don't like much
> fold_vm_zone_numa_events() which seems to calculate sums of percpu counters 
> and
> then assign the result to zone counters for immediate consumption, which 
> differs
> from other kinds of folds in vmstat that reset the percpu counters to 0 as 
> they
> are treated as diffs to the global counters.
> 

The counters that are diffs fit inside an s8 and they are kept limited
because their "true" value is sometimes critical -- e.g. NR_FREE_PAGES
for watermark checking. So the level of drift has to be controlled and
the drift should not exist potentially forever so it gets updated
periodically.

The inaccurate counters are only exported to userspace. There is no need
to update them every few seconds so fold_vm_zone_numa_events() is only
called when a user cares, but you raise a valid point below.

> So it seems that this intermediate assignment to zone counters (using
> atomic_long_set() even) is unnecessary and this could mimic sum_vm_events() 
> that
> just does the summation on a local array?
> 

The atomic is unnecessary for sure but using a local array is
problematic because of your next point.

> And probably a bit more serious is that vm_events have vm_events_fold_cpu() to
> deal with a cpu going away, but after your patch the stats counted on a cpu 
> just
> disapepar from the sums as it goes offline as there's no such thing for the 
> numa
> counters.
> 

That is a problem I missed. Even if zonestats was preserved on
hot-remove, fold_vm_zone_numa_events would not be reading the CPU so
hotplug events jump all over the place.

So some periodic folding is necessary. I would still prefer not to do it
by time but it could be done only on overflow or when a file like
/proc/vmstat is read. I'll think about it a bit more and see what I come
up with.

Thanks!

-- 
Mel Gorman
SUSE Labs


[PATCH 11/11] mm/page_alloc: Embed per_cpu_pages locking within the per-cpu structure

2021-04-14 Thread Mel Gorman
struct per_cpu_pages is currently protected by the pagesets lock but that
lock can be embedded within struct per_cpu_pages itself at a minor cost. This
is possible because per-cpu lookups are based on offsets. Paraphrasing an
explanation from Peter Zijlstra

  The whole thing relies on:

    &per_cpu_ptr(msblk->stream, cpu)->lock == per_cpu_ptr(&msblk->stream->lock, cpu)

  Which is true because the lhs:

    (local_lock_t *)((zone->per_cpu_pages + per_cpu_offset(cpu)) + offsetof(struct per_cpu_pages, lock))

  and the rhs:

    (local_lock_t *)((zone->per_cpu_pages + offsetof(struct per_cpu_pages, lock)) + per_cpu_offset(cpu))

  are identical, because addition is associative.

More details are included in mmzone.h. This embedding is not completely
free for three reasons.

1. As local_lock does not return a per-cpu structure, the PCP has to
   be looked up twice -- first to acquire the lock and again to get the
   PCP pointer.

2. For PREEMPT_RT and CONFIG_DEBUG_LOCK_ALLOC, local_lock is potentially
   a spinlock or has lock-specific tracking. In both cases, it becomes
   necessary to release/acquire different locks when freeing a list of
   pages in free_unref_page_list.

3. For most kernel configurations, local_lock_t is empty and no storage is
   required. By embedding the lock, the memory consumption on PREEMPT_RT
   and CONFIG_DEBUG_LOCK_ALLOC is higher.

Suggested-by: Peter Zijlstra 
Signed-off-by: Mel Gorman 
---
 include/linux/mmzone.h | 31 -
 mm/page_alloc.c| 78 +++---
 2 files changed, 81 insertions(+), 28 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 693cd5f24f7d..c4e05e16ba1c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -338,8 +338,37 @@ enum zone_watermarks {
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
-/* Fields and list protected by pagesets local_lock in page_alloc.c */
+/*
+ * struct per_cpu_pages is a per-cpu structure protected by a lock
+ * embedded within the structure. This is subtle because ordinarily
+ * that would imply the PCP structure must first be located when
+ * the task is not yet pinned to the CPU e.g.
+ *
+ *   migrate_disable();
+ *   pcp = this_cpu_ptr(zone->per_cpu_pageset);
+ *   local_lock_irqsave(&pcp->lock, flags);
+ *
+ * However, because per-cpu addresses are calculated based on offsets,
+ * the following is true
+ *
+ *   &per_cpu_ptr(zone->per_cpu_pageset, cpu)->lock == per_cpu_ptr(&zone->per_cpu_pageset->lock, cpu)
+ *
+ * This is true because the LHS is
+ *
+ *   (local_lock_t *)((zone->per_cpu_pageset + per_cpu_offset(cpu)) + offsetof(struct per_cpu_pages, lock))
+ *
+ * while the RHS is
+ *
+ *   (local_lock_t *)((zone->per_cpu_pageset + offsetof(struct per_cpu_pages, lock)) + per_cpu_offset(cpu))
+ *
+ * local_lock will first disable preempt, migration or IRQs depending on
+ * the kernel configuration before the per_cpu_offset is calculated.
+ * Hence, the following is safe for both PREEMPT_RT and !PREEMPT_RT.
+ *
+ *   local_lock_irqsave(&zone->per_cpu_pageset->lock, flags);
+ */
 struct per_cpu_pages {
+   local_lock_t lock;
int count;  /* number of pages in the list */
int high;   /* high watermark, emptying needed */
int batch;  /* chunk size for buddy add/remove */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8a94fe77bef7..324c2832a09f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -112,13 +112,6 @@ typedef int __bitwise fpi_t;
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION   (8)
 
-struct pagesets {
-   local_lock_t lock;
-};
-static DEFINE_PER_CPU(struct pagesets, pagesets) = {
-   .lock = INIT_LOCAL_LOCK(lock),
-};
-
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -2994,12 +2987,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
unsigned long flags;
int to_drain, batch;
 
-   local_lock_irqsave(&pagesets.lock, flags);
+   local_lock_irqsave(&pcp->lock, flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
free_pcppages_bulk(zone, to_drain, pcp);
-   local_unlock_irqrestore(&pagesets.lock, flags);
+   local_unlock_irqrestore(&pcp->lock, flags);
 }
 #endif
 
@@ -3015,13 +3008,13 @@ static void drain_pages_zone(unsigned int cpu, struct 
zone *zone)
unsigned long flags;
struct per_cpu_pages *pcp;
 
-   local_lock_irqsave(&pagesets.lock, flags);
+   local_lock_irqsave(&zone->per_cpu_pageset->lock, flags);
 
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)
free_pcppages_bulk(zone, pcp->count, pcp);
 
-   local_unlock_irqrestore(&pagesets.lock, flags);
+   local_unl

[PATCH 07/11] mm/page_alloc: Remove duplicate checks if migratetype should be isolated

2021-04-14 Thread Mel Gorman
Both free_pcppages_bulk() and free_one_page() have very similar
checks about whether a page's migratetype has changed under the
zone lock. Use a common helper.

Signed-off-by: Mel Gorman 
---
 mm/page_alloc.c | 32 ++--
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 295624fe293b..1ed370668e7f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1354,6 +1354,23 @@ static inline void prefetch_buddy(struct page *page)
prefetch(buddy);
 }
 
+/*
+ * The migratetype of a page may have changed due to isolation so check.
+ * Assumes the caller holds the zone->lock to serialise against page
+ * isolation.
+ */
+static inline int
+check_migratetype_isolated(struct zone *zone, struct page *page, unsigned long 
pfn, int migratetype)
+{
+   /* If isolating, check if the migratetype has changed */
+   if (unlikely(has_isolate_pageblock(zone) ||
+   is_migrate_isolate(migratetype))) {
+   migratetype = get_pfnblock_migratetype(page, pfn);
+   }
+
+   return migratetype;
+}
+
 /*
  * Frees a number of pages from the PCP lists
  * Assumes all pages on list are in same zone, and of same order.
@@ -1371,7 +1388,6 @@ static void free_pcppages_bulk(struct zone *zone, int 
count,
int migratetype = 0;
int batch_free = 0;
int prefetch_nr = READ_ONCE(pcp->batch);
-   bool isolated_pageblocks;
struct page *page, *tmp;
LIST_HEAD(head);
 
@@ -1433,21 +1449,20 @@ static void free_pcppages_bulk(struct zone *zone, int 
count,
 * both PREEMPT_RT and non-PREEMPT_RT configurations.
 */
spin_lock(&zone->lock);
-   isolated_pageblocks = has_isolate_pageblock(zone);
 
/*
 * Use safe version since after __free_one_page(),
 * page->lru.next will not point to original list.
 */
list_for_each_entry_safe(page, tmp, &head, lru) {
+   unsigned long pfn = page_to_pfn(page);
int mt = get_pcppage_migratetype(page);
+
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
-   /* Pageblock could have been isolated meanwhile */
-   if (unlikely(isolated_pageblocks))
-   mt = get_pageblock_migratetype(page);
 
-   __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
+   mt = check_migratetype_isolated(zone, page, pfn, mt);
+   __free_one_page(page, pfn, zone, 0, mt, FPI_NONE);
trace_mm_page_pcpu_drain(page, 0, mt);
}
spin_unlock(&zone->lock);
@@ -1459,10 +1474,7 @@ static void free_one_page(struct zone *zone,
int migratetype, fpi_t fpi_flags)
 {
spin_lock(&zone->lock);
-   if (unlikely(has_isolate_pageblock(zone) ||
-   is_migrate_isolate(migratetype))) {
-   migratetype = get_pfnblock_migratetype(page, pfn);
-   }
+   migratetype = check_migratetype_isolated(zone, page, pfn, migratetype);
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
spin_unlock(&zone->lock);
 }
-- 
2.26.2



[PATCH 06/11] mm/page_alloc: Reduce duration that IRQs are disabled for VM counters

2021-04-14 Thread Mel Gorman
IRQs are left disabled for the zone and node VM event counters. This is
unnecessary as the affected counters are allowed to race with preemption
and IRQs.

This patch reduces the scope of IRQs being disabled
via local_[lock|unlock]_irq on !PREEMPT_RT kernels. One
__mod_zone_freepage_state is still called with IRQs disabled. While this
could be moved out, it's not free on all architectures as some require
IRQs to be disabled for mod_zone_page_state on !PREEMPT_RT kernels.

Signed-off-by: Mel Gorman 
---
 mm/page_alloc.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cff0f1c98b28..295624fe293b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3474,11 +3474,11 @@ static struct page *rmqueue_pcplist(struct zone 
*preferred_zone,
pcp = this_cpu_ptr(zone->per_cpu_pageset);
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
+   local_unlock_irqrestore(&pagesets.lock, flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone, 1);
}
-   local_unlock_irqrestore(&pagesets.lock, flags);
return page;
 }
 
@@ -3530,15 +3530,15 @@ struct page *rmqueue(struct zone *preferred_zone,
if (!page)
page = __rmqueue(zone, order, migratetype, alloc_flags);
} while (page && check_new_pages(page, order));
-   spin_unlock(&zone->lock);
if (!page)
goto failed;
+
__mod_zone_freepage_state(zone, -(1 << order),
  get_pcppage_migratetype(page));
+   spin_unlock_irqrestore(&zone->lock, flags);
 
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
-   local_irq_restore(flags);
 
 out:
/* Separate test+clear to avoid unnecessary atomics */
@@ -3551,7 +3551,7 @@ struct page *rmqueue(struct zone *preferred_zone,
return page;
 
 failed:
-   local_irq_restore(flags);
+   spin_unlock_irqrestore(&zone->lock, flags);
return NULL;
 }
 
@@ -5103,11 +5103,11 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 
preferred_nid,
nr_populated++;
}
 
+   local_unlock_irqrestore(&pagesets.lock, flags);
+
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
 
-   local_unlock_irqrestore(&pagesets.lock, flags);
-
return nr_populated;
 
 failed_irq:
-- 
2.26.2



[PATCH 05/11] mm/page_alloc: Batch the accounting updates in the bulk allocator

2021-04-14 Thread Mel Gorman
Now that the zone_statistics are simple counters that do not require
special protection, the bulk allocator accounting updates can be batch
updated without adding too much complexity with protected RMW updates or
using xchg.

Signed-off-by: Mel Gorman 
---
 include/linux/vmstat.h |  8 
 mm/page_alloc.c| 30 +-
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index dde4dec4e7dd..8473b8fa9756 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -246,6 +246,14 @@ __count_numa_event(struct zone *zone, enum numa_stat_item 
item)
raw_cpu_inc(pzstats->vm_numa_event[item]);
 }
 
+static inline void
+__count_numa_events(struct zone *zone, enum numa_stat_item item, long delta)
+{
+   struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+
+   raw_cpu_add(pzstats->vm_numa_event[item], delta);
+}
+
 extern void __count_numa_event(struct zone *zone, enum numa_stat_item item);
 extern unsigned long sum_zone_node_page_state(int node,
  enum zone_stat_item item);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9d0f047647e3..cff0f1c98b28 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3411,7 +3411,8 @@ void __putback_isolated_page(struct page *page, unsigned 
int order, int mt)
  *
  * Must be called with interrupts disabled.
  */
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+  long nr_account)
 {
 #ifdef CONFIG_NUMA
enum numa_stat_item local_stat = NUMA_LOCAL;
@@ -3424,12 +3425,12 @@ static inline void zone_statistics(struct zone 
*preferred_zone, struct zone *z)
local_stat = NUMA_OTHER;
 
if (zone_to_nid(z) == zone_to_nid(preferred_zone))
-   __count_numa_event(z, NUMA_HIT);
+   __count_numa_events(z, NUMA_HIT, nr_account);
else {
-   __count_numa_event(z, NUMA_MISS);
-   __count_numa_event(preferred_zone, NUMA_FOREIGN);
+   __count_numa_events(z, NUMA_MISS, nr_account);
+   __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
}
-   __count_numa_event(z, local_stat);
+   __count_numa_events(z, local_stat, nr_account);
 #endif
 }
 
@@ -3475,7 +3476,7 @@ static struct page *rmqueue_pcplist(struct zone 
*preferred_zone,
page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
-   zone_statistics(preferred_zone, zone);
+   zone_statistics(preferred_zone, zone, 1);
}
local_unlock_irqrestore(&pagesets.lock, flags);
return page;
@@ -3536,7 +3537,7 @@ struct page *rmqueue(struct zone *preferred_zone,
  get_pcppage_migratetype(page));
 
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-   zone_statistics(preferred_zone, zone);
+   zone_statistics(preferred_zone, zone, 1);
local_irq_restore(flags);
 
 out:
@@ -5019,7 +5020,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 
preferred_nid,
struct alloc_context ac;
gfp_t alloc_gfp;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
-   int nr_populated = 0;
+   int nr_populated = 0, nr_account = 0;
 
if (unlikely(nr_pages <= 0))
return 0;
@@ -5092,15 +5093,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 
preferred_nid,
goto failed_irq;
break;
}
-
-   /*
-* Ideally this would be batched but the best way to do
-* that cheaply is to first convert zone_statistics to
-* be inaccurate per-cpu counter like vm_events to avoid
-* a RMW cycle then do the accounting with IRQs enabled.
-*/
-   __count_zid_vm_events(PGALLOC, zone_idx(zone), 1);
-   zone_statistics(ac.preferred_zoneref->zone, zone);
+   nr_account++;
 
prep_new_page(page, 0, gfp, 0);
if (page_list)
@@ -5110,6 +5103,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 
preferred_nid,
nr_populated++;
}
 
+   __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
+   zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
+
local_unlock_irqrestore(&pagesets.lock, flags);
 
return nr_populated;
-- 
2.26.2



[PATCH 10/11] mm/page_alloc: Update PGFREE outside the zone lock in __free_pages_ok

2021-04-14 Thread Mel Gorman
VM events do not need explicit protection by disabling IRQs so
update the counter with IRQs enabled in __free_pages_ok.

Signed-off-by: Mel Gorman 
---
 mm/page_alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a0b210077178..8a94fe77bef7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1569,10 +1569,11 @@ static void __free_pages_ok(struct page *page, unsigned 
int order,
migratetype = get_pfnblock_migratetype(page, pfn);
 
spin_lock_irqsave(&zone->lock, flags);
-   __count_vm_events(PGFREE, 1 << order);
migratetype = check_migratetype_isolated(zone, page, pfn, migratetype);
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);
+
+   __count_vm_events(PGFREE, 1 << order);
 }
 
 void __free_pages_core(struct page *page, unsigned int order)
-- 
2.26.2



[PATCH 08/11] mm/page_alloc: Explicitly acquire the zone lock in __free_pages_ok

2021-04-14 Thread Mel Gorman
__free_pages_ok() disables IRQs before calling a common helper
free_one_page() that acquires the zone lock. This is not safe according
to Documentation/locking/locktypes.rst. In this context, disabling IRQs
is not protecting a per_cpu_pages structure either, or a local_lock
would be used.

This patch explicitly acquires the lock with spin_lock_irqsave instead of
relying on a helper. This removes the last instance of local_irq_save()
in page_alloc.c.

Signed-off-by: Mel Gorman 
---
 mm/page_alloc.c | 13 +
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1ed370668e7f..6791e9361076 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1559,21 +1559,18 @@ static void __free_pages_ok(struct page *page, unsigned 
int order,
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
+   struct zone *zone = page_zone(page);
 
if (!free_pages_prepare(page, order, true))
return;
 
migratetype = get_pfnblock_migratetype(page, pfn);
 
-   /*
-* TODO FIX: Disable IRQs before acquiring IRQ-safe zone->lock
-* and protect vmstat updates.
-*/
-   local_irq_save(flags);
+   spin_lock_irqsave(&zone->lock, flags);
__count_vm_events(PGFREE, 1 << order);
-   free_one_page(page_zone(page), page, pfn, order, migratetype,
- fpi_flags);
-   local_irq_restore(flags);
+   migratetype = check_migratetype_isolated(zone, page, pfn, migratetype);
+   __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
+   spin_unlock_irqrestore(&zone->lock, flags);
 }
 
 void __free_pages_core(struct page *page, unsigned int order)
-- 
2.26.2



[PATCH 09/11] mm/page_alloc: Avoid conflating IRQs disabled with zone->lock

2021-04-14 Thread Mel Gorman
Historically when freeing pages, free_one_page() assumed that callers
had IRQs disabled and the zone->lock could be acquired with spin_lock().
This confuses the scope of what local_lock_irq is protecting and what
zone->lock is protecting in free_unref_page_list in particular.

This patch uses spin_lock_irqsave() for the zone->lock in
free_one_page() instead of relying on callers to have disabled
IRQs. free_unref_page_commit() is changed to only deal with PCP pages
protected by the local lock. free_unref_page_list() then first frees
isolated pages to the buddy lists with free_one_page() and frees the rest
of the pages to the PCP via free_unref_page_commit(). The end result
is that free_one_page() no longer depends on side-effects of
local_lock to be correct.

Note that this may incur a performance penalty while memory hot-remove
is running but that is not a common operation.
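
For clarity, the reworked free_unref_page_list() flow is roughly the
sketch below. It is simplified from the diff that follows (HIGHATOMIC
handling, tracing and periodic relocking of large batches are omitted):

	/* Pass 1: prepare pages, isolated pages go straight to the buddy */
	list_for_each_entry_safe(page, next, list, lru) {
		pfn = page_to_pfn(page);
		if (!free_unref_page_prepare(page, pfn))
			list_del(&page->lru);

		migratetype = get_pcppage_migratetype(page);
		if (unlikely(is_migrate_isolate(migratetype))) {
			list_del(&page->lru);
			free_one_page(page_zone(page), page, pfn, 0,
				      migratetype, FPI_NONE);
			continue;
		}

		set_page_private(page, pfn);
	}

	/* Pass 2: everything left is freed to the PCP under the local lock */
	local_lock_irqsave(&pagesets.lock, flags);
	list_for_each_entry_safe(page, next, list, lru)
		free_unref_page_commit(page, page_private(page),
				       get_pcppage_migratetype(page));
	local_unlock_irqrestore(&pagesets.lock, flags);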

Signed-off-by: Mel Gorman 
---
 mm/page_alloc.c | 67 ++---
 1 file changed, 41 insertions(+), 26 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6791e9361076..a0b210077178 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1473,10 +1473,12 @@ static void free_one_page(struct zone *zone,
unsigned int order,
int migratetype, fpi_t fpi_flags)
 {
-   spin_lock(&zone->lock);
+   unsigned long flags;
+
+   spin_lock_irqsave(&zone->lock, flags);
migratetype = check_migratetype_isolated(zone, page, pfn, migratetype);
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
-   spin_unlock(&zone->lock);
+   spin_unlock_irqrestore(&zone->lock, flags);
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -3238,31 +3240,13 @@ static bool free_unref_page_prepare(struct page *page, 
unsigned long pfn)
return true;
 }
 
-static void free_unref_page_commit(struct page *page, unsigned long pfn)
+static void free_unref_page_commit(struct page *page, unsigned long pfn,
+  int migratetype)
 {
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
-   int migratetype;
 
-   migratetype = get_pcppage_migratetype(page);
__count_vm_event(PGFREE);
-
-   /*
-* We only track unmovable, reclaimable and movable on pcp lists.
-* Free ISOLATE pages back to the allocator because they are being
-* offlined but treat HIGHATOMIC as movable pages so we can get those
-* areas back if necessary. Otherwise, we may have to free
-* excessively into the page allocator
-*/
-   if (migratetype >= MIGRATE_PCPTYPES) {
-   if (unlikely(is_migrate_isolate(migratetype))) {
-   free_one_page(zone, page, pfn, 0, migratetype,
- FPI_NONE);
-   return;
-   }
-   migratetype = MIGRATE_MOVABLE;
-   }
-
pcp = this_cpu_ptr(zone->per_cpu_pageset);
list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
@@ -3277,12 +3261,29 @@ void free_unref_page(struct page *page)
 {
unsigned long flags;
unsigned long pfn = page_to_pfn(page);
+   int migratetype;
 
if (!free_unref_page_prepare(page, pfn))
return;
 
+   /*
+* We only track unmovable, reclaimable and movable on pcp lists.
+* Place ISOLATE pages on the isolated list because they are being
+* offlined but treat HIGHATOMIC as movable pages so we can get those
+* areas back if necessary. Otherwise, we may have to free
+* excessively into the page allocator
+*/
+   migratetype = get_pcppage_migratetype(page);
+   if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
+   if (unlikely(is_migrate_isolate(migratetype))) {
+   free_one_page(page_zone(page), page, pfn, 0, 
migratetype, FPI_NONE);
+   return;
+   }
+   migratetype = MIGRATE_MOVABLE;
+   }
+
local_lock_irqsave(&pagesets.lock, flags);
-   free_unref_page_commit(page, pfn);
+   free_unref_page_commit(page, pfn, migratetype);
local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
@@ -3294,6 +3295,7 @@ void free_unref_page_list(struct list_head *list)
struct page *page, *next;
unsigned long flags, pfn;
int batch_count = 0;
+   int migratetype;
 
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
@@ -3301,15 +3303,28 @@ void free_unref_page_list(struct list_head *list)
if (!free_unref_page_prepare(page, pfn))
list_del(&page->lru);
set_page_private(page, pfn);
+
+   /*
+* Free isolated pages directly to the allocator, see
+* comment in free_unref_page.
+

[PATCH 04/11] mm/vmstat: Inline NUMA event counter updates

2021-04-14 Thread Mel Gorman
__count_numa_event is small enough to be treated similarly to
__count_vm_event so inline it.

Signed-off-by: Mel Gorman 
---
 include/linux/vmstat.h | 9 +
 mm/vmstat.c| 9 -
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index fc14415223c5..dde4dec4e7dd 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -237,6 +237,15 @@ static inline unsigned long 
zone_page_state_snapshot(struct zone *zone,
 }
 
 #ifdef CONFIG_NUMA
+/* See __count_vm_event comment on why raw_cpu_inc is used. */
+static inline void
+__count_numa_event(struct zone *zone, enum numa_stat_item item)
+{
+   struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+
+   raw_cpu_inc(pzstats->vm_numa_event[item]);
+}
+
 extern void __count_numa_event(struct zone *zone, enum numa_stat_item item);
 extern unsigned long sum_zone_node_page_state(int node,
  enum zone_stat_item item);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 63bd84d122c0..b853df95ed0c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -902,15 +902,6 @@ void drain_zonestat(struct zone *zone, struct 
per_cpu_zonestat *pzstats)
 #endif
 
 #ifdef CONFIG_NUMA
-/* See __count_vm_event comment on why raw_cpu_inc is used. */
-void __count_numa_event(struct zone *zone,
-enum numa_stat_item item)
-{
-   struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
-
-   raw_cpu_inc(pzstats->vm_numa_event[item]);
-}
-
 /*
  * Determine the per node value of a stat item. This function
  * is called frequently in a NUMA machine, so try to be as
-- 
2.26.2



[PATCH 01/11] mm/page_alloc: Split per cpu page lists and zone stats

2021-04-14 Thread Mel Gorman
The per-cpu page allocator lists and the per-cpu vmstat deltas are stored
in the same struct per_cpu_pageset even though vmstats have no direct impact
on the per-cpu page lists. This is inconsistent because the vmstats for a
node are stored on a dedicated structure. The bigger issue is that the
per_cpu_pages structure is not cache-aligned and stat updates either
cache conflict with adjacent per-cpu lists incurring a runtime cost or
padding is required incurring a memory cost.

This patch splits the per-cpu pagelists and the vmstat deltas into separate
structures. It's mostly a mechanical conversion but some variable renaming
is done to clearly distinguish the per-cpu pages structure (pcp) from
the vmstats (pzstats).

Superficially, this appears to increase the size of the per_cpu_pages
structure but the movement of expire fills a structure hole so there is
no impact overall.

[l...@intel.com: Check struct per_cpu_zonestat has a non-zero size]
[vba...@suse.cz: Init zone->per_cpu_zonestats properly]
Signed-off-by: Mel Gorman 
---
 include/linux/mmzone.h | 18 
 include/linux/vmstat.h |  8 ++--
 mm/page_alloc.c| 85 -
 mm/vmstat.c| 96 ++
 4 files changed, 111 insertions(+), 96 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 47946cec7584..a4393ac27336 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -341,20 +341,21 @@ struct per_cpu_pages {
int count;  /* number of pages in the list */
int high;   /* high watermark, emptying needed */
int batch;  /* chunk size for buddy add/remove */
+#ifdef CONFIG_NUMA
+   int expire; /* When 0, remote pagesets are drained */
+#endif
 
/* Lists of pages, one per migrate type stored on the pcp-lists */
struct list_head lists[MIGRATE_PCPTYPES];
 };
 
-struct per_cpu_pageset {
-   struct per_cpu_pages pcp;
-#ifdef CONFIG_NUMA
-   s8 expire;
-   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
-#endif
+struct per_cpu_zonestat {
 #ifdef CONFIG_SMP
-   s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
+   s8 stat_threshold;
+#endif
+#ifdef CONFIG_NUMA
+   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
 #endif
 };
 
@@ -470,7 +471,8 @@ struct zone {
int node;
 #endif
struct pglist_data  *zone_pgdat;
-   struct per_cpu_pageset __percpu *pageset;
+   struct per_cpu_pages __percpu *per_cpu_pageset;
+   struct per_cpu_zonestat __percpu *per_cpu_zonestats;
/*
 * the high and batch values are copied to individual pagesets for
 * faster access
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 506d625163a1..1736ea9d24a7 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -163,7 +163,7 @@ static inline unsigned long zone_numa_state_snapshot(struct 
zone *zone,
int cpu;
 
for_each_online_cpu(cpu)
-   x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item];
+   x += per_cpu_ptr(zone->per_cpu_zonestats, 
cpu)->vm_numa_stat_diff[item];
 
return x;
 }
@@ -236,7 +236,7 @@ static inline unsigned long zone_page_state_snapshot(struct 
zone *zone,
 #ifdef CONFIG_SMP
int cpu;
for_each_online_cpu(cpu)
-   x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
+   x += per_cpu_ptr(zone->per_cpu_zonestats, 
cpu)->vm_stat_diff[item];
 
if (x < 0)
x = 0;
@@ -291,7 +291,7 @@ struct ctl_table;
 int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp,
loff_t *ppos);
 
-void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
+void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *);
 
 int calculate_pressure_threshold(struct zone *zone);
 int calculate_normal_threshold(struct zone *zone);
@@ -399,7 +399,7 @@ static inline void cpu_vm_stats_fold(int cpu) { }
 static inline void quiet_vmstat(void) { }
 
 static inline void drain_zonestat(struct zone *zone,
-   struct per_cpu_pageset *pset) { }
+   struct per_cpu_zonestat *pzstats) { }
 #endif /* CONFIG_SMP */
 
 static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9bf0db982f14..2d6283cab22d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2981,15 +2981,14 @@ void drain_zone_pages(struct zone *zone, struct 
per_cpu_pages *pcp)
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
unsigned long flags;
-   struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
 
local_irq_save(flags);
-   pset = per_cpu_ptr(zone->pageset, cpu);
 
-   pcp = &pset->pcp;
+   pcp = per_cpu_ptr(zone->per_cp

[PATCH 02/11] mm/page_alloc: Convert per-cpu list protection to local_lock

2021-04-14 Thread Mel Gorman
There is a lack of clarity of what exactly local_irq_save/local_irq_restore
protects in page_alloc.c. It conflates the protection of per-cpu page
allocation structures with per-cpu vmstat deltas.

This patch protects the PCP structure using local_lock which for most
configurations is identical to IRQ enabling/disabling. The scope of the
lock is still wider than it should be but this is decreased later.

It is possible for the local_lock to be embedded safely within struct
per_cpu_pages but it adds complexity to free_unref_page_list so it is
implemented as a separate patch later in the series.

[l...@intel.com: Make pagesets static]
Signed-off-by: Mel Gorman 
---
 include/linux/mmzone.h |  2 ++
 mm/page_alloc.c| 50 +-
 2 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a4393ac27336..106da8fbc72a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /* Free memory management - zoned buddy allocator.  */
@@ -337,6 +338,7 @@ enum zone_watermarks {
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
+/* Fields and list protected by pagesets local_lock in page_alloc.c */
 struct per_cpu_pages {
int count;  /* number of pages in the list */
int high;   /* high watermark, emptying needed */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2d6283cab22d..4e92d43c25f6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -112,6 +112,13 @@ typedef int __bitwise fpi_t;
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION   (8)
 
+struct pagesets {
+   local_lock_t lock;
+};
+static DEFINE_PER_CPU(struct pagesets, pagesets) = {
+   .lock = INIT_LOCAL_LOCK(lock),
+};
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -1421,6 +1428,10 @@ static void free_pcppages_bulk(struct zone *zone, int 
count,
} while (--count && --batch_free && !list_empty(list));
}
 
+   /*
+* local_lock_irq held so equivalent to spin_lock_irqsave for
+* both PREEMPT_RT and non-PREEMPT_RT configurations.
+*/
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
 
@@ -1541,6 +1552,11 @@ static void __free_pages_ok(struct page *page, unsigned 
int order,
return;
 
migratetype = get_pfnblock_migratetype(page, pfn);
+
+   /*
+* TODO FIX: Disable IRQs before acquiring IRQ-safe zone->lock
+* and protect vmstat updates.
+*/
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, pfn, order, migratetype,
@@ -2910,6 +2926,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int 
order,
 {
int i, allocated = 0;
 
+   /*
+* local_lock_irq held so equivalent to spin_lock_irqsave for
+* both PREEMPT_RT and non-PREEMPT_RT configurations.
+*/
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
@@ -2962,12 +2982,12 @@ void drain_zone_pages(struct zone *zone, struct 
per_cpu_pages *pcp)
unsigned long flags;
int to_drain, batch;
 
-   local_irq_save(flags);
+   local_lock_irqsave(&pagesets.lock, flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
free_pcppages_bulk(zone, to_drain, pcp);
-   local_irq_restore(flags);
+   local_unlock_irqrestore(&pagesets.lock, flags);
 }
 #endif
 
@@ -2983,13 +3003,13 @@ static void drain_pages_zone(unsigned int cpu, struct 
zone *zone)
unsigned long flags;
struct per_cpu_pages *pcp;
 
-   local_irq_save(flags);
+   local_lock_irqsave(&pagesets.lock, flags);
 
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)
free_pcppages_bulk(zone, pcp->count, pcp);
 
-   local_irq_restore(flags);
+   local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3252,9 +3272,9 @@ void free_unref_page(struct page *page)
if (!free_unref_page_prepare(page, pfn))
return;
 
-   local_irq_save(flags);
+   local_lock_irqsave(&pagesets.lock, flags);
free_unref_page_commit(page, pfn);
-   local_irq_restore(flags);
+   local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3274,7 +3294,7 @@ void free_unref_page_list(struct list_head *list)
set_page_private(page, pfn);
}
 
-   local_irq_save(flags);
+   local_lock_irqsave(&pagesets.lock, flags);
list_for_each_entry_safe(page, next, list, lru) {
unsigned long pfn = page_private(page);
 
@@ -3287

[PATCH 03/11] mm/vmstat: Convert NUMA statistics to basic NUMA counters

2021-04-14 Thread Mel Gorman
NUMA statistics are maintained on the zone level for hits, misses, foreign
etc but nothing relies on them being perfectly accurate for functional
correctness. The counters are used by userspace to get a general overview
of a workload's NUMA behaviour but the page allocator incurs a high cost to
maintain perfect accuracy similar to what is required for a vmstat like
NR_FREE_PAGES. There even is a sysctl vm.numa_stat to allow userspace to
turn off the collection of NUMA statistics like NUMA_HIT.

This patch converts NUMA_HIT and friends to be NUMA events with similar
accuracy to VM events. There is a possibility that slight errors will be
introduced but the overall trend as seen by userspace will be similar.
Note that while these counters could be maintained at the node level,
doing so would have a user-visible impact.

Signed-off-by: Mel Gorman 
---
 drivers/base/node.c|  18 +++--
 include/linux/mmzone.h |  11 ++-
 include/linux/vmstat.h |  42 +-
 mm/mempolicy.c |   2 +-
 mm/page_alloc.c|  12 +--
 mm/vmstat.c| 175 -
 6 files changed, 93 insertions(+), 167 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index f449dbb2c746..443a609db428 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -484,6 +484,7 @@ static DEVICE_ATTR(meminfo, 0444, node_read_meminfo, NULL);
 static ssize_t node_read_numastat(struct device *dev,
  struct device_attribute *attr, char *buf)
 {
+   fold_vm_numa_events();
return sysfs_emit(buf,
  "numa_hit %lu\n"
  "numa_miss %lu\n"
@@ -491,12 +492,12 @@ static ssize_t node_read_numastat(struct device *dev,
  "interleave_hit %lu\n"
  "local_node %lu\n"
  "other_node %lu\n",
- sum_zone_numa_state(dev->id, NUMA_HIT),
- sum_zone_numa_state(dev->id, NUMA_MISS),
- sum_zone_numa_state(dev->id, NUMA_FOREIGN),
- sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
- sum_zone_numa_state(dev->id, NUMA_LOCAL),
- sum_zone_numa_state(dev->id, NUMA_OTHER));
+ sum_zone_numa_event_state(dev->id, NUMA_HIT),
+ sum_zone_numa_event_state(dev->id, NUMA_MISS),
+ sum_zone_numa_event_state(dev->id, NUMA_FOREIGN),
+ sum_zone_numa_event_state(dev->id, 
NUMA_INTERLEAVE_HIT),
+ sum_zone_numa_event_state(dev->id, NUMA_LOCAL),
+ sum_zone_numa_event_state(dev->id, NUMA_OTHER));
 }
 static DEVICE_ATTR(numastat, 0444, node_read_numastat, NULL);
 
@@ -514,10 +515,11 @@ static ssize_t node_read_vmstat(struct device *dev,
 sum_zone_node_page_state(nid, i));
 
 #ifdef CONFIG_NUMA
-   for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+   fold_vm_numa_events();
+   for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
len += sysfs_emit_at(buf, len, "%s %lu\n",
 numa_stat_name(i),
-sum_zone_numa_state(nid, i));
+sum_zone_numa_event_state(nid, i));
 
 #endif
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 106da8fbc72a..693cd5f24f7d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -135,10 +135,10 @@ enum numa_stat_item {
NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
NUMA_LOCAL, /* allocation from local node */
NUMA_OTHER, /* allocation from other node */
-   NR_VM_NUMA_STAT_ITEMS
+   NR_VM_NUMA_EVENT_ITEMS
 };
 #else
-#define NR_VM_NUMA_STAT_ITEMS 0
+#define NR_VM_NUMA_EVENT_ITEMS 0
 #endif
 
 enum zone_stat_item {
@@ -357,7 +357,10 @@ struct per_cpu_zonestat {
s8 stat_threshold;
 #endif
 #ifdef CONFIG_NUMA
-   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
+   u16 vm_numa_stat_diff[NR_VM_NUMA_EVENT_ITEMS];
+#endif
+#ifdef CONFIG_NUMA
+   unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
 #endif
 };
 
@@ -609,7 +612,7 @@ struct zone {
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t   vm_stat[NR_VM_ZONE_STAT_ITEMS];
-   atomic_long_t   vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
+   atomic_long_t   vm_numa_events[NR_VM_NUMA_EVENT_ITEMS];
 } cacheline_internodealigned_in_smp;
 
 enum pgdat_flags {
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 1736ea9d24a7..fc14415223c5 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -138,35 +138,27 @@ static i

[PATCH 0/11 v3] Use local_lock for pcp protection and reduce stat overhead

2021-04-14 Thread Mel Gorman
Changelog since v2
o Fix zonestats initialisation
o Merged memory hotplug fix separately
o Embed local_lock within per_cpu_pages

This series requires patches in Andrew's tree so for convenience, it's
also available at

git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux.git 
mm-percpu-local_lock-v3r6

The PCP (per-cpu page allocator in page_alloc.c) shares locking
requirements with vmstat and the zone lock which is inconvenient and
causes some issues. For example, the PCP list and vmstat share the
same per-cpu space meaning that it's possible that vmstat updates dirty
cache lines holding per-cpu lists across CPUs unless padding is used.
Second, PREEMPT_RT does not want to disable IRQs for too long in the
page allocator.

This series splits the locking requirements and uses locks types more
suitable for PREEMPT_RT, reduces the time when special locking is required
for stats and reduces the time when IRQs need to be disabled on !PREEMPT_RT
kernels.

Why local_lock? PREEMPT_RT considers the following sequence to be unsafe
as documented in Documentation/locking/locktypes.rst

   local_irq_disable();
   spin_lock(&lock);

The pcp allocator has this sequence for rmqueue_pcplist (local_irq_save)
-> __rmqueue_pcplist -> rmqueue_bulk (spin_lock). While it's possible to
separate this out, it generally means there are points where we enable
IRQs and then immediately disable them again. To prevent a migration and the
per-cpu pointer going stale, migrate_disable is also needed. That is a
custom lock that is similar to, but worse than, local_lock. Furthermore,
on PREEMPT_RT, it's undesirable to leave IRQs disabled for too long.
By converting to local_lock which disables migration on PREEMPT_RT, the
locking requirements can be separated and start moving the protections
for PCP, stats and the zone lock to PREEMPT_RT-safe equivalent locking. As
a bonus, local_lock also means that PROVE_LOCKING does something useful.
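
To make the conversion concrete, the PCP fast paths change shape roughly
as follows (a sketch of what patch 2 does rather than a verbatim hunk):

   /* Before: IRQ disabling implicitly pins the task and "protects" the PCP */
   local_irq_save(flags);
   pcp = this_cpu_ptr(zone->per_cpu_pageset);
   ...
   local_irq_restore(flags);

   /* After: a named lock documents what is protected. On PREEMPT_RT the
    * lock maps to a per-CPU spinlock plus migrate_disable() instead of
    * disabling interrupts.
    */
   local_lock_irqsave(&pagesets.lock, flags);
   pcp = this_cpu_ptr(zone->per_cpu_pageset);
   ...
   local_unlock_irqrestore(&pagesets.lock, flags);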

After that, it's obvious that zone_statistics incurs too much overhead
and leaves IRQs disabled for longer than necessary on !PREEMPT_RT
kernels. zone_statistics uses perfectly accurate counters requiring IRQs
be disabled for parallel RMW sequences when inaccurate ones like vm_events
would do. The series makes the NUMA statistics (NUMA_HIT and friends)
inaccurate counters that then require no special protection on !PREEMPT_RT.

The bulk page allocator can then do stat updates in bulk with IRQs enabled
which should improve the efficiency.  Technically, this could have been
done without the local_lock and vmstat conversion work and the order
simply reflects the timing of when different series were implemented.

Finally, there are places where we conflate IRQs being disabled for the
PCP with the IRQ-safe zone spinlock. The remainder of the series reduces
the scope of what is protected by disabled IRQs on !PREEMPT_RT kernels.
By the end of the series, page_alloc.c does not call local_irq_save so
the locking scope is a bit clearer. The one exception is that modifying
NR_FREE_PAGES still happens in places where it's known the IRQs are
disabled as it's harmless for PREEMPT_RT and would be expensive to split
the locking there.
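
As an example of the last point, by the end of the series __free_pages_ok()
takes the IRQ-safe zone lock directly instead of wrapping a plain
spin_lock() inside local_irq_save() (sketch only, see patch 8 for the
real hunk):

   spin_lock_irqsave(&zone->lock, flags);
   migratetype = check_migratetype_isolated(zone, page, pfn, migratetype);
   __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
   spin_unlock_irqrestore(&zone->lock, flags);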

No performance data is included because despite the overhead of the stats,
it's within the noise for most workloads on !PREEMPT_RT. However, Jesper
Dangaard Brouer ran a page allocation microbenchmark on a E5-1650 v4 @
3.60GHz CPU on the first version of this series. Focusing on the array
variant of the bulk page allocator reveals the following.

(CPU: Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz)
ARRAY variant: time_bulk_page_alloc_free_array: step=bulk size

              Baseline        Patched
 1   56.383  54.225 (+3.83%)
 2   40.047  35.492 (+11.38%)
 3   37.339  32.643 (+12.58%)
 4   35.578  30.992 (+12.89%)
 8   33.592  29.606 (+11.87%)
 16  32.362  28.532 (+11.85%)
 32  31.476  27.728 (+11.91%)
 64  30.633  27.252 (+11.04%)
 128 30.596  27.090 (+11.46%)

While this is a positive outcome, the series is more likely to be
interesting to the RT people in terms of getting parts of the PREEMPT_RT
tree into mainline.

 drivers/base/node.c|  18 +--
 include/linux/mmzone.h |  58 ++--
 include/linux/vmstat.h |  65 +
 mm/mempolicy.c |   2 +-
 mm/page_alloc.c| 302 +
 mm/vmstat.c| 250 --
 6 files changed, 370 insertions(+), 325 deletions(-)

-- 
2.26.2

Mel Gorman (11):
  mm/page_alloc: Split per cpu page lists and zone stats
  mm/page_alloc: Convert per-cpu list protection to local_lock
  mm/vmstat: Convert NUMA statistics to basic NUMA counters
  mm/vmstat: Inline NUMA event counter updates
  mm/page_alloc: Batch the accounting updates in the bulk allocator
  mm/page_alloc: Reduce duration that IRQs are disabled for VM counters
  mm/page_alloc: Remove duplicate checks if migratetype 

Re: [PATCH 02/11] mm/page_alloc: Convert per-cpu list protection to local_lock

2021-04-13 Thread Mel Gorman
On Mon, Apr 12, 2021 at 11:47:00PM +0200, Thomas Gleixner wrote:
> On Mon, Apr 12 2021 at 12:56, Mel Gorman wrote:
> > On Fri, Apr 09, 2021 at 08:55:39PM +0200, Peter Zijlstra wrote:
> > I'll update the changelog and comment accordingly. I'll decide later
> > whether to leave it or move the location of the lock at the end of the
> > series. If the patch is added, it'll either incur the double lookup (not
> > that expensive, might be optimised by the compiler) or come up with a
> > helper that takes the lock and returns the per-cpu structure. The double
> > lookup probably makes more sense initially because there are multiple
> > potential users of a helper that says "pin to CPU, lookup, lock and return
> > a per-cpu structure" for both IRQ-safe and IRQ-unsafe variants with the
> > associated expansion of the local_lock API. It might be better to introduce
> > such a helper with multiple users converted at the same time and there are
> > other local_lock users in preempt-rt that could do with upstreaming first.
> 
> We had such helpers in RT a while ago but it turned into an helper
> explosion pretty fast. But that was one of the early versions of local
> locks which could not be embedded into a per CPU data structure due to
> raisins (my stupidity).
> 
> But with the more thought out approach of today we can have (+/- the
> obligatory naming bikeshedding):
> 
> 

I don't have strong opinions on the name -- it's long but it's clear.
The overhead of local_lock_get_cpu_ptr has similar weight to get_cpu_ptr
in terms of the cost of preempt_disable. The helper also means that new
users of a local_lock embedded within a per-cpu structure do not have to
figure out if it's safe from scratch.
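
For anyone skimming, the rough shape of the !PREEMPT_RT variant (going
by the local_lock_internal.h hunks quoted below; the exact name and
parameter list may differ from what was proposed) is something like:

	#define local_lock_irqsave_get_cpu_ptr(pcp, type, flags)	\
	({								\
		type *__pcp;						\
									\
		local_irq_save(flags);					\
		__pcp = this_cpu_ptr(pcp);				\
		local_lock_acquire(&__pcp->lock);			\
		__pcp;							\
	})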

If the page allocator embeds local_lock within struct per_cpu_pages then
the conversion to the helper is at the end of the mail. The messiest part
is free_unref_page_commit and that is a mess because free_unref_page_list
has to check if a new lock is required in case a list of pages is from
different zones.

> 
>
> and RT will then change that to:
> 
> --- a/include/linux/local_lock_internal.h
> +++ b/include/linux/local_lock_internal.h
> @@ -96,7 +96,7 @@ static inline void local_lock_release(lo
>   ({  \
>   type *__pcp;\
>   \
> - preempt_disable();  \
> + ll_preempt_disable();   \
>   __pcp = this_cpu_ptr(pcp);  \
>   local_lock_acquire(&__pcp->lock);   \
>   __pcp;  \
> @@ -106,7 +106,7 @@ static inline void local_lock_release(lo
>   ({  \
>   type *__pcp;\
>   \
> - local_irq_disable();\
> + ll_local_irq_disable(); \
>   __pcp = this_cpu_ptr(pcp);  \
>   local_lock_acquire(&__pcp->lock);   \
>   __pcp;  \
> @@ -116,7 +116,7 @@ static inline void local_lock_release(lo
>   ({  \
>   type *__pcp;\
>   \
> - local_irq_save(flags);  \
> + ll_local_irq_save(flags);   \
>   __pcp = this_cpu_ptr(pcp);  \
>   local_lock_acquire(&__pcp->lock);   \
>   __pcp;  \
> 
> 
> where ll_xxx is defined as xxx for non-RT and on RT all of them
> get mapped to migrate_disable().
> 
> Thoughts?
> 

I think that works. I created the obvious definitions of ll_* and rebased
on top of preempt-rt to see. I'll see if it boots :P

Page allocator conversion to helper looks like

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d9d7f6d68243..2948a5502589 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3008,9 +3008,7 @@ static void drain_pages_zone(unsigned int cpu, struct 
zone *zone)
unsigned long flags;
struct per_cpu_pages *pcp;
 
-   local_lock_irqsave(&zone->per_cpu_pageset->lock, flags);
-
-   pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+   pcp = local_lock_irqsave_get_cpu_ptr(zone->per_cpu_pageset, lock, 
f

Re: [PATCH 01/11] mm/page_alloc: Split per cpu page lists and zone stats

2021-04-13 Thread Mel Gorman
On Mon, Apr 12, 2021 at 07:43:18PM +0200, Vlastimil Babka wrote:
> On 4/7/21 10:24 PM, Mel Gorman wrote:
> > @@ -6691,7 +6697,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
> >  * relies on the ability of the linker to provide the
> >  * offset of a (static) per cpu variable into the per cpu area.
> >  */
> > -   zone->pageset = &boot_pageset;
> > +   zone->per_cpu_pageset = &boot_pageset;
> 
> I don't see any _zonestats assignment here in zone_pcp_init() or its
> caller(s), which seems strange, as zone_pcp_reset() does it.
> 

Yes, it's required, well spotted!

-- 
Mel Gorman
SUSE Labs


Re: [PATCH v2 resend] mm/memory_hotplug: Make unpopulated zones PCP structures unreachable during hot remove

2021-04-13 Thread Mel Gorman
On Tue, Apr 13, 2021 at 11:36:08AM +0200, Vlastimil Babka wrote:
> On 4/12/21 4:08 PM, Mel Gorman wrote:
> > On Mon, Apr 12, 2021 at 02:40:18PM +0200, Vlastimil Babka wrote:
> >> On 4/12/21 2:08 PM, Mel Gorman wrote:
> >
> > the pageset structures in place would be much more straight-forward
> > assuming the structures were not allocated in the zone that is being
> > hot-removed.
> 
> I would expect this is not possible, at least for ZONE_MOVABLE, as the percpu
> allocations should be GFP_KERNEL.

True.

> And it's not realistic to expect offlining to
> succeed at all without using ZONE_MOVABLE.
> 
> AFAIK even Oscar's work on using the node to self-contain its own structures 
> is
> only applicable to struct pages, not percpu allocations?

That I don't know as I didn't check although in general, it would be
somewhat unfortunate if per-cpu structures were remote. It wouldn't be
critical given that they'll be in cache assuming the per-cpu structures
are not straddling cache lines.

-- 
Mel Gorman
SUSE Labs


Re: [PATCH v2 resend] mm/memory_hotplug: Make unpopulated zones PCP structures unreachable during hot remove

2021-04-12 Thread Mel Gorman
On Mon, Apr 12, 2021 at 04:12:11PM +0200, David Hildenbrand wrote:
> > After v1 of the patch, the race was reduced to the point between the
> > zone watermark check and the rmqueue_pcplist but yes, it still existed.
> > Closing it completely was either complex or expensive. Setting
> > zone->pageset = &boot_pageset before the free would shrink the race
> > further but that still leaves a potential memory ordering issue.
> > 
> > While fixable, it's either complex, expensive or both so yes, just leaving
> > the pageset structures in place would be much more straight-forward
> > assuming the structures were not allocated in the zone that is being
> > hot-removed. As things stand, I had trouble even testing zone hot-remove
> > as there was always a few pages left behind and I did not chase down
> > why.
>
> Can you elaborate? I can reliably trigger zone present pages going to 0 by
> just hotplugging a DIMM, onlining the memory block devices to the MOVABLE
> zone, followed by offlining the memory block again.
> 

For the machine I was testing on, I tried offlining all memory within
a zone on a NUMA machine. Even if I used movable_zone to create a zone
or numa=fake to create multiple fake nodes and zones, there was always
either reserved or pinned pages preventing the full zone being removed.

-- 
Mel Gorman
SUSE Labs


Re: [RFC/PATCH] powerpc/smp: Add SD_SHARE_PKG_RESOURCES flag to MC sched-domain

2021-04-12 Thread Mel Gorman
On Mon, Apr 12, 2021 at 02:21:47PM +0200, Vincent Guittot wrote:
> > > Peter, Valentin, Vincent, Mel, etal
> > >
> > > On architectures where we have multiple levels of cache access latencies
> > > within a DIE, (For example: one within the current LLC or SMT core and the
> > > other at MC or Hemisphere, and finally across hemispheres), do you have 
> > > any
> > > suggestions on how we could handle the same in the core scheduler?
> 
> I would say that SD_SHARE_PKG_RESOURCES is there for that and doesn't
> only rely on cache
> 

From topology.c

SD_SHARE_PKG_RESOURCES - describes shared caches

I'm guessing here because I am not familiar with power10 but the central
problem appears to be when to prefer selecting a CPU sharing L2 or L3
cache and the core assumes the last-level-cache is the only relevant one.

For this patch, I wondered if setting SD_SHARE_PKG_RESOURCES would have
unintended consequences for load balancing because load within a die may
not be spread between SMT4 domains if SD_SHARE_PKG_RESOURCES was set at
the MC level.

> >
> > Minimally I think it would be worth detecting when there are multiple
> > LLCs per node and detecting that in generic code as a static branch. In
> > select_idle_cpu, consider taking two passes -- first on the LLC domain
> > and if no idle CPU is found then taking a second pass if the search depth
> 
> We have done a lot of changes to reduce and optimize the fast path and
> I don't think re adding another layer  in the fast path makes sense as
> you will end up unrolling the for_each_domain behind some
> static_banches.
> 

Searching the node would only happen if a) there was enough search depth
left and b) there were no idle CPUs at the LLC level. As no new domain
is added, it's not clear to me why for_each_domain would change.

But still, your comment reminded me that different architectures have
different requirements

Power 10 appears to prefer CPU selection sharing L2 cache but desires
spillover to L3 when selecting an idle CPU.

X86 varies, it might want the Power10 approach for some families and prefer
L3 spilling over to a CPU on the same node in others.

S390 cares about something called books and drawers although I've no
idea what it means as such or whether it has any preferences on
search order.

ARM has similar requirements again according to "scheduler: expose the
topology of clusters and add cluster scheduler" and that one *does*
add another domain.

I had forgotten about the ARM patches but remembered that they were
interesting because they potentially help the Zen situation but I didn't
get the chance to review them before they fell off my radar again. About
all I recall is that I thought the "cluster" terminology was vague.

The only commonality I thought might exist is that architectures may
like to define which domain to search first for an idle CPU and which
domain to search second. Alternatively, architectures could specify a domain to
search primarily but also search the next domain in the hierarchy if
search depth permits. The default would be the existing behaviour --
search CPUs sharing a last-level-cache.
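
In pseudo-code, the two-pass idea is something like the following. The
helper names are invented for illustration and do not correspond to the
existing fair.c API:

	/* Pass 1: the architecture's preferred domain (default: LLC) */
	cpu = search_idle_cpu(p, first_search_domain, target);

	/* Pass 2: the next domain up with the preferred CPUs masked out,
	 * but only if the search depth budget has not been consumed.
	 */
	if (cpu < 0 && search_depth_remaining)
		cpu = search_idle_cpu(p, second_search_domain, target);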

> SD_SHARE_PKG_RESOURCES should be set to the last level where we can
> efficiently move task between CPUs at wakeup
> 

The definition of "efficiently" varies. Moving tasks between CPUs sharing
a cache is most efficient but moving the task to a CPU that at least has
local memory channels is a reasonable option if there are no idle CPUs
sharing cache and preferable to stacking.

> > allows within the node with the LLC CPUs masked out. While there would be
> > a latency hit because cache is not shared, it would still be a CPU local
> > to memory that is idle. That would potentially be beneficial on Zen*
> > as well without having to introduce new domains in the topology hierarchy.
> 
> What is the current sched_domain topology description for zen ?
> 

The cache and NUMA topologies differ slightly between each generation
of Zen. The common pattern is that a single NUMA node can have multiple
L3 caches and at one point I thought it might be reasonable to allow
spillover to select a local idle CPU instead of stacking multiple tasks
on a CPU sharing cache. I never got as far as thinking how it could be
done in a way that multiple architectures would be happy with.

-- 
Mel Gorman
SUSE Labs


Re: [PATCH v2 resend] mm/memory_hotplug: Make unpopulated zones PCP structures unreachable during hot remove

2021-04-12 Thread Mel Gorman
On Mon, Apr 12, 2021 at 02:40:18PM +0200, Vlastimil Babka wrote:
> On 4/12/21 2:08 PM, Mel Gorman wrote:
> > zone_pcp_reset allegedly protects against a race with drain_pages
> > using local_irq_save but this is bogus. local_irq_save only operates
> > on the local CPU. If memory hotplug is running on CPU A and drain_pages
> > is running on CPU B, disabling IRQs on CPU A does not affect CPU B and
> > offers no protection.
> > 
> > This patch deletes IRQ disable/enable on the grounds that IRQs protect
> > nothing and assumes the existing hotplug paths guarantees the PCP cannot be
> > used after zone_pcp_enable(). That should be the case already because all
> > the pages have been freed and there is no page to put on the PCP lists.
> > 
> > Signed-off-by: Mel Gorman 
> 
> Yeah the irq disabling here is clearly bogus, so:
> 
> Acked-by: Vlastimil Babka 
> 

Thanks!

> But I think Michal has a point that we might best leave the pagesets around, 
> by
> a future change. I'm have some doubts that even with your reordering of the
> reset/destroy after zonelist rebuild in v1 they cant't be reachable. We have 
> no
> protection between zonelist rebuild and zonelist traversal, and that's why we
> just leave pgdats around.
> 
> So I can imagine a task racing with memory hotremove might see watermarks as 
> ok
> in get_page_from_freelist() for the zone and proceeds to try_this_zone:, then
> gets stalled/scheduled out while hotremove rebuilds the zonelist and destroys
> the pcplists, then the first task is resumed and proceeds with 
> rmqueue_pcplist().
> 
> So that's very rare thus not urgent, and this patch doesn't make it less rare 
> so
> not a reason to block it.
> 

After v1 of the patch, the race was reduced to the point between the
zone watermark check and the rmqueue_pcplist but yes, it still existed.
Closing it completely was either complex or expensive. Setting
> > zone->pageset = &boot_pageset before the free would shrink the race
further but that still leaves a potential memory ordering issue.

While fixable, it's either complex, expensive or both so yes, just leaving
the pageset structures in place would be much more straight-forward
assuming the structures were not allocated in the zone that is being
hot-removed. As things stand, I had trouble even testing zone hot-remove
as there was always a few pages left behind and I did not chase down
why. The focus was getting rid of the bogus local_irq_save() because
it was clearly wrong and offering a false sense of safety and the last
problematic local_irq_save() user in page_alloc.c when local_lock is used
to protect the PCP structures.

-- 
Mel Gorman
SUSE Labs


[PATCH v2 resend] mm/memory_hotplug: Make unpopulated zones PCP structures unreachable during hot remove

2021-04-12 Thread Mel Gorman
zone_pcp_reset allegedly protects against a race with drain_pages
using local_irq_save but this is bogus. local_irq_save only operates
on the local CPU. If memory hotplug is running on CPU A and drain_pages
is running on CPU B, disabling IRQs on CPU A does not affect CPU B and
offers no protection.

This patch deletes IRQ disable/enable on the grounds that IRQs protect
nothing and assumes the existing hotplug paths guarantees the PCP cannot be
used after zone_pcp_enable(). That should be the case already because all
the pages have been freed and there is no page to put on the PCP lists.

Signed-off-by: Mel Gorman 
---
Resending for email address correction and adding lists

Changelog since v1
o Minimal fix

 mm/page_alloc.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5e8aedb64b57..9bf0db982f14 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8952,12 +8952,9 @@ void zone_pcp_enable(struct zone *zone)
 
 void zone_pcp_reset(struct zone *zone)
 {
-   unsigned long flags;
int cpu;
struct per_cpu_pageset *pset;
 
-   /* avoid races with drain_pages()  */
-   local_irq_save(flags);
if (zone->pageset != &boot_pageset) {
for_each_online_cpu(cpu) {
pset = per_cpu_ptr(zone->pageset, cpu);
@@ -8966,7 +8963,6 @@ void zone_pcp_reset(struct zone *zone)
free_percpu(zone->pageset);
zone->pageset = &boot_pageset;
}
-   local_irq_restore(flags);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE


Re: [PATCH 02/11] mm/page_alloc: Convert per-cpu list protection to local_lock

2021-04-12 Thread Mel Gorman
On Fri, Apr 09, 2021 at 08:55:39PM +0200, Peter Zijlstra wrote:
> On Fri, Apr 09, 2021 at 02:32:56PM +0100, Mel Gorman wrote:
> > That said, there are some curious users already.
> > fs/squashfs/decompressor_multi_percpu.c looks like it always uses the
> > local_lock in CPU 0's per-cpu structure instead of stabilising a per-cpu
> > pointer. 
> 
> I'm not sure how you read that.
> 
> You're talking about this:
> 
>   local_lock(&msblk->stream->lock);
> 
> right? Note that msblk->stream is a per-cpu pointer, so
> &msblk->stream->lock is that same per-cpu pointer with an offset on.
> 
> The whole thing relies on:
> 
>   &per_cpu_ptr(msblk->stream, cpu)->lock == per_cpu_ptr(&msblk->stream->lock, cpu)
> 
> Which is true because the lhs:
> 
>   (local_lock_t *)((msblk->stream + per_cpu_offset(cpu)) + 
> offsetof(struct squashfs_stream, lock))
> 
> and the rhs:
> 
>   (local_lock_t *)((msblk->stream + offsetof(struct squashfs_stream, 
> lock)) + per_cpu_offset(cpu))
> 
> are identical, because addition is associative.
> 

Ok, I think I see and understand now, I didn't follow far enough down
into the macro magic and missed this observation so thanks for your
patience. The page allocator still incurs a double lookup of the per
cpu offsets but it should work for both the current local_lock_irq
implementation and the one in preempt-rt because the task will be pinned
to the CPU by either preempt_disable, migrate_disable or IRQ disable
depending on the local_lock implementation and kernel configuration.

I'll update the changelog and comment accordingly. I'll decide later
whether to leave it or move the location of the lock at the end of the
series. If the patch is added, it'll either incur the double lookup (not
that expensive, might be optimised by the compiler) or come up with a
helper that takes the lock and returns the per-cpu structure. The double
lookup probably makes more sense initially because there are multiple
potential users of a helper that says "pin to CPU, lookup, lock and return
a per-cpu structure" for both IRQ-safe and IRQ-unsafe variants with the
associated expansion of the local_lock API. It might be better to introduce
such a helper with multiple users converted at the same time and there are
other local_lock users in preempt-rt that could do with upstreaming first.

> > drivers/block/zram/zcomp.c appears to do the same although for
> > at least one of the zcomp_stream_get() callers, the CPU is pinned for
> > other reasons (bit spin lock held). I think it happens to work anyway
> > but it's weird and I'm not a fan.
> 
> Same thing.

Yep.

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 2/9] mm/page_alloc: Add a bulk page allocator

2021-04-12 Thread Mel Gorman
On Mon, Apr 12, 2021 at 11:59:38AM +0100, Mel Gorman wrote:
> > I don't understand this comment. Only alloc_flags_nofragment() sets this 
> > flag
> > and we don't use it here?
> > 
> 
> It's there as a reminder that there are non-obvious consequences
> to ALLOC_NOFRAGMENT that may affect the bulk allocation success
> rate. __rmqueue_fallback will only select pageblock_order pages and if that
> fails, we fall into the slow path that allocates a single page. I didn't
> deal with it because it was not obvious that it's even relevant but I bet
> in 6 months time, I'll forget that ALLOC_NOFRAGMENT may affect success
> rates without the comment. I'm waiting for a bug that can trivially trigger
> a case with a meaningful workload where the success rate is poor enough to
> affect latency before adding complexity. Ideally by then, the allocation
> paths would be unified a bit better.
> 

So this needs better clarification. ALLOC_NOFRAGMENT is not a
problem at the moment but at one point during development, it was a
non-obvious potential problem. If the paths are unified, ALLOC_NOFRAGMENT
*potentially* becomes a problem depending on how it's done and it needs
careful consideration. For example, the paths could be partly unified by
moving the alloc_flags_nofragment() call into prepare_alloc_pages() because
it always happens in __alloc_pages and it looks like an obvious partial
unification (see the sketch below). Hence the comment "May set
ALLOC_NOFRAGMENT" because I wanted a reminder in case I "fixed" this in
6 months time and forgot the downside.
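
For illustration only, the partial unification I have in mind would be
something along these lines. This is a sketch rather than a posted patch
and assumes the current prepare_alloc_pages() arguments, where
ac->preferred_zoneref has already been set:

	/*
	 * Hypothetical: move this from __alloc_pages() to the end of
	 * prepare_alloc_pages(). It would then also apply to
	 * __alloc_pages_bulk(), which is exactly the non-obvious
	 * consequence the "May set ALLOC_NOFRAGMENT" comment warns about.
	 */
	*alloc_flags |= alloc_flags_nofragment(ac->preferred_zoneref->zone,
					       gfp_mask);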

-- 
Mel Gorman
SUSE Labs


[PATCH] mm/page_alloc: Add a bulk page allocator -fix -fix -fix

2021-04-12 Thread Mel Gorman
Vlastimil Babka noted that a comment is wrong, fix it. This is the third
fix to the mmotm patch mm-page_alloc-add-a-bulk-page-allocator.patch.

Signed-off-by: Mel Gorman 
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1c67c99603a3..c62862071e6a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5067,7 +5067,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 
preferred_nid,
return 0;
gfp = alloc_gfp;
 
-   /* Find an allowed local zone that meets the high watermark. */
+   /* Find an allowed local zone that meets the low watermark. */
for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, 
ac.highest_zoneidx, ac.nodemask) {
unsigned long mark;
 


Re: [PATCH 2/9] mm/page_alloc: Add a bulk page allocator

2021-04-12 Thread Mel Gorman
On Mon, Apr 12, 2021 at 12:21:42PM +0200, Vlastimil Babka wrote:
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 8a3e13277e22..eb547470a7e4 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -4965,6 +4965,124 @@ static inline bool prepare_alloc_pages(gfp_t 
> > gfp_mask, unsigned int order,
> > return true;
> >  }
> >  
> > +/*
> > + * __alloc_pages_bulk - Allocate a number of order-0 pages to a list
> > + * @gfp: GFP flags for the allocation
> > + * @preferred_nid: The preferred NUMA node ID to allocate from
> > + * @nodemask: Set of nodes to allocate from, may be NULL
> > + * @nr_pages: The number of pages desired on the list
> > + * @page_list: List to store the allocated pages
> > + *
> > + * This is a batched version of the page allocator that attempts to
> > + * allocate nr_pages quickly and add them to a list.
> > + *
> > + * Returns the number of pages on the list.
> > + */
> > +int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
> > +   nodemask_t *nodemask, int nr_pages,
> > +   struct list_head *page_list)
> > +{
> > +   struct page *page;
> > +   unsigned long flags;
> > +   struct zone *zone;
> > +   struct zoneref *z;
> > +   struct per_cpu_pages *pcp;
> > +   struct list_head *pcp_list;
> > +   struct alloc_context ac;
> > +   gfp_t alloc_gfp;
> > +   unsigned int alloc_flags;
> 
> Was going to complain that this is not set to ALLOC_WMARK_LOW. Must be faster
> next time...
> 

Good that you caught it anyway!

> > +   int allocated = 0;
> > +
> > +   if (WARN_ON_ONCE(nr_pages <= 0))
> > +   return 0;
> > +
> > +   /* Use the single page allocator for one page. */
> > +   if (nr_pages == 1)
> > +   goto failed;
> > +
> > +   /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
> 
> I don't understand this comment. Only alloc_flags_nofragment() sets this flag
> and we don't use it here?
> 

It's there as a reminder that there are non-obvious consequences
to ALLOC_NOFRAGMENT that may affect the bulk allocation success
rate. __rmqueue_fallback will only select pageblock_order pages and if that
fails, we fall into the slow path that allocates a single page. I didn't
deal with it because it was not obvious that it's even relevant but I bet
in 6 months time, I'll forget that ALLOC_NOFRAGMENT may affect success
rates without the comment. I'm waiting for a bug that can trivially trigger
a case with a meaningful workload where the success rate is poor enough to
affect latency before adding complexity. Ideally by then, the allocation
paths would be unified a bit better.

> > +   gfp &= gfp_allowed_mask;
> > +   alloc_gfp = gfp;
> > +   if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
> > +   return 0;
> > +   gfp = alloc_gfp;
> > +
> > +   /* Find an allowed local zone that meets the high watermark. */
> 
> Should it say "low watermark"?
> 

Yeah, that's leftover from an earlier prototype :(

-- 
Mel Gorman
SUSE Labs


Re: [RFC/PATCH] powerpc/smp: Add SD_SHARE_PKG_RESOURCES flag to MC sched-domain

2021-04-12 Thread Mel Gorman
On Mon, Apr 12, 2021 at 11:06:19AM +0100, Valentin Schneider wrote:
> On 12/04/21 10:37, Mel Gorman wrote:
> > On Mon, Apr 12, 2021 at 11:54:36AM +0530, Srikar Dronamraju wrote:
> >> * Gautham R. Shenoy  [2021-04-02 11:07:54]:
> >>
> >> >
> >> > To remedy this, this patch proposes that the LLC be moved to the MC
> >> > level which is a group of cores in one half of the chip.
> >> >
> >> >   SMT (SMT4) --> MC (Hemisphere)[LLC] --> DIE
> >> >
> >>
> >> I think marking Hemisphere as a LLC in a P10 scenario is a good idea.
> >>
> >> > While there is no cache being shared at this level, this is still the
> >> > level where some amount of cache-snooping takes place and it is
> >> > relatively faster to access the data from the caches of the cores
> >> > within this domain. With this change, we no longer see regressions on
> >> > P10 for applications which require single threaded performance.
> >>
> >> Peter, Valentin, Vincent, Mel, etal
> >>
> >> On architectures where we have multiple levels of cache access latencies
> >> within a DIE, (For example: one within the current LLC or SMT core and the
> >> other at MC or Hemisphere, and finally across hemispheres), do you have any
> >> suggestions on how we could handle the same in the core scheduler?
> >>
> >
> > Minimally I think it would be worth detecting when there are multiple
> > LLCs per node and detecting that in generic code as a static branch. In
> > select_idle_cpu, consider taking two passes -- first on the LLC domain
> > and if no idle CPU is found then taking a second pass if the search depth
> > allows within the node with the LLC CPUs masked out.
> 
> I think that's actually a decent approach. Tying SD_SHARE_PKG_RESOURCES to
> something other than pure cache topology in a generic manner is tough (as
> it relies on murky, ill-defined hardware fabric properties).
> 

Agreed. The LLC->node scan idea has been on my TODO list to try for
a while.

> Last I tried thinking about that, I stopped at having a core-to-core
> latency matrix, building domains off of that, and having some knob
> specifying the highest distance value below which we'd set
> SD_SHARE_PKG_RESOURCES. There's a few things I 'hate' about that; for one
> it makes cpus_share_cache() somewhat questionable.
> 

And I thought about something like this too but worried it might get
complex, particularly on chiplets where we do not necessarily have
hardware info on latency depending on how it's wired up. It also might
lead to excessive cpumask manipulation in a fast path if we have to
traverse multiple distances with search cost exceeding gains from latency
reduction. Hence -- keeping it simple with two levels only, LLC then node
within the allowed search depth, and see what that gets us. It might be
"good enough" in most cases and would be a basis for comparison against
more complex approaches.

At minimum, I expect IBM can evaluate the POWER10 aspect and I can run
an evaluation on Zen generations.

-- 
Mel Gorman
SUSE Labs


Re: [RFC/PATCH] powerpc/smp: Add SD_SHARE_PKG_RESOURCES flag to MC sched-domain

2021-04-12 Thread Mel Gorman
On Mon, Apr 12, 2021 at 11:54:36AM +0530, Srikar Dronamraju wrote:
> * Gautham R. Shenoy  [2021-04-02 11:07:54]:
> 
> > 
> > To remedy this, this patch proposes that the LLC be moved to the MC
> > level which is a group of cores in one half of the chip.
> > 
> >   SMT (SMT4) --> MC (Hemisphere)[LLC] --> DIE
> > 
> 
> I think marking Hemisphere as a LLC in a P10 scenario is a good idea.
> 
> > While there is no cache being shared at this level, this is still the
> > level where some amount of cache-snooping takes place and it is
> > relatively faster to access the data from the caches of the cores
> > within this domain. With this change, we no longer see regressions on
> > P10 for applications which require single threaded performance.
> 
> Peter, Valentin, Vincent, Mel, etal
> 
> On architectures where we have multiple levels of cache access latencies
> within a DIE, (For example: one within the current LLC or SMT core and the
> other at MC or Hemisphere, and finally across hemispheres), do you have any
> suggestions on how we could handle the same in the core scheduler?
> 

Minimally I think it would be worth detecting when there are multiple
LLCs per node and detecting that in generic code as a static branch. In
select_idle_cpu, consider taking two passes -- first on the LLC domain
and if no idle CPU is found then taking a second pass if the search depth
allows within the node with the LLC CPUs masked out. While there would be
a latency hit because cache is not shared, it would still be a CPU local
to memory that is idle. That would potentially be beneficial on Zen*
as well without having to introduce new domains in the topology hierarchy.
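
As a rough sketch of what I mean (names invented, untested, and ignoring
the static branch and the search-depth accounting for brevity):

	/* Sketch only: scan the LLC first, then the rest of the node */
	static int select_idle_cpu_twopass(struct task_struct *p,
					   struct sched_domain *sd, int target)
	{
		struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
		int cpu;

		/* First pass: the LLC domain, as select_idle_cpu does today */
		cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
		for_each_cpu_wrap(cpu, cpus, target) {
			if (available_idle_cpu(cpu))
				return cpu;
		}

		/*
		 * Second pass: the rest of the node with the LLC CPUs masked
		 * out. Idle but cache-cold, still local to memory.
		 */
		cpumask_andnot(cpus, cpumask_of_node(cpu_to_node(target)),
			       sched_domain_span(sd));
		cpumask_and(cpus, cpus, p->cpus_ptr);
		for_each_cpu_wrap(cpu, cpus, target) {
			if (available_idle_cpu(cpu))
				return cpu;
		}

		return -1;
	}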

-- 
Mel Gorman
SUSE Labs


Re: [PATCH] mm/memory_hotplug: Make unpopulated zones PCP structures unreachable during hot remove

2021-04-09 Thread Mel Gorman
> > > -void zone_pcp_reset(struct zone *zone)
> > > -{
> > > - unsigned long flags;
> > > - int cpu;
> > > - struct per_cpu_pageset *pset;
> > > -
> > > - /* avoid races with drain_pages()  */
> > > - local_irq_save(flags);
> > > - if (zone->pageset != &boot_pageset) {
> > > - for_each_online_cpu(cpu) {
> > > - pset = per_cpu_ptr(zone->pageset, cpu);
> > > - drain_zonestat(zone, pset);
> > > - }
> > > - free_percpu(zone->pageset);
> > > - zone->pageset = &boot_pageset;
> > > - }
> > > - local_irq_restore(flags);
> > > -}
> > > -
> > 
> > zone_pcp_reset still needs to exist to drain the remaining vmstats or
> > it'll break 5a883813845a ("memory-hotplug: fix zone stat
> > mismatch").
> 
> Are you sure we are resetting vmstats in the hot-remove path? I do not see
> anything like that. Maybe this was needed at the time. I will double
> check.

zone_pcp_reset calls drain_zonestat to apply the per-cpu vmstat deltas
to the atomic per-zone and global stats.

If anything, the minimal "fix" is to simply delete the IRQ disable/enable on
the grounds that IRQs protect nothing, and assume the existing hotplug
paths guarantee the PCP cannot be used after zone_pcp_enable(). That
should be the case already because all the pages have been freed and
there is nothing to even put into the PCPs, but I worried that the PCP
structures themselves might still be reachable even though they are useless,
which is why I freed them once they could not be reached via the zonelists.

-- 
Mel Gorman
SUSE Labs


Re: [PATCH] mm/memory_hotplug: Make unpopulated zones PCP structures unreachable during hot remove

2021-04-09 Thread Mel Gorman
On Fri, Apr 09, 2021 at 02:48:12PM +0200, Michal Hocko wrote:
> On Fri 09-04-21 14:42:58, Michal Hocko wrote:
> > On Fri 09-04-21 13:09:57, Mel Gorman wrote:
> > > zone_pcp_reset allegedly protects against a race with drain_pages
> > > using local_irq_save but this is bogus. local_irq_save only operates
> > > on the local CPU. If memory hotplug is running on CPU A and drain_pages
> > > is running on CPU B, disabling IRQs on CPU A does not affect CPU B and
> > > offers no protection.
> > 
> > Yes, the synchronization aspect is bogus indeed.
> > 
> > > This patch reorders memory hotremove such that the PCP structures
> > > relevant to the zone are no longer reachable by the time the structures
> > > are freed.  With this reordering, no protection is required to prevent
> > > a use-after-free and the IRQs can be left enabled. zone_pcp_reset is
> > > renamed to zone_pcp_destroy to make it clear that the per-cpu structures
> > > are deleted when the function returns.
> > 
> > Wouldn't it be much easier to simply not destroy/reset pcp of an empty
> > zone at all? The whole point of this exercise seems to be described in
> > 340175b7d14d5. setup_zone_pageset can check for an already allocated pcp
> > and simply reinitialize it. 
> 
> I meant this
> 

It might be simpler, but if the intention is to free as much memory
as possible during hot-remove, it seems wasteful to leave the per-cpu
structures behind if we do not have to. If a problem with my patch can
be spotted then I'm happy to go with an alternative fix, but there are
two minor issues with your proposed fix.

> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index e6a602e82860..b0fdda77e570 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -6496,7 +6496,13 @@ void __meminit setup_zone_pageset(struct zone *zone)
>   struct per_cpu_pageset *p;
>   int cpu;
>  
> - zone->pageset = alloc_percpu(struct per_cpu_pageset);
> + /*
> +  * zone could have gone completely offline during memory hotplug
> +  * when the pgdat is left behind for simplicity. On a next onlining
> +  * we do not need to reallocate pcp state.
> +  */
> + if (!zone->pageset)
> + zone->pageset = alloc_percpu(struct per_cpu_pageset);

Should be "if (zone->pageset != _pageset)" ?


>   for_each_possible_cpu(cpu) {
>   p = per_cpu_ptr(zone->pageset, cpu);
>   pageset_init(p);
> @@ -8803,25 +8809,6 @@ void zone_pcp_enable(struct zone *zone)
>   mutex_unlock(&pcp_batch_high_lock);
>  }
>  
> -void zone_pcp_reset(struct zone *zone)
> -{
> - unsigned long flags;
> - int cpu;
> - struct per_cpu_pageset *pset;
> -
> - /* avoid races with drain_pages()  */
> - local_irq_save(flags);
> - if (zone->pageset != &boot_pageset) {
> - for_each_online_cpu(cpu) {
> - pset = per_cpu_ptr(zone->pageset, cpu);
> - drain_zonestat(zone, pset);
> - }
> - free_percpu(zone->pageset);
> - zone->pageset = &boot_pageset;
> - }
> - local_irq_restore(flags);
> -}
> -

zone_pcp_reset still needs to exist to drain the remaining vmstats or
it'll break 5a883813845a ("memory-hotplug: fix zone stat
mismatch").

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 02/11] mm/page_alloc: Convert per-cpu list protection to local_lock

2021-04-09 Thread Mel Gorman
On Fri, Apr 09, 2021 at 10:24:24AM +0200, Peter Zijlstra wrote:
> On Fri, Apr 09, 2021 at 08:59:39AM +0100, Mel Gorman wrote:
> > In the end I just gave up and kept it simple as there is no benefit to
> > !PREEMPT_RT which just disables IRQs. Maybe it'll be worth considering when
> > PREEMPT_RT is upstream and can be enabled. The series was functionally
> > tested on the PREEMPT_RT tree by reverting the page_alloc.c patch and
> applying this series and all of its prerequisites on top.
> 
> Right, I see the problem. Fair enough; perhaps ammend the changelog to
> include some of that so that we can 'remember' in a few months why the
> code is 'funneh'.
> 

I updated the changelog and also added a comment above the
declaration. That said, there are some curious users already.
fs/squashfs/decompressor_multi_percpu.c looks like it always uses the
local_lock in CPU 0's per-cpu structure instead of stabilising a per-cpu
pointer. drivers/block/zram/zcomp.c appears to do the same although for
at least one of the zcomp_stream_get() callers, the CPU is pinned for
other reasons (bit spin lock held). I think it happens to work anyway
but it's weird and I'm not a fan.

Anyway, new version looks like is below.

-- 
[PATCH] mm/page_alloc: Convert per-cpu list protection to local_lock

There is a lack of clarity of what exactly local_irq_save/local_irq_restore
protects in page_alloc.c . It conflates the protection of per-cpu page
allocation structures with per-cpu vmstat deltas.

This patch protects the PCP structure using local_lock which for most
configurations is identical to IRQ enabling/disabling.  The scope of the
lock is still wider than it should be but this is decreased later.

local_lock is declared statically instead of placing it within a structure
and this is deliberate. Placing it in the zone offers limited benefit and
confuses what the lock is protecting -- struct per_cpu_pages. However,
putting it in per_cpu_pages is problematic because the task is not guaranteed
to be pinned to the CPU yet so looking up a per-cpu structure is unsafe.

[l...@intel.com: Make pagesets static]
Signed-off-by: Mel Gorman 
---
 include/linux/mmzone.h |  2 ++
 mm/page_alloc.c| 67 +++---
 2 files changed, 54 insertions(+), 15 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a4393ac27336..106da8fbc72a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /* Free memory management - zoned buddy allocator.  */
@@ -337,6 +338,7 @@ enum zone_watermarks {
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
+/* Fields and list protected by pagesets local_lock in page_alloc.c */
 struct per_cpu_pages {
int count;  /* number of pages in the list */
int high;   /* high watermark, emptying needed */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3bc4da4cbf9c..04644c3dd187 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -112,6 +112,30 @@ typedef int __bitwise fpi_t;
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION   (8)
 
+/*
+ * Protects the per_cpu_pages structures.
+ *
+ * This lock is not placed in struct per_cpu_pages because the task acquiring
+ * the lock is not guaranteed to be pinned to the CPU yet due to
+ * preempt/migrate/IRQs disabled or holding a spinlock. The pattern to acquire
+ * the lock would become
+ *
+ *   migrate_disable();
+ *   pcp = this_cpu_ptr(zone->per_cpu_pageset);
+ *   local_lock_irqsave(&pcp->lock, flags);
+ *
+ * While a helper would avoid code duplication, there is no inherent advantage
+ * and migrate_disable itself is undesirable (see include/linux/preempt.h).
+ * Similarly, putting the lock in the zone offers no particular benefit but
+ * confuses what the lock is protecting.
+ */
+struct pagesets {
+   local_lock_t lock;
+};
+static DEFINE_PER_CPU(struct pagesets, pagesets) = {
+   .lock = INIT_LOCAL_LOCK(lock),
+};
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -1421,6 +1445,10 @@ static void free_pcppages_bulk(struct zone *zone, int 
count,
} while (--count && --batch_free && !list_empty(list));
}
 
+   /*
+* local_lock_irq held so equivalent to spin_lock_irqsave for
+* both PREEMPT_RT and non-PREEMPT_RT configurations.
+*/
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
 
@@ -1541,6 +1569,11 @@ static void __free_pages_ok(struct page *page, unsigned 
int order,
return;
 
migratetype = get_pfnblock_migratetype(page, pfn);
+
+   /*
+* TODO FIX: Disable IRQs before acquiring IRQ-safe

[PATCH] mm/memory_hotplug: Make unpopulated zones PCP structures unreachable during hot remove

2021-04-09 Thread Mel Gorman
zone_pcp_reset allegedly protects against a race with drain_pages
using local_irq_save but this is bogus. local_irq_save only operates
on the local CPU. If memory hotplug is running on CPU A and drain_pages
is running on CPU B, disabling IRQs on CPU A does not affect CPU B and
offers no protection.

This patch reorders memory hotremove such that the PCP structures
relevant to the zone are no longer reachable by the time the structures
are freed.  With this reordering, no protection is required to prevent
a use-after-free and the IRQs can be left enabled. zone_pcp_reset is
renamed to zone_pcp_destroy to make it clear that the per-cpu structures
are deleted when the function returns.

Signed-off-by: Mel Gorman 
---
 mm/internal.h   |  2 +-
 mm/memory_hotplug.c | 10 +++---
 mm/page_alloc.c | 22 --
 3 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 09adf152a10b..cc34ce4461b7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -203,7 +203,7 @@ extern void free_unref_page(struct page *page);
 extern void free_unref_page_list(struct list_head *list);
 
 extern void zone_pcp_update(struct zone *zone);
-extern void zone_pcp_reset(struct zone *zone);
+extern void zone_pcp_destroy(struct zone *zone);
 extern void zone_pcp_disable(struct zone *zone);
 extern void zone_pcp_enable(struct zone *zone);
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0cdbbfbc5757..3d059c9f9c2d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1687,12 +1687,16 @@ int __ref offline_pages(unsigned long start_pfn, 
unsigned long nr_pages)
zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
 
-   zone_pcp_enable(zone);
-
/* removal success */
adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
zone->present_pages -= nr_pages;
 
+   /*
+* Restore PCP after managed pages have been updated. Unpopulated
+* zones' PCP structures will remain unusable.
+*/
+   zone_pcp_enable(zone);
+
pgdat_resize_lock(zone->zone_pgdat, &flags);
zone->zone_pgdat->node_present_pages -= nr_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
@@ -1700,8 +1704,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned 
long nr_pages)
init_per_zone_wmark_min();
 
if (!populated_zone(zone)) {
-   zone_pcp_reset(zone);
build_all_zonelists(NULL);
+   zone_pcp_destroy(zone);
} else
zone_pcp_update(zone);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5e8aedb64b57..d6c3db853552 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8946,18 +8946,29 @@ void zone_pcp_disable(struct zone *zone)
 
 void zone_pcp_enable(struct zone *zone)
 {
-   __zone_set_pageset_high_and_batch(zone, zone->pageset_high, 
zone->pageset_batch);
+   /*
+* If the zone is populated, restore the high and batch counts.
+* If unpopulated, leave the high and batch count as 0 and 1
+* respectively as done by zone_pcp_disable. The per-cpu
+* structures will later be freed by zone_pcp_destroy.
+*/
+   if (populated_zone(zone))
+   __zone_set_pageset_high_and_batch(zone, zone->pageset_high, 
zone->pageset_batch);
+
mutex_unlock(&pcp_batch_high_lock);
 }
 
-void zone_pcp_reset(struct zone *zone)
+/*
+ * Called when a zone has been hot-removed. At this point, the PCP has been
+ * drained, disabled and the zone is removed from the zonelists so the
+ * structures are no longer in use. PCP was disabled/drained by
+ * zone_pcp_disable. This function will drain any remaining vmstat deltas.
+ */
+void zone_pcp_destroy(struct zone *zone)
 {
-   unsigned long flags;
int cpu;
struct per_cpu_pageset *pset;
 
-   /* avoid races with drain_pages()  */
-   local_irq_save(flags);
if (zone->pageset != &boot_pageset) {
for_each_online_cpu(cpu) {
pset = per_cpu_ptr(zone->pageset, cpu);
@@ -8966,7 +8977,6 @@ void zone_pcp_reset(struct zone *zone)
free_percpu(zone->pageset);
zone->pageset = &boot_pageset;
}
-   local_irq_restore(flags);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE


Re: Problem in pfmemalloc skb handling in net/core/dev.c

2021-04-09 Thread Mel Gorman
On Fri, Apr 09, 2021 at 02:14:12AM -0700, Xie He wrote:
> On Fri, Apr 9, 2021 at 1:44 AM Mel Gorman  wrote:
> >
> > That would imply that the tap was communicating with a swap device to
> > allocate a pfmemalloc skb which shouldn't happen. Furthermore, it would
> > require the swap device to be deactivated while pfmemalloc skbs still
> > existed. Have you encountered this problem?
> 
> I'm not a user of swap devices or pfmemalloc skbs. I just want to make
> sure the protocols that I'm developing (not IP or IPv6) won't get
> pfmemalloc skbs when receiving, because those protocols cannot handle
> them.
> 
> According to the code, it seems always possible to get a pfmemalloc
> skb when a network driver calls "__netdev_alloc_skb". The skb will
> then be queued in per-CPU backlog queues when the driver calls
> "netif_rx". There seems to be nothing preventing "sk_memalloc_socks()"
> from becoming "false" after the skb is allocated and before it is
> handled by "__netif_receive_skb".
> 
> Do you mean that at the time "sk_memalloc_socks()" changes from "true"
> to "false", there would be no in-flight skbs currently being received,
> and all network communications have been paused?

Not all network communication, but communication with swap devices
should have stopped once sk_memalloc_socks is false.

-- 
Mel Gorman
SUSE Labs


Re: Problem in pfmemalloc skb handling in net/core/dev.c

2021-04-09 Thread Mel Gorman
On Fri, Apr 09, 2021 at 01:33:24AM -0700, Xie He wrote:
> On Fri, Apr 9, 2021 at 12:30 AM Mel Gorman  
> wrote:
> >
> > Under what circumstances do you expect sk_memalloc_socks() to be false
> > and skb_pfmemalloc() to be true that would cause a problem?
> 
> For example, if at the time the skb is allocated,
> "sk_memalloc_socks()" was true, then the skb might be allocated as a
> pfmemalloc skb. However, if after this skb is allocated and before
> this skb reaches "__netif_receive_skb", "sk_memalloc_socks()" has
> changed from "true" to "false", then "__netif_receive_skb" will see
> "sk_memalloc_socks()" being false and "skb_pfmemalloc(skb)" being
> true.
> 
> This is a problem because this would cause a pfmemalloc skb to be
> delivered to "taps" and protocols that don't support pfmemalloc skbs.

That would imply that the tap was communicating with a swap device to
allocate a pfmemalloc skb which shouldn't happen. Furthermore, it would
require the swap device to be deactivated while pfmemalloc skbs still
existed. Have you encountered this problem?

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 02/11] mm/page_alloc: Convert per-cpu list protection to local_lock

2021-04-09 Thread Mel Gorman
On Fri, Apr 09, 2021 at 08:39:45AM +0200, Peter Zijlstra wrote:
> On Thu, Apr 08, 2021 at 06:42:44PM +0100, Mel Gorman wrote:
> > On Thu, Apr 08, 2021 at 12:52:07PM +0200, Peter Zijlstra wrote:
> > > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > > index a68bacddcae0..e9e60d1a85d4 100644
> > > > --- a/mm/page_alloc.c
> > > > +++ b/mm/page_alloc.c
> > > > @@ -112,6 +112,13 @@ typedef int __bitwise fpi_t;
> > > >  static DEFINE_MUTEX(pcp_batch_high_lock);
> > > >  #define MIN_PERCPU_PAGELIST_FRACTION   (8)
> > > >  
> > > > +struct pagesets {
> > > > +   local_lock_t lock;
> > > > +};
> > > > +static DEFINE_PER_CPU(struct pagesets, pagesets) = {
> > > > +   .lock = INIT_LOCAL_LOCK(lock),
> > > > +};
> > > 
> > > So why isn't the local_lock_t in struct per_cpu_pages ? That seems to be
> > > the actual object that is protected by it and is already per-cpu.
> > > 
> > > Is that because you want to avoid the duplication across zones? Is that
> > > worth the effort?
> > 
> > When I wrote the patch, the problem was that zone_pcp_reset freed the
> > per_cpu_pages structure and it was "protected" by local_irq_save(). If
> > that was converted to local_lock_irq then the structure containing the
> > lock is freed before it is released which is obviously bad.
> > 
> > Much later when trying to make the allocator RT-safe in general, I realised
> > that locking was broken and fixed it in patch 3 of this series. With that,
> > the local_lock could potentially be embedded within per_cpu_pages safely
> > at the end of this series.
> 
> Fair enough; I was just wondering why the obvious solution wasn't chosen
> and neither changelog nor comment explain, so I had to ask :-)

It's a fair question and it was my first approach before I hit problems.
Thinking again this morning, I remembered that another problem I hit was
patterns like this

local_lock_irqsave(&pagesets.lock, flags);
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);

turning into

cpu = get_cpu();
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
local_lock_irqsave(&pcp->lock, flags);

That has its own problems if zone->lock was acquired within the
local_lock_irqsave section (Section "spinlock_t and rwlock_t" in
Documentation/locking/locktypes.rst) so it has to turn into

migrate_disable();
pcp = this_cpu_ptr(zone->per_cpu_pageset);
local_lock_irqsave(&pcp->lock, flags);

I did not want to start adding migrate_disable() in multiple places like
this because I'm guessing that new users of migrate_disable() need strong
justification and adding such code in page_alloc.c might cause cargo-cult
copy in the future. Maybe it could be addressed with a helper like
this_cpu_local_lock or this_cpu_local_lock_irq but that means in some
cases that the PCP structure is looked up twice with patterns like this one

local_lock_irqsave(&pagesets.lock, flags);
free_unref_page_commit(page, pfn, migratetype);
local_unlock_irqrestore(&pagesets.lock, flags);

To get around multiple lookups the helper becomes something that disables
migration, looks up the PCP structure, locks it and returns it with
pcp then passed around as appropriate. Not sure what I would call that
helper :P

In the end I just gave up and kept it simple as there is no benefit to
!PREEMPT_RT which just disables IRQs. Maybe it'll be worth considering when
PREEMPT_RT is upstream and can be enabled. The series was functionally
tested on the PREEMPT_RT tree by reverting the page_alloc.c patch and
applying this series and all of its prerequisites on top.

-- 
Mel Gorman
SUSE Labs


Re: Problem in pfmemalloc skb handling in net/core/dev.c

2021-04-09 Thread Mel Gorman
On Thu, Apr 08, 2021 at 11:52:01AM -0700, Xie He wrote:
> Hi Mel Gorman,
> 
> I may have found a problem in pfmemalloc skb handling in
> net/core/dev.c. I see there are "if" conditions checking for
> "sk_memalloc_socks() && skb_pfmemalloc(skb)", and when the condition
> is true, the skb is handled specially as a pfmemalloc skb, otherwise
> it is handled as a normal skb.
> 
> However, if "sk_memalloc_socks()" is false and "skb_pfmemalloc(skb)"
> is true, the skb is still handled as a normal skb. Is this correct?

Under what circumstances do you expect sk_memalloc_socks() to be false
and skb_pfmemalloc() to be true that would cause a problem?

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 0/11 v2] Use local_lock for pcp protection and reduce stat overhead

2021-04-08 Thread Mel Gorman
On Thu, Apr 08, 2021 at 12:56:01PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 07, 2021 at 09:24:12PM +0100, Mel Gorman wrote:
> > Why local_lock? PREEMPT_RT considers the following sequence to be unsafe
> > as documented in Documentation/locking/locktypes.rst
> > 
> >local_irq_disable();
> >raw_spin_lock();
> 
> Almost, the above is actually OK on RT. The problematic one is:
> 
>   local_irq_disable();
>   spin_lock();
> 
> That doesn't work on RT since spin_lock() turns into a PI-mutex which
> then obviously explodes if it tries to block with IRQs disabled.
> 
> And it so happens, that's exactly the one at hand.

Ok, I completely messed up the leader because it was local_irq_disable()
+ spin_lock() that I was worried about. Once the series is complete, it
is replaced with

  local_lock_irq(&pagesets.lock)
  spin_lock();

According to Documentation/locking/locktypes.rst, that should be safe.
I'll rephrase the justification.

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 02/11] mm/page_alloc: Convert per-cpu list protection to local_lock

2021-04-08 Thread Mel Gorman
On Thu, Apr 08, 2021 at 12:52:07PM +0200, Peter Zijlstra wrote:
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index a68bacddcae0..e9e60d1a85d4 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -112,6 +112,13 @@ typedef int __bitwise fpi_t;
> >  static DEFINE_MUTEX(pcp_batch_high_lock);
> >  #define MIN_PERCPU_PAGELIST_FRACTION   (8)
> >  
> > +struct pagesets {
> > +   local_lock_t lock;
> > +};
> > +static DEFINE_PER_CPU(struct pagesets, pagesets) = {
> > +   .lock = INIT_LOCAL_LOCK(lock),
> > +};
> 
> So why isn't the local_lock_t in struct per_cpu_pages ? That seems to be
> the actual object that is protected by it and is already per-cpu.
> 
> Is that because you want to avoid the duplication across zones? Is that
> worth the effort?

When I wrote the patch, the problem was that zone_pcp_reset freed the
per_cpu_pages structure and it was "protected" by local_irq_save(). If
that was converted to local_lock_irq then the structure containing the
lock is freed before it is released which is obviously bad.

Much later when trying to make the allocator RT-safe in general, I realised
that locking was broken and fixed it in patch 3 of this series. With that,
the local_lock could potentially be embedded within per_cpu_pages safely
at the end of this series.

-- 
Mel Gorman
SUSE Labs


[PATCH 11/11] mm/page_alloc: Update PGFREE outside the zone lock in __free_pages_ok

2021-04-07 Thread Mel Gorman
VM events do not need explicit protection by disabling IRQs so
update the counter with IRQs enabled in __free_pages_ok.

Signed-off-by: Mel Gorman 
---
 mm/page_alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d98d97b6cf5..49951dd841fa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1569,10 +1569,11 @@ static void __free_pages_ok(struct page *page, unsigned 
int order,
migratetype = get_pfnblock_migratetype(page, pfn);
 
spin_lock_irqsave(&zone->lock, flags);
-   __count_vm_events(PGFREE, 1 << order);
migratetype = check_migratetype_isolated(zone, page, pfn, migratetype);
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);
+
+   __count_vm_events(PGFREE, 1 << order);
 }
 
 void __free_pages_core(struct page *page, unsigned int order)
-- 
2.26.2



[PATCH 10/11] mm/page_alloc: Avoid conflating IRQs disabled with zone->lock

2021-04-07 Thread Mel Gorman
Historically when freeing pages, free_one_page() assumed that callers
had IRQs disabled and the zone->lock could be acquired with spin_lock().
This confuses the scope of what local_lock_irq is protecting and what
zone->lock is protecting in free_unref_page_list in particular.

This patch uses spin_lock_irqsave() for the zone->lock in
free_one_page() instead of relying on callers to have disabled
IRQs. free_unref_page_commit() is changed to only deal with PCP pages
protected by the local lock. free_unref_page_list() then first frees
isolated pages to the buddy lists with free_one_page() and frees the rest
of the pages to the PCP via free_unref_page_commit(). The end result
is that free_one_page() is no longer depending on side-effects of
local_lock to be correct.

Note that this may incur a performance penalty while memory hot-remove
is running but that is not a common operation.

Signed-off-by: Mel Gorman 
---
 mm/page_alloc.c | 67 ++---
 1 file changed, 41 insertions(+), 26 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d94ec53367bd..6d98d97b6cf5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1473,10 +1473,12 @@ static void free_one_page(struct zone *zone,
unsigned int order,
int migratetype, fpi_t fpi_flags)
 {
-   spin_lock(&zone->lock);
+   unsigned long flags;
+
+   spin_lock_irqsave(&zone->lock, flags);
migratetype = check_migratetype_isolated(zone, page, pfn, migratetype);
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
-   spin_unlock(&zone->lock);
+   spin_unlock_irqrestore(&zone->lock, flags);
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -3238,31 +3240,13 @@ static bool free_unref_page_prepare(struct page *page, 
unsigned long pfn)
return true;
 }
 
-static void free_unref_page_commit(struct page *page, unsigned long pfn)
+static void free_unref_page_commit(struct page *page, unsigned long pfn,
+  int migratetype)
 {
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
-   int migratetype;
 
-   migratetype = get_pcppage_migratetype(page);
__count_vm_event(PGFREE);
-
-   /*
-* We only track unmovable, reclaimable and movable on pcp lists.
-* Free ISOLATE pages back to the allocator because they are being
-* offlined but treat HIGHATOMIC as movable pages so we can get those
-* areas back if necessary. Otherwise, we may have to free
-* excessively into the page allocator
-*/
-   if (migratetype >= MIGRATE_PCPTYPES) {
-   if (unlikely(is_migrate_isolate(migratetype))) {
-   free_one_page(zone, page, pfn, 0, migratetype,
- FPI_NONE);
-   return;
-   }
-   migratetype = MIGRATE_MOVABLE;
-   }
-
pcp = this_cpu_ptr(zone->per_cpu_pageset);
list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
@@ -3277,12 +3261,29 @@ void free_unref_page(struct page *page)
 {
unsigned long flags;
unsigned long pfn = page_to_pfn(page);
+   int migratetype;
 
if (!free_unref_page_prepare(page, pfn))
return;
 
+   /*
+* We only track unmovable, reclaimable and movable on pcp lists.
+* Place ISOLATE pages on the isolated list because they are being
+* offlined but treat HIGHATOMIC as movable pages so we can get those
+* areas back if necessary. Otherwise, we may have to free
+* excessively into the page allocator
+*/
+   migratetype = get_pcppage_migratetype(page);
+   if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
+   if (unlikely(is_migrate_isolate(migratetype))) {
+   free_one_page(page_zone(page), page, pfn, 0, 
migratetype, FPI_NONE);
+   return;
+   }
+   migratetype = MIGRATE_MOVABLE;
+   }
+
local_lock_irqsave(&pagesets.lock, flags);
-   free_unref_page_commit(page, pfn);
+   free_unref_page_commit(page, pfn, migratetype);
local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
@@ -3294,6 +3295,7 @@ void free_unref_page_list(struct list_head *list)
struct page *page, *next;
unsigned long flags, pfn;
int batch_count = 0;
+   int migratetype;
 
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
@@ -3301,15 +3303,28 @@ void free_unref_page_list(struct list_head *list)
if (!free_unref_page_prepare(page, pfn))
list_del(&page->lru);
set_page_private(page, pfn);
+
+   /*
+* Free isolated pages directly to the allocator, see
+* comment in free_unref_page.
+

[PATCH 09/11] mm/page_alloc: Explicitly acquire the zone lock in __free_pages_ok

2021-04-07 Thread Mel Gorman
__free_pages_ok() disables IRQs before calling a common helper
free_one_page() that acquires the zone lock. While this is safe, it
unnecessarily disables IRQs on PREEMPT_RT kernels.

This patch explicitly acquires the lock with spin_lock_irqsave instead of
relying on a helper. This removes the last instance of local_irq_save()
in page_alloc.c.

Signed-off-by: Mel Gorman 
---
 mm/page_alloc.c | 13 +
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1bb5b522a0f9..d94ec53367bd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1559,21 +1559,18 @@ static void __free_pages_ok(struct page *page, unsigned 
int order,
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
+   struct zone *zone = page_zone(page);
 
if (!free_pages_prepare(page, order, true))
return;
 
migratetype = get_pfnblock_migratetype(page, pfn);
 
-   /*
-* TODO FIX: Disable IRQs before acquiring IRQ-safe zone->lock
-* and protect vmstat updates.
-*/
-   local_irq_save(flags);
+   spin_lock_irqsave(&zone->lock, flags);
__count_vm_events(PGFREE, 1 << order);
-   free_one_page(page_zone(page), page, pfn, order, migratetype,
- fpi_flags);
-   local_irq_restore(flags);
+   migratetype = check_migratetype_isolated(zone, page, pfn, migratetype);
+   __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
+   spin_unlock_irqrestore(&zone->lock, flags);
 }
 
 void __free_pages_core(struct page *page, unsigned int order)
-- 
2.26.2



[PATCH 08/11] mm/page_alloc: Remove duplicate checks if migratetype should be isolated

2021-04-07 Thread Mel Gorman
Both free_pcppages_bulk() and free_one_page() have very similar
checks about whether a pages migratetype has changed under the
zone lock. Use a common helper.

Signed-off-by: Mel Gorman 
---
 mm/page_alloc.c | 32 ++--
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bd75102ef1e1..1bb5b522a0f9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1354,6 +1354,23 @@ static inline void prefetch_buddy(struct page *page)
prefetch(buddy);
 }
 
+/*
+ * The migratetype of a page may have changed due to isolation so check.
+ * Assumes the caller holds the zone->lock to serialise against page
+ * isolation.
+ */
+static inline int
+check_migratetype_isolated(struct zone *zone, struct page *page, unsigned long 
pfn, int migratetype)
+{
+   /* If isolating, check if the migratetype has changed */
+   if (unlikely(has_isolate_pageblock(zone) ||
+   is_migrate_isolate(migratetype))) {
+   migratetype = get_pfnblock_migratetype(page, pfn);
+   }
+
+   return migratetype;
+}
+
 /*
  * Frees a number of pages from the PCP lists
  * Assumes all pages on list are in same zone, and of same order.
@@ -1371,7 +1388,6 @@ static void free_pcppages_bulk(struct zone *zone, int 
count,
int migratetype = 0;
int batch_free = 0;
int prefetch_nr = READ_ONCE(pcp->batch);
-   bool isolated_pageblocks;
struct page *page, *tmp;
LIST_HEAD(head);
 
@@ -1433,21 +1449,20 @@ static void free_pcppages_bulk(struct zone *zone, int 
count,
 * both PREEMPT_RT and non-PREEMPT_RT configurations.
 */
spin_lock(&zone->lock);
-   isolated_pageblocks = has_isolate_pageblock(zone);
 
/*
 * Use safe version since after __free_one_page(),
 * page->lru.next will not point to original list.
 */
list_for_each_entry_safe(page, tmp, &head, lru) {
+   unsigned long pfn = page_to_pfn(page);
int mt = get_pcppage_migratetype(page);
+
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
-   /* Pageblock could have been isolated meanwhile */
-   if (unlikely(isolated_pageblocks))
-   mt = get_pageblock_migratetype(page);
 
-   __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
+   mt = check_migratetype_isolated(zone, page, pfn, mt);
+   __free_one_page(page, pfn, zone, 0, mt, FPI_NONE);
trace_mm_page_pcpu_drain(page, 0, mt);
}
spin_unlock(&zone->lock);
@@ -1459,10 +1474,7 @@ static void free_one_page(struct zone *zone,
int migratetype, fpi_t fpi_flags)
 {
spin_lock(&zone->lock);
-   if (unlikely(has_isolate_pageblock(zone) ||
-   is_migrate_isolate(migratetype))) {
-   migratetype = get_pfnblock_migratetype(page, pfn);
-   }
+   migratetype = check_migratetype_isolated(zone, page, pfn, migratetype);
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
spin_unlock(&zone->lock);
 }
-- 
2.26.2



[PATCH 07/11] mm/page_alloc: Reduce duration that IRQs are disabled for VM counters

2021-04-07 Thread Mel Gorman
IRQs are left disabled for the zone and node VM event counters. This is
unnecessary as the affected counters are allowed to race for preemption
and IRQs.

This patch reduces the scope of IRQs being disabled
via local_[lock|unlock]_irq on !PREEMPT_RT kernels. One
__mod_zone_freepage_state is still called with IRQs disabled. While this
could be moved out, it's not free on all architectures as some require
IRQs to be disabled for mod_zone_page_state on !PREEMPT_RT kernels.

Signed-off-by: Mel Gorman 
---
 mm/page_alloc.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index defb0e436fac..bd75102ef1e1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3474,11 +3474,11 @@ static struct page *rmqueue_pcplist(struct zone 
*preferred_zone,
pcp = this_cpu_ptr(zone->per_cpu_pageset);
list = >lists[migratetype];
page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
+   local_unlock_irqrestore(&pagesets.lock, flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone, 1);
}
-   local_unlock_irqrestore(&pagesets.lock, flags);
return page;
 }
 
@@ -3530,15 +3530,15 @@ struct page *rmqueue(struct zone *preferred_zone,
if (!page)
page = __rmqueue(zone, order, migratetype, alloc_flags);
} while (page && check_new_pages(page, order));
-   spin_unlock(&zone->lock);
if (!page)
goto failed;
+
__mod_zone_freepage_state(zone, -(1 << order),
  get_pcppage_migratetype(page));
+   spin_unlock_irqrestore(&zone->lock, flags);
 
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
-   local_irq_restore(flags);
 
 out:
/* Separate test+clear to avoid unnecessary atomics */
@@ -3551,7 +3551,7 @@ struct page *rmqueue(struct zone *preferred_zone,
return page;
 
 failed:
-   local_irq_restore(flags);
+   spin_unlock_irqrestore(&zone->lock, flags);
return NULL;
 }
 
@@ -5103,11 +5103,11 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 
preferred_nid,
nr_populated++;
}
 
+   local_unlock_irqrestore(&pagesets.lock, flags);
+
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
 
-   local_unlock_irqrestore(&pagesets.lock, flags);
-
return nr_populated;
 
 failed_irq:
-- 
2.26.2



[PATCH 06/11] mm/page_alloc: Batch the accounting updates in the bulk allocator

2021-04-07 Thread Mel Gorman
Now that the zone_statistics are simple counters that do not require
special protection, the bulk allocator accounting updates can be batch
updated without adding too much complexity with protected RMW updates or
using xchg.

Signed-off-by: Mel Gorman 
---
 include/linux/vmstat.h |  8 
 mm/page_alloc.c| 30 +-
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index dde4dec4e7dd..8473b8fa9756 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -246,6 +246,14 @@ __count_numa_event(struct zone *zone, enum numa_stat_item 
item)
raw_cpu_inc(pzstats->vm_numa_event[item]);
 }
 
+static inline void
+__count_numa_events(struct zone *zone, enum numa_stat_item item, long delta)
+{
+   struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+
+   raw_cpu_add(pzstats->vm_numa_event[item], delta);
+}
+
 extern void __count_numa_event(struct zone *zone, enum numa_stat_item item);
 extern unsigned long sum_zone_node_page_state(int node,
  enum zone_stat_item item);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 73e618d06315..defb0e436fac 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3411,7 +3411,8 @@ void __putback_isolated_page(struct page *page, unsigned 
int order, int mt)
  *
  * Must be called with interrupts disabled.
  */
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+  long nr_account)
 {
 #ifdef CONFIG_NUMA
enum numa_stat_item local_stat = NUMA_LOCAL;
@@ -3424,12 +3425,12 @@ static inline void zone_statistics(struct zone 
*preferred_zone, struct zone *z)
local_stat = NUMA_OTHER;
 
if (zone_to_nid(z) == zone_to_nid(preferred_zone))
-   __count_numa_event(z, NUMA_HIT);
+   __count_numa_events(z, NUMA_HIT, nr_account);
else {
-   __count_numa_event(z, NUMA_MISS);
-   __count_numa_event(preferred_zone, NUMA_FOREIGN);
+   __count_numa_events(z, NUMA_MISS, nr_account);
+   __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
}
-   __count_numa_event(z, local_stat);
+   __count_numa_events(z, local_stat, nr_account);
 #endif
 }
 
@@ -3475,7 +3476,7 @@ static struct page *rmqueue_pcplist(struct zone 
*preferred_zone,
page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
-   zone_statistics(preferred_zone, zone);
+   zone_statistics(preferred_zone, zone, 1);
}
local_unlock_irqrestore(, flags);
return page;
@@ -3536,7 +3537,7 @@ struct page *rmqueue(struct zone *preferred_zone,
  get_pcppage_migratetype(page));
 
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-   zone_statistics(preferred_zone, zone);
+   zone_statistics(preferred_zone, zone, 1);
local_irq_restore(flags);
 
 out:
@@ -5019,7 +5020,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 
preferred_nid,
struct alloc_context ac;
gfp_t alloc_gfp;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
-   int nr_populated = 0;
+   int nr_populated = 0, nr_account = 0;
 
if (unlikely(nr_pages <= 0))
return 0;
@@ -5092,15 +5093,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 
preferred_nid,
goto failed_irq;
break;
}
-
-   /*
-* Ideally this would be batched but the best way to do
-* that cheaply is to first convert zone_statistics to
-* be inaccurate per-cpu counter like vm_events to avoid
-* a RMW cycle then do the accounting with IRQs enabled.
-*/
-   __count_zid_vm_events(PGALLOC, zone_idx(zone), 1);
-   zone_statistics(ac.preferred_zoneref->zone, zone);
+   nr_account++;
 
prep_new_page(page, 0, gfp, 0);
if (page_list)
@@ -5110,6 +5103,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 
preferred_nid,
nr_populated++;
}
 
+   __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
+   zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
+
local_unlock_irqrestore(&pagesets.lock, flags);
 
return nr_populated;
-- 
2.26.2



[PATCH 05/11] mm/vmstat: Inline NUMA event counter updates

2021-04-07 Thread Mel Gorman
__count_numa_event is small enough to be treated similarly to
__count_vm_event so inline it.

Signed-off-by: Mel Gorman 
---
 include/linux/vmstat.h | 9 +
 mm/vmstat.c| 9 -
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index fc14415223c5..dde4dec4e7dd 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -237,6 +237,15 @@ static inline unsigned long 
zone_page_state_snapshot(struct zone *zone,
 }
 
 #ifdef CONFIG_NUMA
+/* See __count_vm_event comment on why raw_cpu_inc is used. */
+static inline void
+__count_numa_event(struct zone *zone, enum numa_stat_item item)
+{
+   struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+
+   raw_cpu_inc(pzstats->vm_numa_event[item]);
+}
+
 extern void __count_numa_event(struct zone *zone, enum numa_stat_item item);
 extern unsigned long sum_zone_node_page_state(int node,
  enum zone_stat_item item);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 63bd84d122c0..b853df95ed0c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -902,15 +902,6 @@ void drain_zonestat(struct zone *zone, struct 
per_cpu_zonestat *pzstats)
 #endif
 
 #ifdef CONFIG_NUMA
-/* See __count_vm_event comment on why raw_cpu_inc is used. */
-void __count_numa_event(struct zone *zone,
-enum numa_stat_item item)
-{
-   struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
-
-   raw_cpu_inc(pzstats->vm_numa_event[item]);
-}
-
 /*
  * Determine the per node value of a stat item. This function
  * is called frequently in a NUMA machine, so try to be as
-- 
2.26.2



[PATCH 04/11] mm/vmstat: Convert NUMA statistics to basic NUMA counters

2021-04-07 Thread Mel Gorman
NUMA statistics are maintained on the zone level for hits, misses, foreign
etc but nothing relies on them being perfectly accurate for functional
correctness. The counters are used by userspace to get a general overview
of a workloads NUMA behaviour but the page allocator incurs a high cost to
maintain perfect accuracy similar to what is required for a vmstat like
NR_FREE_PAGES. There even is a sysctl vm.numa_stat to allow userspace to
turn off the collection of NUMA statistics like NUMA_HIT.

This patch converts NUMA_HIT and friends to be NUMA events with similar
accuracy to VM events. There is a possibility that slight errors will be
introduced but the overall trend as seen by userspace will be similar.
Note that while these counters could be maintained at the node level,
doing so would have a user-visible impact.

Signed-off-by: Mel Gorman 
---
 drivers/base/node.c|  18 +++--
 include/linux/mmzone.h |  11 ++-
 include/linux/vmstat.h |  42 +-
 mm/mempolicy.c |   2 +-
 mm/page_alloc.c|  12 +--
 mm/vmstat.c| 175 -
 6 files changed, 93 insertions(+), 167 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index f449dbb2c746..443a609db428 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -484,6 +484,7 @@ static DEVICE_ATTR(meminfo, 0444, node_read_meminfo, NULL);
 static ssize_t node_read_numastat(struct device *dev,
  struct device_attribute *attr, char *buf)
 {
+   fold_vm_numa_events();
return sysfs_emit(buf,
  "numa_hit %lu\n"
  "numa_miss %lu\n"
@@ -491,12 +492,12 @@ static ssize_t node_read_numastat(struct device *dev,
  "interleave_hit %lu\n"
  "local_node %lu\n"
  "other_node %lu\n",
- sum_zone_numa_state(dev->id, NUMA_HIT),
- sum_zone_numa_state(dev->id, NUMA_MISS),
- sum_zone_numa_state(dev->id, NUMA_FOREIGN),
- sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
- sum_zone_numa_state(dev->id, NUMA_LOCAL),
- sum_zone_numa_state(dev->id, NUMA_OTHER));
+ sum_zone_numa_event_state(dev->id, NUMA_HIT),
+ sum_zone_numa_event_state(dev->id, NUMA_MISS),
+ sum_zone_numa_event_state(dev->id, NUMA_FOREIGN),
+ sum_zone_numa_event_state(dev->id, 
NUMA_INTERLEAVE_HIT),
+ sum_zone_numa_event_state(dev->id, NUMA_LOCAL),
+ sum_zone_numa_event_state(dev->id, NUMA_OTHER));
 }
 static DEVICE_ATTR(numastat, 0444, node_read_numastat, NULL);
 
@@ -514,10 +515,11 @@ static ssize_t node_read_vmstat(struct device *dev,
 sum_zone_node_page_state(nid, i));
 
 #ifdef CONFIG_NUMA
-   for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+   fold_vm_numa_events();
+   for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
len += sysfs_emit_at(buf, len, "%s %lu\n",
 numa_stat_name(i),
-sum_zone_numa_state(nid, i));
+sum_zone_numa_event_state(nid, i));
 
 #endif
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 106da8fbc72a..693cd5f24f7d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -135,10 +135,10 @@ enum numa_stat_item {
NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
NUMA_LOCAL, /* allocation from local node */
NUMA_OTHER, /* allocation from other node */
-   NR_VM_NUMA_STAT_ITEMS
+   NR_VM_NUMA_EVENT_ITEMS
 };
 #else
-#define NR_VM_NUMA_STAT_ITEMS 0
+#define NR_VM_NUMA_EVENT_ITEMS 0
 #endif
 
 enum zone_stat_item {
@@ -357,7 +357,10 @@ struct per_cpu_zonestat {
s8 stat_threshold;
 #endif
 #ifdef CONFIG_NUMA
-   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
+   u16 vm_numa_stat_diff[NR_VM_NUMA_EVENT_ITEMS];
+#endif
+#ifdef CONFIG_NUMA
+   unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
 #endif
 };
 
@@ -609,7 +612,7 @@ struct zone {
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t   vm_stat[NR_VM_ZONE_STAT_ITEMS];
-   atomic_long_t   vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
+   atomic_long_t   vm_numa_events[NR_VM_NUMA_EVENT_ITEMS];
 } cacheline_internodealigned_in_smp;
 
 enum pgdat_flags {
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 1736ea9d24a7..fc14415223c5 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -138,35 +138,27 @@ static i

[PATCH 03/11] mm/memory_hotplug: Make unpopulated zones PCP structures unreachable during hot remove

2021-04-07 Thread Mel Gorman
zone_pcp_reset allegedly protects against a race with drain_pages
using local_irq_save but this is bogus. local_irq_save only operates
on the local CPU. If memory hotplug is running on CPU A and drain_pages
is running on CPU B, disabling IRQs on CPU A does not affect CPU B and
offers no protection.

This patch reorders memory hotremove such that the PCP structures
relevant to the zone are no longer reachable by the time the structures
are freed.  With this reordering, no protection is required to prevent
a use-after-free and the IRQs can be left enabled. zone_pcp_reset is
renamed to zone_pcp_destroy to make it clear that the per-cpu structures
are deleted when the function returns.

Signed-off-by: Mel Gorman 
---
 mm/internal.h   |  2 +-
 mm/memory_hotplug.c | 10 +++---
 mm/page_alloc.c | 22 --
 3 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 09adf152a10b..cc34ce4461b7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -203,7 +203,7 @@ extern void free_unref_page(struct page *page);
 extern void free_unref_page_list(struct list_head *list);
 
 extern void zone_pcp_update(struct zone *zone);
-extern void zone_pcp_reset(struct zone *zone);
+extern void zone_pcp_destroy(struct zone *zone);
 extern void zone_pcp_disable(struct zone *zone);
 extern void zone_pcp_enable(struct zone *zone);
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0cdbbfbc5757..3d059c9f9c2d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1687,12 +1687,16 @@ int __ref offline_pages(unsigned long start_pfn, 
unsigned long nr_pages)
zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
 
-   zone_pcp_enable(zone);
-
/* removal success */
adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
zone->present_pages -= nr_pages;
 
+   /*
+* Restore PCP after managed pages have been updated. Unpopulated
+* zones' PCP structures will remain unusable.
+*/
+   zone_pcp_enable(zone);
+
pgdat_resize_lock(zone->zone_pgdat, &flags);
zone->zone_pgdat->node_present_pages -= nr_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
@@ -1700,8 +1704,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned 
long nr_pages)
init_per_zone_wmark_min();
 
if (!populated_zone(zone)) {
-   zone_pcp_reset(zone);
build_all_zonelists(NULL);
+   zone_pcp_destroy(zone);
} else
zone_pcp_update(zone);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e9e60d1a85d4..a8630003612b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8972,18 +8972,29 @@ void zone_pcp_disable(struct zone *zone)
 
 void zone_pcp_enable(struct zone *zone)
 {
-   __zone_set_pageset_high_and_batch(zone, zone->pageset_high, 
zone->pageset_batch);
+   /*
+* If the zone is populated, restore the high and batch counts.
+* If unpopulated, leave the high and batch count as 0 and 1
+* respectively as done by zone_pcp_disable. The per-cpu
+* structures will later be freed by zone_pcp_destroy.
+*/
+   if (populated_zone(zone))
+   __zone_set_pageset_high_and_batch(zone, zone->pageset_high, 
zone->pageset_batch);
+
mutex_unlock(_batch_high_lock);
 }
 
-void zone_pcp_reset(struct zone *zone)
+/*
+ * Called when a zone has been hot-removed. At this point, the PCP has been
+ * drained, disabled and the zone is removed from the zonelists so the
+ * structures are no longer in use. PCP was disabled/drained by
+ * zone_pcp_disable. This function will drain any remaining vmstat deltas.
+ */
+void zone_pcp_destroy(struct zone *zone)
 {
-   unsigned long flags;
int cpu;
struct per_cpu_zonestat *pzstats;
 
-   /* avoid races with drain_pages()  */
-   local_irq_save(flags);
if (zone->per_cpu_pageset != _pageset) {
for_each_online_cpu(cpu) {
pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
@@ -8994,7 +9005,6 @@ void zone_pcp_reset(struct zone *zone)
zone->per_cpu_pageset = _pageset;
zone->per_cpu_zonestats = _zonestats;
}
-   local_irq_restore(flags);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-- 
2.26.2



[PATCH 02/11] mm/page_alloc: Convert per-cpu list protection to local_lock

2021-04-07 Thread Mel Gorman
There is a lack of clarity of what exactly local_irq_save/local_irq_restore
protects in page_alloc.c . It conflates the protection of per-cpu page
allocation structures with per-cpu vmstat deltas.

This patch protects the PCP structure using local_lock which for most
configurations is identical to IRQ enabling/disabling.  The scope of the
lock is still wider than it should be but this is decreased later.

[l...@intel.com: Make pagesets static]
Signed-off-by: Mel Gorman 
---
 include/linux/mmzone.h |  2 ++
 mm/page_alloc.c| 50 +-
 2 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a4393ac27336..106da8fbc72a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /* Free memory management - zoned buddy allocator.  */
@@ -337,6 +338,7 @@ enum zone_watermarks {
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
+/* Fields and list protected by pagesets local_lock in page_alloc.c */
 struct per_cpu_pages {
int count;  /* number of pages in the list */
int high;   /* high watermark, emptying needed */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a68bacddcae0..e9e60d1a85d4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -112,6 +112,13 @@ typedef int __bitwise fpi_t;
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION   (8)
 
+struct pagesets {
+   local_lock_t lock;
+};
+static DEFINE_PER_CPU(struct pagesets, pagesets) = {
+   .lock = INIT_LOCAL_LOCK(lock),
+};
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -1421,6 +1428,10 @@ static void free_pcppages_bulk(struct zone *zone, int 
count,
} while (--count && --batch_free && !list_empty(list));
}
 
+   /*
+* local_lock_irq held so equivalent to spin_lock_irqsave for
+* both PREEMPT_RT and non-PREEMPT_RT configurations.
+*/
spin_lock(>lock);
isolated_pageblocks = has_isolate_pageblock(zone);
 
@@ -1541,6 +1552,11 @@ static void __free_pages_ok(struct page *page, unsigned 
int order,
return;
 
migratetype = get_pfnblock_migratetype(page, pfn);
+
+   /*
+* TODO FIX: Disable IRQs before acquiring IRQ-safe zone->lock
+* and protect vmstat updates.
+*/
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, pfn, order, migratetype,
@@ -2910,6 +2926,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int 
order,
 {
int i, allocated = 0;
 
+   /*
+* local_lock_irq held so equivalent to spin_lock_irqsave for
+* both PREEMPT_RT and non-PREEMPT_RT configurations.
+*/
spin_lock(>lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
@@ -2962,12 +2982,12 @@ void drain_zone_pages(struct zone *zone, struct 
per_cpu_pages *pcp)
unsigned long flags;
int to_drain, batch;
 
-   local_irq_save(flags);
+   local_lock_irqsave(, flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
free_pcppages_bulk(zone, to_drain, pcp);
-   local_irq_restore(flags);
+   local_unlock_irqrestore(, flags);
 }
 #endif
 
@@ -2983,13 +3003,13 @@ static void drain_pages_zone(unsigned int cpu, struct 
zone *zone)
unsigned long flags;
struct per_cpu_pages *pcp;
 
-   local_irq_save(flags);
+   local_lock_irqsave(, flags);
 
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)
free_pcppages_bulk(zone, pcp->count, pcp);
 
-   local_irq_restore(flags);
+   local_unlock_irqrestore(, flags);
 }
 
 /*
@@ -3252,9 +3272,9 @@ void free_unref_page(struct page *page)
if (!free_unref_page_prepare(page, pfn))
return;
 
-   local_irq_save(flags);
+   local_lock_irqsave(, flags);
free_unref_page_commit(page, pfn);
-   local_irq_restore(flags);
+   local_unlock_irqrestore(, flags);
 }
 
 /*
@@ -3274,7 +3294,7 @@ void free_unref_page_list(struct list_head *list)
set_page_private(page, pfn);
}
 
-   local_irq_save(flags);
+   local_lock_irqsave(, flags);
list_for_each_entry_safe(page, next, list, lru) {
unsigned long pfn = page_private(page);
 
@@ -3287,12 +3307,12 @@ void free_unref_page_list(struct list_head *list)
 * a large list of pages to free.
 */
if (++batch_count == SWA

[PATCH 0/11 v2] Use local_lock for pcp protection and reduce stat overhead

2021-04-07 Thread Mel Gorman
For MM people, the whole series is relevant but patch 3 needs particular
attention for memory hotremove as I had problems testing it because full
zone removal always failed for me. For RT people, the most interesting
patches are 2, 9 and 10 with 2 being the most important.

This series requires patches in Andrew's tree so for convenience, it's also 
available at

git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux.git 
mm-percpu-local_lock-v2r10

The PCP (per-cpu page allocator in page_alloc.c) shares locking
requirements with vmstat and the zone lock which is inconvenient and
causes some issues. For example, the PCP list and vmstat share the same
per-cpu space meaning that it's possible that vmstat updates dirty cache
lines holding per-cpu lists across CPUs unless padding is used.  Second,
PREEMPT_RT does not want IRQs disabled in the page allocator because it
leaves IRQs disabled for unnecessarily long periods.

This series splits the locking requirements and uses locks types more
suitable for PREEMPT_RT, reduces the time when special locking is required
for stats and reduces the time when IRQs need to be disabled on !PREEMPT_RT
kernels.

Why local_lock? PREEMPT_RT considers the following sequence to be unsafe
as documented in Documentation/locking/locktypes.rst

   local_irq_disable();
   raw_spin_lock();

The page allocator does not use raw_spin_lock but using local_irq_save
is undesirable on PREEMPT_RT as it leaves IRQs disabled for an excessive
length of time. By converting to local_lock, which disables migration on
PREEMPT_RT, the locking requirements can be separated and the protections
for the PCP, stats and the zone lock can start moving towards
PREEMPT_RT-safe equivalents. As a bonus, local_lock also means that
PROVE_LOCKING does something useful.
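
As a rough sketch of the conversion pattern from patch 2 (reconstructed
here with the full &pagesets.lock arguments for clarity, so treat it as
illustrative rather than the exact hunks):

   struct pagesets {
           local_lock_t lock;
   };
   static DEFINE_PER_CPU(struct pagesets, pagesets) = {
           .lock = INIT_LOCAL_LOCK(lock),
   };

   /* Previously bracketed by local_irq_save/local_irq_restore */
   static void drain_pages_zone(unsigned int cpu, struct zone *zone)
   {
           unsigned long flags;
           struct per_cpu_pages *pcp;

           local_lock_irqsave(&pagesets.lock, flags);
           pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
           if (pcp->count)
                   free_pcppages_bulk(zone, pcp->count, pcp);
           local_unlock_irqrestore(&pagesets.lock, flags);
   }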

After that, it was very obvious that zone_statistics in particular has
way too much overhead and leaves IRQs disabled for longer than necessary
on !PREEMPT_RT kernels. zone_statistics uses perfectly accurate counters
requiring IRQs be disabled for parallel RMW sequences when inaccurate ones
like vm_events would do. The series makes the NUMA statistics (NUMA_HIT
and friends) inaccurate counters that then require no special protection
on !PREEMPT_RT.
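
For illustration, the inaccurate NUMA counters end up as plain per-cpu
additions with no IRQ disabling or atomics, along the lines of the helpers
added later in the series (sketch only):

   static inline void
   __count_numa_events(struct zone *zone, enum numa_stat_item item, long delta)
   {
           struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;

           /* Racy by design: a lost update only makes the counter inaccurate */
           raw_cpu_add(pzstats->vm_numa_event[item], delta);
   }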

The bulk page allocator can then do stat updates in bulk with IRQs enabled
which should improve the efficiency.  Technically, this could have been
done without the local_lock and vmstat conversion work and the order
simply reflects the timing of when different series were implemented.

Finally, there are places where we conflate IRQs being disabled for the
PCP with the IRQ-safe zone spinlock. The remainder of the series reduces
the scope of what is protected by disabled IRQs on !PREEMPT_RT kernels.
By the end of the series, page_alloc.c does not call local_irq_save so
the locking scope is a bit clearer. The one exception is that modifying
NR_FREE_PAGES still happens in places where it's known the IRQs are
disabled as it's harmless for PREEMPT_RT and would be expensive to split
the locking there.

No performance data is included because despite the overhead of the stats,
it's within the noise for most workloads on !PREEMPT_RT. However, Jesper
Dangaard Brouer ran a page allocation microbenchmark on a E5-1650 v4 @
3.60GHz CPU on the first version of this series. Focusing on the array
variant of the bulk page allocator reveals the following.

(CPU: Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz)
ARRAY variant: time_bulk_page_alloc_free_array: step=bulk size

         Baseline        Patched
 1   56.383  54.225 (+3.83%)
 2   40.047  35.492 (+11.38%)
 3   37.339  32.643 (+12.58%)
 4   35.578  30.992 (+12.89%)
 8   33.592  29.606 (+11.87%)
 16  32.362  28.532 (+11.85%)
 32  31.476  27.728 (+11.91%)
 64  30.633  27.252 (+11.04%)
 128 30.596  27.090 (+11.46%)

While this is a positive outcome, the series is more likely to be
interesting to the RT people in terms of getting parts of the PREEMPT_RT
tree into mainline.

 drivers/base/node.c|  18 +--
 include/linux/mmzone.h |  29 ++--
 include/linux/vmstat.h |  65 +
 mm/internal.h  |   2 +-
 mm/memory_hotplug.c|  10 +-
 mm/mempolicy.c |   2 +-
 mm/page_alloc.c| 297 -
 mm/vmstat.c| 250 --
 8 files changed, 339 insertions(+), 334 deletions(-)

-- 
2.26.2



[PATCH 01/11] mm/page_alloc: Split per cpu page lists and zone stats

2021-04-07 Thread Mel Gorman
The per-cpu page allocator lists and the per-cpu vmstat deltas are stored
in the same struct per_cpu_pages even though vmstats have no direct impact
on the per-cpu page lists. This is inconsistent because the vmstats for a
node are stored on a dedicated structure. The bigger issue is that the
per_cpu_pages structure is not cache-aligned and stat updates either
cache conflict with adjacent per-cpu lists incurring a runtime cost or
padding is required incurring a memory cost.

This patch splits the per-cpu pagelists and the vmstat deltas into separate
structures. It's mostly a mechanical conversion but some variable renaming
is done to clearly distinguish the per-cpu pages structure (pcp) from
the vmstats (pzstats).

Superficially, this appears to increase the size of the per_cpu_pages
structure but the movement of expire fills a structure hole so there is
no impact overall.

[l...@intel.com: Check struct per_cpu_zonestat has a non-zero size]
Signed-off-by: Mel Gorman 
---
 include/linux/mmzone.h | 18 
 include/linux/vmstat.h |  8 ++--
 mm/page_alloc.c| 84 +++-
 mm/vmstat.c| 96 ++
 4 files changed, 110 insertions(+), 96 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 47946cec7584..a4393ac27336 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -341,20 +341,21 @@ struct per_cpu_pages {
int count;  /* number of pages in the list */
int high;   /* high watermark, emptying needed */
int batch;  /* chunk size for buddy add/remove */
+#ifdef CONFIG_NUMA
+   int expire; /* When 0, remote pagesets are drained */
+#endif
 
/* Lists of pages, one per migrate type stored on the pcp-lists */
struct list_head lists[MIGRATE_PCPTYPES];
 };
 
-struct per_cpu_pageset {
-   struct per_cpu_pages pcp;
-#ifdef CONFIG_NUMA
-   s8 expire;
-   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
-#endif
+struct per_cpu_zonestat {
 #ifdef CONFIG_SMP
-   s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
+   s8 stat_threshold;
+#endif
+#ifdef CONFIG_NUMA
+   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
 #endif
 };
 
@@ -470,7 +471,8 @@ struct zone {
int node;
 #endif
struct pglist_data  *zone_pgdat;
-   struct per_cpu_pageset __percpu *pageset;
+   struct per_cpu_pages__percpu *per_cpu_pageset;
+   struct per_cpu_zonestat __percpu *per_cpu_zonestats;
/*
 * the high and batch values are copied to individual pagesets for
 * faster access
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 506d625163a1..1736ea9d24a7 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -163,7 +163,7 @@ static inline unsigned long zone_numa_state_snapshot(struct 
zone *zone,
int cpu;
 
for_each_online_cpu(cpu)
-   x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item];
+   x += per_cpu_ptr(zone->per_cpu_zonestats, 
cpu)->vm_numa_stat_diff[item];
 
return x;
 }
@@ -236,7 +236,7 @@ static inline unsigned long zone_page_state_snapshot(struct 
zone *zone,
 #ifdef CONFIG_SMP
int cpu;
for_each_online_cpu(cpu)
-   x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
+   x += per_cpu_ptr(zone->per_cpu_zonestats, 
cpu)->vm_stat_diff[item];
 
if (x < 0)
x = 0;
@@ -291,7 +291,7 @@ struct ctl_table;
 int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp,
loff_t *ppos);
 
-void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
+void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *);
 
 int calculate_pressure_threshold(struct zone *zone);
 int calculate_normal_threshold(struct zone *zone);
@@ -399,7 +399,7 @@ static inline void cpu_vm_stats_fold(int cpu) { }
 static inline void quiet_vmstat(void) { }
 
 static inline void drain_zonestat(struct zone *zone,
-   struct per_cpu_pageset *pset) { }
+   struct per_cpu_zonestat *pzstats) { }
 #endif /* CONFIG_SMP */
 
 static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5e8aedb64b57..a68bacddcae0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2981,15 +2981,14 @@ void drain_zone_pages(struct zone *zone, struct 
per_cpu_pages *pcp)
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
unsigned long flags;
-   struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
 
local_irq_save(flags);
-   pset = per_cpu_ptr(zone->pageset, cpu);
 
-   pcp = >pcp;
+   pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)
f

Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

2021-04-07 Thread Mel Gorman
On Wed, Apr 07, 2021 at 12:15:13PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 07, 2021 at 10:41:06AM +0100, Mel Gorman wrote:
> 
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -6112,6 +6112,27 @@ static int select_idle_core(struct task_
> > >   return -1;
> > >  }
> > >  
> > > +/*
> > > + * Scan the local SMT mask for idle CPUs.
> > > + */
> > > +static int select_idle_smt(struct task_struct *p, struct sched_domain 
> > > *sd, int target)
> > > +{
> > > + int cpu;
> > > +
> > > + if (!static_branch_likely(_smt_present))
> > > + return -1;
> > > +
> > > + for_each_cpu(cpu, cpu_smt_mask(target)) {
> > > + if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
> > > + !cpumask_test_cpu(cpu, sched_domain_span(sd)))
> > > + continue;
> > 
> > While I know that !cpumask_test_cpu(cpu, sched_domain_span(sd)) was
> > done previously, I found it hard to believe that the test matters. If
> > target/prev share the LLC domain, why would the SMT siblings *not*
> > share a LLC?
> 
> I think the reason for it is that a cpuset might have split the siblings
> apart and disabled load-balancing across them or something.
> 
> Then the affinity mask can still cross the partition, but we shouldn't
> ever move into it through balancing.

Ok, cpusets do split domains. I can't imagine the logic of splitting SMT
siblings across cpusets but if it's possible, it has to be checked and
protecting that with cpusets_enabled() would be a little overkill and
possibly miss some other corner case :(

Thanks.

-- 
Mel Gorman
SUSE Labs


Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

2021-04-07 Thread Mel Gorman
On Wed, Apr 07, 2021 at 09:17:18AM +0200, Peter Zijlstra wrote:
> Subject: sched/fair: Bring back select_idle_smt(), but differently
> From: Rik van Riel 
> Date: Fri, 26 Mar 2021 15:19:32 -0400
> 
> From: Rik van Riel 
> 
> Mel Gorman did some nice work in 9fe1f127b913 ("sched/fair: Merge
> select_idle_core/cpu()"), resulting in the kernel being more efficient
> at finding an idle CPU, and in tasks spending less time waiting to be
> run, both according to the schedstats run_delay numbers, and according
> to measured application latencies. Yay.
> 
> The flip side of this is that we see more task migrations (about 30%
> more), higher cache misses, higher memory bandwidth utilization, and
> higher CPU use, for the same number of requests/second.
> 
> This is most pronounced on a memcache type workload, which saw a
> consistent 1-3% increase in total CPU use on the system, due to those
> increased task migrations leading to higher L2 cache miss numbers, and
> higher memory utilization. The exclusive L3 cache on Skylake does us
> no favors there.
> 
> On our web serving workload, that effect is usually negligible.
> 
> It appears that the increased number of CPU migrations is generally a
> good thing, since it leads to lower cpu_delay numbers, reflecting the
> fact that tasks get to run faster. However, the reduced locality and
> the corresponding increase in L2 cache misses hurts a little.
> 
> The patch below appears to fix the regression, while keeping the
> benefit of the lower cpu_delay numbers, by reintroducing
> select_idle_smt with a twist: when a socket has no idle cores, check
> to see if the sibling of "prev" is idle, before searching all the
> other CPUs.
> 
> This fixes both the occasional 9% regression on the web serving
> workload, and the continuous 2% CPU use regression on the memcache
> type workload.
> 
> With Mel's patches and this patch together, task migrations are still
> high, but L2 cache misses, memory bandwidth, and CPU time used are
> back down to what they were before. The p95 and p99 response times for
> the memcache type application improve by about 10% over what they were
> before Mel's patches got merged.
> 
> Signed-off-by: Rik van Riel 
> Signed-off-by: Peter Zijlstra (Intel) 
> Link: https://lkml.kernel.org/r/20210326151932.2c187...@imladris.surriel.com

I think this is still ok and should not invalidate the previous tests on
v3. While test_idle_cores() was checked on target, as long as target/prev
share cache, the test should be equivalent other than there is a minor
race so

Reviewed-by: Mel Gorman 

One minor question below though which previously confused me.

> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6112,6 +6112,27 @@ static int select_idle_core(struct task_
>   return -1;
>  }
>  
> +/*
> + * Scan the local SMT mask for idle CPUs.
> + */
> +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, 
> int target)
> +{
> + int cpu;
> +
> + if (!static_branch_likely(_smt_present))
> + return -1;
> +
> + for_each_cpu(cpu, cpu_smt_mask(target)) {
> + if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
> + !cpumask_test_cpu(cpu, sched_domain_span(sd)))
> + continue;

While I know that !cpumask_test_cpu(cpu, sched_domain_span(sd)) was
done previously, I found it hard to believe that the test matters. If
target/prev share the LLC domain, why would the SMT siblings *not*
share a LLC?

-- 
Mel Gorman
SUSE Labs


Re: [RFC PATCH 0/6] mm: thp: use generic THP migration for NUMA hinting fault

2021-04-07 Thread Mel Gorman
On Tue, Apr 06, 2021 at 09:42:07AM -0700, Yang Shi wrote:
> On Tue, Apr 6, 2021 at 5:03 AM Gerald Schaefer
>  wrote:
> >
> > On Thu, 1 Apr 2021 13:10:49 -0700
> > Yang Shi  wrote:
> >
> > [...]
> > > > >
> > > > > Yes, it could be. The old behavior of migration was to return -ENOMEM
> > > > > if THP migration is not supported then split THP. That behavior was
> > > > > not very friendly to some usecases, for example, memory policy and
> > > > > migration in lieu of reclaim (the upcoming). But I don't mean we restore
> > > > > the old behavior. We could split THP if it returns -ENOSYS and the
> > > > > page is THP.
> > > >
> > > > OK, as long as we don't get any broken PMD migration entries established
> > > > for s390, some extra THP splitting would be acceptable I guess.
> > >
> > > There will be no migration PMD installed. The current behavior is a
> > > no-op if THP migration is not supported.
> >
> > Ok, just for completeness, since Mel also replied that the split
> > was not done on other architectures "because the loss from splitting
> > exceeded the gain of improved locality":
> >
> > I did not mean to request extra splitting functionality for s390,
> > simply skipping / ignoring large PMDs would also be fine for s390,
> > no need to add extra complexity.
> 
> Thank you. It could make life easier. The current code still converts
> huge PMD to PROTNONE even though THP migration is not supported. It is
> easy to skip such PMDs hence cycles are saved for pointless NUMA
> hinting page faults.
> 
> Will do so in v2 if no objection from Mel as well.

I did not get a chance to review this in time but if a v2 shows up,
I'll at least run it through a battery of tests to measure the impact
and hopefully find the time to do a proper review. Superficially I'm not
opposed to using generic code for migration because even if it shows up a
problem, it would be better to optimise the generic implementation than
carry two similar implementations. I'm undecided on whether s390 should
split+migrate rather than skip because I do not have a good overview of
"typical workloads on s390 that benefit from NUMA balancing".

-- 
Mel Gorman
SUSE Labs


Re: [PATCH -V2] NUMA balancing: reduce TLB flush via delaying mapping on hint page fault

2021-04-07 Thread Mel Gorman
On Fri, Apr 02, 2021 at 04:27:17PM +0800, Huang Ying wrote:
> With NUMA balancing, in hint page fault handler, the faulting page
> will be migrated to the accessing node if necessary.  During the
> migration, TLB will be shot down on all CPUs that the process has run
> on recently.  Because in the hint page fault handler, the PTE will be
> made accessible before the migration is tried.  The overhead of TLB
> shooting down can be high, so it's better to be avoided if possible.
> In fact, if we delay mapping the page until migration, that can be
> avoided.  This is what this patch doing.
> 
> 
>

Thanks, I think this is ok for Andrew to pick up to see if anything
bisects to this commit but it's a low risk.

Reviewed-by: Mel Gorman 

More notes;

This is not a universal win given that not all workloads exhibit the
pattern where accesses occur in parallel threads between when a page
is marked accessible and when it is migrated. The impact of the patch
appears to be neutral for those workloads. For workloads that do exhibit
the pattern, there is a small gain with a reduction in interrupts as
advertised unlike v1 of the patch. Further tests are running to confirm
the reduction is in TLB shootdown interrupts but I'm reasonably confident
that will be the case. Gains are typically small and the load described in
the changelog appears to be a best case scenario but a 1-5% gain in some
other workloads is still an improvement. There is still the possibility
that some workloads will unnecessarily stall as a result of the patch
for slightly longer periods of time but that is a relatively low risk
and will be difficult to detect. If I'm wrong, a bisection will find it.

Andrew?

-- 
Mel Gorman
SUSE Labs


Re: [RFC] NUMA balancing: reduce TLB flush via delaying mapping on hint page fault

2021-04-01 Thread Mel Gorman
On Wed, Mar 31, 2021 at 09:36:04AM -0700, Nadav Amit wrote:
> 
> 
> > On Mar 31, 2021, at 6:16 AM, Mel Gorman  wrote:
> > 
> > On Wed, Mar 31, 2021 at 07:20:09PM +0800, Huang, Ying wrote:
> >> Mel Gorman  writes:
> >> 
> >>> On Mon, Mar 29, 2021 at 02:26:51PM +0800, Huang Ying wrote:
> >>>> For NUMA balancing, in hint page fault handler, the faulting page will
> >>>> be migrated to the accessing node if necessary.  During the migration,
> >>>> TLB will be shot down on all CPUs that the process has run on
> >>>> recently.  Because in the hint page fault handler, the PTE will be
> >>>> made accessible before the migration is tried.  The overhead of TLB
> >>>> shooting down is high, so it's better to be avoided if possible.  In
> >>>> fact, if we delay mapping the page in PTE until migration, that can be
> >>>> avoided.  This is what this patch doing.
> >>>> 
> >>> 
> >>> Why would the overhead be high? It was previously inaccessible so it's
> >>> only parallel accesses making forward progress that trigger the need
> >>> for a flush.
> >> 
> >> Sorry, I don't understand this.  Although the page is inaccessible, the
> >> threads may access other pages, so TLB flushing is still necessary.
> >> 
> > 
> > You assert the overhead of TLB shootdown is high and yes, it can be
> > very high but you also said "the benchmark score has no visible changes"
> > indicating the TLB shootdown cost is not a major problem for the workload.
> > It does not mean we should ignore it though.
> 
> If you are looking for a benchmark that is negatively affected by NUMA
> balancing, then IIRC Parsec's dedup is such a workload. [1]
> 

Few questions;

Is Parsec impaired due to NUMA balancing in general or due to TLB
shootdowns specifically?

Are you using "gcc-pthreads" for parallelisation and the "native" size
for Parsec?

Is there any specific thread count that matters either in
absolute terms or as a percentage of online CPUs?

-- 
Mel Gorman
SUSE Labs




Re: [PATCH 2/6] mm/page_alloc: Convert per-cpu list protection to local_lock

2021-03-31 Thread Mel Gorman
On Wed, Mar 31, 2021 at 07:42:42PM +0200, Thomas Gleixner wrote:
> On Wed, Mar 31 2021 at 12:01, Mel Gorman wrote:
> > On Wed, Mar 31, 2021 at 11:55:56AM +0200, Thomas Gleixner wrote:
> > @@ -887,13 +887,11 @@ void cpu_vm_stats_fold(int cpu)
> >  
> > pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
> >  
> > -   preempt_disable();
> > for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
> > if (pzstats->vm_stat_diff[i]) {
> > int v;
> >  
> > -   v = pzstats->vm_stat_diff[i];
> > -   pzstats->vm_stat_diff[i] = 0;
> > +   v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
> 
> Confused. pzstats is not a percpu pointer. zone->per_cpu_zonestats is.
> 
> But @cpu is not necessarily the current CPU.
> 

I was drinking drain cleaner instead of coffee. The code was also broken
to begin with.

drain_pages() is draining pagesets of a local or dead CPU. For a local
CPU, disabling IRQs prevents an IRQ arriving during the drain, trying to
allocate a page and potentially corrupting the local pageset -- ok.

zone_pcp_reset is accessing a remote CPUs pageset, freeing the percpu
pointer and resetting it to boot_pageset. zone_pcp_reset calling
local_irq_save() does not offer any special protection against
drain_pages because there are two separate IRQs involved.

This particular patch may have no reason to touch zone_pcp_reset,
cpu_vm_stats_fold or drain_zonestat at all but I need to think about it
more tomorrow.

-- 
Mel Gorman
SUSE Labs


Re: [RFC PATCH 0/6] mm: thp: use generic THP migration for NUMA hinting fault

2021-03-31 Thread Mel Gorman
On Tue, Mar 30, 2021 at 04:42:00PM +0200, Gerald Schaefer wrote:
> Could there be a work-around by splitting THP pages instead of marking them
> as migrate pmds (via pte swap entries), at least when THP migration is not
> supported? I guess it could also be acceptable if THP pages were simply not
> migrated for NUMA balancing on s390, but then we might need some extra config
> option to make that behavior explicit.
> 

The split is not done on other architectures simply because the loss
from splitting exceeded the gain of improved locality in too many cases.
However, it might be ok as an s390-specific workaround.

(Note, I haven't read the rest of the series due to lack of time but this
query caught my eye).

-- 
Mel Gorman
SUSE Labs


Re: [RFC] NUMA balancing: reduce TLB flush via delaying mapping on hint page fault

2021-03-31 Thread Mel Gorman
On Wed, Mar 31, 2021 at 07:20:09PM +0800, Huang, Ying wrote:
> Mel Gorman  writes:
> 
> > On Mon, Mar 29, 2021 at 02:26:51PM +0800, Huang Ying wrote:
> >> For NUMA balancing, in hint page fault handler, the faulting page will
> >> be migrated to the accessing node if necessary.  During the migration,
> >> TLB will be shot down on all CPUs that the process has run on
> >> recently.  Because in the hint page fault handler, the PTE will be
> >> made accessible before the migration is tried.  The overhead of TLB
> >> shooting down is high, so it's better to be avoided if possible.  In
> >> fact, if we delay mapping the page in PTE until migration, that can be
> >> avoided.  This is what this patch doing.
> >> 
> >
> > Why would the overhead be high? It was previously inaccessible so it's
> > only parallel accesses making forward progress that trigger the need
> > for a flush.
> 
> Sorry, I don't understand this.  Although the page is inaccessible, the
> threads may access other pages, so TLB flushing is still necessary.
> 

You assert the overhead of TLB shootdown is high and yes, it can be
very high but you also said "the benchmark score has no visible changes"
indicating the TLB shootdown cost is not a major problem for the workload.
It does not mean we should ignore it though.

> > 
> >
> > If migration is attempted, then the time until the migration PTE is
> > created is variable. The page has to be isolated from the LRU so there
> > could be contention on the LRU lock, a new page has to be allocated and
> > that allocation potentially has to enter the page allocator slow path
> > etc. During that time, parallel threads make forward progress but with
> > the patch, multiple threads potentially attempt the allocation and fail
> > instead of doing real work.
> 
> If my understanding of the code were correct, only the first thread will
> attempt the isolation and allocation.  Because TestClearPageLRU() is
> called in
> 
>   migrate_misplaced_page()
> numamigrate_isolate_page()
>   isolate_lru_page()
> 
> And migrate_misplaced_page() will return 0 immediately if
> TestClearPageLRU() returns false.  Then the second thread will make the
> page accessible and make forward progress.
> 

Ok, that's true. While additional work is done, the cost is reasonably
low -- lower than I initially imagined and with fewer side-effects.

> But there's still some timing difference between the original and
> patched kernel.  We have several choices to reduce the difference.
> 
> 1. Check PageLRU() with PTL held in do_numa_page()
> 
> If PageLRU() return false, do_numa_page() can make the page accessible
> firstly.  So the second thread will make the page accessible earlier.
> 
> 2. Try to lock the page with PTL held in do_numa_page()
> 
> If the try-locking succeeds, it's the first thread, so it can delay
> mapping.  If try-locking fails, it may be the second thread, so it will
> make the page accessible firstly.  We need to teach
> migrate_misplaced_page() to work with the page locked.  This will
> enlarge the duration that the page is locked.  Is it a problem?
> 
> 3. Check page_count() with PTL held in do_numa_page()
> 
> The first thread will call get_page() in numa_migrate_prep().  So if the
> second thread can detect that, it can make the page accessible firstly.
> The difficulty is that it appears hard to identify the expected
> page_count() for the file pages.  For anonymous pages, that is much
> easier, so at least if a page passes the following test, we can delay
> mapping,
> 
> PageAnon(page) && page_count(page) == page_mapcount(page) + 
> !!PageSwapCache(page)
> 
> This will disable the optimization for the file pages.  But it may be
> good enough?
> 
> Which one do you think is better?  Maybe the first one is good enough?
> 

The first one is probably the most straight-forward but it's more
important to figure out why interrupts were higher with at least one
workload when the exact opposite is expected. Investigating which of
options 1-3 are best and whether it's worth the duplicated check could
be done as a separate patch.

> > You should consider the following question -- is the potential saving
> > of an IPI transmission enough to offset the cost of parallel accesses
> > not making forward progress while one migration is setup and having
> > different migration attempts collide?
> >
> > I have tests running just in case but I think the answer may be "no".
> > So far only one useful test as completed (specjbb2005 with one VM per NUMA
> > node) and it showed a mix of small gains and losses but with *higher*

Re: [PATCH 2/6] mm/page_alloc: Convert per-cpu list protection to local_lock

2021-03-31 Thread Mel Gorman
On Wed, Mar 31, 2021 at 11:55:56AM +0200, Thomas Gleixner wrote:
> On Mon, Mar 29 2021 at 13:06, Mel Gorman wrote:
> > There is a lack of clarity of what exactly local_irq_save/local_irq_restore
> > protects in page_alloc.c . It conflates the protection of per-cpu page
> > allocation structures with per-cpu vmstat deltas.
> >
> > This patch protects the PCP structure using local_lock which
> > for most configurations is identical to IRQ enabling/disabling.
> > The scope of the lock is still wider than it should be but this is
> > decreased in later patches. The per-cpu vmstat deltas are protected by
> > preempt_disable/preempt_enable where necessary instead of relying on
> > IRQ disable/enable.
> 
> Yes, this goes into the right direction and I really appreciate the
> scoped protection for clarity sake.
> 

Thanks.

> >  #ifdef CONFIG_MEMORY_HOTREMOVE
> > diff --git a/mm/vmstat.c b/mm/vmstat.c
> > index 8a8f1a26b231..01b74ff73549 100644
> > --- a/mm/vmstat.c
> > +++ b/mm/vmstat.c
> > @@ -887,6 +887,7 @@ void cpu_vm_stats_fold(int cpu)
> >  
> > pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
> >  
> > +   preempt_disable();
> 
> What's the reason for the preempt_disable() here? A comment would be
> appreciated.
> 

Very good question because it's protecting vm_stat_diff and
vm_numa_stat_diff in different contexts and not quite correctly at this
point of the series. By the end of the series vm_numa_stat_diff is a
simple counter and does not need special protection.

Right now, it's protecting against a read and clear of vm_stat_diff
in two contexts -- cpu_vm_stats_fold and drain_zonestat but it's only
defensive. cpu_vm_stats_fold is only called when a CPU is going dead and
drain_zonestat is called from memory hotplug context. The protection is
necessary only if a new drain_zonestat caller was added without taking
the RMW of vm_stat_diff into account which may never happen.

This whole problem with preemption could be avoided altogether if
this_cpu_xchg was used similar to what is done elsewhere in vmstat
so this?

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 64429ca4957f..9528304ce24d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8969,8 +8969,9 @@ void zone_pcp_reset(struct zone *zone)
struct per_cpu_zonestat *pzstats;
 
/*
-* No race with drain_pages. drain_zonestat disables preemption
-* and drain_pages relies on the pcp local_lock.
+* No race with drain_pages. drain_zonestat is only concerned with
+* vm_*_stat_diff which is updated with this_cpu_xchg and drain_pages
+* only cares about the PCP lists protected by local_lock.
 */
if (zone->per_cpu_pageset != _pageset) {
for_each_online_cpu(cpu) {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 01b74ff73549..34ff61a145d2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -887,13 +887,11 @@ void cpu_vm_stats_fold(int cpu)
 
pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
 
-   preempt_disable();
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
if (pzstats->vm_stat_diff[i]) {
int v;
 
-   v = pzstats->vm_stat_diff[i];
-   pzstats->vm_stat_diff[i] = 0;
+   v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
atomic_long_add(v, >vm_stat[i]);
global_zone_diff[i] += v;
}
@@ -903,13 +901,11 @@ void cpu_vm_stats_fold(int cpu)
if (pzstats->vm_numa_stat_diff[i]) {
int v;
 
-   v = pzstats->vm_numa_stat_diff[i];
-   pzstats->vm_numa_stat_diff[i] = 0;
+   v = 
this_cpu_xchg(pzstats->vm_numa_stat_diff[i], 0);
atomic_long_add(v, >vm_numa_stat[i]);
global_numa_diff[i] += v;
}
 #endif
-   preempt_enable();
}
 
for_each_online_pgdat(pgdat) {
@@ -943,10 +939,9 @@ void drain_zonestat(struct zone *zone, struct 
per_cpu_zonestat *pzstats)
 {
int i;
 
-   preempt_disable();
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
if (pzstats->vm_stat_diff[i]) {
-   int v = pzstats->vm_stat_diff[i];
+   int v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
pzstats->vm_stat_diff[i] = 0;
atomic_long_add(v, >vm_stat[i]);
atomic_long_add(v, _zone_stat[i]);
@@ -955,14 +950,12 @@ void drain_zonestat(struct zone

Re: [RFC PATCH 0/6] Use local_lock for pcp protection and reduce stat overhead

2021-03-31 Thread Mel Gorman
Ingo, Thomas or Peter, is there any chance one of you could take a look
at patch "[PATCH 2/6] mm/page_alloc: Convert per-cpu list protection to
local_lock" from this series? It's partially motivated by PREEMPT_RT. More
details below.

On Mon, Mar 29, 2021 at 01:06:42PM +0100, Mel Gorman wrote:
> This series requires patches in Andrew's tree so the series is also
> available at
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux.git 
> mm-percpu-local_lock-v1r15
> 
> The PCP (per-cpu page allocator in page_alloc.c) shares locking requirements
> with vmstat which is inconvenient and causes some issues. Possibly because
> of that, the PCP list and vmstat share the same per-cpu space meaning that
> it's possible that vmstat updates dirty cache lines holding per-cpu lists
> across CPUs unless padding is used. The series splits that structure and
> separates the locking.
> 

The bulk page allocation series that the local_lock work had an
additional fix so I've rebased this onto

git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux.git 
mm-percpu-local_lock-v1r16

> Second, PREEMPT_RT considers the following sequence to be unsafe
> as documented in Documentation/locking/locktypes.rst
> 
>local_irq_disable();
>spin_lock();
> 
> The pcp allocator has this sequence for rmqueue_pcplist (local_irq_save)
> -> __rmqueue_pcplist -> rmqueue_bulk (spin_lock). This series explicitly
> separates the locking requirements for the PCP list (local_lock) and stat
> updates (irqs disabled). Once that is done, the length of time IRQs are
> disabled can be reduced and in some cases, IRQ disabling can be replaced
> with preempt_disable.
> 

It's this part I'm interested in even though it only partially addresses
the preempt-rt tree concerns. More legwork is needed for preempt-rt which
is outside the context of this series. At minimum, it involves

1. Split locking of pcp and buddy allocator instead of using spin_lock()
   when it's "known" that IRQs are disabled (not necessarily a valid
   assumption on PREEMPT_RT)
2. Split the zone lock into what protects the zone metadata and what
   protects the free lists

This looks straight-forward but it involves audit work and it may be
difficult to avoid regressing non-PREEMPT_RT kernels by disabling/enabling
IRQs when switching between the pcp allocator and the buddy allocator.

> After that, it was very obvious that zone_statistics in particular has way
> too much overhead and leaves IRQs disabled for longer than necessary. It
> has perfectly accurate counters requiring IRQs be disabled for parallel
> RMW sequences when inaccurate ones like vm_events would do. The series
> makes the NUMA statistics (NUMA_HIT and friends) inaccurate counters that
> only require preempt be disabled.
> 
> Finally the bulk page allocator can then do all the stat updates in bulk
> with IRQs enabled which should improve the efficiency of the bulk page
> allocator. Technically, this could have been done without the local_lock
> and vmstat conversion work and the order simply reflects the timing of
> when different series were implemented.
> 
> No performance data is included because despite the overhead of the
> stats, it's within the noise for most workloads but Jesper and Chuck may
> observe a significant difference with the same tests used for the bulk
> page allocator. The series is more likely to be interesting to the RT
> folk in terms of slowly getting the PREEMPT tree into mainline.
> 
>  drivers/base/node.c|  18 +--
>  include/linux/mmzone.h |  29 +++--
>  include/linux/vmstat.h |  65 ++-
>  mm/mempolicy.c |   2 +-
>  mm/page_alloc.c| 173 ++++++++----
>  mm/vmstat.c| 254 +++--
>  6 files changed, 254 insertions(+), 287 deletions(-)
> 
> -- 
> 2.26.2
> 

-- 
Mel Gorman
SUSE Labs


Re: [RFC PATCH 0/6] Use local_lock for pcp protection and reduce stat overhead

2021-03-31 Thread Mel Gorman
On Tue, Mar 30, 2021 at 08:51:54PM +0200, Jesper Dangaard Brouer wrote:
> On Mon, 29 Mar 2021 13:06:42 +0100
> Mel Gorman  wrote:
> 
> > This series requires patches in Andrew's tree so the series is also
> > available at
> > 
> > git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux.git 
> > mm-percpu-local_lock-v1r15
> > 
> > tldr: Jesper and Chuck, it would be nice to verify if this series helps
> > the allocation rate of the bulk page allocator. RT people, this
> > *partially* addresses some problems PREEMPT_RT has with the page
> > allocator but it needs review.
> 
> I've run a new micro-benchmark[1] which shows:
> (CPU: Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz)
> 
> 
> BASELINE
>  single_page alloc+put: 194 cycles(tsc) 54.106 ns
> 
> ARRAY variant: time_bulk_page_alloc_free_array: step=bulk size
> 
>  Per elem: 195 cycles(tsc) 54.225 ns (step:1)
>  Per elem: 127 cycles(tsc) 35.492 ns (step:2)
>  Per elem: 117 cycles(tsc) 32.643 ns (step:3)
>  Per elem: 111 cycles(tsc) 30.992 ns (step:4)
>  Per elem: 106 cycles(tsc) 29.606 ns (step:8)
>  Per elem: 102 cycles(tsc) 28.532 ns (step:16)
>  Per elem: 99 cycles(tsc) 27.728 ns (step:32)
>  Per elem: 98 cycles(tsc) 27.252 ns (step:64)
>  Per elem: 97 cycles(tsc) 27.090 ns (step:128)
> 
> This should be seen in comparison with the older micro-benchmark[2]
> done on branch mm-bulk-rebase-v5r9.
> 
> BASELINE
>  single_page alloc+put: Per elem: 199 cycles(tsc) 55.472 ns
> 
> ARRAY variant: time_bulk_page_alloc_free_array: step=bulk size
> 
>  Per elem: 202 cycles(tsc) 56.383 ns (step:1)
>  Per elem: 144 cycles(tsc) 40.047 ns (step:2)
>  Per elem: 134 cycles(tsc) 37.339 ns (step:3)
>  Per elem: 128 cycles(tsc) 35.578 ns (step:4)
>  Per elem: 120 cycles(tsc) 33.592 ns (step:8)
>  Per elem: 116 cycles(tsc) 32.362 ns (step:16)
>  Per elem: 113 cycles(tsc) 31.476 ns (step:32)
>  Per elem: 110 cycles(tsc) 30.633 ns (step:64)
>  Per elem: 110 cycles(tsc) 30.596 ns (step:128)
> 

Ok, so bulk allocation is faster than allocating single pages, no surprise
there. Putting the array figures for bulk allocation into tabular format
and comparing we get;

Array variant (time to allocate a page in nanoseconds, lower is better)
    Baseline        Patched
1   56.383  54.225 (+3.83%)
2   40.047  35.492 (+11.38%)
3   37.339  32.643 (+12.58%)
4   35.578  30.992 (+12.89%)
8   33.592  29.606 (+11.87%)
16  32.362  28.532 (+11.85%)
32  31.476  27.728 (+11.91%)
64  30.633  27.252 (+11.04%)
128 30.596  27.090 (+11.46%)

The series is 11-12% faster when allocating multiple pages.  That's a
fairly positive outcome and I'll include this in the series leader if
you have no objections.

Thanks Jesper!

-- 
Mel Gorman
SUSE Labs


Re: [RFC] NUMA balancing: reduce TLB flush via delaying mapping on hint page fault

2021-03-30 Thread Mel Gorman
On Mon, Mar 29, 2021 at 02:26:51PM +0800, Huang Ying wrote:
> For NUMA balancing, in hint page fault handler, the faulting page will
> be migrated to the accessing node if necessary.  During the migration,
> TLB will be shot down on all CPUs that the process has run on
> recently.  Because in the hint page fault handler, the PTE will be
> made accessible before the migration is tried.  The overhead of TLB
> shooting down is high, so it's better to be avoided if possible.  In
> fact, if we delay mapping the page in PTE until migration, that can be
> avoided.  This is what this patch doing.
> 

Why would the overhead be high? It was previously inaccessible so it's
only parallel accesses making forward progress that trigger the need
for a flush. As your change notes -- "The benchmark score has no visible
changes". The patch was neither a win nor a loss for your target workload
but there are more fundamental issues to consider.

> We have tested the patch with the pmbench memory accessing benchmark
> on a 2-socket Intel server, and found that the number of the TLB
> shooting down IPI reduces up to 99% (from ~6.0e6 to ~2.3e4) if NUMA
> balancing is triggered (~8.8e6 pages migrated).  The benchmark score
> has no visible changes.
> 
> Known issues:
> 
> For the multiple threads applications, it's possible that the page is
> accessed by 2 threads almost at the same time.  In the original
> implementation, the second thread may go accessing the page directly
> because the first thread has installed the accessible PTE.  While with
> this patch, there will be a window that the second thread will find
> the PTE is still inaccessible.  But the difference between the
> accessible window is small.  Because the page will be made
> inaccessible soon for migrating.
> 

If multiple threads trap the hinting fault, only one potentially attempts
a migration as the others observe the PTE has changed when the PTL is
acquired and return to userspace. Such threads then have a short window to
make progress before the PTE *potentially* becomes a migration PTE and
during that window, the parallel access may not need the page any more
and never stall on the migration.

That migration PTE may never be created if migrate_misplaced_page
chooses to ignore the PTE in which case there is minimal disruption.

If migration is attempted, then the time until the migration PTE is
created is variable. The page has to be isolated from the LRU so there
could be contention on the LRU lock, a new page has to be allocated and
that allocation potentially has to enter the page allocator slow path
etc. During that time, parallel threads make forward progress but with
the patch, multiple threads potentially attempt the allocation and fail
instead of doing real work.

You should consider the following question -- is the potential saving
of an IPI transmission enough to offset the cost of parallel accesses
not making forward progress while one migration is setup and having
different migration attempts collide?

I have tests running just in case but I think the answer may be "no".
So far only one useful test has completed (specjbb2005 with one VM per NUMA
node) and it showed a mix of small gains and losses but with *higher*
interrupts contrary to what was expected from the changelog. For some
thread counts, the results showed large differences in variability,
sometimes lower and sometimes much higher.

It makes me think that a workload should be identified where the IPI
savings are enough to justify stalling parallel accesses that could be
making forward progress.

One nit below

> Signed-off-by: "Huang, Ying" 
> Cc: Peter Zijlstra 
> Cc: Mel Gorman 
> Cc: Peter Xu 
> Cc: Johannes Weiner 
> Cc: Vlastimil Babka 
> Cc: "Matthew Wilcox" 
> Cc: Will Deacon 
> Cc: Michel Lespinasse 
> Cc: Arjun Roy 
> Cc: "Kirill A. Shutemov" 
> ---
>  mm/memory.c | 54 +++--
>  1 file changed, 32 insertions(+), 22 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index d3273bd69dbb..a9a8ed1ac06c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4148,29 +4148,17 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>   goto out;
>   }
>  
> - /*
> -  * Make it present again, Depending on how arch implementes non
> -  * accessible ptes, some can allow access by kernel mode.
> -  */
> - old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
> + /* Get the normal PTE  */
> + old_pte = ptep_get(vmf->pte);
>   pte = pte_modify(old_pte, vma->vm_page_prot);
> - pte = pte_mkyoung(pte);
> - if (was_writable)
> - pte = pte_mkwrite(pte);
> - ptep_modify_prot_commit(vma, vmf->

[PATCH] mm/page_alloc: Add a bulk page allocator -fix -fix

2021-03-30 Thread Mel Gorman
Colin Ian King reported the following problem (slightly edited)

Author: Mel Gorman 
Date:   Mon Mar 29 11:12:24 2021 +1100

mm/page_alloc: add a bulk page allocator

...

Static analysis on linux-next with Coverity has found a potential
uninitialized variable issue in function __alloc_pages_bulk with
the following commit:

...

Uninitialized scalar variable (UNINIT)
15. uninit_use_in_call: Using uninitialized value alloc_flags when
calling prepare_alloc_pages.

5056if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask,
, _gfp, _flags))

The problem is that prepare_alloc_pages() only updates alloc_flags
which must have a valid initial value. The appropriate initial value is
ALLOC_WMARK_LOW to avoid the bulk allocator pushing a zone below the low
watermark without waking kswapd assuming the GFP mask allows kswapd to
be woken.

This is a second fix to the mmotm patch
mm-page_alloc-add-a-bulk-page-allocator.patch . It will cause a mild conflict
with a later patch due to renaming of an adjacent variable that is trivially
resolved. I can post a full series with the fixes merged if that is preferred.

Signed-off-by: Mel Gorman 
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 92d55f80c289..dabef0b910c9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4990,7 +4990,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 
preferred_nid,
struct list_head *pcp_list;
struct alloc_context ac;
gfp_t alloc_gfp;
-   unsigned int alloc_flags;
+   unsigned int alloc_flags = ALLOC_WMARK_LOW;
int allocated = 0;
 
if (WARN_ON_ONCE(nr_pages <= 0))


Re: mm/page_alloc: add a bulk page allocator

2021-03-30 Thread Mel Gorman
On Mon, Mar 29, 2021 at 04:18:09PM +0100, Colin Ian King wrote:
> Hi,
> 
> Static analysis on linux-next with Coverity has found a potential
> uninitialized variable issue in function __alloc_pages_bulk with the
> following commit:
> 
> commit b0e0a469733fa571ddd8fe147247c9561b51b2da
> Author: Mel Gorman 
> Date:   Mon Mar 29 11:12:24 2021 +1100
> 
> mm/page_alloc: add a bulk page allocator
> 
> The analysis is as follows:
> 
> > 
>
> 5050if (nr_pages - nr_populated == 1)
> 5051goto failed;
> 5052
> 5053/* May set ALLOC_NOFRAGMENT, fragmentation will return 1
> page. */
> 5054gfp &= gfp_allowed_mask;
> 5055alloc_gfp = gfp;
> 
> Uninitialized scalar variable (UNINIT)
> 15. uninit_use_in_call: Using uninitialized value alloc_flags when
> calling prepare_alloc_pages.
> 
> 5056if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask,
> , _gfp, _flags))

Ok, so Coverity thinks that alloc_flags is potentially uninitialised and
without digging into every part of the report, Coverity is right.

> 
>
> So alloc_flags in gfp_to_alloc_flags_cma is being updated with the |=
> operator and we managed to get to this path with uninitialized
> alloc_flags.  Should alloc_flags be initialized to zero in
> __alloc_page_bulk()?
> 

You are correct about the |= updating an initial value, but I think the
initialized value should be ALLOC_WMARK_LOW. A value of 0 would be the same
as ALLOC_WMARK_MIN and that would allow the bulk allocator to potentially
consume too many pages without waking kswapd.  I'll put together a patch
shortly. Thanks Colin!

-- 
Mel Gorman
SUSE Labs


[PATCH 6/6] mm/page_alloc: Reduce duration that IRQs are disabled for VM counters

2021-03-29 Thread Mel Gorman
IRQs are left disabled for the zone and node VM event counters. On some
architectures this is unnecessary and it confuses what the scope of the
locking for per-cpu lists and VM counters is.

This patch reduces the scope of IRQs being disabled via local_[lock|unlock]
and relies on preemption disabling for the per-cpu counters. This
is not completely free on all architectures as architectures
without HAVE_CMPXCHG_DOUBLE will disable/enable IRQs again for the
mod_zone_freepage_state call. However, it clarifies what the per-cpu
pages lock protects and how zone stats may need IRQs disabled if ever
called from an IRQ context.

Signed-off-by: Mel Gorman 
---
 mm/page_alloc.c | 22 --
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 32c64839c145..25d9351e75d8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3461,11 +3461,17 @@ static struct page *rmqueue_pcplist(struct zone 
*preferred_zone,
pcp = this_cpu_ptr(zone->per_cpu_pageset);
list = >lists[migratetype];
page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
+   local_unlock_irqrestore(, flags);
if (page) {
+   /*
+* per-cpu counter updates are not preempt-safe but it is
+* acceptable to race versus interrupts.
+*/
+   preempt_disable();
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone, 1);
+   preempt_enable();
}
-   local_unlock_irqrestore(, flags);
return page;
 }
 
@@ -3517,15 +3523,17 @@ struct page *rmqueue(struct zone *preferred_zone,
if (!page)
page = __rmqueue(zone, order, migratetype, alloc_flags);
} while (page && check_new_pages(page, order));
-   spin_unlock(>lock);
+   spin_unlock_irqrestore(>lock, flags);
+
if (!page)
goto failed;
+
+   preempt_disable();
__mod_zone_freepage_state(zone, -(1 << order),
  get_pcppage_migratetype(page));
-
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
-   local_irq_restore(flags);
+   preempt_enable();
 
 out:
/* Separate test+clear to avoid unnecessary atomics */
@@ -5090,10 +5098,12 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 
preferred_nid,
nr_populated++;
}
 
+   local_unlock_irqrestore(, flags);
+
+   preempt_disable();
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
-
-   local_unlock_irqrestore(, flags);
+   preempt_enable();
 
return nr_populated;
 
-- 
2.26.2



[PATCH 2/6] mm/page_alloc: Convert per-cpu list protection to local_lock

2021-03-29 Thread Mel Gorman
There is a lack of clarity of what exactly local_irq_save/local_irq_restore
protects in page_alloc.c . It conflates the protection of per-cpu page
allocation structures with per-cpu vmstat deltas.

This patch protects the PCP structure using local_lock which
for most configurations is identical to IRQ enabling/disabling.
The scope of the lock is still wider than it should be but this is
decreased in later patches. The per-cpu vmstat deltas are protected by
preempt_disable/preempt_enable where necessary instead of relying on
IRQ disable/enable.

[l...@intel.com: Make pagesets static]
Signed-off-by: Mel Gorman 
---
 include/linux/mmzone.h |  2 ++
 mm/page_alloc.c| 43 --
 mm/vmstat.c|  4 
 3 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a4393ac27336..106da8fbc72a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /* Free memory management - zoned buddy allocator.  */
@@ -337,6 +338,7 @@ enum zone_watermarks {
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
+/* Fields and list protected by pagesets local_lock in page_alloc.c */
 struct per_cpu_pages {
int count;  /* number of pages in the list */
int high;   /* high watermark, emptying needed */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 32006e66564a..7f8c73020688 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -112,6 +112,13 @@ typedef int __bitwise fpi_t;
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION   (8)
 
+struct pagesets {
+   local_lock_t lock;
+};
+static DEFINE_PER_CPU(struct pagesets, pagesets) = {
+   .lock = INIT_LOCAL_LOCK(lock),
+};
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -2962,12 +2969,12 @@ void drain_zone_pages(struct zone *zone, struct 
per_cpu_pages *pcp)
unsigned long flags;
int to_drain, batch;
 
-   local_irq_save(flags);
+   local_lock_irqsave(, flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
free_pcppages_bulk(zone, to_drain, pcp);
-   local_irq_restore(flags);
+   local_unlock_irqrestore(&pagesets.lock, flags);
 }
 #endif
 
@@ -2983,13 +2990,13 @@ static void drain_pages_zone(unsigned int cpu, struct 
zone *zone)
unsigned long flags;
struct per_cpu_pages *pcp;
 
-   local_irq_save(flags);
+   local_lock_irqsave(&pagesets.lock, flags);
 
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)
free_pcppages_bulk(zone, pcp->count, pcp);
 
-   local_irq_restore(flags);
+   local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3252,9 +3259,9 @@ void free_unref_page(struct page *page)
if (!free_unref_page_prepare(page, pfn))
return;
 
-   local_irq_save(flags);
+   local_lock_irqsave(&pagesets.lock, flags);
free_unref_page_commit(page, pfn);
-   local_irq_restore(flags);
+   local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3274,7 +3281,7 @@ void free_unref_page_list(struct list_head *list)
set_page_private(page, pfn);
}
 
-   local_irq_save(flags);
+   local_lock_irqsave(&pagesets.lock, flags);
list_for_each_entry_safe(page, next, list, lru) {
unsigned long pfn = page_private(page);
 
@@ -3287,12 +3294,12 @@ void free_unref_page_list(struct list_head *list)
 * a large list of pages to free.
 */
if (++batch_count == SWAP_CLUSTER_MAX) {
-   local_irq_restore(flags);
+   local_unlock_irqrestore(&pagesets.lock, flags);
batch_count = 0;
-   local_irq_save(flags);
+   local_lock_irqsave(&pagesets.lock, flags);
}
}
-   local_irq_restore(flags);
+   local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3449,7 +3456,7 @@ static struct page *rmqueue_pcplist(struct zone 
*preferred_zone,
struct page *page;
unsigned long flags;
 
-   local_irq_save(flags);
+   local_lock_irqsave(&pagesets.lock, flags);
pcp = this_cpu_ptr(zone->per_cpu_pageset);
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
@@ -3457,7 +3464,7 @@ static struct page *rmqueue_pcplist(struct zone 
*preferred_zone,
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone);
}
-   local_irq_restore(flags);
+   local_unlock_irqrestore(&pagesets.lock, flags);
return page;
 }
 
@@ -5052,7 +5059,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 

[PATCH 5/6] mm/page_alloc: Batch the accounting updates in the bulk allocator

2021-03-29 Thread Mel Gorman
Now that zone_statistics uses simple counters that do not require
special protection, the accounting updates in the bulk allocator can be
batched without requiring IRQs to be disabled.

Signed-off-by: Mel Gorman 
---
 include/linux/vmstat.h |  8 
 mm/page_alloc.c| 30 +-
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index dde4dec4e7dd..8473b8fa9756 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -246,6 +246,14 @@ __count_numa_event(struct zone *zone, enum numa_stat_item 
item)
raw_cpu_inc(pzstats->vm_numa_event[item]);
 }
 
+static inline void
+__count_numa_events(struct zone *zone, enum numa_stat_item item, long delta)
+{
+   struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+
+   raw_cpu_add(pzstats->vm_numa_event[item], delta);
+}
+
 extern void __count_numa_event(struct zone *zone, enum numa_stat_item item);
 extern unsigned long sum_zone_node_page_state(int node,
  enum zone_stat_item item);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7eb48632bcac..32c64839c145 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3398,7 +3398,8 @@ void __putback_isolated_page(struct page *page, unsigned 
int order, int mt)
  *
  * Must be called with interrupts disabled.
  */
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+  long nr_account)
 {
 #ifdef CONFIG_NUMA
enum numa_stat_item local_stat = NUMA_LOCAL;
@@ -3411,12 +3412,12 @@ static inline void zone_statistics(struct zone 
*preferred_zone, struct zone *z)
local_stat = NUMA_OTHER;
 
if (zone_to_nid(z) == zone_to_nid(preferred_zone))
-   __count_numa_event(z, NUMA_HIT);
+   __count_numa_events(z, NUMA_HIT, nr_account);
else {
-   __count_numa_event(z, NUMA_MISS);
-   __count_numa_event(preferred_zone, NUMA_FOREIGN);
+   __count_numa_events(z, NUMA_MISS, nr_account);
+   __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
}
-   __count_numa_event(z, local_stat);
+   __count_numa_events(z, local_stat, nr_account);
 #endif
 }
 
@@ -3462,7 +3463,7 @@ static struct page *rmqueue_pcplist(struct zone 
*preferred_zone,
page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
-   zone_statistics(preferred_zone, zone);
+   zone_statistics(preferred_zone, zone, 1);
}
local_unlock_irqrestore(&pagesets.lock, flags);
return page;
@@ -3523,7 +3524,7 @@ struct page *rmqueue(struct zone *preferred_zone,
  get_pcppage_migratetype(page));
 
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-   zone_statistics(preferred_zone, zone);
+   zone_statistics(preferred_zone, zone, 1);
local_irq_restore(flags);
 
 out:
@@ -5006,7 +5007,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 
preferred_nid,
struct alloc_context ac;
gfp_t alloc_gfp;
unsigned int alloc_flags;
-   int nr_populated = 0;
+   int nr_populated = 0, nr_account = 0;
 
if (unlikely(nr_pages <= 0))
return 0;
@@ -5079,15 +5080,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 
preferred_nid,
goto failed_irq;
break;
}
-
-   /*
-* Ideally this would be batched but the best way to do
-* that cheaply is to first convert zone_statistics to
-* be inaccurate per-cpu counter like vm_events to avoid
-* a RMW cycle then do the accounting with IRQs enabled.
-*/
-   __count_zid_vm_events(PGALLOC, zone_idx(zone), 1);
-   zone_statistics(ac.preferred_zoneref->zone, zone);
+   nr_account++;
 
prep_new_page(page, 0, gfp, 0);
if (page_list)
@@ -5097,6 +5090,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 
preferred_nid,
nr_populated++;
}
 
+   __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
+   zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
+
local_unlock_irqrestore(&pagesets.lock, flags);
 
return nr_populated;
-- 
2.26.2



[PATCH 4/6] mm/vmstat: Inline NUMA event counter updates

2021-03-29 Thread Mel Gorman
__count_numa_event is small enough to be treated similarly to
__count_vm_event so inline it.

Signed-off-by: Mel Gorman 
---
 include/linux/vmstat.h | 9 +
 mm/vmstat.c| 9 -
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index fc14415223c5..dde4dec4e7dd 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -237,6 +237,15 @@ static inline unsigned long 
zone_page_state_snapshot(struct zone *zone,
 }
 
 #ifdef CONFIG_NUMA
+/* See __count_vm_event comment on why raw_cpu_inc is used. */
+static inline void
+__count_numa_event(struct zone *zone, enum numa_stat_item item)
+{
+   struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+
+   raw_cpu_inc(pzstats->vm_numa_event[item]);
+}
+
 extern void __count_numa_event(struct zone *zone, enum numa_stat_item item);
 extern unsigned long sum_zone_node_page_state(int node,
  enum zone_stat_item item);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 46bc61184afc..a326483dd4ab 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -906,15 +906,6 @@ void drain_zonestat(struct zone *zone, struct 
per_cpu_zonestat *pzstats)
 #endif
 
 #ifdef CONFIG_NUMA
-/* See __count_vm_event comment on why raw_cpu_inc is used. */
-void __count_numa_event(struct zone *zone,
-enum numa_stat_item item)
-{
-   struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
-
-   raw_cpu_inc(pzstats->vm_numa_event[item]);
-}
-
 /*
  * Determine the per node value of a stat item. This function
  * is called frequently in a NUMA machine, so try to be as
-- 
2.26.2



[PATCH 3/6] mm/vmstat: Convert NUMA statistics to basic NUMA counters

2021-03-29 Thread Mel Gorman
NUMA statistics are maintained on the zone level for hits, misses, foreign
etc but nothing relies on them being perfectly accurate for functional
correctness. This is overkill as the counters are used by userspace to get
a general overview of a workload's NUMA behaviour but the page allocator incurs
a high cost to maintain perfect accuracy similar to what is required for
a vmstat like NR_FREE_PAGES. There even is a sysctl vm.numa_stat to allow
userspace to turn off the collection of NUMA statistics like NUMA_HIT.

This patch converts NUMA_HIT and friends to be NUMA events with similar
accuracy to VM events. There is a possibility that slight errors will be
introduced but the overall trend as seen by userspace will be similar.
Note that while these counters could be maintained at the node level,
it would have a user-visible impact.

Signed-off-by: Mel Gorman 
---
 drivers/base/node.c|  18 +++--
 include/linux/mmzone.h |  11 ++-
 include/linux/vmstat.h |  42 +-
 mm/mempolicy.c |   2 +-
 mm/page_alloc.c|  12 +--
 mm/vmstat.c| 175 -
 6 files changed, 93 insertions(+), 167 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index f449dbb2c746..443a609db428 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -484,6 +484,7 @@ static DEVICE_ATTR(meminfo, 0444, node_read_meminfo, NULL);
 static ssize_t node_read_numastat(struct device *dev,
  struct device_attribute *attr, char *buf)
 {
+   fold_vm_numa_events();
return sysfs_emit(buf,
  "numa_hit %lu\n"
  "numa_miss %lu\n"
@@ -491,12 +492,12 @@ static ssize_t node_read_numastat(struct device *dev,
  "interleave_hit %lu\n"
  "local_node %lu\n"
  "other_node %lu\n",
- sum_zone_numa_state(dev->id, NUMA_HIT),
- sum_zone_numa_state(dev->id, NUMA_MISS),
- sum_zone_numa_state(dev->id, NUMA_FOREIGN),
- sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
- sum_zone_numa_state(dev->id, NUMA_LOCAL),
- sum_zone_numa_state(dev->id, NUMA_OTHER));
+ sum_zone_numa_event_state(dev->id, NUMA_HIT),
+ sum_zone_numa_event_state(dev->id, NUMA_MISS),
+ sum_zone_numa_event_state(dev->id, NUMA_FOREIGN),
+ sum_zone_numa_event_state(dev->id, 
NUMA_INTERLEAVE_HIT),
+ sum_zone_numa_event_state(dev->id, NUMA_LOCAL),
+ sum_zone_numa_event_state(dev->id, NUMA_OTHER));
 }
 static DEVICE_ATTR(numastat, 0444, node_read_numastat, NULL);
 
@@ -514,10 +515,11 @@ static ssize_t node_read_vmstat(struct device *dev,
 sum_zone_node_page_state(nid, i));
 
 #ifdef CONFIG_NUMA
-   for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+   fold_vm_numa_events();
+   for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
len += sysfs_emit_at(buf, len, "%s %lu\n",
 numa_stat_name(i),
-sum_zone_numa_state(nid, i));
+sum_zone_numa_event_state(nid, i));
 
 #endif
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 106da8fbc72a..693cd5f24f7d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -135,10 +135,10 @@ enum numa_stat_item {
NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
NUMA_LOCAL, /* allocation from local node */
NUMA_OTHER, /* allocation from other node */
-   NR_VM_NUMA_STAT_ITEMS
+   NR_VM_NUMA_EVENT_ITEMS
 };
 #else
-#define NR_VM_NUMA_STAT_ITEMS 0
+#define NR_VM_NUMA_EVENT_ITEMS 0
 #endif
 
 enum zone_stat_item {
@@ -357,7 +357,10 @@ struct per_cpu_zonestat {
s8 stat_threshold;
 #endif
 #ifdef CONFIG_NUMA
-   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
+   u16 vm_numa_stat_diff[NR_VM_NUMA_EVENT_ITEMS];
+#endif
+#ifdef CONFIG_NUMA
+   unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
 #endif
 };
 
@@ -609,7 +612,7 @@ struct zone {
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t   vm_stat[NR_VM_ZONE_STAT_ITEMS];
-   atomic_long_t   vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
+   atomic_long_t   vm_numa_events[NR_VM_NUMA_EVENT_ITEMS];
 } cacheline_internodealigned_in_smp;
 
 enum pgdat_flags {
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 1736ea9d24a7..fc14415223c5 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -138,35 +138,

[PATCH 1/6] mm/page_alloc: Split per cpu page lists and zone stats

2021-03-29 Thread Mel Gorman
The per-cpu page allocator lists and the per-cpu vmstat deltas are stored
in the same struct per_cpu_pageset even though vmstats have no direct impact
on the per-cpu page lists. This is inconsistent because the vmstats for a
node are stored on a dedicated structure. The bigger issue is that the
per_cpu_pages structure is not cache-aligned and stat updates either
cache conflict with adjacent per-cpu lists incurring a runtime cost or
padding is required incurring a memory cost.

This patch splits the per-cpu pagelists and the vmstat deltas into separate
structures. It's mostly a mechanical conversion but some variable renaming
is done to clearly distinguish the per-cpu pages structure (pcp) from
the vmstats (pzstats).

Superficially, this appears to increase the size of the per_cpu_pages
structure but the movement of expire fills a structure hole so there is
no impact overall.

[l...@intel.com: Check struct per_cpu_zonestat has a non-zero size]
Signed-off-by: Mel Gorman 
---
 include/linux/mmzone.h | 18 
 include/linux/vmstat.h |  8 ++--
 mm/page_alloc.c| 84 +++-
 mm/vmstat.c| 96 ++
 4 files changed, 110 insertions(+), 96 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 47946cec7584..a4393ac27336 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -341,20 +341,21 @@ struct per_cpu_pages {
int count;  /* number of pages in the list */
int high;   /* high watermark, emptying needed */
int batch;  /* chunk size for buddy add/remove */
+#ifdef CONFIG_NUMA
+   int expire; /* When 0, remote pagesets are drained */
+#endif
 
/* Lists of pages, one per migrate type stored on the pcp-lists */
struct list_head lists[MIGRATE_PCPTYPES];
 };
 
-struct per_cpu_pageset {
-   struct per_cpu_pages pcp;
-#ifdef CONFIG_NUMA
-   s8 expire;
-   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
-#endif
+struct per_cpu_zonestat {
 #ifdef CONFIG_SMP
-   s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
+   s8 stat_threshold;
+#endif
+#ifdef CONFIG_NUMA
+   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
 #endif
 };
 
@@ -470,7 +471,8 @@ struct zone {
int node;
 #endif
struct pglist_data  *zone_pgdat;
-   struct per_cpu_pageset __percpu *pageset;
+   struct per_cpu_pages__percpu *per_cpu_pageset;
+   struct per_cpu_zonestat __percpu *per_cpu_zonestats;
/*
 * the high and batch values are copied to individual pagesets for
 * faster access
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 506d625163a1..1736ea9d24a7 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -163,7 +163,7 @@ static inline unsigned long zone_numa_state_snapshot(struct 
zone *zone,
int cpu;
 
for_each_online_cpu(cpu)
-   x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item];
+   x += per_cpu_ptr(zone->per_cpu_zonestats, 
cpu)->vm_numa_stat_diff[item];
 
return x;
 }
@@ -236,7 +236,7 @@ static inline unsigned long zone_page_state_snapshot(struct 
zone *zone,
 #ifdef CONFIG_SMP
int cpu;
for_each_online_cpu(cpu)
-   x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
+   x += per_cpu_ptr(zone->per_cpu_zonestats, 
cpu)->vm_stat_diff[item];
 
if (x < 0)
x = 0;
@@ -291,7 +291,7 @@ struct ctl_table;
 int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp,
loff_t *ppos);
 
-void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
+void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *);
 
 int calculate_pressure_threshold(struct zone *zone);
 int calculate_normal_threshold(struct zone *zone);
@@ -399,7 +399,7 @@ static inline void cpu_vm_stats_fold(int cpu) { }
 static inline void quiet_vmstat(void) { }
 
 static inline void drain_zonestat(struct zone *zone,
-   struct per_cpu_pageset *pset) { }
+   struct per_cpu_zonestat *pzstats) { }
 #endif /* CONFIG_SMP */
 
 static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e478ee24e282..32006e66564a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2981,15 +2981,14 @@ void drain_zone_pages(struct zone *zone, struct 
per_cpu_pages *pcp)
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
unsigned long flags;
-   struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
 
local_irq_save(flags);
-   pset = per_cpu_ptr(zone->pageset, cpu);
 
-   pcp = &pset->pcp;
+   pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)
f

[RFC PATCH 0/6] Use local_lock for pcp protection and reduce stat overhead

2021-03-29 Thread Mel Gorman
This series requires patches in Andrew's tree so the series is also
available at

git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux.git 
mm-percpu-local_lock-v1r15

tldr: Jesper and Chuck, it would be nice to verify if this series helps
the allocation rate of the bulk page allocator. RT people, this
*partially* addresses some problems PREEMPT_RT has with the page
allocator but it needs review.

The PCP (per-cpu page allocator in page_alloc.c) shares locking requirements
with vmstat which is inconvenient and causes some issues. Possibly because
of that, the PCP list and vmstat share the same per-cpu space meaning that
it's possible that vmstat updates dirty cache lines holding per-cpu lists
across CPUs unless padding is used. The series splits that structure and
separates the locking.

Second, PREEMPT_RT considers the following sequence to be unsafe
as documented in Documentation/locking/locktypes.rst

   local_irq_disable();
   spin_lock();

The pcp allocator has this sequence for rmqueue_pcplist (local_irq_save)
-> __rmqueue_pcplist -> rmqueue_bulk (spin_lock). This series explicitly
separates the locking requirements for the PCP list (local_lock) and stat
updates (irqs disabled). Once that is done, the length of time IRQs are
disabled can be reduced and in some cases, IRQ disabling can be replaced
with preempt_disable.
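
To make the intended split concrete, here is a minimal sketch (illustration
only, not the patched code itself) of roughly what the allocation fast path
looks like once the series has separated the two concerns:

    /*
     * Sketch: the PCP lists are protected by a local_lock while the
     * statistics only need preemption disabled because they tolerate
     * small inaccuracies.
     */
    static struct page *sketch_rmqueue_pcplist(struct zone *preferred_zone,
                           struct zone *zone, int migratetype,
                           unsigned int alloc_flags)
    {
        struct per_cpu_pages *pcp;
        struct list_head *list;
        struct page *page;
        unsigned long flags;

        local_lock_irqsave(&pagesets.lock, flags); /* pins the CPU, protects pcp->lists */
        pcp = this_cpu_ptr(zone->per_cpu_pageset);
        list = &pcp->lists[migratetype];
        page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
        local_unlock_irqrestore(&pagesets.lock, flags);

        if (page) {
            preempt_disable();  /* enough for the inaccurate counters */
            __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
            zone_statistics(preferred_zone, zone, 1);
            preempt_enable();
        }
        return page;
    }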

After that, it was very obvious that zone_statistics in particular has way
too much overhead and leaves IRQs disabled for longer than necessary. It
has perfectly accurate counters requiring IRQs be disabled for parallel
RMW sequences when inaccurate ones like vm_events would do. The series
makes the NUMA statistics (NUMA_HIT and friends) inaccurate counters that
only require preempt be disabled.
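
The counter side of that conversion boils down to a racy per-cpu increment,
along the lines of this sketch (mirroring the __count_numa_event change
later in the series):

    /* Sketch: an inaccurate NUMA event counter, safe with only preemption off */
    static inline void
    sketch_count_numa_event(struct zone *zone, enum numa_stat_item item)
    {
        struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;

        /* Racy RMW: a concurrent update may be lost, as with vm_events */
        raw_cpu_inc(pzstats->vm_numa_event[item]);
    }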

Finally the bulk page allocator can then do all the stat updates in bulk
with IRQs enabled which should improve the efficiency of the bulk page
allocator. Technically, this could have been done without the local_lock
and vmstat conversion work and the order simply reflects the timing of
when different series were implemented.

No performance data is included because despite the overhead of the
stats, it's within the noise for most workloads but Jesper and Chuck may
observe a significant difference with the same tests used for the bulk
page allocator. The series is more likely to be interesting to the RT
folk in terms of slowly getting the PREEMPT tree into mainline.

 drivers/base/node.c|  18 +--
 include/linux/mmzone.h |  29 +++--
 include/linux/vmstat.h |  65 ++-
 mm/mempolicy.c |   2 +-
 mm/page_alloc.c| 173 
 mm/vmstat.c| 254 +++--
 6 files changed, 254 insertions(+), 287 deletions(-)

-- 
2.26.2



Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

2021-03-28 Thread Mel Gorman
On Fri, Mar 26, 2021 at 03:19:32PM -0400, Rik van Riel wrote:
> ---8<---
> sched,fair: bring back select_idle_smt, but differently
> 
> Mel Gorman did some nice work in 9fe1f127b913
> ("sched/fair: Merge select_idle_core/cpu()"), resulting in the kernel
> being more efficient at finding an idle CPU, and in tasks spending less
> time waiting to be run, both according to the schedstats run_delay
> numbers, and according to measured application latencies. Yay.
> 
> The flip side of this is that we see more task migrations (about
> 30% more), higher cache misses, higher memory bandwidth utilization,
> and higher CPU use, for the same number of requests/second.
> 
> This is most pronounced on a memcache type workload, which saw
> a consistent 1-3% increase in total CPU use on the system, due
> to those increased task migrations leading to higher L2 cache
> miss numbers, and higher memory utilization. The exclusive L3
> cache on Skylake does us no favors there.
> 
> On our web serving workload, that effect is usually negligible.
> 
> It appears that the increased number of CPU migrations is generally
> a good thing, since it leads to lower cpu_delay numbers, reflecting
> the fact that tasks get to run faster. However, the reduced locality
> and the corresponding increase in L2 cache misses hurts a little.
> 
> The patch below appears to fix the regression, while keeping the
> benefit of the lower cpu_delay numbers, by reintroducing select_idle_smt
> with a twist: when a socket has no idle cores, check to see if the
> sibling of "prev" is idle, before searching all the other CPUs.
> 
> This fixes both the occasional 9% regression on the web serving
> workload, and the continuous 2% CPU use regression on the memcache
> type workload.
> 
> With Mel's patches and this patch together, task migrations are still
> high, but L2 cache misses, memory bandwidth, and CPU time used are back
> down to what they were before. The p95 and p99 response times for the
> memcache type application improve by about 10% over what they were
> before Mel's patches got merged.
> 
> Signed-off-by: Rik van Riel 

FWIW, v3 appears to have performed faster than v2 on the few tests I ran
and the patch looks fine.

Reviewed-by: Mel Gorman 

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 0/9 v6] Introduce a bulk order-0 page allocator with two in-tree users

2021-03-25 Thread Mel Gorman
On Thu, Mar 25, 2021 at 03:06:57PM +0100, Uladzislau Rezki wrote:
> > On Thu, Mar 25, 2021 at 12:50:01PM +, Matthew Wilcox wrote:
> > > On Thu, Mar 25, 2021 at 11:42:19AM +0000, Mel Gorman wrote:
> > > > This series introduces a bulk order-0 page allocator with sunrpc and
> > > > the network page pool being the first users. The implementation is not
> > > > efficient as semantics needed to be ironed out first. If no other 
> > > > semantic
> > > > changes are needed, it can be made more efficient.  Despite that, this
> > > > is a performance-related for users that require multiple pages for an
> > > > operation without multiple round-trips to the page allocator. Quoting
> > > > the last patch for the high-speed networking use-case
> > > > 
> > > > Kernel  XDP stats   CPU pps   Delta
> > > > BaselineXDP-RX CPU  total   3,771,046   n/a
> > > > ListXDP-RX CPU  total   3,940,242+4.49%
> > > > Array   XDP-RX CPU  total   4,249,224   +12.68%
> > > > 
> > > > From the SUNRPC traces of svc_alloc_arg()
> > > > 
> > > > Single page: 25.007 us per call over 532,571 calls
> > > > Bulk list:6.258 us per call over 517,034 calls
> > > > Bulk array:   4.590 us per call over 517,442 calls
> > > > 
> > > > Both potential users in this series are corner cases (NFS and high-speed
> > > > networks) so it is unlikely that most users will see any benefit in the
> > > > short term. Other potential other users are batch allocations for page
> > > > cache readahead, fault around and SLUB allocations when high-order pages
> > > > are unavailable. It's unknown how much benefit would be seen by 
> > > > converting
> > > > multiple page allocation calls to a single batch or what difference it 
> > > > may
> > > > make to headline performance.
> > > 
> > > We have a third user, vmalloc(), with a 16% perf improvement.  I know the
> > > email says 21% but that includes the 5% improvement from switching to
> > > kvmalloc() to allocate area->pages.
> > > 
> > > https://lore.kernel.org/linux-mm/20210323133948.ga10...@pc638.lan/
> > > 
> > 
> > That's fairly promising. Assuming the bulk allocator gets merged, it would
> > make sense to add vmalloc on top. That's for bringing it to my attention
> > because it's far more relevant than my imaginary potential use cases.
> > 
> For the vmalloc we should be able to allocating on a specific NUMA node,
> at least the current interface takes it into account. As far as i see
> the current interface allocate on a current node:
> 
> static inline unsigned long
> alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page 
> **page_array)
> {
> return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, 
> page_array);
> }
> 
> Or am i missing something?
> 

No, you're not missing anything. Options would be to add a helper similar to
alloc_pages_node or to directly call __alloc_pages_bulk specifying a node
and using GFP_THISNODE. prepare_alloc_pages() should pick the correct
zonelist containing only the required node.
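
If such a helper were added, it might look something like the sketch below
(hypothetical name and GFP handling, not part of this series):

    /* Hypothetical sketch: bulk allocate order-0 pages from a specific node */
    static inline unsigned long
    alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages,
                    struct page **page_array)
    {
        if (nid == NUMA_NO_NODE)
            nid = numa_mem_id();
        else
            gfp |= __GFP_THISNODE; /* restrict the allocation to the requested node */

        return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array);
    }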

> --
> Vlad Rezki

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 0/9 v6] Introduce a bulk order-0 page allocator with two in-tree users

2021-03-25 Thread Mel Gorman
On Thu, Mar 25, 2021 at 12:50:01PM +, Matthew Wilcox wrote:
> On Thu, Mar 25, 2021 at 11:42:19AM +0000, Mel Gorman wrote:
> > This series introduces a bulk order-0 page allocator with sunrpc and
> > the network page pool being the first users. The implementation is not
> > efficient as semantics needed to be ironed out first. If no other semantic
> > changes are needed, it can be made more efficient.  Despite that, this
> > is a performance-related for users that require multiple pages for an
> > operation without multiple round-trips to the page allocator. Quoting
> > the last patch for the high-speed networking use-case
> > 
> > Kernel  XDP stats   CPU pps   Delta
> > BaselineXDP-RX CPU  total   3,771,046   n/a
> > ListXDP-RX CPU  total   3,940,242+4.49%
> > Array   XDP-RX CPU  total   4,249,224   +12.68%
> > 
> > From the SUNRPC traces of svc_alloc_arg()
> > 
> > Single page: 25.007 us per call over 532,571 calls
> > Bulk list:6.258 us per call over 517,034 calls
> > Bulk array:   4.590 us per call over 517,442 calls
> > 
> > Both potential users in this series are corner cases (NFS and high-speed
> > networks) so it is unlikely that most users will see any benefit in the
> > short term. Other potential other users are batch allocations for page
> > cache readahead, fault around and SLUB allocations when high-order pages
> > are unavailable. It's unknown how much benefit would be seen by converting
> > multiple page allocation calls to a single batch or what difference it may
> > make to headline performance.
> 
> We have a third user, vmalloc(), with a 16% perf improvement.  I know the
> email says 21% but that includes the 5% improvement from switching to
> kvmalloc() to allocate area->pages.
> 
> https://lore.kernel.org/linux-mm/20210323133948.ga10...@pc638.lan/
> 

That's fairly promising. Assuming the bulk allocator gets merged, it would
make sense to add vmalloc on top. Thanks for bringing it to my attention
because it's far more relevant than my imaginary potential use cases.

> I don't know how many _frequent_ vmalloc users we have that will benefit
> from this, but it's probably more than will benefit from improvements
> to 200Gbit networking performance.

I think it was 100Gbit being looked at but your point is still valid and
there is no harm in incrementally improving over time.

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 4/9] mm/page_alloc: optimize code layout for __alloc_pages_bulk

2021-03-25 Thread Mel Gorman
On Thu, Mar 25, 2021 at 12:12:17PM +, Matthew Wilcox wrote:
> On Thu, Mar 25, 2021 at 11:42:23AM +0000, Mel Gorman wrote:
> >  
> > -   if (WARN_ON_ONCE(nr_pages <= 0))
> > +   if (unlikely(nr_pages <= 0))
> > return 0;
> 
> If we made nr_pages unsigned, we wouldn't need this check at all (ok,
> we'd still need to figure out what to do with 0).  But then, if a user
> inadvertently passes in -ENOMEM, we'll try to allocate 4 billion pages.

This is exactly why nr_pages is signed. An error in accounting by the
caller potentially puts the system under severe memory pressure. This
*should* only be a problem when a new caller of the API is being
implemented. The warning goes away in a later patch for reasons explained
in the changelog.
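
As a purely illustrative fragment of the failure mode being guarded against
(hypothetical caller and helper names):

    /* Illustration only: why a signed nr_pages plus the <= 0 check helps */
    struct page *page_array[16] = { NULL };
    int nr_needed = count_required_pages();  /* hypothetical; may return -ENOMEM */
    unsigned long nr_allocated;

    /*
     * If nr_pages were unsigned, -ENOMEM (-12) would silently become a
     * request for roughly 2^64 pages. With the signed parameter, the
     * nr_pages <= 0 check catches it and the call returns 0.
     */
    nr_allocated = __alloc_pages_bulk(GFP_KERNEL, numa_mem_id(), NULL,
                                      nr_needed, NULL, page_array);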

> So maybe we should check it.  Gah, API design is hard.

Yep.

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 2/9] mm/page_alloc: Add a bulk page allocator

2021-03-25 Thread Mel Gorman
On Thu, Mar 25, 2021 at 12:05:25PM +, Matthew Wilcox wrote:
> On Thu, Mar 25, 2021 at 11:42:21AM +0000, Mel Gorman wrote:
> > +int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
> > +   nodemask_t *nodemask, int nr_pages,
> > +   struct list_head *list);
> > +
> > +/* Bulk allocate order-0 pages */
> > +static inline unsigned long
> > +alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
> > +{
> > +   return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list);
> 
> Discrepancy in the two return types here.  Suspect they should both
> be 'unsigned int' so there's no question about "can it return an errno".
> 

I'll make it unsigned long as the nr_pages parameter is unsigned long.
It's a silly range to have for pages but it matches alloc_contig_range
even though free_contig_range takes unsigned int *sigh*

> >  
> > +/*
> 
> If you could make that "/**" instead ...
> 

I decided not to until we're reasonably sure the semantics are not going
to change.

---8<---
mm/page_alloc: Add a bulk page allocator -fix

Matthew Wilcox pointed out that the return type for alloc_pages_bulk()
and __alloc_pages_bulk() is inconsistent. Fix it.

Signed-off-by: Mel Gorman 
---
 include/linux/gfp.h | 2 +-
 mm/page_alloc.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 4a304fd39916..a2be8f4174a9 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -518,7 +518,7 @@ static inline int arch_make_page_accessible(struct page 
*page)
 struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask);
 
-int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
struct list_head *list);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eb547470a7e4..92d55f80c289 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4978,7 +4978,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, 
unsigned int order,
  *
  * Returns the number of pages on the list.
  */
-int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
struct list_head *page_list)
 {


Re: [RFC] mm: activate access-more-than-once page via NUMA balancing

2021-03-25 Thread Mel Gorman
On Thu, Mar 25, 2021 at 12:33:45PM +0800, Huang, Ying wrote:
> > I caution against this patch.
> >
> > It's non-deterministic for a number of reasons. As it requires NUMA
> > balancing to be enabled, the pageout behaviour of a system changes when
> > NUMA balancing is active. If this led to pages being artificially and
> > inappropriately preserved, NUMA balancing could be disabled for the
> > wrong reasons.  It only applies to pages that have no target node so
> > memory policies affect which pages are activated differently. Similarly,
> > NUMA balancing does not scan all VMAs and some pages may never trap a
> > NUMA fault as a result. The timing of when an address space gets scanned
> > is driven by the locality of pages and so the timing of page activation
> > potentially becomes linked to whether pages are local or need to migrate
> > (although not right now for this patch as it only affects pages with a
> > target nid of NUMA_NO_NODE). In other words, changes in NUMA balancing
> > that affect migration potentially affect the aging rate.  Similarly,
> > the activate rate of a process with a single thread and multiple threads
> > potentially have different activation rates.
> >
> > Finally, the NUMA balancing scan algorithm is sub-optimal. It potentially
> > scans the entire address space even though only a small number of pages
> > are scanned. This is particularly problematic when a process has a lot
> > of threads because threads are redundantly scanning the same regions. If
> > NUMA balancing ever introduced range tracking of faulted pages to limit
> > how much scanning it has to do, it would inadvertently cause a change in
> > page activation rate.
> >
> > NUMA balancing is about page locality, it should not get conflated with
> > page aging.
> 
> I understand your concerns about binding the NUMA balancing and page
> reclaiming.  The requirement of the page locality and page aging is
> different, so the policies need to be different.  This is the wrong part
> of the patch.
> 
> From another point of view, it's still possible to share some underlying
> mechanisms (and code) between them.  That is, scanning the page tables
> to make pages unaccessible and capture the page accesses via the page
> fault. 

Potentially yes but not necessarily recommended for page aging. NUMA
balancing has to be careful about the rate it scans pages to avoid
excessive overhead so it's driven by locality. The scanning happens
within a task's context so during that time, the task is not executing
its normal work and it incurs the overhead for faults. Generally, this
is not too much overhead because pages get migrated locally, the scan
rate drops and so does the overhead.

However, if you want to drive page aging, that is constant so the rate
could not be easily adapted in a way that would be deterministic.

> Now these page accessing information is used for the page
> locality.  Do you think it's a good idea to use these information for
> the page aging too (but with a different policy as you pointed out)?
> 

I'm not completely opposed to it but I think the overhead it would
introduce could be severe. Worse, if a workload fits in memory and there
is limited to no memory pressure, it's all overhead for no gain. Early
generations of NUMA balancing had to find a balance to ensure the gains
from locality exceeded the cost of measuring locality, and doing the same
for page aging is in some ways even more challenging.

> From yet another point of view :-), in current NUMA balancing
> implementation, it's assumed that the node private pages can fit in the
> accessing node.  But this may be not always true.  Is it a valid
> optimization to migrate the hot private pages first?
> 

I'm not sure how the hotness of pages could be ranked. At the time of a
hinting fault, the page is by definition active now because it has just been
accessed. Prioritising what pages to migrate based on the number of faults
that have been trapped would have to be stored somewhere.

-- 
Mel Gorman
SUSE Labs


[PATCH 9/9] net: page_pool: use alloc_pages_bulk in refill code path

2021-03-25 Thread Mel Gorman
From: Jesper Dangaard Brouer 

There are cases where the page_pool needs to refill with pages from the
page allocator. Some workloads cause the page_pool to release pages
instead of recycling these pages.

For these workloads it can improve performance to bulk alloc pages from
the page-allocator to refill the alloc cache.

For an XDP-redirect workload with the 100G mlx5 driver (which uses page_pool),
redirecting xdp_frame packets into a veth that does XDP_PASS creates an SKB
from the xdp_frame, and that SKB cannot return the page to the page_pool.

Performance results under GitHub xdp-project[1]:
 [1] 
https://github.com/xdp-project/xdp-project/blob/master/areas/mem/page_pool06_alloc_pages_bulk.org

Mel: The patch "net: page_pool: convert to use alloc_pages_bulk_array
variant" was squashed with this patch. From the test page, the array
variant was superior with one of the test results as follows.

Kernel  XDP stats   CPU pps   Delta
BaselineXDP-RX CPU  total   3,771,046   n/a
ListXDP-RX CPU  total   3,940,242+4.49%
Array   XDP-RX CPU  total   4,249,224   +12.68%

Signed-off-by: Jesper Dangaard Brouer 
Signed-off-by: Mel Gorman 
---
 include/net/page_pool.h |  2 +-
 net/core/page_pool.c| 82 -
 2 files changed, 57 insertions(+), 27 deletions(-)

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index b5b195305346..6d517a37c18b 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -65,7 +65,7 @@
 #define PP_ALLOC_CACHE_REFILL  64
 struct pp_alloc_cache {
u32 count;
-   void *cache[PP_ALLOC_CACHE_SIZE];
+   struct page *cache[PP_ALLOC_CACHE_SIZE];
 };
 
 struct page_pool_params {
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 40e1b2beaa6c..9ec1aa9640ad 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -203,38 +203,17 @@ static bool page_pool_dma_map(struct page_pool *pool, 
struct page *page)
return true;
 }
 
-/* slow path */
-noinline
-static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
-gfp_t _gfp)
+static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
+gfp_t gfp)
 {
-   unsigned int pp_flags = pool->p.flags;
struct page *page;
-   gfp_t gfp = _gfp;
-
-   /* We could always set __GFP_COMP, and avoid this branch, as
-* prep_new_page() can handle order-0 with __GFP_COMP.
-*/
-   if (pool->p.order)
-   gfp |= __GFP_COMP;
-
-   /* FUTURE development:
-*
-* Current slow-path essentially falls back to single page
-* allocations, which doesn't improve performance.  This code
-* need bulk allocation support from the page allocator code.
-*/
 
-   /* Cache was empty, do real allocation */
-#ifdef CONFIG_NUMA
+   gfp |= __GFP_COMP;
page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
-#else
-   page = alloc_pages(gfp, pool->p.order);
-#endif
-   if (!page)
+   if (unlikely(!page))
return NULL;
 
-   if ((pp_flags & PP_FLAG_DMA_MAP) &&
+   if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
unlikely(!page_pool_dma_map(pool, page))) {
put_page(page);
return NULL;
@@ -243,6 +222,57 @@ static struct page *__page_pool_alloc_pages_slow(struct 
page_pool *pool,
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+   return page;
+}
+
+/* slow path */
+noinline
+static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
+gfp_t gfp)
+{
+   const int bulk = PP_ALLOC_CACHE_REFILL;
+   unsigned int pp_flags = pool->p.flags;
+   unsigned int pp_order = pool->p.order;
+   struct page *page;
+   int i, nr_pages;
+
+   /* Don't support bulk alloc for high-order pages */
+   if (unlikely(pp_order))
+   return __page_pool_alloc_page_order(pool, gfp);
+
+   /* Unnecessary as alloc cache is empty, but guarantees zero count */
+   if (unlikely(pool->alloc.count > 0))
+   return pool->alloc.cache[--pool->alloc.count];
+
+   /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
+   memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
+
+   nr_pages = alloc_pages_bulk_array(gfp, bulk, pool->alloc.cache);
+   if (unlikely(!nr_pages))
+   return NULL;
+
+   /* Pages have been filled into alloc.cache array, but count is zero and
+* page element have not been (possibly) DMA mapped.
+*/
+   for (i = 0; i &

[PATCH 8/9] net: page_pool: refactor dma_map into own function page_pool_dma_map

2021-03-25 Thread Mel Gorman
From: Jesper Dangaard Brouer 

In preparation for next patch, move the dma mapping into its own
function, as this will make it easier to follow the changes.

[ilias.apalodimas: make page_pool_dma_map return boolean]
Signed-off-by: Jesper Dangaard Brouer 
Reviewed-by: Ilias Apalodimas 
Signed-off-by: Mel Gorman 
---
 net/core/page_pool.c | 45 +---
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index ad8b0707af04..40e1b2beaa6c 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -180,14 +180,37 @@ static void page_pool_dma_sync_for_device(struct 
page_pool *pool,
 pool->p.dma_dir);
 }
 
+static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
+{
+   dma_addr_t dma;
+
+   /* Setup DMA mapping: use 'struct page' area for storing DMA-addr
+* since dma_addr_t can be either 32 or 64 bits and does not always fit
+* into page private data (i.e 32bit cpu with 64bit DMA caps)
+* This mapping is kept for lifetime of page, until leaving pool.
+*/
+   dma = dma_map_page_attrs(pool->p.dev, page, 0,
+(PAGE_SIZE << pool->p.order),
+pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
+   if (dma_mapping_error(pool->p.dev, dma))
+   return false;
+
+   page->dma_addr = dma;
+
+   if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+   page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
+
+   return true;
+}
+
 /* slow path */
 noinline
 static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 gfp_t _gfp)
 {
+   unsigned int pp_flags = pool->p.flags;
struct page *page;
gfp_t gfp = _gfp;
-   dma_addr_t dma;
 
/* We could always set __GFP_COMP, and avoid this branch, as
 * prep_new_page() can handle order-0 with __GFP_COMP.
@@ -211,30 +234,14 @@ static struct page *__page_pool_alloc_pages_slow(struct 
page_pool *pool,
if (!page)
return NULL;
 
-   if (!(pool->p.flags & PP_FLAG_DMA_MAP))
-   goto skip_dma_map;
-
-   /* Setup DMA mapping: use 'struct page' area for storing DMA-addr
-* since dma_addr_t can be either 32 or 64 bits and does not always fit
-* into page private data (i.e 32bit cpu with 64bit DMA caps)
-* This mapping is kept for lifetime of page, until leaving pool.
-*/
-   dma = dma_map_page_attrs(pool->p.dev, page, 0,
-(PAGE_SIZE << pool->p.order),
-pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
-   if (dma_mapping_error(pool->p.dev, dma)) {
+   if ((pp_flags & PP_FLAG_DMA_MAP) &&
+   unlikely(!page_pool_dma_map(pool, page))) {
put_page(page);
return NULL;
}
-   page->dma_addr = dma;
 
-   if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
-   page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
-
-skip_dma_map:
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
-
trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
 
/* When page just alloc'ed is should/must have refcnt 1. */
-- 
2.26.2



[PATCH 6/9] SUNRPC: Set rq_page_end differently

2021-03-25 Thread Mel Gorman
From: Chuck Lever 

Patch series "SUNRPC consumer for the bulk page allocator"

This patch set and the measurements below are based on yesterday's
bulk allocator series:

git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux.git mm-bulk-rebase-v5r9

The patches change SUNRPC to invoke the array-based bulk allocator
instead of alloc_page().

The micro-benchmark results are promising. I ran a mixture of 256KB
reads and writes over NFSv3. The server's kernel is built with KASAN
enabled, so the comparison is exaggerated but I believe it is still
valid.

I instrumented svc_recv() to measure the latency of each call to
svc_alloc_arg() and report it via a trace point. The following
results are averages across the trace events.

Single page: 25.007 us per call over 532,571 calls
Bulk list:6.258 us per call over 517,034 calls
Bulk array:   4.590 us per call over 517,442 calls

This patch (of 2)

Refactor:

I'm about to use the loop variable @i for something else.

As far as the "i++" is concerned, that is a post-increment. The
value of @i is not used subsequently, so the increment operator
is unnecessary and can be removed.

Also note that nfsd_read_actor() was renamed nfsd_splice_actor()
by commit cf8208d0eabd ("sendfile: convert nfsd to
splice_direct_to_actor()").

Signed-off-by: Chuck Lever 
Signed-off-by: Mel Gorman 
---
 net/sunrpc/svc_xprt.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 3cdd71a8df1e..609bda97d4ae 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -642,7 +642,7 @@ static void svc_check_conn_limits(struct svc_serv *serv)
 static int svc_alloc_arg(struct svc_rqst *rqstp)
 {
struct svc_serv *serv = rqstp->rq_server;
-   struct xdr_buf *arg;
+   struct xdr_buf *arg = &rqstp->rq_arg;
int pages;
int i;
 
@@ -667,11 +667,10 @@ static int svc_alloc_arg(struct svc_rqst *rqstp)
}
rqstp->rq_pages[i] = p;
}
-   rqstp->rq_page_end = &rqstp->rq_pages[i];
-   rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
+   rqstp->rq_page_end = &rqstp->rq_pages[pages];
+   rqstp->rq_pages[pages] = NULL; /* this might be seen in 
nfsd_splice_actor() */
 
/* Make arg->head point to first page and arg->pages point to rest */
-   arg = &rqstp->rq_arg;
arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
arg->head[0].iov_len = PAGE_SIZE;
arg->pages = rqstp->rq_pages + 1;
-- 
2.26.2



[PATCH 7/9] SUNRPC: Refresh rq_pages using a bulk page allocator

2021-03-25 Thread Mel Gorman
From: Chuck Lever 

Reduce the rate at which nfsd threads hammer on the page allocator.
This improves throughput scalability by enabling the threads to run
more independently of each other.

[mgorman: Update interpretation of alloc_pages_bulk return value]
Signed-off-by: Chuck Lever 
Signed-off-by: Mel Gorman 
---
 net/sunrpc/svc_xprt.c | 31 +++
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 609bda97d4ae..0c27c3291ca1 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -643,30 +643,29 @@ static int svc_alloc_arg(struct svc_rqst *rqstp)
 {
struct svc_serv *serv = rqstp->rq_server;
struct xdr_buf *arg = &rqstp->rq_arg;
-   int pages;
-   int i;
+   unsigned long pages, filled;
 
-   /* now allocate needed pages.  If we get a failure, sleep briefly */
pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT;
if (pages > RPCSVC_MAXPAGES) {
-   pr_warn_once("svc: warning: pages=%u > RPCSVC_MAXPAGES=%lu\n",
+   pr_warn_once("svc: warning: pages=%lu > RPCSVC_MAXPAGES=%lu\n",
 pages, RPCSVC_MAXPAGES);
/* use as many pages as possible */
pages = RPCSVC_MAXPAGES;
}
-   for (i = 0; i < pages ; i++)
-   while (rqstp->rq_pages[i] == NULL) {
-   struct page *p = alloc_page(GFP_KERNEL);
-   if (!p) {
-   set_current_state(TASK_INTERRUPTIBLE);
-   if (signalled() || kthread_should_stop()) {
-   set_current_state(TASK_RUNNING);
-   return -EINTR;
-   }
-   schedule_timeout(msecs_to_jiffies(500));
-   }
-   rqstp->rq_pages[i] = p;
+
+   for (;;) {
+   filled = alloc_pages_bulk_array(GFP_KERNEL, pages,
+   rqstp->rq_pages);
+   if (filled == pages)
+   break;
+
+   set_current_state(TASK_INTERRUPTIBLE);
+   if (signalled() || kthread_should_stop()) {
+   set_current_state(TASK_RUNNING);
+   return -EINTR;
}
+   schedule_timeout(msecs_to_jiffies(500));
+   }
rqstp->rq_page_end = &rqstp->rq_pages[pages];
rqstp->rq_pages[pages] = NULL; /* this might be seen in 
nfsd_splice_actor() */
 
-- 
2.26.2



[PATCH 5/9] mm/page_alloc: inline __rmqueue_pcplist

2021-03-25 Thread Mel Gorman
From: Jesper Dangaard Brouer 

When __alloc_pages_bulk() was introduced, two callers of __rmqueue_pcplist
existed and the compiler chose not to inline this function.

 ./scripts/bloat-o-meter vmlinux-before vmlinux-inline__rmqueue_pcplist
add/remove: 0/1 grow/shrink: 2/0 up/down: 164/-125 (39)
Function old new   delta
rmqueue 21972296 +99
__alloc_pages_bulk  19211986 +65
__rmqueue_pcplist125   --125
Total: Before=19374127, After=19374166, chg +0.00%

modprobe page_bench04_bulk loops=$((10**7))

Type:time_bulk_page_alloc_free_array
 -  Per elem: 106 cycles(tsc) 29.595 ns (step:64)
 - (measurement period time:0.295955434 sec time_interval:295955434)
 - (invoke count:1000 tsc_interval:1065447105)

Before:
 - Per elem: 110 cycles(tsc) 30.633 ns (step:64)

Signed-off-by: Jesper Dangaard Brouer 
Signed-off-by: Mel Gorman 
---
 mm/page_alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1ec18121268b..d900e92884b2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3415,7 +3415,8 @@ static inline void zone_statistics(struct zone 
*preferred_zone, struct zone *z)
 }
 
 /* Remove page from the per-cpu list, caller must protect the list */
-static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+static inline
+struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
-- 
2.26.2



[PATCH 4/9] mm/page_alloc: optimize code layout for __alloc_pages_bulk

2021-03-25 Thread Mel Gorman
From: Jesper Dangaard Brouer 

Looking at perf-report and ASM-code for __alloc_pages_bulk() it is clear
that the generated code layout is suboptimal. The compiler guesses wrong and
places unlikely code at the beginning. Due to the use of the WARN_ON_ONCE()
macro, a UD2 asm instruction is added to the code, which confuses the
I-cache prefetcher in the CPU.

[mgorman: Minor changes and rebasing]
Signed-off-by: Jesper Dangaard Brouer 
Signed-off-by: Mel Gorman 
---
 mm/page_alloc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index be1e33a4df39..1ec18121268b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5001,7 +5001,7 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
unsigned int alloc_flags;
int nr_populated = 0;
 
-   if (WARN_ON_ONCE(nr_pages <= 0))
+   if (unlikely(nr_pages <= 0))
return 0;
 
/*
@@ -5048,7 +5048,7 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 * If there are no allowed local zones that meets the watermarks then
 * try to allocate a single page and reclaim if necessary.
 */
-   if (!zone)
+   if (unlikely(!zone))
goto failed;
 
/* Attempt the batch allocation */
@@ -5066,7 +5066,7 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 
page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
pcp, pcp_list);
-   if (!page) {
+   if (unlikely(!page)) {
/* Try and get at least one page */
if (!nr_populated)
goto failed_irq;
-- 
2.26.2



[PATCH 3/9] mm/page_alloc: Add an array-based interface to the bulk page allocator

2021-03-25 Thread Mel Gorman
The proposed callers for the bulk allocator store pages from the bulk
allocator in an array. This patch adds an array-based interface to the API
to avoid multiple list iterations. The page list interface is preserved
to avoid requiring all users of the bulk API to allocate and manage enough
storage to store the pages.
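
As an illustrative caller (not from the patch), the array doubles as state:
slots that already hold a page are skipped and only NULL entries are filled,
so a cache can be topped up in place:

    /* Sketch: refill only the empty slots of a hypothetical page cache */
    #define CACHE_SIZE 32                       /* hypothetical size */
    static struct page *page_cache[CACHE_SIZE]; /* hypothetical array */

    static unsigned long refill_cache(gfp_t gfp)
    {
        /*
         * Returns the number of populated slots in the array, which
         * includes pages that were already there before the call.
         */
        return alloc_pages_bulk_array(gfp, CACHE_SIZE, page_cache);
    }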

Signed-off-by: Mel Gorman 
---
 include/linux/gfp.h | 13 +++---
 mm/page_alloc.c | 60 +
 2 files changed, 54 insertions(+), 19 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 4a304fd39916..fb6234e1fe59 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -520,13 +520,20 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, 
int preferred_nid,
 
 int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
-   struct list_head *list);
+   struct list_head *page_list,
+   struct page **page_array);
 
 /* Bulk allocate order-0 pages */
 static inline unsigned long
-alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
+alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head 
*list)
 {
-   return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list);
+   return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, 
NULL);
+}
+
+static inline unsigned long
+alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page 
**page_array)
+{
+   return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, 
page_array);
 }
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eb547470a7e4..be1e33a4df39 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4966,21 +4966,29 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, 
unsigned int order,
 }
 
 /*
- * __alloc_pages_bulk - Allocate a number of order-0 pages to a list
+ * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
  * @gfp: GFP flags for the allocation
  * @preferred_nid: The preferred NUMA node ID to allocate from
  * @nodemask: Set of nodes to allocate from, may be NULL
- * @nr_pages: The number of pages desired on the list
- * @page_list: List to store the allocated pages
+ * @nr_pages: The number of pages desired on the list or array
+ * @page_list: Optional list to store the allocated pages
+ * @page_array: Optional array to store the pages
  *
  * This is a batched version of the page allocator that attempts to
- * allocate nr_pages quickly and add them to a list.
+ * allocate nr_pages quickly. Pages are added to page_list if page_list
+ * is not NULL, otherwise it is assumed that the page_array is valid.
  *
- * Returns the number of pages on the list.
+ * For lists, nr_pages is the number of pages that should be allocated.
+ *
+ * For arrays, only NULL elements are populated with pages and nr_pages
+ * is the maximum number of pages that will be stored in the array.
+ *
+ * Returns the number of pages on the list or array.
  */
 int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
-   struct list_head *page_list)
+   struct list_head *page_list,
+   struct page **page_array)
 {
struct page *page;
unsigned long flags;
@@ -4991,13 +4999,20 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
struct alloc_context ac;
gfp_t alloc_gfp;
unsigned int alloc_flags;
-   int allocated = 0;
+   int nr_populated = 0;
 
if (WARN_ON_ONCE(nr_pages <= 0))
return 0;
 
+   /*
+* Skip populated array elements to determine if any pages need
+* to be allocated before disabling IRQs.
+*/
+   while (page_array && page_array[nr_populated] && nr_populated < 
nr_pages)
+   nr_populated++;
+
/* Use the single page allocator for one page. */
-   if (nr_pages == 1)
+   if (nr_pages - nr_populated == 1)
goto failed;
 
/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
@@ -5041,12 +5056,19 @@ int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
pcp = &this_cpu_ptr(zone->pageset)->pcp;
pcp_list = &pcp->lists[ac.migratetype];
 
-   while (allocated < nr_pages) {
+   while (nr_populated < nr_pages) {
+
+   /* Skip existing pages */
+   if (page_array && page_array[nr_populated]) {
+   nr_populated++;
+   continue;
+   }
+
page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
pcp, pcp_list);
if (!page) {
/* Try and get at least one page */
-   if (!allocat

[PATCH 2/9] mm/page_alloc: Add a bulk page allocator

2021-03-25 Thread Mel Gorman
This patch adds a new page allocator interface via alloc_pages_bulk,
and __alloc_pages_bulk_nodemask. A caller requests a number of pages
to be allocated and added to a list.

The API is not guaranteed to return the requested number of pages and
may fail if the preferred allocation zone has limited free memory, the
cpuset changes during the allocation or page debugging decides to fail
an allocation. It's up to the caller to request more pages in batch
if necessary.
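
For illustration, a caller that must have the full batch would retry in a
loop, much as the SUNRPC consumer later in the series does (sketch only):

    /* Sketch: retry until the whole batch has been allocated */
    static void fill_page_list(unsigned long want, struct list_head *list)
    {
        unsigned long got = 0;

        while (got < want) {
            /* Returns how many pages this call managed to add to the list */
            got += alloc_pages_bulk(GFP_KERNEL, want - got, list);

            if (got < want) /* partial batch; back off briefly and retry */
                schedule_timeout_uninterruptible(msecs_to_jiffies(100));
        }
    }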

Note that this implementation is not very efficient and could be improved
but it would require refactoring. The intent is to make it available early
to determine what semantics are required by different callers. Once the
full semantics are nailed down, it can be refactored.

Signed-off-by: Mel Gorman 
Acked-by: Vlastimil Babka 
---
 include/linux/gfp.h |  11 +
 mm/page_alloc.c | 118 
 2 files changed, 129 insertions(+)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0a88f84b08f4..4a304fd39916 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -518,6 +518,17 @@ static inline int arch_make_page_accessible(struct page 
*page)
 struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask);
 
+int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+   nodemask_t *nodemask, int nr_pages,
+   struct list_head *list);
+
+/* Bulk allocate order-0 pages */
+static inline unsigned long
+alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
+{
+   return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list);
+}
+
 /*
  * Allocate pages, preferring the node given as nid. The node must be valid and
  * online. For more general interface, see alloc_pages_node().
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8a3e13277e22..eb547470a7e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4965,6 +4965,124 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, 
unsigned int order,
return true;
 }
 
+/*
+ * __alloc_pages_bulk - Allocate a number of order-0 pages to a list
+ * @gfp: GFP flags for the allocation
+ * @preferred_nid: The preferred NUMA node ID to allocate from
+ * @nodemask: Set of nodes to allocate from, may be NULL
+ * @nr_pages: The number of pages desired on the list
+ * @page_list: List to store the allocated pages
+ *
+ * This is a batched version of the page allocator that attempts to
+ * allocate nr_pages quickly and add them to a list.
+ *
+ * Returns the number of pages on the list.
+ */
+int __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+   nodemask_t *nodemask, int nr_pages,
+   struct list_head *page_list)
+{
+   struct page *page;
+   unsigned long flags;
+   struct zone *zone;
+   struct zoneref *z;
+   struct per_cpu_pages *pcp;
+   struct list_head *pcp_list;
+   struct alloc_context ac;
+   gfp_t alloc_gfp;
+   unsigned int alloc_flags;
+   int allocated = 0;
+
+   if (WARN_ON_ONCE(nr_pages <= 0))
+   return 0;
+
+   /* Use the single page allocator for one page. */
+   if (nr_pages == 1)
+   goto failed;
+
+   /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
+   gfp &= gfp_allowed_mask;
+   alloc_gfp = gfp;
+   if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
+   return 0;
+   gfp = alloc_gfp;
+
+   /* Find an allowed local zone that meets the high watermark. */
+   for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, 
ac.highest_zoneidx, ac.nodemask) {
+   unsigned long mark;
+
+   if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
+   !__cpuset_zone_allowed(zone, gfp)) {
+   continue;
+   }
+
+   if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
+   zone_to_nid(zone) != 
zone_to_nid(ac.preferred_zoneref->zone)) {
+   goto failed;
+   }
+
+   mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + 
nr_pages;
+   if (zone_watermark_fast(zone, 0,  mark,
+   zonelist_zone_idx(ac.preferred_zoneref),
+   alloc_flags, gfp)) {
+   break;
+   }
+   }
+
+   /*
+* If there are no allowed local zones that meets the watermarks then
+* try to allocate a single page and reclaim if necessary.
+*/
+   if (!zone)
+   goto failed;
+
+   /* Attempt the batch allocation */
+   local_irq_save(flags);
+   pcp = &this_cpu_ptr(zone->pageset)->pcp;
+   pcp_list = &pcp->lists[ac.migratetype];
+
+   while (allocated < nr_pages) {
+ 

  1   2   3   4   5   6   7   8   9   10   >