Now that memmap_init_zone() knows how to split the init work into
multiple threads, allow the tracking of those threads to be handled
via a passed-in 'struct memmap_async_state' instance.

This infrastructure allows devm_memremap_pages() users, like the pmem
driver, to track memmap initialization in the background and to call
memmap_sync() when performing an operation that may result in a
pfn_to_page(), such as dax-mapping a pfn into userspace.

The approach mirrors what is done for background memmap initialization
of regular memory: waiting for initialization to complete is deferred
until the first userspace consumer arrives.

Cc: Michal Hocko <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: "Jérôme Glisse" <[email protected]>
Cc: Logan Gunthorpe <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Andrew Morton <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
---
 include/linux/memmap_async.h |   10 ++++
 include/linux/memremap.h     |   29 ++++++++++++
 kernel/memremap.c            |   65 ++++++++++++++++-----------
 mm/page_alloc.c              |  102 +++++++++++++++++++++++++++++++++++++-----
 4 files changed, 169 insertions(+), 37 deletions(-)

diff --git a/include/linux/memmap_async.h b/include/linux/memmap_async.h
index d2011681a910..4633eca9290e 100644
--- a/include/linux/memmap_async.h
+++ b/include/linux/memmap_async.h
@@ -3,6 +3,9 @@
 #define __LINUX_MEMMAP_ASYNC_H
 #include <linux/async.h>
 #include <linux/ioport.h>
+#include <linux/async.h>
+#include <linux/pfn_t.h>
+#include <linux/radix-tree.h>
 
 struct dev_pagemap;
 struct vmem_altmap;
@@ -32,14 +35,21 @@ struct memmap_init_memmap {
 };
 
 struct memmap_init_pages {
+       int id;
        struct resource res;
+       async_cookie_t cookie;
        struct memmap_init_env *env;
 };
 
 struct memmap_async_state {
        struct memmap_init_env env;
        struct memmap_init_memmap memmap;
+       struct memmap_init_pages page_init[NR_MEMMAP_THREADS];
+       unsigned long active[BITS_TO_LONGS(NR_MEMMAP_THREADS)];
+       struct radix_tree_root pfn_to_thread;
 };
 
 extern struct async_domain memmap_init_domain;
+extern void memmap_sync(pfn_t pfn, unsigned long nr_pages,
+               struct memmap_async_state *async);
 #endif /* __LINUX_MEMMAP_ASYNC_H */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index bfdc7363b13b..a2313fadd686 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_MEMREMAP_H_
 #define _LINUX_MEMREMAP_H_
+#include <linux/pfn.h>
 #include <linux/ioport.h>
 #include <linux/percpu-refcount.h>
 
@@ -101,6 +102,7 @@ typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
                                pmd_t *pmdp);
 typedef void (*dev_page_free_t)(struct page *page, void *data);
 
+struct memmap_async_state;
 /**
  * struct dev_pagemap - metadata for ZONE_DEVICE mappings
  * @page_fault: callback when CPU fault on an unaddressable device page
@@ -112,6 +114,7 @@ typedef void (*dev_page_free_t)(struct page *page, void *data);
  * @dev: host device of the mapping for debug
  * @data: private data pointer for page_free()
  * @type: memory type: see MEMORY_* in memory_hotplug.h
+ * @async: async memmap init context
  */
 struct dev_pagemap {
        dev_page_fault_t page_fault;
@@ -124,8 +127,34 @@ struct dev_pagemap {
        struct device *dev;
        void *data;
        enum memory_type type;
+       struct memmap_async_state *async;
 };
 
+static inline unsigned long order_at(struct resource *res, unsigned long pgoff)
+{
+       unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
+       unsigned long nr_pages, mask;
+
+       nr_pages = PHYS_PFN(resource_size(res));
+       if (nr_pages == pgoff)
+               return ULONG_MAX;
+
+       /*
+        * What is the largest aligned power-of-2 range available from
+        * this resource pgoff to the end of the resource range,
+        * considering the alignment of the current pgoff?
+        */
+       mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
+       if (!mask)
+               return ULONG_MAX;
+
+       return find_first_bit(&mask, BITS_PER_LONG);
+}
+
+#define foreach_order_pgoff(res, order, pgoff) \
+       for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
+                       pgoff += 1UL << order, order = order_at((res), pgoff))
+
 #ifdef CONFIG_ZONE_DEVICE
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap,
                void (*kill)(struct percpu_ref *));
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 85e4a7c576b2..18719a596be5 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -7,6 +7,7 @@
 #include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/memory_hotplug.h>
+#include <linux/memmap_async.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/wait_bit.h>
@@ -16,31 +17,6 @@ static RADIX_TREE(pgmap_radix, GFP_KERNEL);
 #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
 #define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
 
-static unsigned long order_at(struct resource *res, unsigned long pgoff)
-{
-       unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
-       unsigned long nr_pages, mask;
-
-       nr_pages = PHYS_PFN(resource_size(res));
-       if (nr_pages == pgoff)
-               return ULONG_MAX;
-
-       /*
-        * What is the largest aligned power-of-2 range available from
-        * this resource pgoff to the end of the resource range,
-        * considering the alignment of the current pgoff?
-        */
-       mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
-       if (!mask)
-               return ULONG_MAX;
-
-       return find_first_bit(&mask, BITS_PER_LONG);
-}
-
-#define foreach_order_pgoff(res, order, pgoff) \
-       for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
-                       pgoff += 1UL << order, order = order_at((res), pgoff))
-
 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
 int device_private_entry_fault(struct vm_area_struct *vma,
                       unsigned long addr,
@@ -113,15 +89,46 @@ static unsigned long pfn_next(unsigned long pfn)
 #define for_each_device_pfn(pfn, map) \
        for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
 
+static void kill_memmap_async(struct memmap_async_state *async)
+{
+       struct radix_tree_iter iter;
+       void *slot;
+       int i;
+
+       if (!async)
+               return;
+
+       for (i = 0; i < NR_MEMMAP_THREADS; i++) {
+               async_cookie_t cookie;
+
+               if (!test_bit(i, async->active))
+                       continue;
+
+               cookie = async->page_init[i].cookie;
+               async_synchronize_cookie_domain(cookie+1, &memmap_init_domain);
+       }
+       radix_tree_for_each_slot(slot, &async->pfn_to_thread, &iter, 0)
+               radix_tree_delete(&async->pfn_to_thread, iter.index);
+}
+
 static void devm_memremap_pages_release(void *data)
 {
        struct dev_pagemap *pgmap = data;
        struct device *dev = pgmap->dev;
        struct resource *res = &pgmap->res;
        resource_size_t align_start, align_size;
+       struct memmap_async_state *async = pgmap->async;
        unsigned long pfn;
 
+       /*
+        * Once the pgmap is killed pgmap owners must disallow new
+        * direct_access / page mapping requests. I.e. memmap_sync()
+        * users must not race the teardown of the async->pfn_to_thread
+        * radix.
+        */
        pgmap->kill(pgmap->ref);
+       kill_memmap_async(async);
+
        for_each_device_pfn(pfn, pgmap)
                put_page(pfn_to_page(pfn));
 
@@ -240,7 +247,13 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap,
                struct zone *zone;
 
                error = arch_add_memory(nid, align_start, align_size, altmap,
-                               false, NULL);
+                               false, pgmap->async);
+               if (error == -EWOULDBLOCK) {
+                       /* fall back to synchronous */
+                       pgmap->async = NULL;
+                       error = arch_add_memory(nid, align_start, align_size,
+                                       altmap, false, NULL);
+               }
                zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
                if (!error)
                        move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d0ed17cf305..d1466dd82bc2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,6 +68,7 @@
 #include <linux/ftrace.h>
 #include <linux/lockdep.h>
 #include <linux/async.h>
+#include <linux/pfn_t.h>
 #include <linux/nmi.h>
 
 #include <asm/sections.h>
@@ -5510,12 +5511,80 @@ static void __ref memmap_init_async(void *data, async_cookie_t cookie)
 {
        struct memmap_init_pages *args = data;
        struct memmap_init_env *env = args->env;
+       struct dev_pagemap *pgmap = env->pgmap;
+       struct memmap_async_state *async = pgmap ? pgmap->async : NULL;
        struct resource *res = &args->res;
        unsigned long pfn;
 
+       if (async)
+               async_synchronize_cookie_domain(async->memmap.cookie+1,
+                               &memmap_init_domain);
+
        for (pfn = PHYS_PFN(res->start); pfn < PHYS_PFN(res->end+1); pfn++)
                memmap_init_one(pfn, env->zone, env->nid, env->context,
-                               env->pgmap);
+                               pgmap);
+       if (async)
+               clear_bit(args->id, async->active);
+}
+
+void memmap_sync(pfn_t pfn, unsigned long nr_pages,
+               struct memmap_async_state *async)
+{
+       struct memmap_init_pages *args, *start, *end;
+       unsigned long raw_pfn = pfn_t_to_pfn(pfn);
+
+       if (!async || !pfn_t_has_page(pfn)
+                       || !bitmap_weight(async->active, NR_MEMMAP_THREADS))
+               return;
+
+       start = radix_tree_lookup(&async->pfn_to_thread, raw_pfn);
+       end = radix_tree_lookup(&async->pfn_to_thread, raw_pfn + nr_pages - 1);
+       if (!start || !end) {
+               WARN_ON_ONCE(1);
+               return;
+       }
+
+       for (args = start; args <= end; args++) {
+               int id = args - &async->page_init[0];
+
+               async_synchronize_cookie_domain(args->cookie+1,
+                               &memmap_init_domain);
+               pr_debug("%s: pfn: %#lx nr: %ld thread: %d\n",
+                               __func__, raw_pfn, nr_pages, id);
+       }
+}
+EXPORT_SYMBOL_GPL(memmap_sync);
+
+static bool run_memmap_init(struct memmap_init_pages *thread,
+               struct memmap_async_state *async, struct async_domain *domain)
+{
+       struct resource *res = &thread->res;
+       unsigned long pgoff;
+       int order;
+
+       if (!async) {
+               async_schedule_domain(memmap_init_async, thread, domain);
+               return false;
+       }
+
+       thread->cookie = async_schedule_domain(memmap_init_async,
+                       thread, domain);
+       set_bit(thread->id, async->active);
+       foreach_order_pgoff(res, order, pgoff) {
+               int rc = __radix_tree_insert(&async->pfn_to_thread,
+                               PHYS_PFN(res->start) + pgoff, order, thread);
+               if (rc) {
+                       /*
+                        * Mark all threads inactive, and by returning
+                        * false we'll sync all threads before returning
+                        * from memmap_init_zone().
+                        */
+                       memset(async->active, 0, sizeof(unsigned long)
+                                       * BITS_TO_LONGS(NR_MEMMAP_THREADS));
+                       return false;
+               }
+       }
+       return true;
 }
 
 /*
@@ -5554,33 +5623,44 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                 * function.  They do not exist on hotplugged memory.
                 */
                ASYNC_DOMAIN_EXCLUSIVE(local);
-               struct memmap_init_pages args[NR_MEMMAP_THREADS];
-               struct memmap_init_env env = {
-                       .nid = nid,
-                       .zone = zone,
-                       .pgmap = pgmap,
-                       .context = context,
-               };
+               struct memmap_async_state *async = pgmap ? pgmap->async : NULL;
+               struct memmap_init_pages _args[NR_MEMMAP_THREADS];
+               struct memmap_init_pages *args = async ? async->page_init : _args;
+               struct async_domain *domain;
+               struct memmap_init_env _env;
+               struct memmap_init_env *env = async ? &async->env : &_env;
                unsigned long step, rem;
+               bool sync = !async;
                int i;
 
+               domain = async ? &memmap_init_domain : &local;
+               env->pgmap = pgmap;
+               env->nid = nid;
+               env->zone = zone;
+               env->context = context;
+
                size = end_pfn - start_pfn;
                step = size / NR_MEMMAP_THREADS;
                rem = size % NR_MEMMAP_THREADS;
+               if (async)
+                       INIT_RADIX_TREE(&async->pfn_to_thread, GFP_KERNEL);
                for (i = 0; i < NR_MEMMAP_THREADS; i++) {
                        struct memmap_init_pages *t = &args[i];
 
-                       t->env = &env;
+                       t->id = i;
+                       t->env = env;
                        t->res.start = PFN_PHYS(start_pfn);
                        t->res.end = PFN_PHYS(start_pfn + step) - 1;
                        if (i == NR_MEMMAP_THREADS-1)
                                t->res.end += PFN_PHYS(rem);
 
-                       async_schedule_domain(memmap_init_async, t, &local);
+                       if (!run_memmap_init(t, async, domain))
+                               sync = true;
 
                        start_pfn += step;
                }
-               async_synchronize_full_domain(&local);
+               if (sync)
+                       async_synchronize_full_domain(domain);
                return;
        }
 
