The current memory_failure_dev_pagemap() can only handle single-mapped
dax page for fsdax mode.  The dax page could be mapped by multiple files
and offsets if we let reflink feature & fsdax mode work together.  So,
we refactor current implementation to support handle memory failure on
each file and offset.

Signed-off-by: Shiyang Ruan <ruansy.f...@cn.fujitsu.com>
---
 fs/dax.c            | 21 ++++++++++
 include/linux/dax.h |  1 +
 include/linux/mm.h  |  9 +++++
 mm/memory-failure.c | 98 ++++++++++++++++++++++++++++++++++-----------
 4 files changed, 105 insertions(+), 24 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 26d5dcd2d69e..c64c3a0e76a6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -378,6 +378,27 @@ static struct page *dax_busy_page(void *entry)
        return NULL;
 }
 
+/*
+ * dax_load_pfn - Load pfn of the DAX entry corresponding to a page
+ * @mapping: The file whose entry we want to load
+ * @index:   The offset where the DAX entry located in
+ *
+ * Return:   pfn of the DAX entry
+ */
+unsigned long dax_load_pfn(struct address_space *mapping, unsigned long index)
+{
+       XA_STATE(xas, &mapping->i_pages, index);
+       void *entry;
+       unsigned long pfn;
+
+       xas_lock_irq(&xas);
+       entry = xas_load(&xas);
+       pfn = dax_to_pfn(entry);
+       xas_unlock_irq(&xas);
+
+       return pfn;
+}
+
 /*
  * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
  * @page: The page whose entry we want to lock
diff --git a/include/linux/dax.h b/include/linux/dax.h
index b52f084aa643..89e56ceeffc7 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -150,6 +150,7 @@ int dax_writeback_mapping_range(struct address_space 
*mapping,
 
 struct page *dax_layout_busy_page(struct address_space *mapping);
 struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t 
start, loff_t end);
+unsigned long dax_load_pfn(struct address_space *mapping, unsigned long index);
 dax_entry_t dax_lock_page(struct page *page);
 void dax_unlock_page(struct page *page, dax_entry_t cookie);
 #else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecdf8a8cd6ae..ab52bc633d84 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1157,6 +1157,14 @@ static inline bool is_device_private_page(const struct 
page *page)
                page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
 
+static inline bool is_device_fsdax_page(const struct page *page)
+{
+       return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+               IS_ENABLED(CONFIG_FS_DAX) &&
+               is_zone_device_page(page) &&
+               page->pgmap->type == MEMORY_DEVICE_FS_DAX;
+}
+
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
        return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
@@ -3045,6 +3053,7 @@ enum mf_flags {
        MF_MUST_KILL = 1 << 2,
        MF_SOFT_OFFLINE = 1 << 3,
 };
+extern int mf_dax_mapping_kill_procs(struct address_space *mapping, pgoff_t 
index, int flags);
 extern int memory_failure(unsigned long pfn, int flags);
 extern void memory_failure_queue(unsigned long pfn, int flags);
 extern void memory_failure_queue_kick(int cpu);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e9481632fcd1..158fe0c8e602 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -56,6 +56,7 @@
 #include <linux/kfifo.h>
 #include <linux/ratelimit.h>
 #include <linux/page-isolation.h>
+#include <linux/dax.h>
 #include "internal.h"
 #include "ras/ras_event.h"
 
@@ -120,6 +121,13 @@ static int hwpoison_filter_dev(struct page *p)
        if (PageSlab(p))
                return -EINVAL;
 
+       if (pfn_valid(page_to_pfn(p))) {
+               if (is_device_fsdax_page(p))
+                       return 0;
+               else
+                       return -EINVAL;
+       }
+
        mapping = page_mapping(p);
        if (mapping == NULL || mapping->host == NULL)
                return -EINVAL;
@@ -286,10 +294,9 @@ void shake_page(struct page *p, int access)
 }
 EXPORT_SYMBOL_GPL(shake_page);
 
-static unsigned long dev_pagemap_mapping_shift(struct page *page,
-               struct vm_area_struct *vma)
+static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
+                                              unsigned long address)
 {
-       unsigned long address = vma_address(page, vma);
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
@@ -329,9 +336,8 @@ static unsigned long dev_pagemap_mapping_shift(struct page 
*page,
  * Schedule a process for later kill.
  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
  */
-static void add_to_kill(struct task_struct *tsk, struct page *p,
-                      struct vm_area_struct *vma,
-                      struct list_head *to_kill)
+static void add_to_kill(struct task_struct *tsk, struct page *p, pgoff_t pgoff,
+                       struct vm_area_struct *vma, struct list_head *to_kill)
 {
        struct to_kill *tk;
 
@@ -342,9 +348,12 @@ static void add_to_kill(struct task_struct *tsk, struct 
page *p,
        }
 
        tk->addr = page_address_in_vma(p, vma);
-       if (is_zone_device_page(p))
-               tk->size_shift = dev_pagemap_mapping_shift(p, vma);
-       else
+       if (is_zone_device_page(p)) {
+               if (is_device_fsdax_page(p))
+                       tk->addr = vma->vm_start +
+                                       ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+               tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
+       } else
                tk->size_shift = page_shift(compound_head(p));
 
        /*
@@ -492,7 +501,7 @@ static void collect_procs_anon(struct page *page, struct 
list_head *to_kill,
                        if (!page_mapped_in_vma(page, vma))
                                continue;
                        if (vma->vm_mm == t->mm)
-                               add_to_kill(t, page, vma, to_kill);
+                               add_to_kill(t, page, 0, vma, to_kill);
                }
        }
        read_unlock(&tasklist_lock);
@@ -502,24 +511,19 @@ static void collect_procs_anon(struct page *page, struct 
list_head *to_kill,
 /*
  * Collect processes when the error hit a file mapped page.
  */
-static void collect_procs_file(struct page *page, struct list_head *to_kill,
-                               int force_early)
+static void collect_procs_file(struct page *page, struct address_space 
*mapping,
+               pgoff_t pgoff, struct list_head *to_kill, int force_early)
 {
        struct vm_area_struct *vma;
        struct task_struct *tsk;
-       struct address_space *mapping = page->mapping;
-       pgoff_t pgoff;
 
        i_mmap_lock_read(mapping);
        read_lock(&tasklist_lock);
-       pgoff = page_to_pgoff(page);
        for_each_process(tsk) {
                struct task_struct *t = task_early_kill(tsk, force_early);
-
                if (!t)
                        continue;
-               vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
-                                     pgoff) {
+               vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                        /*
                         * Send early kill signal to tasks where a vma covers
                         * the page but the corrupted page is not necessarily
@@ -528,7 +532,7 @@ static void collect_procs_file(struct page *page, struct 
list_head *to_kill,
                         * to be informed of all such data corruptions.
                         */
                        if (vma->vm_mm == t->mm)
-                               add_to_kill(t, page, vma, to_kill);
+                               add_to_kill(t, page, pgoff, vma, to_kill);
                }
        }
        read_unlock(&tasklist_lock);
@@ -547,7 +551,8 @@ static void collect_procs(struct page *page, struct 
list_head *tokill,
        if (PageAnon(page))
                collect_procs_anon(page, tokill, force_early);
        else
-               collect_procs_file(page, tokill, force_early);
+               collect_procs_file(page, page_mapping(page), 
page_to_pgoff(page),
+                                  tokill, force_early);
 }
 
 static const char *action_name[] = {
@@ -1214,6 +1219,50 @@ static int try_to_split_thp_page(struct page *page, 
const char *msg)
        return 0;
 }
 
+int mf_dax_mapping_kill_procs(struct address_space *mapping, pgoff_t index, 
int flags)
+{
+       const bool unmap_success = true;
+       unsigned long pfn, size = 0;
+       struct to_kill *tk;
+       LIST_HEAD(to_kill);
+       int rc = -EBUSY;
+       loff_t start;
+
+       /* load the pfn of the dax mapping file */
+       pfn = dax_load_pfn(mapping, index);
+       if (!pfn)
+               return rc;
+       /*
+        * Unlike System-RAM there is no possibility to swap in a
+        * different physical page at a given virtual address, so all
+        * userspace consumption of ZONE_DEVICE memory necessitates
+        * SIGBUS (i.e. MF_MUST_KILL)
+        */
+       flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+       collect_procs_file(pfn_to_page(pfn), mapping, index, &to_kill,
+                          flags & MF_ACTION_REQUIRED);
+
+       list_for_each_entry(tk, &to_kill, nd)
+               if (tk->size_shift)
+                       size = max(size, 1UL << tk->size_shift);
+       if (size) {
+               /*
+                * Unmap the largest mapping to avoid breaking up
+                * device-dax mappings which are constant size. The
+                * actual size of the mapping being torn down is
+                * communicated in siginfo, see kill_proc()
+                */
+               start = (index << PAGE_SHIFT) & ~(size - 1);
+               unmap_mapping_range(mapping, start, start + size, 0);
+       }
+
+       kill_procs(&to_kill, flags & MF_MUST_KILL, !unmap_success,
+                  pfn, flags);
+       rc = 0;
+       return rc;
+}
+EXPORT_SYMBOL_GPL(mf_dax_mapping_kill_procs);
+
 static int memory_failure_hugetlb(unsigned long pfn, int flags)
 {
        struct page *p = pfn_to_page(pfn);
@@ -1297,7 +1346,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, 
int flags,
        const bool unmap_success = true;
        unsigned long size = 0;
        struct to_kill *tk;
-       LIST_HEAD(tokill);
+       LIST_HEAD(to_kill);
        int rc = -EBUSY;
        loff_t start;
        dax_entry_t cookie;
@@ -1345,9 +1394,10 @@ static int memory_failure_dev_pagemap(unsigned long pfn, 
int flags,
         * SIGBUS (i.e. MF_MUST_KILL)
         */
        flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
-       collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+       collect_procs_file(page, page->mapping, page->index, &to_kill,
+                          flags & MF_ACTION_REQUIRED);
 
-       list_for_each_entry(tk, &tokill, nd)
+       list_for_each_entry(tk, &to_kill, nd)
                if (tk->size_shift)
                        size = max(size, 1UL << tk->size_shift);
        if (size) {
@@ -1360,7 +1410,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, 
int flags,
                start = (page->index << PAGE_SHIFT) & ~(size - 1);
                unmap_mapping_range(page->mapping, start, start + size, 0);
        }
-       kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
+       kill_procs(&to_kill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
        rc = 0;
 unlock:
        dax_unlock_page(page, cookie);
-- 
2.30.0


_______________________________________________
Linux-nvdimm mailing list -- linux-nvdimm@lists.01.org
To unsubscribe send an email to linux-nvdimm-le...@lists.01.org

Reply via email to