Hardware actually tells us the blast radius of the error, but we ignore
it and take out the entire page.  We've had a customer request to know
exactly how much of the page is damaged so they can avoid reconstructing
an entire 2MB page if only a single cacheline is damaged.

This is only a strawman that I did in an hour or two; I'd appreciate
architectural-level feedback.  Should I just convert memory_failure() to
always take an address & granularity?  Should I create a struct to pass
around (page, phys, granularity) instead of reconstructing the missing
pieces in half a dozen functions?  Is this functionality welcome at all,
or is the risk of upsetting applications which expect at least a page
of granularity too high?

I can see places where I've specified a plain PAGE_SHIFT insted of
interrogating a compound page for its size.  I'd probably split this
patch up into two or three pieces for applying.

I've also blindly taken out the call to unmap_mapping_range().  Again,
the customer requested that we not do this.  That deserves to be in its
own patch and properly justified.

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index ce9120c4f740..09978c5b9493 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -463,7 +463,7 @@ static void mce_irq_work_cb(struct irq_work *entry)
  * Check if the address reported by the CPU is in a format we can parse.
  * It would be possible to add code for most other cases, but all would
  * be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granuality for now.
+ * parser). So only support physical addresses up to page granularity for now.
  */
 int mce_usable_address(struct mce *m)
 {
@@ -570,7 +570,6 @@ static int uc_decode_notifier(struct notifier_block *nb, 
unsigned long val,
                              void *data)
 {
        struct mce *mce = (struct mce *)data;
-       unsigned long pfn;
 
        if (!mce || !mce_usable_address(mce))
                return NOTIFY_DONE;
@@ -579,9 +578,8 @@ static int uc_decode_notifier(struct notifier_block *nb, 
unsigned long val,
            mce->severity != MCE_DEFERRED_SEVERITY)
                return NOTIFY_DONE;
 
-       pfn = mce->addr >> PAGE_SHIFT;
-       if (!memory_failure(pfn, 0)) {
-               set_mce_nospec(pfn, whole_page(mce));
+       if (!__memory_failure(mce->addr, MCI_MISC_ADDR_LSB(mce->misc), 0)) {
+               set_mce_nospec(mce->addr >> PAGE_SHIFT, whole_page(mce));
                mce->kflags |= MCE_HANDLED_UC;
        }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dc7b87310c10..f2ea6491df28 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3035,7 +3035,14 @@ enum mf_flags {
        MF_MUST_KILL = 1 << 2,
        MF_SOFT_OFFLINE = 1 << 3,
 };
-extern int memory_failure(unsigned long pfn, int flags);
+
+int __memory_failure(unsigned long addr, unsigned int granularity, int flags);
+
+static inline int memory_failure(unsigned long pfn, int flags)
+{
+       return __memory_failure(pfn << PAGE_SHIFT, PAGE_SHIFT, flags);
+}
+
 extern void memory_failure_queue(unsigned long pfn, int flags);
 extern void memory_failure_queue_kick(int cpu);
 extern int unpoison_memory(unsigned long pfn);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 47b8ccb1fb9b..775b053dcd98 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -198,7 +198,7 @@ struct to_kill {
        struct list_head nd;
        struct task_struct *tsk;
        unsigned long addr;
-       short size_shift;
+       short granularity;
 };
 
 /*
@@ -209,7 +209,7 @@ struct to_kill {
 static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
 {
        struct task_struct *t = tk->tsk;
-       short addr_lsb = tk->size_shift;
+       short addr_lsb = tk->granularity;
        int ret = 0;
 
        pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware 
memory corruption\n",
@@ -262,40 +262,6 @@ void shake_page(struct page *p, int access)
 }
 EXPORT_SYMBOL_GPL(shake_page);
 
-static unsigned long dev_pagemap_mapping_shift(struct page *page,
-               struct vm_area_struct *vma)
-{
-       unsigned long address = vma_address(page, vma);
-       pgd_t *pgd;
-       p4d_t *p4d;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-
-       pgd = pgd_offset(vma->vm_mm, address);
-       if (!pgd_present(*pgd))
-               return 0;
-       p4d = p4d_offset(pgd, address);
-       if (!p4d_present(*p4d))
-               return 0;
-       pud = pud_offset(p4d, address);
-       if (!pud_present(*pud))
-               return 0;
-       if (pud_devmap(*pud))
-               return PUD_SHIFT;
-       pmd = pmd_offset(pud, address);
-       if (!pmd_present(*pmd))
-               return 0;
-       if (pmd_devmap(*pmd))
-               return PMD_SHIFT;
-       pte = pte_offset_map(pmd, address);
-       if (!pte_present(*pte))
-               return 0;
-       if (pte_devmap(*pte))
-               return PAGE_SHIFT;
-       return 0;
-}
-
 /*
  * Failure handling: if we can't find or can't kill a process there's
  * not much we can do. We just print a message and ignore otherwise.
@@ -306,8 +272,8 @@ static unsigned long dev_pagemap_mapping_shift(struct page 
*page,
  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
  */
 static void add_to_kill(struct task_struct *tsk, struct page *p,
-                      struct vm_area_struct *vma,
-                      struct list_head *to_kill)
+                       unsigned int offset, unsigned int granularity,
+                       struct vm_area_struct *vma, struct list_head *to_kill)
 {
        struct to_kill *tk;
 
@@ -318,27 +284,14 @@ static void add_to_kill(struct task_struct *tsk, struct 
page *p,
        }
 
        tk->addr = page_address_in_vma(p, vma);
-       if (is_zone_device_page(p))
-               tk->size_shift = dev_pagemap_mapping_shift(p, vma);
-       else
-               tk->size_shift = page_shift(compound_head(p));
+       tk->granularity = granularity;
 
-       /*
-        * Send SIGKILL if "tk->addr == -EFAULT". Also, as
-        * "tk->size_shift" is always non-zero for !is_zone_device_page(),
-        * so "tk->size_shift == 0" effectively checks no mapping on
-        * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
-        * to a process' address space, it's possible not all N VMAs
-        * contain mappings for the page, but at least one VMA does.
-        * Only deliver SIGBUS with payload derived from the VMA that
-        * has a mapping for the page.
-        */
+       /* Send SIGKILL if "tk->addr == -EFAULT". */
        if (tk->addr == -EFAULT) {
                pr_info("Memory failure: Unable to find user space address %lx 
in %s\n",
                        page_to_pfn(p), tsk->comm);
-       } else if (tk->size_shift == 0) {
-               kfree(tk);
-               return;
+       } else {
+               tk->addr += offset;
        }
 
        get_task_struct(tsk);
@@ -468,7 +421,8 @@ static void collect_procs_anon(struct page *page, struct 
list_head *to_kill,
                        if (!page_mapped_in_vma(page, vma))
                                continue;
                        if (vma->vm_mm == t->mm)
-                               add_to_kill(t, page, vma, to_kill);
+                               add_to_kill(t, page, 0, PAGE_SHIFT, vma,
+                                               to_kill);
                }
        }
        read_unlock(&tasklist_lock);
@@ -478,8 +432,8 @@ static void collect_procs_anon(struct page *page, struct 
list_head *to_kill,
 /*
  * Collect processes when the error hit a file mapped page.
  */
-static void collect_procs_file(struct page *page, struct list_head *to_kill,
-                               int force_early)
+static void collect_procs_file(struct page *page, unsigned int offset,
+               short granularity, struct list_head *to_kill, int force_early)
 {
        struct vm_area_struct *vma;
        struct task_struct *tsk;
@@ -503,7 +457,8 @@ static void collect_procs_file(struct page *page, struct 
list_head *to_kill,
                         * to be informed of all such data corruptions.
                         */
                        if (vma->vm_mm == t->mm)
-                               add_to_kill(t, page, vma, to_kill);
+                               add_to_kill(t, page, 0, PAGE_SHIFT, vma,
+                                               to_kill);
                }
        }
        read_unlock(&tasklist_lock);
@@ -513,8 +468,8 @@ static void collect_procs_file(struct page *page, struct 
list_head *to_kill,
 /*
  * Collect the processes who have the corrupted page mapped to kill.
  */
-static void collect_procs(struct page *page, struct list_head *tokill,
-                               int force_early)
+static void collect_procs(struct page *page, unsigned int offset,
+               short granularity, struct list_head *tokill, int force_early)
 {
        if (!page->mapping)
                return;
@@ -522,7 +477,8 @@ static void collect_procs(struct page *page, struct 
list_head *tokill,
        if (PageAnon(page))
                collect_procs_anon(page, tokill, force_early);
        else
-               collect_procs_file(page, tokill, force_early);
+               collect_procs_file(page, offset, granularity, tokill,
+                               force_early);
 }
 
 static const char *action_name[] = {
@@ -1026,7 +982,8 @@ static bool hwpoison_user_mappings(struct page *p, 
unsigned long pfn,
         * there's nothing that can be done.
         */
        if (kill)
-               collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
+               collect_procs(hpage, 0, PAGE_SHIFT, &tokill,
+                               flags & MF_ACTION_REQUIRED);
 
        if (!PageHuge(hpage)) {
                unmap_success = try_to_unmap(hpage, ttu);
@@ -1176,16 +1133,14 @@ static int memory_failure_hugetlb(unsigned long pfn, 
int flags)
        return res;
 }
 
-static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
-               struct dev_pagemap *pgmap)
+static int memory_failure_dev_pagemap(unsigned long addr, short granularity,
+               int flags, struct dev_pagemap *pgmap)
 {
+       unsigned long pfn = addr >> PAGE_SHIFT;
        struct page *page = pfn_to_page(pfn);
        const bool unmap_success = true;
-       unsigned long size = 0;
-       struct to_kill *tk;
        LIST_HEAD(tokill);
        int rc = -EBUSY;
-       loff_t start;
        dax_entry_t cookie;
 
        /*
@@ -1225,21 +1180,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, 
int flags,
         * SIGBUS (i.e. MF_MUST_KILL)
         */
        flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
-       collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
-
-       list_for_each_entry(tk, &tokill, nd)
-               if (tk->size_shift)
-                       size = max(size, 1UL << tk->size_shift);
-       if (size) {
-               /*
-                * Unmap the largest mapping to avoid breaking up
-                * device-dax mappings which are constant size. The
-                * actual size of the mapping being torn down is
-                * communicated in siginfo, see kill_proc()
-                */
-               start = (page->index << PAGE_SHIFT) & ~(size - 1);
-               unmap_mapping_range(page->mapping, start, start + size, 0);
-       }
+       collect_procs(page, offset_in_page(addr), granularity, &tokill,
+                       flags & MF_ACTION_REQUIRED);
        kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
        rc = 0;
 unlock:
@@ -1252,8 +1194,9 @@ static int memory_failure_dev_pagemap(unsigned long pfn, 
int flags,
 }
 
 /**
- * memory_failure - Handle memory failure of a page.
- * @pfn: Page Number of the corrupted page
+ * __memory_failure - Handle memory failure.
+ * @addr: Physical address of the error.
+ * @granularity: Number of bits in the address which are bad.
  * @flags: fine tune action taken
  *
  * This function is called by the low level machine check code
@@ -1268,8 +1211,9 @@ static int memory_failure_dev_pagemap(unsigned long pfn, 
int flags,
  * Must run in process context (e.g. a work queue) with interrupts
  * enabled and no spinlocks hold.
  */
-int memory_failure(unsigned long pfn, int flags)
+int __memory_failure(unsigned long addr, unsigned int granularity, int flags)
 {
+       unsigned long pfn = addr >> PAGE_SHIFT;
        struct page *p;
        struct page *hpage;
        struct page *orig_head;
@@ -1285,8 +1229,8 @@ int memory_failure(unsigned long pfn, int flags)
                if (pfn_valid(pfn)) {
                        pgmap = get_dev_pagemap(pfn, NULL);
                        if (pgmap)
-                               return memory_failure_dev_pagemap(pfn, flags,
-                                                                 pgmap);
+                               return memory_failure_dev_pagemap(addr,
+                                               granularity, flags, pgmap);
                }
                pr_err("Memory failure: %#lx: memory outside kernel control\n",
                        pfn);
@@ -1442,7 +1386,7 @@ int memory_failure(unsigned long pfn, int flags)
        unlock_page(p);
        return res;
 }
-EXPORT_SYMBOL_GPL(memory_failure);
+EXPORT_SYMBOL_GPL(__memory_failure);
 
 #define MEMORY_FAILURE_FIFO_ORDER      4
 #define MEMORY_FAILURE_FIFO_SIZE       (1 << MEMORY_FAILURE_FIFO_ORDER)
_______________________________________________
Linux-nvdimm mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to