One of the side effects of speculating on faults (without holding
mmap_sem) is that we can race with free_pgtables() and therefore we
cannot assume the page-tables will stick around.

Remove the reliance on the pte pointer.
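
For reference, a minimal sketch of the resulting pattern, condensed from
the hunks below (names as in this patch; an illustration, not a drop-in
snippet):

        /* entry point: snapshot the pte value, never keep the pte mapped */
        pte = pte_offset_map(pmd, address);
        entry = ACCESS_ONCE(*pte);
        pte_unmap(pte);
        return handle_pte_fault(mm, vma, address, entry, pmd, flags);

        /*
         * in a handler that must commit: re-map and re-validate under the
         * pte lock, bailing out if the entry changed underneath us
         */
        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (unlikely(!pte_same(*page_table, orig_pte)))
                goto unlock;
        /* ... update the pte ... */
        pte_unmap_unlock(page_table, ptl);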

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
 mm/memory.c |   76 ++++++++++++++++--------------------------------------------
 1 file changed, 21 insertions(+), 55 deletions(-)

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1933,31 +1933,6 @@ int apply_to_page_range(struct mm_struct
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
-/*
- * handle_pte_fault chooses page fault handler according to an entry
- * which was read non-atomically.  Before making any commitment, on
- * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
- * must check under lock before unmapping the pte and proceeding
- * (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page can safely check later on).
- */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-                               pte_t *page_table, pte_t orig_pte)
-{
-       int same = 1;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-       if (sizeof(pte_t) > sizeof(unsigned long)) {
-               spinlock_t *ptl = pte_lockptr(mm, pmd);
-               spin_lock(ptl);
-               same = pte_same(*page_table, orig_pte);
-               spin_unlock(ptl);
-       }
-#endif
-       pte_unmap(page_table);
-       return same;
-}
-
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
 {
        debug_dma_assert_idle(src);
@@ -2407,21 +2382,18 @@ EXPORT_SYMBOL(unmap_mapping_range);
  * as does filemap_fault().
  */
 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
-               unsigned long address, pte_t *page_table, pmd_t *pmd,
+               unsigned long address, pmd_t *pmd,
                unsigned int flags, pte_t orig_pte)
 {
        spinlock_t *ptl;
        struct page *page, *swapcache;
        struct mem_cgroup *memcg;
        swp_entry_t entry;
-       pte_t pte;
+       pte_t *page_table, pte;
        int locked;
        int exclusive = 0;
        int ret = 0;
 
-       if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-               goto out;
-
        entry = pte_to_swp_entry(orig_pte);
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
@@ -2624,15 +2596,13 @@ static inline int check_stack_guard_page
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
-               unsigned long address, pte_t *page_table, pmd_t *pmd,
+               unsigned long address, pmd_t *pmd,
                unsigned int flags)
 {
        struct mem_cgroup *memcg;
        struct page *page;
        spinlock_t *ptl;
-       pte_t entry;
-
-       pte_unmap(page_table);
+       pte_t entry, *page_table;
 
        /* Check if we need to add a guard page to the stack */
        if (check_stack_guard_page(vma, address) < 0)
@@ -3031,13 +3001,12 @@ static int do_shared_fault(struct mm_str
  * return value.  See filemap_fault() and __lock_page_or_retry().
  */
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-               unsigned long address, pte_t *page_table, pmd_t *pmd,
+               unsigned long address, pmd_t *pmd,
                unsigned int flags, pte_t orig_pte)
 {
        pgoff_t pgoff = (((address & PAGE_MASK)
                        - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 
-       pte_unmap(page_table);
        if (!(flags & FAULT_FLAG_WRITE))
                return do_read_fault(mm, vma, address, pmd, pgoff, flags,
                                orig_pte);
@@ -3059,16 +3028,13 @@ static int do_linear_fault(struct mm_str
  * return value.  See filemap_fault() and __lock_page_or_retry().
  */
 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-               unsigned long address, pte_t *page_table, pmd_t *pmd,
+               unsigned long address, pmd_t *pmd,
                unsigned int flags, pte_t orig_pte)
 {
        pgoff_t pgoff;
 
        flags |= FAULT_FLAG_NONLINEAR;
 
-       if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-               return 0;
-
        if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
                /*
                 * Page table corrupted: show pte and kill process.
@@ -3103,7 +3069,7 @@ static int numa_migrate_prep(struct page
 }
 
 static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-                  unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+                  unsigned long addr, pte_t pte, pmd_t *pmd)
 {
        struct page *page = NULL;
        spinlock_t *ptl;
@@ -3112,6 +3078,7 @@ static int do_numa_page(struct mm_struct
        int target_nid;
        bool migrated = false;
        int flags = 0;
+       pte_t *ptep;
 
        /*
        * The "pte" at this point cannot be used safely without
@@ -3122,8 +3089,7 @@ static int do_numa_page(struct mm_struct
        * the _PAGE_NUMA bit and it is not really expected that there
        * would be concurrent hardware modifications to the PTE.
        */
-       ptl = pte_lockptr(mm, pmd);
-       spin_lock(ptl);
+       ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
        if (unlikely(!pte_same(*ptep, pte))) {
                pte_unmap_unlock(ptep, ptl);
                goto out;
@@ -3195,34 +3161,32 @@ static int do_numa_page(struct mm_struct
  */
 static int handle_pte_fault(struct mm_struct *mm,
                     struct vm_area_struct *vma, unsigned long address,
-                    pte_t *pte, pmd_t *pmd, unsigned int flags)
+                    pte_t entry, pmd_t *pmd, unsigned int flags)
 {
-       pte_t entry;
        spinlock_t *ptl;
+       pte_t *pte;
 
-       entry = ACCESS_ONCE(*pte);
        if (!pte_present(entry)) {
                if (pte_none(entry)) {
                        if (vma->vm_ops) {
                                if (likely(vma->vm_ops->fault))
                                        return do_linear_fault(mm, vma, address,
-                                               pte, pmd, flags, entry);
+                                               pmd, flags, entry);
                        }
                        return do_anonymous_page(mm, vma, address,
-                                                pte, pmd, flags);
+                                                pmd, flags);
                }
                if (pte_file(entry))
                        return do_nonlinear_fault(mm, vma, address,
-                                       pte, pmd, flags, entry);
+                                       pmd, flags, entry);
                return do_swap_page(mm, vma, address,
-                                       pte, pmd, flags, entry);
+                                       pmd, flags, entry);
        }
 
        if (pte_numa(entry))
-               return do_numa_page(mm, vma, address, entry, pte, pmd);
+               return do_numa_page(mm, vma, address, entry, pmd);
 
-       ptl = pte_lockptr(mm, pmd);
-       spin_lock(ptl);
+       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (unlikely(!pte_same(*pte, entry)))
                goto unlock;
        if (flags & FAULT_FLAG_WRITE) {
@@ -3261,7 +3225,7 @@ static int __handle_mm_fault(struct mm_s
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
-       pte_t *pte;
+       pte_t *pte, entry;
 
        if (unlikely(is_vm_hugetlb_page(vma)))
                return hugetlb_fault(mm, vma, address, flags);
@@ -3331,8 +3295,10 @@ static int __handle_mm_fault(struct mm_s
         * safe to run pte_offset_map().
         */
        pte = pte_offset_map(pmd, address);
+       entry = ACCESS_ONCE(*pte);
+       pte_unmap(pte);
 
-       return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+       return handle_pte_fault(mm, vma, address, entry, pmd, flags);
 }
 
 /*

