gup: track FOLL_PIN pages

John Hubbard Thu, 12 Dec 2019 01:15:37 -0800

Add tracking of pages that were pinned via FOLL_PIN.

As mentioned in the FOLL_PIN documentation, callers who effectively set
FOLL_PIN are required to ultimately free such pages via unpin_user_page().
The effect is similar to FOLL_GET, and may be thought of as "FOLL_GET
for DIO and/or RDMA use".


Pages that have been pinned via FOLL_PIN are identifiable via a
new function call:

   bool page_dma_pinned(struct page *page);

What to do in response to encountering such a page, is left to later
patchsets. There is discussion about this in [1], [2], and [3].

This also changes a BUG_ON(), to a WARN_ON(), in follow_page_mask().

[1] Some slow progress on get_user_pages() (Apr 2, 2019):
    https://lwn.net/Articles/784574/
[2] DMA and get_user_pages() (LPC: Dec 12, 2018):
    https://lwn.net/Articles/774411/
[3] The trouble with get_user_pages() (Apr 30, 2018):
    https://lwn.net/Articles/753027/

Suggested-by: Jan Kara <j...@suse.cz>
Suggested-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Signed-off-by: John Hubbard <jhubb...@nvidia.com>
---
 Documentation/core-api/pin_user_pages.rst |   2 +-
 include/linux/mm.h                        |  77 ++++-
 include/linux/mmzone.h                    |   2 +
 include/linux/page_ref.h                  |  10 +
 mm/gup.c                                  | 399 +++++++++++++++++-----
 mm/huge_memory.c                          |  45 ++-
 mm/hugetlb.c                              |  38 ++-
 mm/vmstat.c                               |   2 +
 8 files changed, 436 insertions(+), 139 deletions(-)

diff --git a/Documentation/core-api/pin_user_pages.rst 
b/Documentation/core-api/pin_user_pages.rst
index 1d490155ecd7..2db14df1f2d7 100644
--- a/Documentation/core-api/pin_user_pages.rst
+++ b/Documentation/core-api/pin_user_pages.rst
@@ -53,7 +53,7 @@ Which flags are set by each wrapper
 For these pin_user_pages*() functions, FOLL_PIN is OR'd in with whatever gup
 flags the caller provides. The caller is required to pass in a non-null struct
 pages* array, and the function then pin pages by incrementing each by a special
-value. For now, that value is +1, just like get_user_pages*().::
+value: GUP_PIN_COUNTING_BIAS.::
 
  Function
  --------
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6a1a357e7d86..1765332f27e8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1005,6 +1005,10 @@ static inline bool is_pci_p2pdma_page(const struct page 
*page)
 #define page_ref_zero_or_close_to_overflow(page) \
        ((unsigned int) page_ref_count(page) + 127u <= 127u)
 
+#define page_ref_zero_or_close_to_bias_overflow(page) \
+       ((unsigned int) page_ref_count(page) + \
+               GUP_PIN_COUNTING_BIAS <= GUP_PIN_COUNTING_BIAS)
+
 static inline void get_page(struct page *page)
 {
        page = compound_head(page);
@@ -1016,6 +1020,8 @@ static inline void get_page(struct page *page)
        page_ref_inc(page);
 }
 
+bool __must_check try_grab_page(struct page *page, unsigned int flags);
+
 static inline __must_check bool try_get_page(struct page *page)
 {
        page = compound_head(page);
@@ -1044,29 +1050,70 @@ static inline void put_page(struct page *page)
                __put_page(page);
 }
 
-/**
- * unpin_user_page() - release a gup-pinned page
- * @page:            pointer to page to be released
+/*
+ * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
+ * the page's refcount so that two separate items are tracked: the original 
page
+ * reference count, and also a new count of how many pin_user_pages() calls 
were
+ * made against the page. ("gup-pinned" is another term for the latter).
+ *
+ * With this scheme, pin_user_pages() becomes special: such pages are marked as
+ * distinct from normal pages. As such, the unpin_user_page() call (and its
+ * variants) must be used in order to release gup-pinned pages.
  *
- * Pages that were pinned via pin_user_pages*() must be released via either
- * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
- * that eventually such pages can be separately tracked and uniquely handled. 
In
- * particular, interactions with RDMA and filesystems need special handling.
+ * Choice of value:
  *
- * unpin_user_page() and put_page() are not interchangeable, despite this early
- * implementation that makes them look the same. unpin_user_page() calls must
- * be perfectly matched up with pin*() calls.
+ * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
+ * counts with respect to pin_user_pages() and unpin_user_page() becomes
+ * simpler, due to the fact that adding an even power of two to the page
+ * refcount has the effect of using only the upper N bits, for the code that
+ * counts up using the bias value. This means that the lower bits are left for
+ * the exclusive use of the original code that increments and decrements by one
+ * (or at least, by much smaller values than the bias value).
+ *
+ * Of course, once the lower bits overflow into the upper bits (and this is
+ * OK, because subtraction recovers the original values), then visual 
inspection
+ * no longer suffices to directly view the separate counts. However, for normal
+ * applications that don't have huge page reference counts, this won't be an
+ * issue.
+ *
+ * Locking: the lockless algorithm described in page_cache_get_speculative()
+ * and page_cache_gup_pin_speculative() provides safe operation for
+ * get_user_pages and page_mkclean and other calls that race to set up page
+ * table entries.
  */
-static inline void unpin_user_page(struct page *page)
-{
-       put_page(page);
-}
+#define GUP_PIN_COUNTING_BIAS (1U << 10)
 
+void unpin_user_page(struct page *page);
 void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
                                 bool make_dirty);
-
 void unpin_user_pages(struct page **pages, unsigned long npages);
 
+/**
+ * page_dma_pinned() - report if a page is pinned for DMA.
+ *
+ * This function checks if a page has been pinned via a call to
+ * pin_user_pages*().
+ *
+ * The return value is partially fuzzy: false is not fuzzy, because it means
+ * "definitely not pinned for DMA", but true means "probably pinned for DMA, 
but
+ * possibly a false positive due to having at least GUP_PIN_COUNTING_BIAS worth
+ * of normal page references".
+ *
+ * False positives are OK, because: a) it's unlikely for a page to get that 
many
+ * refcounts, and b) all the callers of this routine are expected to be able to
+ * deal gracefully with a false positive.
+ *
+ * For more information, please see Documentation/vm/pin_user_pages.rst.
+ *
+ * @page:      pointer to page to be queried.
+ * @Return:    True, if it is likely that the page has been "dma-pinned".
+ *             False, if the page is definitely not dma-pinned.
+ */
+static inline bool page_dma_pinned(struct page *page)
+{
+       return (page_ref_count(compound_head(page))) >= GUP_PIN_COUNTING_BIAS;
+}
+
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 #define SECTION_IN_PAGE_FLAGS
 #endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 89d8ff06c9ce..a7418f7a44da 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -244,6 +244,8 @@ enum node_stat_item {
        NR_DIRTIED,             /* page dirtyings since bootup */
        NR_WRITTEN,             /* page writings since bootup */
        NR_KERNEL_MISC_RECLAIMABLE,     /* reclaimable non-slab kernel pages */
+       NR_FOLL_PIN_REQUESTED,  /* via: pin_user_page(), gup flag: FOLL_PIN */
+       NR_FOLL_PIN_RETURNED,   /* pages returned via unpin_user_page() */
        NR_VM_NODE_STAT_ITEMS
 };
 
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index 14d14beb1f7f..b9cbe553d1e7 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -102,6 +102,16 @@ static inline void page_ref_sub(struct page *page, int nr)
                __page_ref_mod(page, -nr);
 }
 
+static inline int page_ref_sub_return(struct page *page, int nr)
+{
+       int ret = atomic_sub_return(nr, &page->_refcount);
+
+       if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
+               __page_ref_mod(page, -nr);
+
+       return ret;
+}
+
 static inline void page_ref_inc(struct page *page)
 {
        atomic_inc(&page->_refcount);
diff --git a/mm/gup.c b/mm/gup.c
index 73aedcefa4bd..4da3ab274b10 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -36,6 +36,20 @@ static __always_inline long __gup_longterm_locked(struct 
task_struct *tsk,
                                                  struct page **pages,
                                                  struct vm_area_struct **vmas,
                                                  unsigned int flags);
+
+#ifdef CONFIG_DEBUG_VM
+static inline void __update_proc_vmstat(struct page *page,
+                                       enum node_stat_item item, int count)
+{
+       mod_node_page_state(page_pgdat(page), item, count);
+}
+#else
+static inline void __update_proc_vmstat(struct page *page,
+                                       enum node_stat_item item, int count)
+{
+}
+#endif
+
 /*
  * Return the compound head page with ref appropriately incremented,
  * or NULL if that failed.
@@ -51,6 +65,156 @@ static inline struct page *try_get_compound_head(struct 
page *page, int refs)
        return head;
 }
 
+/**
+ * try_pin_compound_head() - mark a compound page as being used by
+ * pin_user_pages*().
+ *
+ * This is the FOLL_PIN counterpart to try_get_compound_head().
+ *
+ * @page:      pointer to page to be marked
+ * @Return:    the compound head page, with ref appropriately incremented,
+ * or NULL upon failure.
+ */
+__must_check struct page *try_pin_compound_head(struct page *page, int refs)
+{
+       struct page *head = try_get_compound_head(page,
+                                                 GUP_PIN_COUNTING_BIAS * refs);
+       if (!head)
+               return NULL;
+
+       __update_proc_vmstat(page, NR_FOLL_PIN_REQUESTED, refs);
+       return head;
+}
+
+/*
+ * try_grab_compound_head() - attempt to elevate a page's refcount, by a
+ * flags-dependent amount.
+ *
+ * "grab" names in this file mean, "look at flags to decide whether to use
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
+ *
+ * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
+ * same time. (That's true throughout the get_user_pages*() and
+ * pin_user_pages*() APIs.) Cases:
+ *
+ *     FOLL_GET: page's refcount will be incremented by 1.
+ *      FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
+ *
+ * Return: head page (with refcount appropriately incremented) for success, or
+ * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
+ * considered failure, and furthermore, a likely bug in the caller, so a 
warning
+ * is also emitted.
+ */
+static __maybe_unused struct page *try_grab_compound_head(struct page *page,
+                                                         int refs,
+                                                         unsigned int flags)
+{
+       if (flags & FOLL_GET)
+               return try_get_compound_head(page, refs);
+       else if (flags & FOLL_PIN)
+               return try_pin_compound_head(page, refs);
+
+       WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0);
+       return NULL;
+}
+
+/**
+ * try_grab_page() - elevate a page's refcount by a flag-dependent amount
+ *
+ * This might not do anything at all, depending on the flags argument.
+ *
+ * "grab" names in this file mean, "look at flags to decide whether to use
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
+ *
+ * @page:      pointer to page to be grabbed
+ * @flags:     gup flags: these are the FOLL_* flag values.
+ *
+ * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the 
same
+ * time. Cases:
+ *
+ *     FOLL_GET: page's refcount will be incremented by 1.
+ *      FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
+ *
+ * Return: true for success, or if no action was required (if neither FOLL_PIN
+ * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
+ * FOLL_PIN was set, but the page could not be grabbed.
+ */
+bool __must_check try_grab_page(struct page *page, unsigned int flags)
+{
+       if (flags & FOLL_GET)
+               return try_get_page(page);
+       else if (flags & FOLL_PIN) {
+               page = compound_head(page);
+               WARN_ON_ONCE(flags & FOLL_GET);
+
+               if (WARN_ON_ONCE(page_ref_zero_or_close_to_bias_overflow(page)))
+                       return false;
+
+               page_ref_add(page, GUP_PIN_COUNTING_BIAS);
+               __update_proc_vmstat(page, NR_FOLL_PIN_REQUESTED, 1);
+       }
+
+       return true;
+}
+
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+static bool __unpin_devmap_managed_user_page(struct page *page)
+{
+       bool is_devmap = page_is_devmap_managed(page);
+
+       if (is_devmap) {
+               int count = page_ref_sub_return(page, GUP_PIN_COUNTING_BIAS);
+
+               __update_proc_vmstat(page, NR_FOLL_PIN_RETURNED, 1);
+               /*
+                * devmap page refcounts are 1-based, rather than 0-based: if
+                * refcount is 1, then the page is free and the refcount is
+                * stable because nobody holds a reference on the page.
+                */
+               if (count == 1)
+                       free_devmap_managed_page(page);
+               else if (!count)
+                       __put_page(page);
+       }
+
+       return is_devmap;
+}
+#else
+static bool __unpin_devmap_managed_user_page(struct page *page)
+{
+       return false;
+}
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
+
+/**
+ * unpin_user_page() - release a dma-pinned page
+ * @page:            pointer to page to be released
+ *
+ * Pages that were pinned via pin_user_pages*() must be released via either
+ * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
+ * that such pages can be separately tracked and uniquely handled. In
+ * particular, interactions with RDMA and filesystems need special handling.
+ */
+void unpin_user_page(struct page *page)
+{
+       page = compound_head(page);
+
+       /*
+        * For devmap managed pages we need to catch refcount transition from
+        * GUP_PIN_COUNTING_BIAS to 1, when refcount reach one it means the
+        * page is free and we need to inform the device driver through
+        * callback. See include/linux/memremap.h and HMM for details.
+        */
+       if (__unpin_devmap_managed_user_page(page))
+               return;
+
+       if (page_ref_sub_and_test(page, GUP_PIN_COUNTING_BIAS))
+               __put_page(page);
+
+       __update_proc_vmstat(page, NR_FOLL_PIN_RETURNED, 1);
+}
+EXPORT_SYMBOL(unpin_user_page);
+
 /**
  * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned 
pages
  * @pages:  array of pages to be maybe marked dirty, and definitely released.
@@ -237,10 +401,11 @@ static struct page *follow_page_pte(struct vm_area_struct 
*vma,
        }
 
        page = vm_normal_page(vma, address, pte);
-       if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
+       if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
                /*
-                * Only return device mapping pages in the FOLL_GET case since
-                * they are only valid while holding the pgmap reference.
+                * Only return device mapping pages in the FOLL_GET or FOLL_PIN
+                * case since they are only valid while holding the pgmap
+                * reference.
                 */
                *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
                if (*pgmap)
@@ -278,11 +443,10 @@ static struct page *follow_page_pte(struct vm_area_struct 
*vma,
                goto retry;
        }
 
-       if (flags & FOLL_GET) {
-               if (unlikely(!try_get_page(page))) {
-                       page = ERR_PTR(-ENOMEM);
-                       goto out;
-               }
+       /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
+       if (unlikely(!try_grab_page(page, flags))) {
+               page = ERR_PTR(-ENOMEM);
+               goto out;
        }
        if (flags & FOLL_TOUCH) {
                if ((flags & FOLL_WRITE) &&
@@ -544,7 +708,7 @@ static struct page *follow_page_mask(struct vm_area_struct 
*vma,
        /* make this handle hugepd */
        page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
        if (!IS_ERR(page)) {
-               BUG_ON(flags & FOLL_GET);
+               WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
                return page;
        }
 
@@ -1131,6 +1295,36 @@ static __always_inline long 
__get_user_pages_locked(struct task_struct *tsk,
        return pages_done;
 }
 
+static long __get_user_pages_remote(struct task_struct *tsk,
+                                   struct mm_struct *mm,
+                                   unsigned long start, unsigned long nr_pages,
+                                   unsigned int gup_flags, struct page **pages,
+                                   struct vm_area_struct **vmas, int *locked)
+{
+       /*
+        * Parts of FOLL_LONGTERM behavior are incompatible with
+        * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+        * vmas. However, this only comes up if locked is set, and there are
+        * callers that do request FOLL_LONGTERM, but do not set locked. So,
+        * allow what we can.
+        */
+       if (gup_flags & FOLL_LONGTERM) {
+               if (WARN_ON_ONCE(locked))
+                       return -EINVAL;
+               /*
+                * This will check the vmas (even if our vmas arg is NULL)
+                * and return -ENOTSUPP if DAX isn't allowed in this case:
+                */
+               return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
+                                            vmas, gup_flags | FOLL_TOUCH |
+                                            FOLL_REMOTE);
+       }
+
+       return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+                                      locked,
+                                      gup_flags | FOLL_TOUCH | FOLL_REMOTE);
+}
+
 /*
  * get_user_pages_remote() - pin user pages in memory
  * @tsk:       the task_struct to use for page fault accounting, or
@@ -1205,28 +1399,8 @@ long get_user_pages_remote(struct task_struct *tsk, 
struct mm_struct *mm,
        if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
                return -EINVAL;
 
-       /*
-        * Parts of FOLL_LONGTERM behavior are incompatible with
-        * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
-        * vmas. However, this only comes up if locked is set, and there are
-        * callers that do request FOLL_LONGTERM, but do not set locked. So,
-        * allow what we can.
-        */
-       if (gup_flags & FOLL_LONGTERM) {
-               if (WARN_ON_ONCE(locked))
-                       return -EINVAL;
-               /*
-                * This will check the vmas (even if our vmas arg is NULL)
-                * and return -ENOTSUPP if DAX isn't allowed in this case:
-                */
-               return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
-                                            vmas, gup_flags | FOLL_TOUCH |
-                                            FOLL_REMOTE);
-       }
-
-       return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
-                                      locked,
-                                      gup_flags | FOLL_TOUCH | FOLL_REMOTE);
+       return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
+                                      pages, vmas, locked);
 }
 EXPORT_SYMBOL(get_user_pages_remote);
 
@@ -1421,10 +1595,11 @@ static long __get_user_pages_locked(struct task_struct 
*tsk,
        return i ? : -EFAULT;
 }
 
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
-                          unsigned long start, unsigned long nr_pages,
-                          unsigned int gup_flags, struct page **pages,
-                          struct vm_area_struct **vmas, int *locked)
+static long __get_user_pages_remote(struct task_struct *tsk,
+                                   struct mm_struct *mm,
+                                   unsigned long start, unsigned long nr_pages,
+                                   unsigned int gup_flags, struct page **pages,
+                                   struct vm_area_struct **vmas, int *locked)
 {
        return 0;
 }
@@ -1864,13 +2039,17 @@ static inline pte_t gup_get_pte(pte_t *ptep)
 #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
 
 static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
+                                           unsigned int flags,
                                            struct page **pages)
 {
        while ((*nr) - nr_start) {
                struct page *page = pages[--(*nr)];
 
                ClearPageReferenced(page);
-               put_page(page);
+               if (flags & FOLL_PIN)
+                       unpin_user_page(page);
+               else
+                       put_page(page);
        }
 }
 
@@ -1903,7 +2082,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
unsigned long end,
 
                        pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
                        if (unlikely(!pgmap)) {
-                               undo_dev_pagemap(nr, nr_start, pages);
+                               undo_dev_pagemap(nr, nr_start, flags, pages);
                                goto pte_unmap;
                        }
                } else if (pte_special(pte))
@@ -1912,7 +2091,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
unsigned long end,
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);
 
-               head = try_get_compound_head(page, 1);
+               head = try_grab_compound_head(page, 1, flags);
                if (!head)
                        goto pte_unmap;
 
@@ -1957,7 +2136,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
unsigned long end,
 
 #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
-               unsigned long end, struct page **pages, int *nr)
+                            unsigned long end, unsigned int flags,
+                            struct page **pages, int *nr)
 {
        int nr_start = *nr;
        struct dev_pagemap *pgmap = NULL;
@@ -1967,12 +2147,15 @@ static int __gup_device_huge(unsigned long pfn, 
unsigned long addr,
 
                pgmap = get_dev_pagemap(pfn, pgmap);
                if (unlikely(!pgmap)) {
-                       undo_dev_pagemap(nr, nr_start, pages);
+                       undo_dev_pagemap(nr, nr_start, flags, pages);
                        return 0;
                }
                SetPageReferenced(page);
                pages[*nr] = page;
-               get_page(page);
+               if (unlikely(!try_grab_page(page, flags))) {
+                       undo_dev_pagemap(nr, nr_start, flags, pages);
+                       return 0;
+               }
                (*nr)++;
                pfn++;
        } while (addr += PAGE_SIZE, addr != end);
@@ -1983,48 +2166,52 @@ static int __gup_device_huge(unsigned long pfn, 
unsigned long addr,
 }
 
 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
-               unsigned long end, struct page **pages, int *nr)
+                                unsigned long end, unsigned int flags,
+                                struct page **pages, int *nr)
 {
        unsigned long fault_pfn;
        int nr_start = *nr;
 
        fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-       if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
+       if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
                return 0;
 
        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
-               undo_dev_pagemap(nr, nr_start, pages);
+               undo_dev_pagemap(nr, nr_start, flags, pages);
                return 0;
        }
        return 1;
 }
 
 static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
-               unsigned long end, struct page **pages, int *nr)
+                                unsigned long end, unsigned int flags,
+                                struct page **pages, int *nr)
 {
        unsigned long fault_pfn;
        int nr_start = *nr;
 
        fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-       if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
+       if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
                return 0;
 
        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
-               undo_dev_pagemap(nr, nr_start, pages);
+               undo_dev_pagemap(nr, nr_start, flags, pages);
                return 0;
        }
        return 1;
 }
 #else
 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
-               unsigned long end, struct page **pages, int *nr)
+                                unsigned long end, unsigned int flags,
+                                struct page **pages, int *nr)
 {
        BUILD_BUG();
        return 0;
 }
 
 static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
-               unsigned long end, struct page **pages, int *nr)
+                                unsigned long end, unsigned int flags,
+                                struct page **pages, int *nr)
 {
        BUILD_BUG();
        return 0;
@@ -2042,8 +2229,11 @@ static int record_subpages(struct page *page, unsigned 
long addr,
        return nr;
 }
 
-static void put_compound_head(struct page *page, int refs)
+static void put_compound_head(struct page *page, int refs, unsigned int flags)
 {
+       if (flags & FOLL_PIN)
+               refs *= GUP_PIN_COUNTING_BIAS;
+
        /* Do a get_page() first, in case refs == page->_refcount */
        get_page(page);
        page_ref_sub(page, refs);
@@ -2083,12 +2273,12 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, 
unsigned long addr,
        page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
        refs = record_subpages(page, addr, end, pages + *nr);
 
-       head = try_get_compound_head(head, refs);
+       head = try_grab_compound_head(head, refs, flags);
        if (!head)
                return 0;
 
        if (unlikely(pte_val(pte) != pte_val(*ptep))) {
-               put_compound_head(head, refs);
+               put_compound_head(head, refs, flags);
                return 0;
        }
 
@@ -2136,18 +2326,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, 
unsigned long addr,
        if (pmd_devmap(orig)) {
                if (unlikely(flags & FOLL_LONGTERM))
                        return 0;
-               return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
+               return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
+                                            pages, nr);
        }
 
        page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        refs = record_subpages(page, addr, end, pages + *nr);
 
-       head = try_get_compound_head(pmd_page(orig), refs);
+       head = try_grab_compound_head(pmd_page(orig), refs, flags);
        if (!head)
                return 0;
 
        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
-               put_compound_head(head, refs);
+               put_compound_head(head, refs, flags);
                return 0;
        }
 
@@ -2157,7 +2348,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned 
long addr,
 }
 
 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
-               unsigned long end, unsigned int flags, struct page **pages, int 
*nr)
+                       unsigned long end, unsigned int flags,
+                       struct page **pages, int *nr)
 {
        struct page *head, *page;
        int refs;
@@ -2168,18 +2360,19 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, 
unsigned long addr,
        if (pud_devmap(orig)) {
                if (unlikely(flags & FOLL_LONGTERM))
                        return 0;
-               return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
+               return __gup_device_huge_pud(orig, pudp, addr, end, flags,
+                                            pages, nr);
        }
 
        page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
        refs = record_subpages(page, addr, end, pages + *nr);
 
-       head = try_get_compound_head(pud_page(orig), refs);
+       head = try_grab_compound_head(pud_page(orig), refs, flags);
        if (!head)
                return 0;
 
        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
-               put_compound_head(head, refs);
+               put_compound_head(head, refs, flags);
                return 0;
        }
 
@@ -2203,12 +2396,12 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, 
unsigned long addr,
        page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
        refs = record_subpages(page, addr, end, pages + *nr);
 
-       head = try_get_compound_head(pgd_page(orig), refs);
+       head = try_grab_compound_head(pgd_page(orig), refs, flags);
        if (!head)
                return 0;
 
        if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
-               put_compound_head(head, refs);
+               put_compound_head(head, refs, flags);
                return 0;
        }
 
@@ -2371,6 +2564,14 @@ int __get_user_pages_fast(unsigned long start, int 
nr_pages, int write,
        unsigned long len, end;
        unsigned long flags;
        int nr = 0;
+       /*
+        * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
+        * because gup fast is always a "pin with a +1 page refcount" request.
+        */
+       unsigned int gup_flags = FOLL_GET;
+
+       if (write)
+               gup_flags |= FOLL_WRITE;
 
        start = untagged_addr(start) & PAGE_MASK;
        len = (unsigned long) nr_pages << PAGE_SHIFT;
@@ -2396,7 +2597,7 @@ int __get_user_pages_fast(unsigned long start, int 
nr_pages, int write,
        if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
            gup_fast_permitted(start, end)) {
                local_irq_save(flags);
-               gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr);
+               gup_pgd_range(start, end, gup_flags, pages, &nr);
                local_irq_restore(flags);
        }
 
@@ -2435,7 +2636,7 @@ static int internal_get_user_pages_fast(unsigned long 
start, int nr_pages,
        int nr = 0, ret = 0;
 
        if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
-                                      FOLL_FORCE | FOLL_PIN)))
+                                      FOLL_FORCE | FOLL_PIN | FOLL_GET)))
                return -EINVAL;
 
        start = untagged_addr(start) & PAGE_MASK;
@@ -2502,6 +2703,13 @@ int get_user_pages_fast(unsigned long start, int 
nr_pages,
        if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
                return -EINVAL;
 
+       /*
+        * The caller may or may not have explicitly set FOLL_GET; either way is
+        * OK. However, internally (within mm/gup.c), gup fast variants must set
+        * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
+        * request.
+        */
+       gup_flags |= FOLL_GET;
        return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
@@ -2509,9 +2717,12 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
 /**
  * pin_user_pages_fast() - pin user pages in memory without taking locks
  *
- * For now, this is a placeholder function, until various call sites are
- * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
- * this is identical to get_user_pages_fast().
+ * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
+ * get_user_pages_fast() for documentation on the function arguments, because
+ * the arguments here are identical.
+ *
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
+ * see Documentation/vm/pin_user_pages.rst for further details.
  *
  * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
  * is NOT intended for Case 2 (RDMA: long-term pins).
@@ -2519,21 +2730,24 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
 int pin_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages)
 {
-       /*
-        * This is a placeholder, until the pin functionality is activated.
-        * Until then, just behave like the corresponding get_user_pages*()
-        * routine.
-        */
-       return get_user_pages_fast(start, nr_pages, gup_flags, pages);
+       /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+       if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+               return -EINVAL;
+
+       gup_flags |= FOLL_PIN;
+       return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
 }
 EXPORT_SYMBOL_GPL(pin_user_pages_fast);
 
 /**
  * pin_user_pages_remote() - pin pages of a remote process (task != current)
  *
- * For now, this is a placeholder function, until various call sites are
- * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
- * this is identical to get_user_pages_remote().
+ * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
+ * get_user_pages_remote() for documentation on the function arguments, because
+ * the arguments here are identical.
+ *
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
+ * see Documentation/vm/pin_user_pages.rst for details.
  *
  * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
  * is NOT intended for Case 2 (RDMA: long-term pins).
@@ -2543,22 +2757,24 @@ long pin_user_pages_remote(struct task_struct *tsk, 
struct mm_struct *mm,
                           unsigned int gup_flags, struct page **pages,
                           struct vm_area_struct **vmas, int *locked)
 {
-       /*
-        * This is a placeholder, until the pin functionality is activated.
-        * Until then, just behave like the corresponding get_user_pages*()
-        * routine.
-        */
-       return get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, pages,
-                                    vmas, locked);
+       /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+       if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+               return -EINVAL;
+
+       gup_flags |= FOLL_PIN;
+       return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
+                                      pages, vmas, locked);
 }
 EXPORT_SYMBOL(pin_user_pages_remote);
 
 /**
  * pin_user_pages() - pin user pages in memory for use by other devices
  *
- * For now, this is a placeholder function, until various call sites are
- * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
- * this is identical to get_user_pages().
+ * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
+ * FOLL_PIN is set.
+ *
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
+ * see Documentation/vm/pin_user_pages.rst for details.
  *
  * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
  * is NOT intended for Case 2 (RDMA: long-term pins).
@@ -2567,11 +2783,12 @@ long pin_user_pages(unsigned long start, unsigned long 
nr_pages,
                    unsigned int gup_flags, struct page **pages,
                    struct vm_area_struct **vmas)
 {
-       /*
-        * This is a placeholder, until the pin functionality is activated.
-        * Until then, just behave like the corresponding get_user_pages*()
-        * routine.
-        */
-       return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+       /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+       if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+               return -EINVAL;
+
+       gup_flags |= FOLL_PIN;
+       return __gup_longterm_locked(current, current->mm, start, nr_pages,
+                                    pages, vmas, gup_flags);
 }
 EXPORT_SYMBOL(pin_user_pages);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 41a0fbddc96b..ebaa6961ff76 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -945,6 +945,11 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, 
unsigned long addr,
         */
        WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW 
set");
 
+       /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+       if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+                        (FOLL_PIN | FOLL_GET)))
+               return NULL;
+
        if (flags & FOLL_WRITE && !pmd_write(*pmd))
                return NULL;
 
@@ -960,7 +965,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, 
unsigned long addr,
         * device mapped pages can only be returned if the
         * caller will manage the page reference count.
         */
-       if (!(flags & FOLL_GET))
+       if (!(flags & (FOLL_GET | FOLL_PIN)))
                return ERR_PTR(-EEXIST);
 
        pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
@@ -968,7 +973,8 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, 
unsigned long addr,
        if (!*pgmap)
                return ERR_PTR(-EFAULT);
        page = pfn_to_page(pfn);
-       get_page(page);
+       if (!try_grab_page(page, flags))
+               page = ERR_PTR(-EFAULT);
 
        return page;
 }
@@ -1088,6 +1094,11 @@ struct page *follow_devmap_pud(struct vm_area_struct 
*vma, unsigned long addr,
        if (flags & FOLL_WRITE && !pud_write(*pud))
                return NULL;
 
+       /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+       if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+                        (FOLL_PIN | FOLL_GET)))
+               return NULL;
+
        if (pud_present(*pud) && pud_devmap(*pud))
                /* pass */;
        else
@@ -1099,8 +1110,10 @@ struct page *follow_devmap_pud(struct vm_area_struct 
*vma, unsigned long addr,
        /*
         * device mapped pages can only be returned if the
         * caller will manage the page reference count.
+        *
+        * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
         */
-       if (!(flags & FOLL_GET))
+       if (!(flags & (FOLL_GET | FOLL_PIN)))
                return ERR_PTR(-EEXIST);
 
        pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
@@ -1108,7 +1121,8 @@ struct page *follow_devmap_pud(struct vm_area_struct 
*vma, unsigned long addr,
        if (!*pgmap)
                return ERR_PTR(-EFAULT);
        page = pfn_to_page(pfn);
-       get_page(page);
+       if (!try_grab_page(page, flags))
+               page = ERR_PTR(-EFAULT);
 
        return page;
 }
@@ -1468,6 +1482,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct 
*vma,
 {
        struct mm_struct *mm = vma->vm_mm;
        struct page *page = NULL;
+       struct page *subpage = NULL;
 
        assert_spin_locked(pmd_lockptr(mm, pmd));
 
@@ -1486,6 +1501,14 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct 
*vma,
        VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
        if (flags & FOLL_TOUCH)
                touch_pmd(vma, addr, pmd, flags);
+
+       subpage = page;
+       subpage += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
+       VM_BUG_ON_PAGE(!PageCompound(subpage) &&
+                      !is_zone_device_page(subpage), subpage);
+       if (!try_grab_page(subpage, flags))
+               return ERR_PTR(-EFAULT);
+
        if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
                /*
                 * We don't mlock() pte-mapped THPs. This way we can avoid
@@ -1509,24 +1532,18 @@ struct page *follow_trans_huge_pmd(struct 
vm_area_struct *vma,
                 */
 
                if (PageAnon(page) && compound_mapcount(page) != 1)
-                       goto skip_mlock;
+                       goto out;
                if (PageDoubleMap(page) || !page->mapping)
-                       goto skip_mlock;
+                       goto out;
                if (!trylock_page(page))
-                       goto skip_mlock;
+                       goto out;
                lru_add_drain();
                if (page->mapping && !PageDoubleMap(page))
                        mlock_vma_page(page);
                unlock_page(page);
        }
-skip_mlock:
-       page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
-       VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
-       if (flags & FOLL_GET)
-               get_page(page);
-
 out:
-       return page;
+       return subpage;
 }
 
 /* NUMA hinting page fault entry point for trans huge pmds */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ac65bb5e38ac..0e21bbe9f017 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4326,19 +4326,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
                pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
                page = pte_page(huge_ptep_get(pte));
 
-               /*
-                * Instead of doing 'try_get_page()' below in the same_page
-                * loop, just check the count once here.
-                */
-               if (unlikely(page_count(page) <= 0)) {
-                       if (pages) {
-                               spin_unlock(ptl);
-                               remainder = 0;
-                               err = -ENOMEM;
-                               break;
-                       }
-               }
-
                /*
                 * If subpage information not requested, update counters
                 * and skip the same_page loop below.
@@ -4356,7 +4343,13 @@ long follow_hugetlb_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
 same_page:
                if (pages) {
                        pages[i] = mem_map_offset(page, pfn_offset);
-                       get_page(pages[i]);
+                       if (!try_grab_page(pages[i], flags)) {
+                               spin_unlock(ptl);
+                               remainder = 0;
+                               err = -ENOMEM;
+                               WARN_ON_ONCE(1);
+                               break;
+                       }
                }
 
                if (vmas)
@@ -4916,6 +4909,12 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long 
address,
        struct page *page = NULL;
        spinlock_t *ptl;
        pte_t pte;
+
+       /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+       if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+                        (FOLL_PIN | FOLL_GET)))
+               return NULL;
+
 retry:
        ptl = pmd_lockptr(mm, pmd);
        spin_lock(ptl);
@@ -4928,8 +4927,11 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long 
address,
        pte = huge_ptep_get((pte_t *)pmd);
        if (pte_present(pte)) {
                page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
-               if (flags & FOLL_GET)
-                       get_page(page);
+               if (unlikely(!try_grab_page(page, flags))) {
+                       WARN_ON_ONCE(1);
+                       page = NULL;
+                       goto out;
+               }
        } else {
                if (is_hugetlb_entry_migration(pte)) {
                        spin_unlock(ptl);
@@ -4950,7 +4952,7 @@ struct page * __weak
 follow_huge_pud(struct mm_struct *mm, unsigned long address,
                pud_t *pud, int flags)
 {
-       if (flags & FOLL_GET)
+       if (flags & (FOLL_GET | FOLL_PIN))
                return NULL;
 
        return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
@@ -4959,7 +4961,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long 
address,
 struct page * __weak
 follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int 
flags)
 {
-       if (flags & FOLL_GET)
+       if (flags & (FOLL_GET | FOLL_PIN))
                return NULL;
 
        return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> 
PAGE_SHIFT);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 78d53378db99..b56808bae1b4 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1168,6 +1168,8 @@ const char * const vmstat_text[] = {
        "nr_dirtied",
        "nr_written",
        "nr_kernel_misc_reclaimable",
+       "nr_foll_pin_requested",
+       "nr_foll_pin_returned",
 
        /* enum writeback_stat_item counters */
        "nr_dirty_threshold",
-- 
2.24.0

[PATCH v10 23/25] mm/gup: track FOLL_PIN pages

Reply via email to