From: Zi Yan <[email protected]>

To split a THP to any lower order (except order-1) pages, we need to
reform THPs on subpages at given order and add page refcount based on the
new page order. Also we need to reinitialize page_deferred_list after
removing the page from the split_queue, otherwise a subsequent split will
see list corruption when checking the page_deferred_list again.

It has many uses, like minimizing the number of pages after
truncating a huge pagecache page. For anonymous THPs, we can only split
them to order-0 like before until we add support for any size anonymous
THPs.

Order-1 folio is not supported because _deferred_list, which is used by
partially mapped folios, is stored in subpage 2 and an order-1 folio only
has subpage 0 and 1.

Signed-off-by: Zi Yan <[email protected]>
---
 include/linux/huge_mm.h |  21 +++++---
 mm/huge_memory.c        | 114 +++++++++++++++++++++++++++++++---------
 2 files changed, 101 insertions(+), 34 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 5adb86af35fc..de0c89105076 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -265,10 +265,11 @@ unsigned long thp_get_unmapped_area(struct file *filp, 
unsigned long addr,
 
 void folio_prep_large_rmappable(struct folio *folio);
 bool can_split_folio(struct folio *folio, int *pextra_pins);
-int split_huge_page_to_list(struct page *page, struct list_head *list);
+int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+               unsigned int new_order);
 static inline int split_huge_page(struct page *page)
 {
-       return split_huge_page_to_list(page, NULL);
+       return split_huge_page_to_list_to_order(page, NULL, 0);
 }
 void deferred_split_folio(struct folio *folio);
 
@@ -422,7 +423,8 @@ can_split_folio(struct folio *folio, int *pextra_pins)
        return false;
 }
 static inline int
-split_huge_page_to_list(struct page *page, struct list_head *list)
+split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+               unsigned int new_order)
 {
        return 0;
 }
@@ -519,17 +521,20 @@ static inline bool thp_migration_supported(void)
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-static inline int split_folio_to_list(struct folio *folio,
-               struct list_head *list)
+static inline int split_folio_to_list_to_order(struct folio *folio,
+               struct list_head *list, int new_order)
 {
-       return split_huge_page_to_list(&folio->page, list);
+       return split_huge_page_to_list_to_order(&folio->page, list, new_order);
 }
 
-static inline int split_folio(struct folio *folio)
+static inline int split_folio_to_order(struct folio *folio, int new_order)
 {
-       return split_folio_to_list(folio, NULL);
+       return split_folio_to_list_to_order(folio, NULL, new_order);
 }
 
+#define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0)
+#define split_folio(f) split_folio_to_order(f, 0)
+
 /*
  * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to
  * limitations in the implementation like arm64 MTE can override this to
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ad7133c97428..d0e555a8ea98 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2718,11 +2718,14 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 
 static void unmap_folio(struct folio *folio)
 {
-       enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
-               TTU_SYNC | TTU_BATCH_FLUSH;
+       enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
+               TTU_BATCH_FLUSH;
 
        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
 
+       if (folio_test_pmd_mappable(folio))
+               ttu_flags |= TTU_SPLIT_HUGE_PMD;
+
        /*
         * Anon pages need migration entries to preserve them, but file
         * pages can simply be left unmapped, then faulted back on demand.
@@ -2756,7 +2759,6 @@ static void lru_add_page_tail(struct page *head, struct 
page *tail,
                struct lruvec *lruvec, struct list_head *list)
 {
        VM_BUG_ON_PAGE(!PageHead(head), head);
-       VM_BUG_ON_PAGE(PageCompound(tail), head);
        VM_BUG_ON_PAGE(PageLRU(tail), head);
        lockdep_assert_held(&lruvec->lru_lock);
 
@@ -2777,7 +2779,8 @@ static void lru_add_page_tail(struct page *head, struct 
page *tail,
 }
 
 static void __split_huge_page_tail(struct folio *folio, int tail,
-               struct lruvec *lruvec, struct list_head *list)
+               struct lruvec *lruvec, struct list_head *list,
+               unsigned int new_order)
 {
        struct page *head = &folio->page;
        struct page *page_tail = head + tail;
@@ -2847,10 +2850,15 @@ static void __split_huge_page_tail(struct folio *folio, 
int tail,
         * which needs correct compound_head().
         */
        clear_compound_head(page_tail);
+       if (new_order) {
+               prep_compound_page(page_tail, new_order);
+               folio_prep_large_rmappable(page_folio(page_tail));
+       }
 
        /* Finally unfreeze refcount. Additional reference from page cache. */
-       page_ref_unfreeze(page_tail, 1 + (!folio_test_anon(folio) ||
-                                         folio_test_swapcache(folio)));
+       page_ref_unfreeze(page_tail,
+               1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ?
+                            folio_nr_pages(page_folio(page_tail)) : 0));
 
        if (folio_test_young(folio))
                folio_set_young(new_folio);
@@ -2868,7 +2876,7 @@ static void __split_huge_page_tail(struct folio *folio, 
int tail,
 }
 
 static void __split_huge_page(struct page *page, struct list_head *list,
-               pgoff_t end)
+               pgoff_t end, unsigned int new_order)
 {
        struct folio *folio = page_folio(page);
        struct page *head = &folio->page;
@@ -2877,10 +2885,11 @@ static void __split_huge_page(struct page *page, struct 
list_head *list,
        unsigned long offset = 0;
        unsigned int nr = thp_nr_pages(head);
        int i, nr_dropped = 0;
+       unsigned int new_nr = 1 << new_order;
        int order = folio_order(folio);
 
        /* complete memcg works before add pages to LRU */
-       split_page_memcg(head, order, 0);
+       split_page_memcg(head, order, new_order);
 
        if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
                offset = swp_offset(folio->swap);
@@ -2893,8 +2902,8 @@ static void __split_huge_page(struct page *page, struct 
list_head *list,
 
        ClearPageHasHWPoisoned(head);
 
-       for (i = nr - 1; i >= 1; i--) {
-               __split_huge_page_tail(folio, i, lruvec, list);
+       for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
+               __split_huge_page_tail(folio, i, lruvec, list, new_order);
                /* Some pages can be beyond EOF: drop them from page cache */
                if (head[i].index >= end) {
                        struct folio *tail = page_folio(head + i);
@@ -2910,29 +2919,41 @@ static void __split_huge_page(struct page *page, struct 
list_head *list,
                        __xa_store(&head->mapping->i_pages, head[i].index,
                                        head + i, 0);
                } else if (swap_cache) {
+                       /*
+                        * split anonymous THPs (including swapped out ones) to
+                        * non-zero order not supported
+                        */
+                       VM_WARN_ONCE(new_order,
+                               "Split swap-cached anon folio to non-0 order 
not supported");
                        __xa_store(&swap_cache->i_pages, offset + i,
                                        head + i, 0);
                }
        }
 
-       ClearPageCompound(head);
+       if (!new_order)
+               ClearPageCompound(head);
+       else {
+               struct folio *new_folio = (struct folio *)head;
+
+               folio_set_order(new_folio, new_order);
+       }
        unlock_page_lruvec(lruvec);
        /* Caller disabled irqs, so they are still disabled here */
 
-       split_page_owner(head, order, 0);
+       split_page_owner(head, order, new_order);
 
        /* See comment in __split_huge_page_tail() */
        if (PageAnon(head)) {
                /* Additional pin to swap cache */
                if (PageSwapCache(head)) {
-                       page_ref_add(head, 2);
+                       page_ref_add(head, 1 + new_nr);
                        xa_unlock(&swap_cache->i_pages);
                } else {
                        page_ref_inc(head);
                }
        } else {
                /* Additional pin to page cache */
-               page_ref_add(head, 2);
+               page_ref_add(head, 1 + new_nr);
                xa_unlock(&head->mapping->i_pages);
        }
        local_irq_enable();
@@ -2944,7 +2965,15 @@ static void __split_huge_page(struct page *page, struct 
list_head *list,
        if (folio_test_swapcache(folio))
                split_swap_cluster(folio->swap);
 
-       for (i = 0; i < nr; i++) {
+       /*
+        * set page to its compound_head when split to non order-0 pages, so
+        * we can skip unlocking it below, since PG_locked is transferred to
+        * the compound_head of the page and the caller will unlock it.
+        */
+       if (new_order)
+               page = compound_head(page);
+
+       for (i = 0; i < nr; i += new_nr) {
                struct page *subpage = head + i;
                if (subpage == page)
                        continue;
@@ -2978,29 +3007,35 @@ bool can_split_folio(struct folio *folio, int 
*pextra_pins)
 }
 
 /*
- * This function splits huge page into normal pages. @page can point to any
- * subpage of huge page to split. Split doesn't change the position of @page.
+ * This function splits huge page into pages in @new_order. @page can point to
+ * any subpage of huge page to split. Split doesn't change the position of
+ * @page.
+ *
+ * NOTE: order-1 folio is not supported because _deferred_list, which is used
+ * by partially mapped folios, is stored in subpage 2 and an order-1 folio
+ * only has subpage 0 and 1.
  *
  * Only caller must hold pin on the @page, otherwise split fails with -EBUSY.
  * The huge page must be locked.
  *
  * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
  *
- * Both head page and tail pages will inherit mapping, flags, and so on from
- * the hugepage.
+ * Pages in new_order will inherit mapping, flags, and so on from the hugepage.
  *
- * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
- * they are not mapped.
+ * GUP pin and PG_locked transferred to @page or the compound page @page 
belongs
+ * to. Rest subpages can be freed if they are not mapped.
  *
  * Returns 0 if the hugepage is split successfully.
  * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
  * us.
  */
-int split_huge_page_to_list(struct page *page, struct list_head *list)
+int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+                                    unsigned int new_order)
 {
        struct folio *folio = page_folio(page);
        struct deferred_split *ds_queue = get_deferred_split_queue(folio);
-       XA_STATE(xas, &folio->mapping->i_pages, folio->index);
+       /* reset xarray order to new order after split */
+       XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
        struct anon_vma *anon_vma = NULL;
        struct address_space *mapping = NULL;
        int extra_pins, ret;
@@ -3010,6 +3045,26 @@ int split_huge_page_to_list(struct page *page, struct 
list_head *list)
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
 
+       /* Cannot split THP to order-1 (no order-1 THPs) */
+       if (new_order == 1) {
+               VM_WARN_ONCE(1, "Cannot split to order-1 folio");
+               return -EINVAL;
+       }
+
+       if (new_order) {
+               /* Split shmem folio to non-zero order not supported */
+               if (shmem_mapping(folio->mapping)) {
+                       VM_WARN_ONCE(1, "Split shmem folio to non-0 order not 
support");
+                       return -EINVAL;
+               }
+               /* No split if the file system does not support large folio */
+               if (!mapping_large_folio_support(folio->mapping)) {
+                       VM_WARN_ONCE(1, "Split file folio to non-0 order not 
support");
+                       return -EINVAL;
+               }
+       }
+
+
        is_hzp = is_huge_zero_page(&folio->page);
        if (is_hzp) {
                pr_warn_ratelimited("Called split_huge_page for huge zero 
page\n");
@@ -3105,14 +3160,21 @@ int split_huge_page_to_list(struct page *page, struct 
list_head *list)
        if (folio_ref_freeze(folio, 1 + extra_pins)) {
                if (!list_empty(&folio->_deferred_list)) {
                        ds_queue->split_queue_len--;
-                       list_del(&folio->_deferred_list);
+                       /*
+                        * Reinitialize page_deferred_list after removing the
+                        * page from the split_queue, otherwise a subsequent
+                        * split will see list corruption when checking the
+                        * page_deferred_list.
+                        */
+                       list_del_init(&folio->_deferred_list);
                }
                spin_unlock(&ds_queue->split_queue_lock);
                if (mapping) {
                        int nr = folio_nr_pages(folio);
 
                        xas_split(&xas, folio, folio_order(folio));
-                       if (folio_test_pmd_mappable(folio)) {
+                       if (folio_test_pmd_mappable(folio) &&
+                           new_order < HPAGE_PMD_ORDER) {
                                if (folio_test_swapbacked(folio)) {
                                        __lruvec_stat_mod_folio(folio,
                                                        NR_SHMEM_THPS, -nr);
@@ -3124,7 +3186,7 @@ int split_huge_page_to_list(struct page *page, struct 
list_head *list)
                        }
                }
 
-               __split_huge_page(page, list, end);
+               __split_huge_page(page, list, end, new_order);
                ret = 0;
        } else {
                spin_unlock(&ds_queue->split_queue_lock);
-- 
2.43.0


Reply via email to