On 2/26/26 04:23, Nico Pache wrote:
> generalize the order of the __collapse_huge_page_* functions
> to support future mTHP collapse.
> 
> mTHP collapse will not honor the khugepaged_max_ptes_shared or
> khugepaged_max_ptes_swap parameters, and will fail if it encounters a
> shared or swapped entry.
> 
> No functional changes in this patch.
> 
> Reviewed-by: Wei Yang <[email protected]>
> Reviewed-by: Lance Yang <[email protected]>
> Reviewed-by: Lorenzo Stoakes <[email protected]>
> Reviewed-by: Baolin Wang <[email protected]>
> Co-developed-by: Dev Jain <[email protected]>
> Signed-off-by: Dev Jain <[email protected]>
> Signed-off-by: Nico Pache <[email protected]>
> ---
>  mm/khugepaged.c | 73 +++++++++++++++++++++++++++++++------------------
>  1 file changed, 47 insertions(+), 26 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index a9b645402b7f..ecdbbf6a01a6 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -535,7 +535,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
>  
>  static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct 
> *vma,
>               unsigned long start_addr, pte_t *pte, struct collapse_control 
> *cc,
> -             struct list_head *compound_pagelist)
> +             unsigned int order, struct list_head *compound_pagelist)
>  {
>       struct page *page = NULL;
>       struct folio *folio = NULL;
> @@ -543,15 +543,17 @@ static enum scan_result 
> __collapse_huge_page_isolate(struct vm_area_struct *vma,
>       pte_t *_pte;
>       int none_or_zero = 0, shared = 0, referenced = 0;
>       enum scan_result result = SCAN_FAIL;
> +     const unsigned long nr_pages = 1UL << order;
> +     int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - 
> order);

It might be a bit more readable to move "const unsigned long
nr_pages = 1UL << order;" all the way to the top.

Then, have here

        int max_ptes_none = 0;

and do at the beginning of the function:

        /* For MADV_COLLAPSE, we always collapse ... */
        if (!cc->is_khugepaged)
                max_ptes_none = HPAGE_PMD_NR;
        /*  ... except if userfaultf relies on MISSING faults. */
        if (!userfaultfd_armed(vma))
                max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - 
order);

(but see below regarding helper function)

then the code below becomes ...

>  
> -     for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> +     for (_pte = pte; _pte < pte + nr_pages;
>            _pte++, addr += PAGE_SIZE) {
>               pte_t pteval = ptep_get(_pte);
>               if (pte_none_or_zero(pteval)) {
>                       ++none_or_zero;
>                       if (!userfaultfd_armed(vma) &&
>                           (!cc->is_khugepaged ||
> -                          none_or_zero <= khugepaged_max_ptes_none)) {
> +                          none_or_zero <= max_ptes_none)) {

...

        if (none_or_zero <= max_ptes_none) {


I see that you do something like that (but slightly different) in the next
patch. You could easily extend the above by it.

Or go one step further and move all of that conditional into 
collapse_max_ptes_none(), whereby
you simply also pass the cc and the vma.

Then this all gets cleaned up and you'd end up above with

max_ptes_none = collapse_max_ptes_none(cc, vma, order);
if (max_ptes_none < 0)
        return result;

I'd do all that in this patch here, getting rid of #4.


>                               continue;
>                       } else {
>                               result = SCAN_EXCEED_NONE_PTE;
> @@ -585,8 +587,14 @@ static enum scan_result 
> __collapse_huge_page_isolate(struct vm_area_struct *vma,
>               /* See collapse_scan_pmd(). */
>               if (folio_maybe_mapped_shared(folio)) {
>                       ++shared;
> -                     if (cc->is_khugepaged &&
> -                         shared > khugepaged_max_ptes_shared) {
> +                     /*
> +                      * TODO: Support shared pages without leading to further
> +                      * mTHP collapses. Currently bringing in new pages via
> +                      * shared may cause a future higher order collapse on a
> +                      * rescan of the same range.
> +                      */
> +                     if (!is_pmd_order(order) || (cc->is_khugepaged &&
> +                         shared > khugepaged_max_ptes_shared)) {

That's not how we indent within a nested ().

To make this easier to read, what about similarly having at the beginning
of the function:

int max_ptes_shared = 0;

/* For MADV_COLLAPSE, we always collapse. */
if (cc->is_khugepaged)
        max_ptes_none = HPAGE_PMD_NR;
/* TODO ... */
if (is_pmd_order(order))
        max_ptes_none = khugepaged_max_ptes_shared;

to turn this code into a

        if (shared > khugepaged_max_ptes_shared)

Also, here, might make sense to have a collapse_max_ptes_swap(cc, order)
to do that and clean it up.


>                               result = SCAN_EXCEED_SHARED_PTE;
>                               count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
>                               goto out;
> @@ -679,18 +687,18 @@ static enum scan_result 
> __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  }
>  
>  static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> -                                             struct vm_area_struct *vma,
> -                                             unsigned long address,
> -                                             spinlock_t *ptl,
> -                                             struct list_head 
> *compound_pagelist)
> +             struct vm_area_struct *vma, unsigned long address,
> +             spinlock_t *ptl, unsigned int order,
> +             struct list_head *compound_pagelist)
>  {
> -     unsigned long end = address + HPAGE_PMD_SIZE;
> +     unsigned long end = address + (PAGE_SIZE << order);
>       struct folio *src, *tmp;
>       pte_t pteval;
>       pte_t *_pte;
>       unsigned int nr_ptes;
> +     const unsigned long nr_pages = 1UL << order;

Move it further to the top.

>  
> -     for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
> +     for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
>            address += nr_ptes * PAGE_SIZE) {
>               nr_ptes = 1;
>               pteval = ptep_get(_pte);
> @@ -743,13 +751,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t 
> *pte,
>  }
>  
>  static void __collapse_huge_page_copy_failed(pte_t *pte,
> -                                          pmd_t *pmd,
> -                                          pmd_t orig_pmd,
> -                                          struct vm_area_struct *vma,
> -                                          struct list_head 
> *compound_pagelist)
> +             pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> +             unsigned int order, struct list_head *compound_pagelist)
>  {
>       spinlock_t *pmd_ptl;
> -
> +     const unsigned long nr_pages = 1UL << order;
>       /*
>        * Re-establish the PMD to point to the original page table
>        * entry. Restoring PMD needs to be done prior to releasing
> @@ -763,7 +769,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
>        * Release both raw and compound pages isolated
>        * in __collapse_huge_page_isolate.
>        */
> -     release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
> +     release_pte_pages(pte, pte + nr_pages, compound_pagelist);
>  }
>  
>  /*
> @@ -783,16 +789,16 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
>   */
>  static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio 
> *folio,
>               pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> -             unsigned long address, spinlock_t *ptl,
> +             unsigned long address, spinlock_t *ptl, unsigned int order,
>               struct list_head *compound_pagelist)
>  {
>       unsigned int i;
>       enum scan_result result = SCAN_SUCCEED;
> -
> +     const unsigned long nr_pages = 1UL << order;

Same here, all the way to the top.

>       /*
>        * Copying pages' contents is subject to memory poison at any iteration.
>        */
> -     for (i = 0; i < HPAGE_PMD_NR; i++) {
> +     for (i = 0; i < nr_pages; i++) {
>               pte_t pteval = ptep_get(pte + i);
>               struct page *page = folio_page(folio, i);
>               unsigned long src_addr = address + i * PAGE_SIZE;
> @@ -811,10 +817,10 @@ static enum scan_result __collapse_huge_page_copy(pte_t 
> *pte, struct folio *foli
>  
>       if (likely(result == SCAN_SUCCEED))
>               __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
> -                                                 compound_pagelist);
> +                                                 order, compound_pagelist);
>       else
>               __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
> -                                              compound_pagelist);
> +                                              order, compound_pagelist);
>  
>       return result;
>  }
> @@ -985,12 +991,12 @@ static enum scan_result check_pmd_still_valid(struct 
> mm_struct *mm,
>   * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
>   */
>  static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
> -             struct vm_area_struct *vma, unsigned long start_addr, pmd_t 
> *pmd,
> -             int referenced)
> +             struct vm_area_struct *vma, unsigned long start_addr,
> +             pmd_t *pmd, int referenced, unsigned int order)
>  {
>       int swapped_in = 0;
>       vm_fault_t ret = 0;
> -     unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
> +     unsigned long addr, end = start_addr + (PAGE_SIZE << order);
>       enum scan_result result;
>       pte_t *pte = NULL;
>       spinlock_t *ptl;
> @@ -1022,6 +1028,19 @@ static enum scan_result 
> __collapse_huge_page_swapin(struct mm_struct *mm,
>                   pte_present(vmf.orig_pte))
>                       continue;
>  
> +             /*
> +              * TODO: Support swapin without leading to further mTHP
> +              * collapses. Currently bringing in new pages via swapin may
> +              * cause a future higher order collapse on a rescan of the same
> +              * range.
> +              */
> +             if (!is_pmd_order(order)) {
> +                     pte_unmap(pte);
> +                     mmap_read_unlock(mm);
> +                     result = SCAN_EXCEED_SWAP_PTE;
> +                     goto out;
> +             }
> +

Interesting, we just swapin everything we find :)

But do we really need this check here? I mean, we just found it to be present.

In the rare event that there was a race, do we really care? It was just
present, now it's swapped. Bad luck. Just swap it in.

-- 
Cheers,

David

Reply via email to