On 3/12/26 21:32, David Hildenbrand (Arm) wrote: > On 2/26/26 04:23, Nico Pache wrote: >> generalize the order of the __collapse_huge_page_* functions >> to support future mTHP collapse. >> >> mTHP collapse will not honor the khugepaged_max_ptes_shared or >> khugepaged_max_ptes_swap parameters, and will fail if it encounters a >> shared or swapped entry. >> >> No functional changes in this patch. >> >> Reviewed-by: Wei Yang <[email protected]> >> Reviewed-by: Lance Yang <[email protected]> >> Reviewed-by: Lorenzo Stoakes <[email protected]> >> Reviewed-by: Baolin Wang <[email protected]> >> Co-developed-by: Dev Jain <[email protected]> >> Signed-off-by: Dev Jain <[email protected]> >> Signed-off-by: Nico Pache <[email protected]> >> --- >> mm/khugepaged.c | 73 +++++++++++++++++++++++++++++++------------------ >> 1 file changed, 47 insertions(+), 26 deletions(-) >> >> diff --git a/mm/khugepaged.c b/mm/khugepaged.c >> index a9b645402b7f..ecdbbf6a01a6 100644 >> --- a/mm/khugepaged.c >> +++ b/mm/khugepaged.c >> @@ -535,7 +535,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, >> >> static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct >> *vma, >> unsigned long start_addr, pte_t *pte, struct collapse_control >> *cc, >> - struct list_head *compound_pagelist) >> + unsigned int order, struct list_head *compound_pagelist) >> { >> struct page *page = NULL; >> struct folio *folio = NULL; >> @@ -543,15 +543,17 @@ static enum scan_result >> __collapse_huge_page_isolate(struct vm_area_struct *vma, >> pte_t *_pte; >> int none_or_zero = 0, shared = 0, referenced = 0; >> enum scan_result result = SCAN_FAIL; >> + const unsigned long nr_pages = 1UL << order; >> + int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - >> order); > > It might be a bit more readable to move "const unsigned long > nr_pages = 1UL << order;" all the way to the top. 
> > Then, have here > > int max_ptes_none = 0; > > and do at the beginning of the function: > > /* For MADV_COLLAPSE, we always collapse ... */ > if (!cc->is_khugepaged) > max_ptes_none = HPAGE_PMD_NR; > /* ... except if userfaultfd relies on MISSING faults. */ > if (!userfaultfd_armed(vma)) > max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - > order); > > (but see below regarding helper function) > > then the code below becomes ... > >> >> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; >> + for (_pte = pte; _pte < pte + nr_pages; >> _pte++, addr += PAGE_SIZE) { >> pte_t pteval = ptep_get(_pte); >> if (pte_none_or_zero(pteval)) { >> ++none_or_zero; >> if (!userfaultfd_armed(vma) && >> (!cc->is_khugepaged || >> - none_or_zero <= khugepaged_max_ptes_none)) { > > ... > > if (none_or_zero <= max_ptes_none) { > > > I see that you do something like that (but slightly different) in the next > patch. You could easily extend the above by it. > > Or go one step further and move all of that conditional into > collapse_max_ptes_none(), whereby > you simply also pass the cc and the vma. > > Then this all gets cleaned up and you'd end up above with > > max_ptes_none = collapse_max_ptes_none(cc, vma, order); > if (max_ptes_none < 0) > return result; > > I'd do all that in this patch here, getting rid of #4. > > >> continue; >> } else { >> result = SCAN_EXCEED_NONE_PTE; >> @@ -585,8 +587,14 @@ static enum scan_result >> __collapse_huge_page_isolate(struct vm_area_struct *vma, >> /* See collapse_scan_pmd(). */ >> if (folio_maybe_mapped_shared(folio)) { >> ++shared; >> + /* >> + * TODO: Support shared pages without leading to further >> + * mTHP collapses. Currently bringing in new pages via >> + * shared may cause a future higher order collapse on a >> + * rescan of the same range.
>> + */ >> + if (!is_pmd_order(order) || (cc->is_khugepaged && >> + shared > khugepaged_max_ptes_shared)) { > > That's not how we indent within a nested (). > > To make this easier to read, what about similarly having at the beginning > of the function: > > int max_ptes_shared = 0; > > /* For MADV_COLLAPSE, we always collapse. */ > if (!cc->is_khugepaged) > max_ptes_shared = HPAGE_PMD_NR; > /* TODO ... */ > if (is_pmd_order(order)) > max_ptes_shared = khugepaged_max_ptes_shared; > > to turn this code into a > > if (shared > max_ptes_shared) > > Also, here, might make sense to have a collapse_max_ptes_shared(cc, order) > to do that and clean it up. > > >> result = SCAN_EXCEED_SHARED_PTE; >> count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); >> goto out; >> @@ -679,18 +687,18 @@ static enum scan_result >> __collapse_huge_page_isolate(struct vm_area_struct *vma, >> } >> >> static void __collapse_huge_page_copy_succeeded(pte_t *pte, >> - struct vm_area_struct *vma, >> - unsigned long address, >> - spinlock_t *ptl, >> - struct list_head >> *compound_pagelist) >> + struct vm_area_struct *vma, unsigned long address, >> + spinlock_t *ptl, unsigned int order, >> + struct list_head *compound_pagelist) >> { >> - unsigned long end = address + HPAGE_PMD_SIZE; >> + unsigned long end = address + (PAGE_SIZE << order); >> struct folio *src, *tmp; >> pte_t pteval; >> pte_t *_pte; >> unsigned int nr_ptes; >> + const unsigned long nr_pages = 1UL << order; > > Move it further to the top. 
> >> >> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes, >> + for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes, >> address += nr_ptes * PAGE_SIZE) { >> nr_ptes = 1; >> pteval = ptep_get(_pte); >> @@ -743,13 +751,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t >> *pte, >> } >> >> static void __collapse_huge_page_copy_failed(pte_t *pte, >> - pmd_t *pmd, >> - pmd_t orig_pmd, >> - struct vm_area_struct *vma, >> - struct list_head >> *compound_pagelist) >> + pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, >> + unsigned int order, struct list_head *compound_pagelist) >> { >> spinlock_t *pmd_ptl; >> - >> + const unsigned long nr_pages = 1UL << order; >> /* >> * Re-establish the PMD to point to the original page table >> * entry. Restoring PMD needs to be done prior to releasing >> @@ -763,7 +769,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte, >> * Release both raw and compound pages isolated >> * in __collapse_huge_page_isolate. >> */ >> - release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist); >> + release_pte_pages(pte, pte + nr_pages, compound_pagelist); >> } >> >> /* >> @@ -783,16 +789,16 @@ static void __collapse_huge_page_copy_failed(pte_t >> *pte, >> */ >> static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio >> *folio, >> pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, >> - unsigned long address, spinlock_t *ptl, >> + unsigned long address, spinlock_t *ptl, unsigned int order, >> struct list_head *compound_pagelist) >> { >> unsigned int i; >> enum scan_result result = SCAN_SUCCEED; >> - >> + const unsigned long nr_pages = 1UL << order; > > Same here, all the way to the top. > >> /* >> * Copying pages' contents is subject to memory poison at any iteration. 
>> */ >> - for (i = 0; i < HPAGE_PMD_NR; i++) { >> + for (i = 0; i < nr_pages; i++) { >> pte_t pteval = ptep_get(pte + i); >> struct page *page = folio_page(folio, i); >> unsigned long src_addr = address + i * PAGE_SIZE; >> @@ -811,10 +817,10 @@ static enum scan_result >> __collapse_huge_page_copy(pte_t *pte, struct folio *foli >> >> if (likely(result == SCAN_SUCCEED)) >> __collapse_huge_page_copy_succeeded(pte, vma, address, ptl, >> - compound_pagelist); >> + order, compound_pagelist); >> else >> __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma, >> - compound_pagelist); >> + order, compound_pagelist); >> >> return result; >> } >> @@ -985,12 +991,12 @@ static enum scan_result check_pmd_still_valid(struct >> mm_struct *mm, >> * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. >> */ >> static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm, >> - struct vm_area_struct *vma, unsigned long start_addr, pmd_t >> *pmd, >> - int referenced) >> + struct vm_area_struct *vma, unsigned long start_addr, >> + pmd_t *pmd, int referenced, unsigned int order) >> { >> int swapped_in = 0; >> vm_fault_t ret = 0; >> - unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE); >> + unsigned long addr, end = start_addr + (PAGE_SIZE << order); >> enum scan_result result; >> pte_t *pte = NULL; >> spinlock_t *ptl; >> @@ -1022,6 +1028,19 @@ static enum scan_result >> __collapse_huge_page_swapin(struct mm_struct *mm, >> pte_present(vmf.orig_pte)) >> continue; >> >> + /* >> + * TODO: Support swapin without leading to further mTHP >> + * collapses. Currently bringing in new pages via swapin may >> + * cause a future higher order collapse on a rescan of the same >> + * range. >> + */ >> + if (!is_pmd_order(order)) { >> + pte_unmap(pte); >> + mmap_read_unlock(mm); >> + result = SCAN_EXCEED_SWAP_PTE; >> + goto out; >> + } >> + > > Interesting, we just swapin everything we find :) > > But do we really need this check here? 
I mean, we just found it to be present. > > In the rare event that there was a race, do we really care? It was just > present, now it's swapped. Bad luck. Just swap it in. >
Okay, now I am confused. Why are you not taking care of collapse_scan_pmd() in the same context? Because if you make sure that we properly check against a max_ptes_swap, in a style similar to the above, we'd rule out swapin right from the start? Also, I would expect that all other parameters in there are similarly handled? -- Cheers, David
