Khugepaged already supports anonymous mTHP collapse. Similarly, let khugepaged also support shmem mTHP collapse. The strategy for shmem mTHP collapse follows that of anonymous mTHP collapse:
Track present pages via a bitmap while scanning PMD ranges for collapse candidates. After the scan completes, use the bitmap to determine the most efficient mTHP order to collapse to. Scale 'max_ptes_none' by the attempted collapse order to determine the minimum fill threshold for eligibility. Similarly, shmem mTHP collapse rejects regions containing swapped-out pages to avoid creep. Currently, collapse_pte_mapped_thp() does not build the mapping for mTHP, because we still expect to establish the mTHP mapping via refault under the control of fault_around. So collapse_pte_mapped_thp() remains responsible only for building the mapping for PMD-sized THP, which is reasonable and makes life easier. Note that we do not need to remove pte page tables for shmem mTHP collapse. Signed-off-by: Baolin Wang <[email protected]> --- mm/khugepaged.c | 115 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 91 insertions(+), 24 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 0c8dfbd48410..818d51915748 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -135,6 +135,10 @@ static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; +static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc, int order); + #ifdef CONFIG_SYSFS static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -1487,6 +1491,7 @@ static unsigned int max_order_from_offset(unsigned int offset) * mTHP. 
*/ static enum scan_result mthp_collapse(struct mm_struct *mm, + struct file *file, pgoff_t start, unsigned long address, int referenced, int unmapped, struct collapse_control *cc, unsigned long enabled_orders) { @@ -1512,8 +1517,12 @@ static enum scan_result mthp_collapse(struct mm_struct *mm, enum scan_result ret; collapse_address = address + offset * PAGE_SIZE; - ret = collapse_huge_page(mm, collapse_address, referenced, - unmapped, cc, order); + if (file) + ret = collapse_file(mm, collapse_address, file, + start + offset, cc, order); + else + ret = collapse_huge_page(mm, collapse_address, + referenced, unmapped, cc, order); switch (ret) { /* Cases where we continue to next collapse candidate */ @@ -1521,6 +1530,7 @@ static enum scan_result mthp_collapse(struct mm_struct *mm, collapsed += nr_ptes; fallthrough; case SCAN_PTE_MAPPED_HUGEPAGE: + case SCAN_PAGE_COMPOUND: goto next_offset; /* Cases where lower orders might still succeed */ case SCAN_ALLOC_HUGE_PAGE_FAIL: @@ -1774,7 +1784,7 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm, if (result == SCAN_SUCCEED) { /* collapse_huge_page expects the lock to be dropped before calling */ mmap_read_unlock(mm); - result = mthp_collapse(mm, start_addr, referenced, + result = mthp_collapse(mm, NULL, 0, start_addr, referenced, unmapped, cc, enabled_orders); /* mmap_lock was released above, set lock_dropped */ *lock_dropped = true; @@ -2306,7 +2316,9 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, if (++nr_none > max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; - count_vm_event(THP_SCAN_EXCEED_NONE_PTE); + if (is_pmd_order(order)) + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE); goto xa_locked; } @@ -2316,6 +2328,19 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, if (xa_is_value(folio) || !folio_test_uptodate(folio)) { xas_unlock_irq(&xas); + + /* + * TODO: Support swapin without 
leading to further mTHP + * collapses. Currently bringing in new pages via swapin may + * cause a future higher order collapse on a rescan of the same + * range. + */ + if (!is_pmd_order(order)) { + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP); + result = SCAN_EXCEED_SWAP_PTE; + goto xa_unlocked; + } + /* swap in or instantiate fallocated page */ if (shmem_get_folio(mapping->host, index, 0, &folio, SGP_NOALLOC)) { @@ -2399,6 +2424,18 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, goto out_unlock; } + /* + * If the folio order is greater than the collapse order, there is + * no need to continue attempting to collapse. + * And should return SCAN_PAGE_COMPOUND instead of SCAN_PTE_MAPPED_HUGEPAGE, + * then we can build the mapping under the control of fault_around + * when refaulting. + */ + if (folio_order(folio) >= order) { + result = SCAN_PAGE_COMPOUND; + goto out_unlock; + } + if (folio_mapping(folio) != mapping) { result = SCAN_TRUNCATED; goto out_unlock; @@ -2621,12 +2658,11 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, xas_unlock_irq(&xas); /* - * Remove pte page tables, so we can re-fault the page as huge. - * If MADV_COLLAPSE, adjust result to call try_collapse_pte_mapped_thp(). + * Remove pte page tables for PMD-sized THP collapse, so we can + * re-fault the page as huge. 
*/ - retract_page_tables(mapping, start); - if (cc && !cc->is_khugepaged) - result = SCAN_PTE_MAPPED_HUGEPAGE; + if (is_pmd_order(order)) + retract_page_tables(mapping, start); folio_unlock(new_folio); /* @@ -2675,22 +2711,35 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, } static enum scan_result collapse_scan_file(struct mm_struct *mm, - unsigned long addr, struct file *file, pgoff_t start, - struct collapse_control *cc) + struct vm_area_struct *vma, unsigned long addr, + struct file *file, pgoff_t start, struct collapse_control *cc) { - const unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL, HPAGE_PMD_ORDER); + enum tva_type tva_flags = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE; + unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL, HPAGE_PMD_ORDER); const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER); - struct folio *folio = NULL; struct address_space *mapping = file->f_mapping; XA_STATE(xas, &mapping->i_pages, start); - int present, swap; - int node = NUMA_NO_NODE; enum scan_result result = SCAN_SUCCEED; + unsigned long enabled_orders, nr_pages; + struct folio *folio = NULL; + int node = NUMA_NO_NODE; + int present, swap; + pgoff_t pgoff; present = 0; swap = 0; + bitmap_zero(cc->mthp_present_ptes, MAX_PTRS_PER_PTE); memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); + + enabled_orders = collapse_possible_orders(vma, vma->vm_flags, tva_flags); + /* + * If PMD is the only enabled order, enforce max_ptes_none, otherwise + * scan all pages to populate the bitmap for mTHP collapse. + */ + if (enabled_orders != BIT(HPAGE_PMD_ORDER)) + max_ptes_none = KHUGEPAGED_MAX_PTES_LIMIT; + rcu_read_lock(); xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, folio)) @@ -2754,7 +2803,17 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm, * is just too costly... 
*/ - present += folio_nr_pages(folio); + nr_pages = folio_nr_pages(folio); + present += nr_pages; + + /* + * If there are folios present, keep track of it in the bitmap + * for file/shmem mTHP collapse. + */ + pgoff = max_t(pgoff_t, start, folio->index) - start; + nr_pages = min_t(int, HPAGE_PMD_NR - pgoff, nr_pages); + bitmap_set(cc->mthp_present_ptes, pgoff, nr_pages); + folio_put(folio); if (need_resched()) { @@ -2768,15 +2827,23 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm, else cc->progress += HPAGE_PMD_NR; - if (result == SCAN_SUCCEED) { - if (present < HPAGE_PMD_NR - max_ptes_none) { - result = SCAN_EXCEED_NONE_PTE; - count_vm_event(THP_SCAN_EXCEED_NONE_PTE); - } else { - result = collapse_file(mm, addr, file, start, cc, HPAGE_PMD_ORDER); - } + if (result != SCAN_SUCCEED) + goto out; + + if (present < HPAGE_PMD_NR - max_ptes_none) { + result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); + count_mthp_stat(HPAGE_PMD_ORDER, + MTHP_STAT_COLLAPSE_EXCEED_NONE); + goto out; } + result = mthp_collapse(mm, file, start, addr, 0, 0, cc, enabled_orders); + if (result == SCAN_SUCCEED && !cc->is_khugepaged) { + /* If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp(). */ + result = SCAN_PTE_MAPPED_HUGEPAGE; + } +out: trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); return result; } @@ -2808,7 +2875,7 @@ static enum scan_result collapse_single_pmd(unsigned long addr, mmap_read_unlock(mm); *lock_dropped = true; retry: - result = collapse_scan_file(mm, addr, file, pgoff, cc); + result = collapse_scan_file(mm, vma, addr, file, pgoff, cc); /* * For MADV_COLLAPSE, when encountering dirty pages, try to writeback, -- 2.47.3

