When a guest reports free pages to the hypervisor via the page reporting framework (used by virtio-balloon and hv_balloon), the host typically zeroes those pages when reclaiming their backing memory. However, when those pages are later allocated in the guest, post_alloc_hook() unconditionally zeroes them again if __GFP_ZERO is set. This double zeroing is wasteful, especially for large pages.
Avoid redundant zeroing: - Add a host_zeroes_pages flag to page_reporting_dev_info, allowing drivers to declare that their host zeros reported pages on reclaim. A static key (page_reporting_host_zeroes) gates the fast path. - Add PG_zeroed page flag (sharing PG_private bit) to mark pages that have been zeroed by the host. Set it on reported pages during allocation from the buddy in page_del_and_expand(). - Thread the zeroed bool through rmqueue -> prep_new_page -> post_alloc_hook, where it skips redundant zeroing for __GFP_ZERO allocations. No driver sets host_zeroes_pages yet; a follow-up patch to virtio_balloon is needed to opt in. Signed-off-by: Michael S. Tsirkin <[email protected]> Assisted-by: Claude:claude-opus-4-6 Assisted-by: cursor-agent:GPT-5.4-xhigh --- include/linux/page-flags.h | 9 +++++ include/linux/page_reporting.h | 3 ++ mm/compaction.c | 6 ++-- mm/internal.h | 2 +- mm/page_alloc.c | 66 +++++++++++++++++++++++----------- mm/page_reporting.c | 14 +++++++- mm/page_reporting.h | 12 +++++++ 7 files changed, 87 insertions(+), 25 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f7a0e4af0c73..eef2499cba8b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -135,6 +135,8 @@ enum pageflags { PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ /* Some filesystems */ PG_checked = PG_owner_priv_1, + /* Page contents are known to be zero */ + PG_zeroed = PG_private, /* * Depending on the way an anonymous folio can be mapped into a page @@ -679,6 +681,13 @@ FOLIO_TEST_CLEAR_FLAG_FALSE(young) FOLIO_FLAG_FALSE(idle) #endif +/* + * PageZeroed() tracks pages known to be zero. The allocator + * uses this to skip redundant zeroing in post_alloc_hook(). + */ +__PAGEFLAG(Zeroed, zeroed, PF_NO_COMPOUND) +#define __PG_ZEROED (1UL << PG_zeroed) + /* * PageReported() is used to track reported free pages within the Buddy * allocator. 
We can use the non-atomic version of the test and set diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h index fe648dfa3a7c..10faadfeb4fb 100644 --- a/include/linux/page_reporting.h +++ b/include/linux/page_reporting.h @@ -13,6 +13,9 @@ struct page_reporting_dev_info { int (*report)(struct page_reporting_dev_info *prdev, struct scatterlist *sg, unsigned int nents); + /* If true, host zeros reported pages on reclaim */ + bool host_zeroes_pages; + /* work struct for processing reports */ struct delayed_work work; diff --git a/mm/compaction.c b/mm/compaction.c index c1039a9373e5..61209cd408ea 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -82,7 +82,8 @@ static inline bool is_via_compact_memory(int order) { return false; } static struct page *mark_allocated_noprof(struct page *page, unsigned int order, gfp_t gfp_flags) { - post_alloc_hook(page, order, __GFP_MOVABLE, USER_ADDR_NONE); + __ClearPageZeroed(page); + post_alloc_hook(page, order, __GFP_MOVABLE, false, USER_ADDR_NONE); set_page_refcounted(page); return page; } @@ -1832,7 +1833,8 @@ static struct folio *compaction_alloc_noprof(struct folio *src, unsigned long da set_page_private(&freepage[size], start_order); } dst = (struct folio *)freepage; - post_alloc_hook(&dst->page, order, __GFP_MOVABLE, USER_ADDR_NONE); + __ClearPageZeroed(&dst->page); + post_alloc_hook(&dst->page, order, __GFP_MOVABLE, false, USER_ADDR_NONE); set_page_refcounted(&dst->page); if (order) prep_compound_page(&dst->page, order); diff --git a/mm/internal.h b/mm/internal.h index 8e4616e42b4a..0600d824ba03 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -889,7 +889,7 @@ static inline void prep_compound_tail(struct page *head, int tail_idx) } void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags, - unsigned long user_addr); + bool zeroed, unsigned long user_addr); extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; diff --git 
a/mm/page_alloc.c b/mm/page_alloc.c index ca4f9c0948af..eff01a819744 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1774,6 +1774,7 @@ static __always_inline void page_del_and_expand(struct zone *zone, bool was_reported = page_reported(page); __del_page_from_free_list(page, zone, high, migratetype); + nr_pages -= expand(zone, page, low, high, migratetype, was_reported); account_freepages(zone, -nr_pages, migratetype); } @@ -1846,8 +1847,10 @@ static inline bool should_skip_init(gfp_t flags) return (flags & __GFP_SKIP_ZERO); } + inline void post_alloc_hook(struct page *page, unsigned int order, - gfp_t gfp_flags, unsigned long user_addr) + gfp_t gfp_flags, bool zeroed, + unsigned long user_addr) { bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); @@ -1856,6 +1859,14 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_private(page, 0); + /* + * If the page is zeroed, skip memory initialization. + * We still need to handle tag zeroing separately since the host + * does not know about memory tags. 
+ */ + if (zeroed && init && !zero_tags) + init = false; + arch_alloc_page(page, order); debug_pagealloc_map_pages(page, 1 << order); @@ -1913,13 +1924,13 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, - unsigned int alloc_flags, - unsigned long user_addr) + unsigned int alloc_flags, bool zeroed, + unsigned long user_addr) { if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); - post_alloc_hook(page, order, gfp_flags, user_addr); + post_alloc_hook(page, order, gfp_flags, zeroed, user_addr); /* * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to @@ -3189,6 +3200,7 @@ int __isolate_free_page(struct page *page, unsigned int order) } del_page_from_free_list(page, zone, order, mt); + __ClearPageZeroed(page); /* * Set the pageblock if the isolated page is at least half of a @@ -3261,7 +3273,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, static __always_inline struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, unsigned int order, unsigned int alloc_flags, - int migratetype) + int migratetype, bool *zeroed) { struct page *page; unsigned long flags; @@ -3296,6 +3308,8 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, } } spin_unlock_irqrestore(&zone->lock, flags); + *zeroed = PageZeroed(page); + __ClearPageZeroed(page); } while (check_new_pages(page, order)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); @@ -3357,10 +3371,9 @@ static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order) /* Remove page from the per-cpu list, caller must protect the list */ static inline struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, - int migratetype, - unsigned int alloc_flags, + int migratetype, unsigned int alloc_flags, struct per_cpu_pages *pcp, - struct list_head *list) + struct list_head *list, bool 
*zeroed) { struct page *page; @@ -3381,6 +3394,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, page = list_first_entry(list, struct page, pcp_list); list_del(&page->pcp_list); pcp->count -= 1 << order; + *zeroed = PageZeroed(page); + __ClearPageZeroed(page); } while (check_new_pages(page, order)); return page; @@ -3389,7 +3404,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct zone *zone, unsigned int order, - int migratetype, unsigned int alloc_flags) + int migratetype, unsigned int alloc_flags, + bool *zeroed) { struct per_cpu_pages *pcp; struct list_head *list; @@ -3408,7 +3424,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, */ pcp->free_count >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; - page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); + page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, + pcp, list, zeroed); pcp_spin_unlock(pcp, UP_flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); @@ -3433,19 +3450,19 @@ static inline struct page *rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags, - int migratetype) + int migratetype, bool *zeroed) { struct page *page; if (likely(pcp_allowed_order(order))) { page = rmqueue_pcplist(preferred_zone, zone, order, - migratetype, alloc_flags); + migratetype, alloc_flags, zeroed); if (likely(page)) goto out; } page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags, - migratetype); + migratetype, zeroed); out: /* Separate test+clear to avoid unnecessary atomics */ @@ -3836,6 +3853,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, struct pglist_data *last_pgdat = NULL; bool last_pgdat_dirty_ok = false; bool no_fallback; + bool zeroed; bool 
skip_kswapd_nodes = nr_online_nodes > 1; bool skipped_kswapd_nodes = false; @@ -3980,10 +3998,11 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, try_this_zone: page = rmqueue(zonelist_zone(ac->preferred_zoneref), zone, order, - gfp_mask, alloc_flags, ac->migratetype); + gfp_mask, alloc_flags, ac->migratetype, + &zeroed); if (page) { prep_new_page(page, order, gfp_mask, alloc_flags, - ac->user_addr); + zeroed, ac->user_addr); /* * If this is a high-order atomic allocation then check @@ -4217,9 +4236,11 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, count_vm_event(COMPACTSTALL); /* Prep a captured page if available */ - if (page) - prep_new_page(page, order, gfp_mask, alloc_flags, + if (page) { + __ClearPageZeroed(page); + prep_new_page(page, order, gfp_mask, alloc_flags, false, ac->user_addr); + } /* Try get a page from the freelist if available */ if (!page) @@ -5193,6 +5214,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, /* Attempt the batch allocation */ pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; while (nr_populated < nr_pages) { + bool zeroed = false; /* Skip existing pages */ if (page_array[nr_populated]) { @@ -5201,7 +5223,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, } page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, - pcp, pcp_list); + pcp, pcp_list, &zeroed); if (unlikely(!page)) { /* Try and allocate at least one page */ if (!nr_account) { @@ -5212,7 +5234,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, } nr_account++; - prep_new_page(page, 0, gfp, 0, USER_ADDR_NONE); + prep_new_page(page, 0, gfp, 0, zeroed, USER_ADDR_NONE); set_page_refcounted(page); page_array[nr_populated++] = page; } @@ -6983,7 +7005,8 @@ static void split_free_frozen_pages(struct list_head *list, gfp_t gfp_mask) list_for_each_entry_safe(page, next, &list[order], lru) { int i; - post_alloc_hook(page, order, gfp_mask, 
USER_ADDR_NONE); + __ClearPageZeroed(page); + post_alloc_hook(page, order, gfp_mask, false, USER_ADDR_NONE); if (!order) continue; @@ -7188,8 +7211,9 @@ int alloc_contig_frozen_range_noprof(unsigned long start, unsigned long end, } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { struct page *head = pfn_to_page(start); + __ClearPageZeroed(head); check_new_pages(head, order); - prep_new_page(head, order, gfp_mask, 0, USER_ADDR_NONE); + prep_new_page(head, order, gfp_mask, 0, false, USER_ADDR_NONE); } else { ret = -EINVAL; WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n", diff --git a/mm/page_reporting.c b/mm/page_reporting.c index f0042d5743af..6177d2413743 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -50,6 +50,8 @@ EXPORT_SYMBOL_GPL(page_reporting_order); #define PAGE_REPORTING_DELAY (2 * HZ) static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly; +DEFINE_STATIC_KEY_FALSE(page_reporting_host_zeroes); + enum { PAGE_REPORTING_IDLE = 0, PAGE_REPORTING_REQUESTED, @@ -129,8 +131,11 @@ page_reporting_drain(struct page_reporting_dev_info *prdev, * report on the new larger page when we make our way * up to that higher order. 
*/ - if (PageBuddy(page) && buddy_order(page) == order) + if (PageBuddy(page) && buddy_order(page) == order) { __SetPageReported(page); + if (page_reporting_host_zeroes_pages()) + __SetPageZeroed(page); + } } while ((sg = sg_next(sg))); /* reinitialize scatterlist now that it is empty */ @@ -386,6 +391,10 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) /* Assign device to allow notifications */ rcu_assign_pointer(pr_dev_info, prdev); + /* enable zeroed page optimization if host zeroes reported pages */ + if (prdev->host_zeroes_pages) + static_branch_enable(&page_reporting_host_zeroes); + /* enable page reporting notification */ if (!static_key_enabled(&page_reporting_enabled)) { static_branch_enable(&page_reporting_enabled); @@ -410,6 +419,9 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev) /* Flush any existing work, and lock it out */ cancel_delayed_work_sync(&prdev->work); + + if (prdev->host_zeroes_pages) + static_branch_disable(&page_reporting_host_zeroes); } mutex_unlock(&page_reporting_mutex); diff --git a/mm/page_reporting.h b/mm/page_reporting.h index c51dbc228b94..736ea7b37e9e 100644 --- a/mm/page_reporting.h +++ b/mm/page_reporting.h @@ -15,6 +15,13 @@ DECLARE_STATIC_KEY_FALSE(page_reporting_enabled); extern unsigned int page_reporting_order; void __page_reporting_notify(void); +DECLARE_STATIC_KEY_FALSE(page_reporting_host_zeroes); + +static inline bool page_reporting_host_zeroes_pages(void) +{ + return static_branch_unlikely(&page_reporting_host_zeroes); +} + static inline bool page_reported(struct page *page) { return static_branch_unlikely(&page_reporting_enabled) && @@ -46,6 +53,11 @@ static inline void page_reporting_notify_free(unsigned int order) #else /* CONFIG_PAGE_REPORTING */ #define page_reported(_page) false +static inline bool page_reporting_host_zeroes_pages(void) +{ + return false; +} + static inline void page_reporting_notify_free(unsigned int order) { } -- MST

