The branch main has been updated by olce:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=d440953942372ca275d0743a6e220631bde440ee

commit d440953942372ca275d0743a6e220631bde440ee
Author:     Olivier Certner <o...@freebsd.org>
AuthorDate: 2025-07-07 20:29:12 +0000
Commit:     Olivier Certner <o...@freebsd.org>
CommitDate: 2025-09-09 07:56:45 +0000

    vm_domainset: Only probe domains once when iterating, instead of up to 4 times
    
    Because of the 'di_minskip' logic, which resets the initial domain, an
    iterator starts by considering only domains that have more than
    'free_min' pages in a first phase, and then all domains in a second one.
    Non-"underpaged" domains are thus examined twice, even if the allocation
    can't succeed.
    
    Re-scanning the same domains twice just wastes time: allocation
    attempts that must not wait may rely on failing sooner, and those that
    must wait will loop anyway (a domain previously scanned twice has more
    pages than 'free_min' and consequently vm_wait_doms() will just return
    immediately).
    
    Additionally, the DOMAINSET_POLICY_FIRSTTOUCH policy would aggravate
    this situation by reexamining the current domain again at the end of
    each phase.  In the case of a single domain, this means doubling again
    the number of times domain 0 is probed.
    
    The implementation adds two 'domainset_t' fields to 'struct
    vm_domainset_iter' (and removes the 'di_n' counter).  The first,
    'di_remain_mask', contains the domains still to be explored in the
    current phase: the first phase concerns only domains with more pages
    than 'free_min' ('di_minskip' true), and the second concerns only
    domains previously found under 'free_min' ('di_minskip' false).  The
    second, 'di_min_mask', holds the domains with fewer pages than
    'free_min' encountered during the first phase, and serves as the reset
    value for 'di_remain_mask' when transitioning to the second phase.
    
    PR:             277476
    Fixes:          e5818a53dbd2 ("Implement several enhancements to NUMA policies.")
    Fixes:          23984ce5cd24 ("Avoid resource deadlocks when one domain has exhausted its memory."...)
    MFC after:      10 days
    Sponsored by:   The FreeBSD Foundation
    Differential Revision:  https://reviews.freebsd.org/D51249
---
 sys/vm/vm_domainset.c | 53 ++++++++++++++++++++++++++++++---------------------
 sys/vm/vm_domainset.h |  6 +++++-
 2 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c
index b44bdb96b0d4..bd15449559a5 100644
--- a/sys/vm/vm_domainset.c
+++ b/sys/vm/vm_domainset.c
@@ -131,7 +131,8 @@ static void
 vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
 {
 
-       KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n));
+       KASSERT(!DOMAINSET_EMPTY(&di->di_remain_mask),
+           ("%s: Already iterated on all domains", __func__));
        switch (di->di_policy) {
        case DOMAINSET_POLICY_FIRSTTOUCH:
                /*
@@ -161,37 +162,39 @@ vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
        switch (di->di_policy) {
        case DOMAINSET_POLICY_FIRSTTOUCH:
                *domain = PCPU_GET(domain);
-               if (DOMAINSET_ISSET(*domain, &di->di_valid_mask)) {
-                       /*
-                        * Add an extra iteration because we will visit the
-                        * current domain a second time in the rr iterator.
-                        */
-                       di->di_n = di->di_domain->ds_cnt + 1;
+               if (DOMAINSET_ISSET(*domain, &di->di_valid_mask))
                        break;
-               }
                /*
                 * To prevent impossible allocations we convert an invalid
                 * first-touch to round-robin.
                 */
                /* FALLTHROUGH */
        case DOMAINSET_POLICY_ROUNDROBIN:
-               di->di_n = di->di_domain->ds_cnt;
                vm_domainset_iter_rr(di, domain);
                break;
        case DOMAINSET_POLICY_PREFER:
                *domain = di->di_domain->ds_prefer;
-               di->di_n = di->di_domain->ds_cnt;
                break;
        case DOMAINSET_POLICY_INTERLEAVE:
                vm_domainset_iter_interleave(di, domain);
-               di->di_n = di->di_domain->ds_cnt;
                break;
        default:
                panic("%s: Unknown policy %d", __func__, di->di_policy);
        }
-       KASSERT(di->di_n > 0, ("%s: Invalid n %d", __func__, di->di_n));
        KASSERT(*domain < vm_ndomains,
            ("%s: Invalid domain %d", __func__, *domain));
+
+       /* Initialize the mask of domains to visit. */
+       if (di->di_minskip) {
+               /* Phase 1: Skip domains under 'v_free_min'. */
+               DOMAINSET_COPY(&di->di_valid_mask, &di->di_remain_mask);
+               DOMAINSET_ZERO(&di->di_min_mask);
+       } else
+               /* Phase 2: Browse domains that were under 'v_free_min'. */
+               DOMAINSET_COPY(&di->di_min_mask, &di->di_remain_mask);
+
+       /* Mark first domain as seen. */
+       DOMAINSET_CLR(*domain, &di->di_remain_mask);
 }
 
 void
@@ -225,12 +228,15 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
        if (__predict_false(DOMAINSET_EMPTY(&di->di_valid_mask)))
                return (ENOMEM);
 
-       /* If there are more domains to visit we run the iterator. */
-       while (--di->di_n != 0) {
+       /* If there are more domains to visit in this phase, run the iterator. */
+       while (!DOMAINSET_EMPTY(&di->di_remain_mask)) {
                vm_domainset_iter_next(di, domain);
-               if (DOMAINSET_ISSET(*domain, &di->di_valid_mask) &&
-                   (!di->di_minskip || !vm_page_count_min_domain(*domain)))
-                       return (0);
+               if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) {
+                       DOMAINSET_CLR(*domain, &di->di_remain_mask);
+                       if (!di->di_minskip || !vm_page_count_min_domain(*domain))
+                               return (0);
+                       DOMAINSET_SET(*domain, &di->di_min_mask);
+               }
        }
 
        /* If we skipped domains below min restart the search. */
@@ -298,12 +304,15 @@ vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
        if (DOMAINSET_EMPTY(&di->di_valid_mask))
                return (ENOMEM);
 
-       /* If there are more domains to visit we run the iterator. */
-       while (--di->di_n != 0) {
+       /* If there are more domains to visit in this phase, run the iterator. */
+       while (!DOMAINSET_EMPTY(&di->di_remain_mask)) {
                vm_domainset_iter_next(di, domain);
-               if (DOMAINSET_ISSET(*domain, &di->di_valid_mask) &&
-                   (!di->di_minskip || !vm_page_count_min_domain(*domain)))
-                       return (0);
+               if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) {
+                       DOMAINSET_CLR(*domain, &di->di_remain_mask);
+                       if (!di->di_minskip || !vm_page_count_min_domain(*domain))
+                               return (0);
+                       DOMAINSET_SET(*domain, &di->di_min_mask);
+               }
        }
 
        /* If we skipped domains below min restart the search. */
diff --git a/sys/vm/vm_domainset.h b/sys/vm/vm_domainset.h
index 0d325a642f40..b223a4d03df9 100644
--- a/sys/vm/vm_domainset.h
+++ b/sys/vm/vm_domainset.h
@@ -33,11 +33,15 @@ struct pctrie_iter;
 struct vm_domainset_iter {
        struct domainset        *di_domain;
        unsigned int            *di_iter;
+       /* Initialized from 'di_domain', initial value after reset. */
        domainset_t             di_valid_mask;
+       /* Domains to browse in the current phase. */
+       domainset_t             di_remain_mask;
+       /* Domains skipped in phase 1 because under 'v_free_min'. */
+       domainset_t             di_min_mask;
        vm_pindex_t             di_offset;
        int                     di_flags;
        uint16_t                di_policy;
-       domainid_t              di_n;
        bool                    di_minskip;
 };
 

Reply via email to