[RFC 3/4] mm, swap: make SWAP_NR_ORDERS runtime

Ritesh Harjani (IBM) Tue, 09 Jun 2026 06:20:34 -0700

SWAP_NR_ORDERS is currently a compile-time constant, defined as
PMD_ORDER + 1 when CONFIG_THP_SWAP=y and as 1 otherwise.
This patch replaces SWAP_NR_ORDERS with swap_nr_orders() and converts
all the relevant code paths so that the number of orders can be
determined at runtime. This is needed for architectures like powerpc
book3s64, where PMD_ORDER is decided at runtime depending on which MMU
is chosen (Radix / Hash).


Note that if any of the allocations required in swapfile_init() (which
is a subsys_initcall) fails, we have no option but to panic. This is in
line with how memory allocation failures are handled in other
subsys_initcall() functions.

Signed-off-by: Ritesh Harjani (IBM) <[email protected]>
---
 include/linux/swap.h |  10 ++---
 mm/swapfile.c        | 105 ++++++++++++++++++++++++++++++-------------
 2 files changed, 78 insertions(+), 37 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 46c25523d7b8..063ab7c4d4a5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -224,9 +224,9 @@ enum {
 #define SWAP_ENTRY_INVALID     0

 #ifdef CONFIG_THP_SWAP
-#define SWAP_NR_ORDERS         (PMD_ORDER + 1)
+#define swap_nr_orders()       ((unsigned int)(PMD_ORDER + 1))
 #else
-#define SWAP_NR_ORDERS         1
+#define swap_nr_orders()       (1U)
 #endif

 /*
@@ -234,7 +234,7 @@ enum {
  * The purpose is to optimize SWAP throughput on these device.
  */
 struct swap_sequential_cluster {
-       unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
+       DECLARE_FLEX_ARRAY(unsigned int, next); /* Likely next allocation 
offset */
 };

 /*
@@ -250,9 +250,9 @@ struct swap_info_struct {
        struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
        struct list_head free_clusters; /* free clusters list */
        struct list_head full_clusters; /* full clusters list */
-       struct list_head nonfull_clusters[SWAP_NR_ORDERS];
+       struct list_head *nonfull_clusters;
                                        /* list of cluster that contains at 
least one free slot */
-       struct list_head frag_clusters[SWAP_NR_ORDERS];
+       struct list_head *frag_clusters;
                                        /* list of cluster that are fragmented 
or contented */
        unsigned int pages;             /* total of usable pages of swap */
        atomic_long_t inuse_pages;      /* number of those currently in use */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 016a5aa0cb93..0a78802528cf 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -118,16 +118,12 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
 atomic_t nr_rotate_swap = ATOMIC_INIT(0);

 struct percpu_swap_cluster {
-       struct swap_info_struct *si[SWAP_NR_ORDERS];
-       unsigned long offset[SWAP_NR_ORDERS];
+       struct swap_info_struct **si;
+       unsigned long *offset;
        local_lock_t lock;
 };

-static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
-       .si = { NULL },
-       .offset = { SWAP_ENTRY_INVALID },
-       .lock = INIT_LOCAL_LOCK(),
-};
+static struct percpu_swap_cluster __percpu *percpu_swap_cluster;

 unsigned int swap_slots_in_cluster __read_mostly;
 bool swap_table_use_page __read_mostly;
@@ -545,7 +541,7 @@ swap_cluster_populate(struct swap_info_struct *si,
         * Only cluster isolation from the allocator does table allocation.
         * Swap allocator uses percpu clusters and holds the local lock.
         */
-       lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);
+       lockdep_assert_held(&this_cpu_ptr(percpu_swap_cluster)->lock);
        if (!(si->flags & SWP_SOLIDSTATE))
                lockdep_assert_held(&si->global_cluster_lock);
        lockdep_assert_held(&ci->lock);
@@ -562,7 +558,7 @@ swap_cluster_populate(struct swap_info_struct *si,
        spin_unlock(&ci->lock);
        if (!(si->flags & SWP_SOLIDSTATE))
                spin_unlock(&si->global_cluster_lock);
-       local_unlock(&percpu_swap_cluster.lock);
+       local_unlock(&percpu_swap_cluster->lock);

        ret = swap_cluster_alloc_table(ci, __GFP_HIGH | __GFP_NOMEMALLOC |
                                           GFP_KERNEL);
@@ -575,7 +571,7 @@ swap_cluster_populate(struct swap_info_struct *si,
         * could happen with ignoring the percpu cluster is fragmentation,
         * which is acceptable since this fallback and race is rare.
         */
-       local_lock(&percpu_swap_cluster.lock);
+       local_lock(&percpu_swap_cluster->lock);
        if (!(si->flags & SWP_SOLIDSTATE))
                spin_lock(&si->global_cluster_lock);
        spin_lock(&ci->lock);
@@ -1016,8 +1012,10 @@ static unsigned int alloc_swap_scan_cluster(struct 
swap_info_struct *si,
        relocate_cluster(si, ci);
        swap_cluster_unlock(ci);
        if (si->flags & SWP_SOLIDSTATE) {
-               this_cpu_write(percpu_swap_cluster.offset[order], next);
-               this_cpu_write(percpu_swap_cluster.si[order], si);
+               struct percpu_swap_cluster *pcp_sc = 
this_cpu_ptr(percpu_swap_cluster);
+
+               pcp_sc->offset[order] = next;
+               pcp_sc->si[order] = si;
        } else {
                si->global_cluster->next[order] = next;
        }
@@ -1178,7 +1176,7 @@ static unsigned long cluster_alloc_swap_entry(struct 
swap_info_struct *si,
                goto done;

        /* Order 0 stealing from higher order */
-       for (int o = 1; o < SWAP_NR_ORDERS; o++) {
+       for (int o = 1; o < swap_nr_orders(); o++) {
                /*
                 * Clusters here have at least one usable slots and can't fail 
order 0
                 * allocation, but reclaim may drop si->lock and race with 
another user.
@@ -1376,13 +1374,14 @@ static bool swap_alloc_fast(struct folio *folio)
        struct swap_cluster_info *ci;
        struct swap_info_struct *si;
        unsigned int offset;
+       struct percpu_swap_cluster *pcp_sc = this_cpu_ptr(percpu_swap_cluster);

        /*
         * Once allocated, swap_info_struct will never be completely freed,
         * so checking it's liveness by get_swap_device_info is enough.
         */
-       si = this_cpu_read(percpu_swap_cluster.si[order]);
-       offset = this_cpu_read(percpu_swap_cluster.offset[order]);
+       si = pcp_sc->si[order];
+       offset = pcp_sc->offset[order];
        if (!si || !offset || !get_swap_device_info(si))
                return false;

@@ -1770,10 +1769,10 @@ int folio_alloc_swap(struct folio *folio)
        }

 again:
-       local_lock(&percpu_swap_cluster.lock);
+       local_lock(&percpu_swap_cluster->lock);
        if (!swap_alloc_fast(folio))
                swap_alloc_slow(folio);
-       local_unlock(&percpu_swap_cluster.lock);
+       local_unlock(&percpu_swap_cluster->lock);

        if (!order && unlikely(!folio_test_swapcache(folio))) {
                if (swap_sync_discard())
@@ -2166,6 +2165,7 @@ swp_entry_t swap_alloc_hibernation_slot(int type)
        unsigned long pcp_offset, offset = SWAP_ENTRY_INVALID;
        struct swap_cluster_info *ci;
        swp_entry_t entry = {0};
+       struct percpu_swap_cluster *pcp_sc;

        if (!si)
                goto fail;
@@ -2174,9 +2174,10 @@ swp_entry_t swap_alloc_hibernation_slot(int type)
         * Try the local cluster first if it matches the device. If
         * not, try grab a new cluster and override local cluster.
         */
-       local_lock(&percpu_swap_cluster.lock);
-       pcp_si = this_cpu_read(percpu_swap_cluster.si[0]);
-       pcp_offset = this_cpu_read(percpu_swap_cluster.offset[0]);
+       local_lock(&percpu_swap_cluster->lock);
+       pcp_sc = this_cpu_ptr(percpu_swap_cluster);
+       pcp_si = pcp_sc->si[0];
+       pcp_offset = pcp_sc->offset[0];
        if (pcp_si == si && pcp_offset) {
                ci = swap_cluster_lock(si, pcp_offset);
                if (cluster_is_usable(ci, 0))
@@ -2186,7 +2187,7 @@ swp_entry_t swap_alloc_hibernation_slot(int type)
        }
        if (!offset)
                offset = cluster_alloc_swap_entry(si, NULL);
-       local_unlock(&percpu_swap_cluster.lock);
+       local_unlock(&percpu_swap_cluster->lock);
        if (offset)
                entry = swp_entry(si->type, offset);

@@ -3029,6 +3030,16 @@ static void wait_for_allocation(struct swap_info_struct 
*si)
        }
 }

+static void free_swap_info_arrays(struct swap_info_struct *si)
+{
+       kfree(si->global_cluster);
+       si->global_cluster = NULL;
+       kfree(si->nonfull_clusters);
+       si->nonfull_clusters = NULL;
+       kfree(si->frag_clusters);
+       si->frag_clusters = NULL;
+}
+
 static void free_swap_cluster_info(struct swap_cluster_info *cluster_info,
                                   unsigned long maxpages)
 {
@@ -3057,17 +3068,17 @@ static void free_swap_cluster_info(struct 
swap_cluster_info *cluster_info,
 static void flush_percpu_swap_cluster(struct swap_info_struct *si)
 {
        int cpu, i;
-       struct swap_info_struct **pcp_si;
+       struct percpu_swap_cluster *pcp_sc;

        for_each_possible_cpu(cpu) {
-               pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu);
+               pcp_sc = per_cpu_ptr(percpu_swap_cluster, cpu);
                /*
                 * Invalidate the percpu swap cluster cache, si->users
                 * is dead, so no new user will point to it, just flush
                 * any existing user.
                 */
-               for (i = 0; i < SWAP_NR_ORDERS; i++)
-                       cmpxchg(&pcp_si[i], si, NULL);
+               for (i = 0; i < swap_nr_orders(); i++)
+                       cmpxchg(&pcp_sc->si[i], si, NULL);
        }
 }

@@ -3179,8 +3190,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        arch_swap_invalidate_area(p->type);
        zswap_swapoff(p->type);
        mutex_unlock(&swapon_mutex);
-       kfree(p->global_cluster);
-       p->global_cluster = NULL;
+       free_swap_info_arrays(p);
        free_swap_cluster_info(cluster_info, maxpages);

        inode = mapping->host;
@@ -3531,6 +3541,7 @@ static int setup_swap_clusters_info(struct 
swap_info_struct *si,
        struct swap_cluster_info *cluster_info;
        int err = -ENOMEM;
        unsigned long i;
+       unsigned int nr_orders = swap_nr_orders();

        cluster_info = kvzalloc_objs(*cluster_info, nr_clusters);
        if (!cluster_info)
@@ -3539,11 +3550,19 @@ static int setup_swap_clusters_info(struct 
swap_info_struct *si,
        for (i = 0; i < nr_clusters; i++)
                spin_lock_init(&cluster_info[i].lock);

+       si->nonfull_clusters = kmalloc_objs(*si->nonfull_clusters, nr_orders);
+       if (!si->nonfull_clusters)
+               goto err;
+
+       si->frag_clusters = kmalloc_objs(*si->frag_clusters, nr_orders);
+       if (!si->frag_clusters)
+               goto err;
+
        if (!(si->flags & SWP_SOLIDSTATE)) {
-               si->global_cluster = kmalloc_obj(*si->global_cluster);
+               si->global_cluster = kmalloc_flex(*si->global_cluster, next, 
nr_orders);
                if (!si->global_cluster)
                        goto err;
-               for (i = 0; i < SWAP_NR_ORDERS; i++)
+               for (i = 0; i < nr_orders; i++)
                        si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
                spin_lock_init(&si->global_cluster_lock);
        }
@@ -3579,7 +3598,7 @@ static int setup_swap_clusters_info(struct 
swap_info_struct *si,
        INIT_LIST_HEAD(&si->full_clusters);
        INIT_LIST_HEAD(&si->discard_clusters);

-       for (i = 0; i < SWAP_NR_ORDERS; i++) {
+       for (i = 0; i < nr_orders; i++) {
                INIT_LIST_HEAD(&si->nonfull_clusters[i]);
                INIT_LIST_HEAD(&si->frag_clusters[i]);
        }
@@ -3599,6 +3618,7 @@ static int setup_swap_clusters_info(struct 
swap_info_struct *si,
        si->cluster_info = cluster_info;
        return 0;
 err:
+       free_swap_info_arrays(si);
        free_swap_cluster_info(cluster_info, maxpages);
        return err;
 }
@@ -3807,8 +3827,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, 
int, swap_flags)
 bad_swap_unlock_inode:
        inode_unlock(inode);
 bad_swap:
-       kfree(si->global_cluster);
-       si->global_cluster = NULL;
+       free_swap_info_arrays(si);
        inode = NULL;
        destroy_swap_extents(si, swap_file);
        free_swap_cluster_info(si->cluster_info, si->max);
@@ -3922,6 +3941,10 @@ void __folio_throttle_swaprate(struct folio *folio, 
gfp_t gfp)

 static int __init swapfile_init(void)
 {
+       unsigned int nr_orders = swap_nr_orders();
+       struct percpu_swap_cluster *pcp_sc;
+       int cpu;
+
        swapfile_maximum_size = arch_max_swapfile_size();

        swap_slots_in_cluster = generic_swap_slots_in_clusters();
@@ -3939,6 +3962,24 @@ static int __init swapfile_init(void)
                                            SWAPFILE_CLUSTER),
                                    0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL);

+       percpu_swap_cluster = alloc_percpu(struct percpu_swap_cluster);
+       if (!percpu_swap_cluster)
+               panic("%s: alloc_percpu failed for percpu_swap_cluster\n", 
__func__);
+
+       for_each_possible_cpu(cpu) {
+               int node = cpu_to_mem(cpu);
+
+               pcp_sc = per_cpu_ptr(percpu_swap_cluster, cpu);
+               local_lock_init(&pcp_sc->lock);
+               pcp_sc->si = kcalloc_node(nr_orders, sizeof(*pcp_sc->si),
+                                       GFP_KERNEL, node);
+               pcp_sc->offset = kcalloc_node(nr_orders, 
sizeof(*pcp_sc->offset),
+                                           GFP_KERNEL, node);
+               if (!pcp_sc->si || !pcp_sc->offset)
+                       panic("%s: per-CPU kcalloc failed for cpu:%d, 
node:%d\n",
+                                       __func__, cpu, node);
+       }
+
 #ifdef CONFIG_MIGRATION
        if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
                swap_migration_ad_supported = true;
--
2.39.5

[RFC 3/4] mm, swap: make SWAP_NR_ORDERS runtime

Reply via email to