From: Waiman Long <[email protected]>

[dchinner: original commit message preserved]

When many threads are trying to add or delete inode to or from
a superblock's s_inodes list, spinlock contention on the list can
become a performance bottleneck.

This patch changes the s_inodes field to become a dlock list which
is a distributed set of lists with per-list spinlocks.  As a result,
the following superblock inode list (sb->s_inodes) iteration functions
in vfs are also being modified:

 1. iterate_bdevs()
 2. drop_pagecache_sb()
 3. evict_inodes()
 4. invalidate_inodes()
 5. fsnotify_unmount_inodes()
 6. add_dquot_ref()
 7. remove_dquot_ref()

With an exit microbenchmark that creates a large number of threads,
attachs many inodes to them in procfs and then exits. The runtimes of
that microbenchmark with various number of threads before and after
the patch on a 4-socket Intel E7-8867 v3 system (64 cores, 128 threads)
on a 4.19-rc3 based kernel were as follows:

  # of threads  Elapsed/Sys Time    Elapsed/Sys Time    Speedup
                Unpatched Kernel     Patched Kernel
  ------------  ----------------    ----------------    -------
      1000      59.17s/123m09.8s    18.90s/24m44.5s      3.13
      1200      73.20s/151m24.1s    27.54s/50m05.3s      2.66
      1400     102.04s/212m00.9s    36.75s/68m26.7s      2.78
      1600     131.13s/272m52.4s    50.16s/94m23.7s      2.61

[dchinner: forward port, add new inode list traversals, etc]
[dchinner: scalability results on current TOT XFS]

With 400k inodes per thread concurrent directory traversal workload,
scalability improves at >=16 threads on 6.7-rc4 on XFS. We only test
XFS here as it is the only filesystem that demonstrates sufficient
internal scalability for the superblock inode list to be a
scalability bottleneck.

Table contains test runtime in seconds; perfect scalability is
demonstrated by the runtime staying constant as thread count goes up.

Threads         6.4-rc7         patched
-------         -------         -------
2               11.673          11.158
4                9.665           9.444
8               10.622           9.275
16              12.148           9.508
32              20.518          10.308

Unpatched kernel profile at 32 threads:

- 95.45% vfs_fstatat
  - 95.00% vfs_statx
     - 91.00% filename_lookup
        - 90.90% path_lookupat
           - 90.40% walk_component
              - 89.05% lookup_slow
                 - 88.95% __lookup_slow
                    - 86.38% xfs_vn_lookup
                       - 84.05% xfs_lookup
                          - 78.82% xfs_iget
                             - 72.58% xfs_setup_inode
                                - 72.54% inode_sb_list_add
                                   - 71.12% _raw_spin_lock
                                      - 71.09% do_raw_spin_lock
                                         - 68.85% __pv_queued_spin_lock_slowpath

Patched kernel profile at 32 threads - the biggest single point of
contention is now the dentry cache LRU via dput():

-   21.59%     0.25%  [kernel]              [k] dput
   - 21.34% dput
      - 19.93% retain_dentry
         - d_lru_add
            - 19.82% list_lru_add
               - 14.62% _raw_spin_lock
                  - 14.47% do_raw_spin_lock
                       10.89% __pv_queued_spin_lock_slowpath
                 1.78% __list_add_valid_or_report
               - 0.81% _raw_spin_unlock
                  - do_raw_spin_unlock
                       0.77% __raw_callee_save___pv_queued_spin_unlock
      - 0.79% _raw_spin_unlock
         - 0.78% do_raw_spin_unlock
              0.67% __raw_callee_save___pv_queued_spin_unlock

Signed-off-by: Waiman Long <[email protected]>
Signed-off-by: Dave Chinner <[email protected]>
---
 block/bdev.c           | 24 ++++++++----------------
 fs/drop_caches.c       |  9 ++++-----
 fs/gfs2/ops_fstype.c   | 21 +++++++++++----------
 fs/inode.c             | 37 ++++++++++++++++---------------------
 fs/notify/fsnotify.c   | 12 ++++++------
 fs/quota/dquot.c       | 22 ++++++----------------
 fs/super.c             | 13 +++++++------
 include/linux/fs.h     |  8 ++++----
 security/landlock/fs.c | 25 ++++++-------------------
 9 files changed, 68 insertions(+), 103 deletions(-)

diff --git a/block/bdev.c b/block/bdev.c
index 750aec178b6a..07135fd6fda4 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -437,11 +437,11 @@ long nr_blockdev_pages(void)
 {
        struct inode *inode;
        long ret = 0;
+       DEFINE_DLOCK_LIST_ITER(iter, &blockdev_superblock->s_inodes);
 
-       spin_lock(&blockdev_superblock->s_inode_list_lock);
-       list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
+       dlist_for_each_entry(inode, &iter, i_sb_list) {
                ret += inode->i_mapping->nrpages;
-       spin_unlock(&blockdev_superblock->s_inode_list_lock);
+       }
 
        return ret;
 }
@@ -1032,9 +1032,9 @@ EXPORT_SYMBOL_GPL(bdev_mark_dead);
 void sync_bdevs(bool wait)
 {
        struct inode *inode, *old_inode = NULL;
+       DEFINE_DLOCK_LIST_ITER(iter, &blockdev_superblock->s_inodes);
 
-       spin_lock(&blockdev_superblock->s_inode_list_lock);
-       list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
+       dlist_for_each_entry(inode, &iter, i_sb_list) {
                struct address_space *mapping = inode->i_mapping;
                struct block_device *bdev;
 
@@ -1046,15 +1046,8 @@ void sync_bdevs(bool wait)
                }
                __iget(inode);
                spin_unlock(&inode->i_lock);
-               spin_unlock(&blockdev_superblock->s_inode_list_lock);
-               /*
-                * We hold a reference to 'inode' so it couldn't have been
-                * removed from s_inodes list while we dropped the
-                * s_inode_list_lock  We cannot iput the inode now as we can
-                * be holding the last reference and we cannot iput it under
-                * s_inode_list_lock. So we keep the reference and iput it
-                * later.
-                */
+               dlock_list_unlock(&iter);
+
                iput(old_inode);
                old_inode = inode;
                bdev = I_BDEV(inode);
@@ -1075,9 +1068,8 @@ void sync_bdevs(bool wait)
                }
                mutex_unlock(&bdev->bd_disk->open_mutex);
 
-               spin_lock(&blockdev_superblock->s_inode_list_lock);
+               dlock_list_relock(&iter);
        }
-       spin_unlock(&blockdev_superblock->s_inode_list_lock);
        iput(old_inode);
 }
 
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index b9575957a7c2..3596d0a7c0da 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -19,9 +19,9 @@ int sysctl_drop_caches;
 static void drop_pagecache_sb(struct super_block *sb, void *unused)
 {
        struct inode *inode, *toput_inode = NULL;
+       DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes);
 
-       spin_lock(&sb->s_inode_list_lock);
-       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+       dlist_for_each_entry(inode, &iter, i_sb_list) {
                spin_lock(&inode->i_lock);
                /*
                 * We must skip inodes in unusual state. We may also skip
@@ -35,16 +35,15 @@ static void drop_pagecache_sb(struct super_block *sb, void 
*unused)
                }
                __iget(inode);
                spin_unlock(&inode->i_lock);
-               spin_unlock(&sb->s_inode_list_lock);
+               dlock_list_unlock(&iter);
 
                invalidate_mapping_pages(inode->i_mapping, 0, -1);
                iput(toput_inode);
                toput_inode = inode;
 
                cond_resched();
-               spin_lock(&sb->s_inode_list_lock);
+               dlock_list_relock(&iter);
        }
-       spin_unlock(&sb->s_inode_list_lock);
        iput(toput_inode);
 }
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b108c5d26839..1105710482e7 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1738,22 +1738,24 @@ static int gfs2_meta_init_fs_context(struct fs_context 
*fc)
  * attempt will time out.  Since inodes are evicted sequentially, this can add
  * up quickly.
  *
- * Function evict_inodes() tries to keep the s_inode_list_lock list locked over
- * a long time, which prevents other inodes from being evicted concurrently.
- * This precludes the cooperative behavior we are looking for.  This special
- * version of evict_inodes() avoids that.
- *
  * Modeled after drop_pagecache_sb().
+ *
+ * XXX(dgc): this is particularly awful. With the dlist for inodes, concurrent
+ * access to the inode list can occur and evict_inodes() will drop the per-cpu
+ * list lock if the CPU needs rescheduling. Hence if this exists just because
+ * evict_inodes() holds the s_inode_list_lock for long periods preventing
+ * concurrent inode eviction work from being done, this can probably go away
+ * entirely now.
  */
 static void gfs2_evict_inodes(struct super_block *sb)
 {
        struct inode *inode, *toput_inode = NULL;
        struct gfs2_sbd *sdp = sb->s_fs_info;
+       DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes);
 
        set_bit(SDF_EVICTING, &sdp->sd_flags);
 
-       spin_lock(&sb->s_inode_list_lock);
-       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+       dlist_for_each_entry(inode, &iter, i_sb_list) {
                spin_lock(&inode->i_lock);
                if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) &&
                    !need_resched()) {
@@ -1762,15 +1764,14 @@ static void gfs2_evict_inodes(struct super_block *sb)
                }
                atomic_inc(&inode->i_count);
                spin_unlock(&inode->i_lock);
-               spin_unlock(&sb->s_inode_list_lock);
+               dlock_list_unlock(&iter);
 
                iput(toput_inode);
                toput_inode = inode;
 
                cond_resched();
-               spin_lock(&sb->s_inode_list_lock);
+               dlock_list_relock(&iter);
        }
-       spin_unlock(&sb->s_inode_list_lock);
        iput(toput_inode);
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index 17c50a75514f..3426691fa305 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -30,7 +30,7 @@
  *   inode->i_state, inode->i_hash, __iget(), inode->i_io_list
  * Inode LRU list locks protect:
  *   inode->i_sb->s_inode_lru, inode->i_lru
- * inode->i_sb->s_inode_list_lock protects:
+ * inode->i_sb->s_inodes->head->lock protects:
  *   inode->i_sb->s_inodes, inode->i_sb_list
  * bdi->wb.list_lock protects:
  *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
@@ -39,7 +39,7 @@
  *
  * Lock ordering:
  *
- * inode->i_sb->s_inode_list_lock
+ * inode->i_sb->s_inodes->head->lock
  *   inode->i_lock
  *     Inode LRU list locks
  *
@@ -47,7 +47,7 @@
  *   inode->i_lock
  *
  * inode_hash_lock
- *   inode->i_sb->s_inode_list_lock
+ *   inode->i_sb->s_inodes->head->lock
  *   inode->i_lock
  *
  * iunique_lock
@@ -423,7 +423,7 @@ void inode_init_once(struct inode *inode)
        INIT_LIST_HEAD(&inode->i_io_list);
        INIT_LIST_HEAD(&inode->i_wb_list);
        INIT_LIST_HEAD(&inode->i_lru);
-       INIT_LIST_HEAD(&inode->i_sb_list);
+       init_dlock_list_node(&inode->i_sb_list);
        __address_space_init_once(&inode->i_data);
        i_size_ordered_init(inode);
 }
@@ -492,19 +492,14 @@ static void inode_lru_list_del(struct inode *inode)
  */
 void inode_sb_list_add(struct inode *inode)
 {
-       spin_lock(&inode->i_sb->s_inode_list_lock);
-       list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
-       spin_unlock(&inode->i_sb->s_inode_list_lock);
+       dlock_lists_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
 }
 EXPORT_SYMBOL_GPL(inode_sb_list_add);
 
 static inline void inode_sb_list_del(struct inode *inode)
 {
-       if (!list_empty(&inode->i_sb_list)) {
-               spin_lock(&inode->i_sb->s_inode_list_lock);
-               list_del_init(&inode->i_sb_list);
-               spin_unlock(&inode->i_sb->s_inode_list_lock);
-       }
+       if (!list_empty(&inode->i_sb_list.list))
+               dlock_lists_del(&inode->i_sb_list);
 }
 
 static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -713,11 +708,12 @@ static void dispose_list(struct list_head *head)
 void evict_inodes(struct super_block *sb)
 {
        struct inode *inode;
+       struct dlock_list_iter iter;
        LIST_HEAD(dispose);
 
 again:
-       spin_lock(&sb->s_inode_list_lock);
-       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+       init_dlock_list_iter(&iter, &sb->s_inodes);
+       dlist_for_each_entry(inode, &iter, i_sb_list) {
                if (atomic_read(&inode->i_count))
                        continue;
 
@@ -738,13 +734,12 @@ void evict_inodes(struct super_block *sb)
                 * bit so we don't livelock.
                 */
                if (need_resched()) {
-                       spin_unlock(&sb->s_inode_list_lock);
+                       dlock_list_unlock(&iter);
                        cond_resched();
                        dispose_list(&dispose);
                        goto again;
                }
        }
-       spin_unlock(&sb->s_inode_list_lock);
 
        dispose_list(&dispose);
 }
@@ -759,11 +754,12 @@ EXPORT_SYMBOL_GPL(evict_inodes);
 void invalidate_inodes(struct super_block *sb)
 {
        struct inode *inode;
+       struct dlock_list_iter iter;
        LIST_HEAD(dispose);
 
 again:
-       spin_lock(&sb->s_inode_list_lock);
-       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+       init_dlock_list_iter(&iter, &sb->s_inodes);
+       dlist_for_each_entry(inode, &iter, i_sb_list) {
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        spin_unlock(&inode->i_lock);
@@ -779,13 +775,12 @@ void invalidate_inodes(struct super_block *sb)
                spin_unlock(&inode->i_lock);
                list_add(&inode->i_lru, &dispose);
                if (need_resched()) {
-                       spin_unlock(&sb->s_inode_list_lock);
+                       dlock_list_unlock(&iter);
                        cond_resched();
                        dispose_list(&dispose);
                        goto again;
                }
        }
-       spin_unlock(&sb->s_inode_list_lock);
 
        dispose_list(&dispose);
 }
@@ -1232,7 +1227,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned 
long hashval,
         * Add inode to the sb list if it's not already. It has I_NEW at this
         * point, so it should be safe to test i_sb_list locklessly.
         */
-       if (list_empty(&inode->i_sb_list))
+       if (list_empty(&inode->i_sb_list.list))
                inode_sb_list_add(inode);
 unlock:
        spin_unlock(&inode_hash_lock);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 7974e91ffe13..15e3769e76f5 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -33,14 +33,15 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
  * @sb: superblock being unmounted.
  *
  * Called during unmount with no locks held, so needs to be safe against
- * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN 
block.
+ * concurrent modifiers. We temporarily drop sb->s_inodes list lock and CAN
+ * block.
  */
 static void fsnotify_unmount_inodes(struct super_block *sb)
 {
        struct inode *inode, *iput_inode = NULL;
+       DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes);
 
-       spin_lock(&sb->s_inode_list_lock);
-       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+       dlist_for_each_entry(inode, &iter, i_sb_list) {
                /*
                 * We cannot __iget() an inode in state I_FREEING,
                 * I_WILL_FREE, or I_NEW which is fine because by that point
@@ -68,7 +69,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
 
                __iget(inode);
                spin_unlock(&inode->i_lock);
-               spin_unlock(&sb->s_inode_list_lock);
+               dlock_list_unlock(&iter);
 
                iput(iput_inode);
 
@@ -80,9 +81,8 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
                iput_inode = inode;
 
                cond_resched();
-               spin_lock(&sb->s_inode_list_lock);
+               dlock_list_relock(&iter);
        }
-       spin_unlock(&sb->s_inode_list_lock);
 
        iput(iput_inode);
 }
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 58b5de081b57..e873dcbe6feb 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1024,13 +1024,13 @@ static int dqinit_needed(struct inode *inode, int type)
 static int add_dquot_ref(struct super_block *sb, int type)
 {
        struct inode *inode, *old_inode = NULL;
+       DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes);
 #ifdef CONFIG_QUOTA_DEBUG
        int reserved = 0;
 #endif
        int err = 0;
 
-       spin_lock(&sb->s_inode_list_lock);
-       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+       dlist_for_each_entry(inode, &iter, i_sb_list) {
                spin_lock(&inode->i_lock);
                if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
                    !atomic_read(&inode->i_writecount) ||
@@ -1040,7 +1040,7 @@ static int add_dquot_ref(struct super_block *sb, int type)
                }
                __iget(inode);
                spin_unlock(&inode->i_lock);
-               spin_unlock(&sb->s_inode_list_lock);
+               dlock_list_unlock(&iter);
 
 #ifdef CONFIG_QUOTA_DEBUG
                if (unlikely(inode_get_rsv_space(inode) > 0))
@@ -1053,19 +1053,10 @@ static int add_dquot_ref(struct super_block *sb, int 
type)
                        goto out;
                }
 
-               /*
-                * We hold a reference to 'inode' so it couldn't have been
-                * removed from s_inodes list while we dropped the
-                * s_inode_list_lock. We cannot iput the inode now as we can be
-                * holding the last reference and we cannot iput it under
-                * s_inode_list_lock. So we keep the reference and iput it
-                * later.
-                */
                old_inode = inode;
                cond_resched();
-               spin_lock(&sb->s_inode_list_lock);
+               dlock_list_relock(&iter);
        }
-       spin_unlock(&sb->s_inode_list_lock);
        iput(old_inode);
 out:
 #ifdef CONFIG_QUOTA_DEBUG
@@ -1081,12 +1072,12 @@ static int add_dquot_ref(struct super_block *sb, int 
type)
 static void remove_dquot_ref(struct super_block *sb, int type)
 {
        struct inode *inode;
+       DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes);
 #ifdef CONFIG_QUOTA_DEBUG
        int reserved = 0;
 #endif
 
-       spin_lock(&sb->s_inode_list_lock);
-       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+       dlist_for_each_entry(inode, &iter, i_sb_list) {
                /*
                 *  We have to scan also I_NEW inodes because they can already
                 *  have quota pointer initialized. Luckily, we need to touch
@@ -1108,7 +1099,6 @@ static void remove_dquot_ref(struct super_block *sb, int 
type)
                }
                spin_unlock(&dq_data_lock);
        }
-       spin_unlock(&sb->s_inode_list_lock);
 #ifdef CONFIG_QUOTA_DEBUG
        if (reserved) {
                printk(KERN_WARNING "VFS (%s): Writes happened after quota"
diff --git a/fs/super.c b/fs/super.c
index 076392396e72..61c19e3f06d8 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -303,6 +303,7 @@ static void destroy_unused_super(struct super_block *s)
        super_unlock_excl(s);
        list_lru_destroy(&s->s_dentry_lru);
        list_lru_destroy(&s->s_inode_lru);
+       free_dlock_list_heads(&s->s_inodes);
        security_sb_free(s);
        put_user_ns(s->s_user_ns);
        kfree(s->s_subtype);
@@ -367,8 +368,6 @@ static struct super_block *alloc_super(struct 
file_system_type *type, int flags,
        INIT_HLIST_NODE(&s->s_instances);
        INIT_HLIST_BL_HEAD(&s->s_roots);
        mutex_init(&s->s_sync_lock);
-       INIT_LIST_HEAD(&s->s_inodes);
-       spin_lock_init(&s->s_inode_list_lock);
        INIT_LIST_HEAD(&s->s_inodes_wb);
        spin_lock_init(&s->s_inode_wblist_lock);
 
@@ -383,6 +382,9 @@ static struct super_block *alloc_super(struct 
file_system_type *type, int flags,
        s->s_time_min = TIME64_MIN;
        s->s_time_max = TIME64_MAX;
 
+       if (alloc_dlock_list_heads(&s->s_inodes))
+               goto fail;
+
        s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
                                     "sb-%s", type->name);
        if (!s->s_shrink)
@@ -695,7 +697,7 @@ void generic_shutdown_super(struct super_block *sb)
                if (sop->put_super)
                        sop->put_super(sb);
 
-               if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes),
+               if (CHECK_DATA_CORRUPTION(!dlock_lists_empty(&sb->s_inodes),
                                "VFS: Busy inodes after unmount of %s (%s)",
                                sb->s_id, sb->s_type->name)) {
                        /*
@@ -704,14 +706,13 @@ void generic_shutdown_super(struct super_block *sb)
                         * iput_final() or such crashes cleanly.
                         */
                        struct inode *inode;
+                       DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes);
 
-                       spin_lock(&sb->s_inode_list_lock);
-                       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+                       dlist_for_each_entry(inode, &iter, i_sb_list) {
                                inode->i_op = VFS_PTR_POISON;
                                inode->i_sb = VFS_PTR_POISON;
                                inode->i_mapping = VFS_PTR_POISON;
                        }
-                       spin_unlock(&sb->s_inode_list_lock);
                }
        }
        /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..bb35591733f1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -43,6 +43,7 @@
 #include <linux/cred.h>
 #include <linux/mnt_idmapping.h>
 #include <linux/slab.h>
+#include <linux/dlock-list.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -702,7 +703,7 @@ struct inode {
        u16                     i_wb_frn_history;
 #endif
        struct list_head        i_lru;          /* inode LRU list */
-       struct list_head        i_sb_list;
+       struct dlock_list_node  i_sb_list;
        struct list_head        i_wb_list;      /* backing dev writeback list */
        union {
                struct hlist_head       i_dentry;
@@ -1315,9 +1316,8 @@ struct super_block {
         */
        int s_stack_depth;
 
-       /* s_inode_list_lock protects s_inodes */
-       spinlock_t              s_inode_list_lock ____cacheline_aligned_in_smp;
-       struct list_head        s_inodes;       /* all inodes */
+       /* The internal per-list locks protect s_inodes */
+       struct dlock_list_heads s_inodes;       /* all inodes */
 
        spinlock_t              s_inode_wblist_lock;
        struct list_head        s_inodes_wb;    /* writeback inodes */
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index bc7c126deea2..4269d9938c09 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -844,12 +844,12 @@ static void hook_inode_free_security(struct inode *const 
inode)
 static void hook_sb_delete(struct super_block *const sb)
 {
        struct inode *inode, *prev_inode = NULL;
+       DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes);
 
        if (!landlock_initialized)
                return;
 
-       spin_lock(&sb->s_inode_list_lock);
-       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+       dlist_for_each_entry(inode, &iter, i_sb_list) {
                struct landlock_object *object;
 
                /* Only handles referenced inodes. */
@@ -883,6 +883,7 @@ static void hook_sb_delete(struct super_block *const sb)
                /* Keeps a reference to this inode until the next loop walk. */
                __iget(inode);
                spin_unlock(&inode->i_lock);
+               dlock_list_unlock(&iter);
 
                /*
                 * If there is no concurrent release_inode() ongoing, then we
@@ -917,25 +918,11 @@ static void hook_sb_delete(struct super_block *const sb)
                        rcu_read_unlock();
                }
 
-               if (prev_inode) {
-                       /*
-                        * At this point, we still own the __iget() reference
-                        * that we just set in this loop walk.  Therefore we
-                        * can drop the list lock and know that the inode won't
-                        * disappear from under us until the next loop walk.
-                        */
-                       spin_unlock(&sb->s_inode_list_lock);
-                       /*
-                        * We can now actually put the inode reference from the
-                        * previous loop walk, which is not needed anymore.
-                        */
-                       iput(prev_inode);
-                       cond_resched();
-                       spin_lock(&sb->s_inode_list_lock);
-               }
+               iput(prev_inode);
                prev_inode = inode;
+               cond_resched();
+               dlock_list_relock(&iter);
        }
-       spin_unlock(&sb->s_inode_list_lock);
 
        /* Puts the inode reference from the last loop walk, if any. */
        if (prev_inode)
-- 
2.42.0


Reply via email to