We observed the following deadlock during a stress test under a
low-memory scenario:

Thread A                               Thread B
- erofs_shrink_scan
 - erofs_try_to_release_workgroup
  - erofs_workgroup_try_to_freeze -- A
                                       - z_erofs_do_read_page
                                        - z_erofs_collection_begin
                                         - z_erofs_register_collection
                                          - erofs_insert_workgroup
                                           - xa_lock(&sbi->managed_pslots) -- B
                                           - erofs_workgroup_get
                                            - erofs_wait_on_workgroup_freezed -- A
  - xa_erase
   - xa_lock(&sbi->managed_pslots) -- B

To fix this, we need to hold the xa lock before freezing the workgroup
because we will operate on the XArray. So let's hold the lock before
accessing each workgroup, just like we did when using the radix tree.

Fixes: 64094a04414f ("erofs: convert workstn to XArray")
Signed-off-by: Huang Jianan <[email protected]>
---
 fs/erofs/utils.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 84da2c280012..84a59f075dd1 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -150,7 +150,7 @@ static bool erofs_try_to_release_workgroup(struct 
erofs_sb_info *sbi,
         * however in order to avoid some race conditions, add a
         * DBG_BUGON to observe this in advance.
         */
-       DBG_BUGON(xa_erase(&sbi->managed_pslots, grp->index) != grp);
+       DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
 
        /* last refcount should be connected with its managed pslot.  */
        erofs_workgroup_unfreeze(grp, 0);
@@ -165,15 +165,20 @@ static unsigned long erofs_shrink_workstation(struct 
erofs_sb_info *sbi,
        unsigned int freed = 0;
        unsigned long index;
 
+       xa_lock(&sbi->managed_pslots);
        xa_for_each(&sbi->managed_pslots, index, grp) {
                /* try to shrink each valid workgroup */
                if (!erofs_try_to_release_workgroup(sbi, grp))
                        continue;
+               xa_unlock(&sbi->managed_pslots);
 
                ++freed;
                if (!--nr_shrink)
-                       break;
+                       return freed;
+               xa_lock(&sbi->managed_pslots);
        }
+       xa_unlock(&sbi->managed_pslots);
+
        return freed;
 }
 
-- 
2.25.1

Reply via email to