ext4_mb_discard_preallocations() only checks grp->bb_prealloc_list of
every group to discard the group's PAs and free up space when an
allocation request fails. Consider the race below:

Process A                               Process B

1. allocate blocks
                                        1. Fails block allocation from
                                             ext4_mb_regular_allocator()
   ext4_lock_group()
        allocated more blocks
        than ac_o_ex.fe_len
   ext4_unlock_group()
                                        2. Scans the
                                           grp->bb_prealloc_list (under
                                           ext4_lock_group()), finds
                                           nothing and thus returns
                                           -ENOSPC.

2. Add the additional blocks to the PA list

   ext4_lock_group()
        add blocks to grp->bb_prealloc_list
   ext4_unlock_group()

The above race can be avoided if we add those additional blocks to
grp->bb_prealloc_list at the same time as the block allocation, while
ext4_lock_group() is still held.
With this, discard-PA will know whether there are actually any blocks
which could be freed from the PA.
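
In pseudo-code, the fixed ordering looks roughly like below (a sketch of
the intended flow; the actual change is in the diff that follows):

        ext4_lock_group(sb, group);
        ext4_mb_use_best_found(ac, e4b);
                /* got more blocks than originally requested? then
                 * publish the extra blocks as a PA while the group
                 * lock is still held */
                if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
                        ext4_mb_new_preallocation(ac);
                        /* -> list_add(&pa->pa_group_list,
                         *             &grp->bb_prealloc_list); */
        ext4_unlock_group(sb, group);

A racing discard-PA, which scans grp->bb_prealloc_list under the same
group lock, can then no longer observe the window between "blocks
allocated" and "PA added to the list".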

Signed-off-by: Ritesh Harjani <rite...@linux.ibm.com>
---
 fs/ext4/mballoc.c | 97 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 62 insertions(+), 35 deletions(-)
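
Note for reviewers: after this patch the lifetime of the pa in
ext4_mb_new_blocks() looks roughly like below (a summary sketch, not
the literal code):

        *errp = ext4_mb_pa_alloc(ac);   /* pa_count = 1, ac->ac_pa = pa */
        *errp = ext4_mb_regular_allocator(ac);
        /* on success with extra blocks, ext4_mb_use_best_found() has
         * already added ac->ac_pa to grp->bb_prealloc_list (under the
         * group lock) via ext4_mb_new_preallocation() */
        if (*errp) {
                ext4_mb_pa_free(ac);    /* never published, free it */
                ...
        }
        if (ac->ac_status == AC_STATUS_FOUND &&
            ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
                ext4_mb_pa_free(ac);    /* no extra blocks, pa unused */
        ...
        /* on final -ENOSPC (after the discard retries) the still
         * unused pa is freed as well */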

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 7d766dc34cdd..574ce400a3b5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -349,6 +349,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
                                                ext4_group_t group);
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
 
 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 {
@@ -1701,6 +1702,14 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
                sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
                spin_unlock(&sbi->s_md_lock);
        }
+       /*
+        * As we've just preallocated more space than the user
+        * originally requested, we store the allocated space in a
+        * special descriptor.
+        */
+       if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+               ext4_mb_new_preallocation(ac);
+
 }
 
 /*
@@ -1949,7 +1958,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
 
                ext4_mb_use_best_found(ac, e4b);
 
-               BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
+               BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
 
                if (EXT4_SB(sb)->s_mb_stats)
                        atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
@@ -3702,7 +3711,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 /*
  * creates new preallocated space for given inode
  */
-static noinline_for_stack int
+static noinline_for_stack void
 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 {
        struct super_block *sb = ac->ac_sb;
@@ -3715,10 +3724,9 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+       BUG_ON(ac->ac_pa == NULL);
 
-       pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
-       if (pa == NULL)
-               return -ENOMEM;
+       pa = ac->ac_pa;
 
        if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
                int winl;
@@ -3762,7 +3770,6 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
        pa->pa_len = ac->ac_b_ex.fe_len;
        pa->pa_free = pa->pa_len;
-       atomic_set(&pa->pa_count, 1);
        spin_lock_init(&pa->pa_lock);
        INIT_LIST_HEAD(&pa->pa_inode_list);
        INIT_LIST_HEAD(&pa->pa_group_list);
@@ -3782,21 +3789,17 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
        pa->pa_obj_lock = &ei->i_prealloc_lock;
        pa->pa_inode = ac->ac_inode;
 
-       ext4_lock_group(sb, ac->ac_b_ex.fe_group);
        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
-       ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
 
        spin_lock(pa->pa_obj_lock);
        list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
        spin_unlock(pa->pa_obj_lock);
-
-       return 0;
 }
 
 /*
  * creates new preallocated space for locality group inodes belongs to
  */
-static noinline_for_stack int
+static noinline_for_stack void
 ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 {
        struct super_block *sb = ac->ac_sb;
@@ -3808,11 +3811,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+       BUG_ON(ac->ac_pa == NULL);
 
-       BUG_ON(ext4_pspace_cachep == NULL);
-       pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
-       if (pa == NULL)
-               return -ENOMEM;
+       pa = ac->ac_pa;
 
        /* preallocation can change ac_b_ex, thus we store actually
         * allocated blocks for history */
@@ -3822,7 +3823,6 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
        pa->pa_lstart = pa->pa_pstart;
        pa->pa_len = ac->ac_b_ex.fe_len;
        pa->pa_free = pa->pa_len;
-       atomic_set(&pa->pa_count, 1);
        spin_lock_init(&pa->pa_lock);
        INIT_LIST_HEAD(&pa->pa_inode_list);
        INIT_LIST_HEAD(&pa->pa_group_list);
@@ -3843,26 +3843,20 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
        pa->pa_obj_lock = &lg->lg_prealloc_lock;
        pa->pa_inode = NULL;
 
-       ext4_lock_group(sb, ac->ac_b_ex.fe_group);
        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
-       ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
 
        /*
         * We will later add the new pa to the right bucket
         * after updating the pa_free in ext4_mb_release_context
         */
-       return 0;
 }
 
-static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
 {
-       int err;
-
        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
-               err = ext4_mb_new_group_pa(ac);
+               ext4_mb_new_group_pa(ac);
        else
-               err = ext4_mb_new_inode_pa(ac);
-       return err;
+               ext4_mb_new_inode_pa(ac);
 }
 
 /*
@@ -4177,6 +4171,29 @@ void ext4_discard_preallocations(struct inode *inode)
        }
 }
 
+static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
+{
+       struct ext4_prealloc_space *pa;
+
+       BUG_ON(ext4_pspace_cachep == NULL);
+       pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
+       if (!pa)
+               return -ENOMEM;
+       atomic_set(&pa->pa_count, 1);
+       ac->ac_pa = pa;
+       return 0;
+}
+
+static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
+{
+       struct ext4_prealloc_space *pa = ac->ac_pa;
+
+       BUG_ON(!pa);
+       ac->ac_pa = NULL;
+       WARN_ON(!atomic_dec_and_test(&pa->pa_count));
+       kmem_cache_free(ext4_pspace_cachep, pa);
+}
+
 #ifdef CONFIG_EXT4_DEBUG
 static inline void ext4_mb_show_pa(struct super_block *sb)
 {
@@ -4633,23 +4650,28 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        if (!ext4_mb_use_preallocated(ac)) {
                ac->ac_op = EXT4_MB_HISTORY_ALLOC;
                ext4_mb_normalize_request(ac, ar);
+
+               *errp = ext4_mb_pa_alloc(ac);
+               if (*errp)
+                       goto errout;
 repeat:
                /* allocate space in core */
                *errp = ext4_mb_regular_allocator(ac);
-               if (*errp)
-                       goto discard_and_exit;
-
-               /* as we've just preallocated more space than
-                * user requested originally, we store allocated
-                * space in a special descriptor */
-               if (ac->ac_status == AC_STATUS_FOUND &&
-                   ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
-                       *errp = ext4_mb_new_preallocation(ac);
+               /*
+                * The pa allocated above is added to grp->bb_prealloc_list
+                * only when we were able to allocate some blocks, i.e. when
+                * ac->ac_status == AC_STATUS_FOUND.
+                * An error from above means ac->ac_status != AC_STATUS_FOUND,
+                * so we have to free this pa here itself.
+                */
                if (*errp) {
-               discard_and_exit:
+                       ext4_mb_pa_free(ac);
                        ext4_discard_allocated_blocks(ac);
                        goto errout;
                }
+               if (ac->ac_status == AC_STATUS_FOUND &&
+                       ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
+                       ext4_mb_pa_free(ac);
        }
        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
                *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4664,6 +4686,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
                freed  = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
                if (freed)
                        goto repeat;
+               /*
+                * If block allocation fails, the pa allocated above
+                * must be freed here itself.
+                */
+               ext4_mb_pa_free(ac);
                *errp = -ENOSPC;
        }
 
-- 
2.21.0
