On Mon, Apr 19, 2010 at 06:45:44PM +0800, Yan, Zheng wrote:
> We already have fs_info->chunk_mutex to avoid concurrent
> chunk creation.
> 
> Signed-off-by: Yan Zheng <zheng....@oracle.com>
> 
> ---
> diff -urp 2/fs/btrfs/ctree.h 3/fs/btrfs/ctree.h
> --- 2/fs/btrfs/ctree.h        2010-04-18 08:12:22.086699485 +0800
> +++ 3/fs/btrfs/ctree.h        2010-04-18 08:13:15.457699211 +0800
> @@ -700,9 +700,7 @@ struct btrfs_space_info {
>       struct list_head list;
>  
>       /* for controlling how we free up space for allocations */
> -     wait_queue_head_t allocate_wait;
>       wait_queue_head_t flush_wait;
> -     int allocating_chunk;
>       int flushing;
>  
>       /* for block groups in our same type */
> diff -urp 2/fs/btrfs/extent-tree.c 3/fs/btrfs/extent-tree.c
> --- 2/fs/btrfs/extent-tree.c  2010-04-18 08:12:22.092698714 +0800
> +++ 3/fs/btrfs/extent-tree.c  2010-04-18 08:13:15.463699138 +0800
> @@ -70,6 +70,9 @@ static int find_next_key(struct btrfs_pa
>                        struct btrfs_key *key);
>  static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
>                           int dump_block_groups);
> +static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
> +                             struct btrfs_root *root,
> +                             struct btrfs_space_info *sinfo, u64 num_bytes);
>  
>  static noinline int
>  block_group_cache_done(struct btrfs_block_group_cache *cache)
> @@ -2687,7 +2690,6 @@ static int update_space_info(struct btrf
>               INIT_LIST_HEAD(&found->block_groups[i]);
>       init_rwsem(&found->groups_sem);
>       init_waitqueue_head(&found->flush_wait);
> -     init_waitqueue_head(&found->allocate_wait);
>       spin_lock_init(&found->lock);
>       found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
>                               BTRFS_BLOCK_GROUP_SYSTEM |
> @@ -3000,71 +3002,6 @@ flush:
>       wake_up(&info->flush_wait);
>  }
>  
> -static int maybe_allocate_chunk(struct btrfs_root *root,
> -                              struct btrfs_space_info *info)
> -{
> -     struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
> -     struct btrfs_trans_handle *trans;
> -     bool wait = false;
> -     int ret = 0;
> -     u64 min_metadata;
> -     u64 free_space;
> -
> -     free_space = btrfs_super_total_bytes(disk_super);
> -     /*
> -      * we allow the metadata to grow to a max of either 10gb or 5% of the
> -      * space in the volume.
> -      */
> -     min_metadata = min((u64)10 * 1024 * 1024 * 1024,
> -                          div64_u64(free_space * 5, 100));
> -     if (info->total_bytes >= min_metadata) {
> -             spin_unlock(&info->lock);
> -             return 0;
> -     }
> -
> -     if (info->full) {
> -             spin_unlock(&info->lock);
> -             return 0;
> -     }
> -
> -     if (!info->allocating_chunk) {
> -             info->force_alloc = 1;
> -             info->allocating_chunk = 1;
> -     } else {
> -             wait = true;
> -     }
> -
> -     spin_unlock(&info->lock);
> -
> -     if (wait) {
> -             wait_event(info->allocate_wait,
> -                        !info->allocating_chunk);
> -             return 1;
> -     }
> -
> -     trans = btrfs_start_transaction(root, 1);
> -     if (!trans) {
> -             ret = -ENOMEM;
> -             goto out;
> -     }
> -
> -     ret = do_chunk_alloc(trans, root->fs_info->extent_root,
> -                          4096 + 2 * 1024 * 1024,
> -                          info->flags, 0);
> -     btrfs_end_transaction(trans, root);
> -     if (ret)
> -             goto out;
> -out:
> -     spin_lock(&info->lock);
> -     info->allocating_chunk = 0;
> -     spin_unlock(&info->lock);
> -     wake_up(&info->allocate_wait);
> -
> -     if (ret)
> -             return 0;
> -     return 1;
> -}
> -
>  /*
>   * Reserve metadata space for delalloc.
>   */
> @@ -3105,7 +3042,8 @@ again:
>               flushed++;
>  
>               if (flushed == 1) {
> -                     if (maybe_allocate_chunk(root, meta_sinfo))
> +                     if (maybe_allocate_chunk(NULL, root, meta_sinfo,
> +                                              num_bytes))
>                               goto again;
>                       flushed++;
>               } else {
> @@ -3220,7 +3158,8 @@ again:
>       if (used > meta_sinfo->total_bytes) {
>               retries++;
>               if (retries == 1) {
> -                     if (maybe_allocate_chunk(root, meta_sinfo))
> +                     if (maybe_allocate_chunk(NULL, root, meta_sinfo,
> +                                              num_bytes))
>                               goto again;
>                       retries++;
>               } else {
> @@ -3417,13 +3356,28 @@ static void force_metadata_allocation(st
>       rcu_read_unlock();
>  }
>  
> +static int should_alloc_chunk(struct btrfs_space_info *sinfo,
> +                           u64 alloc_bytes)
> +{
> +     u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
> +
> +     if (sinfo->bytes_used + sinfo->bytes_reserved +
> +         alloc_bytes + 256 * 1024 * 1024 < num_bytes)
> +             return 0;
> +
> +     if (sinfo->bytes_used + sinfo->bytes_reserved +
> +         alloc_bytes < div_factor(num_bytes, 8))
> +             return 0;
> +
> +     return 1;
> +}
> +
>  static int do_chunk_alloc(struct btrfs_trans_handle *trans,
>                         struct btrfs_root *extent_root, u64 alloc_bytes,
>                         u64 flags, int force)
>  {
>       struct btrfs_space_info *space_info;
>       struct btrfs_fs_info *fs_info = extent_root->fs_info;
> -     u64 thresh;
>       int ret = 0;
>  
>       mutex_lock(&fs_info->chunk_mutex);
> @@ -3446,11 +3400,7 @@ static int do_chunk_alloc(struct btrfs_t
>               goto out;
>       }
>  
> -     thresh = space_info->total_bytes - space_info->bytes_readonly;
> -     thresh = div_factor(thresh, 8);
> -     if (!force &&
> -        (space_info->bytes_used + space_info->bytes_pinned +
> -         space_info->bytes_reserved + alloc_bytes) < thresh) {
> +     if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
>               spin_unlock(&space_info->lock);
>               goto out;
>       }
> @@ -3472,6 +3422,8 @@ static int do_chunk_alloc(struct btrfs_t
>       spin_lock(&space_info->lock);
>       if (ret)
>               space_info->full = 1;
> +     else
> +             ret = 1;
>       space_info->force_alloc = 0;
>       spin_unlock(&space_info->lock);
>  out:
> @@ -3479,6 +3431,38 @@ out:
>       return ret;
>  }
>  
> +static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
> +                             struct btrfs_root *root,
> +                             struct btrfs_space_info *sinfo, u64 num_bytes)
> +{
> +     int ret;
> +     int end_trans = 0;
> +
> +     if (sinfo->full)
> +             return 0;
> +

maybe_allocate_chunk() is called with info->lock already held, so taking
sinfo->lock again here will deadlock.
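
A minimal userspace analogue (pthread spinlocks standing in for the kernel
spin lock; the names and the program itself are made up, this is not btrfs
code) of why that second acquisition is fatal: these locks are not
recursive, so the program below never gets past the nested call.

#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t lock;			/* stands in for sinfo->lock */

static int maybe_allocate_chunk_sketch(void)
{
	pthread_spin_lock(&lock);		/* second acquisition: spins forever */
	/* the should_alloc_chunk() check would go here */
	pthread_spin_unlock(&lock);
	return 0;
}

int main(void)
{
	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
	pthread_spin_lock(&lock);		/* like the call sites that hold info->lock */
	maybe_allocate_chunk_sketch();		/* never returns */
	printf("not reached\n");
	return 0;
}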

> +     spin_lock(&sinfo->lock);
> +     ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
> +     spin_unlock(&sinfo->lock);
> +     if (!ret)
> +             return 0;
> +
> +     if (!trans) {
> +             trans = btrfs_join_transaction(root, 1);
> +             BUG_ON(IS_ERR(trans));
> +             end_trans = 1;
> +     }
> +
> +     ret = do_chunk_alloc(trans, root->fs_info->extent_root,
> +                          num_bytes + 2 * 1024 * 1024,
> +                          get_alloc_profile(root, sinfo->flags), 0);
> +
> +     if (end_trans)
> +             btrfs_end_transaction(trans, root);
> +
> +     return ret == 1 ? 1 : 0;
> +}
> +
>  static int update_block_group(struct btrfs_trans_handle *trans,
>                             struct btrfs_root *root,
>                             u64 bytenr, u64 num_bytes, int alloc,

The purpose of maybe_allocate_chunk was that there is otherwise no way to know
whether some other CPU is currently trying to allocate a chunk for the given
space info.  We could have two CPUs come into do_chunk_alloc at roughly the
same time and end up allocating twice the amount of space, which is why I did
the waitqueue thing.  It seems like this is still a possibility with your patch.
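
For reference, a userspace sketch of the pattern the removed
allocate_wait/allocating_chunk pair implemented (pthread mutex and condvar
standing in for the spin lock and waitqueue; the function and thread names
are made up, this is not the btrfs code itself): a caller that overlaps an
in-flight allocation waits for it to finish and returns instead of
allocating a second chunk.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;	/* sinfo->lock */
static pthread_cond_t allocate_wait = PTHREAD_COND_INITIALIZER;	/* waitqueue */
static bool allocating_chunk;

static void *alloc_chunk_once(void *arg)
{
	pthread_mutex_lock(&lock);
	if (allocating_chunk) {
		/* another CPU is already allocating a chunk for this
		 * space info: wait for it instead of allocating again */
		while (allocating_chunk)
			pthread_cond_wait(&allocate_wait, &lock);
		pthread_mutex_unlock(&lock);
		return NULL;
	}
	allocating_chunk = true;
	pthread_mutex_unlock(&lock);

	printf("thread %ld allocates the chunk\n", (long)arg);	/* do_chunk_alloc() */

	pthread_mutex_lock(&lock);
	allocating_chunk = false;
	pthread_mutex_unlock(&lock);
	pthread_cond_broadcast(&allocate_wait);		/* wake_up(&allocate_wait) */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, alloc_chunk_once, (void *)1L);
	pthread_create(&b, NULL, alloc_chunk_once, (void *)2L);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

Thanks,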

Josef