On Fri, Dec 19, 2014 at 01:41:35PM -0500, Josef Bacik wrote:
> Currently any time we try to update the block groups on disk we will walk 
> _all_
> block groups and check for the ->dirty flag to see if it is set.  This 
> function
> can get called several times during a commit.  So if you have several 
> terabytes
> of data you will be a very sad panda as we will loop through _all_ of the 
> block
> groups several times, which makes the commit take a while which slows down the
> rest of the file system operations.
> 
> This patch introduces a dirty list for the block groups that we get added to
> when we dirty the block group for the first time.  Then we simply update any
> block groups that have been dirtied since the last time we called
> btrfs_write_dirty_block_groups.  This allows us to clean up how we write the
> free space cache out so it is much cleaner.  Thanks,
> 
> Signed-off-by: Josef Bacik <[email protected]>
> ---
> V1->V2: Don't unconditionally take the dirty bg list lock in 
> update_block_group,
> only do it if our dirty_bg list is empty.
> 
>  fs/btrfs/ctree.h            |   5 +-
>  fs/btrfs/extent-tree.c      | 169 
> ++++++++++++++------------------------------
>  fs/btrfs/free-space-cache.c |   8 ++-
>  fs/btrfs/transaction.c      |  14 ++--
>  fs/btrfs/transaction.h      |   2 +
>  5 files changed, 74 insertions(+), 124 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index b62315b..e5bc509 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1237,7 +1237,6 @@ enum btrfs_disk_cache_state {
>       BTRFS_DC_ERROR          = 1,
>       BTRFS_DC_CLEAR          = 2,
>       BTRFS_DC_SETUP          = 3,
> -     BTRFS_DC_NEED_WRITE     = 4,
>  };
>  
>  struct btrfs_caching_control {
> @@ -1275,7 +1274,6 @@ struct btrfs_block_group_cache {
>       unsigned long full_stripe_len;
>  
>       unsigned int ro:1;
> -     unsigned int dirty:1;
>       unsigned int iref:1;
>  
>       int disk_cache_state;
> @@ -1309,6 +1307,9 @@ struct btrfs_block_group_cache {
>  
>       /* For read-only block groups */
>       struct list_head ro_list;
> +
> +     /* For dirty block groups */
> +     struct list_head dirty_list;
>  };
>  
>  /* delayed seq elem */
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 74eb29d..71a9752 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -74,8 +74,9 @@ enum {
>       RESERVE_ALLOC_NO_ACCOUNT = 2,
>  };
>  
> -static int update_block_group(struct btrfs_root *root,
> -                           u64 bytenr, u64 num_bytes, int alloc);
> +static int update_block_group(struct btrfs_trans_handle *trans,
> +                           struct btrfs_root *root, u64 bytenr,
> +                           u64 num_bytes, int alloc);
>  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
>                               struct btrfs_root *root,
>                               u64 bytenr, u64 num_bytes, u64 parent,
> @@ -3315,120 +3316,42 @@ int btrfs_write_dirty_block_groups(struct 
> btrfs_trans_handle *trans,
>                                  struct btrfs_root *root)
>  {
>       struct btrfs_block_group_cache *cache;
> -     int err = 0;
> +     struct btrfs_transaction *cur_trans = trans->transaction;
> +     int ret = 0;
>       struct btrfs_path *path;
> -     u64 last = 0;
> +
> +     if (list_empty(&cur_trans->dirty_bgs))
> +             return 0;
>  
>       path = btrfs_alloc_path();
>       if (!path)
>               return -ENOMEM;
>  
> -again:
> -     while (1) {
> -             cache = btrfs_lookup_first_block_group(root->fs_info, last);
> -             while (cache) {
> -                     if (cache->disk_cache_state == BTRFS_DC_CLEAR)
> -                             break;
> -                     cache = next_block_group(root, cache);
> -             }
> -             if (!cache) {
> -                     if (last == 0)
> -                             break;
> -                     last = 0;
> -                     continue;
> -             }
> -             err = cache_save_setup(cache, trans, path);
> -             last = cache->key.objectid + cache->key.offset;
> -             btrfs_put_block_group(cache);
> -     }
> -
> -     while (1) {
> -             if (last == 0) {
> -                     err = btrfs_run_delayed_refs(trans, root,
> -                                                  (unsigned long)-1);
> -                     if (err) /* File system offline */
> -                             goto out;
> -             }
> -
> -             cache = btrfs_lookup_first_block_group(root->fs_info, last);
> -             while (cache) {
> -                     if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
> -                             btrfs_put_block_group(cache);
> -                             goto again;
> -                     }
> -
> -                     if (cache->dirty)
> -                             break;
> -                     cache = next_block_group(root, cache);
> -             }
> -             if (!cache) {
> -                     if (last == 0)
> -                             break;
> -                     last = 0;
> -                     continue;
> -             }
> -
> -             if (cache->disk_cache_state == BTRFS_DC_SETUP)
> -                     cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
> -             cache->dirty = 0;
> -             last = cache->key.objectid + cache->key.offset;
> -
> -             err = write_one_cache_group(trans, root, path, cache);
> -             btrfs_put_block_group(cache);
> -             if (err) /* File system offline */
> -                     goto out;
> -     }
> -
> -     while (1) {
> -             /*
> -              * I don't think this is needed since we're just marking our
> -              * preallocated extent as written, but just in case it can't
> -              * hurt.
> -              */
> -             if (last == 0) {
> -                     err = btrfs_run_delayed_refs(trans, root,
> -                                                  (unsigned long)-1);
> -                     if (err) /* File system offline */
> -                             goto out;
> -             }
> -
> -             cache = btrfs_lookup_first_block_group(root->fs_info, last);
> -             while (cache) {
> -                     /*
> -                      * Really this shouldn't happen, but it could if we
> -                      * couldn't write the entire preallocated extent and
> -                      * splitting the extent resulted in a new block.
> -                      */
> -                     if (cache->dirty) {
> -                             btrfs_put_block_group(cache);
> -                             goto again;
> -                     }
> -                     if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
> -                             break;
> -                     cache = next_block_group(root, cache);
> -             }
> -             if (!cache) {
> -                     if (last == 0)
> -                             break;
> -                     last = 0;
> -                     continue;
> -             }
> -
> -             err = btrfs_write_out_cache(root, trans, cache, path);
> -
> -             /*
> -              * If we didn't have an error then the cache state is still
> -              * NEED_WRITE, so we can set it to WRITTEN.
> -              */
> -             if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
> -                     cache->disk_cache_state = BTRFS_DC_WRITTEN;
> -             last = cache->key.objectid + cache->key.offset;
> +     /*
> +      * We don't need the lock here since we are protected by the transaction
> +      * commit.  We want to do the cache_save_setup first and then run the
> +      * delayed refs to make sure we have the best chance at doing this all
> +      * in one shot.
> +      */
> +     while (!list_empty(&cur_trans->dirty_bgs)) {
> +             cache = list_first_entry(&cur_trans->dirty_bgs,
> +                                      struct btrfs_block_group_cache,
> +                                      dirty_list);
> +             list_del_init(&cache->dirty_list);
> +             if (cache->disk_cache_state == BTRFS_DC_CLEAR)
> +                     cache_save_setup(cache, trans, path);
> +             if (!ret)
> +                     ret = btrfs_run_delayed_refs(trans, root,
> +                                                  (unsigned long) -1);
> +             if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
> +                     btrfs_write_out_cache(root, trans, cache, path);
> +             if (!ret)
> +                     ret = write_one_cache_group(trans, root, path, cache);
>               btrfs_put_block_group(cache);
>       }
> -out:
>  
>       btrfs_free_path(path);
> -     return err;
> +     return ret;
>  }
>  
>  int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
> @@ -5375,8 +5298,9 @@ void btrfs_delalloc_release_space(struct inode *inode, 
> u64 num_bytes)
>       btrfs_free_reserved_data_space(inode, num_bytes);
>  }
>  
> -static int update_block_group(struct btrfs_root *root,
> -                           u64 bytenr, u64 num_bytes, int alloc)
> +static int update_block_group(struct btrfs_trans_handle *trans,
> +                           struct btrfs_root *root, u64 bytenr,
> +                           u64 num_bytes, int alloc)
>  {
>       struct btrfs_block_group_cache *cache = NULL;
>       struct btrfs_fs_info *info = root->fs_info;
> @@ -5414,6 +5338,16 @@ static int update_block_group(struct btrfs_root *root,
>               if (!alloc && cache->cached == BTRFS_CACHE_NO)
>                       cache_block_group(cache, 1);
>  
> +             if (list_empty(&cache->dirty_list)) {
> +                     spin_lock(&trans->transaction->dirty_bgs_lock);
> +                     if (list_empty(&cache->dirty_list)) {
> +                             list_add_tail(&cache->dirty_list,
> +                                           &trans->transaction->dirty_bgs);
> +                             btrfs_get_block_group(cache);
> +                     }
> +                     spin_unlock(&trans->transaction->dirty_bgs_lock);
> +             }
> +
>               byte_in_group = bytenr - cache->key.objectid;
>               WARN_ON(byte_in_group > cache->key.offset);
>  
> @@ -5424,7 +5358,6 @@ static int update_block_group(struct btrfs_root *root,
>                   cache->disk_cache_state < BTRFS_DC_CLEAR)
>                       cache->disk_cache_state = BTRFS_DC_CLEAR;
>  
> -             cache->dirty = 1;
>               old_val = btrfs_block_group_used(&cache->item);
>               num_bytes = min(total, cache->key.offset - byte_in_group);
>               if (alloc) {
> @@ -6102,7 +6035,7 @@ static int __btrfs_free_extent(struct 
> btrfs_trans_handle *trans,
>                       }
>               }
>  
> -             ret = update_block_group(root, bytenr, num_bytes, 0);
> +             ret = update_block_group(trans, root, bytenr, num_bytes, 0);
>               if (ret) {
>                       btrfs_abort_transaction(trans, extent_root, ret);
>                       goto out;
> @@ -7062,7 +6995,7 @@ static int alloc_reserved_file_extent(struct 
> btrfs_trans_handle *trans,
>       if (ret)
>               return ret;
>  
> -     ret = update_block_group(root, ins->objectid, ins->offset, 1);
> +     ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
>       if (ret) { /* -ENOENT, logic error */
>               btrfs_err(fs_info, "update block group failed for %llu %llu",
>                       ins->objectid, ins->offset);
> @@ -7151,7 +7084,8 @@ static int alloc_reserved_tree_block(struct 
> btrfs_trans_handle *trans,
>                       return ret;
>       }
>  
> -     ret = update_block_group(root, ins->objectid, root->nodesize, 1);
> +     ret = update_block_group(trans, root, ins->objectid, root->nodesize,
> +                              1);
>       if (ret) { /* -ENOENT, logic error */
>               btrfs_err(fs_info, "update block group failed for %llu %llu",
>                       ins->objectid, ins->offset);
> @@ -9003,6 +8937,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, 
> u64 start, u64 size)
>       INIT_LIST_HEAD(&cache->cluster_list);
>       INIT_LIST_HEAD(&cache->bg_list);
>       INIT_LIST_HEAD(&cache->ro_list);
> +     INIT_LIST_HEAD(&cache->dirty_list);
>       btrfs_init_free_space_ctl(cache);
>  
>       return cache;
> @@ -9065,9 +9000,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
>                        * b) Setting 'dirty flag' makes sure that we flush
>                        *    the new space cache info onto disk.
>                        */
> -                     cache->disk_cache_state = BTRFS_DC_CLEAR;
>                       if (btrfs_test_opt(root, SPACE_CACHE))
> -                             cache->dirty = 1;
> +                             cache->disk_cache_state = BTRFS_DC_CLEAR;
>               }
>  
>               read_extent_buffer(leaf, &cache->item,
> @@ -9427,6 +9361,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle 
> *trans,
>       if (block_group->cached == BTRFS_CACHE_STARTED)
>               wait_block_group_cache_done(block_group);
>  
> +     spin_lock(&trans->transaction->dirty_bgs_lock);
> +     if (!list_empty(&block_group->dirty_list)) {
> +             list_del_init(&block_group->dirty_list);
> +             btrfs_put_block_group(block_group);
> +     }
> +     spin_unlock(&trans->transaction->dirty_bgs_lock);
> +
>       btrfs_remove_free_space_cache(block_group);
>  
>       spin_lock(&block_group->space_info->lock);
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index 3384819..e85de8c 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -1210,6 +1210,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>       struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
>       struct inode *inode;
>       int ret = 0;
> +     enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
>  
>       root = root->fs_info->tree_root;
>  
> @@ -1233,9 +1234,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>       ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
>                                     path, block_group->key.objectid);
>       if (ret) {
> -             spin_lock(&block_group->lock);
> -             block_group->disk_cache_state = BTRFS_DC_ERROR;
> -             spin_unlock(&block_group->lock);
> +             dcs = BTRFS_DC_ERROR;
>               ret = 0;
>  #ifdef DEBUG
>               btrfs_err(root->fs_info,
> @@ -1244,6 +1243,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>  #endif
>       }
>  
> +     spin_lock(&block_group->lock);
> +     block_group->disk_cache_state = dcs;
> +     spin_unlock(&block_group->lock);
>       iput(inode);
>       return ret;
>  }
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index 6b1ac53..4b3f999 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -220,6 +220,8 @@ loop:
>       INIT_LIST_HEAD(&cur_trans->pending_snapshots);
>       INIT_LIST_HEAD(&cur_trans->pending_chunks);
>       INIT_LIST_HEAD(&cur_trans->switch_commits);
> +     INIT_LIST_HEAD(&cur_trans->dirty_bgs);
> +     spin_lock_init(&cur_trans->dirty_bgs_lock);
>       list_add_tail(&cur_trans->list, &fs_info->trans_list);
>       extent_io_tree_init(&cur_trans->dirty_pages,
>                            fs_info->btree_inode->i_mapping);
> @@ -957,7 +959,9 @@ static int update_cowonly_root(struct btrfs_trans_handle 
> *trans,
>       while (1) {
>               old_root_bytenr = btrfs_root_bytenr(&root->root_item);
>               if (old_root_bytenr == root->node->start &&
> -                 old_root_used == btrfs_root_used(&root->root_item))
> +                 old_root_used == btrfs_root_used(&root->root_item) &&
> +                 (!extent_root ||
> +                  list_empty(&trans->transaction->dirty_bgs)))
>                       break;
>  
>               btrfs_set_root_node(&root->root_item, root->node);
> @@ -976,6 +980,9 @@ static int update_cowonly_root(struct btrfs_trans_handle 
> *trans,
>               ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
>               if (ret)
>                       return ret;
> +             ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
> +             if (ret)
> +                     return ret;

Does the second btrfs_run_delayed_refs() mean to update extent_tree's delayed 
refs produced by the first one?

Thanks,

-liubo

>       }
>  
>       return 0;
> @@ -996,10 +1003,6 @@ static noinline int commit_cowonly_roots(struct 
> btrfs_trans_handle *trans,
>       struct extent_buffer *eb;
>       int ret;
>  
> -     ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
> -     if (ret)
> -             return ret;
> -
>       eb = btrfs_lock_root_node(fs_info->tree_root);
>       ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
>                             0, &eb);
> @@ -1897,6 +1900,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle 
> *trans,
>       switch_commit_roots(cur_trans, root->fs_info);
>  
>       assert_qgroups_uptodate(trans);
> +     ASSERT(list_empty(&cur_trans->dirty_bgs));
>       update_super_roots(root);
>  
>       btrfs_set_super_log_root(root->fs_info->super_copy, 0);
> diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
> index d8f40e1..2aa345e 100644
> --- a/fs/btrfs/transaction.h
> +++ b/fs/btrfs/transaction.h
> @@ -57,6 +57,8 @@ struct btrfs_transaction {
>       struct list_head pending_snapshots;
>       struct list_head pending_chunks;
>       struct list_head switch_commits;
> +     struct list_head dirty_bgs;
> +     spinlock_t dirty_bgs_lock;
>       struct btrfs_delayed_ref_root delayed_refs;
>       int aborted;
>  };
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to