Any Comment?

On Thu, 08 Sep 2011 16:18:09 +0800, Miao Xie wrote:
> This patch introduce free space cluster for each node in the b+tree. And
> we also simplify the fill method and the space allocation of the cluster:
> - Allocate free space cluster for each node
> - Allocate the free space extent (>=256KB, split a large extent or get the
>   contiguous blocks from the bitmaps) from the block group cache and cache
>   it in the cluster, instead of moving the free space entry from the block
>   group cache to the cluster directly. By this way, we just manage a extent
>   in the cluster.
> - When doing the tree block allocation, we can get the space just by split
>   the extent in the parent's cluster. By this way, we can allocate the tree
>   block concurrently and also can store the child nodes/leaves into the
>   contiguous blocks.
> 
> Beside that, we modify the source of the space cache to make it fit the above
> change. When we write out the information of the free space, we also write out
> the extent information in the clusters. And when we load the free space
> information, we will try to merge the free space fragment at first, because 
> the
> extent in the clusters is small, and may merge them to be a large one.
> (Before applying this patch, we build the free space tree by the cached 
> information
> directly. I think we needn't worry about the compatibility. At the worst, we 
> may
> create lots of small extents which may be merge into a large one, but it will 
> not
> break the use of Btrfs.)
> 
> We did some performance test for this patch, we find it works well, and make 
> the
> small file performance grow up.
> 
> The sequential write performance of the small file:
> N_files               N_threads       File Size       Before          After
> 10240         8               2KB(Inline)     2.2055MB/s      4.1779MB/s
> 10240         8               4KB             1.001MB/s       1.1893MB/s
> 
> Signed-off-by: Miao Xie <[email protected]>
> ---
>  fs/btrfs/ctree.c            |   28 ++-
>  fs/btrfs/ctree.h            |   50 +++-
>  fs/btrfs/disk-io.c          |    2 +-
>  fs/btrfs/extent-tree.c      |  107 +++++---
>  fs/btrfs/extent_io.c        |    7 +-
>  fs/btrfs/extent_io.h        |    3 +
>  fs/btrfs/free-space-cache.c |  667 
> ++++++++++++++++---------------------------
>  fs/btrfs/free-space-cache.h |   13 +
>  fs/btrfs/inode-map.c        |    1 +
>  fs/btrfs/inode.c            |   25 +-
>  fs/btrfs/ioctl.c            |    2 +-
>  fs/btrfs/tree-log.c         |    7 +
>  12 files changed, 419 insertions(+), 493 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 011cab3..1d3edd8 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -238,7 +238,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
>       else
>               btrfs_node_key(buf, &disk_key, 0);
>  
> -     cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
> +     cow = btrfs_alloc_free_block(trans, root, buf->len, NULL, 0,
>                                    new_root_objectid, &disk_key, level,
>                                    buf->start, 0);
>       if (IS_ERR(cow))
> @@ -444,9 +444,10 @@ static noinline int __btrfs_cow_block(struct 
> btrfs_trans_handle *trans,
>       } else
>               parent_start = 0;
>  
> -     cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
> -                                  root->root_key.objectid, &disk_key,
> -                                  level, search_start, empty_size);
> +     cow = btrfs_alloc_free_block(trans, root, buf->len, parent,
> +                                  parent_start, root->root_key.objectid,
> +                                  &disk_key, level, search_start,
> +                                  empty_size);
>       if (IS_ERR(cow))
>               return PTR_ERR(cow);
>  
> @@ -467,6 +468,10 @@ static noinline int __btrfs_cow_block(struct 
> btrfs_trans_handle *trans,
>                           (unsigned long)btrfs_header_fsid(cow),
>                           BTRFS_FSID_SIZE);
>  
> +     WARN_ON(cow->cluster);
> +     cow->cluster = buf->cluster;
> +     buf->cluster = NULL;
> +
>       update_ref_for_cow(trans, root, buf, cow, &last_ref);
>  
>       if (root->ref_cows)
> @@ -2070,7 +2075,7 @@ static noinline int insert_new_root(struct 
> btrfs_trans_handle *trans,
>       else
>               btrfs_node_key(lower, &lower_key, 0);
>  
> -     c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
> +     c = btrfs_alloc_free_block(trans, root, root->nodesize, NULL, 0,
>                                  root->root_key.objectid, &lower_key,
>                                  level, root->node->start, 0);
>       if (IS_ERR(c))
> @@ -2170,6 +2175,7 @@ static noinline int split_node(struct 
> btrfs_trans_handle *trans,
>  {
>       struct extent_buffer *c;
>       struct extent_buffer *split;
> +     struct extent_buffer *parent = NULL;
>       struct btrfs_disk_key disk_key;
>       int mid;
>       int ret;
> @@ -2197,9 +2203,12 @@ static noinline int split_node(struct 
> btrfs_trans_handle *trans,
>       mid = (c_nritems + 1) / 2;
>       btrfs_node_key(c, &disk_key, mid);
>  
> -     split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
> -                                     root->root_key.objectid,
> -                                     &disk_key, level, c->start, 0);
> +     if (level + 1 < BTRFS_MAX_LEVEL)
> +             parent = path->nodes[level + 1];
> +
> +     split = btrfs_alloc_free_block(trans, root, root->nodesize, parent, 0,
> +                                    root->root_key.objectid,
> +                                    &disk_key, level, c->start, 0);
>       if (IS_ERR(split))
>               return PTR_ERR(split);
>  
> @@ -2951,7 +2960,8 @@ again:
>       else
>               btrfs_item_key(l, &disk_key, mid);
>  
> -     right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
> +     right = btrfs_alloc_free_block(trans, root, root->leafsize,
> +                                     path->nodes[1], 0,
>                                       root->root_key.objectid,
>                                       &disk_key, 0, l->start, 0);
>       if (IS_ERR(right))
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index c364d50..8c33a74 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -789,14 +789,9 @@ struct btrfs_block_rsv {
>   */
>  struct btrfs_free_cluster {
>       spinlock_t lock;
> -     spinlock_t refill_lock;
> -     struct rb_root root;
> +     struct mutex refill_lock;
>  
> -     /* largest extent in this cluster */
> -     u64 max_size;
> -
> -     /* first extent starting offset */
> -     u64 window_start;
> +     struct btrfs_free_space *info;
>  
>       struct btrfs_block_group_cache *block_group;
>       /*
> @@ -2156,7 +2151,9 @@ u64 btrfs_find_block_group(struct btrfs_root *root,
>                          u64 search_start, u64 search_hint, int owner);
>  struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle 
> *trans,
>                                       struct btrfs_root *root, u32 blocksize,
> -                                     u64 parent, u64 root_objectid,
> +                                     struct extent_buffer *parent,
> +                                     u64 parent_start,
> +                                     u64 root_objectid,
>                                       struct btrfs_disk_key *key, int level,
>                                       u64 hint, u64 empty_size);
>  void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
> @@ -2175,12 +2172,37 @@ int btrfs_alloc_logged_file_extent(struct 
> btrfs_trans_handle *trans,
>                                  struct btrfs_root *root,
>                                  u64 root_objectid, u64 owner, u64 offset,
>                                  struct btrfs_key *ins);
> -int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
> -                               struct btrfs_root *root,
> -                               u64 num_bytes, u64 min_alloc_size,
> -                               u64 empty_size, u64 hint_byte,
> -                               u64 search_end, struct btrfs_key *ins,
> -                               u64 data);
> +int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
> +                        struct btrfs_root *root,
> +                        u64 num_bytes, u64 min_alloc_size,
> +                        u64 empty_size, u64 hint_byte,
> +                        u64 search_end, struct btrfs_key *ins,
> +                        u64 data, struct btrfs_free_cluster *cluster);
> +static inline int btrfs_reserve_extent_data(struct btrfs_trans_handle *trans,
> +                                         struct btrfs_root *root,
> +                                         u64 num_bytes, u64 min_alloc_size,
> +                                         u64 empty_size, u64 hint_byte,
> +                                         u64 search_end,
> +                                         struct btrfs_key *ins)
> +{
> +     return __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
> +                                   empty_size, hint_byte, search_end, ins,
> +                                   1, NULL);
> +}
> +
> +static inline int btrfs_reserve_extent_mdata(struct btrfs_trans_handle 
> *trans,
> +                                          struct btrfs_root *root,
> +                                          u64 num_bytes, u64 min_alloc_size,
> +                                          u64 empty_size, u64 hint_byte,
> +                                          u64 search_end,
> +                                          struct btrfs_key *ins,
> +                                          struct btrfs_free_cluster *cluster)
> +{
> +     return __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
> +                                   empty_size, hint_byte, search_end, ins,
> +                                   0, cluster);
> +}
> +
>  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>                 struct extent_buffer *buf, int full_backref);
>  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 07b3ac6..951a57f 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -1171,7 +1171,7 @@ static struct btrfs_root *alloc_log_tree(struct 
> btrfs_trans_handle *trans,
>        */
>       root->ref_cows = 0;
>  
> -     leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
> +     leaf = btrfs_alloc_free_block(trans, root, root->leafsize, NULL, 0,
>                                     BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
>       if (IS_ERR(leaf)) {
>               kfree(root);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 80d6148..43cc5c4 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4672,6 +4672,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle 
> *trans,
>       struct btrfs_block_group_cache *cache = NULL;
>       int ret;
>  
> +     btrfs_free_extent_cluster(buf);
> +
>       if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
>               ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
>                                               parent, root->root_key.objectid,
> @@ -4853,8 +4855,10 @@ enum btrfs_loop_type {
>       LOOP_FIND_IDEAL = 0,
>       LOOP_CACHING_NOWAIT = 1,
>       LOOP_CACHING_WAIT = 2,
> -     LOOP_ALLOC_CHUNK = 3,
> -     LOOP_NO_EMPTY_SIZE = 4,
> +     LOOP_RECLAIM_CLUSTERS = 3,
> +     LOOP_RECLAIM_ALL_CLUSTERS = 4,
> +     LOOP_ALLOC_CHUNK = 5,
> +     LOOP_NO_EMPTY_SIZE = 6,
>  };
>  
>  /*
> @@ -4870,7 +4874,8 @@ static noinline int find_free_extent(struct 
> btrfs_trans_handle *trans,
>                                    u64 num_bytes, u64 empty_size,
>                                    u64 search_start, u64 search_end,
>                                    u64 hint_byte, struct btrfs_key *ins,
> -                                  u64 data)
> +                                  u64 data,
> +                                  struct btrfs_free_cluster *cluster)
>  {
>       int ret = 0;
>       struct btrfs_root *root = orig_root->fs_info->extent_root;
> @@ -4912,9 +4917,12 @@ static noinline int find_free_extent(struct 
> btrfs_trans_handle *trans,
>               allowed_chunk_alloc = 1;
>  
>       if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
> -             last_ptr = &root->fs_info->meta_alloc_cluster;
> +             if (cluster)
> +                     last_ptr = cluster;
> +             else
> +                     last_ptr = &root->fs_info->meta_alloc_cluster;
>               if (!btrfs_test_opt(root, SSD))
> -                     empty_cluster = 64 * 1024;
> +                     empty_cluster = 256 * 1024;
>       }
>  
>       if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
> @@ -4925,7 +4933,7 @@ static noinline int find_free_extent(struct 
> btrfs_trans_handle *trans,
>       if (last_ptr) {
>               spin_lock(&last_ptr->lock);
>               if (last_ptr->block_group)
> -                     hint_byte = last_ptr->window_start;
> +                     hint_byte = last_ptr->info->offset;
>               spin_unlock(&last_ptr->lock);
>       }
>  
> @@ -5043,6 +5051,13 @@ have_block_group:
>               if (unlikely(block_group->ro))
>                       goto loop;
>  
> +
> +             if (loop == LOOP_RECLAIM_CLUSTERS) {
> +                     btrfs_reclaim_extent_clusters(block_group,
> +                                                   empty_cluster * 2);
> +             } else if (loop == LOOP_RECLAIM_ALL_CLUSTERS)
> +                     btrfs_reclaim_extent_clusters(block_group, 0);
> +
>               spin_lock(&block_group->free_space_ctl->tree_lock);
>               if (cached &&
>                   block_group->free_space_ctl->free_space <
> @@ -5066,7 +5081,7 @@ have_block_group:
>                        * the refill lock keeps out other
>                        * people trying to start a new cluster
>                        */
> -                     spin_lock(&last_ptr->refill_lock);
> +                     mutex_lock(&last_ptr->refill_lock);
>                       if (last_ptr->block_group &&
>                           (last_ptr->block_group->ro ||
>                           !block_group_bits(last_ptr->block_group, data))) {
> @@ -5078,7 +5093,7 @@ have_block_group:
>                                                num_bytes, search_start);
>                       if (offset) {
>                               /* we have a block, we're done */
> -                             spin_unlock(&last_ptr->refill_lock);
> +                             mutex_unlock(&last_ptr->refill_lock);
>                               goto checks;
>                       }
>  
> @@ -5097,7 +5112,7 @@ have_block_group:
>                               block_group = last_ptr->block_group;
>                               btrfs_get_block_group(block_group);
>                               spin_unlock(&last_ptr->lock);
> -                             spin_unlock(&last_ptr->refill_lock);
> +                             mutex_unlock(&last_ptr->refill_lock);
>  
>                               last_ptr_loop = 1;
>                               search_start = block_group->key.objectid;
> @@ -5123,7 +5138,7 @@ refill_cluster:
>                       /* allocate a cluster in this block group */
>                       ret = btrfs_find_space_cluster(trans, root,
>                                              block_group, last_ptr,
> -                                            offset, num_bytes,
> +                                            search_start, num_bytes,
>                                              empty_cluster + empty_size);
>                       if (ret == 0) {
>                               /*
> @@ -5135,12 +5150,12 @@ refill_cluster:
>                                                 search_start);
>                               if (offset) {
>                                       /* we found one, proceed */
> -                                     spin_unlock(&last_ptr->refill_lock);
> +                                     mutex_unlock(&last_ptr->refill_lock);
>                                       goto checks;
>                               }
>                       } else if (!cached && loop > LOOP_CACHING_NOWAIT
>                                  && !failed_cluster_refill) {
> -                             spin_unlock(&last_ptr->refill_lock);
> +                             mutex_unlock(&last_ptr->refill_lock);
>  
>                               failed_cluster_refill = true;
>                               wait_block_group_cache_progress(block_group,
> @@ -5155,7 +5170,7 @@ refill_cluster:
>                        * to use, and go to the next block group
>                        */
>                       btrfs_return_cluster_to_free_space(NULL, last_ptr);
> -                     spin_unlock(&last_ptr->refill_lock);
> +                     mutex_unlock(&last_ptr->refill_lock);
>                       goto loop;
>               }
>  
> @@ -5197,9 +5212,11 @@ checks:
>               ins->objectid = search_start;
>               ins->offset = num_bytes;
>  
> -             if (offset < search_start)
> +             if (offset < search_start) {
>                       btrfs_add_free_space(block_group, offset,
>                                            search_start - offset);
> +                     offset = search_start;
> +             }
>               BUG_ON(offset > search_start);
>  
>               ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
> @@ -5210,13 +5227,6 @@ checks:
>               }
>  
>               /* we are all good, lets return */
> -             ins->objectid = search_start;
> -             ins->offset = num_bytes;
> -
> -             if (offset < search_start)
> -                     btrfs_add_free_space(block_group, offset,
> -                                          search_start - offset);
> -             BUG_ON(offset > search_start);
>               btrfs_put_block_group(block_group);
>               break;
>  loop:
> @@ -5236,6 +5246,12 @@ loop:
>        * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
>        *                      caching kthreads as we move along
>        * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
> +      * LOOP_RECLAIM_CLUSTERS, reclaim free space from some clusters, and
> +      *                        by this way, we may find enough continuous
> +      *                        space to fill the cluster, and then search
> +      *                        the free space again.
> +      * LOOP_RECLAIM_ALL_CLUSTERS, reclaim free space from all the clusters,
> +      *                            and then search again.
>        * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
>        * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
>        *                      again
> @@ -5362,12 +5378,12 @@ again:
>       up_read(&info->groups_sem);
>  }
>  
> -int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
> -                      struct btrfs_root *root,
> -                      u64 num_bytes, u64 min_alloc_size,
> -                      u64 empty_size, u64 hint_byte,
> -                      u64 search_end, struct btrfs_key *ins,
> -                      u64 data)
> +int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
> +                        struct btrfs_root *root,
> +                        u64 num_bytes, u64 min_alloc_size,
> +                        u64 empty_size, u64 hint_byte,
> +                        u64 search_end, struct btrfs_key *ins,
> +                        u64 data, struct btrfs_free_cluster *cluster)
>  {
>       int ret;
>       u64 search_start = 0;
> @@ -5386,7 +5402,7 @@ again:
>       WARN_ON(num_bytes < root->sectorsize);
>       ret = find_free_extent(trans, root, num_bytes, empty_size,
>                              search_start, search_end, hint_byte,
> -                            ins, data);
> +                            ins, data, cluster);
>  
>       if (ret == -ENOSPC && num_bytes > min_alloc_size) {
>               num_bytes = num_bytes >> 1;
> @@ -5741,13 +5757,15 @@ static void unuse_block_rsv(struct btrfs_block_rsv 
> *block_rsv, u32 blocksize)
>   */
>  struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle 
> *trans,
>                                       struct btrfs_root *root, u32 blocksize,
> -                                     u64 parent, u64 root_objectid,
> +                                     struct extent_buffer *parent,
> +                                     u64 parent_start, u64 root_objectid,
>                                       struct btrfs_disk_key *key, int level,
>                                       u64 hint, u64 empty_size)
>  {
>       struct btrfs_key ins;
>       struct btrfs_block_rsv *block_rsv;
>       struct extent_buffer *buf;
> +     struct btrfs_free_cluster *cluster = NULL;
>       u64 flags = 0;
>       int ret;
>  
> @@ -5756,8 +5774,19 @@ struct extent_buffer *btrfs_alloc_free_block(struct 
> btrfs_trans_handle *trans,
>       if (IS_ERR(block_rsv))
>               return ERR_CAST(block_rsv);
>  
> -     ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
> -                                empty_size, hint, (u64)-1, &ins, 0);
> +     /*
> +      * We needn't worry about allocation failure, because if failed,
> +      * we will use the default metadata cluster in fs_info
> +      */
> +     if (parent && !parent->cluster)
> +             parent->cluster = btrfs_alloc_free_cluster();
> +
> +     if (parent)
> +             cluster = parent->cluster;
> +
> +     ret = btrfs_reserve_extent_mdata(trans, root, blocksize, blocksize,
> +                                      empty_size, hint, (u64)-1, &ins,
> +                                      cluster);
>       if (ret) {
>               unuse_block_rsv(block_rsv, blocksize);
>               return ERR_PTR(ret);
> @@ -5768,11 +5797,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct 
> btrfs_trans_handle *trans,
>       BUG_ON(IS_ERR(buf));
>  
>       if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
> -             if (parent == 0)
> -                     parent = ins.objectid;
> +             if (parent_start == 0)
> +                     parent_start = ins.objectid;
>               flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
>       } else
> -             BUG_ON(parent > 0);
> +             BUG_ON(parent_start > 0);
>  
>       if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
>               struct btrfs_delayed_extent_op *extent_op;
> @@ -5788,7 +5817,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct 
> btrfs_trans_handle *trans,
>               extent_op->is_data = 0;
>  
>               ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
> -                                     ins.offset, parent, root_objectid,
> +                                     ins.offset, parent_start, root_objectid,
>                                       level, BTRFS_ADD_DELAYED_EXTENT,
>                                       extent_op);
>               BUG_ON(ret);
> @@ -7231,18 +7260,18 @@ int btrfs_remove_block_group(struct 
> btrfs_trans_handle *trans,
>  
>       /* make sure this block group isn't part of an allocation cluster */
>       cluster = &root->fs_info->data_alloc_cluster;
> -     spin_lock(&cluster->refill_lock);
> +     mutex_lock(&cluster->refill_lock);
>       btrfs_return_cluster_to_free_space(block_group, cluster);
> -     spin_unlock(&cluster->refill_lock);
> +     mutex_unlock(&cluster->refill_lock);
>  
>       /*
>        * make sure this block group isn't part of a metadata
>        * allocation cluster
>        */
>       cluster = &root->fs_info->meta_alloc_cluster;
> -     spin_lock(&cluster->refill_lock);
> +     mutex_lock(&cluster->refill_lock);
>       btrfs_return_cluster_to_free_space(block_group, cluster);
> -     spin_unlock(&cluster->refill_lock);
> +     mutex_unlock(&cluster->refill_lock);
>  
>       path = btrfs_alloc_path();
>       if (!path) {
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index d418164..78196f2 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -17,6 +17,7 @@
>  #include "compat.h"
>  #include "ctree.h"
>  #include "btrfs_inode.h"
> +#include "free-space-cache.h"
>  
>  static struct kmem_cache *extent_state_cache;
>  static struct kmem_cache *extent_buffer_cache;
> @@ -2988,6 +2989,7 @@ static struct extent_buffer 
> *__alloc_extent_buffer(struct extent_io_tree *tree,
>       spin_unlock_irqrestore(&leak_lock, flags);
>  #endif
>       atomic_set(&eb->refs, 1);
> +     eb->cluster = NULL;
>  
>       return eb;
>  }
> @@ -3827,7 +3829,10 @@ out:
>       spin_unlock(&tree->buffer_lock);
>  
>       /* at this point we can safely release the extent buffer */
> -     if (atomic_read(&eb->refs) == 0)
> +     if (atomic_read(&eb->refs) == 0) {
> +             btrfs_free_extent_cluster(eb);
>               call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
> +     }
>       return ret;
>  }
> +
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 7b2f0c3..8ee8953 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -146,6 +146,9 @@ struct extent_buffer {
>        * to unlock
>        */
>       wait_queue_head_t read_lock_wq;
> +
> +     /* Used for the node to allocate space for its children */
> +     struct btrfs_free_cluster *cluster;
>  };
>  
>  static inline void extent_set_compress_type(unsigned long *bio_flags,
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index e555899..e540e77 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -31,9 +31,15 @@
>  #define MAX_CACHE_BYTES_PER_GIG      (32 * 1024)
>  
>  static struct kmem_cache *btrfs_free_space_cachep;
> +static struct kmem_cache *btrfs_free_cluster_cachep;
>  
>  static int link_free_space(struct btrfs_free_space_ctl *ctl,
>                          struct btrfs_free_space *info);
> +static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
> +                              struct btrfs_free_space *info,
> +                              bool update_stat);
> +static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
> +                           struct btrfs_free_space *info, bool update_stat);
>  
>  int __init free_space_cache_init(void)
>  {
> @@ -43,6 +49,14 @@ int __init free_space_cache_init(void)
>       if (!btrfs_free_space_cachep)
>               return -ENOMEM;
>  
> +     btrfs_free_cluster_cachep = kmem_cache_create("extent_clusters",
> +                     sizeof(struct btrfs_free_cluster), 0,
> +                     SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
> +     if (!btrfs_free_cluster_cachep) {
> +             kmem_cache_destroy(btrfs_free_space_cachep);
> +             return -ENOMEM;
> +     }
> +
>       return 0;
>  }
>  
> @@ -50,6 +64,8 @@ void free_space_cache_exit(void)
>  {
>       if (btrfs_free_space_cachep)
>               kmem_cache_destroy(btrfs_free_space_cachep);
> +     if (btrfs_free_cluster_cachep)
> +             kmem_cache_destroy(btrfs_free_cluster_cachep);
>  }
>  
>  static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
> @@ -397,11 +413,32 @@ int __load_free_space_cache(struct btrfs_root *root, 
> struct inode *inode,
>  
>                       if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
>                               spin_lock(&ctl->tree_lock);
> +                             if (try_merge_free_space(ctl, e, true))
> +                                     goto link;
> +
> +                             ret = insert_into_bitmap(ctl, e, true);
> +                             if (ret < 0) {
> +                                     spin_unlock(&ctl->tree_lock);
> +                                     printk(KERN_ERR "Inserting into bitmap "
> +                                            "failed, dumping\n");
> +                                     kmem_cache_free(btrfs_free_space_cachep,
> +                                                     e);
> +                                     kunmap(page);
> +                                     unlock_page(page);
> +                                     page_cache_release(page);
> +                                     goto free_cache;
> +                             } else if (ret) {
> +                                     ret = 0;
> +                                     goto next_entry;
> +                             }
> +link:
>                               ret = link_free_space(ctl, e);
>                               spin_unlock(&ctl->tree_lock);
>                               if (ret) {
>                                       printk(KERN_ERR "Duplicate entries in "
>                                              "free space cache, dumping\n");
> +                                     kmem_cache_free(btrfs_free_space_cachep,
> +                                                     e);
>                                       kunmap(page);
>                                       unlock_page(page);
>                                       page_cache_release(page);
> @@ -425,6 +462,9 @@ int __load_free_space_cache(struct btrfs_root *root, 
> struct inode *inode,
>                               if (ret) {
>                                       printk(KERN_ERR "Duplicate entries in "
>                                              "free space cache, dumping\n");
> +                                     kfree(e->bitmap);
> +                                     kmem_cache_free(
> +                                             btrfs_free_space_cachep, e);
>                                       kunmap(page);
>                                       unlock_page(page);
>                                       page_cache_release(page);
> @@ -432,7 +472,7 @@ int __load_free_space_cache(struct btrfs_root *root, 
> struct inode *inode,
>                               }
>                               list_add_tail(&e->list, &bitmaps);
>                       }
> -
> +next_entry:
>                       num_entries--;
>                       offset += sizeof(struct btrfs_free_space_entry);
>                       if (offset + sizeof(struct btrfs_free_space_entry) >=
> @@ -676,7 +716,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, 
> struct inode *inode,
>               while (node && !next_page) {
>                       struct btrfs_free_space *e;
>  
> -                     e = rb_entry(node, struct btrfs_free_space, 
> offset_index);
> +                     e = rb_entry(node, struct btrfs_free_space,
> +                                  offset_index);
>                       entries++;
>  
>                       entry->offset = cpu_to_le64(e->offset);
> @@ -689,10 +730,39 @@ int __btrfs_write_out_cache(struct btrfs_root *root, 
> struct inode *inode,
>                               entry->type = BTRFS_FREE_SPACE_EXTENT;
>                       }
>                       node = rb_next(node);
> -                     if (!node && cluster) {
> -                             node = rb_first(&cluster->root);
> +                     offset += sizeof(struct btrfs_free_space_entry);
> +                     if (offset + sizeof(struct btrfs_free_space_entry) >=
> +                         PAGE_CACHE_SIZE)
> +                             next_page = true;
> +                     entry++;
> +             }
> +
> +             /*
> +              * We want to write the extent in the cluster to our free space
> +              * cache.
> +              */
> +             while (cluster && !next_page) {
> +                     struct btrfs_free_space *e;
> +
> +                     e = cluster->info;
> +                     if (!e || !e->bytes)
> +                             break;
> +
> +                     entries++;
> +
> +                     entry->offset = cpu_to_le64(e->offset);
> +                     entry->bytes = cpu_to_le64(e->bytes);
> +                     entry->type = BTRFS_FREE_SPACE_EXTENT;
> +
> +                     if (list_is_last(&cluster->block_group_list,
> +                                      &block_group->cluster_list))
>                               cluster = NULL;
> -                     }
> +                     else
> +                             cluster = list_entry(
> +                                             cluster->block_group_list.next,
> +                                             struct btrfs_free_cluster,
> +                                             block_group_list);
> +
>                       offset += sizeof(struct btrfs_free_space_entry);
>                       if (offset + sizeof(struct btrfs_free_space_entry) >=
>                           PAGE_CACHE_SIZE)
> @@ -1125,19 +1195,30 @@ static void unlink_free_space(struct 
> btrfs_free_space_ctl *ctl,
>       ctl->free_space -= info->bytes;
>  }
>  
> +static int __link_free_space(struct btrfs_free_space_ctl *ctl,
> +                          struct btrfs_free_space *info)
> +{
> +     int ret;
> +
> +     ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
> +                              &info->offset_index, (info->bitmap != NULL));
> +     if (ret)
> +             return ret;
> +     ctl->free_extents++;
> +     return 0;
> +}
> +
>  static int link_free_space(struct btrfs_free_space_ctl *ctl,
>                          struct btrfs_free_space *info)
>  {
> -     int ret = 0;
> +     int ret;
>  
>       BUG_ON(!info->bitmap && !info->bytes);
> -     ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
> -                              &info->offset_index, (info->bitmap != NULL));
> +     ret = __link_free_space(ctl, info);
>       if (ret)
>               return ret;
>  
>       ctl->free_space += info->bytes;
> -     ctl->free_extents++;
>       return ret;
>  }
>  
> @@ -1212,7 +1293,7 @@ static void bitmap_clear_bits(struct 
> btrfs_free_space_ctl *ctl,
>  
>  static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
>                           struct btrfs_free_space *info, u64 offset,
> -                         u64 bytes)
> +                         u64 bytes, bool update_stat)
>  {
>       unsigned long start, count;
>  
> @@ -1223,7 +1304,8 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl 
> *ctl,
>       bitmap_set(info->bitmap, start, count);
>  
>       info->bytes += bytes;
> -     ctl->free_space += bytes;
> +     if (update_stat)
> +             ctl->free_space += bytes;
>  }
>  
>  static int search_bitmap(struct btrfs_free_space_ctl *ctl,
> @@ -1394,7 +1476,7 @@ again:
>  
>  static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
>                              struct btrfs_free_space *info, u64 offset,
> -                            u64 bytes)
> +                            u64 bytes, bool update_stat)
>  {
>       u64 bytes_to_set = 0;
>       u64 end;
> @@ -1403,7 +1485,7 @@ static u64 add_bytes_to_bitmap(struct 
> btrfs_free_space_ctl *ctl,
>  
>       bytes_to_set = min(end - offset, bytes);
>  
> -     bitmap_set_bits(ctl, info, offset, bytes_to_set);
> +     bitmap_set_bits(ctl, info, offset, bytes_to_set, update_stat);
>  
>       return bytes_to_set;
>  
> @@ -1451,10 +1533,9 @@ static struct btrfs_free_space_op free_space_op = {
>  };
>  
>  static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
> -                           struct btrfs_free_space *info)
> +                           struct btrfs_free_space *info, bool update_stat)
>  {
>       struct btrfs_free_space *bitmap_info;
> -     struct btrfs_block_group_cache *block_group = NULL;
>       int added = 0;
>       u64 bytes, offset, bytes_added;
>       int ret;
> @@ -1465,49 +1546,7 @@ static int insert_into_bitmap(struct 
> btrfs_free_space_ctl *ctl,
>       if (!ctl->op->use_bitmap(ctl, info))
>               return 0;
>  
> -     if (ctl->op == &free_space_op)
> -             block_group = ctl->private;
>  again:
> -     /*
> -      * Since we link bitmaps right into the cluster we need to see if we
> -      * have a cluster here, and if so and it has our bitmap we need to add
> -      * the free space to that bitmap.
> -      */
> -     if (block_group && !list_empty(&block_group->cluster_list)) {
> -             struct btrfs_free_cluster *cluster;
> -             struct rb_node *node;
> -             struct btrfs_free_space *entry;
> -
> -             cluster = list_entry(block_group->cluster_list.next,
> -                                  struct btrfs_free_cluster,
> -                                  block_group_list);
> -             spin_lock(&cluster->lock);
> -             node = rb_first(&cluster->root);
> -             if (!node) {
> -                     spin_unlock(&cluster->lock);
> -                     goto no_cluster_bitmap;
> -             }
> -
> -             entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -             if (!entry->bitmap) {
> -                     spin_unlock(&cluster->lock);
> -                     goto no_cluster_bitmap;
> -             }
> -
> -             if (entry->offset == offset_to_bitmap(ctl, offset)) {
> -                     bytes_added = add_bytes_to_bitmap(ctl, entry,
> -                                                       offset, bytes);
> -                     bytes -= bytes_added;
> -                     offset += bytes_added;
> -             }
> -             spin_unlock(&cluster->lock);
> -             if (!bytes) {
> -                     ret = 1;
> -                     goto out;
> -             }
> -     }
> -
> -no_cluster_bitmap:
>       bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
>                                        1, 0);
>       if (!bitmap_info) {
> @@ -1515,7 +1554,8 @@ no_cluster_bitmap:
>               goto new_bitmap;
>       }
>  
> -     bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
> +     bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
> +                                       update_stat);
>       bytes -= bytes_added;
>       offset += bytes_added;
>       added = 0;
> @@ -1533,6 +1573,12 @@ new_bitmap:
>               info = NULL;
>               goto again;
>       } else {
> +             if (current->flags & PF_MEMALLOC) {
> +                     printk(KERN_INFO "Alloc memory when reclaim "
> +                            "the pages\n");
> +                     return 0;
> +             }
> +
>               spin_unlock(&ctl->tree_lock);
>  
>               /* no pre-allocated info, allocate a new one */
> @@ -1635,7 +1681,7 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl 
> *ctl,
>        * extent then we know we're going to have to allocate a new extent, so
>        * before we do that see if we need to drop this into a bitmap
>        */
> -     ret = insert_into_bitmap(ctl, info);
> +     ret = insert_into_bitmap(ctl, info, true);
>       if (ret < 0) {
>               goto out;
>       } else if (ret) {
> @@ -1829,36 +1875,50 @@ __btrfs_return_cluster_to_free_space(
>  {
>       struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
>       struct btrfs_free_space *entry;
> -     struct rb_node *node;
> +     int ret = 0;
>  
>       spin_lock(&cluster->lock);
> -     if (cluster->block_group != block_group)
> +     if (cluster->block_group != block_group) {
> +             spin_unlock(&cluster->lock);
>               goto out;
> +     }
>  
>       cluster->block_group = NULL;
> -     cluster->window_start = 0;
> +
> +     entry = cluster->info;
> +     cluster->info = NULL;
> +
>       list_del_init(&cluster->block_group_list);
> +     spin_unlock(&cluster->lock);
>  
> -     node = rb_first(&cluster->root);
> -     while (node) {
> -             bool bitmap;
> +     if (!entry)
> +             goto out;
>  
> -             entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -             node = rb_next(&entry->offset_index);
> -             rb_erase(&entry->offset_index, &cluster->root);
> -
> -             bitmap = (entry->bitmap != NULL);
> -             if (!bitmap)
> -                     try_merge_free_space(ctl, entry, false);
> -             tree_insert_offset(&ctl->free_space_offset,
> -                                entry->offset, &entry->offset_index, bitmap);
> +     if (!entry->bytes) {
> +             kmem_cache_free(btrfs_free_space_cachep, entry);
> +             goto out;
>       }
> -     cluster->root = RB_ROOT;
>  
> +     if (try_merge_free_space(ctl, entry, false))
> +             goto link;
> +
> +     ret = insert_into_bitmap(ctl, entry, false);
> +     if (ret < 0)
> +             goto out;
> +     else if (ret) {
> +             ret = 0;
> +             goto out;
> +     }
> +link:
> +     ret = __link_free_space(ctl, entry);
> +     if (ret) {
> +             kmem_cache_free(btrfs_free_space_cachep, entry);
> +             printk(KERN_ERR "Duplicate entries in free space cache, "
> +                             "dumping\n");
> +     }
>  out:
> -     spin_unlock(&cluster->lock);
>       btrfs_put_block_group(block_group);
> -     return 0;
> +     return ret;
>  }
>  
>  void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl)
> @@ -1889,29 +1949,58 @@ void __btrfs_remove_free_space_cache(struct 
> btrfs_free_space_ctl *ctl)
>       spin_unlock(&ctl->tree_lock);
>  }
>  
> -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache 
> *block_group)
> +static void __btrfs_reclaim_extent_clusters(
> +                             struct btrfs_block_group_cache *block_group,
> +                             u64 to_reclaim)
>  {
> -     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
>       struct btrfs_free_cluster *cluster;
>       struct list_head *head;
> +     u64 bytes;
> +     bool reclaim_all = (to_reclaim == 0);
>  
> -     spin_lock(&ctl->tree_lock);
>       while ((head = block_group->cluster_list.next) !=
>              &block_group->cluster_list) {
>               cluster = list_entry(head, struct btrfs_free_cluster,
>                                    block_group_list);
>  
>               WARN_ON(cluster->block_group != block_group);
> +
> +             bytes = cluster->info->bytes;
>               __btrfs_return_cluster_to_free_space(block_group, cluster);
> +
> +             if (!reclaim_all) {
> +                     if (to_reclaim > bytes)
> +                             to_reclaim -= bytes;
> +                     else {
> +                             to_reclaim = 0;
> +                             break;
> +                     }
> +             }
> +
>               if (need_resched()) {
> -                     spin_unlock(&ctl->tree_lock);
> +                     spin_unlock(&block_group->free_space_ctl->tree_lock);
>                       cond_resched();
> -                     spin_lock(&ctl->tree_lock);
> +                     spin_lock(&block_group->free_space_ctl->tree_lock);
>               }
>       }
> +}
> +
> +void btrfs_reclaim_extent_clusters(struct btrfs_block_group_cache 
> *block_group,
> +                                u64 to_reclaim)
> +{
> +     spin_lock(&block_group->free_space_ctl->tree_lock);
> +     __btrfs_reclaim_extent_clusters(block_group, to_reclaim);
> +     spin_unlock(&block_group->free_space_ctl->tree_lock);
> +}
> +
> +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache 
> *block_group)
> +{
> +     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> +
> +     spin_lock(&ctl->tree_lock);
> +     __btrfs_reclaim_extent_clusters(block_group, 0);
>       __btrfs_remove_free_space_cache_locked(ctl);
>       spin_unlock(&ctl->tree_lock);
> -
>  }
>  
>  u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
> @@ -1991,30 +2080,6 @@ int btrfs_return_cluster_to_free_space(
>       return ret;
>  }
>  
> -static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache 
> *block_group,
> -                                struct btrfs_free_cluster *cluster,
> -                                struct btrfs_free_space *entry,
> -                                u64 bytes, u64 min_start)
> -{
> -     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> -     int err;
> -     u64 search_start = cluster->window_start;
> -     u64 search_bytes = bytes;
> -     u64 ret = 0;
> -
> -     search_start = min_start;
> -     search_bytes = bytes;
> -
> -     err = search_bitmap(ctl, entry, &search_start, &search_bytes);
> -     if (err)
> -             return 0;
> -
> -     ret = search_start;
> -     __bitmap_clear_bits(ctl, entry, ret, bytes);
> -
> -     return ret;
> -}
> -
>  /*
>   * given a cluster, try to allocate 'bytes' from it, returns 0
>   * if it couldn't find anything suitably large, or a logical disk offset
> @@ -2025,56 +2090,26 @@ u64 btrfs_alloc_from_cluster(struct 
> btrfs_block_group_cache *block_group,
>                            u64 min_start)
>  {
>       struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> -     struct btrfs_free_space *entry = NULL;
> -     struct rb_node *node;
>       u64 ret = 0;
>  
>       spin_lock(&cluster->lock);
> -     if (bytes > cluster->max_size)
> -             goto out;
> -
>       if (cluster->block_group != block_group)
>               goto out;
>  
> -     node = rb_first(&cluster->root);
> -     if (!node)
> +     /*
> +      * If we set ->block_group, it means we have filled this cluster,
> +      * and ->info also has been set. So we needn't check ->info is
> +      * NULL or not now.
> +      */
> +     if (bytes > cluster->info->bytes)
>               goto out;
>  
> -     entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -     while(1) {
> -             if (entry->bytes < bytes ||
> -                 (!entry->bitmap && entry->offset < min_start)) {
> -                     node = rb_next(&entry->offset_index);
> -                     if (!node)
> -                             break;
> -                     entry = rb_entry(node, struct btrfs_free_space,
> -                                      offset_index);
> -                     continue;
> -             }
> -
> -             if (entry->bitmap) {
> -                     ret = btrfs_alloc_from_bitmap(block_group,
> -                                                   cluster, entry, bytes,
> -                                                   min_start);
> -                     if (ret == 0) {
> -                             node = rb_next(&entry->offset_index);
> -                             if (!node)
> -                                     break;
> -                             entry = rb_entry(node, struct btrfs_free_space,
> -                                              offset_index);
> -                             continue;
> -                     }
> -             } else {
> -                     ret = entry->offset;
> -
> -                     entry->offset += bytes;
> -                     entry->bytes -= bytes;
> -             }
> +     if (min_start >= cluster->info->offset + cluster->info->bytes)
> +             goto out;
>  
> -             if (entry->bytes == 0)
> -                     rb_erase(&entry->offset_index, &cluster->root);
> -             break;
> -     }
> +     ret = cluster->info->offset;
> +     cluster->info->offset += bytes;
> +     cluster->info->bytes -= bytes;
>  out:
>       spin_unlock(&cluster->lock);
>  
> @@ -2082,260 +2117,12 @@ out:
>               return 0;
>  
>       spin_lock(&ctl->tree_lock);
> -
>       ctl->free_space -= bytes;
> -     if (entry->bytes == 0) {
> -             ctl->free_extents--;
> -             if (entry->bitmap) {
> -                     kfree(entry->bitmap);
> -                     ctl->total_bitmaps--;
> -                     ctl->op->recalc_thresholds(ctl);
> -             }
> -             kmem_cache_free(btrfs_free_space_cachep, entry);
> -     }
> -
>       spin_unlock(&ctl->tree_lock);
>  
>       return ret;
>  }
>  
> -static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
> -                             struct btrfs_free_space *entry,
> -                             struct btrfs_free_cluster *cluster,
> -                             u64 offset, u64 bytes, u64 min_bytes)
> -{
> -     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> -     unsigned long next_zero;
> -     unsigned long i;
> -     unsigned long search_bits;
> -     unsigned long total_bits;
> -     unsigned long found_bits;
> -     unsigned long start = 0;
> -     unsigned long total_found = 0;
> -     int ret;
> -     bool found = false;
> -
> -     i = offset_to_bit(entry->offset, block_group->sectorsize,
> -                       max_t(u64, offset, entry->offset));
> -     search_bits = bytes_to_bits(bytes, block_group->sectorsize);
> -     total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
> -
> -again:
> -     found_bits = 0;
> -     for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
> -          i < BITS_PER_BITMAP;
> -          i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
> -             next_zero = find_next_zero_bit(entry->bitmap,
> -                                            BITS_PER_BITMAP, i);
> -             if (next_zero - i >= search_bits) {
> -                     found_bits = next_zero - i;
> -                     break;
> -             }
> -             i = next_zero;
> -     }
> -
> -     if (!found_bits)
> -             return -ENOSPC;
> -
> -     if (!found) {
> -             start = i;
> -             found = true;
> -     }
> -
> -     total_found += found_bits;
> -
> -     if (cluster->max_size < found_bits * block_group->sectorsize)
> -             cluster->max_size = found_bits * block_group->sectorsize;
> -
> -     if (total_found < total_bits) {
> -             i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
> -             if (i - start > total_bits * 2) {
> -                     total_found = 0;
> -                     cluster->max_size = 0;
> -                     found = false;
> -             }
> -             goto again;
> -     }
> -
> -     cluster->window_start = start * block_group->sectorsize +
> -             entry->offset;
> -     rb_erase(&entry->offset_index, &ctl->free_space_offset);
> -     ret = tree_insert_offset(&cluster->root, entry->offset,
> -                              &entry->offset_index, 1);
> -     BUG_ON(ret);
> -
> -     return 0;
> -}
> -
> -/*
> - * This searches the block group for just extents to fill the cluster with.
> - */
> -static noinline int
> -setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
> -                     struct btrfs_free_cluster *cluster,
> -                     struct list_head *bitmaps, u64 offset, u64 bytes,
> -                     u64 min_bytes)
> -{
> -     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> -     struct btrfs_free_space *first = NULL;
> -     struct btrfs_free_space *entry = NULL;
> -     struct btrfs_free_space *prev = NULL;
> -     struct btrfs_free_space *last;
> -     struct rb_node *node;
> -     u64 window_start;
> -     u64 window_free;
> -     u64 max_extent;
> -     u64 max_gap = 128 * 1024;
> -
> -     entry = tree_search_offset(ctl, offset, 0, 1);
> -     if (!entry)
> -             return -ENOSPC;
> -
> -     /*
> -      * We don't want bitmaps, so just move along until we find a normal
> -      * extent entry.
> -      */
> -     while (entry->bitmap) {
> -             if (list_empty(&entry->list))
> -                     list_add_tail(&entry->list, bitmaps);
> -             node = rb_next(&entry->offset_index);
> -             if (!node)
> -                     return -ENOSPC;
> -             entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -     }
> -
> -     window_start = entry->offset;
> -     window_free = entry->bytes;
> -     max_extent = entry->bytes;
> -     first = entry;
> -     last = entry;
> -     prev = entry;
> -
> -     while (window_free <= min_bytes) {
> -             node = rb_next(&entry->offset_index);
> -             if (!node)
> -                     return -ENOSPC;
> -             entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -
> -             if (entry->bitmap) {
> -                     if (list_empty(&entry->list))
> -                             list_add_tail(&entry->list, bitmaps);
> -                     continue;
> -             }
> -
> -             /*
> -              * we haven't filled the empty size and the window is
> -              * very large.  reset and try again
> -              */
> -             if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
> -                 entry->offset - window_start > (min_bytes * 2)) {
> -                     first = entry;
> -                     window_start = entry->offset;
> -                     window_free = entry->bytes;
> -                     last = entry;
> -                     max_extent = entry->bytes;
> -             } else {
> -                     last = entry;
> -                     window_free += entry->bytes;
> -                     if (entry->bytes > max_extent)
> -                             max_extent = entry->bytes;
> -             }
> -             prev = entry;
> -     }
> -
> -     cluster->window_start = first->offset;
> -
> -     node = &first->offset_index;
> -
> -     /*
> -      * now we've found our entries, pull them out of the free space
> -      * cache and put them into the cluster rbtree
> -      */
> -     do {
> -             int ret;
> -
> -             entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -             node = rb_next(&entry->offset_index);
> -             if (entry->bitmap)
> -                     continue;
> -
> -             rb_erase(&entry->offset_index, &ctl->free_space_offset);
> -             ret = tree_insert_offset(&cluster->root, entry->offset,
> -                                      &entry->offset_index, 0);
> -             BUG_ON(ret);
> -     } while (node && entry != last);
> -
> -     cluster->max_size = max_extent;
> -
> -     return 0;
> -}
> -
> -/*
> - * This specifically looks for bitmaps that may work in the cluster, we 
> assume
> - * that we have already failed to find extents that will work.
> - */
> -static noinline int
> -setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
> -                  struct btrfs_free_cluster *cluster,
> -                  struct list_head *bitmaps, u64 offset, u64 bytes,
> -                  u64 min_bytes)
> -{
> -     struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> -     struct btrfs_free_space *entry;
> -     struct rb_node *node;
> -     int ret = -ENOSPC;
> -
> -     if (ctl->total_bitmaps == 0)
> -             return -ENOSPC;
> -
> -     /*
> -      * First check our cached list of bitmaps and see if there is an entry
> -      * here that will work.
> -      */
> -     list_for_each_entry(entry, bitmaps, list) {
> -             if (entry->bytes < min_bytes)
> -                     continue;
> -             ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
> -                                        bytes, min_bytes);
> -             if (!ret)
> -                     return 0;
> -     }
> -
> -     /*
> -      * If we do have entries on our list and we are here then we didn't find
> -      * anything, so go ahead and get the next entry after the last entry in
> -      * this list and start the search from there.
> -      */
> -     if (!list_empty(bitmaps)) {
> -             entry = list_entry(bitmaps->prev, struct btrfs_free_space,
> -                                list);
> -             node = rb_next(&entry->offset_index);
> -             if (!node)
> -                     return -ENOSPC;
> -             entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -             goto search;
> -     }
> -
> -     entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
> -     if (!entry)
> -             return -ENOSPC;
> -
> -search:
> -     node = &entry->offset_index;
> -     do {
> -             entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -             node = rb_next(&entry->offset_index);
> -             if (!entry->bitmap)
> -                     continue;
> -             if (entry->bytes < min_bytes)
> -                     continue;
> -             ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
> -                                        bytes, min_bytes);
> -     } while (ret && node);
> -
> -     return ret;
> -}
> -
>  /*
>   * here we try to find a cluster of blocks in a block group.  The goal
>   * is to find at least bytes free and up to empty_size + bytes free.
> @@ -2351,11 +2138,12 @@ int btrfs_find_space_cluster(struct 
> btrfs_trans_handle *trans,
>                            u64 offset, u64 bytes, u64 empty_size)
>  {
>       struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> -     struct list_head bitmaps;
> -     struct btrfs_free_space *entry, *tmp;
> +     struct btrfs_free_space *entry, *info;
>       u64 min_bytes;
>       int ret;
>  
> +     BUG_ON(cluster->info);
> +
>       /* for metadata, allow allocates with more holes */
>       if (btrfs_test_opt(root, SSD_SPREAD)) {
>               min_bytes = bytes + empty_size;
> @@ -2369,9 +2157,16 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle 
> *trans,
>                       min_bytes = max(bytes, (bytes + empty_size) >> 1);
>               else
>                       min_bytes = max(bytes, (bytes + empty_size) >> 4);
> +             min_bytes = max(min_bytes, (u64)256 * 1024);
>       } else
>               min_bytes = max(bytes, (bytes + empty_size) >> 2);
>  
> +     min_bytes = round_up(min_bytes, root->sectorsize);
> +
> +     info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
> +     if (!info)
> +             return -ENOMEM;
> +
>       spin_lock(&ctl->tree_lock);
>  
>       /*
> @@ -2380,6 +2175,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle 
> *trans,
>        */
>       if (ctl->free_space < min_bytes) {
>               spin_unlock(&ctl->tree_lock);
> +             kmem_cache_free(btrfs_free_space_cachep, info);
>               return -ENOSPC;
>       }
>  
> @@ -2388,26 +2184,44 @@ int btrfs_find_space_cluster(struct 
> btrfs_trans_handle *trans,
>       /* someone already found a cluster, hooray */
>       if (cluster->block_group) {
>               ret = 0;
> +             kmem_cache_free(btrfs_free_space_cachep, info);
>               goto out;
>       }
>  
> -     INIT_LIST_HEAD(&bitmaps);
> -     ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
> -                                   bytes, min_bytes);
> -     if (ret)
> -             ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
> -                                        offset, bytes, min_bytes);
> +     bytes = min_bytes;
> +     entry = find_free_space(ctl, &offset, &min_bytes);
> +     if (!entry) {
> +             ret = -ENOSPC;
> +             kmem_cache_free(btrfs_free_space_cachep, info);
> +             goto out;
> +     }
>  
> -     /* Clear our temporary list */
> -     list_for_each_entry_safe(entry, tmp, &bitmaps, list)
> -             list_del_init(&entry->list);
> +     BUG_ON(min_bytes < bytes);
>  
> -     if (!ret) {
> -             atomic_inc(&block_group->count);
> -             list_add_tail(&cluster->block_group_list,
> -                           &block_group->cluster_list);
> -             cluster->block_group = block_group;
> +     ret = 0;
> +     if (entry->bitmap) {
> +             __bitmap_clear_bits(ctl, entry, offset, bytes);
> +             if (!entry->bytes)
> +                     free_bitmap(ctl, entry);
> +     } else {
> +             __unlink_free_space(ctl, entry);
> +             entry->offset += bytes;
> +             entry->bytes -= bytes;
> +             if (!entry->bytes)
> +                     kmem_cache_free(btrfs_free_space_cachep, entry);
> +             else
> +                     __link_free_space(ctl, entry);
>       }
> +
> +     info->offset = offset;
> +     info->bytes = bytes;
> +
> +     cluster->info = info;
> +
> +     atomic_inc(&block_group->count);
> +     list_add_tail(&cluster->block_group_list,
> +                   &block_group->cluster_list);
> +     cluster->block_group = block_group;
>  out:
>       spin_unlock(&cluster->lock);
>       spin_unlock(&ctl->tree_lock);
> @@ -2421,11 +2235,10 @@ out:
>  void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
>  {
>       spin_lock_init(&cluster->lock);
> -     spin_lock_init(&cluster->refill_lock);
> -     cluster->root = RB_ROOT;
> -     cluster->max_size = 0;
> +     mutex_init(&cluster->refill_lock);
>       INIT_LIST_HEAD(&cluster->block_group_list);
>       cluster->block_group = NULL;
> +     cluster->info = NULL;
>  }
>  
>  int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
> @@ -2665,3 +2478,27 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
>       iput(inode);
>       return ret;
>  }
> +
> +struct btrfs_free_cluster *btrfs_alloc_free_cluster(void)
> +{
> +     struct btrfs_free_cluster *cluster;
> +
> +     cluster = kmem_cache_zalloc(btrfs_free_cluster_cachep, GFP_NOFS);
> +     if (!cluster)
> +             return NULL;
> +
> +     btrfs_init_free_cluster(cluster);
> +     return cluster;
> +}
> +
> +void btrfs_release_free_cluster(struct btrfs_free_cluster *cluster)
> +{
> +     /* return the free space in the cluster to the space cache. */
> +     mutex_lock(&cluster->refill_lock);
> +     btrfs_return_cluster_to_free_space(NULL, cluster);
> +     mutex_unlock(&cluster->refill_lock);
> +
> +     /* free the cluster. */
> +     kmem_cache_free(btrfs_free_cluster_cachep, cluster);
> +}
> +
> diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
> index c27ccba..1f7e75c 100644
> --- a/fs/btrfs/free-space-cache.h
> +++ b/fs/btrfs/free-space-cache.h
> @@ -34,6 +34,7 @@ struct btrfs_free_space_ctl {
>       int extents_thresh;
>       int free_extents;
>       int total_bitmaps;
> +     int total_clusters;
>       int unit;
>       u64 start;
>       struct btrfs_free_space_op *op;
> @@ -113,4 +114,16 @@ int btrfs_return_cluster_to_free_space(
>                              struct btrfs_free_cluster *cluster);
>  int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
>                          u64 *trimmed, u64 start, u64 end, u64 minlen);
> +void btrfs_reclaim_extent_clusters(struct btrfs_block_group_cache 
> *block_group,
> +                                u64 to_reclaim);
> +struct btrfs_free_cluster *btrfs_alloc_free_cluster(void);
> +void btrfs_release_free_cluster(struct btrfs_free_cluster *cluster);
> +static inline void btrfs_free_extent_cluster(struct extent_buffer *eb)
> +{
> +     if (!eb->cluster)
> +             return;
> +
> +     btrfs_release_free_cluster(eb->cluster);
> +     eb->cluster = NULL;
> +}
>  #endif
> diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
> index b4087e0..d501c1f 100644
> --- a/fs/btrfs/inode-map.c
> +++ b/fs/btrfs/inode-map.c
> @@ -457,6 +457,7 @@ again:
>       spin_unlock(&root->cache_lock);
>  
>       spin_lock(&ctl->tree_lock);
> +     WARN_ON(ctl->total_clusters);
>       prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
>       prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE);
>       prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE;
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 7a9e01f..38a5da9 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -51,7 +51,6 @@
>  #include "tree-log.h"
>  #include "compression.h"
>  #include "locking.h"
> -#include "free-space-cache.h"
>  #include "inode-map.h"
>  
>  struct btrfs_iget_args {
> @@ -623,11 +622,10 @@ retry:
>               trans = btrfs_join_transaction(root);
>               BUG_ON(IS_ERR(trans));
>               trans->block_rsv = &root->fs_info->delalloc_block_rsv;
> -             ret = btrfs_reserve_extent(trans, root,
> -                                        async_extent->compressed_size,
> -                                        async_extent->compressed_size,
> -                                        0, alloc_hint,
> -                                        (u64)-1, &ins, 1);
> +             ret = btrfs_reserve_extent_data(trans, root,
> +                                             async_extent->compressed_size,
> +                                             async_extent->compressed_size,
> +                                             0, alloc_hint, (u64)-1, &ins);
>               btrfs_end_transaction(trans, root);
>  
>               if (ret) {
> @@ -828,9 +826,9 @@ static noinline int cow_file_range(struct inode *inode,
>               unsigned long op;
>  
>               cur_alloc_size = disk_num_bytes;
> -             ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
> -                                        root->sectorsize, 0, alloc_hint,
> -                                        (u64)-1, &ins, 1);
> +             ret = btrfs_reserve_extent_data(trans, root, cur_alloc_size,
> +                                             root->sectorsize, 0, alloc_hint,
> +                                             (u64)-1, &ins);
>               BUG_ON(ret);
>  
>               em = alloc_extent_map();
> @@ -5409,8 +5407,8 @@ static struct extent_map 
> *btrfs_new_extent_direct(struct inode *inode,
>       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
>  
>       alloc_hint = get_extent_allocation_hint(inode, start, len);
> -     ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
> -                                alloc_hint, (u64)-1, &ins, 1);
> +     ret = btrfs_reserve_extent_data(trans, root, len, root->sectorsize, 0,
> +                                     alloc_hint, (u64)-1, &ins);
>       if (ret) {
>               em = ERR_PTR(ret);
>               goto out;
> @@ -7276,8 +7274,9 @@ static int __btrfs_prealloc_file_range(struct inode 
> *inode, int mode,
>                       }
>               }
>  
> -             ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
> -                                        0, *alloc_hint, (u64)-1, &ins, 1);
> +             ret = btrfs_reserve_extent_data(trans, root, num_bytes,
> +                                             min_size, 0, *alloc_hint,
> +                                             (u64)-1, &ins);
>               if (ret) {
>                       if (own_trans)
>                               btrfs_end_transaction(trans, root);
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 970977a..808203c 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -347,7 +347,7 @@ static noinline int create_subvol(struct btrfs_root *root,
>       if (IS_ERR(trans))
>               return PTR_ERR(trans);
>  
> -     leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
> +     leaf = btrfs_alloc_free_block(trans, root, root->leafsize, NULL,
>                                     0, objectid, NULL, 0, 0, 0);
>       if (IS_ERR(leaf)) {
>               ret = PTR_ERR(leaf);
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index 786639f..21d08e2 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -25,6 +25,7 @@
>  #include "print-tree.h"
>  #include "compat.h"
>  #include "tree-log.h"
> +#include "free-space-cache.h"
>  
>  /* magic values for the inode_only field in btrfs_log_inode:
>   *
> @@ -1763,6 +1764,8 @@ static noinline int walk_down_log_tree(struct 
> btrfs_trans_handle *trans,
>                               ret = btrfs_free_reserved_extent(root,
>                                                        bytenr, blocksize);
>                               BUG_ON(ret);
> +
> +                             btrfs_free_extent_cluster(next);
>                       }
>                       free_extent_buffer(next);
>                       continue;
> @@ -1832,6 +1835,8 @@ static noinline int walk_up_log_tree(struct 
> btrfs_trans_handle *trans,
>                                               path->nodes[*level]->start,
>                                               path->nodes[*level]->len);
>                               BUG_ON(ret);
> +
> +                             btrfs_free_extent_cluster(next);
>                       }
>                       free_extent_buffer(path->nodes[*level]);
>                       path->nodes[*level] = NULL;
> @@ -1900,6 +1905,8 @@ static int walk_log_tree(struct btrfs_trans_handle 
> *trans,
>                       ret = btrfs_free_reserved_extent(log, next->start,
>                                                        next->len);
>                       BUG_ON(ret);
> +
> +                     btrfs_free_extent_cluster(next);
>               }
>       }
>  

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to