On Friday, September 02, 2016 03:40:05 PM Josef Bacik wrote:

Please find my comments inlined below.

> In order to more efficiently support sub-page blocksizes we need to stop
> allocating pages from pagecache for our metadata.  Instead switch to using the
> account_metadata* counters for making sure we are keeping the system aware of
> how much dirty metadata we have, and use the ->free_cached_objects super
> operation in order to handle freeing up extent buffers.  This greatly
> simplifies how we deal with extent buffers as now we no longer have to tie
> the page cache reclamation stuff to the extent buffer stuff.  This will
> also allow us to simply kmalloc() our data for sub-page blocksizes.
> 
> Signed-off-by: Josef Bacik <[email protected]>
> ---
>  fs/btrfs/btrfs_inode.h                 |   3 +-
>  fs/btrfs/ctree.c                       |  10 +-
>  fs/btrfs/ctree.h                       |  13 +-
>  fs/btrfs/disk-io.c                     | 389 ++++----------
>  fs/btrfs/extent_io.c                   | 913 ++++++++++++++++++---------------
>  fs/btrfs/extent_io.h                   |  49 +-
>  fs/btrfs/inode.c                       |   6 +-
>  fs/btrfs/root-tree.c                   |   2 +-
>  fs/btrfs/super.c                       |  29 +-
>  fs/btrfs/tests/btrfs-tests.c           |  37 +-
>  fs/btrfs/tests/extent-io-tests.c       |   4 +-
>  fs/btrfs/tests/free-space-tree-tests.c |   4 +-
>  fs/btrfs/tests/qgroup-tests.c          |   4 +-
>  fs/btrfs/transaction.c                 |  11 +-
>  14 files changed, 726 insertions(+), 748 deletions(-)
> 
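
The overall direction looks good to me.  For other reviewers, my reading of
how the new reclaim path hangs together: the eb_info->lru_list added below
gets drained through the VFS ->nr_cached_objects/->free_cached_objects pair
called from super_cache_scan().  Those two hooks are not in the hunks quoted
here, so the names and bodies below are only my sketch of what
fs/btrfs/super.c presumably ends up with (btrfs_eb_isolate is a stand-in):

static enum lru_status btrfs_eb_isolate(struct list_head *item,
					struct list_lru_one *lru,
					spinlock_t *lock, void *arg)
{
	/* stand-in: the real callback would drop clean, unreferenced ebs */
	return LRU_SKIP;
}

static long btrfs_nr_cached_objects(struct super_block *sb,
				    struct shrink_control *sc)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);

	return list_lru_shrink_count(&fs_info->eb_info->lru_list, sc);
}

static long btrfs_free_cached_objects(struct super_block *sb,
				      struct shrink_control *sc)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);

	return list_lru_shrink_walk(&fs_info->eb_info->lru_list, sc,
				    btrfs_eb_isolate, NULL);
}

static const struct super_operations btrfs_super_ops = {
	/* ... existing ops ... */
	.nr_cached_objects	= btrfs_nr_cached_objects,
	.free_cached_objects	= btrfs_free_cached_objects,
};
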
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 1a8fa46..ad7b185 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -229,10 +229,9 @@ static inline u64 btrfs_ino(struct inode *inode)
>       u64 ino = BTRFS_I(inode)->location.objectid;
> 
>       /*
> -      * !ino: btree_inode
>        * type == BTRFS_ROOT_ITEM_KEY: subvol dir
>        */
> -     if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
> +     if (BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
>               ino = inode->i_ino;
>       return ino;
>  }
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index d1c56c9..b267053 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -1373,8 +1373,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
> 
>       if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
>               BUG_ON(tm->slot != 0);
> -             eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start,
> -                                             eb->len);
> +             eb_rewin = alloc_dummy_extent_buffer(fs_info->eb_info,
> +                                                  eb->start, eb->len);
>               if (!eb_rewin) {
>                       btrfs_tree_read_unlock_blocking(eb);
>                       free_extent_buffer(eb);
> @@ -1455,8 +1455,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
>       } else if (old_root) {
>               btrfs_tree_read_unlock(eb_root);
>               free_extent_buffer(eb_root);
> -             eb = alloc_dummy_extent_buffer(root->fs_info, logical,
> -                                     root->nodesize);
> +             eb = alloc_dummy_extent_buffer(root->fs_info->eb_info, logical,
> +                                            root->nodesize);
>       } else {
>               btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
>               eb = btrfs_clone_extent_buffer(eb_root);
> @@ -1772,7 +1772,7 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
>       int err;
> 
>       if (low > high) {
> -             btrfs_err(eb->fs_info,
> +             btrfs_err(eb->eb_info->fs_info,
>                "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
>                         __func__, low, high, eb->start,
>                         btrfs_header_owner(eb), btrfs_header_level(eb));
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 282a031..ee6956c 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -37,6 +37,7 @@
>  #include <linux/workqueue.h>
>  #include <linux/security.h>
>  #include <linux/sizes.h>
> +#include <linux/list_lru.h>
>  #include "extent_io.h"
>  #include "extent_map.h"
>  #include "async-thread.h"
> @@ -675,6 +676,7 @@ struct btrfs_device;
>  struct btrfs_fs_devices;
>  struct btrfs_balance_control;
>  struct btrfs_delayed_root;
> +struct btrfs_eb_info;
> 
>  #define BTRFS_FS_BARRIER                     1
>  #define BTRFS_FS_CLOSING_START                       2
> @@ -797,7 +799,7 @@ struct btrfs_fs_info {
>       struct btrfs_super_block *super_for_commit;
>       struct block_device *__bdev;
>       struct super_block *sb;
> -     struct inode *btree_inode;
> +     struct btrfs_eb_info *eb_info;
>       struct backing_dev_info bdi;
>       struct mutex tree_log_mutex;
>       struct mutex transaction_kthread_mutex;
> @@ -1042,10 +1044,6 @@ struct btrfs_fs_info {
>       /* readahead works cnt */
>       atomic_t reada_works_cnt;
> 
> -     /* Extent buffer radix tree */
> -     spinlock_t buffer_lock;
> -     struct radix_tree_root buffer_radix;
> -
>       /* next backup root to be overwritten */
>       int backup_root_index;
> 
> @@ -2884,6 +2882,8 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
> 
>  static inline void free_fs_info(struct btrfs_fs_info *fs_info)
>  {
> +     list_lru_destroy(&fs_info->eb_info->lru_list);
> +     kfree(fs_info->eb_info);
>       kfree(fs_info->balance_ctl);
>       kfree(fs_info->delayed_root);
>       kfree(fs_info->extent_root);
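
One problem with the new free_fs_info(): if the kzalloc() of eb_info in
open_ctree() fails we take the fail_alloc path and still end up here, at
which point list_lru_destroy() dereferences a NULL eb_info.  Unless there is
a guard I'm missing, this wants to be something like:

	if (fs_info->eb_info) {
		list_lru_destroy(&fs_info->eb_info->lru_list);
		kfree(fs_info->eb_info);
	}

(list_lru_destroy() itself is safe after a failed list_lru_init(), it bails
out when lru->node is NULL, so only the NULL eb_info case matters.)
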
> @@ -3121,9 +3121,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
>                            struct btrfs_root *new_root,
>                            struct btrfs_root *parent_root,
>                            u64 new_dirid);
> -int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
> -                      size_t size, struct bio *bio,
> -                      unsigned long bio_flags);
>  void btrfs_set_range_writeback(void *private_data, u64 start, u64 end);
>  int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
>  int btrfs_readpage(struct file *file, struct page *page);
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 9c42e53..03ac601 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -217,56 +217,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
> 
>  #endif
> 
> -/*
> - * extents on the btree inode are pretty simple, there's one extent
> - * that covers the entire device
> - */
> -static struct extent_map *btree_get_extent(struct inode *inode,
> -             struct page *page, size_t pg_offset, u64 start, u64 len,
> -             int create)
> -{
> -     struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
> -     struct extent_map *em;
> -     int ret;
> -
> -     read_lock(&em_tree->lock);
> -     em = lookup_extent_mapping(em_tree, start, len);
> -     if (em) {
> -             em->bdev =
> -                     BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
> -             read_unlock(&em_tree->lock);
> -             goto out;
> -     }
> -     read_unlock(&em_tree->lock);
> -
> -     em = alloc_extent_map();
> -     if (!em) {
> -             em = ERR_PTR(-ENOMEM);
> -             goto out;
> -     }
> -     em->start = 0;
> -     em->len = (u64)-1;
> -     em->block_len = (u64)-1;
> -     em->block_start = 0;
> -     em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
> -
> -     write_lock(&em_tree->lock);
> -     ret = add_extent_mapping(em_tree, em, 0);
> -     if (ret == -EEXIST) {
> -             free_extent_map(em);
> -             em = lookup_extent_mapping(em_tree, start, len);
> -             if (!em)
> -                     em = ERR_PTR(-EIO);
> -     } else if (ret) {
> -             free_extent_map(em);
> -             em = ERR_PTR(ret);
> -     }
> -     write_unlock(&em_tree->lock);
> -
> -out:
> -     return em;
> -}
> -
>  u32 btrfs_csum_data(char *data, u32 seed, size_t len)
>  {
>       return btrfs_crc32c(seed, data, len);
> @@ -349,11 +299,11 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
>   * detect blocks that either didn't get written at all or got written
>   * in the wrong place.
>   */
> -static int verify_parent_transid(struct extent_io_tree *io_tree,
> -                              struct extent_buffer *eb, u64 parent_transid,
> +static int verify_parent_transid(struct extent_buffer *eb, u64 parent_transid,
>                                int atomic)
>  {
>       struct extent_state *cached_state = NULL;
> +     struct extent_io_tree *io_tree = &eb->eb_info->io_tree;
>       int ret;
>       bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
> 
> @@ -375,7 +325,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
>               ret = 0;
>               goto out;
>       }
> -     btrfs_err_rl(eb->fs_info,
> +     btrfs_err_rl(eb->eb_info->fs_info,
>               "parent transid verify failed on %llu wanted %llu found %llu",
>                       eb->start,
>                       parent_transid, btrfs_header_generation(eb));
> @@ -445,7 +395,6 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
>                                         struct extent_buffer *eb,
>                                         u64 parent_transid)
>  {
> -     struct extent_io_tree *io_tree;
>       int failed = 0;
>       int ret;
>       int num_copies = 0;
> @@ -453,13 +402,10 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
>       int failed_mirror = 0;
> 
>       clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
> -     io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
>       while (1) {
> -             ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
> -                                            btree_get_extent, mirror_num);
> +             ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
>               if (!ret) {
> -                     if (!verify_parent_transid(io_tree, eb,
> -                                                parent_transid, 0))
> +                     if (!verify_parent_transid(eb, parent_transid, 0))
>                               break;
>                       else
>                               ret = -EIO;
> @@ -504,24 +450,11 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
> 
>  static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
>  {
> -     u64 start = page_offset(page);
> -     u64 found_start;
>       struct extent_buffer *eb;
> 
>       eb = (struct extent_buffer *)page->private;
>       if (page != eb->pages[0])
>               return 0;
> -
> -     found_start = btrfs_header_bytenr(eb);
> -     /*
> -      * Please do not consolidate these warnings into a single if.
> -      * It is useful to know what went wrong.
> -      */
> -     if (WARN_ON(found_start != start))
> -             return -EUCLEAN;
> -     if (WARN_ON(!PageUptodate(page)))
> -             return -EUCLEAN;
> -
>       ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
>                       btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
> 
> @@ -619,8 +552,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
>       u64 found_start;
>       int found_level;
>       struct extent_buffer *eb;
> -     struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
> -     struct btrfs_fs_info *fs_info = root->fs_info;
> +     struct btrfs_root *root;
> +     struct btrfs_fs_info *fs_info;
>       int ret = 0;
>       int reads_done;
> 
> @@ -633,6 +566,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
>        * in memory.  Make sure we have a ref for all this other checks
>        */
>       extent_buffer_get(eb);
> +     fs_info = eb->eb_info->fs_info;
> +     root = fs_info->tree_root;
> 
>       reads_done = atomic_dec_and_test(&eb->io_pages);
>       if (!reads_done)
> @@ -693,11 +628,19 @@ err:
>               /*
>                * our io error hook is going to dec the io pages
>                * again, we have to make sure it has something
> -              * to decrement
> +              * to decrement.
> +              *
> +              * TODO: Kill this, we've re-arranged how this works now so we
> +              * don't need to do this io_pages dance.
>                */
>               atomic_inc(&eb->io_pages);
>               clear_extent_buffer_uptodate(eb);
>       }
> +     if (reads_done) {
> +             clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
> +             smp_mb__after_atomic();
> +             wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
> +     }
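
The EXTENT_BUFFER_READING bit is new with this patch; I assume the wait side
lives in the reworked read_extent_buffer_pages() and pairs with the
wake_up_bit() above, roughly (my sketch, not code from this hunk):

	if (wait == WAIT_COMPLETE)
		wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
			       TASK_UNINTERRUPTIBLE);

The smp_mb__after_atomic() between clear_bit() and wake_up_bit() is the
standard ordering for that pattern, so this part looks right to me.
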
>       free_extent_buffer(eb);
>  out:
>       return ret;
> @@ -712,7 +655,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
>       eb->read_mirror = failed_mirror;
>       atomic_dec(&eb->io_pages);
>       if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
> -             btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO);
> +             btree_readahead_hook(eb->eb_info->fs_info, eb, eb->start, -EIO);
>       return -EIO;    /* we fixed nothing */
>  }
> 
> @@ -884,15 +827,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
>       return 0;
>  }
> 
> -static int btree_csum_one_bio(struct bio *bio)
> +static int btree_csum_one_bio(struct btrfs_fs_info *fs_info, struct bio *bio)
>  {
>       struct bio_vec *bvec;
> -     struct btrfs_root *root;
>       int i, ret = 0;
> 
>       bio_for_each_segment_all(bvec, bio, i) {
> -             root = BTRFS_I(bvec->bv_page->mapping->host)->root;
> -             ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
> +             ret = csum_dirty_buffer(fs_info, bvec->bv_page);
>               if (ret)
>                       break;
>       }
> @@ -904,25 +845,27 @@ static int __btree_submit_bio_start(void *private_data, struct bio *bio,
>                                   int mirror_num, unsigned long bio_flags,
>                                   u64 bio_offset)
>  {
> +     struct btrfs_eb_info *eb_info = private_data;
>       /*
>        * when we're called for a write, we're already in the async
>        * submission context.  Just jump into btrfs_map_bio
>        */
> -     return btree_csum_one_bio(bio);
> +     return btree_csum_one_bio(eb_info->fs_info, bio);
>  }
> 
>  static int __btree_submit_bio_done(void *private_data, struct bio *bio,
>                                int mirror_num, unsigned long bio_flags,
>                                u64 bio_offset)
>  {
> -     struct inode *inode = private_data;
> +     struct btrfs_eb_info *eb_info = private_data;
> +     struct btrfs_root *root = eb_info->fs_info->tree_root;
>       int ret;
> 
>       /*
>        * when we're called for a write, we're already in the async
>        * submission context.  Just jump into btrfs_map_bio
>        */
> -     ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1);
> +     ret = btrfs_map_bio(root, bio, mirror_num, 1);
>       if (ret) {
>               bio->bi_error = ret;
>               bio_endio(bio);
> @@ -930,7 +873,7 @@ static int __btree_submit_bio_done(void *private_data, struct bio *bio,
>       return ret;
>  }
> 
> -static int check_async_write(struct inode *inode, unsigned long bio_flags)
> +static int check_async_write(unsigned long bio_flags)
>  {
>       if (bio_flags & EXTENT_BIO_TREE_LOG)
>               return 0;
> @@ -945,8 +888,9 @@ static int btree_submit_bio_hook(void *private_data, struct bio *bio,
>                                int mirror_num, unsigned long bio_flags,
>                                u64 bio_offset)
>  {
> -     struct inode *inode = private_data;
> -     int async = check_async_write(inode, bio_flags);
> +     struct btrfs_eb_info *eb_info = private_data;
> +     struct btrfs_root *root = eb_info->fs_info->tree_root;
> +     int async = check_async_write(bio_flags);
>       int ret;
> 
>       if (bio_op(bio) != REQ_OP_WRITE) {
> @@ -954,23 +898,22 @@ static int btree_submit_bio_hook(void *private_data, struct bio *bio,
>                * called for a read, do the setup so that checksum validation
>                * can happen in the async kernel threads
>                */
> -             ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
> -                                       bio, BTRFS_WQ_ENDIO_METADATA);
> +             ret = btrfs_bio_wq_end_io(eb_info->fs_info, bio,
> +                                       BTRFS_WQ_ENDIO_METADATA);
>               if (ret)
>                       goto out_w_error;
> -             ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
> +             ret = btrfs_map_bio(root, bio, mirror_num, 0);
>       } else if (!async) {
> -             ret = btree_csum_one_bio(bio);
> +             ret = btree_csum_one_bio(eb_info->fs_info, bio);
>               if (ret)
>                       goto out_w_error;
> -             ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
> +             ret = btrfs_map_bio(root, bio, mirror_num, 0);
>       } else {
>               /*
>                * kthread helpers are used to submit writes so that
>                * checksumming can happen in parallel across all CPUs
>                */
> -             ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
> -                                       bio, mirror_num, 0,
> +             ret = btrfs_wq_submit_bio(eb_info->fs_info, bio, mirror_num, 0,
>                                         bio_offset, private_data,
>                                         __btree_submit_bio_start,
>                                         __btree_submit_bio_done);
> @@ -986,118 +929,14 @@ out_w_error:
>       return ret;
>  }
> 
> -#ifdef CONFIG_MIGRATION
> -static int btree_migratepage(struct address_space *mapping,
> -                     struct page *newpage, struct page *page,
> -                     enum migrate_mode mode)
> -{
> -     /*
> -      * we can't safely write a btree page from here,
> -      * we haven't done the locking hook
> -      */
> -     if (PageDirty(page))
> -             return -EAGAIN;
> -     /*
> -      * Buffers may be managed in a filesystem specific way.
> -      * We must have no buffers or drop them.
> -      */
> -     if (page_has_private(page) &&
> -         !try_to_release_page(page, GFP_KERNEL))
> -             return -EAGAIN;
> -     return migrate_page(mapping, newpage, page, mode);
> -}
> -#endif
> -
> -
> -static int btree_writepages(struct address_space *mapping,
> -                         struct writeback_control *wbc)
> -{
> -     struct btrfs_fs_info *fs_info;
> -     int ret;
> -
> -     if (wbc->sync_mode == WB_SYNC_NONE) {
> -
> -             if (wbc->for_kupdate)
> -                     return 0;
> -
> -             fs_info = BTRFS_I(mapping->host)->root->fs_info;
> -             /* this is a bit racy, but that's ok */
> -             ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
> -                                          BTRFS_DIRTY_METADATA_THRESH);
> -             if (ret < 0)
> -                     return 0;
> -     }
> -     return btree_write_cache_pages(mapping, wbc);
> -}
> -
> -static int btree_readpage(struct file *file, struct page *page)
> -{
> -     struct extent_io_tree *tree;
> -     tree = &BTRFS_I(page->mapping->host)->io_tree;
> -     return extent_read_full_page(tree, page, btree_get_extent, 0);
> -}
> -
> -static int btree_releasepage(struct page *page, gfp_t gfp_flags)
> -{
> -     if (PageWriteback(page) || PageDirty(page))
> -             return 0;
> -
> -     return try_release_extent_buffer(page);
> -}
> -
> -static void btree_invalidatepage(struct page *page, unsigned int offset,
> -                              unsigned int length)
> -{
> -     struct extent_io_tree *tree;
> -     tree = &BTRFS_I(page->mapping->host)->io_tree;
> -     extent_invalidatepage(tree, page, offset);
> -     btree_releasepage(page, GFP_NOFS);
> -     if (PagePrivate(page)) {
> -             btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
> -                        "page private not zero on page %llu",
> -                        (unsigned long long)page_offset(page));
> -             ClearPagePrivate(page);
> -             set_page_private(page, 0);
> -             put_page(page);
> -     }
> -}
> -
> -static int btree_set_page_dirty(struct page *page)
> -{
> -#ifdef DEBUG
> -     struct extent_buffer *eb;
> -
> -     BUG_ON(!PagePrivate(page));
> -     eb = (struct extent_buffer *)page->private;
> -     BUG_ON(!eb);
> -     BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
> -     BUG_ON(!atomic_read(&eb->refs));
> -     btrfs_assert_tree_locked(eb);
> -#endif
> -     return __set_page_dirty_nobuffers(page);
> -}
> -
> -static const struct address_space_operations btree_aops = {
> -     .readpage       = btree_readpage,
> -     .writepages     = btree_writepages,
> -     .releasepage    = btree_releasepage,
> -     .invalidatepage = btree_invalidatepage,
> -#ifdef CONFIG_MIGRATION
> -     .migratepage    = btree_migratepage,
> -#endif
> -     .set_page_dirty = btree_set_page_dirty,
> -};
> -
>  void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
>  {
>       struct extent_buffer *buf = NULL;
> -     struct inode *btree_inode = root->fs_info->btree_inode;
> 
>       buf = btrfs_find_create_tree_block(root, bytenr);
>       if (IS_ERR(buf))
>               return;
> -     read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
> -                              buf, WAIT_NONE, btree_get_extent, 0);
> +     read_extent_buffer_pages(buf, WAIT_NONE, 0);
>       free_extent_buffer(buf);
>  }
> 
> @@ -1105,8 +944,6 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
>                        int mirror_num, struct extent_buffer **eb)
>  {
>       struct extent_buffer *buf = NULL;
> -     struct inode *btree_inode = root->fs_info->btree_inode;
> -     struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
>       int ret;
> 
>       buf = btrfs_find_create_tree_block(root, bytenr);
> @@ -1115,8 +952,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
> 
>       set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
> 
> -     ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
> -                                    btree_get_extent, mirror_num);
> +     ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
>       if (ret) {
>               free_extent_buffer(buf);
>               return ret;
> @@ -1136,29 +972,29 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
>  struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
>                                           u64 bytenr)
>  {
> -     return find_extent_buffer(fs_info, bytenr);
> +     return find_extent_buffer(fs_info->eb_info, bytenr);
>  }
> 
>  struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
>                                                u64 bytenr)
>  {
>       if (btrfs_is_testing(root->fs_info))
> -             return alloc_test_extent_buffer(root->fs_info, bytenr,
> -                             root->nodesize);
> +             return alloc_test_extent_buffer(root->fs_info->eb_info, bytenr,
> +                                             root->nodesize);
>       return alloc_extent_buffer(root->fs_info, bytenr);
>  }
> 
> 
>  int btrfs_write_tree_block(struct extent_buffer *buf)
>  {
> -     return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
> -                                     buf->start + buf->len - 1);
> +     return btree_write_range(buf->eb_info->fs_info, buf->start,
> +                              buf->start + buf->len - 1);
>  }
> 
>  int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
>  {
> -     return filemap_fdatawait_range(buf->pages[0]->mapping,
> -                                    buf->start, buf->start + buf->len - 1);
> +     return btree_wait_range(buf->eb_info->fs_info, buf->start,
> +                             buf->start + buf->len - 1);
>  }
> 
>  struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
> @@ -1188,14 +1024,10 @@ void clean_tree_block(struct btrfs_trans_handle *trans,
>           fs_info->running_transaction->transid) {
>               btrfs_assert_tree_locked(buf);
> 
> -             if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
> +             if (clear_extent_buffer_dirty(buf))
>                       __percpu_counter_add(&fs_info->dirty_metadata_bytes,
>                                            -buf->len,
>                                            fs_info->dirty_metadata_batch);
> -                     /* ugh, clear_extent_buffer_dirty needs to lock the page */
> -                     btrfs_set_lock_blocking(buf);
> -                     clear_extent_buffer_dirty(buf);
> -             }
>       }
>  }
> 
> @@ -2247,33 +2079,20 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
>       init_waitqueue_head(&fs_info->balance_wait_q);
>  }
> 
> -static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
> -                                struct btrfs_root *tree_root)
> +int btrfs_init_eb_info(struct btrfs_fs_info *fs_info)
>  {
> -     fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
> -     set_nlink(fs_info->btree_inode, 1);
> -     /*
> -      * we set the i_size on the btree inode to the max possible int.
> -      * the real end of the address space is determined by all of
> -      * the devices in the system
> -      */
> -     fs_info->btree_inode->i_size = OFFSET_MAX;
> -     fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
> -
> -     RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
> -     extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
> -                         fs_info->btree_inode);
> -     BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
> -     extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
> -
> -     BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
> -
> -     BTRFS_I(fs_info->btree_inode)->root = tree_root;
> -     memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
> -            sizeof(struct btrfs_key));
> -     set_bit(BTRFS_INODE_DUMMY,
> -             &BTRFS_I(fs_info->btree_inode)->runtime_flags);
> -     btrfs_insert_inode_hash(fs_info->btree_inode);
> +     struct btrfs_eb_info *eb_info = fs_info->eb_info;
> +
> +     eb_info->fs_info = fs_info;
> +     extent_io_tree_init(&eb_info->io_tree, eb_info);
> +     eb_info->io_tree.track_uptodate = 0;
> +     eb_info->io_tree.ops = &btree_extent_io_ops;
> +     extent_io_tree_init(&eb_info->io_failure_tree, eb_info);
> +     INIT_RADIX_TREE(&eb_info->buffer_radix, GFP_ATOMIC);
> +     spin_lock_init(&eb_info->buffer_lock);
> +     if (list_lru_init(&eb_info->lru_list))
> +             return -ENOMEM;
> +     return 0;
>  }
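
For reference, struct btrfs_eb_info itself is not visible in the quoted
hunks; from this initializer and the uses elsewhere in the patch I take it
the definition in extent_io.h is more or less (field names inferred, not
quoted):

struct btrfs_eb_info {
	struct btrfs_fs_info *fs_info;

	struct extent_io_tree io_tree;
	struct extent_io_tree io_failure_tree;

	/* extent buffer radix tree, moved out of btrfs_fs_info */
	spinlock_t buffer_lock;
	struct radix_tree_root buffer_radix;

	/* LRU feeding ->nr_cached_objects/->free_cached_objects */
	struct list_lru lru_list;

	/* metadata analogue of address_space->writeback_index */
	pgoff_t writeback_index;
};
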
> 
>  static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
> @@ -2566,16 +2385,7 @@ int open_ctree(struct super_block *sb,
>               goto fail_delalloc_bytes;
>       }
> 
> -     fs_info->btree_inode = new_inode(sb);
> -     if (!fs_info->btree_inode) {
> -             err = -ENOMEM;
> -             goto fail_bio_counter;
> -     }
> -
> -     mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
> -
>       INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
> -     INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
>       INIT_LIST_HEAD(&fs_info->trans_list);
>       INIT_LIST_HEAD(&fs_info->dead_roots);
>       INIT_LIST_HEAD(&fs_info->delayed_iputs);
> @@ -2590,7 +2400,6 @@ int open_ctree(struct super_block *sb,
>       spin_lock_init(&fs_info->tree_mod_seq_lock);
>       spin_lock_init(&fs_info->super_lock);
>       spin_lock_init(&fs_info->qgroup_op_lock);
> -     spin_lock_init(&fs_info->buffer_lock);
>       spin_lock_init(&fs_info->unused_bgs_lock);
>       rwlock_init(&fs_info->tree_mod_log_lock);
>       mutex_init(&fs_info->unused_bg_unpin_mutex);
> @@ -2643,7 +2452,7 @@ int open_ctree(struct super_block *sb,
>                                       GFP_KERNEL);
>       if (!fs_info->delayed_root) {
>               err = -ENOMEM;
> -             goto fail_iput;
> +             goto fail_alloc;
>       }
>       btrfs_init_delayed_root(fs_info->delayed_root);
> 
> @@ -2658,7 +2467,15 @@ int open_ctree(struct super_block *sb,
>       sb->s_blocksize_bits = blksize_bits(4096);
>       sb->s_bdi = &fs_info->bdi;
> 
> -     btrfs_init_btree_inode(fs_info, tree_root);
> +     fs_info->eb_info = kzalloc(sizeof(struct btrfs_eb_info), GFP_KERNEL);
> +     if (!fs_info->eb_info) {
> +             err = -ENOMEM;
> +             goto fail_alloc;
> +     }
> +     if (btrfs_init_eb_info(fs_info)) {
> +             err = -ENOMEM;
> +             goto fail_alloc;
> +     }
> 
>       spin_lock_init(&fs_info->block_group_cache_lock);
>       fs_info->block_group_cache_tree = RB_ROOT;
> @@ -3085,6 +2902,14 @@ retry_root_backup:
>       if (sb->s_flags & MS_RDONLY)
>               return 0;
> 
> +     /*
> +      * We need to make sure we are on the bdi's dirty list so we get
> +      * writeback requests for our fs properly.
> +      */
> +     spin_lock(&fs_info->bdi.sb_list_lock);
> +     list_add_tail(&fs_info->bdi.dirty_sb_list, &sb->s_bdi_list);
> +     spin_unlock(&fs_info->bdi.sb_list_lock);
> +
>       if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) &&
>           !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
>               btrfs_info(fs_info, "creating free space tree");
> @@ -3180,7 +3005,8 @@ fail_cleaner:
>        * make sure we're done with the btree inode before we stop our
>        * kthreads
>        */
> -     filemap_write_and_wait(fs_info->btree_inode->i_mapping);
> +     btree_write_range(fs_info, 0, (u64)-1);
> +     btree_wait_range(fs_info, 0, (u64)-1);
> 
>  fail_sysfs:
>       btrfs_sysfs_remove_mounted(fs_info);
> @@ -3194,16 +3020,11 @@ fail_block_groups:
> 
>  fail_tree_roots:
>       free_root_pointers(fs_info, 1);
> -     invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
> -
> +     btrfs_invalidate_eb_info(fs_info->eb_info);
>  fail_sb_buffer:
>       btrfs_stop_all_workers(fs_info);
>  fail_alloc:
> -fail_iput:
>       btrfs_mapping_tree_free(&fs_info->mapping_tree);
> -
> -     iput(fs_info->btree_inode);
> -fail_bio_counter:
>       percpu_counter_destroy(&fs_info->bio_counter);
>  fail_delalloc_bytes:
>       percpu_counter_destroy(&fs_info->delalloc_bytes);
> @@ -3908,14 +3729,11 @@ void close_ctree(struct btrfs_root *root)
>        * we must make sure there is not any read request to
>        * submit after we stopping all workers.
>        */
> -     invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
>       btrfs_stop_all_workers(fs_info);
> 
>       clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
>       free_root_pointers(fs_info, 1);
> 
> -     iput(fs_info->btree_inode);
> -
>  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
>       if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY))
>               btrfsic_unmount(root, fs_info->fs_devices);
> @@ -3924,6 +3742,8 @@ void close_ctree(struct btrfs_root *root)
>       btrfs_close_devices(fs_info->fs_devices);
>       btrfs_mapping_tree_free(&fs_info->mapping_tree);
> 
> +     btrfs_invalidate_eb_info(fs_info->eb_info);
> +
>       percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
>       percpu_counter_destroy(&fs_info->delalloc_bytes);
>       percpu_counter_destroy(&fs_info->bio_counter);
> @@ -3951,14 +3771,12 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
>                         int atomic)
>  {
>       int ret;
> -     struct inode *btree_inode = buf->pages[0]->mapping->host;
> 
>       ret = extent_buffer_uptodate(buf);
>       if (!ret)
>               return ret;
> 
> -     ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
> -                                 parent_transid, atomic);
> +     ret = verify_parent_transid(buf, parent_transid, atomic);
>       if (ret == -EAGAIN)
>               return ret;
>       return !ret;
> @@ -3979,7 +3797,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
>       if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
>               return;
>  #endif
> -     root = BTRFS_I(buf->pages[0]->mapping->host)->root;
> +     root = buf->eb_info->fs_info->tree_root;
>       btrfs_assert_tree_locked(buf);
>       if (transid != root->fs_info->generation)
>               WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
> @@ -4015,10 +3833,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
> 
>       ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
>                                    BTRFS_DIRTY_METADATA_THRESH);
> -     if (ret > 0) {
> +     if (ret > 0)
>               balance_dirty_pages_ratelimited(&root->fs_info->bdi,
>                                               root->fs_info->sb);
> -     }
>  }
> 
>  void btrfs_btree_balance_dirty(struct btrfs_root *root)
> @@ -4033,7 +3850,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
> 
>  int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
>  {
> -     struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
> +     struct btrfs_root *root = buf->eb_info->fs_info->tree_root;
>       return btree_read_extent_buffer_pages(root, buf, parent_transid);
>  }
> 
> @@ -4376,10 +4193,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
>                       if (!eb)
>                               continue;
>                       wait_on_extent_buffer_writeback(eb);
> -
> -                     if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
> -                                            &eb->bflags))
> -                             clear_extent_buffer_dirty(eb);
> +                     clear_extent_buffer_dirty(eb);
>                       free_extent_buffer_stale(eb);
>               }
>       }
> @@ -4504,16 +4318,37 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
> 
>  static struct btrfs_fs_info *btree_fs_info(void *private_data)
>  {
> -     struct inode *inode = private_data;
> -     return btrfs_sb(inode->i_sb);
> +     struct btrfs_eb_info *eb_info = private_data;
> +     return eb_info->fs_info;
> +}
> +
> +static int btree_merge_bio_hook(struct page *page, unsigned long offset,
> +                             size_t size, struct bio *bio,
> +                             unsigned long bio_flags)
> +{
> +     struct extent_buffer *eb = (struct extent_buffer *)page->private;
> +     struct btrfs_fs_info *fs_info = eb->eb_info->fs_info;
> +     u64 logical = (u64)bio->bi_iter.bi_sector << 9;
> +     u64 length = 0;
> +     u64 map_length;
> +     int ret;
> +
> +     length = bio->bi_iter.bi_size;
> +     map_length = length;
> +     ret = btrfs_map_block(fs_info, bio_op(bio), logical, &map_length,
> +                           NULL, 0);
> +     if (ret < 0)
> +             return ret;
> +     if (map_length < length + size)
> +             return 1;
> +     return 0;
>  }
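
This looks like a straight copy of btrfs_merge_bio_hook() from inode.c
minus, if I remember the inode.c version correctly, only its compressed-bio
short-circuit:

	if (bio_flags & EXTENT_BIO_COMPRESSED)
		return 0;

Metadata bios are never compressed, so dropping that check here should be
fine, but a one-line comment on why the two copies diverge would help.
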
> 
>  static const struct extent_io_ops btree_extent_io_ops = {
>       .readpage_end_io_hook = btree_readpage_end_io_hook,
>       .readpage_io_failed_hook = btree_io_failed_hook,
>       .submit_bio_hook = btree_submit_bio_hook,
> -     /* note we're sharing with inode.c for the merge bio hook */
> -     .merge_bio_hook = btrfs_merge_bio_hook,
> +     .merge_bio_hook = btree_merge_bio_hook,
>       .tree_fs_info = btree_fs_info,
>       .set_range_writeback = btrfs_set_range_writeback,
>  };
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 5dcdd3e..5c18a49 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -75,8 +75,8 @@ void btrfs_leak_debug_check(void)
>       while (!list_empty(&buffers)) {
>               eb = list_entry(buffers.next, struct extent_buffer, leak_list);
>               printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu "
> -                    "refs %d\n",
> -                    eb->start, eb->len, atomic_read(&eb->refs));
> +                    "bflags %lu refs %d\n",
> +                    eb->start, eb->len, eb->bflags, atomic_read(&eb->refs));
>               list_del(&eb->leak_list);
>               kmem_cache_free(extent_buffer_cache, eb);
>       }
> @@ -3538,7 +3538,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
>                         struct btrfs_fs_info *fs_info,
>                         struct extent_page_data *epd)
>  {
> -     unsigned long i, num_pages;
> +     struct btrfs_eb_info *eb_info = fs_info->eb_info;
>       int flush = 0;
>       int ret = 0;
> 
> @@ -3585,37 +3585,42 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
> 
>       btrfs_tree_unlock(eb);
> 
> -     if (!ret)
> -             return ret;
> -
> -     num_pages = num_extent_pages(eb->start, eb->len);
> -     for (i = 0; i < num_pages; i++) {
> -             struct page *p = eb->pages[i];
> -
> -             if (!trylock_page(p)) {
> -                     if (!flush) {
> -                             flush_write_bio(epd);
> -                             flush = 1;
> -                     }
> -                     lock_page(p);
> -             }
> +     /*
> +      * We cleared dirty on this buffer, we need to adjust the radix tags.
> +      * We do the actual page accounting in write_one_eb.
> +      */
> +     if (ret) {
> +             spin_lock_irq(&eb_info->buffer_lock);
> +             radix_tree_tag_set(&eb_info->buffer_radix, eb_index(eb),
> +                                PAGECACHE_TAG_WRITEBACK);
> +             radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
> +                                  PAGECACHE_TAG_DIRTY);
> +             radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
> +                                  PAGECACHE_TAG_TOWRITE);
> +             spin_unlock_irq(&eb_info->buffer_lock);
>       }
> -
>       return ret;
>  }
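
The tag juggling above, plus the WRITEBACK tag clear in
end_extent_buffer_writeback() below, together mimic what the page cache
does in clear_page_dirty_for_io() and __test_set_page_writeback()/
test_clear_page_writeback(), just on the eb radix tree under buffer_lock
instead of mapping->tree_lock.  That keeps btree_wait_range() able to find
in-flight buffers via PAGECACHE_TAG_WRITEBACK, which looks correct to me.
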
> 
>  static void end_extent_buffer_writeback(struct extent_buffer *eb)
>  {
> -     clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
> -     smp_mb__after_atomic();
> -     wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
> +     if (test_and_clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
> +             struct btrfs_eb_info *eb_info = eb->eb_info;
> +             unsigned long flags;
> +
> +             spin_lock_irqsave(&eb_info->buffer_lock, flags);
> +             radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
> +                                  PAGECACHE_TAG_WRITEBACK);
> +             spin_unlock_irqrestore(&eb_info->buffer_lock, flags);
> +             wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
> +     }
>  }
> 
>  static void set_btree_ioerr(struct page *page)
>  {
>       struct extent_buffer *eb = (struct extent_buffer *)page->private;
> +     struct btrfs_fs_info *fs_info = eb->eb_info->fs_info;
> 
> -     SetPageError(page);
>       if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
>               return;
> 
> @@ -3624,8 +3629,7 @@ static void set_btree_ioerr(struct page *page)
>        * failed, increment the counter transaction->eb_write_errors.
>        * We do this because while the transaction is running and before it's
>        * committing (when we call filemap_fdata[write|wait]_range against
> -      * the btree inode), we might have
> -      * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
> +      * the btree inode), we might have write_metadata() called - if it
>        * returns an error or an error happens during writeback, when we're
>        * committing the transaction we wouldn't know about it, since the pages
>        * can be no longer dirty nor marked anymore for writeback (if a
> @@ -3659,13 +3663,13 @@ static void set_btree_ioerr(struct page *page)
>        */
>       switch (eb->log_index) {
>       case -1:
> -             set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
> +             set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
>               break;
>       case 0:
> -             set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
> +             set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
>               break;
>       case 1:
> -             set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
> +             set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
>               break;
>       default:
>               BUG(); /* unexpected, logic error */
> @@ -3686,16 +3690,13 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
>               done = atomic_dec_and_test(&eb->io_pages);
> 
>               if (bio->bi_error ||
> -                 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
> -                     ClearPageUptodate(page);
> +                 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
>                       set_btree_ioerr(page);
> -             }
> -
> -             end_page_writeback(page);
> 
> +             account_metadata_end_writeback(page,
> +                                            &eb->eb_info->fs_info->bdi);
>               if (!done)
>                       continue;
> -
>               end_extent_buffer_writeback(eb);
>       }
> 
> @@ -3708,7 +3709,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
>                       struct extent_page_data *epd)
>  {
>       struct block_device *bdev = fs_info->fs_devices->latest_bdev;
> -     struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
> +     struct extent_io_tree *tree = &fs_info->eb_info->io_tree;
>       u64 offset = eb->start;
>       unsigned long i, num_pages;
>       unsigned long bio_flags = 0;
> @@ -3724,8 +3725,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
>       for (i = 0; i < num_pages; i++) {
>               struct page *p = eb->pages[i];
> 
> -             clear_page_dirty_for_io(p);
> -             set_page_writeback(p);
>               ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc,
>                                        p, offset >> 9, PAGE_SIZE, 0, bdev,
>                                        &epd->bio, -1,
> @@ -3734,34 +3733,90 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
>               epd->bio_flags = bio_flags;
>               if (ret) {
>                       set_btree_ioerr(p);
> -                     end_page_writeback(p);
>                       if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
>                               end_extent_buffer_writeback(eb);
>                       ret = -EIO;
>                       break;
>               }
> +             account_metadata_writeback(p, &fs_info->bdi);
>               offset += PAGE_SIZE;
>               update_nr_written(p, wbc, 1);
> -             unlock_page(p);
>       }
> 
> -     if (unlikely(ret)) {
> -             for (; i < num_pages; i++) {
> -                     struct page *p = eb->pages[i];
> -                     clear_page_dirty_for_io(p);
> -                     unlock_page(p);
> +     return ret;
> +}
> +
> +#define EB_TAG_BATCH 4096
> +static void tag_ebs_for_writeback(struct btrfs_eb_info *eb_info, pgoff_t start,
> +                               pgoff_t end)
> +{
> +     unsigned long tagged;
> +
> +     do {
> +             spin_lock_irq(&eb_info->buffer_lock);
> +             tagged = radix_tree_range_tag_if_tagged(&eb_info->buffer_radix,
> +                                                     &start, end,
> +                                                     EB_TAG_BATCH,
> +                                                     PAGECACHE_TAG_DIRTY,
> +                                                     PAGECACHE_TAG_TOWRITE);
> +             spin_unlock_irq(&eb_info->buffer_lock);
> +             cond_resched();
> +     } while (tagged >= EB_TAG_BATCH && start);
> +}
> +
> +static unsigned eb_lookup_tag(struct btrfs_eb_info *eb_info,
> +                           struct extent_buffer **ebs, pgoff_t *index,
> +                           int tag, unsigned nr)
> +{
> +     struct radix_tree_iter iter;
> +     void **slot;
> +     unsigned ret = 0;
> +
> +     if (unlikely(!nr))
> +             return 0;
> +
> +     rcu_read_lock();
> +     radix_tree_for_each_tagged(slot, &eb_info->buffer_radix, &iter, *index,
> +                                tag) {
> +             struct extent_buffer *eb;
> +repeat:
> +             eb = radix_tree_deref_slot(slot);
> +             if (unlikely(!eb))
> +                     continue;
> +
> +             if (radix_tree_exception(eb)) {
> +                     if (radix_tree_deref_retry(eb)) {
> +                             slot = radix_tree_iter_retry(&iter);
> +                             continue;
> +                     }
> +                     continue;
>               }
> -     }
> 
> +             if (unlikely(!atomic_inc_not_zero(&eb->refs)))
> +                     continue;
> +
> +             if (unlikely(eb != *slot)) {
> +                     free_extent_buffer(eb);
> +                     goto repeat;
> +             }
> +
> +             ebs[ret] = eb;
> +             if (++ret == nr)
> +                     break;
> +     }
> +     rcu_read_unlock();
> +     if (ret)
> +             *index = (ebs[ret - 1]->start >> PAGE_SHIFT) + 1;
>       return ret;
>  }
> 
> -int btree_write_cache_pages(struct address_space *mapping,
> +#define EBVEC_SIZE 16
> +static int btree_write_cache_pages(struct btrfs_fs_info *fs_info,
>                                  struct writeback_control *wbc)
>  {
> -     struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
> -     struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
> -     struct extent_buffer *eb, *prev_eb = NULL;
> +     struct btrfs_eb_info *eb_info = fs_info->eb_info;
> +     struct extent_io_tree *tree = &eb_info->io_tree;
> +     struct extent_buffer *eb;
>       struct extent_page_data epd = {
>               .bio = NULL,
>               .tree = tree,
> @@ -3772,16 +3827,16 @@ int btree_write_cache_pages(struct address_space *mapping,
>       int ret = 0;
>       int done = 0;
>       int nr_to_write_done = 0;
> -     struct pagevec pvec;
> -     int nr_pages;
> +     struct extent_buffer *ebs[EBVEC_SIZE];
> +     int nr_ebs;
>       pgoff_t index;
>       pgoff_t end;            /* Inclusive */
> +     pgoff_t done_index = 0;
>       int scanned = 0;
>       int tag;
> 
> -     pagevec_init(&pvec, 0);
>       if (wbc->range_cyclic) {
> -             index = mapping->writeback_index; /* Start from prev offset */
> +             index = eb_info->writeback_index; /* Start from prev offset */
>               end = -1;
>       } else {
>               index = wbc->range_start >> PAGE_SHIFT;
> @@ -3794,53 +3849,27 @@ int btree_write_cache_pages(struct address_space *mapping,
>               tag = PAGECACHE_TAG_DIRTY;
>  retry:
>       if (wbc->sync_mode == WB_SYNC_ALL)
> -             tag_pages_for_writeback(mapping, index, end);
> +             tag_ebs_for_writeback(fs_info->eb_info, index, end);
>       while (!done && !nr_to_write_done && (index <= end) &&
> -            (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
> -                     min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
> +            (nr_ebs = eb_lookup_tag(eb_info, ebs, &index, tag,
> +                     min(end - index, (pgoff_t)EBVEC_SIZE-1) + 1))) {
>               unsigned i;
> 
>               scanned = 1;
> -             for (i = 0; i < nr_pages; i++) {
> -                     struct page *page = pvec.pages[i];
> -
> -                     if (!PagePrivate(page))
> -                             continue;
> -
> -                     if (!wbc->range_cyclic && page->index > end) {
> -                             done = 1;
> -                             break;
> -                     }
> -
> -                     spin_lock(&mapping->private_lock);
> -                     if (!PagePrivate(page)) {
> -                             spin_unlock(&mapping->private_lock);
> -                             continue;
> -                     }
> -
> -                     eb = (struct extent_buffer *)page->private;
> -
> -                     /*
> -                      * Shouldn't happen and normally this would be a BUG_ON
> -                      * but no sense in crashing the users box for something
> -                      * we can survive anyway.
> -                      */
> -                     if (WARN_ON(!eb)) {
> -                             spin_unlock(&mapping->private_lock);
> +             for (i = 0; i < nr_ebs; i++) {
> +                     eb = ebs[i];
> +                     if (done) {
> +                             free_extent_buffer(eb);
>                               continue;
>                       }
> 
> -                     if (eb == prev_eb) {
> -                             spin_unlock(&mapping->private_lock);
> +                     if (!wbc->range_cyclic && eb->start > wbc->range_end) {
> +                             done = 1;
> +                             free_extent_buffer(eb);
>                               continue;
>                       }
> 
> -                     ret = atomic_inc_not_zero(&eb->refs);
> -                     spin_unlock(&mapping->private_lock);
> -                     if (!ret)
> -                             continue;
> -
> -                     prev_eb = eb;
> +                     done_index = eb_index(eb);
>                       ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
>                       if (!ret) {
>                               free_extent_buffer(eb);
> @@ -3848,12 +3877,11 @@ retry:
>                       }
> 
>                       ret = write_one_eb(eb, fs_info, wbc, &epd);
> +                     free_extent_buffer(eb);
>                       if (ret) {
>                               done = 1;
> -                             free_extent_buffer(eb);
> -                             break;
> +                             continue;
>                       }
> -                     free_extent_buffer(eb);
> 
>                       /*
>                        * the filesystem may choose to bump up nr_to_write.
> @@ -3862,7 +3890,6 @@ retry:
>                        */
>                       nr_to_write_done = wbc->nr_to_write <= 0;
>               }
> -             pagevec_release(&pvec);
>               cond_resched();
>       }
>       if (!scanned && !done) {
> @@ -3874,10 +3901,77 @@ retry:
>               index = 0;
>               goto retry;
>       }
> +     if (wbc->range_cyclic)
> +             fs_info->eb_info->writeback_index = done_index;
>       flush_write_bio(&epd);
>       return ret;
>  }
> 
> +void btrfs_write_ebs(struct super_block *sb, struct writeback_control *wbc)
> +{
> +     struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> +     btree_write_cache_pages(fs_info, wbc);
> +}
> +
> +static int __btree_write_range(struct btrfs_fs_info *fs_info, u64 start,
> +                            u64 end, int sync_mode)
> +{
> +     struct writeback_control wbc = {
> +             .sync_mode = sync_mode,
> +             .nr_to_write = LONG_MAX,
> +             .range_start = start,
> +             .range_end = end,
> +     };
> +
> +     return btree_write_cache_pages(fs_info, &wbc);
> +}
> +
> +void btree_flush(struct btrfs_fs_info *fs_info)
> +{
> +     __btree_write_range(fs_info, 0, (u64)-1, WB_SYNC_NONE);
> +}
> +
> +int btree_write_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
> +{
> +     return __btree_write_range(fs_info, start, end, WB_SYNC_ALL);
> +}
> +
> +int btree_wait_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
> +{
> +     struct extent_buffer *ebs[EBVEC_SIZE];
> +     pgoff_t index = start >> PAGE_SHIFT;
> +     pgoff_t end_index = end >> PAGE_SHIFT;
> +     unsigned nr_ebs;
> +     int ret = 0;
> +
> +     if (end < start)
> +             return ret;
> +
> +     while ((index <= end) &&
> +            (nr_ebs = eb_lookup_tag(fs_info->eb_info, ebs, &index,
> +                                    PAGECACHE_TAG_WRITEBACK,
> +                                    min(end_index - index,
> +                                        (pgoff_t)EBVEC_SIZE-1) + 1)) != 0) {
> +             unsigned i;
> +
> +             for (i = 0; i < nr_ebs; i++) {
> +                     struct extent_buffer *eb = ebs[i];
> +
> +                     if (eb->start > end) {
> +                             free_extent_buffer(eb);
> +                             continue;
> +                     }
> +
> +                     wait_on_extent_buffer_writeback(eb);
> +                     if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
> +                             ret = -EIO;
> +                     free_extent_buffer(eb);
> +             }
> +             cond_resched();
> +     }
> +     return ret;
> +}
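
Minor bug here, I think: `index` is a page index but `end` is a byte
offset, so the `index <= end` loop condition compares different units (the
`eb->start > end` check inside saves you in practice).  Presumably this was
meant to be:

	while ((index <= end_index) &&
	       (nr_ebs = eb_lookup_tag(fs_info->eb_info, ebs, &index,
				       PAGECACHE_TAG_WRITEBACK,
				       min(end_index - index,
					   (pgoff_t)EBVEC_SIZE-1) + 1)) != 0) {
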
> +
>  /**
>   * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
>   * @mapping: address space structure to write
> @@ -4558,7 +4652,6 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
>  {
>       unsigned long index;
>       struct page *page;
> -     int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
> 
>       BUG_ON(extent_buffer_under_io(eb));
> 
> @@ -4566,39 +4659,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
>       if (index == 0)
>               return;
> 
> +     ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
>       do {
>               index--;
>               page = eb->pages[index];
>               if (!page)
>                       continue;
> -             if (mapped)
> -                     spin_lock(&page->mapping->private_lock);
> -             /*
> -              * We do this since we'll remove the pages after we've
> -              * removed the eb from the radix tree, so we could race
> -              * and have this page now attached to the new eb.  So
> -              * only clear page_private if it's still connected to
> -              * this eb.
> -              */
> -             if (PagePrivate(page) &&
> -                 page->private == (unsigned long)eb) {
> -                     BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
> -                     BUG_ON(PageDirty(page));
> -                     BUG_ON(PageWriteback(page));
> -                     /*
> -                      * We need to make sure we haven't be attached
> -                      * to a new eb.
> -                      */
> -                     ClearPagePrivate(page);
> -                     set_page_private(page, 0);
> -                     /* One for the page private */
> -                     put_page(page);
> -             }
> +             ASSERT(PagePrivate(page));
> +             ASSERT(page->private == (unsigned long)eb);
> +             ClearPagePrivate(page);
> +             set_page_private(page, 0);
> 
> -             if (mapped)
> -                     spin_unlock(&page->mapping->private_lock);
> +             /* Once for the page private. */
> +             put_page(page);
> 
> -             /* One for when we allocated the page */
> +             /* Once for the alloc_page. */
>               put_page(page);
>       } while (index != 0);
>  }
> @@ -4613,7 +4688,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
>  }
> 
>  static struct extent_buffer *
> -__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
> +__alloc_extent_buffer(struct btrfs_eb_info *eb_info, u64 start,
>                     unsigned long len)
>  {
>       struct extent_buffer *eb = NULL;
> @@ -4621,7 +4696,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
>       eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
>       eb->start = start;
>       eb->len = len;
> -     eb->fs_info = fs_info;
> +     eb->eb_info = eb_info;
>       eb->bflags = 0;
>       rwlock_init(&eb->lock);
>       atomic_set(&eb->write_locks, 0);
> @@ -4633,6 +4708,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
>       eb->lock_nested = 0;
>       init_waitqueue_head(&eb->write_lock_wq);
>       init_waitqueue_head(&eb->read_lock_wq);
> +     INIT_LIST_HEAD(&eb->lru);
> 
>       btrfs_leak_debug_add(&eb->leak_list, &buffers);
> 
> @@ -4657,7 +4733,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
>       struct extent_buffer *new;
>       unsigned long num_pages = num_extent_pages(src->start, src->len);
> 
> -     new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
> +     new = __alloc_extent_buffer(src->eb_info, src->start, src->len);
>       if (new == NULL)
>               return NULL;
> 
> @@ -4668,8 +4744,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
>                       return NULL;
>               }
>               attach_extent_buffer_page(new, p);
> -             WARN_ON(PageDirty(p));
> -             SetPageUptodate(p);
>               new->pages[i] = p;
>       }
> 
> @@ -4680,8 +4754,8 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
>       return new;
>  }
> 
> -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
> -                                               u64 start, unsigned long len)
> +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_eb_info *eb_info,
> +                                             u64 start, unsigned long len)
>  {
>       struct extent_buffer *eb;
>       unsigned long num_pages;
> @@ -4689,7 +4763,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
> 
>       num_pages = num_extent_pages(start, len);
> 
> -     eb = __alloc_extent_buffer(fs_info, start, len);
> +     eb = __alloc_extent_buffer(eb_info, start, len);
>       if (!eb)
>               return NULL;
> 
> @@ -4697,6 +4771,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
>               eb->pages[i] = alloc_page(GFP_NOFS);
>               if (!eb->pages[i])
>                       goto err;
> +             attach_extent_buffer_page(eb, eb->pages[i]);
>       }
>       set_extent_buffer_uptodate(eb);
>       btrfs_set_header_nritems(eb, 0);
> @@ -4704,30 +4779,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
> 
>       return eb;
>  err:
> -     for (; i > 0; i--)
> -             __free_page(eb->pages[i - 1]);
> -     __free_extent_buffer(eb);
> +     btrfs_release_extent_buffer(eb);
>       return NULL;
>  }
> 
> -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
> -                                             u64 start, u32 nodesize)
> -{
> -     unsigned long len;
> -
> -     if (!fs_info) {
> -             /*
> -              * Called only from tests that don't always have a fs_info
> -              * available
> -              */
> -             len = nodesize;
> -     } else {
> -             len = fs_info->tree_root->nodesize;
> -     }
> -
> -     return __alloc_dummy_extent_buffer(fs_info, start, len);
> -}
> -
>  static void check_buffer_tree_ref(struct extent_buffer *eb)
>  {
>       int refs;
> @@ -4777,13 +4832,13 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb,
>       }
>  }
> 
> -struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
> +struct extent_buffer *find_extent_buffer(struct btrfs_eb_info *eb_info,
>                                        u64 start)
>  {
>       struct extent_buffer *eb;
> 
>       rcu_read_lock();
> -     eb = radix_tree_lookup(&fs_info->buffer_radix,
> +     eb = radix_tree_lookup(&eb_info->buffer_radix,
>                              start >> PAGE_SHIFT);
>       if (eb && atomic_inc_not_zero(&eb->refs)) {
>               rcu_read_unlock();
> @@ -4815,30 +4870,30 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
>  }
> 
>  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
> -struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
> -                                     u64 start, u32 nodesize)
> +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_eb_info *eb_info,
> +                                            u64 start, u32 nodesize)
>  {
>       struct extent_buffer *eb, *exists = NULL;
>       int ret;
> 
> -     eb = find_extent_buffer(fs_info, start);
> +     eb = find_extent_buffer(eb_info, start);
>       if (eb)
>               return eb;
> -     eb = alloc_dummy_extent_buffer(fs_info, start, nodesize);
> +     eb = alloc_dummy_extent_buffer(eb_info, start, nodesize);
>       if (!eb)
>               return NULL;
> -     eb->fs_info = fs_info;
> +     eb->eb_info = eb_info;
>  again:
>       ret = radix_tree_preload(GFP_NOFS);
>       if (ret)
>               goto free_eb;
> -     spin_lock(&fs_info->buffer_lock);
> -     ret = radix_tree_insert(&fs_info->buffer_radix,
> +     spin_lock_irq(&eb_info->buffer_lock);
> +     ret = radix_tree_insert(&eb_info->buffer_radix,
>                               start >> PAGE_SHIFT, eb);
> -     spin_unlock(&fs_info->buffer_lock);
> +     spin_unlock_irq(&eb_info->buffer_lock);
>       radix_tree_preload_end();
>       if (ret == -EEXIST) {
> -             exists = find_extent_buffer(fs_info, start);
> +             exists = find_extent_buffer(eb_info, start);
>               if (exists)
>                       goto free_eb;
>               else
> @@ -4854,6 +4909,7 @@ again:
>        * bump the ref count again.
>        */
>       atomic_inc(&eb->refs);
> +     set_extent_buffer_uptodate(eb);
>       return eb;
>  free_eb:
>       btrfs_release_extent_buffer(eb);
> @@ -4867,12 +4923,12 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>       unsigned long len = fs_info->tree_root->nodesize;
>       unsigned long num_pages = num_extent_pages(start, len);
>       unsigned long i;
> -     unsigned long index = start >> PAGE_SHIFT;
>       struct extent_buffer *eb;
>       struct extent_buffer *exists = NULL;
>       struct page *p;
> -     struct address_space *mapping = fs_info->btree_inode->i_mapping;
> -     int uptodate = 1;
> +     struct btrfs_eb_info *eb_info = fs_info->eb_info;
> +//   struct zone *last_zone = NULL;
> +//   struct pg_data_t *last_pgdata = NULL;
>       int ret;
> 
>       if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) {
> @@ -4880,62 +4936,36 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>               return ERR_PTR(-EINVAL);
>       }
> 
> -     eb = find_extent_buffer(fs_info, start);
> +     eb = find_extent_buffer(eb_info, start);
>       if (eb)
>               return eb;
> 
> -     eb = __alloc_extent_buffer(fs_info, start, len);
> +     eb = __alloc_extent_buffer(eb_info, start, len);
>       if (!eb)
>               return ERR_PTR(-ENOMEM);
> 
> -     for (i = 0; i < num_pages; i++, index++) {
> -             p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
> +     for (i = 0; i < num_pages; i++) {
> +             p = alloc_page(GFP_NOFS|__GFP_NOFAIL);
>               if (!p) {
>                       exists = ERR_PTR(-ENOMEM);
>                       goto free_eb;
>               }
> 
> -             spin_lock(&mapping->private_lock);
> -             if (PagePrivate(p)) {
> -                     /*
> -                      * We could have already allocated an eb for this page
> -                      * and attached one so lets see if we can get a ref on
> -                      * the existing eb, and if we can we know it's good and
> -                      * we can just return that one, else we know we can just
> -                      * overwrite page->private.
> -                      */
> -                     exists = (struct extent_buffer *)p->private;
> -                     if (atomic_inc_not_zero(&exists->refs)) {
> -                             spin_unlock(&mapping->private_lock);
> -                             unlock_page(p);
> -                             put_page(p);
> -                             mark_extent_buffer_accessed(exists, p);
> -                             goto free_eb;
> -                     }
> -                     exists = NULL;
> -
> -                     /*
> -                      * Do this so attach doesn't complain and we need to
> -                      * drop the ref the old guy had.
> -                      */
> -                     ClearPagePrivate(p);
> -                     WARN_ON(PageDirty(p));
> -                     put_page(p);
> -             }
> +             /*
> +              * If our pages span zones or numa nodes we have to do
> +              * dirty/writeback accounting per page, otherwise we can do it
> +              * in bulk and save us some looping.
> +              *
> +             if (!last_zone)
> +                     last_zone = page_zone(p);
> +             if (!last_pgdata)
> +                     last_pgdata = page_pgdata(p);
> +             if (last_zone != page_zone(p) || last_pgdata != page_pgdata(p))
> +                     set_bit(EXTENT_BUFFER_MIXED_PAGES, &eb->bflags);
> +             */
>               attach_extent_buffer_page(eb, p);
> -             spin_unlock(&mapping->private_lock);
> -             WARN_ON(PageDirty(p));
>               eb->pages[i] = p;
> -             if (!PageUptodate(p))
> -                     uptodate = 0;
> -
> -             /*
> -              * see below about how we avoid a nasty race with release page
> -              * and why we unlock later
> -              */
>       }
> -     if (uptodate)
> -             set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
>  again:
>       ret = radix_tree_preload(GFP_NOFS);
>       if (ret) {
> @@ -4943,13 +4973,13 @@ again:
>               goto free_eb;
>       }
> 
> -     spin_lock(&fs_info->buffer_lock);
> -     ret = radix_tree_insert(&fs_info->buffer_radix,
> +     spin_lock_irq(&eb_info->buffer_lock);
> +     ret = radix_tree_insert(&eb_info->buffer_radix,
>                               start >> PAGE_SHIFT, eb);
> -     spin_unlock(&fs_info->buffer_lock);
> +     spin_unlock_irq(&eb_info->buffer_lock);
>       radix_tree_preload_end();
>       if (ret == -EEXIST) {
> -             exists = find_extent_buffer(fs_info, start);
> +             exists = find_extent_buffer(eb_info, start);
>               if (exists)
>                       goto free_eb;
>               else
> @@ -4959,31 +4989,10 @@ again:
>       check_buffer_tree_ref(eb);
>       set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
> 
> -     /*
> -      * there is a race where release page may have
> -      * tried to find this extent buffer in the radix
> -      * but failed.  It will tell the VM it is safe to
> -      * reclaim the, and it will clear the page private bit.
> -      * We must make sure to set the page private bit properly
> -      * after the extent buffer is in the radix tree so
> -      * it doesn't get lost
> -      */
> -     SetPageChecked(eb->pages[0]);
> -     for (i = 1; i < num_pages; i++) {
> -             p = eb->pages[i];
> -             ClearPageChecked(p);
> -             unlock_page(p);
> -     }
> -     unlock_page(eb->pages[0]);
>       return eb;
> 
>  free_eb:
>       WARN_ON(!atomic_dec_and_test(&eb->refs));
> -     for (i = 0; i < num_pages; i++) {
> -             if (eb->pages[i])
> -                     unlock_page(eb->pages[i]);
> -     }
> -
>       btrfs_release_extent_buffer(eb);
>       return exists;
>  }
> @@ -4999,17 +5008,19 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
>  /* Expects to have eb->eb_lock already held */
>  static int release_extent_buffer(struct extent_buffer *eb)
>  {
> +     struct btrfs_eb_info *eb_info = eb->eb_info;
> +
>       WARN_ON(atomic_read(&eb->refs) == 0);
>       if (atomic_dec_and_test(&eb->refs)) {
> +             if (eb_info)
> +                     list_lru_del(&eb_info->lru_list, &eb->lru);
>               if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
> -                     struct btrfs_fs_info *fs_info = eb->fs_info;
> -
>                       spin_unlock(&eb->refs_lock);
> 
> -                     spin_lock(&fs_info->buffer_lock);
> -                     radix_tree_delete(&fs_info->buffer_radix,
> -                                       eb->start >> PAGE_SHIFT);
> -                     spin_unlock(&fs_info->buffer_lock);
> +                     spin_lock_irq(&eb_info->buffer_lock);
> +                     radix_tree_delete(&eb_info->buffer_radix,
> +                                       eb_index(eb));
> +                     spin_unlock_irq(&eb_info->buffer_lock);
>               } else {
>                       spin_unlock(&eb->refs_lock);
>               }
> @@ -5024,6 +5035,8 @@ static int release_extent_buffer(struct extent_buffer *eb)
>  #endif
>               call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
>               return 1;
> +     } else if (eb_info && atomic_read(&eb->refs) == 1) {
> +             list_lru_add(&eb_info->lru_list, &eb->lru);
>       }
>       spin_unlock(&eb->refs_lock);
> 
> @@ -5057,10 +5070,6 @@ void free_extent_buffer(struct extent_buffer *eb)
>           test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
>               atomic_dec(&eb->refs);
> 
> -     /*
> -      * I know this is terrible, but it's temporary until we stop tracking
> -      * the uptodate bits and such for the extent buffers.
> -      */
>       release_extent_buffer(eb);
>  }
> 
> @@ -5078,82 +5087,163 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
>       release_extent_buffer(eb);
>  }
> 
> -void clear_extent_buffer_dirty(struct extent_buffer *eb)
> +long btrfs_nr_ebs(struct super_block *sb, struct shrink_control *sc)
>  {
> -     unsigned long i;
> -     unsigned long num_pages;
> -     struct page *page;
> +     struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> +     struct btrfs_eb_info *eb_info = fs_info->eb_info;
> 
> -     num_pages = num_extent_pages(eb->start, eb->len);
> +     return list_lru_shrink_count(&eb_info->lru_list, sc);
> +}
> 
> -     for (i = 0; i < num_pages; i++) {
> -             page = eb->pages[i];
> -             if (!PageDirty(page))
> -                     continue;
> +static enum lru_status eb_lru_isolate(struct list_head *item,
> +                                   struct list_lru_one *lru,
> +                                   spinlock_t *lru_lock, void *arg)
> +{
> +     struct list_head *freeable = (struct list_head *)arg;
> +     struct extent_buffer *eb = container_of(item, struct extent_buffer,
> +                                             lru);
> +     enum lru_status ret;
> +     int refs;
> 
> -             lock_page(page);
> -             WARN_ON(!PagePrivate(page));
> +     if (!spin_trylock(&eb->refs_lock))
> +             return LRU_SKIP;
> 
> -             clear_page_dirty_for_io(page);
> -             spin_lock_irq(&page->mapping->tree_lock);
> -             if (!PageDirty(page)) {
> -                     radix_tree_tag_clear(&page->mapping->page_tree,
> -                                             page_index(page),
> -                                             PAGECACHE_TAG_DIRTY);
> -             }
> -             spin_unlock_irq(&page->mapping->tree_lock);
> -             ClearPageError(page);
> -             unlock_page(page);
> +     if (extent_buffer_under_io(eb)) {
> +             ret = LRU_ROTATE;
> +             goto out;
> +     }
> +
> +     refs = atomic_read(&eb->refs);
> +     /* We can race with somebody freeing us, just skip if this happens. */
> +     if (refs == 0) {
> +             ret = LRU_SKIP;
> +             goto out;
> +     }
> +
> +     /* Eb is in use, don't kill it. */
> +     if (refs > 1) {
> +             ret = LRU_ROTATE;
> +             goto out;
> +     }
> +
> +     /*
> +      * If we don't clear the TREE_REF flag then this eb is going to
> +      * disappear soon anyway.  Otherwise we become responsible for dropping
> +      * the last ref on this eb and we know it'll survive until we call
> +      * dispose_list.
> +      */
> +     if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
> +             ret = LRU_SKIP;
> +             goto out;
> +     }
> +     list_lru_isolate_move(lru, &eb->lru, freeable);
> +     ret = LRU_REMOVED;
> +out:
> +     spin_unlock(&eb->refs_lock);
> +     return ret;
> +}
> +
> +static void dispose_list(struct list_head *list)
> +{
> +     struct extent_buffer *eb;
> +
> +     while (!list_empty(list)) {
> +             eb = list_first_entry(list, struct extent_buffer, lru);
> +
> +             spin_lock(&eb->refs_lock);
> +             list_del_init(&eb->lru);
> +             spin_unlock(&eb->refs_lock);
> +             free_extent_buffer(eb);
> +             cond_resched();
>       }
> +}
> +
> +long btrfs_free_ebs(struct super_block *sb, struct shrink_control *sc)
> +{
> +     struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> +     struct btrfs_eb_info *eb_info = fs_info->eb_info;
> +     LIST_HEAD(freeable);
> +     long freed;
> +
> +     freed = list_lru_shrink_walk(&eb_info->lru_list, sc, eb_lru_isolate,
> +                                  &freeable);
> +     dispose_list(&freeable);
> +     return freed;
> +}
> +
> +#define MAX_EVICT_COUNT 1024
> +void btrfs_invalidate_eb_info(struct btrfs_eb_info *eb_info)
> +{
> +     LIST_HEAD(freeable);
> +     unsigned long count;
> +
> +     /*
> +      * Evict in batches so we don't lockup the system trying to evict
> +      * memory.
> +      */
> +     do {
> +             count = list_lru_walk(&eb_info->lru_list, eb_lru_isolate,
> +                                   &freeable, MAX_EVICT_COUNT);
> +             cond_resched();
> +     } while (count);
> +     dispose_list(&freeable);
> +     synchronize_rcu();
> +}
> +
> +int clear_extent_buffer_dirty(struct extent_buffer *eb)
> +{
> +     struct btrfs_eb_info *eb_info = eb->eb_info;
> +     unsigned long i;
> +     unsigned long num_pages;
> +
> +     if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
> +             return 0;
> +
> +     spin_lock_irq(&eb_info->buffer_lock);
> +     radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
> +                          PAGECACHE_TAG_DIRTY);
> +     spin_unlock_irq(&eb_info->buffer_lock);
> +
> +     num_pages = num_extent_pages(eb->start, eb->len);
> +     for (i = 0; i < num_pages; i++)
> +             account_metadata_cleaned(eb->pages[i], &eb_info->fs_info->bdi);
>       WARN_ON(atomic_read(&eb->refs) == 0);
> +     return 1;
>  }
> 
>  int set_extent_buffer_dirty(struct extent_buffer *eb)
>  {
> +     struct btrfs_eb_info *eb_info = eb->eb_info;
>       unsigned long i;
>       unsigned long num_pages;
>       int was_dirty = 0;
> 
>       check_buffer_tree_ref(eb);
> 
> -     was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
> -
> -     num_pages = num_extent_pages(eb->start, eb->len);
>       WARN_ON(atomic_read(&eb->refs) == 0);
>       WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
> +     if (test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
> +             return 1;
> 
> +     num_pages = num_extent_pages(eb->start, eb->len);
>       for (i = 0; i < num_pages; i++)
> -             set_page_dirty(eb->pages[i]);
> +             account_metadata_dirtied(eb->pages[i],
> +                                      &eb->eb_info->fs_info->bdi);
> +     spin_lock_irq(&eb_info->buffer_lock);
> +     radix_tree_tag_set(&eb_info->buffer_radix, eb_index(eb),
> +                        PAGECACHE_TAG_DIRTY);
> +     spin_unlock_irq(&eb_info->buffer_lock);
>       return was_dirty;
>  }
> 
>  void clear_extent_buffer_uptodate(struct extent_buffer *eb)
>  {
> -     unsigned long i;
> -     struct page *page;
> -     unsigned long num_pages;
> -
>       clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
> -     num_pages = num_extent_pages(eb->start, eb->len);
> -     for (i = 0; i < num_pages; i++) {
> -             page = eb->pages[i];
> -             if (page)
> -                     ClearPageUptodate(page);
> -     }
>  }
> 
>  void set_extent_buffer_uptodate(struct extent_buffer *eb)
>  {
> -     unsigned long i;
> -     struct page *page;
> -     unsigned long num_pages;
> -
>       set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
> -     num_pages = num_extent_pages(eb->start, eb->len);
> -     for (i = 0; i < num_pages; i++) {
> -             page = eb->pages[i];
> -             SetPageUptodate(page);
> -     }
>  }
> 
>  int extent_buffer_uptodate(struct extent_buffer *eb)
> @@ -5161,103 +5251,166 @@ int extent_buffer_uptodate(struct extent_buffer *eb)
>       return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
>  }
> 
> -int read_extent_buffer_pages(struct extent_io_tree *tree,
> -                          struct extent_buffer *eb, int wait,
> -                          get_extent_t *get_extent, int mirror_num)
> +static void end_bio_extent_buffer_readpage(struct bio *bio)
>  {
> +     struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
> +     struct extent_io_tree *tree = NULL;
> +     struct bio_vec *bvec;
> +     u64 unlock_start = 0, unlock_len = 0;
> +     int mirror_num = io_bio->mirror_num;
> +     int uptodate = !bio->bi_error;
> +     int i, ret;
> +
> +     bio_for_each_segment_all(bvec, bio, i) {
> +             struct page *page = bvec->bv_page;
> +             struct btrfs_eb_info *eb_info;
> +             struct extent_buffer *eb;
> +
> +             eb = (struct extent_buffer *)page->private;
> +             if (WARN_ON(!eb))
> +                     continue;
> +
> +             eb_info = eb->eb_info;
> +             if (!tree)
> +                     tree = &eb_info->io_tree;
> +             if (uptodate) {
> +                     /*
> +                      * btree_readpage_end_io_hook doesn't care about
> +                      * start/end so just pass 0.  We'll kill this later.
> +                      */
> +                     ret = tree->ops->readpage_end_io_hook(io_bio, 0,
> +                                                           page, 0, 0,
> +                                                           mirror_num);
> +                     if (ret) {
> +                             uptodate = 0;
> +                     } else {
> +                             u64 start = eb->start;
> +                             int c, num_pages;
> +
> +                             num_pages = num_extent_pages(eb->start,
> +                                                          eb->len);
> +                             for (c = 0; c < num_pages; c++) {
> +                                     if (eb->pages[c] == page)
> +                                             break;
> +                                     start += PAGE_SIZE;
> +                             }
> +                             clean_io_failure(eb_info->fs_info,
> +                                              &eb_info->io_failure_tree,
> +                                              tree, start, page, 0, 0);
> +                     }
> +             }
> +             /*
> +              * We never fix anything in btree_io_failed_hook.
> +              *
> +              * TODO: rework the io failed hook to not assume we can fix
> +              * anything.
> +              */
> +             if (!uptodate)
> +                     tree->ops->readpage_io_failed_hook(page, mirror_num);
> +
> +             if (unlock_start == 0) {
> +                     unlock_start = eb->start;
> +                     unlock_len = PAGE_SIZE;
> +             } else {
> +                     unlock_len += PAGE_SIZE;
> +             }
> +     }
> +
> +     if (unlock_start)
> +             unlock_extent(tree, unlock_start,
> +                           unlock_start + unlock_len - 1);
> +     if (io_bio->end_io)
> +             io_bio->end_io(io_bio, bio->bi_error);
> +     bio_put(bio);
> +}
> +
> +int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
> +                          int mirror_num)
> +{
> +     struct btrfs_eb_info *eb_info = eb->eb_info;
> +     struct extent_io_tree *io_tree = &eb_info->io_tree;
> +     struct block_device *bdev = eb_info->fs_info->fs_devices->latest_bdev;
> +     struct bio *bio = NULL;
> +     u64 offset = eb->start;
> +     u64 unlock_start = 0, unlock_len = 0;
>       unsigned long i;
>       struct page *page;
>       int err;
>       int ret = 0;
> -     int locked_pages = 0;
> -     int all_uptodate = 1;
>       unsigned long num_pages;
> -     unsigned long num_reads = 0;
> -     struct bio *bio = NULL;
> -     unsigned long bio_flags = 0;
> 
>       if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
>               return 0;
> 
> -     num_pages = num_extent_pages(eb->start, eb->len);
> -     for (i = 0; i < num_pages; i++) {
> -             page = eb->pages[i];
> -             if (wait == WAIT_NONE) {
> -                     if (!trylock_page(page))
> -                             goto unlock_exit;
> -             } else {
> -                     lock_page(page);
> -             }
> -             locked_pages++;
> -             if (!PageUptodate(page)) {
> -                     num_reads++;
> -                     all_uptodate = 0;
> -             }
> -     }
> -     if (all_uptodate) {
> -             set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
> -             goto unlock_exit;
> +     if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) {
> +             if (wait != WAIT_COMPLETE)
> +                     return 0;
> +             wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
> +                            TASK_UNINTERRUPTIBLE);
> +             if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
> +                     ret = -EIO;
> +             return ret;
>       }
> 
> +     lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
> +     num_pages = num_extent_pages(eb->start, eb->len);
>       clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
>       eb->read_mirror = 0;
> -     atomic_set(&eb->io_pages, num_reads);
> +     atomic_set(&eb->io_pages, num_pages);
>       for (i = 0; i < num_pages; i++) {
>               page = eb->pages[i];
> -
> -             if (!PageUptodate(page)) {
> -                     if (ret) {
> -                             atomic_dec(&eb->io_pages);
> -                             unlock_page(page);
> -                             continue;
> +             if (ret) {
> +                     unlock_len += PAGE_SIZE;
> +                     if (atomic_dec_and_test(&eb->io_pages)) {
> +                             clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
> +                             smp_mb__after_atomic();
> +                             wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
>                       }
> +                     continue;
> +             }
> 
> -                     ClearPageError(page);
> -                     err = __extent_read_full_page(tree, page,
> -                                                   get_extent, &bio,
> -                                                   mirror_num, &bio_flags,
> -                                                   REQ_META);
> -                     if (err) {
> -                             ret = err;
> -                             /*
> -                              * We use &bio in above __extent_read_full_page,
> -                              * so we ensure that if it returns error, the
> -                              * current page fails to add itself to bio and
> -                              * it's been unlocked.
> -                              *
> -                              * We must dec io_pages by ourselves.
> -                              */
> -                             atomic_dec(&eb->io_pages);
> +             err = submit_extent_page(REQ_OP_READ, REQ_META, io_tree, NULL,
> +                                      page, offset >> 9, PAGE_SIZE, 0, bdev,
> +                                      &bio, -1,
> +                                      end_bio_extent_buffer_readpage,
> +                                      mirror_num, 0, 0, false);
> +             if (err) {
> +                     ret = err;
> +                     /*
> +                      * We use &bio in above submit_extent_page
> +                      * so we ensure that if it returns error, the
> +                      * current page fails to add itself to bio and
> +                      * it's been unlocked.
> +                      *
> +                      * We must dec io_pages by ourselves.
> +                      */
> +                     if (atomic_dec_and_test(&eb->io_pages)) {
> +                             clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
> +                             smp_mb__after_atomic();
> +                             wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
>                       }
> -             } else {
> -                     unlock_page(page);
> +                     unlock_start = eb->start;

Josef, IMHO "unlock_start" should have been set to "offset". Lets say we
have 4 pages making up a metadata block and the first page was successfully
added to a bio. Assume that adding the second page to the bio resulted in
submit_extent_page() returning an error. In this scenario,
end_bio_extent_buffer_readpage() will own the responsibility of unlocking the
first 4k range in the io tree. However with "unlock_start" being set to
"eb->start", read_extent_buffer_pages() may end up unlocking the first 4k
range in the io tree.
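
Something along these lines is what I have in mind (an untested sketch of
just the error path inside the page loop above; the only change from your
patch is the "unlock_start = offset" line):

		err = submit_extent_page(REQ_OP_READ, REQ_META, io_tree, NULL,
					 page, offset >> 9, PAGE_SIZE, 0, bdev,
					 &bio, -1,
					 end_bio_extent_buffer_readpage,
					 mirror_num, 0, 0, false);
		if (err) {
			ret = err;
			if (atomic_dec_and_test(&eb->io_pages)) {
				clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
				smp_mb__after_atomic();
				wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
			}
			/*
			 * Pages before this one made it into the bio, so
			 * end_bio_extent_buffer_readpage() will unlock their
			 * part of the extent range.  Only start unlocking at
			 * the page that failed to be submitted, not at
			 * eb->start.
			 */
			unlock_start = offset;
			unlock_len = PAGE_SIZE;
		}
		offset += PAGE_SIZE;

The existing "if (ret)" check at the top of the loop then keeps adding
PAGE_SIZE to unlock_len for each remaining page, so the final
unlock_extent() call covers everything from the failed page to the end of
the eb without touching the ranges owned by the endio handler.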

> +                     unlock_len = PAGE_SIZE;
>               }
> +             offset += PAGE_SIZE;
>       }
> 
>       if (bio) {
> -             err = submit_one_bio(bio, mirror_num, bio_flags);
> +             err = submit_one_bio(bio, mirror_num, 0);
>               if (err)
>                       return err;
>       }
> 
> +     if (ret && unlock_start)
> +             unlock_extent(io_tree, unlock_start,
> +                           unlock_start + unlock_len - 1);
>       if (ret || wait != WAIT_COMPLETE)
>               return ret;
> 
> -     for (i = 0; i < num_pages; i++) {
> -             page = eb->pages[i];
> -             wait_on_page_locked(page);
> -             if (!PageUptodate(page))
> -                     ret = -EIO;
> -     }
> -
> -     return ret;
> -
> -unlock_exit:
> -     while (locked_pages > 0) {
> -             locked_pages--;
> -             page = eb->pages[locked_pages];
> -             unlock_page(page);
> -     }
> +     wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
> +                    TASK_UNINTERRUPTIBLE);
> +     if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
> +             ret = -EIO;
>       return ret;
>  }
> 

-- 
chandan
