On Mon, Jun 01, 2015 at 08:52:49PM +0530, Chandan Rajendra wrote:
> In the subpagesize-blocksize scenario, a page can contain more than one
> block. So, in addition to the PagePrivate2 flag, we have to track the I/O
> status of each block of a page to reliably mark the ordered extent as complete.
>
> Signed-off-by: Chandan Rajendra <[email protected]>
> ---
> fs/btrfs/extent_io.c | 19 +--
> fs/btrfs/extent_io.h | 5 +-
> fs/btrfs/inode.c | 346 +++++++++++++++++++++++++++++++++++-------------
> fs/btrfs/ordered-data.c | 17 +++
> fs/btrfs/ordered-data.h | 4 +
> 5 files changed, 287 insertions(+), 104 deletions(-)
>
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 0110abc..55f900a 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -4545,11 +4545,10 @@ int extent_invalidatepage(struct extent_io_tree *tree,
> * to drop the page.
> */
> static int try_release_extent_state(struct extent_map_tree *map,
> - struct extent_io_tree *tree,
> - struct page *page, gfp_t mask)
> + struct extent_io_tree *tree,
> + struct page *page, u64 start, u64 end,
> + gfp_t mask)
> {
> - u64 start = page_offset(page);
> - u64 end = start + PAGE_CACHE_SIZE - 1;
> int ret = 1;
>
> if (test_range_bit(tree, start, end,
> @@ -4583,12 +4582,12 @@ static int try_release_extent_state(struct
> extent_map_tree *map,
> * map records are removed
> */
> int try_release_extent_mapping(struct extent_map_tree *map,
> - struct extent_io_tree *tree, struct page *page,
> - gfp_t mask)
> + struct extent_io_tree *tree, struct page *page,
> + u64 start, u64 end, gfp_t mask)
> {
> struct extent_map *em;
> - u64 start = page_offset(page);
> - u64 end = start + PAGE_CACHE_SIZE - 1;
> + u64 orig_start = start;
> + u64 orig_end = end;
>
> if ((mask & __GFP_WAIT) &&
> page->mapping->host->i_size > 16 * 1024 * 1024) {
> @@ -4622,7 +4621,9 @@ int try_release_extent_mapping(struct extent_map_tree
> *map,
> free_extent_map(em);
> }
> }
> - return try_release_extent_state(map, tree, page, mask);
> + return try_release_extent_state(map, tree, page,
> + orig_start, orig_end,
> + mask);
> }
>
> /*
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 8fe5ac3..c629e53 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -217,8 +217,9 @@ typedef struct extent_map *(get_extent_t)(struct inode
> *inode,
> void extent_io_tree_init(struct extent_io_tree *tree,
> struct address_space *mapping);
> int try_release_extent_mapping(struct extent_map_tree *map,
> - struct extent_io_tree *tree, struct page *page,
> - gfp_t mask);
> + struct extent_io_tree *tree, struct page *page,
> + u64 start, u64 end,
> + gfp_t mask);
> int try_release_extent_buffer(struct page *page);
> int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
> int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index bff60c6..bfffc62 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -2990,56 +2990,115 @@ static void finish_ordered_fn(struct btrfs_work
> *work)
> btrfs_finish_ordered_io(ordered_extent);
> }
>
> -static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
> - struct extent_state *state, int uptodate)
> +static void mark_blks_io_complete(struct btrfs_ordered_extent *ordered,
> + u64 blk, u64 nr_blks, int uptodate)
> {
> - struct inode *inode = page->mapping->host;
> + struct inode *inode = ordered->inode;
> struct btrfs_root *root = BTRFS_I(inode)->root;
> - struct btrfs_ordered_extent *ordered_extent = NULL;
> struct btrfs_workqueue *wq;
> btrfs_work_func_t func;
> - u64 ordered_start, ordered_end;
> int done;
>
> - trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
> + while (nr_blks--) {
> + if (test_and_set_bit(blk, ordered->blocks_done)) {
> + blk++;
> + continue;
> + }
>
> - ClearPagePrivate2(page);
> -loop:
> - ordered_extent = btrfs_lookup_ordered_range(inode, start,
> - end - start + 1);
> - if (!ordered_extent)
> - goto out;
> + done = btrfs_dec_test_ordered_pending(inode, &ordered,
> + ordered->file_offset
> + + (blk <<
> inode->i_sb->s_blocksize_bits),
> + root->sectorsize,
> + uptodate);
> + if (done) {
> + if (btrfs_is_free_space_inode(inode)) {
> + wq = root->fs_info->endio_freespace_worker;
> + func = btrfs_freespace_write_helper;
> + } else {
> + wq = root->fs_info->endio_write_workers;
> + func = btrfs_endio_write_helper;
> + }
>
> - ordered_start = max_t(u64, start, ordered_extent->file_offset);
> - ordered_end = min_t(u64, end,
> - ordered_extent->file_offset + ordered_extent->len - 1);
> -
> - done = btrfs_dec_test_ordered_pending(inode, &ordered_extent,
> - ordered_start,
> - ordered_end - ordered_start + 1,
> - uptodate);
> - if (done) {
> - if (btrfs_is_free_space_inode(inode)) {
> - wq = root->fs_info->endio_freespace_worker;
> - func = btrfs_freespace_write_helper;
> - } else {
> - wq = root->fs_info->endio_write_workers;
> - func = btrfs_endio_write_helper;
> + btrfs_init_work(&ordered->work, func,
> + finish_ordered_fn, NULL, NULL);
> + btrfs_queue_work(wq, &ordered->work);
> }
>
> - btrfs_init_work(&ordered_extent->work, func,
> - finish_ordered_fn, NULL, NULL);
> - btrfs_queue_work(wq, &ordered_extent->work);
> + blk++;
> }
> +}
>
> - btrfs_put_ordered_extent(ordered_extent);
> +int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
> + struct extent_state *state, int uptodate)
> +{
> + struct inode *inode = page->mapping->host;
> + struct btrfs_root *root = BTRFS_I(inode)->root;
> + struct btrfs_ordered_extent *ordered_extent = NULL;
> + u64 blk, nr_blks;
> + int clear;
>
> - start = ordered_end + 1;
> + trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
>
> - if (start < end)
> - goto loop;
> + while (start < end) {
> + ordered_extent = btrfs_lookup_ordered_extent(inode, start);
> + if (!ordered_extent) {
> + start += root->sectorsize;
> + continue;
> + }
> +
> + blk = (start - ordered_extent->file_offset)
> + >> inode->i_sb->s_blocksize_bits;
> +
> + nr_blks = (min(end, ordered_extent->file_offset +
> ordered_extent->len - 1)
> + + 1 - start) >> inode->i_sb->s_blocksize_bits;
> +
> + BUG_ON(!nr_blks);
> +
> + mark_blks_io_complete(ordered_extent, blk, nr_blks, uptodate);
The range [start, end] is surely contiguous, so why are we processing the
blocks one by one in mark_blks_io_complete()?
The same question applies to the invalidatepage() path, i.e. the per-block
loop in invalidate_ordered_extent_blocks().
Thanks,
-liubo
> +
> + start = ordered_extent->file_offset + ordered_extent->len;
> +
> + btrfs_put_ordered_extent(ordered_extent);
> + }
> +
> + start = page_offset(page);
> + end = start + PAGE_CACHE_SIZE - 1;
> + clear = 1;
> +
> + while (start < end) {
> + ordered_extent = btrfs_lookup_ordered_extent(inode, start);
> + if (!ordered_extent) {
> + start += root->sectorsize;
> + continue;
> + }
> +
> + blk = (start - ordered_extent->file_offset)
> + >> inode->i_sb->s_blocksize_bits;
> + nr_blks = (min(end, ordered_extent->file_offset +
> ordered_extent->len - 1)
> + + 1 - start) >> inode->i_sb->s_blocksize_bits;
> +
> + BUG_ON(!nr_blks);
> +
> + while (nr_blks--) {
> + if (!test_bit(blk++, ordered_extent->blocks_done)) {
> + clear = 0;
> + break;
> + }
> + }
> +
> + if (!clear) {
> + btrfs_put_ordered_extent(ordered_extent);
> + break;
> + }
> +
> + start += ordered_extent->len;
> +
> + btrfs_put_ordered_extent(ordered_extent);
> + }
> +
> + if (clear)
> + ClearPagePrivate2(page);
>
> -out:
> return 0;
> }
>
> @@ -8472,7 +8531,9 @@ btrfs_readpages(struct file *file, struct address_space
> *mapping,
> return extent_readpages(tree, mapping, pages, nr_pages,
> btrfs_get_extent);
> }
> -static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
> +
> +static int __btrfs_releasepage(struct page *page, u64 start, u64 end,
> + gfp_t gfp_flags)
> {
> struct extent_io_tree *tree;
> struct extent_map_tree *map;
> @@ -8480,31 +8541,149 @@ static int __btrfs_releasepage(struct page *page,
> gfp_t gfp_flags)
>
> tree = &BTRFS_I(page->mapping->host)->io_tree;
> map = &BTRFS_I(page->mapping->host)->extent_tree;
> - ret = try_release_extent_mapping(map, tree, page, gfp_flags);
> - if (ret == 1)
> +
> + ret = try_release_extent_mapping(map, tree, page, start, end,
> + gfp_flags);
> + if ((ret == 1) && ((end - start + 1) == PAGE_CACHE_SIZE)) {
> clear_page_extent_mapped(page);
> + } else {
> + ret = 0;
> + }
>
> return ret;
> }
>
> static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
> {
> + u64 start = page_offset(page);
> + u64 end = start + PAGE_CACHE_SIZE - 1;
> +
> if (PageWriteback(page) || PageDirty(page))
> return 0;
> - return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
> +
> + return __btrfs_releasepage(page, start, end, gfp_flags & GFP_NOFS);
> +}
> +
> +static void invalidate_ordered_extent_blocks(struct inode *inode,
> + struct btrfs_ordered_extent *ordered,
> + u64 locked_start, u64 locked_end,
> + u64 cur,
> + int inode_evicting)
> +{
> + struct btrfs_root *root = BTRFS_I(inode)->root;
> + struct btrfs_ordered_inode_tree *ordered_tree;
> + struct extent_io_tree *tree;
> + u64 blk, blk_done, nr_blks;
> + u64 end;
> + u64 new_len;
> +
> + tree = &BTRFS_I(inode)->io_tree;
> +
> + end = min(locked_end, ordered->file_offset + ordered->len - 1);
> +
> + if (!inode_evicting) {
> + clear_extent_bit(tree, cur, end,
> + EXTENT_DIRTY | EXTENT_DELALLOC |
> + EXTENT_DO_ACCOUNTING |
> + EXTENT_DEFRAG, 1, 0, NULL,
> + GFP_NOFS);
> + unlock_extent(tree, locked_start, locked_end);
> + }
> +
> +
> + ordered_tree = &BTRFS_I(inode)->ordered_tree;
> + spin_lock_irq(&ordered_tree->lock);
> + set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
> + new_len = cur - ordered->file_offset;
> + if (new_len < ordered->truncated_len)
> + ordered->truncated_len = new_len;
> +
> + blk = (cur - ordered->file_offset) >> inode->i_sb->s_blocksize_bits;
> + nr_blks = (end + 1 - cur) >> inode->i_sb->s_blocksize_bits;
> +
> + while (nr_blks--) {
> + blk_done = !test_and_set_bit(blk, ordered->blocks_done);
> + if (blk_done) {
> + spin_unlock_irq(&ordered_tree->lock);
> + if (btrfs_dec_test_ordered_pending(inode, &ordered,
> +
> ordered->file_offset + (blk << inode->i_sb->s_blocksize_bits),
> +
> root->sectorsize,
> + 1))
> + btrfs_finish_ordered_io(ordered);
> +
> + spin_lock_irq(&ordered_tree->lock);
> + }
> + blk++;
> + }
> +
> + spin_unlock_irq(&ordered_tree->lock);
> +
> + if (!inode_evicting)
> + lock_extent_bits(tree, locked_start, locked_end, 0, NULL);
> +}
> +
> +static int page_blocks_written(struct page *page)
> +{
> + struct btrfs_ordered_extent *ordered;
> + struct btrfs_root *root;
> + struct inode *inode;
> + unsigned long outstanding_blk;
> + u64 page_start, page_end;
> + u64 blk, last_blk, nr_blks;
> + u64 cur;
> + u64 len;
> +
> + inode = page->mapping->host;
> + root = BTRFS_I(inode)->root;
> +
> + page_start = page_offset(page);
> + page_end = page_start + PAGE_CACHE_SIZE - 1;
> +
> + cur = page_start;
> + while (cur < page_end) {
> + ordered = btrfs_lookup_ordered_extent(inode, cur);
> + if (!ordered) {
> + cur += root->sectorsize;
> + continue;
> + }
> +
> + blk = (cur - ordered->file_offset)
> + >> inode->i_sb->s_blocksize_bits;
> + len = min(page_end, ordered->file_offset + ordered->len - 1)
> + - cur + 1;
> + nr_blks = len >> inode->i_sb->s_blocksize_bits;
> +
> + last_blk = blk + nr_blks - 1;
> +
> + outstanding_blk = find_next_zero_bit(ordered->blocks_done,
> + ordered->len >>
> inode->i_sb->s_blocksize_bits,
> + blk);
> + if (outstanding_blk <= last_blk) {
> + btrfs_put_ordered_extent(ordered);
> + return 0;
> + }
> +
> + btrfs_put_ordered_extent(ordered);
> + cur += len;
> + }
> +
> + return 1;
> }
>
> static void btrfs_invalidatepage(struct page *page, unsigned int offset,
> - unsigned int length)
> + unsigned int length)
> {
> struct inode *inode = page->mapping->host;
> + struct btrfs_root *root = BTRFS_I(inode)->root;
> struct extent_io_tree *tree;
> struct btrfs_ordered_extent *ordered;
> - struct extent_state *cached_state = NULL;
> - u64 page_start = page_offset(page);
> - u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
> + u64 start, end, cur;
> + u64 page_start, page_end;
> int inode_evicting = inode->i_state & I_FREEING;
>
> + page_start = page_offset(page);
> + page_end = page_start + PAGE_CACHE_SIZE - 1;
> +
> /*
> * we have the page locked, so new writeback can't start,
> * and the dirty bit won't be cleared while we are here.
> @@ -8515,73 +8694,54 @@ static void btrfs_invalidatepage(struct page *page,
> unsigned int offset,
> wait_on_page_writeback(page);
>
> tree = &BTRFS_I(inode)->io_tree;
> - if (offset) {
> +
> + start = round_up(offset, root->sectorsize);
> + end = round_down(offset + length, root->sectorsize) - 1;
> + if (end - start + 1 < root->sectorsize) {
> btrfs_releasepage(page, GFP_NOFS);
> return;
> }
>
> + start = round_up(page_start + offset, root->sectorsize);
> + end = round_down(page_start + offset + length,
> + root->sectorsize) - 1;
> +
> if (!inode_evicting)
> - lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
> - ordered = btrfs_lookup_ordered_range(inode, page_start,
> PAGE_CACHE_SIZE);
> - if (ordered) {
> - /*
> - * IO on this page will never be started, so we need
> - * to account for any ordered extents now
> - */
> - if (!inode_evicting)
> - clear_extent_bit(tree, page_start, page_end,
> - EXTENT_DIRTY | EXTENT_DELALLOC |
> - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
> - EXTENT_DEFRAG, 1, 0, &cached_state,
> - GFP_NOFS);
> - /*
> - * whoever cleared the private bit is responsible
> - * for the finish_ordered_io
> - */
> - if (TestClearPagePrivate2(page)) {
> - struct btrfs_ordered_inode_tree *tree;
> - u64 new_len;
> + lock_extent_bits(tree, start, end, 0, NULL);
>
> - tree = &BTRFS_I(inode)->ordered_tree;
> + cur = start;
> + while (cur < end) {
> + ordered = btrfs_lookup_ordered_extent(inode, cur);
> + if (!ordered) {
> + cur += root->sectorsize;
> + continue;
> + }
>
> - spin_lock_irq(&tree->lock);
> - set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
> - new_len = page_start - ordered->file_offset;
> - if (new_len < ordered->truncated_len)
> - ordered->truncated_len = new_len;
> - spin_unlock_irq(&tree->lock);
> + invalidate_ordered_extent_blocks(inode, ordered,
> + start, end, cur,
> + inode_evicting);
>
> - if (btrfs_dec_test_ordered_pending(inode, &ordered,
> - page_start,
> - PAGE_CACHE_SIZE, 1))
> - btrfs_finish_ordered_io(ordered);
> - }
> + cur = min(end + 1, ordered->file_offset + ordered->len);
> btrfs_put_ordered_extent(ordered);
> - if (!inode_evicting) {
> - cached_state = NULL;
> - lock_extent_bits(tree, page_start, page_end, 0,
> - &cached_state);
> - }
> }
>
> - if (!inode_evicting) {
> - clear_extent_bit(tree, page_start, page_end,
> - EXTENT_LOCKED | EXTENT_DIRTY |
> - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
> - EXTENT_DEFRAG, 1, 1,
> - &cached_state, GFP_NOFS);
> + if (page_blocks_written(page))
> + ClearPagePrivate2(page);
>
> - __btrfs_releasepage(page, GFP_NOFS);
> + if (!inode_evicting) {
> + clear_extent_bit(tree, start, end,
> + EXTENT_LOCKED | EXTENT_DIRTY |
> + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
> + EXTENT_DEFRAG, 1, 1, NULL, GFP_NOFS);
> }
>
> - ClearPageChecked(page);
> - if (PagePrivate(page)) {
> - ClearPagePrivate(page);
> - set_page_private(page, 0);
> - page_cache_release(page);
> + if (!offset && length == PAGE_CACHE_SIZE) {
> + WARN_ON(!__btrfs_releasepage(page, start, end, GFP_NOFS));
> + ClearPageChecked(page);
> }
> }
>
> +
> /*
> * btrfs_page_mkwrite() is not allowed to change the file size as it gets
> * called from a page fault handler when a page is first dirtied. Hence we
> must
> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
> index 157cc54..8e614ca 100644
> --- a/fs/btrfs/ordered-data.c
> +++ b/fs/btrfs/ordered-data.c
> @@ -189,12 +189,25 @@ static int __btrfs_add_ordered_extent(struct inode
> *inode, u64 file_offset,
> struct btrfs_ordered_inode_tree *tree;
> struct rb_node *node;
> struct btrfs_ordered_extent *entry;
> + u64 nr_longs;
>
> tree = &BTRFS_I(inode)->ordered_tree;
> entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
> if (!entry)
> return -ENOMEM;
>
> + nr_longs = BITS_TO_LONGS(len >> inode->i_sb->s_blocksize_bits);
> + if (nr_longs == 1) {
> + entry->blocks_done = &entry->blocks_bitmap;
> + } else {
> + entry->blocks_done = kzalloc(nr_longs * sizeof(unsigned long),
> + GFP_NOFS);
> + if (!entry->blocks_done) {
> + kmem_cache_free(btrfs_ordered_extent_cache, entry);
> + return -ENOMEM;
> + }
> + }
> +
> entry->file_offset = file_offset;
> entry->start = start;
> entry->len = len;
> @@ -553,6 +566,10 @@ void btrfs_put_ordered_extent(struct
> btrfs_ordered_extent *entry)
> list_del(&sum->list);
> kfree(sum);
> }
> +
> + if (entry->blocks_done != &entry->blocks_bitmap)
> + kfree(entry->blocks_done);
> +
> kmem_cache_free(btrfs_ordered_extent_cache, entry);
> }
> }
> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
> index e96cd4c..4b3356a 100644
> --- a/fs/btrfs/ordered-data.h
> +++ b/fs/btrfs/ordered-data.h
> @@ -140,6 +140,10 @@ struct btrfs_ordered_extent {
> struct completion completion;
> struct btrfs_work flush_work;
> struct list_head work_list;
> +
> + /* bitmap to track the blocks that have been written to disk */
> + unsigned long *blocks_done;
> + unsigned long blocks_bitmap;
> };
>
> /*
> --
> 2.1.0
>
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html