Re: [RFC PATCH V11 02/21] Btrfs: subpagesize-blocksize: Fix whole page write.

Liu Bo Fri, 26 Jun 2015 02:52:00 -0700

On Mon, Jun 01, 2015 at 08:52:37PM +0530, Chandan Rajendra wrote:
> For the subpagesize-blocksize scenario, a page can contain multiple
> blocks. In such cases, this patch handles writing data to files.
> 
> Also, When setting EXTENT_DELALLOC, we no longer set EXTENT_UPTODATE bit on
> the extent_io_tree since uptodate status is being tracked by the bitmap
> pointed to by page->private.


To be honest, I'm not sure why we set the EXTENT_UPTODATE bit for data, since
we don't check for that bit at all at the moment. Correct me if I'm wrong.

> 
> Signed-off-by: Chandan Rajendra <[email protected]>
> ---
>  fs/btrfs/extent_io.c | 141 
> +++++++++++++++++++++++----------------------------
>  fs/btrfs/file.c      |  16 ++++++
>  fs/btrfs/inode.c     |  58 ++++++++++++++++-----
>  3 files changed, 125 insertions(+), 90 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index d37badb..3736ab5 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1283,9 +1283,8 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 
> start, u64 end,
>  int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
>                       struct extent_state **cached_state, gfp_t mask)
>  {
> -     return set_extent_bit(tree, start, end,
> -                           EXTENT_DELALLOC | EXTENT_UPTODATE,
> -                           NULL, cached_state, mask);
> +     return set_extent_bit(tree, start, end, EXTENT_DELALLOC,
> +                     NULL, cached_state, mask);
>  }
>  
>  int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
> @@ -1498,25 +1497,6 @@ int extent_range_redirty_for_io(struct inode *inode, 
> u64 start, u64 end)
>       return 0;
>  }
>  
> -/*
> - * helper function to set both pages and extents in the tree writeback
> - */
> -static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 
> end)
> -{
> -     unsigned long index = start >> PAGE_CACHE_SHIFT;
> -     unsigned long end_index = end >> PAGE_CACHE_SHIFT;
> -     struct page *page;
> -
> -     while (index <= end_index) {
> -             page = find_get_page(tree->mapping, index);
> -             BUG_ON(!page); /* Pages should be in the extent_io_tree */
> -             set_page_writeback(page);
> -             page_cache_release(page);
> -             index++;
> -     }
> -     return 0;
> -}
> -
>  /* find the first state struct with 'bits' set after 'start', and
>   * return it.  tree->lock must be held.  NULL will returned if
>   * nothing was found after 'start'
> @@ -2080,6 +2060,14 @@ static int page_read_complete(struct page *page)
>       return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
>  }
>  
> +static int page_write_complete(struct page *page)
> +{
> +     u64 start = page_offset(page);
> +     u64 end = start + PAGE_CACHE_SIZE - 1;
> +
> +     return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
> +}
> +
>  int free_io_failure(struct inode *inode, struct io_failure_record *rec)
>  {
>       int ret;
> @@ -2575,38 +2563,37 @@ int end_extent_writepage(struct page *page, int err, 
> u64 start, u64 end)
>   */
>  static void end_bio_extent_writepage(struct bio *bio, int err)
>  {
> +     struct btrfs_page_private *pg_private;
>       struct bio_vec *bvec;
> +     unsigned long flags;
>       u64 start;
>       u64 end;
> +     int clear_writeback;
>       int i;
>  
>       bio_for_each_segment_all(bvec, bio, i) {
>               struct page *page = bvec->bv_page;
>  
> -             /* We always issue full-page reads, but if some block
> -              * in a page fails to read, blk_update_request() will
> -              * advance bv_offset and adjust bv_len to compensate.
> -              * Print a warning for nonzero offsets, and an error
> -              * if they don't add up to a full page.  */
> -             if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
> -                     if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
> -                             
> btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
> -                                "partial page write in btrfs with offset %u 
> and length %u",
> -                                     bvec->bv_offset, bvec->bv_len);
> -                     else
> -                             
> btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
> -                                "incomplete page write in btrfs with offset 
> %u and "
> -                                "length %u",
> -                                     bvec->bv_offset, bvec->bv_len);
> -             }
> +             start = page_offset(page) + bvec->bv_offset;
> +             end = start + bvec->bv_len - 1;
>  
> -             start = page_offset(page);
> -             end = start + bvec->bv_offset + bvec->bv_len - 1;
> +             pg_private = (struct btrfs_page_private *)page->private;
> +
> +             spin_lock_irqsave(&pg_private->io_lock, flags);
>  
> -             if (end_extent_writepage(page, err, start, end))
> +             if (end_extent_writepage(page, err, start, end)) {
> +                     spin_unlock_irqrestore(&pg_private->io_lock, flags);
>                       continue;
> +             }
>  
> -             end_page_writeback(page);
> +             clear_page_blks_state(page, 1 << BLK_STATE_IO, start, end);
> +
> +             clear_writeback = page_write_complete(page);
> +
> +             spin_unlock_irqrestore(&pg_private->io_lock, flags);
> +
> +             if (clear_writeback)
> +                     end_page_writeback(page);
>       }
>  
>       bio_put(bio);
> @@ -3417,10 +3404,9 @@ static noinline_for_stack int 
> __extent_writepage_io(struct inode *inode,
>       u64 block_start;
>       u64 iosize;
>       sector_t sector;
> -     struct extent_state *cached_state = NULL;
>       struct extent_map *em;
>       struct block_device *bdev;
> -     size_t pg_offset = 0;
> +     size_t pg_offset;
>       size_t blocksize;
>       int ret = 0;
>       int nr = 0;
> @@ -3467,8 +3453,16 @@ static noinline_for_stack int 
> __extent_writepage_io(struct inode *inode,
>                                                        page_end, NULL, 1);
>                       break;
>               }
> -             em = epd->get_extent(inode, page, pg_offset, cur,
> -                                  end - cur + 1, 1);
> +
> +             pg_offset = cur & (PAGE_CACHE_SIZE - 1);
> +
> +             if (!test_page_blks_state(page, BLK_STATE_DIRTY, cur,
> +                                             cur + blocksize - 1, 1)) {
> +                     cur += blocksize;
> +                     continue;
> +             }

If we don't check this, the get_extent() call below will return a HOLE
(block_start == EXTENT_MAP_HOLE) and we can still move on to the next block,
so we don't need to maintain this BLK_STATE_DIRTY bit the whole time.

> +
> +             em = epd->get_extent(inode, page, pg_offset, cur, blocksize, 1);
>               if (IS_ERR_OR_NULL(em)) {
>                       SetPageError(page);
>                       ret = PTR_ERR_OR_ZERO(em);
> @@ -3479,7 +3473,7 @@ static noinline_for_stack int 
> __extent_writepage_io(struct inode *inode,
>               em_end = extent_map_end(em);
>               BUG_ON(em_end <= cur);
>               BUG_ON(end < cur);
> -             iosize = min(em_end - cur, end - cur + 1);
> +             iosize = min_t(u64, em_end - cur, blocksize);
>               iosize = ALIGN(iosize, blocksize);

This limits us to writing one block per loop iteration. If two blocks are
contiguous, it should be fine to write them together.

>               sector = (em->block_start + extent_offset) >> 9;
>               bdev = em->bdev;
> @@ -3488,32 +3482,20 @@ static noinline_for_stack int 
> __extent_writepage_io(struct inode *inode,
>               free_extent_map(em);
>               em = NULL;
>  
> -             /*
> -              * compressed and inline extents are written through other
> -              * paths in the FS
> -              */
> -             if (compressed || block_start == EXTENT_MAP_HOLE ||
> -                 block_start == EXTENT_MAP_INLINE) {
> -                     /*
> -                      * end_io notification does not happen here for
> -                      * compressed extents
> -                      */
> -                     if (!compressed && tree->ops &&
> -                         tree->ops->writepage_end_io_hook)
> -                             tree->ops->writepage_end_io_hook(page, cur,
> -                                                      cur + iosize - 1,
> -                                                      NULL, 1);
> -                     else if (compressed) {
> -                             /* we don't want to end_page_writeback on
> -                              * a compressed extent.  this happens
> -                              * elsewhere
> -                              */
> -                             nr++;
> -                     }
> +             BUG_ON(compressed);
> +             BUG_ON(block_start == EXTENT_MAP_INLINE);
>  
> -                     cur += iosize;
> -                     pg_offset += iosize;
> -                     continue;
> +             if (block_start == EXTENT_MAP_HOLE) {
> +                     if (test_page_blks_state(page, BLK_STATE_UPTODATE, cur,
> +                                                     cur + iosize - 1, 1)) {
> +                             clear_page_blks_state(page,
> +                                             1 << BLK_STATE_DIRTY, cur,
> +                                             cur + iosize - 1);
> +                             cur += iosize;
> +                             continue;
> +                     } else {
> +                             BUG();
> +                     }
>               }
>  
>               if (tree->ops && tree->ops->writepage_io_hook) {
> @@ -3527,7 +3509,13 @@ static noinline_for_stack int 
> __extent_writepage_io(struct inode *inode,
>               } else {
>                       unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
>  
> -                     set_range_writeback(tree, cur, cur + iosize - 1);
> +                     clear_page_blks_state(page, 1 << BLK_STATE_DIRTY, cur,
> +                                     cur + iosize - 1);
> +                     set_page_writeback(page);
> +
> +                     set_page_blks_state(page, 1 << BLK_STATE_IO, cur,
> +                                     cur + iosize - 1);
> +
>                       if (!PageWriteback(page)) {
>                               btrfs_err(BTRFS_I(inode)->root->fs_info,
>                                          "page %lu not writeback, cur %llu 
> end %llu",
> @@ -3542,17 +3530,14 @@ static noinline_for_stack int 
> __extent_writepage_io(struct inode *inode,
>                       if (ret)
>                               SetPageError(page);
>               }
> -             cur = cur + iosize;
> -             pg_offset += iosize;
> +
> +             cur += iosize;
>               nr++;
>       }
>  done:
>       *nr_ret = nr;
>  
>  done_unlocked:
> -
> -     /* drop our reference on any cached states */
> -     free_extent_state(cached_state);
>       return ret;
>  }
>  
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 23b6e03..cbe6381 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -495,6 +495,9 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct 
> inode *inode,
>       u64 num_bytes;
>       u64 start_pos;
>       u64 end_of_last_block;
> +     u64 start;
> +     u64 end;
> +     u64 page_end;
>       u64 end_pos = pos + write_bytes;
>       loff_t isize = i_size_read(inode);
>  
> @@ -507,11 +510,24 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct 
> inode *inode,
>       if (err)
>               return err;
>  
> +     start = start_pos;
> +
>       for (i = 0; i < num_pages; i++) {
>               struct page *p = pages[i];
>               SetPageUptodate(p);
>               ClearPageChecked(p);
> +
> +             end = page_end = page_offset(p) + PAGE_CACHE_SIZE - 1;
> +
> +             if (i == num_pages - 1)
> +                     end = min_t(u64, page_end, end_of_last_block);
> +
> +             set_page_blks_state(p,
> +                             1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE,
> +                             start, end);
>               set_page_dirty(p);
> +
> +             start = page_end + 1;

This is not the usual way of doing it; page_end is unnecessary, and
(start += PAGE_CACHE_SIZE) should work.

>       }
>  
>       /*
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 8262f83..ac6a3f3 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1995,6 +1995,11 @@ again:
>        }
>  
>       btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
> +
> +     set_page_blks_state(page,
> +                     1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE,
> +                     page_start, page_end);
> +
>       ClearPageChecked(page);
>       set_page_dirty(page);
>  out:
> @@ -2984,26 +2989,48 @@ static int btrfs_writepage_end_io_hook(struct page 
> *page, u64 start, u64 end,
>       struct btrfs_ordered_extent *ordered_extent = NULL;
>       struct btrfs_workqueue *wq;
>       btrfs_work_func_t func;
> +     u64 ordered_start, ordered_end;
> +     int done;
>  
>       trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
>  
>       ClearPagePrivate2(page);
> -     if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
> -                                         end - start + 1, uptodate))
> -             return 0;
> +loop:
> +     ordered_extent = btrfs_lookup_ordered_range(inode, start,
> +                                             end - start + 1);
> +     if (!ordered_extent)
> +             goto out;
>  
> -     if (btrfs_is_free_space_inode(inode)) {
> -             wq = root->fs_info->endio_freespace_worker;
> -             func = btrfs_freespace_write_helper;
> -     } else {
> -             wq = root->fs_info->endio_write_workers;
> -             func = btrfs_endio_write_helper;
> +     ordered_start = max_t(u64, start, ordered_extent->file_offset);
> +     ordered_end = min_t(u64, end,
> +                     ordered_extent->file_offset + ordered_extent->len - 1);
> +
> +     done = btrfs_dec_test_ordered_pending(inode, &ordered_extent,
> +                                     ordered_start,
> +                                     ordered_end - ordered_start + 1,
> +                                     uptodate);
> +     if (done) {
> +             if (btrfs_is_free_space_inode(inode)) {
> +                     wq = root->fs_info->endio_freespace_worker;
> +                     func = btrfs_freespace_write_helper;
> +             } else {
> +                     wq = root->fs_info->endio_write_workers;
> +                     func = btrfs_endio_write_helper;
> +             }
> +
> +             btrfs_init_work(&ordered_extent->work, func,
> +                             finish_ordered_fn, NULL, NULL);
> +             btrfs_queue_work(wq, &ordered_extent->work);
>       }
>  
> -     btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
> -                     NULL);
> -     btrfs_queue_work(wq, &ordered_extent->work);
> +     btrfs_put_ordered_extent(ordered_extent);
> +
> +     start = ordered_end + 1;
> +
> +     if (start < end)
> +             goto loop;
>  
> +out:

I see that this patch puts a BUG_ON(block_start == EXTENT_MAP_INLINE); in
writepage(), but I didn't see any code disabling inline data in patch 01 or
patch 02. Anyway, I think we can avoid the above search for ordered_extents
within a single page if we enable inline data.

Thanks,

-liubo

>       return 0;
>  }
>  
> @@ -4601,6 +4628,9 @@ again:
>               goto out_unlock;
>       }
>  
> +     set_page_blks_state(page, 1 << BLK_STATE_DIRTY | 1 << 
> BLK_STATE_UPTODATE,
> +                     page_start, page_end);
> +
>       if (offset != PAGE_CACHE_SIZE) {
>               if (!len)
>                       len = PAGE_CACHE_SIZE - offset;
> @@ -8590,6 +8620,10 @@ again:
>               ret = VM_FAULT_SIGBUS;
>               goto out_unlock;
>       }
> +
> +     set_page_blks_state(page, 1 << BLK_STATE_DIRTY | 1 << 
> BLK_STATE_UPTODATE,
> +                     page_start, end);
> +
>       ret = 0;
>  
>       /* page is wholly or partially inside EOF */
> -- 
> 2.1.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH V11 02/21] Btrfs: subpagesize-blocksize: Fix whole page write.

Reply via email to