On Friday 26 Jun 2015 17:50:54 Liu Bo wrote:
> On Mon, Jun 01, 2015 at 08:52:37PM +0530, Chandan Rajendra wrote:
> > For the subpagesize-blocksize scenario, a page can contain multiple
> > blocks. In such cases, this patch handles writing data to files.
> > 
> > Also, when setting EXTENT_DELALLOC, we no longer set the EXTENT_UPTODATE
> > bit on the extent_io_tree, since uptodate status is being tracked by the
> > bitmap pointed to by page->private.
> 
> To be honest, I'm not sure why we set the EXTENT_UPTODATE bit for data, as
> we don't check for that bit at all for now; correct me if I'm wrong.

Yes, I didn't find any code using the EXTENT_UPTODATE flag either. That is
probably because we can get away with referring to the page's PG_uptodate flag
in the blocksize == pagesize scenario. But for the subpagesize-blocksize
scenario we need BLK_STATE_UPTODATE to determine whether a page's PG_uptodate
flag can be set.
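
To make the bookkeeping concrete, here is a minimal userspace sketch of the
idea (the names mirror the patchset, but the types and bitmap handling are
simplified assumptions for illustration; this is not the kernel code):

    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_SIZE       65536   /* 64k page */
    #define BLOCK_SIZE      4096    /* 4k block */
    #define BLOCKS_PER_PAGE (PAGE_SIZE / BLOCK_SIZE)    /* 16 */

    /* Stand-in for the bitmap hanging off page->private. */
    struct page_private {
            uint32_t uptodate;      /* one bit per block: BLK_STATE_UPTODATE */
    };

    /* PG_uptodate may only be set once every block in the page is
     * uptodate; one uptodate block out of sixteen is not enough. */
    static bool page_fully_uptodate(const struct page_private *pp)
    {
            uint32_t all_blocks = (1u << BLOCKS_PER_PAGE) - 1;

            return (pp->uptodate & all_blocks) == all_blocks;
    }

With blocksize == pagesize the bitmap degenerates to a single bit, which is
exactly the information PG_uptodate already carries.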

> 
> > Signed-off-by: Chandan Rajendra <[email protected]>
> > ---
> > 
> >  fs/btrfs/extent_io.c | 141 +++++++++++++++++++++++----------------------------
> >  fs/btrfs/file.c      |  16 ++++++
> >  fs/btrfs/inode.c     |  58 ++++++++++++++++-----
> >  3 files changed, 125 insertions(+), 90 deletions(-)
> > 
> > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> > index d37badb..3736ab5 100644
> > --- a/fs/btrfs/extent_io.c
> > +++ b/fs/btrfs/extent_io.c
> > @@ -1283,9 +1283,8 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
> >  int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
> >                     struct extent_state **cached_state, gfp_t mask)
> >  {
> > -   return set_extent_bit(tree, start, end,
> > -                         EXTENT_DELALLOC | EXTENT_UPTODATE,
> > -                         NULL, cached_state, mask);
> > +   return set_extent_bit(tree, start, end, EXTENT_DELALLOC,
> > +                   NULL, cached_state, mask);
> >  }
> >  
> >  int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
> > 
> > @@ -1498,25 +1497,6 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
> >     return 0;
> >  }
> > 
> > -/*
> > - * helper function to set both pages and extents in the tree writeback
> > - */
> > -static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
> > -{
> > -   unsigned long index = start >> PAGE_CACHE_SHIFT;
> > -   unsigned long end_index = end >> PAGE_CACHE_SHIFT;
> > -   struct page *page;
> > -
> > -   while (index <= end_index) {
> > -           page = find_get_page(tree->mapping, index);
> > -           BUG_ON(!page); /* Pages should be in the extent_io_tree */
> > -           set_page_writeback(page);
> > -           page_cache_release(page);
> > -           index++;
> > -   }
> > -   return 0;
> > -}
> > -
> >  /* find the first state struct with 'bits' set after 'start', and
> >   * return it.  tree->lock must be held.  NULL will returned if
> >   * nothing was found after 'start'
> > 
> > @@ -2080,6 +2060,14 @@ static int page_read_complete(struct page *page)
> >     return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
> >  }
> > +
> > +static int page_write_complete(struct page *page)
> > +{
> > +   u64 start = page_offset(page);
> > +   u64 end = start + PAGE_CACHE_SIZE - 1;
> > +
> > +   return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
> > +}
> > +
> >  int free_io_failure(struct inode *inode, struct io_failure_record *rec)
> >  {
> >     int ret;
> > 
> > @@ -2575,38 +2563,37 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
> >   */
> >  static void end_bio_extent_writepage(struct bio *bio, int err)
> >  {
> > +   struct btrfs_page_private *pg_private;
> >     struct bio_vec *bvec;
> > +   unsigned long flags;
> >     u64 start;
> >     u64 end;
> > +   int clear_writeback;
> >     int i;
> > 
> >     bio_for_each_segment_all(bvec, bio, i) {
> >             struct page *page = bvec->bv_page;
> > 
> > -           /* We always issue full-page reads, but if some block
> > -            * in a page fails to read, blk_update_request() will
> > -            * advance bv_offset and adjust bv_len to compensate.
> > -            * Print a warning for nonzero offsets, and an error
> > -            * if they don't add up to a full page.  */
> > -           if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
> > -                   if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
> > -                           btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
> > -                              "partial page write in btrfs with offset %u and length %u",
> > -                                   bvec->bv_offset, bvec->bv_len);
> > -                   else
> > -                           btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
> > -                              "incomplete page write in btrfs with offset %u and "
> > -                              "length %u",
> > -                                   bvec->bv_offset, bvec->bv_len);
> > -           }
> > +           start = page_offset(page) + bvec->bv_offset;
> > +           end = start + bvec->bv_len - 1;
> > 
> > -           start = page_offset(page);
> > -           end = start + bvec->bv_offset + bvec->bv_len - 1;
> > +           pg_private = (struct btrfs_page_private *)page->private;
> > +
> > +           spin_lock_irqsave(&pg_private->io_lock, flags);
> > 
> > -           if (end_extent_writepage(page, err, start, end))
> > +           if (end_extent_writepage(page, err, start, end)) {
> > +                   spin_unlock_irqrestore(&pg_private->io_lock, flags);
> >                     continue;
> > +           }
> > 
> > -           end_page_writeback(page);
> > +           clear_page_blks_state(page, 1 << BLK_STATE_IO, start, end);
> > +
> > +           clear_writeback = page_write_complete(page);
> > +
> > +           spin_unlock_irqrestore(&pg_private->io_lock, flags);
> > +
> > +           if (clear_writeback)
> > +                   end_page_writeback(page);
> >     }
> > 
> >     bio_put(bio);
> > 
> > @@ -3417,10 +3404,9 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
> >     u64 block_start;
> >     u64 iosize;
> >     sector_t sector;
> > -   struct extent_state *cached_state = NULL;
> >     struct extent_map *em;
> >     struct block_device *bdev;
> > -   size_t pg_offset = 0;
> > +   size_t pg_offset;
> >     size_t blocksize;
> >     int ret = 0;
> >     int nr = 0;
> > 
> > @@ -3467,8 +3453,16 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
> >                                                      page_end, NULL, 1);
> >                     break;
> >             }
> > -           em = epd->get_extent(inode, page, pg_offset, cur,
> > -                                end - cur + 1, 1);
> > +
> > +           pg_offset = cur & (PAGE_CACHE_SIZE - 1);
> > +
> > +           if (!test_page_blks_state(page, BLK_STATE_DIRTY, cur,
> > +                                           cur + blocksize - 1, 1)) {
> > +                   cur += blocksize;
> > +                   continue;
> > +           }
> 
> If we don't check this, the get_extent() below will return a HOLE
> (block_start == EXTENT_MAP_HOLE) and we can still go on to the next block;
> then we wouldn't need to maintain this BLK_STATE_DIRTY bit at all.

Sorry, I am not sure if I understood your comment correctly. Are you
suggesting that *page blocks* that are not dirty are always holes?

Let's assume a 64k page whose contents are within i_size and none of the
blocks of the page map to a file hole. Also assume 4k as the block size. Say,
the userspace writes to the "block 0" of the page. The corresponding code in
__btrfs_buffered_write() reads up the complete page into the inode's page
cache and then marks "block 0" of the page as BLK_STATE_DIRTY. Next, the
userspace seeks and writes to "block 4" of the page. In this case, since the
page has PG_uptodate flag already set we don't read the data from the disk
again. We simply go ahead and mark "block 4" as BLK_STATE_DIRTY. As can be
seen in the example scenario, the blocks 1, 2 and 3 are not holes and hence
btrfs_get_extent() would end up returning values other than EXTENT_MAP_HOLE
for em->block_start.
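
Spelled out with a toy model (64k page, 4k blocks, block N covering bytes
N*4096 through N*4096 + 4095; illustration only, not the kernel code):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* __btrfs_buffered_write() read the whole page in before the
             * first sub-page write, so all 16 blocks are uptodate. */
            uint32_t uptodate = 0xffff;
            uint32_t dirty = 0;

            dirty |= 1u << 0;       /* userspace writes block 0 -> BLK_STATE_DIRTY */
            dirty |= 1u << 4;       /* userspace seeks, writes block 4 */

            /* Blocks 1-3 are clean yet fully mapped: get_extent() on them
             * returns a real mapping, not EXTENT_MAP_HOLE. */
            for (int i = 1; i <= 3; i++)
                    printf("block %d: dirty=%u, uptodate=%u, mapped (not a hole)\n",
                           i, (unsigned)((dirty >> i) & 1),
                           (unsigned)((uptodate >> i) & 1));
            return 0;
    }

Without the BLK_STATE_DIRTY check, those clean blocks would be written out
again even though nothing in them changed.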

> 
> > +
> > +           em = epd->get_extent(inode, page, pg_offset, cur, blocksize, 1);
> >             if (IS_ERR_OR_NULL(em)) {
> >                     SetPageError(page);
> >                     ret = PTR_ERR_OR_ZERO(em);
> > 
> > @@ -3479,7 +3473,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
> >             em_end = extent_map_end(em);
> >             BUG_ON(em_end <= cur);
> >             BUG_ON(end < cur);
> > -           iosize = min(em_end - cur, end - cur + 1);
> > +           iosize = min_t(u64, em_end - cur, blocksize);
> >             iosize = ALIGN(iosize, blocksize);
> 
> This limits us to one block per loop; if two blocks are contiguous,
> it should be fine to write them out together.

Yes, I agree. I will fix this up in one of the next versions of the
patchset. Thanks for pointing it out.
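
For reference, one possible shape of that fix, as an untested sketch using the
helpers this patchset introduces (hypothetical code, not part of the patch):

    /* Instead of submitting one block per loop iteration, grow the
     * range while the next block is also dirty and still covered by
     * both the extent map and the requested write range. */
    u64 range_end = cur + blocksize - 1;

    while (range_end + blocksize <= min_t(u64, em_end - 1, end) &&
           test_page_blks_state(page, BLK_STATE_DIRTY, range_end + 1,
                                range_end + blocksize, 1))
            range_end += blocksize;

    iosize = range_end - cur + 1;

The subsequent clear_page_blks_state()/set_page_blks_state() calls would then
operate on [cur, range_end] instead of a single block.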

> 
> >             sector = (em->block_start + extent_offset) >> 9;
> >             bdev = em->bdev;
> > 
> > @@ -3488,32 +3482,20 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
> >             free_extent_map(em);
> >             em = NULL;
> > 
> > -           /*
> > -            * compressed and inline extents are written through other
> > -            * paths in the FS
> > -            */
> > -           if (compressed || block_start == EXTENT_MAP_HOLE ||
> > -               block_start == EXTENT_MAP_INLINE) {
> > -                   /*
> > -                    * end_io notification does not happen here for
> > -                    * compressed extents
> > -                    */
> > -                   if (!compressed && tree->ops &&
> > -                       tree->ops->writepage_end_io_hook)
> > -                           tree->ops->writepage_end_io_hook(page, cur,
> > -                                                    cur + iosize - 1,
> > -                                                    NULL, 1);
> > -                   else if (compressed) {
> > -                           /* we don't want to end_page_writeback on
> > -                            * a compressed extent.  this happens
> > -                            * elsewhere
> > -                            */
> > -                           nr++;
> > -                   }
> > +           BUG_ON(compressed);
> > +           BUG_ON(block_start == EXTENT_MAP_INLINE);
> > 
> > -                   cur += iosize;
> > -                   pg_offset += iosize;
> > -                   continue;
> > +           if (block_start == EXTENT_MAP_HOLE) {
> > +                   if (test_page_blks_state(page, BLK_STATE_UPTODATE, cur,
> > +                                                   cur + iosize - 1, 1)) {
> > +                           clear_page_blks_state(page,
> > +                                           1 << BLK_STATE_DIRTY, cur,
> > +                                           cur + iosize - 1);
> > +                           cur += iosize;
> > +                           continue;
> > +                   } else {
> > +                           BUG();
> > +                   }
> >             }
> > 
> >             if (tree->ops && tree->ops->writepage_io_hook) {
> > 
> > @@ -3527,7 +3509,13 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
> >             } else {
> >                     unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
> > 
> > -                   set_range_writeback(tree, cur, cur + iosize - 1);
> > +                   clear_page_blks_state(page, 1 << BLK_STATE_DIRTY, cur,
> > +                                   cur + iosize - 1);
> > +                   set_page_writeback(page);
> > +
> > +                   set_page_blks_state(page, 1 << BLK_STATE_IO, cur,
> > +                                   cur + iosize - 1);
> > +
> >                     if (!PageWriteback(page)) {
> >                             btrfs_err(BTRFS_I(inode)->root->fs_info,
> >                                        "page %lu not writeback, cur %llu end %llu",
> > 
> > @@ -3542,17 +3530,14 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
> >                     if (ret)
> >                             SetPageError(page);
> >             }
> > 
> > -           cur = cur + iosize;
> > -           pg_offset += iosize;
> > +
> > +           cur += iosize;
> >             nr++;
> >     }
> > 
> >  done:
> >     *nr_ret = nr;
> > 
> >  done_unlocked:
> > -
> > -   /* drop our reference on any cached states */
> > -   free_extent_state(cached_state);
> >     return ret;
> >  }
> > 
> > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> > index 23b6e03..cbe6381 100644
> > --- a/fs/btrfs/file.c
> > +++ b/fs/btrfs/file.c
> > @@ -495,6 +495,9 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
> >     u64 num_bytes;
> >     u64 start_pos;
> >     u64 end_of_last_block;
> > +   u64 start;
> > +   u64 end;
> > +   u64 page_end;
> >     u64 end_pos = pos + write_bytes;
> >     loff_t isize = i_size_read(inode);
> > 
> > @@ -507,11 +510,24 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
> >     if (err)
> >             return err;
> > 
> > +   start = start_pos;
> > +
> >     for (i = 0; i < num_pages; i++) {
> >             struct page *p = pages[i];
> >             SetPageUptodate(p);
> >             ClearPageChecked(p);
> > +
> > +           end = page_end = page_offset(p) + PAGE_CACHE_SIZE - 1;
> > +
> > +           if (i == num_pages - 1)
> > +                   end = min_t(u64, page_end, end_of_last_block);
> > +
> > +           set_page_blks_state(p,
> > +                           1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE,
> > +                           start, end);
> >             set_page_dirty(p);
> > +
> > +           start = page_end + 1;
> 
> This is not the usual way; page_end is unnecessary, (start +=
> PAGE_CACHE_SIZE) should work.

"start" may not always be set to a file offset that is a multiple of page
size. If the userspace dirties say "block 4" of 64k page, then start will be
set to 16384. Hence in such cases, "start += PAGE_CACHE_SIZE" would yield an
incorrect value.
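
To spell out the arithmetic for that case (64k pages, 4k blocks; illustrative
numbers only):

    /* Userspace dirtied only block 4 of page 0, so the loop begins
     * with start = 4 * 4096 = 16384 and page_end = 65535. */
    u64 start = 16384;
    u64 page_end = 65535;

    start += PAGE_CACHE_SIZE;  /* 16384 + 65536 = 81920, which lands in
                                * the middle of page 1 (wrong) */
    start = page_end + 1;      /* 65535 + 1 = 65536, the first byte of
                                * page 1 (what the loop needs) */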

> >     }
> >     
> >     /*
> > 
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index 8262f83..ac6a3f3 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -1995,6 +1995,11 @@ again:
> >     }
> > 
> >     btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
> > +
> > +   set_page_blks_state(page,
> > +                   1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE,
> > +                   page_start, page_end);
> > +
> >     ClearPageChecked(page);
> >     set_page_dirty(page);
> >  out:
> > 
> > @@ -2984,26 +2989,48 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
> >     struct btrfs_ordered_extent *ordered_extent = NULL;
> >     struct btrfs_workqueue *wq;
> >     btrfs_work_func_t func;
> > +   u64 ordered_start, ordered_end;
> > +   int done;
> > 
> >     trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
> > 
> >     ClearPagePrivate2(page);
> > -   if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
> > -                                       end - start + 1, uptodate))
> > -           return 0;
> > +loop:
> > +   ordered_extent = btrfs_lookup_ordered_range(inode, start,
> > +                                           end - start + 1);
> > +   if (!ordered_extent)
> > +           goto out;
> > 
> > -   if (btrfs_is_free_space_inode(inode)) {
> > -           wq = root->fs_info->endio_freespace_worker;
> > -           func = btrfs_freespace_write_helper;
> > -   } else {
> > -           wq = root->fs_info->endio_write_workers;
> > -           func = btrfs_endio_write_helper;
> > +   ordered_start = max_t(u64, start, ordered_extent->file_offset);
> > +   ordered_end = min_t(u64, end,
> > +                   ordered_extent->file_offset + ordered_extent->len - 1);
> > +
> > +   done = btrfs_dec_test_ordered_pending(inode, &ordered_extent,
> > +                                   ordered_start,
> > +                                   ordered_end - ordered_start + 1,
> > +                                   uptodate);
> > +   if (done) {
> > +           if (btrfs_is_free_space_inode(inode)) {
> > +                   wq = root->fs_info->endio_freespace_worker;
> > +                   func = btrfs_freespace_write_helper;
> > +           } else {
> > +                   wq = root->fs_info->endio_write_workers;
> > +                   func = btrfs_endio_write_helper;
> > +           }
> > +
> > +           btrfs_init_work(&ordered_extent->work, func,
> > +                           finish_ordered_fn, NULL, NULL);
> > +           btrfs_queue_work(wq, &ordered_extent->work);
> >     }
> > 
> > -   btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
> > -                   NULL);
> > -   btrfs_queue_work(wq, &ordered_extent->work);
> > +   btrfs_put_ordered_extent(ordered_extent);
> > +
> > +   start = ordered_end + 1;
> > +
> > +   if (start < end)
> > +           goto loop;
> > +out:
> I saw this patch puts a BUG_ON(block_start == EXTENT_MAP_INLINE); in
> writepage(), but I didn't see the code disabling inline data in patch 01 or
> patch 02. Anyway, I think we can avoid the above search for ordered_extents
> within a single page if we enable inline data.

For inline extents, the call chain __extent_writepage => writepage_delalloc =>
run_delalloc_range => cow_file_range => cow_file_range_inline should write the
block's contents into the appropriate location in the btree leaf. Hence
__extent_writepage_io() should never get invoked for files with inline
extents. The BUG_ON(block_start == EXTENT_MAP_INLINE) just makes this
explicit and also helps in debugging.

However, Liu, I am not sure how we could avoid looping across the ordered
extents in the above code. Could you please elaborate on that?

-- 
chandan
