On Sun, Feb 15, 2015 at 10:38:54PM +0000, Filipe Manana wrote:
> When punching a file hole, if we end up only zeroing parts of a page,
> because the start offset isn't a multiple of the sector size or the
> start offset and length fall within the same page, we were not updating
> the inode item. This prevented an fsync from doing anything, if no other
> file changes happened in the current transaction, because the fields
> in btrfs_inode used to check if the inode needs to be fsync'ed weren't
> updated.
> 
> This issue is easy to reproduce and the following excerpt from the
> xfstest case I made shows how to trigger it:
> 
>   _scratch_mkfs >> $seqres.full 2>&1
>   _init_flakey
>   _mount_flakey
> 
>   # Create our test file.
>   $XFS_IO_PROG -f -c "pwrite -S 0x22 -b 16K 0 16K" \
>       $SCRATCH_MNT/foo | _filter_xfs_io
> 
>   # Fsync the file, this makes btrfs update some btrfs inode specific fields
>   # that are used to track if the inode needs to be written/updated to the
>   # fsync log or not. After this fsync, the new values for those fields
>   # indicate that a subsequent fsync does not need to touch the fsync log.
>   $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
> 
>   # Force a commit of the current transaction. After this point, any operation
>   # that modifies the data or metadata of our file, should update those
>   # fields in the btrfs inode with values that make the next fsync operation
>   # write to the fsync log.
>   sync
> 
>   # Punch a hole in our file. This small range affects only 1 page.
>   # This made the btrfs hole punching implementation write only some zeroes in
>   # one page, but it did not update the btrfs inode fields used to determine
>   # if the next fsync needs to write to the fsync log.
>   $XFS_IO_PROG -c "fpunch 8000 4K" $SCRATCH_MNT/foo
> 
>   # Another variation of the previously mentioned case.
>   $XFS_IO_PROG -c "fpunch 15000 100" $SCRATCH_MNT/foo
> 
>   # Now fsync the file. This was a no-operation because the previous hole
>   # punch operation didn't update the inode's fields mentioned before, so they
>   # remained with the values they had after the first fsync - that is, they
>   # indicate that it is not needed to write to the fsync log.
>   $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
> 
>   echo "File content before:"
>   od -t x1 $SCRATCH_MNT/foo
> 
>   # Simulate a crash/power loss.
>   _load_flakey_table $FLAKEY_DROP_WRITES
>   _unmount_flakey
> 
>   # Enable writes and mount the fs. This makes the fsync log replay code run.
>   _load_flakey_table $FLAKEY_ALLOW_WRITES
>   _mount_flakey
> 
>   # Because the last fsync didn't do anything, here the file content matched
>   # what it was after the first fsync, before the holes were punched, and not
>   # what it was after the holes were punched.
>   echo "File content after:"
>   od -t x1 $SCRATCH_MNT/foo
> 
> This issue has been around since 2012, when the punch hole implementation
> was added, commit 2aaa66558172 ("Btrfs: add hole punching").

Reviewed-by: Liu Bo <[email protected]>

Thanks,

-liubo
> 
> A test case for xfstests follows soon.
> 
> Signed-off-by: Filipe Manana <[email protected]>
> ---
>  fs/btrfs/file.c | 31 ++++++++++++++++++++++++++++---
>  1 file changed, 28 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index e409025..b476e56 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2276,6 +2276,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>       bool same_page;
>       bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
>       u64 ino_size;
> +     bool truncated_page = false;
> +     bool updated_inode = false;
>  
>       ret = btrfs_wait_ordered_range(inode, offset, len);
>       if (ret)
> @@ -2307,13 +2309,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>        * entire page.
>        */
>       if (same_page && len < PAGE_CACHE_SIZE) {
> -             if (offset < ino_size)
> +             if (offset < ino_size) {
> +                     truncated_page = true;
>                       ret = btrfs_truncate_page(inode, offset, len, 0);
> +             } else {
> +                     ret = 0;
> +             }
>               goto out_only_mutex;
>       }
>  
>       /* zero back part of the first page */
>       if (offset < ino_size) {
> +             truncated_page = true;
>               ret = btrfs_truncate_page(inode, offset, 0, 0);
>               if (ret) {
>                       mutex_unlock(&inode->i_mutex);
> @@ -2349,6 +2356,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>               if (!ret) {
>                       /* zero the front end of the last page */
>                       if (tail_start + tail_len < ino_size) {
> +                             truncated_page = true;
>                               ret = btrfs_truncate_page(inode,
>                                               tail_start + tail_len, 0, 1);
>                               if (ret)
> @@ -2358,8 +2366,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>       }
>  
>       if (lockend < lockstart) {
> -             mutex_unlock(&inode->i_mutex);
> -             return 0;
> +             ret = 0;
> +             goto out_only_mutex;
>       }
>  
>       while (1) {
> @@ -2507,6 +2515,7 @@ out_trans:
>  
>       trans->block_rsv = &root->fs_info->trans_block_rsv;
>       ret = btrfs_update_inode(trans, root, inode);
> +     updated_inode = true;
>       btrfs_end_transaction(trans, root);
>       btrfs_btree_balance_dirty(root);
>  out_free:
> @@ -2516,6 +2525,22 @@ out:
>       unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>                            &cached_state, GFP_NOFS);
>  out_only_mutex:
> +     if (!updated_inode && truncated_page && !ret && !err) {
> +             /*
> +              * If we only end up zeroing part of a page, we still need to
> +              * update the inode item, so that all the time fields are
> +              * updated as well as the necessary btrfs inode in memory fields
> +              * for detecting, at fsync time, if the inode isn't yet in the
> +              * log tree or it's there but not up to date.
> +              */
> +             trans = btrfs_start_transaction(root, 1);
> +             if (IS_ERR(trans)) {
> +                     err = PTR_ERR(trans);
> +             } else {
> +                     err = btrfs_update_inode(trans, root, inode);
> +                     ret = btrfs_end_transaction(trans, root);
> +             }
> +     }
>       mutex_unlock(&inode->i_mutex);
>       if (ret && !err)
>               err = ret;
> -- 
> 2.1.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to