[PATCH v2] Btrfs: use generic_remap_file_range_prep() for cloning and deduplication

2018-12-07 Thread fdmanana
From: Filipe Manana 

Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:

1) When cloning, the destination file's capabilities were not dropped
   (the fstest generic/513 tests this);

2) We were not checking if the destination file is immutable;

3) Not checking if either the source or destination files are swap
   files (swap file support is coming soon for Btrfs);

4) System limits were not checked (resource limits and O_LARGEFILE).

Note that the generic helper generic_remap_file_range_prep() does start
and wait for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:

1) With compression, we need to start writeback twice in order to get the
   pages marked for writeback and ordered extents created;

2) filemap_write_and_wait_range() (and all its other variants) only waits
   for the IO to complete, but we need to wait for the ordered extents to
   finish, so that when we do the actual reflinking operations the file
   extent items are in the fs tree. This is also important due to the fact
   that the generic helper, for the deduplication case, compares the
   contents of the pages in the requested range, which might require
   reading extents from disk in the very unlikely case that pages get
   invalidated after writeback finishes (so the file extent items must be
   up to date in the fs tree).

Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results in
a simpler way of dealing with existing delalloc in the source/target
ranges, especially for the deduplication case where we used to lock all the
pages first and then, if we found any delalloc for the range, or ordered
extent, we would unlock the pages, trigger writeback and wait for ordered
extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.

So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic helper,
since it affected all filesystems supporting these operations, so we no
longer need special checks in Btrfs for them.

Signed-off-by: Filipe Manana 
---

V2: Removed the check that verifies if either of the inodes is a directory,
as it is done by generic_remap_file_range_prep(). Oddly, in btrfs it was
being done only for cloning but not for dedupe.

 fs/btrfs/ioctl.c | 612 ---
 1 file changed, 129 insertions(+), 483 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 802a628e9f7d..321fb9bc149d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3191,92 +3191,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info 
*fs_info,
return ret;
 }
 
-static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
-{
-   struct page *page;
-
-   page = grab_cache_page(inode->i_mapping, index);
-   if (!page)
-   return ERR_PTR(-ENOMEM);
-
-   if (!PageUptodate(page)) {
-   int ret;
-
-   ret = btrfs_readpage(NULL, page);
-   if (ret)
-   return ERR_PTR(ret);
-   lock_page(page);
-   if (!PageUptodate(page)) {
-   unlock_page(page);
-   put_page(page);
-   return ERR_PTR(-EIO);
-   }
-   if (page->mapping != inode->i_mapping) {
-   unlock_page(page);
-   put_page(page);
-   return ERR_PTR(-EAGAIN);
-   }
-   }
-
-   return page;
-}
-
-static int gather_extent_pages(struct inode *inode, struct page **pages,
-  int num_pages, u64 off)
-{
-   int i;
-   pgoff_t index = off >> PAGE_SHIFT;
-
-   for (i = 0; i < num_pages; i++) {
-again:
-   pages[i] = extent_same_get_page(inode, index + i);
-   if (IS_ERR(pages[i])) {
-   int err = PTR_ERR(pages[i]);
-
-   if (err == -EAGAIN)
-   goto again;
-   pages[i] = NULL;
-   return err;
-   }
-   }
-   return 0;
-}
-
-static int lock_extent_range(struct inode *inode, u64 off, u64 len,
-bool retry_range_locking)
-{
-   /*
-* Do any pending 

[PATCH] Btrfs: use generic_remap_file_range_prep() for cloning and deduplication

2018-12-07 Thread fdmanana
From: Filipe Manana 

Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:

1) When cloning, the destination file's capabilities were not dropped
   (the fstest generic/513 tests this);

2) We were not checking if the destination file is immutable;

3) Not checking if either the source or destination files are swap
   files (swap file support is coming soon for Btrfs);

4) System limits were not checked (resource limits and O_LARGEFILE).

Note that the generic helper generic_remap_file_range_prep() does start
and wait for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:

1) With compression, we need to start writeback twice in order to get the
   pages marked for writeback and ordered extents created;

2) filemap_write_and_wait_range() (and all its other variants) only waits
   for the IO to complete, but we need to wait for the ordered extents to
   finish, so that when we do the actual reflinking operations the file
   extent items are in the fs tree. This is also important due to the fact
   that the generic helper, for the deduplication case, compares the
   contents of the pages in the requested range, which might require
   reading extents from disk in the very unlikely case that pages get
   invalidated after writeback finishes (so the file extent items must be
   up to date in the fs tree).

Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results in
a simpler way of dealing with existing delalloc in the source/target
ranges, especially for the deduplication case where we used to lock all the
pages first and then, if we found any delalloc for the range, or ordered
extent, we would unlock the pages, trigger writeback and wait for ordered
extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.

So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic helper,
since it affected all filesystems supporting these operations, so we no
longer need special checks in Btrfs for them.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/ioctl.c | 615 ---
 1 file changed, 132 insertions(+), 483 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 802a628e9f7d..261e116dddb2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3191,92 +3191,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info 
*fs_info,
return ret;
 }
 
-static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
-{
-   struct page *page;
-
-   page = grab_cache_page(inode->i_mapping, index);
-   if (!page)
-   return ERR_PTR(-ENOMEM);
-
-   if (!PageUptodate(page)) {
-   int ret;
-
-   ret = btrfs_readpage(NULL, page);
-   if (ret)
-   return ERR_PTR(ret);
-   lock_page(page);
-   if (!PageUptodate(page)) {
-   unlock_page(page);
-   put_page(page);
-   return ERR_PTR(-EIO);
-   }
-   if (page->mapping != inode->i_mapping) {
-   unlock_page(page);
-   put_page(page);
-   return ERR_PTR(-EAGAIN);
-   }
-   }
-
-   return page;
-}
-
-static int gather_extent_pages(struct inode *inode, struct page **pages,
-  int num_pages, u64 off)
-{
-   int i;
-   pgoff_t index = off >> PAGE_SHIFT;
-
-   for (i = 0; i < num_pages; i++) {
-again:
-   pages[i] = extent_same_get_page(inode, index + i);
-   if (IS_ERR(pages[i])) {
-   int err = PTR_ERR(pages[i]);
-
-   if (err == -EAGAIN)
-   goto again;
-   pages[i] = NULL;
-   return err;
-   }
-   }
-   return 0;
-}
-
-static int lock_extent_range(struct inode *inode, u64 off, u64 len,
-bool retry_range_locking)
-{
-   /*
-* Do any pending delalloc/csum calculations on inode, one way or
-* another, and lock file content.
-* The locking order is:
-*
-*   1) pages
-*   2) range in the inode's io tree
-

[PATCH] Btrfs: scrub, move setup of nofs contexts higher in the stack

2018-12-07 Thread fdmanana
From: Filipe Manana 

Since scrub workers only do memory allocation with GFP_KERNEL when they
need to perform repair, we can move the recent setup of the nofs context
up to scrub_handle_errored_block() instead of setting it up down the call
chain at insert_full_stripe_lock() and scrub_add_page_to_wr_bio(),
removing some duplicate code and comment. So the only paths for which a
scrub worker can do memory allocations using GFP_KERNEL are the following:

 scrub_bio_end_io_worker()
   scrub_block_complete()
 scrub_handle_errored_block()
   lock_full_stripe()
 insert_full_stripe_lock()
   -> kmalloc with GFP_KERNEL

  scrub_bio_end_io_worker()
scrub_block_complete()
  scrub_handle_errored_block()
scrub_write_page_to_dev_replace()
  scrub_add_page_to_wr_bio()
-> kzalloc with GFP_KERNEL

Signed-off-by: Filipe Manana 
---

Applies on top of:

  Btrfs: fix deadlock with memory reclaim during scrub

 fs/btrfs/scrub.c | 34 ++
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index bbd1b36f4918..f996f4064596 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -322,7 +322,6 @@ static struct full_stripe_lock *insert_full_stripe_lock(
struct rb_node *parent = NULL;
struct full_stripe_lock *entry;
struct full_stripe_lock *ret;
-   unsigned int nofs_flag;
 
lockdep_assert_held(_root->lock);
 
@@ -342,15 +341,8 @@ static struct full_stripe_lock *insert_full_stripe_lock(
 
/*
 * Insert new lock.
-*
-* We must use GFP_NOFS because the scrub task might be waiting for a
-* worker task executing this function and in turn a transaction commit
-* might be waiting the scrub task to pause (which needs to wait for all
-* the worker tasks to complete before pausing).
 */
-   nofs_flag = memalloc_nofs_save();
ret = kmalloc(sizeof(*ret), GFP_KERNEL);
-   memalloc_nofs_restore(nofs_flag);
if (!ret)
return ERR_PTR(-ENOMEM);
ret->logical = fstripe_logical;
@@ -842,6 +834,7 @@ static int scrub_handle_errored_block(struct scrub_block 
*sblock_to_check)
int page_num;
int success;
bool full_stripe_locked;
+   unsigned int nofs_flag;
static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
  DEFAULT_RATELIMIT_BURST);
 
@@ -867,6 +860,16 @@ static int scrub_handle_errored_block(struct scrub_block 
*sblock_to_check)
dev = sblock_to_check->pagev[0]->dev;
 
/*
+* We must use GFP_NOFS because the scrub task might be waiting for a
+* worker task executing this function and in turn a transaction commit
+* might be waiting the scrub task to pause (which needs to wait for all
+* the worker tasks to complete before pausing).
+* We do allocations in the workers through insert_full_stripe_lock()
+* and scrub_add_page_to_wr_bio(), which happens down the call chain of
+* this function.
+*/
+   nofs_flag = memalloc_nofs_save();
+   /*
 * For RAID5/6, race can happen for a different device scrub thread.
 * For data corruption, Parity and Data threads will both try
 * to recovery the data.
@@ -875,6 +878,7 @@ static int scrub_handle_errored_block(struct scrub_block 
*sblock_to_check)
 */
ret = lock_full_stripe(fs_info, logical, _stripe_locked);
if (ret < 0) {
+   memalloc_nofs_restore(nofs_flag);
spin_lock(>stat_lock);
if (ret == -ENOMEM)
sctx->stat.malloc_errors++;
@@ -914,7 +918,7 @@ static int scrub_handle_errored_block(struct scrub_block 
*sblock_to_check)
 */
 
sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
- sizeof(*sblocks_for_recheck), GFP_NOFS);
+ sizeof(*sblocks_for_recheck), GFP_KERNEL);
if (!sblocks_for_recheck) {
spin_lock(>stat_lock);
sctx->stat.malloc_errors++;
@@ -1212,6 +1216,7 @@ static int scrub_handle_errored_block(struct scrub_block 
*sblock_to_check)
}
 
ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
+   memalloc_nofs_restore(nofs_flag);
if (ret < 0)
return ret;
return 0;
@@ -1630,19 +1635,8 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx 
*sctx,
mutex_lock(>wr_lock);
 again:
if (!sctx->wr_curr_bio) {
-   unsigned int nofs_flag;
-
-   /*
-* We must use GFP_NOFS because the scrub task might be waiting
-* for a worker task executing this function and in turn a
-* transaction commit might be waiting the scrub task to pause
-* (which needs to wait for all the worker tasks 

[PATCH v2] Btrfs: fix fsync of files with multiple hard links in new directories

2018-11-28 Thread fdmanana
From: Filipe Manana 

The log tree has a long standing problem that when a file is fsync'ed we
only check for new ancestors, created in the current transaction, by
following only the hard link for which the fsync was issued. We follow the
ancestors using the VFS' dget_parent() API. This means that if we create a
new link for a file in a directory that is new (or in any other new
ancestor directory) and then fsync the file using an old hard link, we end
up not logging the new ancestor, and on log replay that new hard link and
ancestor do not exist. In some cases, involving renames, the file will not
exist at all.

Example:

  mkfs.btrfs -f /dev/sdb
  mount /dev/sdb /mnt

  mkdir /mnt/A
  touch /mnt/foo
  ln /mnt/foo /mnt/A/bar
  xfs_io -c fsync /mnt/foo

  

In this example after log replay only the hard link named 'foo' exists
and directory A does not exist, which is unexpected. In other major linux
filesystems, such as ext4, xfs and f2fs for example, both hard links exist
and so does directory A after mounting again the filesystem.

Checking if any ancestors are new and need to be logged was added in
2009 by commit 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes"),
however only for the ancestors of the hard link (dentry) for which the
fsync was issued, instead of checking for all ancestors for all of the
inode's hard links.

So fix this by tracking the id of the last transaction where a hard link
was created for an inode and then on fsync fallback to a full transaction
commit when an inode has more than one hard link and at least one new hard
link was created in the current transaction. This is the simplest solution
since this is not a common use case (adding frequently hard links for
which there's an ancestor created in the current transaction and then
fsync the file). In case it ever becomes a common use case, a solution
that consists of iterating the fs/subvol btree for each hard link and
check if any ancestor is new, could be implemented.

This solves many unexpected scenarios reported by Jayashree Mohan and
Vijay Chidambaram, and for which there is a new test case for fstests
under review.

Reported-by: Vijay Chidambaram 
Reported-by: Jayashree Mohan 
Fixes: 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes")
Signed-off-by: Filipe Manana 
---

V2: Added missing case: set last_link_trans after an inode is evicted and
loaded again.

 fs/btrfs/btrfs_inode.h |  6 ++
 fs/btrfs/inode.c   | 17 +
 fs/btrfs/tree-log.c| 16 
 3 files changed, 39 insertions(+)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..7177d1d33584 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -147,6 +147,12 @@ struct btrfs_inode {
u64 last_unlink_trans;
 
/*
+* Track the transaction id of the last transaction used to create a
+* hard link for the inode. This is used by the log tree (fsync).
+*/
+   u64 last_link_trans;
+
+   /*
 * Number of bytes outstanding that are going to need csums.  This is
 * used in ENOSPC accounting.
 */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 64ea749c1ba4..51f4628be2d2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3685,6 +3685,21 @@ static int btrfs_read_locked_inode(struct inode *inode,
 * inode is not a directory, logging its parent unnecessarily.
 */
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+   /*
+* Similar reasoning for last_link_trans, needs to be set otherwise
+* for a case like the following:
+*
+* mkdir A
+* touch foo
+* ln foo A/bar
+* echo 2 > /proc/sys/vm/drop_caches
+* fsync foo
+* 
+*
+* Would result in link bar and directory A not existing after the power
+* failure.
+*/
+   BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans;
 
path->slots[0]++;
if (inode->i_nlink != 1 ||
@@ -6651,6 +,7 @@ static int btrfs_link(struct dentry *old_dentry, struct 
inode *dir,
if (err)
goto fail;
}
+   BTRFS_I(inode)->last_link_trans = trans->transid;
d_instantiate(dentry, inode);
ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
 true, NULL);
@@ -9179,6 +9195,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
+   ei->last_link_trans = 0;
ei->last_log_commit = 0;
 
spin_lock_init(>lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index aac3749f697f..896d79144052 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5760,6 +5760,22 @@ static int btrfs_log_inode_parent(struct 
btrfs_trans_handle *trans,
   

[PATCH] Btrfs: fix fsync of files with multiple hard links in new directories

2018-11-28 Thread fdmanana
From: Filipe Manana 

The log tree has a long standing problem that when a file is fsync'ed we
only check for new ancestors, created in the current transaction, by
following only the hard link for which the fsync was issued. We follow the
ancestors using the VFS' dget_parent() API. This means that if we create a
new link for a file in a directory that is new (or in any other new
ancestor directory) and then fsync the file using an old hard link, we end
up not logging the new ancestor, and on log replay that new hard link and
ancestor do not exist. In some cases, involving renames, the file will not
exist at all.

Example:

  mkfs.btrfs -f /dev/sdb
  mount /dev/sdb /mnt

  mkdir /mnt/A
  touch /mnt/foo
  ln /mnt/foo /mnt/A/bar
  xfs_io -c fsync /mnt/foo

  

In this example after log replay only the hard link named 'foo' exists
and directory A does not exist, which is unexpected. In other major linux
filesystems, such as ext4, xfs and f2fs for example, both hard links exist
and so does directory A after mounting again the filesystem.

Checking if any ancestors are new and need to be logged was added in
2009 by commit 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes"),
however only for the ancestors of the hard link (dentry) for which the
fsync was issued, instead of checking for all ancestors for all of the
inode's hard links.

So fix this by tracking the id of the last transaction where a hard link
was created for an inode and then on fsync fallback to a full transaction
commit when an inode has more than one hard link and at least one new hard
link was created in the current transaction. This is the simplest solution
since this is not a common use case (adding frequently hard links for
which there's an ancestor created in the current transaction and then
fsync the file). In case it ever becomes a common use case, a solution
that consists of iterating the fs/subvol btree for each hard link and
check if any ancestor is new, could be implemented.

This solves many unexpected scenarios reported by Jayashree Mohan and
Vijay Chidambaram, and for which there is a new test case for fstests
under review.

Reported-by: Vijay Chidambaram 
Reported-by: Jayashree Mohan 
Fixes: 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes")
Signed-off-by: Filipe Manana 
---
 fs/btrfs/btrfs_inode.h |  6 ++
 fs/btrfs/inode.c   |  2 ++
 fs/btrfs/tree-log.c| 16 
 3 files changed, 24 insertions(+)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..7177d1d33584 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -147,6 +147,12 @@ struct btrfs_inode {
u64 last_unlink_trans;
 
/*
+* Track the transaction id of the last transaction used to create a
+* hard link for the inode. This is used by the log tree (fsync).
+*/
+   u64 last_link_trans;
+
+   /*
 * Number of bytes outstanding that are going to need csums.  This is
 * used in ENOSPC accounting.
 */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 64ea749c1ba4..2e5660eb5aa1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6651,6 +6651,7 @@ static int btrfs_link(struct dentry *old_dentry, struct 
inode *dir,
if (err)
goto fail;
}
+   BTRFS_I(inode)->last_link_trans = trans->transid;
d_instantiate(dentry, inode);
ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
 true, NULL);
@@ -9179,6 +9180,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
+   ei->last_link_trans = 0;
ei->last_log_commit = 0;
 
spin_lock_init(>lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index aac3749f697f..896d79144052 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5760,6 +5760,22 @@ static int btrfs_log_inode_parent(struct 
btrfs_trans_handle *trans,
goto end_trans;
}
 
+   /*
+* If a new hard link was added to the inode in the current transaction
+* and its link count is now greater than 1, we need to fallback to a
+* transaction commit, otherwise we can end up not logging all its new
+* parents for all the hard links. Here just from the dentry used to
+* fsync, we can not visit the ancestor inodes for all the other hard
+* links to figure out if any is new, so we fallback to a transaction
+* commit (instead of adding a lot of complexity of scanning a btree,
+* since this scenario is not a common use case).
+*/
+   if (inode->vfs_inode.i_nlink > 1 &&
+   inode->last_link_trans > last_committed) {
+   ret = -EMLINK;
+   goto end_trans;
+   }
+
while (1) {
 

[PATCH v5] Btrfs: fix deadlock with memory reclaim during scrub

2018-11-26 Thread fdmanana
From: Filipe Manana 

When a transaction commit starts, it attempts to pause scrub and it blocks
until the scrub is paused. So while the transaction is blocked waiting for
scrub to pause, we can not do memory allocation with GFP_KERNEL from scrub,
otherwise we risk getting into a deadlock with reclaim.

Checking for scrub pause requests is done early at the beginning of the
while loop of scrub_stripe() and later in the loop, scrub_extent() and
scrub_raid56_parity() are called, which in turn call scrub_pages() and
scrub_pages_for_parity() respectively. These last two functions do memory
allocations using GFP_KERNEL. Same problem could happen while scrubbing
the super blocks, since it calls scrub_pages().

We also can not have any of the worker tasks, created by the scrub task,
doing GFP_KERNEL allocations, because before pausing, the scrub task waits
for all the worker tasks to complete (also done at scrub_stripe()).

So make sure GFP_NOFS is used for the memory allocations because at any
time a scrub pause request can happen from another task that started to
commit a transaction.

Fixes: 58c4e173847a ("btrfs: scrub: use GFP_KERNEL on the submission path")
Signed-off-by: Filipe Manana 
---

V2: Make using GFP_NOFS unconditional. Previous version was racy, as pausing
requests might happen just after we checked for them.

V3: Use memalloc_nofs_save() just like V1 did.

V4: Similar problem happened for raid56, which was previously missed, so
deal with it as well as the case for scrub_supers().

V5: Make sure worker tasks, created by scrub, also don't do GFP_KERNEL
allocations, because in order to pause, the scrub task waits for all
the workers to complete first.

 fs/btrfs/scrub.c | 35 ++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3be1456b5116..392f8a7f65ab 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -322,6 +322,7 @@ static struct full_stripe_lock *insert_full_stripe_lock(
struct rb_node *parent = NULL;
struct full_stripe_lock *entry;
struct full_stripe_lock *ret;
+   unsigned int nofs_flag;
 
lockdep_assert_held(_root->lock);
 
@@ -339,8 +340,17 @@ static struct full_stripe_lock *insert_full_stripe_lock(
}
}
 
-   /* Insert new lock */
+   /*
+* Insert new lock.
+*
+* We must use GFP_NOFS because the scrub task might be waiting for a
+* worker task executing this function and in turn a transaction commit
+* might be waiting the scrub task to pause (which needs to wait for all
+* the worker tasks to complete before pausing).
+*/
+   nofs_flag = memalloc_nofs_save();
ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+   memalloc_nofs_restore(nofs_flag);
if (!ret)
return ERR_PTR(-ENOMEM);
ret->logical = fstripe_logical;
@@ -1622,8 +1632,19 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx 
*sctx,
mutex_lock(>wr_lock);
 again:
if (!sctx->wr_curr_bio) {
+   unsigned int nofs_flag;
+
+   /*
+* We must use GFP_NOFS because the scrub task might be waiting
+* for a worker task executing this function and in turn a
+* transaction commit might be waiting the scrub task to pause
+* (which needs to wait for all the worker tasks to complete
+* before pausing).
+*/
+   nofs_flag = memalloc_nofs_save();
sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
  GFP_KERNEL);
+   memalloc_nofs_restore(nofs_flag);
if (!sctx->wr_curr_bio) {
mutex_unlock(>wr_lock);
return -ENOMEM;
@@ -3779,6 +3800,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 
devid, u64 start,
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
+   unsigned int nofs_flag;
 
if (btrfs_fs_closing(fs_info))
return -EINVAL;
@@ -3882,6 +3904,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 
devid, u64 start,
atomic_inc(_info->scrubs_running);
mutex_unlock(_info->scrub_lock);
 
+   /*
+* In order to avoid deadlock with reclaim when there is a transaction
+* trying to pause scrub, make sure we use GFP_NOFS for all the
+* allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
+* invoked by our callees. The pausing request is done when the
+* transaction commit starts, and it blocks the transaction until scrub
+* is paused (done at specific points at scrub_stripe() or right above
+* before incrementing fs_info->scrubs_running).
+*/
+   nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
/*

[PATCH v4] Btrfs: fix deadlock with memory reclaim during scrub

2018-11-23 Thread fdmanana
From: Filipe Manana 

When a transaction commit starts, it attempts to pause scrub and it blocks
until the scrub is paused. So while the transaction is blocked waiting for
scrub to pause, we can not do memory allocation with GFP_KERNEL from scrub,
otherwise we risk getting into a deadlock with reclaim.

Checking for scrub pause requests is done early at the beginning of the
while loop of scrub_stripe() and later in the loop, scrub_extent() and
scrub_raid56_parity() are called, which in turn call scrub_pages() and
scrub_pages_for_parity() respectively. These last two functions do memory
allocations using GFP_KERNEL. Same problem could happen while scrubbing
the super blocks, since it calls scrub_pages().

So make sure GFP_NOFS is used for the memory allocations because at any
time a scrub pause request can happen from another task that started to
commit a transaction.

Fixes: 58c4e173847a ("btrfs: scrub: use GFP_KERNEL on the submission path")
Signed-off-by: Filipe Manana 
---

V2: Make using GFP_NOFS unconditional. Previous version was racy, as pausing
requests might happen just after we checked for them.

V3: Use memalloc_nofs_save() just like V1 did.

V4: Similar problem happened for raid56, which was previously missed, so
deal with it as well as the case for scrub_supers().

 fs/btrfs/scrub.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3be1456b5116..e08b7502d1f0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3779,6 +3779,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 
devid, u64 start,
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
+   unsigned int nofs_flag;
 
if (btrfs_fs_closing(fs_info))
return -EINVAL;
@@ -3882,6 +3883,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 
devid, u64 start,
atomic_inc(_info->scrubs_running);
mutex_unlock(_info->scrub_lock);
 
+   /*
+* In order to avoid deadlock with reclaim when there is a transaction
+* trying to pause scrub, make sure we use GFP_NOFS for all the
+* allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
+* invoked by our callees. The pausing request is done when the
+* transaction commit starts, and it blocks the transaction until scrub
+* is paused (done at specific points at scrub_stripe() or right above
+* before incrementing fs_info->scrubs_running).
+*/
+   nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
/*
 * by holding device list mutex, we can
@@ -3895,6 +3906,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 
devid, u64 start,
if (!ret)
ret = scrub_enumerate_chunks(sctx, dev, start, end,
 is_dev_replace);
+   memalloc_nofs_restore(nofs_flag);
 
wait_event(sctx->list_wait, atomic_read(>bios_in_flight) == 0);
atomic_dec(_info->scrubs_running);
-- 
2.11.0



[PATCH v3] Btrfs: fix deadlock with memory reclaim during scrub

2018-11-23 Thread fdmanana
From: Filipe Manana 

When a transaction commit starts, it attempts to pause scrub and it blocks
until the scrub is paused. So while the transaction is blocked waiting for
scrub to pause, we can not do memory allocation with GFP_KERNEL while scrub
is running, we must use GFP_NOFS to avoid deadlock with reclaim. Checking
for pause requests is done early in the while loop of scrub_stripe(), and
later in the loop, scrub_extent() is called, which in turn calls
scrub_pages(), which does memory allocations using GFP_KERNEL. So use
GFP_NOFS for the memory allocations because at any time a scrub pause
request can happen from another task that started to commit a transaction.

Fixes: 58c4e173847a ("btrfs: scrub: use GFP_KERNEL on the submission path")
Signed-off-by: Filipe Manana 
---

V2: Make using GFP_NOFS unconditional. Previous version was racy, as pausing
requests might happen just after we checked for them.

V3: Use memalloc_nofs_save() just like V1 did.

 fs/btrfs/scrub.c | 23 ++-
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3be1456b5116..8e9ead5073ec 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2204,13 +2204,24 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 
logical, u64 len,
 {
struct scrub_block *sblock;
int index;
+   unsigned int nofs_flag;
+   int ret = 0;
+
+   /*
+* In order to avoid deadlock with reclaim when there is a transaction
+* trying to pause scrub, use GFP_NOFS. The pausing request is done when
+* the transaction commit starts, and it blocks the transaction until
+* scrub is paused (done at specific points at scrub_stripe()).
+*/
+   nofs_flag = memalloc_nofs_save();
 
sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
if (!sblock) {
spin_lock(>stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(>stat_lock);
-   return -ENOMEM;
+   ret = -ENOMEM;
+   goto out;
}
 
/* one ref inside this function, plus one for each page added to
@@ -2230,7 +2241,8 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 
logical, u64 len,
sctx->stat.malloc_errors++;
spin_unlock(>stat_lock);
scrub_block_put(sblock);
-   return -ENOMEM;
+   ret = -ENOMEM;
+   goto out;
}
BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
scrub_page_get(spage);
@@ -2269,12 +2281,11 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 
logical, u64 len,
} else {
for (index = 0; index < sblock->page_count; index++) {
struct scrub_page *spage = sblock->pagev[index];
-   int ret;
 
ret = scrub_add_page_to_rd_bio(sctx, spage);
if (ret) {
scrub_block_put(sblock);
-   return ret;
+   goto out;
}
}
 
@@ -2284,7 +2295,9 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 
logical, u64 len,
 
/* last one frees, either here or in bio completion for last page */
scrub_block_put(sblock);
-   return 0;
+out:
+   memalloc_nofs_restore(nofs_flag);
+   return ret;
 }
 
 static void scrub_bio_end_io(struct bio *bio)
-- 
2.11.0



[PATCH v2] Btrfs: fix deadlock with memory reclaim during scrub

2018-11-23 Thread fdmanana
From: Filipe Manana 

When a transaction commit starts, it attempts to pause scrub and it blocks
until the scrub is paused. So while the transaction is blocked waiting for
scrub to pause, we can not do memory allocation with GFP_KERNEL while scrub
is running; we must use GFP_NOFS to avoid a deadlock with reclaim. Checking
for pause requests is done early in the while loop of scrub_stripe(), and
later in the loop, scrub_extent() is called, which in turn calls
scrub_pages(), which does memory allocations using GFP_KERNEL. So use
GFP_NOFS for the memory allocations because at any time a scrub pause
request can happen from another task that started to commit a transaction.

Fixes: 58c4e173847a ("btrfs: scrub: use GFP_KERNEL on the submission path")
Signed-off-by: Filipe Manana 
---

V2: Make using GFP_NOFS unconditional. Previous version was racy, as pausing
requests might happen just after we checked for them.

 fs/btrfs/scrub.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3be1456b5116..0630ea0881bc 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2205,7 +2205,13 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 
logical, u64 len,
struct scrub_block *sblock;
int index;
 
-   sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+   /*
+* In order to avoid deadlock with reclaim when there is a transaction
+* trying to pause scrub, use GFP_NOFS. The pausing request is done when
+* the transaction commit starts, and it blocks the transaction until
+* scrub is paused (done at specific points at scrub_stripe()).
+*/
+   sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
if (!sblock) {
spin_lock(>stat_lock);
sctx->stat.malloc_errors++;
@@ -2223,7 +2229,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 
logical, u64 len,
struct scrub_page *spage;
u64 l = min_t(u64, len, PAGE_SIZE);
 
-   spage = kzalloc(sizeof(*spage), GFP_KERNEL);
+   spage = kzalloc(sizeof(*spage), GFP_NOFS);
if (!spage) {
 leave_nomem:
spin_lock(>stat_lock);
@@ -2250,7 +2256,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 
logical, u64 len,
spage->have_csum = 0;
}
sblock->page_count++;
-   spage->page = alloc_page(GFP_KERNEL);
+   spage->page = alloc_page(GFP_NOFS);
if (!spage->page)
goto leave_nomem;
len -= l;
-- 
2.11.0



[PATCH] Btrfs: fix deadlock with memory reclaim during scrub

2018-11-23 Thread fdmanana
From: Filipe Manana 

When a transaction commit starts, it attempts to pause scrub and it blocks
until the scrub is paused. So while the transaction is blocked waiting for
scrub to pause, we can not do memory allocation with GFP_KERNEL while scrub
is running; we must use GFP_NOFS to avoid a deadlock with reclaim. Checking
for pause requests is done early in the while loop of scrub_stripe(), and
later in the loop, scrub_extent() is called, which in turn calls
scrub_pages(), which does memory allocations using GFP_KERNEL. So use
GFP_NOFS for the memory allocations if there are any scrub pause requests.

Fixes: 58c4e173847a ("btrfs: scrub: use GFP_KERNEL on the submission path")
Signed-off-by: Filipe Manana 
---
 fs/btrfs/scrub.c | 26 +-
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3be1456b5116..5fcb9d1eb983 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2204,13 +2204,26 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 
logical, u64 len,
 {
struct scrub_block *sblock;
int index;
+   bool pause_req = (atomic_read(>fs_info->scrub_pause_req) != 0);
+   unsigned int nofs_flag;
+   int ret = 0;
+
+   /*
+* In order to avoid deadlock with reclaim when there is a transaction
+* trying to pause scrub, use GFP_NOFS. The pausing request is done when
+* the transaction commit starts, and it blocks the transaction until
+* scrub is paused (done at specific points at scrub_stripe()).
+*/
+   if (pause_req)
+   nofs_flag = memalloc_nofs_save();
 
sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
if (!sblock) {
spin_lock(>stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(>stat_lock);
-   return -ENOMEM;
+   ret = -ENOMEM;
+   goto out;
}
 
/* one ref inside this function, plus one for each page added to
@@ -2230,7 +2243,8 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 
logical, u64 len,
sctx->stat.malloc_errors++;
spin_unlock(>stat_lock);
scrub_block_put(sblock);
-   return -ENOMEM;
+   ret = -ENOMEM;
+   goto out;
}
BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
scrub_page_get(spage);
@@ -2269,12 +2283,11 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 
logical, u64 len,
} else {
for (index = 0; index < sblock->page_count; index++) {
struct scrub_page *spage = sblock->pagev[index];
-   int ret;
 
ret = scrub_add_page_to_rd_bio(sctx, spage);
if (ret) {
scrub_block_put(sblock);
-   return ret;
+   goto out;
}
}
 
@@ -2284,7 +2297,10 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 
logical, u64 len,
 
/* last one frees, either here or in bio completion for last page */
scrub_block_put(sblock);
-   return 0;
+ out:
+   if (pause_req)
+   memalloc_nofs_restore(nofs_flag);
+   return ret;
 }
 
 static void scrub_bio_end_io(struct bio *bio)
-- 
2.11.0



[PATCH] Btrfs: fix race between enabling quotas and subvolume creation

2018-11-19 Thread fdmanana
From: Filipe Manana 

We have a race between enabling quotas and subvolume creation that causes
subvolume creation to fail with -EINVAL, and the following diagram shows
how it happens:

  CPU 0  CPU 1

 btrfs_ioctl()
  btrfs_ioctl_quota_ctl()
   btrfs_quota_enable()
mutex_lock(fs_info->qgroup_ioctl_lock)

  btrfs_ioctl()
   create_subvol()
btrfs_qgroup_inherit()
 -> save fs_info->quota_root
into quota_root
 -> stores a NULL value
 -> tries to lock the mutex
qgroup_ioctl_lock
-> blocks waiting for
   the task at CPU0

   -> sets BTRFS_FS_QUOTA_ENABLED in fs_info
   -> sets quota_root in fs_info->quota_root
  (non-NULL value)

   mutex_unlock(fs_info->qgroup_ioctl_lock)

 -> checks quota enabled
flag is set
 -> returns -EINVAL because
fs_info->quota_root was
NULL before it acquired
the mutex
qgroup_ioctl_lock
   -> ioctl returns -EINVAL

Returning -EINVAL to user space will be confusing if all the arguments
passed to the subvolume creation ioctl were valid.

Fix it by grabbing the value from fs_info->quota_root after acquiring
the mutex.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/qgroup.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index ae1358253b7b..0bdf28499790 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2250,7 +2250,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle 
*trans, u64 srcid,
int i;
u64 *i_qgroups;
struct btrfs_fs_info *fs_info = trans->fs_info;
-   struct btrfs_root *quota_root = fs_info->quota_root;
+   struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
struct btrfs_qgroup *dstgroup;
u32 level_size = 0;
@@ -2260,6 +2260,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle 
*trans, u64 srcid,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, _info->flags))
goto out;
 
+   quota_root = fs_info->quota_root;
if (!quota_root) {
ret = -EINVAL;
goto out;
-- 
2.11.0



[PATCH v2] Btrfs: fix deadlock when enabling quotas due to concurrent snapshot creation

2018-11-19 Thread fdmanana
From: Filipe Manana 

If the quota enable and snapshot creation ioctls are called concurrently
we can get into a deadlock where the task enabling quotas will deadlock
on the fs_info->qgroup_ioctl_lock mutex because it attempts to lock it
twice, or the task creating a snapshot tries to commit the transaction
while the task enabling quota waits for the former task to commit the
transaction while holding the mutex. The following time diagrams show how
both cases happen.

First scenario:

   CPU 0CPU 1

 btrfs_ioctl()
  btrfs_ioctl_quota_ctl()
   btrfs_quota_enable()
mutex_lock(fs_info->qgroup_ioctl_lock)
btrfs_start_transaction()

 btrfs_ioctl()
  btrfs_ioctl_snap_create_v2
   create_snapshot()
--> adds snapshot to the
list pending_snapshots
of the current
transaction

btrfs_commit_transaction()
 create_pending_snapshots()
   create_pending_snapshot()
qgroup_account_snapshot()
 btrfs_qgroup_inherit()
   mutex_lock(fs_info->qgroup_ioctl_lock)
--> deadlock, mutex already locked
by this task at
btrfs_quota_enable()

Second scenario:

   CPU 0CPU 1

 btrfs_ioctl()
  btrfs_ioctl_quota_ctl()
   btrfs_quota_enable()
mutex_lock(fs_info->qgroup_ioctl_lock)
btrfs_start_transaction()

 btrfs_ioctl()
  btrfs_ioctl_snap_create_v2
   create_snapshot()
--> adds snapshot to the
list pending_snapshots
of the current
transaction

btrfs_commit_transaction()
 --> waits for task at
 CPU 0 to release
 its transaction
 handle

btrfs_commit_transaction()
 --> sees another task started
 the transaction commit first
 --> releases its transaction
 handle
 --> waits for the transaction
 commit to be completed by
 the task at CPU 1

 create_pending_snapshot()
  qgroup_account_snapshot()
   btrfs_qgroup_inherit()

mutex_lock(fs_info->qgroup_ioctl_lock)
 --> deadlock, task at CPU 0
 has the mutex locked 
but
 it is waiting for us to
 finish the transaction
 commit

So fix this by setting the quota enabled flag in fs_info after committing
the transaction at btrfs_quota_enable(). This ends up serializing quota
enable and snapshot creation as if the snapshot creation happened just
before the quota enable request. The quota rescan task, scheduled after
committing the transaction in btrfs_quota_enable(), will do the accounting.

Fixes: 6426c7ad697d ("btrfs: qgroup: Fix qgroup accounting when creating 
snapshot")
Signed-off-by: Filipe Manana 
---

V2: Added second deadlock example to changelog and changed the fix to deal
with that case as well.

 fs/btrfs/qgroup.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d4917c0cddf5..ae1358253b7b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1013,16 +1013,22 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
-   spin_lock(_info->qgroup_lock);
-   fs_info->quota_root = quota_root;
-   set_bit(BTRFS_FS_QUOTA_ENABLED, _info->flags);
-   spin_unlock(_info->qgroup_lock);
 
ret = btrfs_commit_transaction(trans);
trans = NULL;
if (ret)
goto out_free_path;
 
+   /*
+* Set quota enabled flag after committing the transaction, to avoid
+* deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
+* creation.
+*/
+   

[PATCH] Btrfs: fix deadlock when enabling quotas due to concurrent snapshot creation

2018-11-19 Thread fdmanana
From: Filipe Manana 

If the quota enable and snapshot creation ioctls are called concurrently
we can get into a deadlock where the task enabling quotas will deadlock
on the fs_info->qgroup_ioctl_lock mutex because it attempts to lock it
twice. The following time diagram shows how this happens.

   CPU 0CPU 1

 btrfs_ioctl()
  btrfs_ioctl_quota_ctl()
   btrfs_quota_enable()
mutex_lock(fs_info->qgroup_ioctl_lock)
btrfs_start_transaction()

 btrfs_ioctl()
  btrfs_ioctl_snap_create_v2
   create_snapshot()
--> adds snapshot to the
list pending_snapshots
of the current
transaction

btrfs_commit_transaction()
 create_pending_snapshots()
   create_pending_snapshot()
qgroup_account_snapshot()
 btrfs_qgroup_inherit()
   mutex_lock(fs_info->qgroup_ioctl_lock)
--> deadlock, mutex already locked
by this task at
btrfs_quota_enable()

So fix this by adding a flag to the transaction handle that signals if the
transaction is being used for enabling quotas (only seen by the task doing
it) and do not lock the mutex qgroup_ioctl_lock at btrfs_qgroup_inherit()
if the transaction handle corresponds to the one being used to enable the
quotas.

Fixes: 6426c7ad697d ("btrfs: qgroup: Fix qgroup accounting when creating 
snapshot")
Signed-off-by: Filipe Manana 
---
 fs/btrfs/qgroup.c  | 10 --
 fs/btrfs/transaction.h |  1 +
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d4917c0cddf5..3aec3bfa3d70 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -908,6 +908,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
trans = NULL;
goto out;
}
+   trans->enabling_quotas = true;
 
fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
if (!fs_info->qgroup_ulist) {
@@ -2250,7 +2251,11 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle 
*trans, u64 srcid,
u32 level_size = 0;
u64 nums;
 
-   mutex_lock(_info->qgroup_ioctl_lock);
+   if (trans->enabling_quotas)
+   lockdep_assert_held(_info->qgroup_ioctl_lock);
+   else
+   mutex_lock(_info->qgroup_ioctl_lock);
+
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, _info->flags))
goto out;
 
@@ -2413,7 +2418,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle 
*trans, u64 srcid,
 unlock:
spin_unlock(_info->qgroup_lock);
 out:
-   mutex_unlock(_info->qgroup_ioctl_lock);
+   if (!trans->enabling_quotas)
+   mutex_unlock(_info->qgroup_ioctl_lock);
return ret;
 }
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 703d5116a2fc..a5553a1dee30 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -122,6 +122,7 @@ struct btrfs_trans_handle {
bool reloc_reserved;
bool sync;
bool dirty;
+   bool enabling_quotas;
struct btrfs_root *root;
struct btrfs_fs_info *fs_info;
struct list_head new_bgs;
-- 
2.11.0



[PATCH] Btrfs: fix access to available allocation bits when starting balance

2018-11-19 Thread fdmanana
From: Filipe Manana 

The available allocation bits members from struct btrfs_fs_info are
protected by a sequence lock, and when starting balance we access them
incorrectly in two different ways:

1) In the read sequence lock loop at btrfs_balance() we use the values we
   read from fs_info->avail_*_alloc_bits and we can immediately do actions
   that have side effects and can not be undone (printing a message and
   jumping to a label). This is wrong because a retry might be needed, so
   our actions must not have side effects and must be repeatable as long
   as read_seqretry() returns a non-zero value. In other words, we were
   essentially ignoring the sequence lock;

2) Right below the read sequence lock loop, we were reading the values
   from avail_metadata_alloc_bits and avail_data_alloc_bits without any
   protection from concurrent writers, that is, reading them outside of
   the read sequence lock critical section.

So fix this by making sure we only read the available allocation bits
while in a read sequence lock critical section and that what we do in the
critical section is repeatable (has nothing that can not be undone) so
that any eventual retry that is needed is handled properly.

Fixes: de98ced9e743 ("Btrfs: use seqlock to protect fs_info->avail_{data, 
metadata, system}_alloc_bits")
Fixes: 14506127979a ("btrfs: fix a bogus warning when converting only data or 
metadata")
Signed-off-by: Filipe Manana 
---
 fs/btrfs/volumes.c | 39 +++
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4405e430da6..223334f08530 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3712,6 +3712,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
int ret;
u64 num_devices;
unsigned seq;
+   bool reducing_integrity;
 
if (btrfs_fs_closing(fs_info) ||
atomic_read(_info->balance_pause_req) ||
@@ -3796,24 +3797,30 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
 !(bctl->sys.target & allowed)) ||
((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
 (fs_info->avail_metadata_alloc_bits & allowed) &&
-!(bctl->meta.target & allowed))) {
-   if (bctl->flags & BTRFS_BALANCE_FORCE) {
-   btrfs_info(fs_info,
-   "balance: force reducing metadata integrity");
-   } else {
-   btrfs_err(fs_info,
-   "balance: reduces metadata integrity, use --force if you want this");
-   ret = -EINVAL;
-   goto out;
-   }
-   }
+!(bctl->meta.target & allowed)))
+   reducing_integrity = true;
+   else
+   reducing_integrity = false;
+
+   /* if we're not converting, the target field is uninitialized */
+   meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+   bctl->meta.target : fs_info->avail_metadata_alloc_bits;
+   data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+   bctl->data.target : fs_info->avail_data_alloc_bits;
} while (read_seqretry(_info->profiles_lock, seq));
 
-   /* if we're not converting, the target field is uninitialized */
-   meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
-   bctl->meta.target : fs_info->avail_metadata_alloc_bits;
-   data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
-   bctl->data.target : fs_info->avail_data_alloc_bits;
+   if (reducing_integrity) {
+   if (bctl->flags & BTRFS_BALANCE_FORCE) {
+   btrfs_info(fs_info,
+  "balance: force reducing metadata 
integrity");
+   } else {
+   btrfs_err(fs_info,
+ "balance: reduces metadata integrity, use --force if you want this");
+   ret = -EINVAL;
+   goto out;
+   }
+   }
+
if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
-- 
2.11.0



[PATCH] Btrfs: allow clear_extent_dirty() to receive a cached extent state record

2018-11-16 Thread fdmanana
From: Filipe Manana 

We can have a lot of freed extents during the life span of a transaction, so
the red black tree that keeps track of the ranges of each freed extent
(fs_info->freed_extents[]) can get quite big. When finishing a transaction
commit we find each range, process it (discard the extents, unpin them)
and then remove it from the red black tree. We can use an extent state
record as a cache when searching for a range, so that when we clean the
range we can use the cached extent state we passed to the search function
instead of iterating the red black tree again. Doing things as fast as
possible when finishing a transaction (in state TRANS_STATE_UNBLOCKED)
is convenient as it reduces the time we block another task that wants to
commit the next transaction.

So change clear_extent_dirty() to allow an optional extent state record to
be passed as an argument, which will be passed down to __clear_extent_bit.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/disk-io.c | 7 +--
 fs/btrfs/extent-tree.c | 7 +--
 fs/btrfs/extent_io.h   | 4 ++--
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 05dc3c17cb62..ecf3a45490e3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4359,12 +4359,15 @@ static int btrfs_destroy_pinned_extent(struct 
btrfs_fs_info *fs_info,
unpin = pinned_extents;
 again:
while (1) {
+   struct extent_state *cached_state = NULL;
+
ret = find_first_extent_bit(unpin, 0, , ,
-   EXTENT_DIRTY, NULL);
+   EXTENT_DIRTY, _state);
if (ret)
break;
 
-   clear_extent_dirty(unpin, start, end);
+   clear_extent_dirty(unpin, start, end, _state);
+   free_extent_state(cached_state);
btrfs_error_unpin_extent_range(fs_info, start, end);
cond_resched();
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 577878324799..33142d9c36d5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6615,9 +6615,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle 
*trans)
unpin = _info->freed_extents[0];
 
while (!trans->aborted) {
+   struct extent_state *cached_state = NULL;
+
mutex_lock(_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, , ,
-   EXTENT_DIRTY, NULL);
+   EXTENT_DIRTY, _state);
if (ret) {
mutex_unlock(_info->unused_bg_unpin_mutex);
break;
@@ -6627,9 +6629,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle 
*trans)
ret = btrfs_discard_extent(fs_info, start,
   end + 1 - start, NULL);
 
-   clear_extent_dirty(unpin, start, end);
+   clear_extent_dirty(unpin, start, end, _state);
unpin_extent_range(fs_info, start, end, true);
mutex_unlock(_info->unused_bg_unpin_mutex);
+   free_extent_state(cached_state);
cond_resched();
}
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b4d03e677e1d..36f7a9f87e46 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -353,11 +353,11 @@ static inline int set_extent_dirty(struct extent_io_tree 
*tree, u64 start,
 }
 
 static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
-   u64 end)
+u64 end, struct extent_state **cached)
 {
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
-   EXTENT_DO_ACCOUNTING, 0, 0, NULL);
+   EXTENT_DO_ACCOUNTING, 0, 0, cached);
 }
 
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-- 
2.11.0



[PATCH] Btrfs: bring back key search optimization to btrfs_search_old_slot()

2018-11-16 Thread fdmanana
From: Filipe Manana 

Commit d7396f07358a ("Btrfs: optimize key searches in btrfs_search_slot"),
dated from August 2013, introduced an optimization to search for keys in a
node/leaf to both btrfs_search_slot() and btrfs_search_old_slot(). For the
later, it ended up being reverted in commit d4b4087c43cc ("Btrfs: do a
full search everytime in btrfs_search_old_slot"), from September 2013,
because the content of extent buffers were often inconsistent during
replay. It turned out that the reason why they were often inconsistent was
because the extent buffer replay stopped being done atomically, and got
broken after commit c8cc63416537 ("Btrfs: stop using GFP_ATOMIC for the
tree mod log allocations"), introduced in July 2013. The extent buffer
replay issue was then found and fixed by commit 5de865eebb83 ("Btrfs: fix
tree mod logging"), dated from December 2013.

So bring back the optimization to btrfs_search_old_slot(), as skipping it
and its comment about disabling it are confusing.
extent buffers resulted in some inconsistency, the normal searches (binary
searches) would also not always work.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/ctree.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 089b46c4d97f..cf5487a79c96 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2966,7 +2966,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const 
struct btrfs_key *key,
int level;
int lowest_unlock = 1;
u8 lowest_level = 0;
-   int prev_cmp = -1;
+   int prev_cmp;
 
lowest_level = p->lowest_level;
WARN_ON(p->nodes[0] != NULL);
@@ -2977,6 +2977,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const 
struct btrfs_key *key,
}
 
 again:
+   prev_cmp = -1;
b = get_old_root(root, time_seq);
level = btrfs_header_level(b);
p->locks[level] = BTRFS_READ_LOCK;
@@ -2994,11 +2995,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, const 
struct btrfs_key *key,
 */
btrfs_unlock_up_safe(p, level + 1);
 
-   /*
-* Since we can unwind ebs we want to do a real search every
-* time.
-*/
-   prev_cmp = -1;
ret = key_search(b, key, level, _cmp, );
 
if (level != 0) {
-- 
2.11.0



[PATCH] btrfs: test send after radical changes in a complex directory hierarchy

2018-11-14 Thread fdmanana
From: Filipe Manana 

Test an incremental send operation in a scenario where the relationship
of ancestor-descendant between multiple directories is inversed, and
where multiple directories that were previously ancestors of another
directory now become descendants of multiple directories that used to be
their ancestors in the parent snapshot. This used to trigger an
infinite loop in the kernel code.

This is motivated by a bug found in btrfs which is fixed by the following
patch for the linux kernel:

  "Btrfs: send, fix infinite loop due to directory rename dependencies"

Signed-off-by: Filipe Manana 
---
 tests/btrfs/178 | 190 
 tests/btrfs/178.out |   6 ++
 tests/btrfs/group   |   1 +
 3 files changed, 197 insertions(+)
 create mode 100755 tests/btrfs/178
 create mode 100644 tests/btrfs/178.out

diff --git a/tests/btrfs/178 b/tests/btrfs/178
new file mode 100755
index ..e277fbee
--- /dev/null
+++ b/tests/btrfs/178
@@ -0,0 +1,190 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2017 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. btrfs/178
+#
+# Test an incremental send operation in a scenario where the relationship of
+# ancestor-descendant between multiple directories is inversed, and where
+# multiple directories that were previously ancestors of another directory now
+# become descendents of multiple directories that used to be their ancestors in
+# the parent snapshot. This used to trigger an infinite loop in the kernel 
code.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -fr $send_files_dir
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_test
+_require_scratch
+_require_fssum
+
+send_files_dir=$TEST_DIR/btrfs-test-$seq
+
+rm -f $seqres.full
+rm -fr $send_files_dir
+mkdir $send_files_dir
+
+_scratch_mkfs >>$seqres.full 2>&1
+_scratch_mount
+
+# The name of each directory corresponds to its inode number, to make it easier
+# to debug since btrfs' send processes inodes in ascending order according to
+# their number.
+mkdir $SCRATCH_MNT/257
+mkdir $SCRATCH_MNT/258
+mv $SCRATCH_MNT/257 $SCRATCH_MNT/258/257
+mkdir $SCRATCH_MNT/259
+mkdir $SCRATCH_MNT/260
+mkdir $SCRATCH_MNT/261
+mkdir $SCRATCH_MNT/262
+mkdir $SCRATCH_MNT/263
+mkdir $SCRATCH_MNT/264
+mv $SCRATCH_MNT/258 $SCRATCH_MNT/264/258
+mkdir $SCRATCH_MNT/265
+mkdir $SCRATCH_MNT/266
+mv $SCRATCH_MNT/259 $SCRATCH_MNT/266/259
+mv $SCRATCH_MNT/260 $SCRATCH_MNT/266/260
+mv $SCRATCH_MNT/264 $SCRATCH_MNT/266/264
+mv $SCRATCH_MNT/265 $SCRATCH_MNT/266/265
+mkdir $SCRATCH_MNT/266/260/267
+mkdir $SCRATCH_MNT/266/268
+mkdir $SCRATCH_MNT/266/269
+mv $SCRATCH_MNT/262 $SCRATCH_MNT/266/269/262
+mkdir $SCRATCH_MNT/266/270
+mkdir $SCRATCH_MNT/271
+mv $SCRATCH_MNT/266 $SCRATCH_MNT/271/266
+mkdir $SCRATCH_MNT/271/266/272
+mv $SCRATCH_MNT/263 $SCRATCH_MNT/271/266/272/263
+mkdir $SCRATCH_MNT/273
+mkdir $SCRATCH_MNT/271/266/274
+mv $SCRATCH_MNT/273 $SCRATCH_MNT/271/266/274/273
+mkdir $SCRATCH_MNT/271/266/272/275
+mv $SCRATCH_MNT/271 $SCRATCH_MNT/261/271
+
+# Filesystem looks like:
+#
+# .
+# |--- 261/
+#   |--- 271/
+# |--- 266/
+#   |--- 259/
+#   |--- 260/
+#   | |--- 267
+#   |
+#   |--- 264/
+#   | |--- 258/
+#   |   |--- 257/
+#   |
+#   |--- 265/
+#   |--- 268/
+#   |--- 269/
+#   | |--- 262/
+#   |
+#   |--- 270/
+#   |--- 272/
+#   | |--- 263/
+#   | |--- 275/
+#   |
+#   |--- 274/
+# |--- 273/
+#
+$BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT \
+   $SCRATCH_MNT/mysnap1 > /dev/null
+
+$BTRFS_UTIL_PROG send -f $send_files_dir/1.snap \
+   $SCRATCH_MNT/mysnap1 2>&1 1>/dev/null | _filter_scratch
+
+mv $SCRATCH_MNT/261/271/266/272/275 $SCRATCH_MNT/275
+mv $SCRATCH_MNT/261/271/266/274 $SCRATCH_MNT/275/274
+mv $SCRATCH_MNT/261/271/266/269/262 $SCRATCH_MNT/275/274/273/262
+mv $SCRATCH_MNT/261/271/266/269 $SCRATCH_MNT/275/274/273/262/269
+mv $SCRATCH_MNT/261/271/266/264/258/257 $SCRATCH_MNT/261/271/266/272
+mv $SCRATCH_MNT/261/271/266/264/258 $SCRATCH_MNT/275/274/273/262/269/258
+mv $SCRATCH_MNT/261/271 $SCRATCH_MNT/275/274/273/262/269/258/271
+mv $SCRATCH_MNT/275/274/273/262/269/258/271/266/268 \
+   $SCRATCH_MNT/275/274/273/262/269/258/271/268
+mv $SCRATCH_MNT/275/274/273/262/269/258/271/266/260/267 \
+   $SCRATCH_MNT/275/274/273/262/269/258/271/268/267
+mv 

[PATCH] Btrfs: send, fix infinite loop due to directory rename dependencies

2018-11-14 Thread fdmanana
From: Robbie Ko 

When doing an incremental send, due to the need of delaying directory move
(rename) operations we can end up in an infinite loop at
apply_children_dir_moves().

An example scenario that triggers this problem is described below, where
directory names correspond to the numbers of their respective inodes.

Parent snapshot:

 .
 |--- 261/
   |--- 271/
 |--- 266/
   |--- 259/
   |--- 260/
   | |--- 267
   |
   |--- 264/
   | |--- 258/
   |   |--- 257/
   |
   |--- 265/
   |--- 268/
   |--- 269/
   | |--- 262/
   |
   |--- 270/
   |--- 272/
   | |--- 263/
   | |--- 275/
   |
   |--- 274/
 |--- 273/

Send snapshot:

 .
 |-- 275/
  |-- 274/
   |-- 273/
|-- 262/
 |-- 269/
  |-- 258/
   |-- 271/
|-- 268/
 |-- 267/
  |-- 270/
   |-- 259/
   ||-- 265/
   |
   |-- 272/
|-- 257/
 |-- 260/
 |-- 264/
  |-- 263/
   |-- 261/
|-- 
266/

When processing inode 257 we delay its move (rename) operation because its
new parent in the send snapshot, inode 272, was not yet processed. Then
when processing inode 272, we delay the move operation for that inode
because inode 274 is its ancestor in the send snapshot. Finally we delay
the move operation for inode 274 when processing it because inode 275 is
its new parent in the send snapshot and was not yet moved.

When finishing processing inode 275, we start to do the move operations
that were previously delayed (at apply_children_dir_moves()), resulting in
the following iterations:

1) We issue the move operation for inode 274;

2) Because inode 262 depended on the move operation of inode 274 (it was
   delayed because 274 is its ancestor in the send snapshot), we issue the
   move operation for inode 262;

3) We issue the move operation for inode 272, because it was delayed by
   inode 274 too (ancestor of 272 in the send snapshot);

4) We issue the move operation for inode 269 (it was delayed by 262);

5) We issue the move operation for inode 257 (it was delayed by 272);

6) We issue the move operation for inode 260 (it was delayed by 272);

7) We issue the move operation for inode 258 (it was delayed by 269);

8) We issue the move operation for inode 264 (it was delayed by 257);

9) We issue the move operation for inode 271 (it was delayed by 258);

10) We issue the move operation for inode 263 (it was delayed by 264);

11) We issue the move operation for inode 268 (it was delayed by 271);

12) We verify if we can issue the move operation for inode 270 (it was
delayed by 271). We detect a path loop in the current state, because
inode 267 needs to be moved first before we can issue the move
operation for inode 270. So we delay again the move operation for
inode 270, this time we will attempt to do it after inode 267 is
moved;

13) We issue the move operation for inode 261 (it was delayed by 263);

14) We verify if we can issue the move operation for inode 266 (it was
delayed by 263). We detect a path loop in the current state, because
inode 270 needs to be moved first before we can issue the move
operation for inode 266. So we delay again the move operation for
inode 266, this time we will attempt to do it after inode 270 is
moved (its move operation was delayed in step 12);

15) We issue the move operation for inode 267 (it was delayed by 268);

16) We verify if we can issue the move operation for inode 266 (it was
delayed by 270). We detect a path loop in the current state, because
inode 270 needs to be moved first before we can issue the move
operation for inode 266. So we delay again the move operation for
inode 266, this time we will attempt to do it after inode 270 is
moved (its move operation was delayed in step 12). So here we added
again the same delayed move operation that we added in step 14;

17) We attempt again to see if we can issue the move 

[PATCH] Btrfs: ensure path name is null terminated at btrfs_control_ioctl

2018-11-14 Thread fdmanana
From: Filipe Manana 

We were using the path name received from user space without checking that
it is null terminated. While btrfs-progs is well behaved and does proper
validation and null termination, someone could call the ioctl and pass
a non-null terminated path, leading to buffer overrun problems in the
kernel.

So just set the last byte of the path to a null character, similar to what
we do in other ioctls (add/remove/resize device, snapshot creation, etc).

Signed-off-by: Filipe Manana 
---
 fs/btrfs/super.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6601c9aa5e35..8ad145820ea8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2235,6 +2235,7 @@ static long btrfs_control_ioctl(struct file *file, 
unsigned int cmd,
vol = memdup_user((void __user *)arg, sizeof(*vol));
if (IS_ERR(vol))
return PTR_ERR(vol);
+   vol->name[BTRFS_PATH_NAME_MAX] = '\0';
 
switch (cmd) {
case BTRFS_IOC_SCAN_DEV:
-- 
2.11.0



[PATCH] Btrfs: remove no longer used io_err from btrfs_log_ctx

2018-11-12 Thread fdmanana
From: Filipe Manana 

The io_err field of struct btrfs_log_ctx is no longer used after the
recent simplification of the fast fsync path, where we now wait for
ordered extents to complete before logging the inode. We did this in
commit b5e6c3e170b7 ("btrfs: always wait on ordered extents at fsync
time") and commit a2120a473a80 ("btrfs: clean up the left over
logged_list usage") removed its last use.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/file.c | 19 ---
 fs/btrfs/tree-log.h |  2 --
 2 files changed, 21 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 6483757f0c09..65b3bcda99e2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2157,25 +2157,6 @@ int btrfs_sync_file(struct file *file, loff_t start, 
loff_t end, int datasync)
 */
inode_unlock(inode);
 
-   /*
-* If any of the ordered extents had an error, just return it to user
-* space, so that the application knows some writes didn't succeed and
-* can take proper action (retry for e.g.). Blindly committing the
-* transaction in this case, would fool userspace that everything was
-* successful. And we also want to make sure our log doesn't contain
-* file extent items pointing to extents that weren't fully written to -
-* just like in the non fast fsync path, where we check for the ordered
-* operation's error flag before writing to the log tree and return -EIO
-* if any of them had this flag set (btrfs_wait_ordered_range) -
-* therefore we need to check for errors in the ordered operations,
-* which are indicated by ctx.io_err.
-*/
-   if (ctx.io_err) {
-   btrfs_end_transaction(trans);
-   ret = ctx.io_err;
-   goto out;
-   }
-
if (ret != BTRFS_NO_LOG_SYNC) {
if (!ret) {
ret = btrfs_sync_log(trans, root, );
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 7ab9bb88a639..a1e14ee26c54 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -15,7 +15,6 @@
 struct btrfs_log_ctx {
int log_ret;
int log_transid;
-   int io_err;
bool log_new_dentries;
struct inode *inode;
struct list_head list;
@@ -26,7 +25,6 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx 
*ctx,
 {
ctx->log_ret = 0;
ctx->log_transid = 0;
-   ctx->io_err = 0;
ctx->log_new_dentries = false;
ctx->inode = inode;
INIT_LIST_HEAD(>list);
-- 
2.11.0



[PATCH] Btrfs: fix rare chances for data loss when doing a fast fsync

2018-11-12 Thread fdmanana
From: Filipe Manana 

After the simplification of the fast fsync patch done recently by commit
b5e6c3e170b7 ("btrfs: always wait on ordered extents at fsync time") and
commit e7175a692765 ("btrfs: remove the wait ordered logic in the
log_one_extent path"), we got a very short time window where we can get
extents logged without writeback completing first or extents logged
without logging the respective data checksums. Both issues can only happen
when doing a non-full (fast) fsync.

As soon as we enter btrfs_sync_file() we trigger writeback, then lock the
inode and then wait for the writeback to complete before starting to log
the inode. However before we acquire the inode's lock and after we started
writeback, it's possible that more writes happened and dirtied more pages.
If that happened and those pages get writeback triggered while we are
logging the inode (for example, the VM subsystem triggering it due to
memory pressure, or another concurrent fsync), we end up seeing the
respective extent maps in the inode's list of modified extents and will
log matching file extent items without waiting for the respective
ordered extents to complete, meaning that either of the following will
happen:

1) We log an extent after its writeback finishes but before its checksums
   are added to the csum tree, leading to -EIO errors when attempting to
   read the extent after a log replay.

2) We log an extent before its writeback finishes.
   Therefore after the log replay we will have a file extent item pointing
   to an unwritten extent (and without the respective data checksums as
   well).

This could not happen before the fast fsync patch simplification, because
for any extent we found in the list of modified extents, we would wait for
its respective ordered extent to finish writeback or collect its checksums
for logging if it did not complete yet.

Fix this by triggering writeback again after acquiring the inode's lock
and before waiting for ordered extents to complete.

Fixes: e7175a692765 ("btrfs: remove the wait ordered logic in the 
log_one_extent path")
Fixes: b5e6c3e170b7 ("btrfs: always wait on ordered extents at fsync time")
CC: sta...@vger.kernel.org # 4.19+
Signed-off-by: Filipe Manana 
---
 fs/btrfs/file.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2be00e873e92..6483757f0c09 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2064,6 +2064,30 @@ int btrfs_sync_file(struct file *file, loff_t start, 
loff_t end, int datasync)
atomic_inc(>log_batch);
 
/*
+* Before we acquired the inode's lock, someone may have dirtied more
+* pages in the target range. We need to make sure that writeback for
+* any such pages does not start while we are logging the inode, because
+* if it does, any of the following might happen when we are not doing a
+* full inode sync:
+*
+* 1) We log an extent after its writeback finishes but before its
+*checksums are added to the csum tree, leading to -EIO errors
+*when attempting to read the extent after a log replay.
+*
+* 2) We can end up logging an extent before its writeback finishes.
+*Therefore after the log replay we will have a file extent item
+*pointing to an unwritten extent (and no data checksums as well).
+*
+* So trigger writeback for any eventual new dirty pages and then we
+* wait for all ordered extents to complete below.
+*/
+   ret = start_ordered_ops(inode, start, end);
+   if (ret) {
+   inode_unlock(inode);
+   goto out;
+   }
+
+   /*
 * We have to do this here to avoid the priority inversion of waiting on
 * IO of a lower priority task while holding a transaciton open.
 */
-- 
2.11.0



[PATCH] Btrfs: simpler and more efficient cleanup of a log tree's extent io tree

2018-11-09 Thread fdmanana
From: Filipe Manana 

We currently are in a loop finding each range (corresponding to a btree
node/leaf) in a log root's extent io tree and then clean it up. This is a
waste of time since we are traversing the extent io tree's rb_tree more
times than needed (one for a range lookup and another for cleaning it up)
without any good reason.

We free the log trees when we are in the critical section of a transaction
commit (the transaction state is set to TRANS_STATE_COMMIT_DOING), so it's
of great convenience to do everything as fast as possible in order to
reduce the time we block other tasks from starting a new transaction.

So fix this by traversing the extent io tree once and cleaning up all its
records in one go while traversing it.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/tree-log.c | 16 ++--
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d49edd25f2e5..aac3749f697f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3204,8 +3204,6 @@ static void free_log_tree(struct btrfs_trans_handle 
*trans,
  struct btrfs_root *log)
 {
int ret;
-   u64 start;
-   u64 end;
struct walk_control wc = {
.free = 1,
.process_func = process_one_buffer
@@ -3216,18 +3214,8 @@ static void free_log_tree(struct btrfs_trans_handle 
*trans,
if (ret)
btrfs_abort_transaction(trans, ret);
 
-   while (1) {
-   ret = find_first_extent_bit(>dirty_log_pages,
-   0, , ,
-   EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT,
-   NULL);
-   if (ret)
-   break;
-
-   clear_extent_bits(>dirty_log_pages, start, end,
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
-   }
-
+   clear_extent_bits(>dirty_log_pages, 0, (u64)-1,
+ EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
free_extent_buffer(log->node);
kfree(log);
 }
-- 
2.11.0



[PATCH] Btrfs: do not set log for full commit when creating non-data block groups

2018-11-08 Thread fdmanana
From: Filipe Manana 

When creating a block group we don't need to set the log for full commit
if the new block group is not used for data. Logged items can only point
to logical addresses of data block groups (through file extent items) so
there is no need for the next fsync to fall back to a transaction commit
if the new block group is for metadata.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/extent-tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 577878324799..588fbd1606fb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10112,7 +10112,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle 
*trans, u64 bytes_used,
struct btrfs_block_group_cache *cache;
int ret;
 
-   btrfs_set_log_full_commit(fs_info, trans);
+   if (type & BTRFS_BLOCK_GROUP_DATA)
+   btrfs_set_log_full_commit(fs_info, trans);
 
cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
if (!cache)
-- 
2.11.0



[PATCH] btrfs: fix computation of max fs size for multiple device fs tests

2018-11-06 Thread fdmanana
From: Filipe Manana 

We were sorting numerical values with the 'sort' tool without telling it
that we are sorting numbers, giving us unexpected ordering. So just pass
the '-n' option to the 'sort' tool.

Example:

$ echo -e "11\n9\n20" | sort
11
20
9

$ echo -e "11\n9\n20" | sort -n
9
11
20

Signed-off-by: Filipe Manana 
---
 tests/btrfs/124 | 2 +-
 tests/btrfs/125 | 2 +-
 tests/btrfs/154 | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/btrfs/124 b/tests/btrfs/124
index ce3ad6aa..a52c65f6 100755
--- a/tests/btrfs/124
+++ b/tests/btrfs/124
@@ -61,7 +61,7 @@ dev2=`echo $SCRATCH_DEV_POOL | awk '{print $2}'`
 dev1_sz=`blockdev --getsize64 $dev1`
 dev2_sz=`blockdev --getsize64 $dev2`
 # get min of both
-max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz" | sort | head -1`
+max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz" | sort -n | head -1`
 # Need disks with more than 2G.
 if [ $max_fs_sz -lt 20 ]; then
_scratch_dev_pool_put
diff --git a/tests/btrfs/125 b/tests/btrfs/125
index e38de264..5ac68b67 100755
--- a/tests/btrfs/125
+++ b/tests/btrfs/125
@@ -68,7 +68,7 @@ dev2_sz=`blockdev --getsize64 $dev2`
 dev3_sz=`blockdev --getsize64 $dev3`
 
 # get min of both.
-max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz\n$dev3_sz" | sort | head -1`
+max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz\n$dev3_sz" | sort -n | head -1`
 # Need disks with more than 2G
 if [ $max_fs_sz -lt 20 ]; then
_scratch_dev_pool_put
diff --git a/tests/btrfs/154 b/tests/btrfs/154
index 99ea232a..cd6c688f 100755
--- a/tests/btrfs/154
+++ b/tests/btrfs/154
@@ -51,7 +51,7 @@ DEV1_SZ=`blockdev --getsize64 $DEV1`
 DEV2_SZ=`blockdev --getsize64 $DEV2`
 
 # get min
-MAX_FS_SZ=`echo -e "$DEV1_SZ\n$DEV2_SZ" | sort | head -1`
+MAX_FS_SZ=`echo -e "$DEV1_SZ\n$DEV2_SZ" | sort -n | head -1`
 # Need disks with more than 2G
 if [ $MAX_FS_SZ -lt 20 ]; then
_scratch_dev_pool_put
-- 
2.11.0



[PATCH 3/3] btrfs: add new filter for file cloning error translation

2018-11-05 Thread fdmanana
From: Filipe Manana 

A bug in file cloning/reflinking was recently found that affected both
Btrfs and XFS, which was caused by allowing the cloning of an eof block
into the middle of a file when the eof is not aligned to the filesystem's
block size.

The fix consists of returning the errno -EINVAL to user space when the
arguments passed to the system call lead to the scenario of data
corruption. However this overlaps with some cases where the system call,
in Btrfs, returned -EOPNOTSUPP, which means we are trying to reflink
inline extents. That is unsupported in Btrfs due to the huge complexity
of supporting it (due to copying and trimming inline extents, deal with
eventual compression, etc).

We have a few btrfs test cases that verify that attempts to clone inline
extents result in a failure, and are currently expecting an -EINVAL error
message from the output of the cloner program. So create a filter that
converts error messages related to the -EOPNOTSUPP error to messages
related to the -EINVAL error, so that the test can run both on patched
and non-patched linux kernels.

The corresponding btrfs patch for the linux kernel is titled:

 "Btrfs: fix data corruption due to cloning of eof block"

And the VFS change that introduces the -EINVAL error return was introduced
by the following linux kernel commit (landed in 4.20-rc1):

 07d19dc9fbe9 ("vfs: avoid problematic remapping requests into partial EOF 
block")

The btrfs patch is not yet in Linus' tree (it was submitted around the
same time as this change) and the VFS change was introduced in 4.20-rc1.

Signed-off-by: Filipe Manana 
---
 common/filter.btrfs | 17 +
 tests/btrfs/035 |  3 ++-
 tests/btrfs/035.out |  2 +-
 tests/btrfs/096 |  7 ---
 tests/btrfs/096.out |  2 +-
 tests/btrfs/112 | 25 +
 tests/btrfs/112.out | 48 
 tests/btrfs/113 |  4 +++-
 tests/btrfs/113.out |  2 +-
 9 files changed, 70 insertions(+), 40 deletions(-)

diff --git a/common/filter.btrfs b/common/filter.btrfs
index dda85776..d4169cc6 100644
--- a/common/filter.btrfs
+++ b/common/filter.btrfs
@@ -97,5 +97,22 @@ _filter_btrfs_qgroup_assign_warnings()
-e "/quotas may be inconsistent, rescan needed/d"
 }
 
+# Long ago we found that attempting to clone inline extents resulted in hitting
+# a BUG_ON() and then decided to not support such use cases by returning errno
+# -EOPNOTSUPP to user space. Later on, clone/reflink became a VFS API too, 
since
+# other filesystems (such as XFS) implemented this feature. After that we found
+# one scenario of data corruption due to allowing cloning an EOF block into the
+# middle of a file, and started to reject such scenario by returning the errno
+# -EINVAL to user space (this affected both Btrfs and XFS). Such scenario often
+# overlaps the detection of attempts to clone inline extents, since it is done
+# early on based only on the arguments passed to the clone system call (and
+# btrfs' specific ioctl) before processing the source file extents.
+# So replace error messages related to errno -EOPNOTSUPP to be the same as the
+# one we get from a -EINVAL errno.
+_filter_btrfs_cloner_error()
+{
+   sed -e "s/\(clone failed:\) Operation not supported/\1 Invalid 
argument/g"
+}
+
 # make sure this script returns success
 /bin/true
diff --git a/tests/btrfs/035 b/tests/btrfs/035
index c9c09e16..a6f67d4f 100755
--- a/tests/btrfs/035
+++ b/tests/btrfs/035
@@ -24,6 +24,7 @@ trap "_cleanup ; exit \$status" 0 1 2 3 15
 # get standard environment, filters and checks
 . ./common/rc
 . ./common/filter
+. ./common/filter.btrfs
 
 # real QA test starts here
 _supported_fs btrfs
@@ -49,7 +50,7 @@ $CLONER_PROG $SCRATCH_MNT/src $SCRATCH_MNT/src.clone2
 snap_src_sz=`ls -lah $SCRATCH_MNT/src.clone1 | awk '{print $5}'`
 echo "attempting ioctl (src.clone1 src)"
 $CLONER_PROG -s 0 -d 0 -l ${snap_src_sz} \
-   $SCRATCH_MNT/src.clone1 $SCRATCH_MNT/src
+   $SCRATCH_MNT/src.clone1 $SCRATCH_MNT/src | _filter_btrfs_cloner_error
 
 # The clone operation should have failed. If it did not it meant we had data
 # loss, because file "src.clone1" has an inline extent which is 10 bytes long
diff --git a/tests/btrfs/035.out b/tests/btrfs/035.out
index 3ea7d779..d810bb2b 100644
--- a/tests/btrfs/035.out
+++ b/tests/btrfs/035.out
@@ -1,6 +1,6 @@
 QA output created by 035
 attempting ioctl (src.clone1 src)
-clone failed: Operation not supported
+clone failed: Invalid argument
 File src data after attempt to clone from src.clone1 into src:
 000 62 62 62 62 62 62 62 62 62 62 63 63 63 63 63 63
 020 63 63 63 63
diff --git a/tests/btrfs/096 b/tests/btrfs/096
index e8552947..b9188e6e 100755
--- a/tests/btrfs/096
+++ b/tests/btrfs/096
@@ -21,6 +21,7 @@ _cleanup()
 # get standard environment, filters and checks
 . ./common/rc
 . ./common/filter
+. ./common/filter.btrfs
 
 # real QA test starts here
 _supported_fs btrfs
@@ -52,11 +53,11 @@ $XFS_IO_PROG -f -s 

[PATCH 2/3] generic: test attempt to reflink eof block into the middle of a file

2018-11-05 Thread fdmanana
From: Filipe Manana 

Test that we can not clone a range from a file A into the middle of a file B
when the range includes the last block of file A and file A's size is not
aligned with the filesystem's block size. Allowing such case would lead to
data corruption since the data between EOF and the end of its block is
undefined.

This is motivated by a bug recently found that affects both Btrfs and XFS
and is fixed by the following commits/patches for the linux kernel:

 07d19dc9fbe9 ("vfs: avoid problematic remapping requests into partial EOF 
block")
 b39989009bdb ("xfs: fix data corruption w/ unaligned reflink ranges")
 Btrfs: fix data corruption due to cloning of eof block

The VFS patch landed in kernel 4.20-rc1 and the XFS patch landed in 4.19.
The Btrfs fix is very recent and it is not yet in Linus' tree.

Signed-off-by: Filipe Manana 
---
 tests/generic/518 | 60 +++
 tests/generic/518.out | 10 +
 tests/generic/group   |  1 +
 3 files changed, 71 insertions(+)
 create mode 100755 tests/generic/518
 create mode 100644 tests/generic/518.out

diff --git a/tests/generic/518 b/tests/generic/518
new file mode 100755
index ..c75110d1
--- /dev/null
+++ b/tests/generic/518
@@ -0,0 +1,60 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 518
+#
+# Test that we can not clone a range from a file A into the middle of a file B
+# when the range includes the last block of file A and file A's size is not
+# aligned with the filesystem's block size. Allowing such case would lead to
+# data corruption since the data between EOF and the end of its block is
+# undefined.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/reflink
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch_reflink
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_scratch_mount
+
+foo_size=$((256 * 1024 + 100)) # 256Kb + 100 bytes
+bar_size="1M"
+
+$XFS_IO_PROG -f -c "pwrite -S 0x3c 0 $foo_size" $SCRATCH_MNT/foo | 
_filter_xfs_io
+$XFS_IO_PROG -f -c "pwrite -S 0xb5 0 $bar_size" $SCRATCH_MNT/bar | 
_filter_xfs_io
+
+# Cloning the EOF block of a file into the middle of another file should fail
+# with an invalid argument error.
+$XFS_IO_PROG -c "reflink $SCRATCH_MNT/foo 0 512K $foo_size" $SCRATCH_MNT/bar
+
+# Unmount the filesystem and mount it again. This guarantees any file data in
+# the page cache is dropped.
+_scratch_cycle_mount
+
+# Verify no changes were made to the file.
+echo "File content after failed reflink:"
+od -A d -t x1 $SCRATCH_MNT/bar
+
+status=0
+exit
diff --git a/tests/generic/518.out b/tests/generic/518.out
new file mode 100644
index ..726c2073
--- /dev/null
+++ b/tests/generic/518.out
@@ -0,0 +1,10 @@
+QA output created by 518
+wrote 262244/262244 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1048576/1048576 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+XFS_IOC_CLONE_RANGE: Invalid argument
+File content after failed reflink:
+000 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5
+*
+1048576
diff --git a/tests/generic/group b/tests/generic/group
index 326d3a1d..ef24f578 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -520,3 +520,4 @@
 515 auto quick clone
 516 auto quick dedupe clone
 517 auto quick dedupe clone
+518 auto quick clone
-- 
2.11.0



[PATCH 1/3] generic: test attempt to dedup eof block into the middle of a file

2018-11-05 Thread fdmanana
From: Filipe Manana 

Test that deduplication of an entire file that has a size that is not
aligned to the filesystem's block size into the middle of a different
file does not corrupt the destination's file data by reflinking the last
(eof) block.

This test is motivated by a bug recently found that affects both Btrfs
and XFS, and is fixed by the following commits/patches for the linux
kernel:

 07d19dc9fbe9 ("vfs: avoid problematic remapping requests into partial EOF 
block")
 dceeb47b0ed6 ("xfs: fix data corruption w/ unaligned dedupe ranges")
 de02b9f6bb65 ("Btrfs: fix data corruption when deduplicating between different 
files")
 Btrfs: fix infinite loop on inode eviction after deduplication of eof block

The VFS patch was added to kernel 4.20-rc1 and the XFS and first Btrfs
patches were added to kernel 4.19. The second patch for Btrfs is very
recent and it is not yet in Linus' tree.

Signed-off-by: Filipe Manana 
---
 tests/generic/517 | 98 +++
 tests/generic/517.out | 45 +++
 tests/generic/group   |  1 +
 3 files changed, 144 insertions(+)
 create mode 100755 tests/generic/517
 create mode 100644 tests/generic/517.out

diff --git a/tests/generic/517 b/tests/generic/517
new file mode 100755
index ..601bb24e
--- /dev/null
+++ b/tests/generic/517
@@ -0,0 +1,98 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 517
+#
+# Test that deduplication of an entire file that has a size that is not aligned
+# to the filesystem's block size into the middle of a different file does not
+# corrupt the destination's file data by reflinking the last (eof) block.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/reflink
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch_dedupe
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_scratch_mount
+
+# The first byte with a value of 0xae starts at an offset (2518890) which is 
not
+# a multiple of the block size.
+$XFS_IO_PROG -f \
+   -c "pwrite -S 0x6b 0 2518890" \
+   -c "pwrite -S 0xae 2518890 102398" \
+   $SCRATCH_MNT/foo | _filter_xfs_io
+
+# Create a second file with a length not aligned to the block size, whose bytes
+# all have the value 0x6b, so that its extent(s) can be deduplicated with the
+# first file.
+$XFS_IO_PROG -f -c "pwrite -S 0x6b 0 557771" $SCRATCH_MNT/bar | _filter_xfs_io
+
+# The file is filled with bytes having the value 0x6b from offset 0 to offset
+# 2518889 and with the value 0xae from offset 2518890 to offset 2621287.
+echo "File content before first deduplication:"
+od -t x1 $SCRATCH_MNT/foo
+
+# Now deduplicate the entire second file into a range of the first file that
+# also has all bytes with the value 0x6b. The destination range's end offset
+# must not be aligned to the block size and must be less then the offset of
+# the first byte with the value 0xae (byte at offset 2518890).
+$XFS_IO_PROG -c "dedupe $SCRATCH_MNT/bar 0 1957888 557771" $SCRATCH_MNT/foo \
+   | _filter_xfs_io
+
+# We should have exactly the same data we had before we asked for 
deduplication.
+echo "File content after first deduplication and before unmounting:"
+od -A d -t x1 $SCRATCH_MNT/foo
+
+# Unmount the filesystem and mount it again. This guarantees any file data in
+# the page cache is dropped.
+_scratch_cycle_mount
+
+# We should have exactly the same data we had before we asked for 
deduplication.
+echo "File content after first unmount:"
+od -A d -t x1 $SCRATCH_MNT/foo
+
+# Now do a similar test when trying to dedup just the last (eof) block of a 
file
+# into the middle of another file. This triggered a different bug on btrfs.
+$XFS_IO_PROG -f -c "pwrite -S 0xae 0 100" $SCRATCH_MNT/baz | _filter_xfs_io
+
+# Unmount the filesystem and mount it again before attempting to dedupe baz's
+# last block into foo. This is necessary to trigger that btrfs bug mentioned
+# before.
+_scratch_cycle_mount
+
+# Now attempt to dedupe the single block of baz into foo.
+$XFS_IO_PROG -c "dedupe $SCRATCH_MNT/baz 0 2519040 100" $SCRATCH_MNT/foo \
+| _filter_xfs_io
+
+# Now attempt to unmount the filesystem before reading from the file. This is
+# meant to trigger the btrfs bug which caused an infinite loop during inode
+# eviction.
+_scratch_cycle_mount
+
+# We should have exactly the same data we had before we asked for 
deduplication.
+echo "File content after second deduplication:"
+od -A d -t x1 $SCRATCH_MNT/foo
+
+status=0
+exit
diff --git a/tests/generic/517.out b/tests/generic/517.out
new file mode 100644
index ..137a9719
--- /dev/null
+++ 

[PATCH] Btrfs: fix data corruption due to cloning of eof block

2018-11-05 Thread fdmanana
From: Filipe Manana 

We currently allow cloning a range from a file which includes the last
block of the file even if the file's size is not aligned to the block
size. This is fine and useful when the destination file has the same size,
but when it does not and the range ends somewhere in the middle of the
destination file, it leads to corruption because the bytes between the EOF
and the end of the block have undefined data (when there is support for
discard/trimming they have a value of 0x00).

Example:

 $ mkfs.btrfs -f /dev/sdb
 $ mount /dev/sdb /mnt

 $ export foo_size=$((256 * 1024 + 100))
 $ xfs_io -f -c "pwrite -S 0x3c 0 $foo_size" /mnt/foo
 $ xfs_io -f -c "pwrite -S 0xb5 0 1M" /mnt/bar

 $ xfs_io -c "reflink /mnt/foo 0 512K $foo_size" /mnt/bar

 $ od -A d -t x1 /mnt/bar
 000 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5
 *
 0524288 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c
 *
 0786528 3c 3c 3c 3c 00 00 00 00 00 00 00 00 00 00 00 00
 0786544 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 *
 0790528 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5
 *
 1048576

The bytes in the range from 786532 (512Kb + 256Kb + 100 bytes) to 790527
(512Kb + 256Kb + 4Kb - 1) got corrupted, having now a value of 0x00 instead
of 0xb5.

This is similar to the problem we had for deduplication that got recently
fixed by commit de02b9f6bb65 ("Btrfs: fix data corruption when
deduplicating between different files").

Fix this by not allowing such operations to be performed and return the
errno -EINVAL to user space. This is what XFS is doing as well at the VFS
level. This change however now makes us return -EINVAL instead of
-EOPNOTSUPP for cases where the source range maps to an inline extent and
the destination range's end is smaller than the destination file's size,
since the detection of inline extents is done during the actual process of
dropping file extent items (at __btrfs_drop_extents()). Returning the
-EINVAL error is done early on and solely based on the input parameters
(offsets and length) and destination file's size. This makes us consistent
with XFS and anyone else supporting cloning since this case is now checked
at a higher level in the VFS and is where the -EINVAL will be returned
from starting with kernel 4.20 (the VFS change was introduced in 4.20-rc1
by commit 07d19dc9fbe9 ("vfs: avoid problematic remapping requests into
partial EOF block"). So this change is more geared towards stable kernels,
as it's unlikely the new VFS checks get removed intentionally.

A test case for fstests follows soon, as well as an update to filter
existing tests that expect -EOPNOTSUPP to accept -EINVAL as well.

CC:  # 4.4+
Signed-off-by: Filipe Manana 
---
 fs/btrfs/ioctl.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f3134fc69880..30e098970063 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4277,9 +4277,17 @@ static noinline int btrfs_clone_files(struct file *file, 
struct file *file_src,
goto out_unlock;
if (len == 0)
olen = len = src->i_size - off;
-   /* if we extend to eof, continue to block boundary */
-   if (off + len == src->i_size)
+   /*
+* If we extend to eof, continue to block boundary if and only if the
+* destination end offset matches the destination file's size, otherwise
+* we would be corrupting data by placing the eof block into the middle
+* of a file.
+*/
+   if (off + len == src->i_size) {
+   if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size)
+   goto out_unlock;
len = ALIGN(src->i_size, bs) - off;
+   }
 
if (len == 0) {
ret = 0;
-- 
2.11.0



[PATCH] Btrfs: fix infinite loop on inode eviction after deduplication of eof block

2018-11-05 Thread fdmanana
From: Filipe Manana 

If we attempt to deduplicate the last block of a file A into the middle of
a file B, and file A's size is not a multiple of the block size, we end up
rounding the deduplication length to 0 bytes, to avoid the data corruption
issue fixed by commit de02b9f6bb65 ("Btrfs: fix data corruption when
deduplicating between different files"). However a length of zero will
cause the insertion of an extent state with a start value greater (by 1)
than the end value, leading to a corrupt extent state that will trigger a
warning and cause chaos such as an infinite loop during inode eviction.
Example trace:

 [96049.833585] [ cut here ]
 [96049.833714] WARNING: CPU: 0 PID: 24448 at fs/btrfs/extent_io.c:436 
insert_state+0x101/0x120 [btrfs]
 [96049.833767] CPU: 0 PID: 24448 Comm: xfs_io Not tainted 
4.19.0-rc7-btrfs-next-39 #1
 [96049.833768] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
 [96049.833780] RIP: 0010:insert_state+0x101/0x120 [btrfs]
 [96049.833783] RSP: 0018:afd2c3707af0 EFLAGS: 00010282
 [96049.833785] RAX:  RBX: 0004dfff RCX: 
0006
 [96049.833786] RDX: 0007 RSI: 99045c143230 RDI: 
99047b2168a0
 [96049.833787] RBP: 990457851cd0 R08: 0001 R09: 

 [96049.833787] R10: afd2c3707ab8 R11:  R12: 
9903b93b12c8
 [96049.833788] R13: 0004e000 R14: afd2c3707b80 R15: 
afd2c3707b78
 [96049.833790] FS:  7f5c14e7d700() GS:99047b20() 
knlGS:
 [96049.833791] CS:  0010 DS:  ES:  CR0: 80050033
 [96049.833792] CR2: 7f5c146abff8 CR3: 000115f4c004 CR4: 
003606f0
 [96049.833795] DR0:  DR1:  DR2: 

 [96049.833796] DR3:  DR6: fffe0ff0 DR7: 
0400
 [96049.833796] Call Trace:
 [96049.833809]  __set_extent_bit+0x46c/0x6a0 [btrfs]
 [96049.833823]  lock_extent_bits+0x6b/0x210 [btrfs]
 [96049.833831]  ? _raw_spin_unlock+0x24/0x30
 [96049.833841]  ? test_range_bit+0xdf/0x130 [btrfs]
 [96049.833853]  lock_extent_range+0x8e/0x150 [btrfs]
 [96049.833864]  btrfs_double_extent_lock+0x78/0xb0 [btrfs]
 [96049.833875]  btrfs_extent_same_range+0x14e/0x550 [btrfs]
 [96049.833885]  ? rcu_read_lock_sched_held+0x3f/0x70
 [96049.833890]  ? __kmalloc_node+0x2b0/0x2f0
 [96049.833899]  ? btrfs_dedupe_file_range+0x19a/0x280 [btrfs]
 [96049.833909]  btrfs_dedupe_file_range+0x270/0x280 [btrfs]
 [96049.833916]  vfs_dedupe_file_range_one+0xd9/0xe0
 [96049.833919]  vfs_dedupe_file_range+0x131/0x1b0
 [96049.833924]  do_vfs_ioctl+0x272/0x6e0
 [96049.833927]  ? __fget+0x113/0x200
 [96049.833931]  ksys_ioctl+0x70/0x80
 [96049.833933]  __x64_sys_ioctl+0x16/0x20
 [96049.833937]  do_syscall_64+0x60/0x1b0
 [96049.833939]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
 [96049.833941] RIP: 0033:0x7f5c1478ddd7
 [96049.833943] RSP: 002b:7ffe15b196a8 EFLAGS: 0202 ORIG_RAX: 
0010
 [96049.833945] RAX: ffda RBX:  RCX: 
7f5c1478ddd7
 [96049.833946] RDX: 5625ece322d0 RSI: c0189436 RDI: 
0004
 [96049.833947] RBP:  R08: 7f5c14a46f48 R09: 
0040
 [96049.833948] R10: 0541 R11: 0202 R12: 

 [96049.833949] R13:  R14: 0004 R15: 
5625ece322d0
 [96049.833954] irq event stamp: 6196
 [96049.833956] hardirqs last  enabled at (6195): [] 
console_unlock+0x503/0x640
 [96049.833958] hardirqs last disabled at (6196): [] 
trace_hardirqs_off_thunk+0x1a/0x1c
 [96049.833959] softirqs last  enabled at (6114): [] 
__do_softirq+0x370/0x421
 [96049.833964] softirqs last disabled at (6095): [] 
irq_exit+0xcd/0xe0
 [96049.833965] ---[ end trace db7b05f01b7fa10c ]---
 [96049.935816] R13:  R14: 5562e5259240 R15: 
7092b910
 [96049.935822] irq event stamp: 6584
 [96049.935823] hardirqs last  enabled at (6583): [] 
console_unlock+0x503/0x640
 [96049.935825] hardirqs last disabled at (6584): [] 
trace_hardirqs_off_thunk+0x1a/0x1c
 [96049.935827] softirqs last  enabled at (6328): [] 
__do_softirq+0x370/0x421
 [96049.935828] softirqs last disabled at (6313): [] 
irq_exit+0xcd/0xe0
 [96049.935829] ---[ end trace db7b05f01b7fa123 ]---
 [96049.935840] [ cut here ]
 [96049.936065] WARNING: CPU: 1 PID: 24463 at fs/btrfs/extent_io.c:436 
insert_state+0x101/0x120 [btrfs]
 [96049.936107] CPU: 1 PID: 24463 Comm: umount Tainted: GW 
4.19.0-rc7-btrfs-next-39 #1
 [96049.936108] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
 [96049.936117] RIP: 0010:insert_state+0x101/0x120 [btrfs]
 [96049.936119] RSP: 0018:afd2c3637bc0 EFLAGS: 00010282
 [96049.936120] RAX:  RBX: 0004dfff RCX: 

[PATCH] fstests: fix fssum to actually ignore file holes when supposed to

2018-10-29 Thread fdmanana
From: Filipe Manana 

Unless the '-s' option is passed to fssum, it should not detect file holes
and have their existence influence the computed checksum for a file. This
tool was added to test btrfs' send/receive feature, so that it checks for
any metadata and data differences between the original filesystem and the
filesystem that receives send streams.

For a long time the test btrfs/007, which tests btrfs' send/receive with
fsstress, fails sporadically reporting data differences between files.
However the md5sum/sha1sum from the reported files in the original and
new filesystems are the same. The reason why fssum fails is because even
in normal mode it still accounts for number of holes that exist in the
file and their respective lengths. This is done using the SEEK_DATA mode
of lseek. The btrfs send feature does not preserve holes nor prealloc
extents (not supported by the current protocol), so whenever a hole or
prealloc (unwritten) extent is detected in the source filesystem, it
issues a write command full of zeroes, which will translate to a regular
(written) extent in the destination filesystem. This is why fssum reports
a different checksum. A prealloc extent also counts as hole when using
lseek.

For example when passing a seed of 1540592967 to fsstress in btrfs/007,
the test fails, as file p0/d0/f7 has a prealloc extent in the original
filesystem (in the incr snapshot).

Fix this by making fssum just read the whole file and feed its data to the
digest calculation function when option '-s' is not given. If we ever get
btrfs' send/receive to support holes and fallocate, we can just change
the test and pass the '-s' option to all fssum calls.

Signed-off-by: Filipe Manana 
---
 src/fssum.c | 65 +
 1 file changed, 5 insertions(+), 60 deletions(-)

diff --git a/src/fssum.c b/src/fssum.c
index 5da39abf..f1da72fb 100644
--- a/src/fssum.c
+++ b/src/fssum.c
@@ -224,71 +224,16 @@ int
 sum_file_data_permissive(int fd, sum_t *dst)
 {
int ret;
-   off_t pos;
-   off_t old;
-   int i;
-   uint64_t zeros = 0;
-
-   pos = lseek(fd, 0, SEEK_CUR);
-   if (pos == (off_t)-1)
-   return errno == ENXIO ? 0 : -2;
 
while (1) {
-   old = pos;
-   pos = lseek(fd, pos, SEEK_DATA);
-   if (pos == (off_t)-1) {
-   if (errno == ENXIO) {
-   ret = 0;
-   pos = lseek(fd, 0, SEEK_END);
-   if (pos != (off_t)-1)
-   zeros += pos - old;
-   } else {
-   ret = -2;
-   }
-   break;
-   }
ret = read(fd, buf, sizeof(buf));
-   assert(ret); /* eof found by lseek */
-   if (ret <= 0)
+   if (ret < 0)
+   return -errno;
+   sum_add(dst, buf, ret);
+   if (ret < sizeof(buf))
break;
-   if (old < pos) /* hole */
-   zeros += pos - old;
-   for (i = 0; i < ret; ++i) {
-   for (old = i; buf[i] == 0 && i < ret; ++i)
-   ;
-   if (old < i) /* code like a hole */
-   zeros += i - old;
-   if (i == ret)
-   break;
-   if (zeros) {
-   if (verbose >= 2)
-   fprintf(stderr,
-   "adding %llu zeros to sum\n",
-   (unsigned long long)zeros);
-   sum_add_u64(dst, 0);
-   sum_add_u64(dst, zeros);
-   zeros = 0;
-   }
-   for (old = i; buf[i] != 0 && i < ret; ++i)
-   ;
-   if (verbose >= 2)
-   fprintf(stderr, "adding %u non-zeros to sum\n",
-   i - (int)old);
-   sum_add(dst, buf + old, i - old);
-   }
-   pos += ret;
}
-
-   if (zeros) {
-   if (verbose >= 2)
-   fprintf(stderr,
-   "adding %llu zeros to sum (finishing)\n",
-   (unsigned long long)zeros);
-   sum_add_u64(dst, 0);
-   sum_add_u64(dst, zeros);
-   }
-
-   return ret;
+   return 0;
 }
 
 int
-- 
2.11.0



[PATCH] Btrfs: fix missing data checksums after a ranged fsync (msync)

2018-10-29 Thread fdmanana
From: Filipe Manana 

Recently we got a massive simplification for fsync, where for the fast
path we no longer log new extents while their respective ordered extents
are still running.

However that simplification introduced a subtle regression for the case
where we use a ranged fsync (msync). Consider the following example:

   CPU 0CPU 1

mmap write to range [2Mb, 4Mb[
  mmap write to range [512Kb, 1Mb[
  msync range [512K, 1Mb[
--> triggers fast fsync
(BTRFS_INODE_NEEDS_FULL_SYNC
 not set)
--> creates extent map A for this
range and adds it to list of
modified extents
--> starts ordered extent A for
this range
--> waits for it to complete

writeback triggered for range
[2Mb, 4Mb[
  --> create extent map B and
  adds it to the list of
  modified extents
  --> creates ordered extent B

--> start looking for and logging
modified extents
--> logs extent maps A and B
--> finds checksums for extent A
in the csum tree, but not for
extent B
  fsync (msync) finishes

  --> ordered extent B
  finishes and its
  checksums are added
  to the csum tree



After replaying the log, we have the extent covering the range [2Mb, 4Mb[
but do not have the data checksum items covering that file range.

This happens because at the very beginning of an fsync (btrfs_sync_file())
we start and wait for IO in the given range [512Kb, 1Mb[ and therefore
wait for any ordered extents in that range to complete before we start
logging the extents. However if right before we start logging the extent
in our range [512Kb, 1Mb[, writeback is started for any other dirty range,
such as the range [2Mb, 4Mb[ due to memory pressure or a concurrent fsync
or msync (btrfs_sync_file() starts writeback before acquiring the inode's
lock), an ordered extent is created for that other range and a new extent
map is created to represent that range and added to the inode's list of
modified extents.

That means that we will see that other extent in that list when collecting
extents for logging (done at btrfs_log_changed_extents()) and log the
extent before the respective ordered extent finishes - namely before the
checksum items are added to the checksums tree, which is where
log_extent_csums() looks for the checksums, therefore making us log an
extent without logging its checksums. Before that massive simplification
of fsync, this wasn't a problem because besides looking for checksums in
the checksums tree, we also looked for them in any ordered extent still
running.

The consequence of data checksums missing for a file range is that users
attempting to read the affected file range will get -EIO errors and dmesg
reports the following:

 [10188.358136] BTRFS info (device sdc): no csum found for inode 297 start 57344
 [10188.359278] BTRFS warning (device sdc): csum failed root 5 ino 297 off 
57344 csum 0x98f94189 expected csum 0x mirror 1

So fix this by skipping any extents outside of our logging range at
btrfs_log_changed_extents() and leaving them on the list of modified
extents so that any subsequent ranged fsync may collect them if needed.
Also, if we find a hole extent outside of the range, still log it, just
to prevent having gaps between extent items after replaying the log,
otherwise fsck will complain when we are not using the NO_HOLES feature
(fstest btrfs/056 triggers such case).

Fixes: e7175a692765 ("btrfs: remove the wait ordered logic in the 
log_one_extent path")
CC: sta...@vger.kernel.org # 4.19+
Signed-off-by: Filipe Manana 
---
 fs/btrfs/tree-log.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c86c5dd100b2..d49edd25f2e5 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4394,6 +4394,23 @@ static int btrfs_log_changed_extents(struct 
btrfs_trans_handle *trans,
test_gen = root->fs_info->last_trans_committed;
 
list_for_each_entry_safe(em, n, >modified_extents, list) {
+   /*
+* Skip extents outside our logging range. It's important to do
+* it for correctness because if we don't ignore them, we may
+* log them before their ordered extent completes, and therefore
+* we could log them without logging their respective checksums
+* (the checksum items are added to the csum tree at the 

[PATCH] Btrfs: remove no longer used logged range variables when logging extents

2018-10-26 Thread fdmanana
From: Filipe Manana 

The logged_start and logged_end variables, at btrfs_log_changed_extents(),
were added in commit 8c6c592831a0 ("btrfs: log csums for all modified
extents"). However since the recent simplification for fsync, which makes
us wait for all ordered extents to complete before logging extents, we
no longer need those variables. Commit a2120a473a80 ("btrfs: clean up the
left over logged_list usage") forgot to remove them.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/tree-log.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1673dccc76c2..c86c5dd100b2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4383,7 +4383,6 @@ static int btrfs_log_changed_extents(struct 
btrfs_trans_handle *trans,
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = >extent_tree;
-   u64 logged_start, logged_end;
u64 test_gen;
int ret = 0;
int num = 0;
@@ -4393,8 +4392,6 @@ static int btrfs_log_changed_extents(struct 
btrfs_trans_handle *trans,
down_write(>dio_sem);
write_lock(>lock);
test_gen = root->fs_info->last_trans_committed;
-   logged_start = start;
-   logged_end = end;
 
list_for_each_entry_safe(em, n, >modified_extents, list) {
list_del_init(>list);
@@ -4418,11 +4415,6 @@ static int btrfs_log_changed_extents(struct 
btrfs_trans_handle *trans,
em->start >= i_size_read(>vfs_inode))
continue;
 
-   if (em->start < logged_start)
-   logged_start = em->start;
-   if ((em->start + em->len - 1) > logged_end)
-   logged_end = em->start + em->len - 1;
-
/* Need a ref to keep it from getting evicted from cache */
refcount_inc(>refs);
set_bit(EXTENT_FLAG_LOGGING, >flags);
-- 
2.11.0



[PATCH] Btrfs: remove no longer used stuff for tracking pending ordered extents

2018-10-26 Thread fdmanana
From: Filipe Manana 

Tracking pending ordered extents per transaction was introduced in commit
50d9aa99bd35 ("Btrfs: make sure logged extents complete in the current
transaction V3") and later updated in commit 161c3549b45a ("Btrfs: change
how we wait for pending ordered extents").

However now that on fsync we always wait for ordered extents to complete
before logging, done in commit 5636cf7d6dc8 ("btrfs: remove the logged
extents infrastructure"), we no longer need the stuff to track for pending
ordered extents, which was not completely removed in the mentioned commit.
So remove the remaining of the pending ordered extents infrastructure.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/ordered-data.c | 30 --
 fs/btrfs/ordered-data.h |  2 --
 fs/btrfs/transaction.c  | 11 ---
 fs/btrfs/transaction.h  |  2 --
 4 files changed, 45 deletions(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 0c4ef208b8b9..6fde2b2741ef 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -460,7 +460,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
struct btrfs_root *root = btrfs_inode->root;
struct rb_node *node;
-   bool dec_pending_ordered = false;
 
/* This is paired with btrfs_add_ordered_extent. */
spin_lock(_inode->lock);
@@ -477,37 +476,8 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, >flags);
-   if (test_and_clear_bit(BTRFS_ORDERED_PENDING, >flags))
-   dec_pending_ordered = true;
spin_unlock_irq(>lock);
 
-   /*
-* The current running transaction is waiting on us, we need to let it
-* know that we're complete and wake it up.
-*/
-   if (dec_pending_ordered) {
-   struct btrfs_transaction *trans;
-
-   /*
-* The checks for trans are just a formality, it should be set,
-* but if it isn't we don't want to deref/assert under the spin
-* lock, so be nice and check if trans is set, but ASSERT() so
-* if it isn't set a developer will notice.
-*/
-   spin_lock(_info->trans_lock);
-   trans = fs_info->running_transaction;
-   if (trans)
-   refcount_inc(>use_count);
-   spin_unlock(_info->trans_lock);
-
-   ASSERT(trans);
-   if (trans) {
-   if (atomic_dec_and_test(>pending_ordered))
-   wake_up(>pending_wait);
-   btrfs_put_transaction(trans);
-   }
-   }
-
spin_lock(>ordered_extent_lock);
list_del_init(>root_extent_list);
root->nr_ordered_extents--;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 02d813aaa261..b10e6765d88f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -56,8 +56,6 @@ struct btrfs_ordered_sum {
   * the isize. */
 #define BTRFS_ORDERED_TRUNCATED 8 /* Set when we have to truncate an extent */
 
-#define BTRFS_ORDERED_PENDING 9 /* We are waiting for this ordered extent to
- * complete in the current transaction. */
 #define BTRFS_ORDERED_REGULAR 10 /* Regular IO for COW */
 
 struct btrfs_ordered_extent {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3b84f5015029..2fe6c2b1d94b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -232,14 +232,12 @@ static noinline int join_transaction(struct btrfs_fs_info 
*fs_info,
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(_trans->writer_wait);
init_waitqueue_head(_trans->commit_wait);
-   init_waitqueue_head(_trans->pending_wait);
cur_trans->state = TRANS_STATE_RUNNING;
/*
 * One for this trans handle, one so it will live on until we
 * commit the transaction.
 */
refcount_set(_trans->use_count, 2);
-   atomic_set(_trans->pending_ordered, 0);
cur_trans->flags = 0;
cur_trans->start_time = ktime_get_seconds();
 
@@ -1908,13 +1906,6 @@ static inline void btrfs_wait_delalloc_flush(struct 
btrfs_fs_info *fs_info)
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 }
 
-static inline void
-btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
-{
-   wait_event(cur_trans->pending_wait,
-  atomic_read(_trans->pending_ordered) == 0);
-}
-
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 {
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2049,8 +2040,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle 
*trans)
 
btrfs_wait_delalloc_flush(fs_info);
 
-   

[PATCH v4] Btrfs: fix deadlock on tree root leaf when finding free extent

2018-10-24 Thread fdmanana
From: Filipe Manana 

When we are writing out a free space cache, during the transaction commit
phase, we can end up in a deadlock which results in a stack trace like the
following:

 schedule+0x28/0x80
 btrfs_tree_read_lock+0x8e/0x120 [btrfs]
 ? finish_wait+0x80/0x80
 btrfs_read_lock_root_node+0x2f/0x40 [btrfs]
 btrfs_search_slot+0xf6/0x9f0 [btrfs]
 ? evict_refill_and_join+0xd0/0xd0 [btrfs]
 ? inode_insert5+0x119/0x190
 btrfs_lookup_inode+0x3a/0xc0 [btrfs]
 ? kmem_cache_alloc+0x166/0x1d0
 btrfs_iget+0x113/0x690 [btrfs]
 __lookup_free_space_inode+0xd8/0x150 [btrfs]
 lookup_free_space_inode+0x5b/0xb0 [btrfs]
 load_free_space_cache+0x7c/0x170 [btrfs]
 ? cache_block_group+0x72/0x3b0 [btrfs]
 cache_block_group+0x1b3/0x3b0 [btrfs]
 ? finish_wait+0x80/0x80
 find_free_extent+0x799/0x1010 [btrfs]
 btrfs_reserve_extent+0x9b/0x180 [btrfs]
 btrfs_alloc_tree_block+0x1b3/0x4f0 [btrfs]
 __btrfs_cow_block+0x11d/0x500 [btrfs]
 btrfs_cow_block+0xdc/0x180 [btrfs]
 btrfs_search_slot+0x3bd/0x9f0 [btrfs]
 btrfs_lookup_inode+0x3a/0xc0 [btrfs]
 ? kmem_cache_alloc+0x166/0x1d0
 btrfs_update_inode_item+0x46/0x100 [btrfs]
 cache_save_setup+0xe4/0x3a0 [btrfs]
 btrfs_start_dirty_block_groups+0x1be/0x480 [btrfs]
 btrfs_commit_transaction+0xcb/0x8b0 [btrfs]

At cache_save_setup() we need to update the inode item of a block group's
cache which is located in the tree root (fs_info->tree_root), which means
that it may result in COWing a leaf from that tree. If that happens we
need to find a free metadata extent and while looking for one, if we find
a block group which was not cached yet we attempt to load its cache by
calling cache_block_group(). However this function will try to load the
inode of the free space cache, which requires finding the matching inode
item in the tree root - if that inode item is located in the same leaf as
the inode item of the space cache we are updating at cache_save_setup(),
we end up in a deadlock, since we try to obtain a read lock on the same
extent buffer that we previously write locked.

So fix this by using the tree root's commit root when searching for a
block group's free space cache inode item when we are attempting to load
a free space cache. This is safe since block groups once loaded stay in
memory forever, as well as their caches, so after they are first loaded
we will never need to read their inode items again. For new block groups,
once they are created they get their ->cached field set to
BTRFS_CACHE_FINISHED meaning we will not need to read their inode item.

Reported-by: Andrew Nelson 
Link: 
https://lore.kernel.org/linux-btrfs/captelenq9x5kowuq+fa7h1r3nsjg8vyith8+ifjurc_duhh...@mail.gmail.com/
Fixes: 9d66e233c704 ("Btrfs: load free space cache if it exists")
Tested-by: Andrew Nelson 
Signed-off-by: Filipe Manana 
---

V2: Made the solution more generic, since the problem could happen in any
path COWing an extent buffer from the root tree.

Applies on top of a previous patch titled:

 "Btrfs: fix deadlock when writing out free space caches"

V3: Made it more simple by avoiding the atomic from V2 and pass the root
to find_free_extent().

V4: Changed the whole approach so that we lookup for free space cache inode
items using the commit root instead.
The previous approach was causing some transactions to be aborted with
-ENOSPC during umount because sometimes we were skipping cache loading
of all metadata block groups.

 fs/btrfs/ctree.h|  3 +++
 fs/btrfs/free-space-cache.c | 22 +-
 fs/btrfs/inode.c| 32 ++--
 3 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7806a4..2b34b2a05ad6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3177,6 +3177,9 @@ void btrfs_destroy_inode(struct inode *inode);
 int btrfs_drop_inode(struct inode *inode);
 int __init btrfs_init_cachep(void);
 void __cold btrfs_destroy_cachep(void);
+struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key 
*location,
+ struct btrfs_root *root, int *new,
+ struct btrfs_path *path);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 struct btrfs_root *root, int *was_new);
 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 65b79500e09f..7265f35324f6 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -68,7 +68,8 @@ static struct inode *__lookup_free_space_inode(struct 
btrfs_root *root,
btrfs_disk_key_to_cpu(, _key);
btrfs_release_path(path);
 
-   inode = btrfs_iget(fs_info->sb, , root, NULL);
+   inode = btrfs_iget_path(fs_info->sb, , root, NULL, path);
+   btrfs_release_path(path);
if (IS_ERR(inode))
return inode;
 
@@ -830,6 +831,25 @@ int load_free_space_cache(struct 

[PATCH v3] Btrfs: fix deadlock on tree root leaf when finding free extent

2018-10-22 Thread fdmanana
From: Filipe Manana 

When we are writing out a free space cache, during the transaction commit
phase, we can end up in a deadlock which results in a stack trace like the
following:

 schedule+0x28/0x80
 btrfs_tree_read_lock+0x8e/0x120 [btrfs]
 ? finish_wait+0x80/0x80
 btrfs_read_lock_root_node+0x2f/0x40 [btrfs]
 btrfs_search_slot+0xf6/0x9f0 [btrfs]
 ? evict_refill_and_join+0xd0/0xd0 [btrfs]
 ? inode_insert5+0x119/0x190
 btrfs_lookup_inode+0x3a/0xc0 [btrfs]
 ? kmem_cache_alloc+0x166/0x1d0
 btrfs_iget+0x113/0x690 [btrfs]
 __lookup_free_space_inode+0xd8/0x150 [btrfs]
 lookup_free_space_inode+0x5b/0xb0 [btrfs]
 load_free_space_cache+0x7c/0x170 [btrfs]
 ? cache_block_group+0x72/0x3b0 [btrfs]
 cache_block_group+0x1b3/0x3b0 [btrfs]
 ? finish_wait+0x80/0x80
 find_free_extent+0x799/0x1010 [btrfs]
 btrfs_reserve_extent+0x9b/0x180 [btrfs]
 btrfs_alloc_tree_block+0x1b3/0x4f0 [btrfs]
 __btrfs_cow_block+0x11d/0x500 [btrfs]
 btrfs_cow_block+0xdc/0x180 [btrfs]
 btrfs_search_slot+0x3bd/0x9f0 [btrfs]
 btrfs_lookup_inode+0x3a/0xc0 [btrfs]
 ? kmem_cache_alloc+0x166/0x1d0
 btrfs_update_inode_item+0x46/0x100 [btrfs]
 cache_save_setup+0xe4/0x3a0 [btrfs]
 btrfs_start_dirty_block_groups+0x1be/0x480 [btrfs]
 btrfs_commit_transaction+0xcb/0x8b0 [btrfs]

At cache_save_setup() we need to update the inode item of a block group's
cache which is located in the tree root (fs_info->tree_root), which means
that it may result in COWing a leaf from that tree. If that happens we
need to find a free metadata extent and while looking for one, if we find
a block group which was not cached yet we attempt to load its cache by
calling cache_block_group(). However this function will try to load the
inode of the free space cache, which requires finding the matching inode
item in the tree root - if that inode item is located in the same leaf as
the inode item of the space cache we are updating at cache_save_setup(),
we end up in a deadlock, since we try to obtain a read lock on the same
extent buffer that we previously write locked.

So fix this by skipping the loading of free space caches of any block
groups that are not yet cached (rare cases) if we are COWing an extent
buffer from the root tree and space caching is enabled (-o space_cache
mount option). This is a rare case and its downside is failure to
find a free extent (return -ENOSPC) when all the already cached block
groups have no free extents.

Reported-by: Andrew Nelson 
Link: 
https://lore.kernel.org/linux-btrfs/captelenq9x5kowuq+fa7h1r3nsjg8vyith8+ifjurc_duhh...@mail.gmail.com/
Fixes: 9d66e233c704 ("Btrfs: load free space cache if it exists")
Tested-by: Andrew Nelson 
Signed-off-by: Filipe Manana 
---

V2: Made the solution more generic, since the problem could happen in any
path COWing an extent buffer from the root tree.

Applies on top of a previous patch titled:

 "Btrfs: fix deadlock when writing out free space caches"

V3: Made it more simple by avoiding the atomic from V2 and pass the root
to find_free_extent().

 fs/btrfs/extent-tree.c | 22 ++
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 577878324799..e5fd086799ab 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7218,12 +7218,13 @@ btrfs_release_block_group(struct 
btrfs_block_group_cache *cache,
  * the free space extent currently.
  */
 static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
+   struct btrfs_root *root,
u64 ram_bytes, u64 num_bytes, u64 empty_size,
u64 hint_byte, struct btrfs_key *ins,
u64 flags, int delalloc)
 {
int ret = 0;
-   struct btrfs_root *root = fs_info->extent_root;
+   struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
u64 search_start = 0;
@@ -7366,7 +7367,20 @@ static noinline int find_free_extent(struct 
btrfs_fs_info *fs_info,
 
 have_block_group:
cached = block_group_cache_done(block_group);
-   if (unlikely(!cached)) {
+   /*
+* If we are COWing a leaf/node from the root tree, we can not
+* start caching of a block group because we could deadlock on
+* an extent buffer of the root tree.
+* Because if we are COWing a leaf from the root tree, we are
+* holding a write lock on the respective extent buffer, and
+* loading the space cache of a block group requires searching
+* for its inode item in the root tree, which can be located
+* in the same leaf that we previously write locked, in which
+* case we will deadlock.
+*/
+   if (unlikely(!cached) &&
+   (root != 

[PATCH v2] Btrfs: fix deadlock on tree root leaf when finding free extent

2018-10-22 Thread fdmanana
From: Filipe Manana 

When we are writing out a free space cache, during the transaction commit
phase, we can end up in a deadlock which results in a stack trace like the
following:

 schedule+0x28/0x80
 btrfs_tree_read_lock+0x8e/0x120 [btrfs]
 ? finish_wait+0x80/0x80
 btrfs_read_lock_root_node+0x2f/0x40 [btrfs]
 btrfs_search_slot+0xf6/0x9f0 [btrfs]
 ? evict_refill_and_join+0xd0/0xd0 [btrfs]
 ? inode_insert5+0x119/0x190
 btrfs_lookup_inode+0x3a/0xc0 [btrfs]
 ? kmem_cache_alloc+0x166/0x1d0
 btrfs_iget+0x113/0x690 [btrfs]
 __lookup_free_space_inode+0xd8/0x150 [btrfs]
 lookup_free_space_inode+0x5b/0xb0 [btrfs]
 load_free_space_cache+0x7c/0x170 [btrfs]
 ? cache_block_group+0x72/0x3b0 [btrfs]
 cache_block_group+0x1b3/0x3b0 [btrfs]
 ? finish_wait+0x80/0x80
 find_free_extent+0x799/0x1010 [btrfs]
 btrfs_reserve_extent+0x9b/0x180 [btrfs]
 btrfs_alloc_tree_block+0x1b3/0x4f0 [btrfs]
 __btrfs_cow_block+0x11d/0x500 [btrfs]
 btrfs_cow_block+0xdc/0x180 [btrfs]
 btrfs_search_slot+0x3bd/0x9f0 [btrfs]
 btrfs_lookup_inode+0x3a/0xc0 [btrfs]
 ? kmem_cache_alloc+0x166/0x1d0
 btrfs_update_inode_item+0x46/0x100 [btrfs]
 cache_save_setup+0xe4/0x3a0 [btrfs]
 btrfs_start_dirty_block_groups+0x1be/0x480 [btrfs]
 btrfs_commit_transaction+0xcb/0x8b0 [btrfs]

At cache_save_setup() we need to update the inode item of a block group's
cache which is located in the tree root (fs_info->tree_root), which means
that it may result in COWing a leaf from that tree. If that happens we
need to find a free metadata extent and while looking for one, if we find
a block group which was not cached yet we attempt to load its cache by
calling cache_block_group(). However this function will try to load the
inode of the free space cache, which requires finding the matching inode
item in the tree root - if that inode item is located in the same leaf as
the inode item of the space cache we are updating at cache_save_setup(),
we end up in a deadlock, since we try to obtain a read lock on the same
extent buffer that we previously write locked.

So fix this by skipping the loading of free space caches of any block
groups that are not yet cached (rare cases) if we are COWing an extent
buffer from the root tree and space caching is enabled (-o space_cache
mount option). This is a rare case and its downside is failure to
find a free extent (return -ENOSPC) when all the already cached block
groups have no free extents.

Reported-by: Andrew Nelson 
Link: 
https://lore.kernel.org/linux-btrfs/captelenq9x5kowuq+fa7h1r3nsjg8vyith8+ifjurc_duhh...@mail.gmail.com/
Fixes: 9d66e233c704 ("Btrfs: load free space cache if it exists")
Tested-by: Andrew Nelson 
Signed-off-by: Filipe Manana 
---

V2: Made the solution more generic, since the problem could happen in any
path COWing an extent buffer from the root tree.

Applies on top of a previous patch titled:

 "Btrfs: fix deadlock when writing out free space caches"

 fs/btrfs/ctree.c   |  4 
 fs/btrfs/ctree.h   |  3 +++
 fs/btrfs/disk-io.c |  2 ++
 fs/btrfs/extent-tree.c | 15 ++-
 4 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 089b46c4d97f..646aafda55a3 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1065,10 +1065,14 @@ static noinline int __btrfs_cow_block(struct 
btrfs_trans_handle *trans,
root == fs_info->chunk_root ||
root == fs_info->dev_root)
trans->can_flush_pending_bgs = false;
+   else if (root == fs_info->tree_root)
+   atomic_inc(_info->tree_root_cows);
 
cow = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, _key, level,
search_start, empty_size);
+   if (root == fs_info->tree_root)
+   atomic_dec(_info->tree_root_cows);
trans->can_flush_pending_bgs = true;
if (IS_ERR(cow))
return PTR_ERR(cow);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7806a4..1b73433c69e2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1121,6 +1121,9 @@ struct btrfs_fs_info {
u32 sectorsize;
u32 stripesize;
 
+   /* Number of tasks corrently COWing a leaf/node from the tree root. */
+   atomic_t tree_root_cows;
+
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 05dc3c17cb62..08c15bf69fb5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2782,6 +2782,8 @@ int open_ctree(struct super_block *sb,
fs_info->sectorsize = 4096;
fs_info->stripesize = 4096;
 
+   atomic_set(_info->tree_root_cows, 0);
+
ret = btrfs_alloc_stripe_hash_table(fs_info);
if (ret) {
err = ret;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 577878324799..14f35e020050 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7366,7 +7366,20 

[PATCH] Btrfs: fix use-after-free when dumping free space

2018-10-22 Thread fdmanana
From: Filipe Manana 

We were iterating a block group's free space cache rbtree without locking
first the lock that protects it (the free_space_ctl->free_space_offset
rbtree is protected by the free_space_ctl->tree_lock spinlock).

KASAN reported an use-after-free problem when iterating such a rbtree due
to a concurrent rbtree delete:

[ 9520.359168] 
==
[ 9520.359656] BUG: KASAN: use-after-free in rb_next+0x13/0x90
[ 9520.359949] Read of size 8 at addr 8800b7ada500 by task 
btrfs-transacti/1721
[ 9520.360357]
[ 9520.360530] CPU: 4 PID: 1721 Comm: btrfs-transacti Tainted: G L  
  4.19.0-rc8-nbor #555
[ 9520.360990] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.10.2-1ubuntu1 04/01/2014
[ 9520.362682] Call Trace:
[ 9520.362887]  dump_stack+0xa4/0xf5
[ 9520.363146]  print_address_description+0x78/0x280
[ 9520.363412]  kasan_report+0x263/0x390
[ 9520.363650]  ? rb_next+0x13/0x90
[ 9520.363873]  __asan_load8+0x54/0x90
[ 9520.364102]  rb_next+0x13/0x90
[ 9520.364380]  btrfs_dump_free_space+0x146/0x160 [btrfs]
[ 9520.364697]  dump_space_info+0x2cd/0x310 [btrfs]
[ 9520.364997]  btrfs_reserve_extent+0x1ee/0x1f0 [btrfs]
[ 9520.365310]  __btrfs_prealloc_file_range+0x1cc/0x620 [btrfs]
[ 9520.365646]  ? btrfs_update_time+0x180/0x180 [btrfs]
[ 9520.365923]  ? _raw_spin_unlock+0x27/0x40
[ 9520.366204]  ? btrfs_alloc_data_chunk_ondemand+0x2c0/0x5c0 [btrfs]
[ 9520.366549]  btrfs_prealloc_file_range_trans+0x23/0x30 [btrfs]
[ 9520.366880]  cache_save_setup+0x42e/0x580 [btrfs]
[ 9520.367220]  ? btrfs_check_data_free_space+0xd0/0xd0 [btrfs]
[ 9520.367518]  ? lock_downgrade+0x2f0/0x2f0
[ 9520.367799]  ? btrfs_write_dirty_block_groups+0x11f/0x6e0 [btrfs]
[ 9520.368104]  ? kasan_check_read+0x11/0x20
[ 9520.368349]  ? do_raw_spin_unlock+0xa8/0x140
[ 9520.368638]  btrfs_write_dirty_block_groups+0x2af/0x6e0 [btrfs]
[ 9520.368978]  ? btrfs_start_dirty_block_groups+0x870/0x870 [btrfs]
[ 9520.369282]  ? do_raw_spin_unlock+0xa8/0x140
[ 9520.369534]  ? _raw_spin_unlock+0x27/0x40
[ 9520.369811]  ? btrfs_run_delayed_refs+0x1b8/0x230 [btrfs]
[ 9520.370137]  commit_cowonly_roots+0x4b9/0x610 [btrfs]
[ 9520.370560]  ? commit_fs_roots+0x350/0x350 [btrfs]
[ 9520.370926]  ? btrfs_run_delayed_refs+0x1b8/0x230 [btrfs]
[ 9520.371285]  btrfs_commit_transaction+0x5e5/0x10e0 [btrfs]
[ 9520.371612]  ? btrfs_apply_pending_changes+0x90/0x90 [btrfs]
[ 9520.371943]  ? start_transaction+0x168/0x6c0 [btrfs]
[ 9520.372257]  transaction_kthread+0x21c/0x240 [btrfs]
[ 9520.372537]  kthread+0x1d2/0x1f0
[ 9520.372793]  ? btrfs_cleanup_transaction+0xb50/0xb50 [btrfs]
[ 9520.373090]  ? kthread_park+0xb0/0xb0
[ 9520.373329]  ret_from_fork+0x3a/0x50
[ 9520.373567]
[ 9520.373738] Allocated by task 1804:
[ 9520.373974]  kasan_kmalloc+0xff/0x180
[ 9520.374208]  kasan_slab_alloc+0x11/0x20
[ 9520.374447]  kmem_cache_alloc+0xfc/0x2d0
[ 9520.374731]  __btrfs_add_free_space+0x40/0x580 [btrfs]
[ 9520.375044]  unpin_extent_range+0x4f7/0x7a0 [btrfs]
[ 9520.375383]  btrfs_finish_extent_commit+0x15f/0x4d0 [btrfs]
[ 9520.375707]  btrfs_commit_transaction+0xb06/0x10e0 [btrfs]
[ 9520.376027]  btrfs_alloc_data_chunk_ondemand+0x237/0x5c0 [btrfs]
[ 9520.376365]  btrfs_check_data_free_space+0x81/0xd0 [btrfs]
[ 9520.376689]  btrfs_delalloc_reserve_space+0x25/0x80 [btrfs]
[ 9520.377018]  btrfs_direct_IO+0x42e/0x6d0 [btrfs]
[ 9520.377284]  generic_file_direct_write+0x11e/0x220
[ 9520.377587]  btrfs_file_write_iter+0x472/0xac0 [btrfs]
[ 9520.377875]  aio_write+0x25c/0x360
[ 9520.378106]  io_submit_one+0xaa0/0xdc0
[ 9520.378343]  __se_sys_io_submit+0xfa/0x2f0
[ 9520.378589]  __x64_sys_io_submit+0x43/0x50
[ 9520.378840]  do_syscall_64+0x7d/0x240
[ 9520.379081]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 9520.379387]
[ 9520.379557] Freed by task 1802:
[ 9520.379782]  __kasan_slab_free+0x173/0x260
[ 9520.380028]  kasan_slab_free+0xe/0x10
[ 9520.380262]  kmem_cache_free+0xc1/0x2c0
[ 9520.380544]  btrfs_find_space_for_alloc+0x4cd/0x4e0 [btrfs]
[ 9520.380866]  find_free_extent+0xa99/0x17e0 [btrfs]
[ 9520.381166]  btrfs_reserve_extent+0xd5/0x1f0 [btrfs]
[ 9520.381474]  btrfs_get_blocks_direct+0x60b/0xbd0 [btrfs]
[ 9520.381761]  __blockdev_direct_IO+0x10ee/0x58a1
[ 9520.382059]  btrfs_direct_IO+0x25a/0x6d0 [btrfs]
[ 9520.382321]  generic_file_direct_write+0x11e/0x220
[ 9520.382623]  btrfs_file_write_iter+0x472/0xac0 [btrfs]
[ 9520.382904]  aio_write+0x25c/0x360
[ 9520.383172]  io_submit_one+0xaa0/0xdc0
[ 9520.383416]  __se_sys_io_submit+0xfa/0x2f0
[ 9520.383678]  __x64_sys_io_submit+0x43/0x50
[ 9520.383927]  do_syscall_64+0x7d/0x240
[ 9520.384165]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 9520.384439]
[ 9520.384610] The buggy address belongs to the object at 8800b7ada500
which belongs to the cache btrfs_free_space of size 72
[ 9520.385175] The buggy address is located 0 bytes inside of
72-byte region [8800b7ada500, 8800b7ada548)
[ 9520.385691] The buggy 

[PATCH] Btrfs: fix deadlock on tree root leaf when finding free extent

2018-10-22 Thread fdmanana
From: Filipe Manana 

When we are writing out a free space cache, during the transaction commit
phase, we can end up in a deadlock which results in a stack trace like the
following:

 schedule+0x28/0x80
 btrfs_tree_read_lock+0x8e/0x120 [btrfs]
 ? finish_wait+0x80/0x80
 btrfs_read_lock_root_node+0x2f/0x40 [btrfs]
 btrfs_search_slot+0xf6/0x9f0 [btrfs]
 ? evict_refill_and_join+0xd0/0xd0 [btrfs]
 ? inode_insert5+0x119/0x190
 btrfs_lookup_inode+0x3a/0xc0 [btrfs]
 ? kmem_cache_alloc+0x166/0x1d0
 btrfs_iget+0x113/0x690 [btrfs]
 __lookup_free_space_inode+0xd8/0x150 [btrfs]
 lookup_free_space_inode+0x5b/0xb0 [btrfs]
 load_free_space_cache+0x7c/0x170 [btrfs]
 ? cache_block_group+0x72/0x3b0 [btrfs]
 cache_block_group+0x1b3/0x3b0 [btrfs]
 ? finish_wait+0x80/0x80
 find_free_extent+0x799/0x1010 [btrfs]
 btrfs_reserve_extent+0x9b/0x180 [btrfs]
 btrfs_alloc_tree_block+0x1b3/0x4f0 [btrfs]
 __btrfs_cow_block+0x11d/0x500 [btrfs]
 btrfs_cow_block+0xdc/0x180 [btrfs]
 btrfs_search_slot+0x3bd/0x9f0 [btrfs]
 btrfs_lookup_inode+0x3a/0xc0 [btrfs]
 ? kmem_cache_alloc+0x166/0x1d0
 btrfs_update_inode_item+0x46/0x100 [btrfs]
 cache_save_setup+0xe4/0x3a0 [btrfs]
 btrfs_start_dirty_block_groups+0x1be/0x480 [btrfs]
 btrfs_commit_transaction+0xcb/0x8b0 [btrfs]

At cache_save_setup() we need to update the inode item of a block group's
cache which is located in the tree root (fs_info->tree_root), which means
that it may result in COWing a leaf from that tree. If that happens we
need to find a free metadata extent and while looking for one, if we find
a block group which was not cached yet we attempt to load its cache by
calling cache_block_group(). However this function will try to load the
inode of the free space cache, which requires finding the matching inode
item in the tree root - if that inode item is located in the same leaf as
the inode item of the space cache we are updating at cache_save_setup(),
we end up in a deadlock, since we try to obtain a read lock on the same
extent buffer that we previously write locked.

So fix this by skipping the loading of free space caches of any block
groups that are not yet cached (rare cases) if we are updating the inode
of a free space cache. This is a rare case and its downside is failure to
find a free extent (return -ENOSPC) when all the already cached block
groups have no free extents.

Reported-by: Andrew Nelson 
Link: 
https://lore.kernel.org/linux-btrfs/captelenq9x5kowuq+fa7h1r3nsjg8vyith8+ifjurc_duhh...@mail.gmail.com/
Fixes: 9d66e233c704 ("Btrfs: load free space cache if it exists")
Signed-off-by: Filipe Manana 
---
 fs/btrfs/ctree.h   |  3 +++
 fs/btrfs/disk-io.c |  2 ++
 fs/btrfs/extent-tree.c | 22 +-
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7806a4..d23ee26eb17d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1121,6 +1121,9 @@ struct btrfs_fs_info {
u32 sectorsize;
u32 stripesize;
 
+   /* The task currently updating a free space cache inode item. */
+   struct task_struct *space_cache_updater;
+
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 05dc3c17cb62..aa5e9a91e560 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2782,6 +2782,8 @@ int open_ctree(struct super_block *sb,
fs_info->sectorsize = 4096;
fs_info->stripesize = 4096;
 
+   fs_info->space_cache_updater = NULL;
+
ret = btrfs_alloc_stripe_hash_table(fs_info);
if (ret) {
err = ret;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 577878324799..e93040449771 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3364,7 +3364,9 @@ static int cache_save_setup(struct 
btrfs_block_group_cache *block_group,
 * time.
 */
BTRFS_I(inode)->generation = 0;
+   fs_info->space_cache_updater = current;
ret = btrfs_update_inode(trans, root, inode);
+   fs_info->space_cache_updater = NULL;
if (ret) {
/*
 * So theoretically we could recover from this, simply set the
@@ -7366,7 +7368,25 @@ static noinline int find_free_extent(struct 
btrfs_fs_info *fs_info,
 
 have_block_group:
cached = block_group_cache_done(block_group);
-   if (unlikely(!cached)) {
+   /*
+* If we are updating the inode of a free space cache, we can
+* not start the caching of any block group because we could
+* deadlock on an extent buffer of the root tree.
+* At cache_save_setup() we update the inode item of a free
+* space cache, so we may need to COW a leaf of the root tree,
+* which implies finding a free metadata extent. So if when
+* searching for such an extent we find a block group that was
+   

[PATCH] btrfs: fix test btrfs/007 to not leave temporary files in /tmp

2018-10-15 Thread fdmanana
From: Filipe Manana 

This test was using the "mktemp -d" command to create a temporary
directory for storing send streams and computations from fssum, without
ever deleting them when it finishes. Therefore after running it for many
times it filled up all space from /tmp.

Fix this by using a temporary directory in TEST_DEV instead, as all the
more recent send/receive tests do, to store these files, and making sure
they get deleted when the test finishes. On average the sum of the size
of those files is between 5.5Mb and 6Mb, but changing the number of
operations for fsstress makes it even bigger.

Signed-off-by: Filipe Manana 
---
 tests/btrfs/007 | 33 +++--
 tests/btrfs/007.out |  1 -
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/tests/btrfs/007 b/tests/btrfs/007
index 09f2f011..438f2f27 100755
--- a/tests/btrfs/007
+++ b/tests/btrfs/007
@@ -16,14 +16,14 @@ seq=`basename $0`
 seqres=$RESULT_DIR/$seq
 echo "QA output created by $seq"
 
-tmp=`mktemp -d`
+tmp=/tmp/$$
 status=1
 
 _cleanup()
 {
-   echo "*** unmount"
-   _scratch_unmount 2>/dev/null
+   cd /
rm -f $tmp.*
+   rm -fr $send_files_dir
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -38,7 +38,11 @@ _require_scratch
 _require_fssum
 _require_seek_data_hole
 
+send_files_dir=$TEST_DIR/btrfs-test-$seq
+
 rm -f $seqres.full
+rm -fr $send_files_dir
+mkdir $send_files_dir
 
 workout()
 {
@@ -57,19 +61,20 @@ workout()
 
_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT 
$SCRATCH_MNT/incr
 
-   echo "# $BTRFS_UTIL_PROG send $SCRATCH_MNT/base > $tmp/base.snap" \
+   echo "# $BTRFS_UTIL_PROG send $SCRATCH_MNT/base > 
${send_files_dir}/base.snap" \
>> $seqres.full
-   $BTRFS_UTIL_PROG send $SCRATCH_MNT/base > $tmp/base.snap 2>> 
$seqres.full \
+   $BTRFS_UTIL_PROG send $SCRATCH_MNT/base > $send_files_dir/base.snap 2>> 
$seqres.full \
|| _fail "failed: '$@'"
echo "# $BTRFS_UTIL_PROG send -p $SCRATCH_MNT/base\
-   $SCRATCH_MNT/incr > $tmp/incr.snap" >> $seqres.full
+   $SCRATCH_MNT/incr > ${send_files_dir}/incr.snap" >> $seqres.full
$BTRFS_UTIL_PROG send -p $SCRATCH_MNT/base \
-   $SCRATCH_MNT/incr > $tmp/incr.snap 2>> $seqres.full \
+   $SCRATCH_MNT/incr > $send_files_dir/incr.snap 2>> $seqres.full \
|| _fail "failed: '$@'"
 
-   run_check $FSSUM_PROG -A -f -w $tmp/base.fssum $SCRATCH_MNT/base
-   run_check $FSSUM_PROG -A -f -w $tmp/incr.fssum -x 
$SCRATCH_MNT/incr/base \
-   $SCRATCH_MNT/incr
+   run_check $FSSUM_PROG -A -f -w $send_files_dir/base.fssum \
+   $SCRATCH_MNT/base
+   run_check $FSSUM_PROG -A -f -w $send_files_dir/incr.fssum \
+   -x $SCRATCH_MNT/incr/base $SCRATCH_MNT/incr
 
_scratch_unmount >/dev/null 2>&1
echo "*** mkfs -dsize=$fsz">>$seqres.full
@@ -78,11 +83,11 @@ workout()
|| _fail "size=$fsz mkfs failed"
_scratch_mount "-o noatime"
 
-   _run_btrfs_util_prog receive $SCRATCH_MNT < $tmp/base.snap
-   run_check $FSSUM_PROG -r $tmp/base.fssum $SCRATCH_MNT/base
+   _run_btrfs_util_prog receive $SCRATCH_MNT < $send_files_dir/base.snap
+   run_check $FSSUM_PROG -r $send_files_dir/base.fssum $SCRATCH_MNT/base
 
-   _run_btrfs_util_prog receive $SCRATCH_MNT < $tmp/incr.snap
-   run_check $FSSUM_PROG -r $tmp/incr.fssum $SCRATCH_MNT/incr
+   _run_btrfs_util_prog receive $SCRATCH_MNT < $send_files_dir/incr.snap
+   run_check $FSSUM_PROG -r $send_files_dir/incr.fssum $SCRATCH_MNT/incr
 }
 
 echo "*** test send / receive"
diff --git a/tests/btrfs/007.out b/tests/btrfs/007.out
index 8f8cec7d..5d029ceb 100644
--- a/tests/btrfs/007.out
+++ b/tests/btrfs/007.out
@@ -1,4 +1,3 @@
 QA output created by 007
 *** test send / receive
 *** done
-*** unmount
-- 
2.11.0



[PATCH] generic: test fsync after fallocate on a very small file

2018-10-15 Thread fdmanana
From: Filipe Manana 

Test that if we have a very small file, with a size smaller than the
block size, then fallocate a very small range within the block size but
past the file's current size, fsync the file and then power fail, after
mounting the filesystem all the file data is there and the file size is
correct.

This test is motivated by a failure in btrfs where it triggered an
assertion when using the no-holes feature, that is, when running with
MKFS_OPTIONS="-O no-holes". The btrfs issue is fixed by a patch for the
linux kernel titled:

 "Btrfs: fix assertion on fsync of regular file when using no-holes
  feature"

Signed-off-by: Filipe Manana 
---
 tests/generic/512 | 61 +++
 tests/generic/512.out |  9 
 tests/generic/group   |  1 +
 3 files changed, 71 insertions(+)
 create mode 100755 tests/generic/512
 create mode 100644 tests/generic/512.out

diff --git a/tests/generic/512 b/tests/generic/512
new file mode 100755
index ..f4e13c68
--- /dev/null
+++ b/tests/generic/512
@@ -0,0 +1,61 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 512
+#
+# Test that if we have a very small file, with a size smaller than the block
+# size, then fallocate a very small range within the block size but past the
+# file's current size, fsync the file and then power fail, after mounting the
+# filesystem all the file data is there and the file size is correct.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch
+_require_xfs_io_command "falloc"
+_require_dm_target flakey
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+$XFS_IO_PROG -f \
+-c "pwrite -S 0xb6 0 21" \
+-c "falloc 40 40" \
+-c "fsync" \
+$SCRATCH_MNT/foobar | _filter_xfs_io
+
+# Simulate a power failure and mount the filesystem. We expect no data loss
+# and a correct file size.
+_flakey_drop_and_remount
+
+echo "File content after power failure:"
+od -t x1 -A d $SCRATCH_MNT/foobar
+
+_unmount_flakey
+
+status=0
+exit
diff --git a/tests/generic/512.out b/tests/generic/512.out
new file mode 100644
index ..19a0a1b1
--- /dev/null
+++ b/tests/generic/512.out
@@ -0,0 +1,9 @@
+QA output created by 512
+wrote 21/21 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+File content after power failure:
+000 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6
+016 b6 b6 b6 b6 b6 00 00 00 00 00 00 00 00 00 00 00
+032 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+*
+080
diff --git a/tests/generic/group b/tests/generic/group
index 348214ac..d17a0248 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -514,3 +514,4 @@
 509 auto quick log
 510 auto quick log
 511 auto quick rw zero
+512 auto quick log prealloc
-- 
2.11.0



[PATCH] Btrfs: fix assertion on fsync of regular file when using no-holes feature

2018-10-15 Thread fdmanana
From: Filipe Manana 

When using the NO_HOLES feature and logging a regular file, we were
expecting that if we find an inline extent, that either its size in ram
(uncompressed and unencoded) matches the size of the file or if it does
not, that it matches the sector size and it represents compressed data.
This assertion does not cover a case where the length of the inline extent
is smaller than the sector size and also smaller than the file's size; such
a case is possible through fallocate. Example:

  $ mkfs.btrfs -f -O no-holes /dev/sdb
  $ mount /dev/sdb /mnt

  $ xfs_io -f -c "pwrite -S 0xb60 0 21" /mnt/foobar
  $ xfs_io -c "falloc 40 40" /mnt/foobar
  $ xfs_io -c "fsync" /mnt/foobar

In the above example we trigger the assertion because the inline extent's
length is 21 bytes while the file size is 80 bytes. The fallocate() call
merely updated the file's size and did not touch the existing inline
extent, as expected.

So fix this by adjusting the assertion so that an inline extent length
smaller than the file size is valid if the file size is smaller than the
filesystem's sector size.

A test case for fstests follows soon.

Reported-by: Anatoly Trosinenko 
Link: 
https://lore.kernel.org/linux-btrfs/CAE5jQCfRSBC7n4pUTFJcmHh109=gwyT9mFkCOL+NKfzswmR=_...@mail.gmail.com/
Signed-off-by: Filipe Manana 
---
 fs/btrfs/tree-log.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ed455dba..1673dccc76c2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4652,7 +4652,8 @@ static int btrfs_log_trailing_hole(struct 
btrfs_trans_handle *trans,
ASSERT(len == i_size ||
   (len == fs_info->sectorsize &&
btrfs_file_extent_compression(leaf, extent) !=
-   BTRFS_COMPRESS_NONE));
+   BTRFS_COMPRESS_NONE) ||
+  (len < i_size && i_size < fs_info->sectorsize));
return 0;
}
 
-- 
2.11.0



[PATCH v2] Btrfs: fix null pointer dereference on compressed write path error

2018-10-12 Thread fdmanana
From: Filipe Manana 

At inode.c:compress_file_range(), under the "free_pages_out" label, we can
end up dereferencing the "pages" pointer when it has a NULL value. This
case happens when "start" has a value of 0 and we fail to allocate memory
for the "pages" pointer. When that happens we jump to the "cont" label and
then enter the "if (start == 0)" branch where we immediately call the
cow_file_range_inline() function. If that function returns 0 (success
creating an inline extent) or an error (like -ENOMEM for example) we jump
to the "free_pages_out" label and then access "pages[i]" leading to a NULL
pointer dereference, since "nr_pages" has a value greater than zero at
that point.

Fix this by setting "nr_pages" to 0 when we fail to allocate memory for
the "pages" pointer.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=201119
Fixes: 771ed689d2cd ("Btrfs: Optimize compressed writeback and reads")
Signed-off-by: Filipe Manana 
---

V2: Updated changelog.

 fs/btrfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 66c6c4103d2f..d6b61b1facdd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -503,6 +503,7 @@ static noinline void compress_file_range(struct inode 
*inode,
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
/* just bail out to the uncompressed code */
+   nr_pages = 0;
goto cont;
}
 
-- 
2.11.0



[PATCH] Btrfs: fix null pointer dereference on compressed write path error

2018-10-12 Thread fdmanana
From: Filipe Manana 

At inode.c:compress_file_range(), under the "free_pages_out" label, we can
end up dereferencing the "pages" pointer when it has a NULL value. This
case happens when "start" has a value of 0 and we fail to allocate memory
for the "pages" pointer. When that happens we jump to the "cont" label and
then enter the "if (start == 0)" branch where we immediately call the
cow_file_range_inline() function. If that function returns an error (like
-ENOMEM for example) we jump to the "free_pages_out" label and then access
"pages[i]" leading to a NULL pointer dereference, since "nr_pages" has a
value greater than zero at that point.

Fix this by setting "nr_pages" to 0 when we fail to allocate memory for
the "pages" pointer.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=201119
Fixes: 771ed689d2cd ("Btrfs: Optimize compressed writeback and reads")
Signed-off-by: Filipe Manana 
---
 fs/btrfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 66c6c4103d2f..d6b61b1facdd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -503,6 +503,7 @@ static noinline void compress_file_range(struct inode 
*inode,
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
/* just bail out to the uncompressed code */
+   nr_pages = 0;
goto cont;
}
 
-- 
2.11.0



[PATCH] Btrfs: fix use-after-free during inode eviction

2018-10-12 Thread fdmanana
From: Filipe Manana 

At inode.c:evict_inode_truncate_pages(), when we iterate over the inode's
extent states, we access an extent state record's "state" field after we
unlocked the inode's io tree lock. This can lead to a use-after-free issue
because after we unlock the io tree that extent state record might have
been freed due to being merged into another adjacent extent state
record (a previous inflight bio for a read operation finished in the
meanwhile, which unlocked a range in the io tree and caused a merge of
extent state records, as explained in the comment before the while loop
added in commit 6ca0709756710 ("Btrfs: fix hang during inode eviction due
to concurrent readahead")).

Fix this by keeping a copy of the extent state's flags in a local variable
and using it after unlocking the io tree.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=201189
Fixes: b9d0b38928e2 ("btrfs: Add handler for invalidate page")
CC: sta...@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana 
---
 fs/btrfs/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3ea5339603cf..66c6c4103d2f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5274,11 +5274,13 @@ static void evict_inode_truncate_pages(struct inode 
*inode)
struct extent_state *cached_state = NULL;
u64 start;
u64 end;
+   unsigned state_flags;
 
node = rb_first(_tree->state);
state = rb_entry(node, struct extent_state, rb_node);
start = state->start;
end = state->end;
+   state_flags = state->state;
spin_unlock(_tree->lock);
 
lock_extent_bits(io_tree, start, end, _state);
@@ -5291,7 +5293,7 @@ static void evict_inode_truncate_pages(struct inode 
*inode)
 *
 * Note, end is the bytenr of last byte, so we need + 1 here.
 */
-   if (state->state & EXTENT_DELALLOC)
+   if (state_flags & EXTENT_DELALLOC)
btrfs_qgroup_free_data(inode, NULL, start, end - start 
+ 1);
 
clear_extent_bit(io_tree, start, end,
-- 
2.11.0



[PATCH] Btrfs: fix deadlock when writing out free space caches

2018-10-12 Thread fdmanana
From: Filipe Manana 

When writing out a block group free space cache we can end up deadlocking
with ourselves on an extent buffer lock, resulting in a warning like the
following:

  [245043.379979] WARNING: CPU: 4 PID: 2608 at fs/btrfs/locking.c:251 
btrfs_tree_lock+0x1be/0x1d0 [btrfs]
  [245043.392792] CPU: 4 PID: 2608 Comm: btrfs-transacti Tainted: G
W I  4.16.8 #1
  [245043.395489] RIP: 0010:btrfs_tree_lock+0x1be/0x1d0 [btrfs]
  [245043.396791] RSP: 0018:c9000424b840 EFLAGS: 00010246
  [245043.398093] RAX: 0a30 RBX: 8807e20a3d20 RCX: 
0001
  [245043.399414] RDX: 0001 RSI: 0002 RDI: 
8807e20a3d20
  [245043.400732] RBP: 0001 R08: 88041f39a700 R09: 
8800
  [245043.402021] R10: 0040 R11: 8807e20a3d20 R12: 
8807cb220630
  [245043.403296] R13: 0001 R14: 8807cb220628 R15: 
88041fbdf000
  [245043.404780] FS:  () GS:88082fc8() 
knlGS:
  [245043.406050] CS:  0010 DS:  ES:  CR0: 80050033
  [245043.407321] CR2: 7fffdbdb9f10 CR3: 01c09005 CR4: 
000206e0
  [245043.408670] Call Trace:
  [245043.409977]  btrfs_search_slot+0x761/0xa60 [btrfs]
  [245043.411278]  btrfs_insert_empty_items+0x62/0xb0 [btrfs]
  [245043.412572]  btrfs_insert_item+0x5b/0xc0 [btrfs]
  [245043.413922]  btrfs_create_pending_block_groups+0xfb/0x1e0 [btrfs]
  [245043.415216]  do_chunk_alloc+0x1e5/0x2a0 [btrfs]
  [245043.416487]  find_free_extent+0xcd0/0xf60 [btrfs]
  [245043.417813]  btrfs_reserve_extent+0x96/0x1e0 [btrfs]
  [245043.419105]  btrfs_alloc_tree_block+0xfb/0x4a0 [btrfs]
  [245043.420378]  __btrfs_cow_block+0x127/0x550 [btrfs]
  [245043.421652]  btrfs_cow_block+0xee/0x190 [btrfs]
  [245043.422979]  btrfs_search_slot+0x227/0xa60 [btrfs]
  [245043.424279]  ? btrfs_update_inode_item+0x59/0x100 [btrfs]
  [245043.425538]  ? iput+0x72/0x1e0
  [245043.426798]  write_one_cache_group.isra.49+0x20/0x90 [btrfs]
  [245043.428131]  btrfs_start_dirty_block_groups+0x102/0x420 [btrfs]
  [245043.429419]  btrfs_commit_transaction+0x11b/0x880 [btrfs]
  [245043.430712]  ? start_transaction+0x8e/0x410 [btrfs]
  [245043.432006]  transaction_kthread+0x184/0x1a0 [btrfs]
  [245043.433341]  kthread+0xf0/0x130
  [245043.434628]  ? btrfs_cleanup_transaction+0x4e0/0x4e0 [btrfs]
  [245043.435928]  ? kthread_create_worker_on_cpu+0x40/0x40
  [245043.437236]  ret_from_fork+0x1f/0x30
  [245043.441054] ---[ end trace 15abaa2aaf36827f ]---

This is because at write_one_cache_group() when we are COWing a leaf from
the extent tree we end up allocating a new block group (chunk) and,
because we have hit a threshold on the number of bytes reserved for system
chunks, we attempt to finalize the creation of new block groups from the
current transaction, by calling btrfs_create_pending_block_groups().
However here we also need to modify the extent tree in order to insert
a block group item, and if the location for this new block group item
happens to be in the same leaf that we were COWing earlier, we deadlock
since btrfs_search_slot() tries to write lock the extent buffer that we
locked before at write_one_cache_group().

We have already hit similar cases in the past and commit d9a0540a79f8
("Btrfs: fix deadlock when finalizing block group creation") fixed some
of those cases by delaying the creation of pending block groups at the
known specific spots that could lead to a deadlock. This change reworks
that commit to be more generic so that we don't have to add similar logic
to every possible path that can lead to a deadlock. This is done by
making __btrfs_cow_block() disallowing the creation of new block groups
(setting the transaction's can_flush_pending_bgs to false) before it
attempts to allocate a new extent buffer for either the extent, chunk or
device trees, since those are the trees that pending block group creation
modifies. Once the new extent buffer is allocated, it allows creation of
pending block groups to happen again.

This change depends on a recent patch from Josef which is not yet in
Linus' tree, named "btrfs: make sure we create all new block groups" in
order to avoid occasional warnings at btrfs_trans_release_chunk_metadata().

Fixes: d9a0540a79f8 ("Btrfs: fix deadlock when finalizing block group creation")
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=199753
Link: 
https://lore.kernel.org/linux-btrfs/CAJtFHUTHna09ST-_EEiyWmDH6gAqS6wa=zmnmbsifj8abu9...@mail.gmail.com/
Reported-by: E V 
Signed-off-by: Filipe Manana 
---
 fs/btrfs/ctree.c   | 17 +
 fs/btrfs/extent-tree.c | 16 ++--
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d436fb4c002e..089b46c4d97f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1050,9 +1050,26 @@ static noinline int __btrfs_cow_block(struct 
btrfs_trans_handle *trans,
if ((root->root_key.objectid == 

[PATCH] generic: test for file fsync after moving it to a new parent directory

2018-10-09 Thread fdmanana
From: Filipe Manana 

Test that if we move a file from a directory B to a directory A, replace
directory B with directory A, fsync the file and then power fail, after
mounting the filesystem the file has a single parent, named B and there
is no longer any directory with the name A.

This test is motivated by a bug found in btrfs which is fixed by a patch
for the linux kernel titled:

  "Btrfs: fix wrong dentries after fsync of file that got its parent
   replaced"

This test passes on ext4, xfs and patched btrfs but it hangs on f2fs (the
fsck.f2fs process seems stuck).

Signed-off-by: Filipe Manana 
---
 tests/generic/507 | 71 +++
 tests/generic/507.out |  7 +
 tests/generic/group   |  1 +
 3 files changed, 79 insertions(+)
 create mode 100755 tests/generic/507
 create mode 100644 tests/generic/507.out

diff --git a/tests/generic/507 b/tests/generic/507
new file mode 100755
index ..f23db677
--- /dev/null
+++ b/tests/generic/507
@@ -0,0 +1,71 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 507
+#
+# Test that if we move a file from a directory B to a directory A, replace
+# directory B with directory A, fsync the file and then power fail, after
+# mounting the filesystem the file has a single parent, named B and there
+# is no longer any directory with the name A.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch
+_require_dm_target flakey
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+# Create our test directories and file.
+mkdir $SCRATCH_MNT/testdir
+mkdir $SCRATCH_MNT/testdir/A
+mkdir $SCRATCH_MNT/testdir/B
+touch $SCRATCH_MNT/testdir/B/bar
+
+# Make sure everything done so far is durably persisted.
+sync
+
+# Now move our file bar from directory B to directory A and then replace
+# directory B with directory A, also renaming directory A to B. Finally
+# fsync file bar.
+mv $SCRATCH_MNT/testdir/B/bar $SCRATCH_MNT/testdir/A/bar
+mv -T $SCRATCH_MNT/testdir/A $SCRATCH_MNT/testdir/B
+$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir/B/bar
+
+# Simulate a power failure and mount the filesystem. We expect file bar
+# to exist and have a single parent directory, named B, and that no
+# directory named A exists.
+_flakey_drop_and_remount
+
+echo "Filesystem content after power failure:"
+ls -R $SCRATCH_MNT/testdir | _filter_scratch
+
+_unmount_flakey
+
+status=0
+exit
diff --git a/tests/generic/507.out b/tests/generic/507.out
new file mode 100644
index ..49877654
--- /dev/null
+++ b/tests/generic/507.out
@@ -0,0 +1,7 @@
+QA output created by 507
+Filesystem content after power failure:
+SCRATCH_MNT/testdir:
+B
+
+SCRATCH_MNT/testdir/B:
+bar
diff --git a/tests/generic/group b/tests/generic/group
index 2e2a6247..f4d1524b 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -509,3 +509,4 @@
 504 auto quick locks
 505 shutdown auto quick metadata
 506 auto quick log
+507 auto quick log
-- 
2.11.0



[PATCH] Btrfs: fix wrong dentries after fsync of file that got its parent replaced

2018-10-09 Thread fdmanana
From: Filipe Manana 

In a scenario like the following:

  mkdir /mnt/A   # inode 258
  mkdir /mnt/B   # inode 259
  touch /mnt/B/bar   # inode 260

  sync

  mv /mnt/B/bar /mnt/A/bar
  mv -T /mnt/A /mnt/B
  fsync /mnt/B/bar

  

After replaying the log we end up with file bar having 2 hard links, both
with the name 'bar' and one in the directory with inode number 258 and the
other in the directory with inode number 259. Also, we end up with the
directory inode 259 still existing and with the directory inode 258 still
named as 'A', instead of 'B'. In this scenario, file 'bar' should only
have one hard link, located at directory inode 258, the directory inode
259 should not exist anymore and the name for directory inode 258 should
be 'B'.

This incorrect behaviour happens because when attempting to log the old
parents of an inode, we skip any parents that no longer exist. Fix this
by forcing a full commit if an old parent no longer exists.

A test case for fstests follows soon.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/tree-log.c | 30 +++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5e83991eb064..ed455dba 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5580,9 +5580,33 @@ static int btrfs_log_all_parents(struct 
btrfs_trans_handle *trans,
 
dir_inode = btrfs_iget(fs_info->sb, _key,
   root, NULL);
-   /* If parent inode was deleted, skip it. */
-   if (IS_ERR(dir_inode))
-   continue;
+   /*
+* If the parent inode was deleted, return an error to
+* fallback to a transaction commit. This is to prevent
+* getting an inode that was moved from one parent A to
+* a parent B, got its former parent A deleted and then
+* it got fsync'ed, from existing at both parents after
+* a log replay (and the old parent still existing).
+* Example:
+*
+* mkdir /mnt/A
+* mkdir /mnt/B
+* touch /mnt/B/bar
+* sync
+* mv /mnt/B/bar /mnt/A/bar
+* mv -T /mnt/A /mnt/B
+* fsync /mnt/B/bar
+* 
+*
+* If we ignore the old parent B which got deleted,
+* after a log replay we would have file bar linked
+* at both parents and the old parent B would still
+* exist.
+*/
+   if (IS_ERR(dir_inode)) {
+   ret = PTR_ERR(dir_inode);
+   goto out;
+   }
 
if (ctx)
ctx->log_new_dentries = false;
-- 
2.11.0



[PATCH] Btrfs: fix warning when replaying log after fsync of a tmpfile

2018-10-08 Thread fdmanana
From: Filipe Manana 

When replaying a log which contains a tmpfile (which necessarily has a
link count of 0) we end up calling inc_nlink(), at
fs/btrfs/tree-log.c:replay_one_buffer(), which produces a warning like
the following:

  [195191.943673] WARNING: CPU: 0 PID: 6924 at fs/inode.c:342 
inc_nlink+0x33/0x40
  [195191.943674] Modules linked in: btrfs dm_flakey dm_mod xor raid6_pq 
libcrc32c kvm_intel bochs_drm ttm kvm drm_kms_helper drm irqbypass 
crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel aes_x86_64 
crypto_simd cryptd glue_helper joydev sg button evdev pcspkr qemu_fw_cfg 
serio_raw parport_pc ppdev lp parport ip_tables x_tables autofs4 ext4 
crc32c_generic crc16 mbcache jbd2 fscrypto sd_mod virtio_scsi ata_generic 
virtio_pci virtio_ring virtio ata_piix floppy crc32c_intel libata psmouse e1000 
scsi_mod i2c_piix4 [last unloaded: btrfs]
  [195191.943723] CPU: 0 PID: 6924 Comm: mount Not tainted 
4.19.0-rc6-btrfs-next-38 #1
  [195191.943724] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
  [195191.943726] RIP: 0010:inc_nlink+0x33/0x40
  [195191.943727] Code: c0 74 07 83 c0 01 89 47 48 c3 f6 87 d1 00 00 00 04 74 
17 48 8b 47 28 f0 48 83 a8 70 07 00 00 01 8b 47 48 83 c0 01 89 47 48 c3 <0f> 0b 
eb e5 66 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 65 ff 05 54
  [195191.943728] RSP: 0018:b96e425e3870 EFLAGS: 00010246
  [195191.943730] RAX:  RBX: 8c0d1e6af4f0 RCX: 
0006
  [195191.943731] RDX:  RSI:  RDI: 
8c0d1e6af4f0
  [195191.943731] RBP: 0097 R08: 0001 R09: 

  [195191.943732] R10:  R11:  R12: 
b96e425e3a60
  [195191.943733] R13: 8c0d10cff0c8 R14: 8c0d0d515348 R15: 
8c0d78a1b3f8
  [195191.943735] FS:  7f570ee24480() GS:8c0dfb20() 
knlGS:
  [195191.943736] CS:  0010 DS:  ES:  CR0: 80050033
  [195191.943737] CR2: 5593286277c8 CR3: bb8f2006 CR4: 
003606f0
  [195191.943739] DR0:  DR1:  DR2: 

  [195191.943740] DR3:  DR6: fffe0ff0 DR7: 
0400
  [195191.943741] Call Trace:
  [195191.943778]  replay_one_buffer+0x797/0x7d0 [btrfs]
  [195191.943802]  walk_up_log_tree+0x1c1/0x250 [btrfs]
  [195191.943809]  ? rcu_read_lock_sched_held+0x3f/0x70
  [195191.943825]  walk_log_tree+0xae/0x1d0 [btrfs]
  [195191.943840]  btrfs_recover_log_trees+0x1d7/0x4d0 [btrfs]
  [195191.943856]  ? replay_dir_deletes+0x280/0x280 [btrfs]
  [195191.943870]  open_ctree+0x1c3b/0x22a0 [btrfs]
  [195191.943887]  btrfs_mount_root+0x6b4/0x800 [btrfs]
  [195191.943894]  ? rcu_read_lock_sched_held+0x3f/0x70
  [195191.943899]  ? pcpu_alloc+0x55b/0x7c0
  [195191.943906]  ? mount_fs+0x3b/0x140
  [195191.943908]  mount_fs+0x3b/0x140
  [195191.943912]  ? __init_waitqueue_head+0x36/0x50
  [195191.943916]  vfs_kern_mount+0x62/0x160
  [195191.943927]  btrfs_mount+0x134/0x890 [btrfs]
  [195191.943936]  ? rcu_read_lock_sched_held+0x3f/0x70
  [195191.943938]  ? pcpu_alloc+0x55b/0x7c0
  [195191.943943]  ? mount_fs+0x3b/0x140
  [195191.943952]  ? btrfs_remount+0x570/0x570 [btrfs]
  [195191.943954]  mount_fs+0x3b/0x140
  [195191.943956]  ? __init_waitqueue_head+0x36/0x50
  [195191.943960]  vfs_kern_mount+0x62/0x160
  [195191.943963]  do_mount+0x1f9/0xd40
  [195191.943967]  ? memdup_user+0x4b/0x70
  [195191.943971]  ksys_mount+0x7e/0xd0
  [195191.943974]  __x64_sys_mount+0x21/0x30
  [195191.943977]  do_syscall_64+0x60/0x1b0
  [195191.943980]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
  [195191.943983] RIP: 0033:0x7f570e4e524a
  [195191.943985] Code: 48 8b 0d 51 fc 2a 00 f7 d8 64 89 01 48 83 c8 ff c3 66 
2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 
01 f0 ff ff 73 01 c3 48 8b 0d 1e fc 2a 00 f7 d8 64 89 01 48
  [195191.943986] RSP: 002b:7ffd83589478 EFLAGS: 0206 ORIG_RAX: 
00a5
  [195191.943989] RAX: ffda RBX: 563f335b2060 RCX: 
7f570e4e524a
  [195191.943990] RDX: 563f335b2240 RSI: 563f335b2280 RDI: 
563f335b2260
  [195191.943992] RBP:  R08:  R09: 
0020
  [195191.943993] R10: c0ed R11: 0206 R12: 
563f335b2260
  [195191.943994] R13: 563f335b2240 R14:  R15: 

  [195191.944002] irq event stamp: 8688
  [195191.944010] hardirqs last  enabled at (8687): [] 
console_unlock+0x503/0x640
  [195191.944012] hardirqs last disabled at (8688): [] 
trace_hardirqs_off_thunk+0x1a/0x1c
  [195191.944018] softirqs last  enabled at (8638): [] 
__set_page_dirty_nobuffers+0x101/0x150
  [195191.944020] softirqs last disabled at (8634): [] 
wb_wakeup_delayed+0x2e/0x60
  [195191.944022] ---[ end trace 5d6e873a9a0b811a ]---

This happens because the inode does not have the flag I_LINKABLE set,

[PATCH] generic: test mounting filesystem after fsync of a tmpfile

2018-10-08 Thread fdmanana
From: Filipe Manana 

Test that if we fsync a tmpfile, without adding a hard link to it, and
then power fail, we will be able to mount the filesystem without
triggering any crashes, warnings or corruptions.

This test is motivated by an issue in btrfs where this scenario triggered
a warning (without any side effects). The following linux kernel patch
fixes the issue in btrfs:

  "Btrfs: fix warning when replaying log after fsync of a tmpfile"

Signed-off-by: Filipe Manana 
---
 tests/generic/506 | 58 +++
 tests/generic/506.out |  3 +++
 tests/generic/group   |  1 +
 3 files changed, 62 insertions(+)
 create mode 100755 tests/generic/506
 create mode 100644 tests/generic/506.out

diff --git a/tests/generic/506 b/tests/generic/506
new file mode 100755
index ..7d28d3b0
--- /dev/null
+++ b/tests/generic/506
@@ -0,0 +1,58 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 506
+#
+# Test that if we fsync a tmpfile, without adding a hard link to it, and then
+# power fail, we will be able to mount the filesystem without triggering any
+# crashes, warnings or corruptions.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch
+_require_xfs_io_command "-T"
+_require_dm_target flakey
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+# Create our tmpfile, write some data to it and fsync it. We want a power
+# failure to happen after the fsync, so that we have an inode with a link
+# count of 0 in our log/journal.
+$XFS_IO_PROG -T \
+   -c "pwrite -S 0xab 0 64K" \
+   -c "fsync" \
+   $SCRATCH_MNT | _filter_xfs_io
+
+# Simulate a power failure and mount the filesystem to check that it succeeds.
+_flakey_drop_and_remount
+
+_unmount_flakey
+
+status=0
+exit
diff --git a/tests/generic/506.out b/tests/generic/506.out
new file mode 100644
index ..f522e663
--- /dev/null
+++ b/tests/generic/506.out
@@ -0,0 +1,3 @@
+QA output created by 506
+wrote 65536/65536 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
diff --git a/tests/generic/group b/tests/generic/group
index 4da0e188..2e2a6247 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -508,3 +508,4 @@
 503 auto quick dax punch collapse zero
 504 auto quick locks
 505 shutdown auto quick metadata
+506 auto quick log
-- 
2.11.0



[PATCH] generic: test for deduplication between different files

2018-08-17 Thread fdmanana
From: Filipe Manana 

Test that deduplication of an entire file that has a size that is not
aligned to the filesystem's block size into a different file does not
corrupt the destination's file data.

This test is motivated by a bug found in Btrfs which is fixed by the
following patch for the linux kernel:

  "Btrfs: fix data corruption when deduplicating between different files"

XFS also fails this test, at least as of linux kernel 4.18-rc7, exactly
with the same corruption as in Btrfs - some bytes of a block get replaced
with zeroes after the deduplication.

Signed-off-by: Filipe Manana 
---
 tests/generic/505 | 84 +++
 tests/generic/505.out | 33 
 tests/generic/group   |  1 +
 3 files changed, 118 insertions(+)
 create mode 100755 tests/generic/505
 create mode 100644 tests/generic/505.out

diff --git a/tests/generic/505 b/tests/generic/505
new file mode 100755
index ..5ee232a2
--- /dev/null
+++ b/tests/generic/505
@@ -0,0 +1,84 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 505
+#
+# Test that deduplication of an entire file that has a size that is not aligned
+# to the filesystem's block size into a different file does not corrupt the
+# destination's file data.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/reflink
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch_dedupe
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_scratch_mount
+
+# The first byte with a value of 0xae starts at an offset (2518890) which is 
not
+# a multiple of the block size.
+$XFS_IO_PROG -f \
+   -c "pwrite -S 0x6b 0 2518890" \
+   -c "pwrite -S 0xae 2518890 102398" \
+   $SCRATCH_MNT/foo | _filter_xfs_io
+
+# Create a second file with a length not aligned to the block size, whose bytes
+# all have the value 0x6b, so that its extent(s) can be deduplicated with the
+# first file.
+$XFS_IO_PROG -f -c "pwrite -S 0x6b 0 557771" $SCRATCH_MNT/bar | _filter_xfs_io
+
+# The file is filled with bytes having the value 0x6b from offset 0 to offset
+# 2518889 and with the value 0xae from offset 2518890 to offset 2621287.
+echo "File content before deduplication:"
+od -t x1 $SCRATCH_MNT/foo
+
+# Now deduplicate the entire second file into a range of the first file that
+# also has all bytes with the value 0x6b. The destination range's end offset
+# must not be aligned to the block size and must be less than the offset of
+# the first byte with the value 0xae (byte at offset 2518890).
+$XFS_IO_PROG -c "dedupe $SCRATCH_MNT/bar 0 1957888 557771" $SCRATCH_MNT/foo \
+   | _filter_xfs_io
+
+# The bytes in the range starting at offset 2515659 (end of the deduplication
+# range) and ending at offset 2519040 (start offset rounded up to the block
+# size) must all have the value 0xae (and not replaced with 0x00 values).
+# In other words, we should have exactly the same data we had before we asked
+# for deduplication.
+echo "File content after deduplication and before unmounting:"
+od -t x1 $SCRATCH_MNT/foo
+
+# Unmount the filesystem and mount it again. This guarantees any file data in
+# the page cache is dropped.
+_scratch_cycle_mount
+
+# The bytes in the range starting at offset 2515659 (end of the deduplication
+# range) and ending at offset 2519040 (start offset rounded up to the block
+# size) must all have the value 0xae (and not replaced with 0x00 values).
+# In other words, we should have exactly the same data we had before we asked
+# for deduplication.
+echo "File content after unmounting:"
+od -t x1 $SCRATCH_MNT/foo
+
+status=0
+exit
diff --git a/tests/generic/505.out b/tests/generic/505.out
new file mode 100644
index ..7556b9fb
--- /dev/null
+++ b/tests/generic/505.out
@@ -0,0 +1,33 @@
+QA output created by 505
+wrote 2518890/2518890 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 102398/102398 bytes at offset 2518890
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 557771/557771 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+File content before deduplication:
+000 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b
+*
+11467540 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b ae ae ae ae ae ae
+11467560 ae ae ae ae ae ae ae ae ae ae ae ae ae ae ae ae
+*
+11777540 ae ae ae ae ae ae ae ae
+11777550
+deduped 557771/557771 bytes at offset 1957888
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+File content after deduplication and before unmounting:
+000 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b
+*
+11467540 6b 6b 6b 6b 

[PATCH] Btrfs: fix data corruption when deduplicating between different files

2018-08-17 Thread fdmanana
From: Filipe Manana 

If we deduplicate extents between two different files we can end up
corrupting data if the source range ends at the size of the source file,
the source file's size is not aligned to the filesystem's block size
and the destination range does not go past the size of the destination
file size.

Example:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ xfs_io -f -c "pwrite -S 0x6b 0 2518890" /mnt/foo
  # The first byte with a value of 0xae starts at an offset (2518890)
  # which is not a multiple of the sector size.
  $ xfs_io -c "pwrite -S 0xae 2518890 102398" /mnt/foo

  # Confirm the file content is full of bytes with values 0x6b and 0xae.
  $ od -t x1 /mnt/foo
  000 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b
  *
  11467540 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b ae ae ae ae ae ae
  11467560 ae ae ae ae ae ae ae ae ae ae ae ae ae ae ae ae
  *
  11777540 ae ae ae ae ae ae ae ae
  11777550

  # Create a second file with a length not aligned to the sector size,
  # whose bytes all have the value 0x6b, so that its extent(s) can be
  # deduplicated with the first file.
  $ xfs_io -f -c "pwrite -S 0x6b 0 557771" /mnt/bar

  # Now deduplicate the entire second file into a range of the first file
  # that also has all bytes with the value 0x6b. The destination range's
  # end offset must not be aligned to the sector size and must be less
  # then the offset of the first byte with the value 0xae (byte at offset
  # 2518890).
  $ xfs_io -c "dedupe /mnt/bar 0 1957888 557771" /mnt/foo

  # The bytes in the range starting at offset 2515659 (end of the
  # deduplication range) and ending at offset 2519040 (start offset
  # rounded up to the block size) must all have the value 0xae (and not
  # replaced with 0x00 values). In other words, we should have exactly
  # the same data we had before we asked for deduplication.
  $ od -t x1 /mnt/foo
  000 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b
  *
  11467540 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b ae ae ae ae ae ae
  11467560 ae ae ae ae ae ae ae ae ae ae ae ae ae ae ae ae
  *
  11777540 ae ae ae ae ae ae ae ae
  11777550

  # Unmount the filesystem and mount it again. This guarantees any file
  # data in the page cache is dropped.
  $ umount /dev/sdb
  $ mount /dev/sdb /mnt

  $ od -t x1 /mnt/foo
  000 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b
  *
  11461300 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 00 00 00 00 00
  11461320 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
  *
  1147 ae ae ae ae ae ae ae ae ae ae ae ae ae ae ae ae
  *
  11777540 ae ae ae ae ae ae ae ae
  11777550

  # The bytes in range 2515659 to 2519040 have a value of 0x00 and not a
  # value of 0xae, data corruption happened due to the deduplication
  # operation.

So fix this by rounding down, to the sector size, the length used for the
deduplication when the following conditions are met:

  1) Source file's range ends at its i_size;
  2) Source file's i_size is not aligned to the sector size;
  3) Destination range does not cross the i_size of the destination file.

Fixes: e1d227a42ea2 ("btrfs: Handle unaligned length in extent_same")
CC: sta...@vger.kernel.org # 4.2+
Signed-off-by: Filipe Manana 
---
 fs/btrfs/ioctl.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 331b495d2db9..230644d1e439 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3479,6 +3479,25 @@ static int btrfs_extent_same_range(struct inode *src, 
u64 loff, u64 olen,
 
same_lock_start = min_t(u64, loff, dst_loff);
same_lock_len = max_t(u64, loff, dst_loff) + len - 
same_lock_start;
+   } else {
+   /*
+* If the source and destination inodes are different, the
+* source's range end offset matches the source's i_size, that
+* i_size is not a multiple of the sector size, and the
+* destination range does not go past the destination's i_size,
+* we must round down the length to the nearest sector size
+* multiple. If we don't do this adjustment we end up replacing
+* with zeroes the bytes in the range that starts at the
+* deduplication range's end offset and ends at the next sector
+* size multiple.
+*/
+   if (loff + olen == i_size_read(src) &&
+   dst_loff + len < i_size_read(dst)) {
+   const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize;
+
+   len = round_down(i_size_read(src), sz) - loff;
+   olen = len;
+   }
}
 
 again:
-- 
2.11.0



[PATCH] btrfs: test writing into unwritten extent right before snapshotting

2018-08-06 Thread fdmanana
From: Filipe Manana 

Test that if we write into an unwritten extent of a file when there is no
more space left to allocate in the filesystem and then snapshot the file's
subvolume, after a clean shutdown the data was not lost.

This test is motivated by a bug found by Robbie Ko for which there is a
fix whose patch title is:

  "Btrfs: fix unexpected failure of nocow buffered writes after
   snapshotting when low on space"

Reported-by: Robbie Ko 
Signed-off-by: Filipe Manana 
---
 tests/btrfs/170 | 75 +
 tests/btrfs/170.out |  8 ++
 tests/btrfs/group   |  1 +
 3 files changed, 84 insertions(+)
 create mode 100755 tests/btrfs/170
 create mode 100644 tests/btrfs/170.out

diff --git a/tests/btrfs/170 b/tests/btrfs/170
new file mode 100755
index ..cf6886fd
--- /dev/null
+++ b/tests/btrfs/170
@@ -0,0 +1,75 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 170
+#
+# Test that if we write into an unwritten extent of a file when there is no
+# more space left to allocate in the filesystem and then snapshot the file's
+# subvolume, after a clean shutdown the data was not lost.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+_require_xfs_io_command "falloc" "-k"
+
+rm -f $seqres.full
+
+# Use a fixed size filesystem so that we can precisely fill the data block 
group
+# mkfs.btrfs creates and allocate all unused space for a new data block group.
+# It's important to not use the mixed block groups feature as well because we
+# later want to not have more space available for allocating data extents but
+# still have enough metadata space free for creating the snapshot.
+fs_size=$((2 * 1024 * 1024 * 1024)) # 2Gb
+_scratch_mkfs_sized $fs_size >>$seqres.full 2>&1
+
+# Mount without space cache so that we can precisely fill all data space and
+# unallocated space later (space cache v1 uses data block groups).
+_scratch_mount "-o nospace_cache"
+
+# Create our test file and allocate 1826.25Mb of space for it.
+# This will exhaust the existing data block group and all unallocated space on
+# this small filesystem (2Gb).
+$XFS_IO_PROG -f -c "falloc -k 0 1914961920" $SCRATCH_MNT/foobar
+
+# Write some data to the file and check its digest. This write will result in a
+# NOCOW write because there's no more space available to allocate and the file
+# has preallocated (unwritten) extents.
+$XFS_IO_PROG -c "pwrite -S 0xea -b 128K 0 128K" $SCRATCH_MNT/foobar | 
_filter_xfs_io
+
+echo "File digest after write:"
+md5sum $SCRATCH_MNT/foobar | _filter_scratch
+
+# Create a snapshot of the subvolume where our file is.
+$BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/snap 2>&1 \
+   | _filter_scratch
+
+# Cleanly unmount the filesystem.
+_scratch_unmount
+
+# Mount the filesystem again and verify the file has the same data it had 
before
+# we unmounted the filesystem (same digest).
+_scratch_mount
+echo "File digest after mounting the filesystem again:"
+md5sum $SCRATCH_MNT/foobar | _filter_scratch
+
+status=0
+exit
diff --git a/tests/btrfs/170.out b/tests/btrfs/170.out
new file mode 100644
index ..4c5fd87a
--- /dev/null
+++ b/tests/btrfs/170.out
@@ -0,0 +1,8 @@
+QA output created by 170
+wrote 131072/131072 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+File digest after write:
+85054e9e74bc3ae186d693890106b71f  SCRATCH_MNT/foobar
+Create a readonly snapshot of 'SCRATCH_MNT' in 'SCRATCH_MNT/snap'
+File digest after mounting the filesystem again:
+85054e9e74bc3ae186d693890106b71f  SCRATCH_MNT/foobar
diff --git a/tests/btrfs/group b/tests/btrfs/group
index b616c73d..3d330eed 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -172,3 +172,4 @@
 167 auto quick replace volume
 168 auto quick send
 169 auto quick send
+170 auto quick snapshot
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: test send with prealloc extent beyond EOF and hole punching

2018-07-30 Thread fdmanana
From: Filipe Manana 

Test that an incremental send operation produces correct results if a file
that has a prealloc (unwritten) extent beyond its EOF gets a hole punched
in a section of that prealloc extent.

This test is motivated by a bug found in btrfs which is fixed by a patch
for the linux kernel titled:

 "Btrfs: send, fix incorrect file layout after hole punching beyond eof"

Signed-off-by: Filipe Manana 
---
 tests/btrfs/169 | 83 +
 tests/btrfs/169.out | 13 +
 tests/btrfs/group   |  1 +
 3 files changed, 97 insertions(+)
 create mode 100755 tests/btrfs/169
 create mode 100644 tests/btrfs/169.out

diff --git a/tests/btrfs/169 b/tests/btrfs/169
new file mode 100755
index ..80eb5ab3
--- /dev/null
+++ b/tests/btrfs/169
@@ -0,0 +1,83 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 169
+#
+# Test that an incremental send operation produces correct results if a file
+# that has a prealloc (unwritten) extent beyond its EOF gets a hole punched
+# in a section of that prealloc extent.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+   rm -fr $send_files_dir
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_test
+_require_scratch
+_require_xfs_io_command "fpunch"
+_require_xfs_io_command "falloc" "-k"
+
+send_files_dir=$TEST_DIR/btrfs-test-$seq
+
+rm -f $seqres.full
+rm -fr $send_files_dir
+mkdir $send_files_dir
+
+_scratch_mkfs >>$seqres.full 2>&1
+_scratch_mount
+
+# Create our test file with a prealloc extent of 4Mb starting at offset 0,
+# then write 1Mb of data into offset 0.
+$XFS_IO_PROG -f -c "falloc -k 0 4M" \
+-c "pwrite -S 0xea 0 1M" \
+$SCRATCH_MNT/foobar | _filter_xfs_io
+
+$BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/snap1 2>&1 \
+   | _filter_scratch
+$BTRFS_UTIL_PROG send -f $send_files_dir/1.snap $SCRATCH_MNT/snap1 2>&1 \
+| _filter_scratch
+
+# Now punch a hole starting at an offset that corresponds to the file's current
+# size (1Mb) and ends at an offset smaller then the end offset of the prealloc
+# extent we allocated earlier (3Mb < 4Mb).
+$XFS_IO_PROG -c "fpunch 1M 2M" $SCRATCH_MNT/foobar
+
+$BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/snap2 2>&1 \
+   | _filter_scratch
+$BTRFS_UTIL_PROG send -p $SCRATCH_MNT/snap1 -f $send_files_dir/2.snap \
+$SCRATCH_MNT/snap2 2>&1 | _filter_scratch
+
+echo "File digest in the original filesystem:"
+md5sum $SCRATCH_MNT/snap2/foobar | _filter_scratch
+
+# Now recreate the filesystem by receiving both send streams and verify we get
+# the same file content that the original filesystem had.
+_scratch_unmount
+_scratch_mkfs >>$seqres.full 2>&1
+_scratch_mount
+
+$BTRFS_UTIL_PROG receive -f $send_files_dir/1.snap $SCRATCH_MNT
+$BTRFS_UTIL_PROG receive -f $send_files_dir/2.snap $SCRATCH_MNT
+
+echo "File digest in the new filesystem:"
+md5sum $SCRATCH_MNT/snap2/foobar | _filter_scratch
+
+status=0
+exit
diff --git a/tests/btrfs/169.out b/tests/btrfs/169.out
new file mode 100644
index ..ba77bf0a
--- /dev/null
+++ b/tests/btrfs/169.out
@@ -0,0 +1,13 @@
+QA output created by 169
+wrote 1048576/1048576 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+Create a readonly snapshot of 'SCRATCH_MNT' in 'SCRATCH_MNT/snap1'
+At subvol SCRATCH_MNT/snap1
+Create a readonly snapshot of 'SCRATCH_MNT' in 'SCRATCH_MNT/snap2'
+At subvol SCRATCH_MNT/snap2
+File digest in the original filesystem:
+d31659e82e87798acd4669a1e0a19d4f  SCRATCH_MNT/snap2/foobar
+At subvol snap1
+At snapshot snap2
+File digest in the new filesystem:
+d31659e82e87798acd4669a1e0a19d4f  SCRATCH_MNT/snap2/foobar
diff --git a/tests/btrfs/group b/tests/btrfs/group
index 5b132651..b616c73d 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -171,3 +171,4 @@
 166 auto quick qgroup
 167 auto quick replace volume
 168 auto quick send
+169 auto quick send
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: send, fix incorrect file layout after hole punching beyond eof

2018-07-30 Thread fdmanana
From: Filipe Manana 

When doing an incremental send, if we have a file in the parent snapshot
that has prealloc extents beyond EOF and in the send snapshot it got a
hole punch that partially covers the prealloc extents, the send stream,
when replayed by a receiver, can result in a file that has a size bigger
than it should and filled with zeroes past the correct EOF.

For example:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ xfs_io -f -c "falloc -k 0 4M" /mnt/foobar
  $ xfs_io -c "pwrite -S 0xea 0 1M" /mnt/foobar

  $ btrfs subvolume snapshot -r /mnt /mnt/snap1
  $ btrfs send -f /tmp/1.send /mnt/snap1

  $ xfs_io -c "fpunch 1M 2M" /mnt/foobar

  $ btrfs subvolume snapshot -r /mnt /mnt/snap2
  $ btrfs send -f /tmp/2.send -p /mnt/snap1 /mnt/snap2

  $ stat --format %s /mnt/snap2/foobar
  1048576
  $ md5sum /mnt/snap2/foobar
  d31659e82e87798acd4669a1e0a19d4f  /mnt/snap2/foobar

  $ umount /mnt
  $ mkfs.btrfs -f /dev/sdc
  $ mount /dev/sdc /mnt

  $ btrfs receive -f /mnt/1.snap /mnt
  $ btrfs receive -f /mnt/2.snap /mnt

  $ stat --format %s /mnt/snap2/foobar
  3145728
  # --> should be 1Mb and not 3Mb (which was the end offset of hole
  # punch operation)
  $ md5sum /mnt/snap2/foobar
  117baf295297c2a995f92da725b0b651  /mnt/snap2/foobar
  # --> should be d31659e82e87798acd4669a1e0a19d4f as in the original fs

This issue actually happens only since commit ffa7c4296e93 ("Btrfs: send,
do not issue unnecessary truncate operations"), but before that commit we
were issuing a write operation full of zeroes (to "punch" a hole) which
was extending the file size beyond the correct value and then immediately
issue a truncate operation to the correct size and undoing the previous
write operation. Since the send protocol does not support fallocate, for
extent preallocation and hole punching, fix this by not even attempting
to send a "hole" (regular write full of zeroes) if it starts at an offset
greater than or equal to the file's size. This approach, besides being
much simpler than making send issue the truncate operation, adds the
benefit of avoiding the useless pair of write of zeroes and truncate
operations, saving time and IO at the receiver and reducing the size of
the send stream.

A test case for fstests follows soon.

CC: sta...@vger.kernel.org # 4.17+
Signed-off-by: Filipe Manana 
---
 fs/btrfs/send.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 0f31760f875f..87e89f7ff335 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5007,6 +5007,15 @@ static int send_hole(struct send_ctx *sctx, u64 end)
u64 len;
int ret = 0;
 
+   /*
+* A hole that starts at EOF or beyond it. Since we do not yet support
+* fallocate (for extent preallocation and hole punching), sending a
+* write of zeroes starting at EOF or beyond would later require issuing
+* a truncate operation which would undo the write and achieve nothing.
+*/
+   if (offset >= sctx->cur_inode_size)
+   return 0;
+
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, end - offset);
 
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] Btrfs: fix send failure when root has deleted files still open

2018-07-24 Thread fdmanana
From: Filipe Manana 

The more common use case of send involves creating a RO snapshot and then
use it for a send operation. In this case it's not possible to have inodes
in the snapshot that have a link count of zero (inode with an orphan item)
since during snapshot creation we do the orphan cleanup. However, other
less common use cases for send can end up seeing inodes with a link count
of zero and in this case the send operation fails with a ENOENT error
because any attempt to generate a path for the inode, with the purpose
of creating it or updating it at the receiver, fails since there are no
inode reference items. One use case is to use a regular subvolume for
a send operation after turning it to RO mode or turning a RW snapshot
into RO mode and then using it for a send operation. In both cases, if a
file gets all its hard links deleted while there is an open file
descriptor before turning the subvolume/snapshot into RO mode, the send
operation will encounter an inode with a link count of zero and then
fail with errno ENOENT.

Example using a full send with a subvolume:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ btrfs subvolume create /mnt/sv1
  $ touch /mnt/sv1/foo
  $ touch /mnt/sv1/bar

  # keep an open file descriptor on file bar
  $ exec 73> /mnt/sv1/bar

  $ btrfs subvolume snapshot -r /mnt/sv1 /mnt/snap2

  # Turn the second snapshot to RW mode and delete file foo while
  # holding an open file descriptor on it.
  $ btrfs property set /mnt/snap2 ro false
  $ exec 73
Signed-off-by: Filipe Manana 
---

V2: Fixed a null pointer dereference for non-incremental send on
sctx->left_path.

 fs/btrfs/send.c | 138 
 1 file changed, 130 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 30be18be0036..0f31760f875f 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -100,6 +100,7 @@ struct send_ctx {
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 cur_inode_next_write_offset;
+   bool ignore_cur_inode;
 
u64 send_progress;
 
@@ -5799,6 +5800,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, 
int at_end)
int pending_move = 0;
int refs_processed = 0;
 
+   if (sctx->ignore_cur_inode)
+   return 0;
+
	ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
  &refs_processed);
if (ret < 0)
@@ -5917,6 +5921,94 @@ static int finish_inode_if_needed(struct send_ctx *sctx, 
int at_end)
return ret;
 }
 
+struct parent_paths_ctx {
+   struct list_head *refs;
+   struct send_ctx *sctx;
+};
+
+static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name,
+void *ctx)
+{
+   struct parent_paths_ctx *ppctx = ctx;
+
+   return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx,
+ ppctx->refs);
+}
+
+/*
+ * Issue unlink operations for all paths of the current inode found in the
+ * parent snapshot.
+ */
+static int btrfs_unlink_all_paths(struct send_ctx *sctx)
+{
+   LIST_HEAD(deleted_refs);
+   struct btrfs_path *path;
+   struct btrfs_key key;
+   struct parent_paths_ctx ctx;
+   int ret;
+
+   path = alloc_path_for_send();
+   if (!path)
+   return -ENOMEM;
+
+   key.objectid = sctx->cur_ino;
+   key.type = BTRFS_INODE_REF_KEY;
+   key.offset = 0;
+   ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
+   if (ret < 0)
+   goto out;
+
+   ctx.refs = &deleted_refs;
+   ctx.sctx = sctx;
+
+   while (true) {
+   struct extent_buffer *eb = path->nodes[0];
+   int slot = path->slots[0];
+
+   if (slot >= btrfs_header_nritems(eb)) {
+   ret = btrfs_next_leaf(sctx->parent_root, path);
+   if (ret < 0)
+   goto out;
+   else if (ret > 0)
+   break;
+   continue;
+   }
+
+   btrfs_item_key_to_cpu(eb, &key, slot);
+   if (key.objectid != sctx->cur_ino)
+   break;
+   if (key.type != BTRFS_INODE_REF_KEY &&
+   key.type != BTRFS_INODE_EXTREF_KEY)
+   break;
+
+   ret = iterate_inode_ref(sctx->parent_root, path, &key, 1,
+   record_parent_ref, &ctx);
+   if (ret < 0)
+   goto out;
+
+   path->slots[0]++;
+   }
+
+   while (!list_empty(&deleted_refs)) {
+   struct recorded_ref *ref;
+
+   ref = list_first_entry(&deleted_refs, struct recorded_ref,
+  list);
+   ret = send_unlink(sctx, ref->full_path);
+   if (ret < 0)
+   goto out;
+   

[PATCH] btrfs: test send with snapshots that have files deleted while open

2018-07-23 Thread fdmanana
From: Filipe Manana 

Test that we are able to do send operations when one of the source
snapshots (or subvolume) has a file that is deleted while there is still
a open file descriptor for that file.

This test is motivated by a bug found in btrfs which is fixed by a patch
for the linux kernel titled:

  "Btrfs: fix send failure when root has deleted files still open"

Signed-off-by: Filipe Manana 
---
 tests/btrfs/168 | 136 
 tests/btrfs/168.out |  13 +
 tests/btrfs/group   |   1 +
 3 files changed, 150 insertions(+)
 create mode 100755 tests/btrfs/168
 create mode 100644 tests/btrfs/168.out

diff --git a/tests/btrfs/168 b/tests/btrfs/168
new file mode 100755
index ..9a159d61
--- /dev/null
+++ b/tests/btrfs/168
@@ -0,0 +1,136 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 168
+#
+# Test that we are able to do send operations when one of the source snapshots
+# (or subvolume) has a file that is deleted while there is still a open file
+# descriptor for that file.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+   rm -fr $send_files_dir
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_test
+_require_scratch
+_require_btrfs_command "property"
+_require_fssum
+
+send_files_dir=$TEST_DIR/btrfs-test-$seq
+
+rm -f $seqres.full
+rm -fr $send_files_dir
+mkdir $send_files_dir
+
+_scratch_mkfs >>$seqres.full 2>&1
+_scratch_mount
+
+# Create a subvolume used for first full send test and used to create two
+# snapshots for the incremental send test.
+$BTRFS_UTIL_PROG subvolume create $SCRATCH_MNT/sv1 | _filter_scratch
+
+# Create some test files.
+$XFS_IO_PROG -f -c "pwrite -S 0xf1 0 64K" $SCRATCH_MNT/sv1/foo >>$seqres.full
+$XFS_IO_PROG -f -c "pwrite -S 0x7b 0 90K" $SCRATCH_MNT/sv1/bar >>$seqres.full
+$XFS_IO_PROG -f -c "pwrite -S 0xea 0 256K" $SCRATCH_MNT/sv1/baz >>$seqres.full
+
+# Flush the previous buffered writes, since setting a subvolume to RO mode
+# does not do it and we want to check if the data is correctly transmitted by
+# the send operations.
+sync
+
+# Keep an open file descriptor on file bar.
+exec 73<$SCRATCH_MNT/sv1/bar
+
+# While the file descriptor is open, delete the file, set the subvolume to
+# read-only mode and see if a full send operation succeeds.
+unlink $SCRATCH_MNT/sv1/bar
+$BTRFS_UTIL_PROG property set $SCRATCH_MNT/sv1 ro true
+$FSSUM_PROG -A -f -w $send_files_dir/sv1.fssum $SCRATCH_MNT/sv1
+$BTRFS_UTIL_PROG send -f $send_files_dir/sv1.send $SCRATCH_MNT/sv1 2>&1 \
+   | _filter_scratch
+
+# Now that the we did the full send, close the file descriptor and set the
+# subvolume back to read-write mode.
+exec 73>&-
+$BTRFS_UTIL_PROG property set $SCRATCH_MNT/sv1 ro false
+
+# Now try an incremental send operation while there's an open file descriptor
+# for a file that was deleted from the send snapshot (while it was in 
read-write
+# mode).
+
+# Create a snapshot of the subvolume, to be used later as the parent snapshot
+# for an incremental send operation.
+$BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT/sv1 $SCRATCH_MNT/snap1 \
+   | _filter_scratch
+
+# First do a full send of this snapshot.
+$FSSUM_PROG -A -f -w $send_files_dir/snap1.fssum $SCRATCH_MNT/snap1
+$BTRFS_UTIL_PROG send -f $send_files_dir/snap1.send $SCRATCH_MNT/snap1 2>&1 \
+   | _filter_scratch
+
+# Modify file baz, to check that the incremental send operation does not miss
+# that this file has changed.
+$XFS_IO_PROG -c "pwrite -S 0x19 4K 8K" $SCRATCH_MNT/sv1/baz >>$seqres.full
+
+# Create a second snapshot of the subvolume, to be used later as the send
+# snapshot of an incremental send operation.
+$BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT/sv1 $SCRATCH_MNT/snap2 \
+   | _filter_scratch
+
+# Temporarily turn the second snapshot to read-write mode and then open a file
+# descriptor on its foo file.
+$BTRFS_UTIL_PROG property set $SCRATCH_MNT/snap2 ro false
+exec 73<$SCRATCH_MNT/snap2/foo
+
+# Delete the foo file from the second snapshot while holding the file 
descriptor
+# open.
+unlink $SCRATCH_MNT/snap2/foo
+
+# Set the second snapshot back to RO mode, so that we can use it for the
+# incremental send operation.
+$BTRFS_UTIL_PROG property set $SCRATCH_MNT/snap2 ro true
+
+# Do the incremental send while there's an open file descriptor on file foo 
from
+# the second snapshot.
+$FSSUM_PROG -A -f -w $send_files_dir/snap2.fssum $SCRATCH_MNT/snap2
+$BTRFS_UTIL_PROG send -f $send_files_dir/snap2.send -p $SCRATCH_MNT/snap1 \
+   $SCRATCH_MNT/snap2 2>&1 | _filter_scratch
+
+# Now that the incremental send is done close the file 

[PATCH] Btrfs: remove unused key assignment when doing a full send

2018-07-23 Thread fdmanana
From: Filipe Manana 

At send.c:full_send_tree() we were setting the 'key' variable in the loop
while never using it later. We were also using two btrfs_key variables
to store the initial key for search and the key found in every iteration
of the loop. So remove this useless key assignment and use the same
btrfs_key variable to store the initial search key and the key found in
each iteration. This was introduced in the initial send commit but was
never used (commit 31db9f7c23fb ("Btrfs: introduce BTRFS_IOC_SEND for
btrfs send/receive")).

Signed-off-by: Filipe Manana 
---
 fs/btrfs/send.c | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6ffe1c983b76..8acc0e712cfa 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6451,7 +6451,6 @@ static int full_send_tree(struct send_ctx *sctx)
int ret;
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_key key;
-   struct btrfs_key found_key;
struct btrfs_path *path;
struct extent_buffer *eb;
int slot;
@@ -6473,17 +6472,13 @@ static int full_send_tree(struct send_ctx *sctx)
while (1) {
eb = path->nodes[0];
slot = path->slots[0];
-   btrfs_item_key_to_cpu(eb, &found_key, slot);
+   btrfs_item_key_to_cpu(eb, &key, slot);
 
-   ret = changed_cb(path, NULL, &found_key,
+   ret = changed_cb(path, NULL, &key,
 BTRFS_COMPARE_TREE_NEW, sctx);
if (ret < 0)
goto out;
 
-   key.objectid = found_key.objectid;
-   key.type = found_key.type;
-   key.offset = found_key.offset + 1;
-
ret = btrfs_next_item(send_root, path);
if (ret < 0)
goto out;
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: fix send failure when root has deleted files still open

2018-07-23 Thread fdmanana
From: Filipe Manana 

The more common use case of send involves creating a RO snapshot and then
use it for a send operation. In this case it's not possible to have inodes
in the snapshot that have a link count of zero (inode with an orphan item)
since during snapshot creation we do the orphan cleanup. However, other
less common use cases for send can end up seeing inodes with a link count
of zero and in this case the send operation fails with a ENOENT error
because any attempt to generate a path for the inode, with the purpose
of creating it or updating it at the receiver, fails since there are no
inode reference items. One use case is to use a regular subvolume for
a send operation after turning it to RO mode or turning a RW snapshot
into RO mode and then using it for a send operation. In both cases, if a
file gets all its hard links deleted while there is an open file
descriptor before turning the subvolume/snapshot into RO mode, the send
operation will encounter an inode with a link count of zero and then
fail with errno ENOENT.

Example using a full send with a subvolume:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ btrfs subvolume create /mnt/sv1
  $ touch /mnt/sv1/foo
  $ touch /mnt/sv1/bar

  # keep an open file descriptor on file bar
  $ exec 73> /mnt/sv1/bar

  $ btrfs subvolume snapshot -r /mnt/sv1 /mnt/snap2

  # Turn the second snapshot to RW mode and delete file foo while
  # holding an open file descriptor on it.
  $ btrfs property set /mnt/snap2 ro false
  $ exec 73
Signed-off-by: Filipe Manana 
---
 fs/btrfs/send.c | 143 
 1 file changed, 133 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index c47f62b19226..6ffe1c983b76 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -100,6 +100,7 @@ struct send_ctx {
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 cur_inode_next_write_offset;
+   bool ignore_cur_inode;
 
u64 send_progress;
 
@@ -5799,6 +5800,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, 
int at_end)
int pending_move = 0;
int refs_processed = 0;
 
+   if (sctx->ignore_cur_inode)
+   return 0;
+
ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
  &refs_processed);
if (ret < 0)
@@ -5917,6 +5921,94 @@ static int finish_inode_if_needed(struct send_ctx *sctx, 
int at_end)
return ret;
 }
 
+struct parent_paths_ctx {
+   struct list_head *refs;
+   struct send_ctx *sctx;
+};
+
+static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name,
+void *ctx)
+{
+   struct parent_paths_ctx *ppctx = ctx;
+
+   return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx,
+ ppctx->refs);
+}
+
+/*
+ * Issue unlink operations for all paths of the current inode found in the
+ * parent snapshot.
+ */
+static int btrfs_unlink_all_paths(struct send_ctx *sctx)
+{
+   LIST_HEAD(deleted_refs);
+   struct btrfs_path *path;
+   struct btrfs_key key;
+   struct parent_paths_ctx ctx;
+   int ret;
+
+   path = alloc_path_for_send();
+   if (!path)
+   return -ENOMEM;
+
+   key.objectid = sctx->cur_ino;
+   key.type = BTRFS_INODE_REF_KEY;
+   key.offset = 0;
+   ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
+   if (ret < 0)
+   goto out;
+
+   ctx.refs = &deleted_refs;
+   ctx.sctx = sctx;
+
+   while (true) {
+   struct extent_buffer *eb = path->nodes[0];
+   int slot = path->slots[0];
+
+   if (slot >= btrfs_header_nritems(eb)) {
+   ret = btrfs_next_leaf(sctx->parent_root, path);
+   if (ret < 0)
+   goto out;
+   else if (ret > 0)
+   break;
+   continue;
+   }
+
+   btrfs_item_key_to_cpu(eb, &key, slot);
+   if (key.objectid != sctx->cur_ino)
+   break;
+   if (key.type != BTRFS_INODE_REF_KEY &&
+   key.type != BTRFS_INODE_EXTREF_KEY)
+   break;
+
+   ret = iterate_inode_ref(sctx->parent_root, path, &key, 1,
+   record_parent_ref, &ctx);
+   if (ret < 0)
+   goto out;
+
+   path->slots[0]++;
+   }
+
+   while (!list_empty(_refs)) {
+   struct recorded_ref *ref;
+
+   ref = list_first_entry(&deleted_refs, struct recorded_ref,
+  list);
+   ret = send_unlink(sctx, ref->full_path);
+   if (ret < 0)
+   goto out;
+   fs_path_free(ref->full_path);
+   list_del(&ref->list);
+   kfree(ref);
+   

[PATCH v2] Btrfs: fix mount failure after fsync due to hard link recreation

2018-07-20 Thread fdmanana
From: Filipe Manana 

If we end up with logging an inode reference item which has the same name
but different index from the one we have persisted, we end up failing when
replaying the log with an errno value of -EEXIST. The error comes from
btrfs_add_link(), which is called from add_inode_ref(), when we are
replaying an inode reference item.

Example scenario where this happens:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ touch /mnt/foo
  $ ln /mnt/foo /mnt/bar

  $ sync

  # Rename the first hard link (foo) to a new name and rename the second
  # hard link (bar) to the old name of the first hard link (foo).
  $ mv /mnt/foo /mnt/qwerty
  $ mv /mnt/bar /mnt/foo

  # Create a new file, in the same parent directory, with the old name of
  # the second hard link (bar) and fsync this new file.
  # We do this instead of calling fsync on foo/qwerty because if we did
  # that the fsync resulted in a full transaction commit, not triggering
  # the problem.
  $ touch /mnt/bar
  $ xfs_io -c "fsync" /mnt/bar

  

  $ mount /dev/sdb /mnt
  mount: mount /dev/sdb on /mnt failed: File exists

So fix this by checking if a conflicting inode reference exists (same
name, same parent but different index), removing it (and the associated
dir index entries from the parent inode) if it exists, before attempting
to add the new reference.

A test case for fstests follows soon.

Signed-off-by: Filipe Manana 
---

V2: added missing error check from btrfs_unlink_inode()

 fs/btrfs/tree-log.c | 69 +
 1 file changed, 69 insertions(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f8220ec02036..5f3c33466c18 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1291,6 +1291,49 @@ static int unlink_old_inode_refs(struct 
btrfs_trans_handle *trans,
return ret;
 }
 
+static int btrfs_inode_ref_exists(struct inode *inode,
+ struct inode *dir,
+ const u8 ref_type,
+ const char *name,
+ const int namelen)
+{
+   struct btrfs_key key;
+   struct btrfs_path *path;
+   const u64 parent_id = btrfs_ino(BTRFS_I(dir));
+   int ret;
+
+   path = btrfs_alloc_path();
+   if (!path)
+   return -ENOMEM;
+
+   key.objectid = btrfs_ino(BTRFS_I(inode));
+   key.type = ref_type;
+   if (key.type == BTRFS_INODE_REF_KEY)
+   key.offset = parent_id;
+   else
+   key.offset = btrfs_extref_hash(parent_id, name, namelen);
+
+   ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
+   if (ret < 0)
+   goto out;
+   if (ret > 0) {
+   ret = 0;
+   goto out;
+   }
+   if (key.type == BTRFS_INODE_EXTREF_KEY)
+   ret = btrfs_find_name_in_ext_backref(path->nodes[0],
+path->slots[0],
+parent_id, name,
+namelen, NULL);
+   else
+   ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
+name, namelen, NULL);
+
+out:
+   btrfs_free_path(path);
+   return ret;
+}
+
 /*
  * replay one inode back reference item found in the log tree.
  * eb, slot and key refer to the buffer and key found in the log tree.
@@ -1400,6 +1443,32 @@ static noinline int add_inode_ref(struct 
btrfs_trans_handle *trans,
}
}
 
+   /*
+* If a reference item already exists for this inode
+* with the same parent and name, but different index,
+* drop it and the corresponding directory index entries
+* from the parent before adding the new reference item
+* and dir index entries, otherwise we would fail with
+* -EEXIST returned from btrfs_add_link() below.
+*/
+   ret = btrfs_inode_ref_exists(inode, dir, key->type,
+name, namelen);
+   if (ret > 0) {
+   ret = btrfs_unlink_inode(trans, root,
+BTRFS_I(dir),
+BTRFS_I(inode),
+name, namelen);
+   /*
+* If we dropped the link count to 0, bump it so
+* that later the iput() on the inode will not
+* free it. We will fixup the link count later.
+*/
+   

[PATCH] fstests: add test for fsync after renaming hard links of same file

2018-07-19 Thread fdmanana
From: Filipe Manana 

Test that if we have a file with 2 (or more) hard links in the same parent
directory, rename one of the hard links, rename one of the other hard links
to the old name of the hard link we renamed before, create a new file in the
same parent directory with the old name of the second hard link we renamed,
fsync this new file and power fail, we will be able to mount again
the filesystem and the new file and all hard links exist.

This test is motivated by a bug found in btrfs, where mounting the
filesystem after the power failure resulted in failure with an errno
value of EEXIST, which is fixed by a patch for the linux kernel titled:

  "Btrfs: fix mount failure after fsync due to hard link recreation"

Signed-off-by: Filipe Manana 
---
 tests/generic/502 | 79 +++
 tests/generic/502.out | 11 +++
 tests/generic/group   |  1 +
 3 files changed, 91 insertions(+)
 create mode 100755 tests/generic/502
 create mode 100644 tests/generic/502.out

diff --git a/tests/generic/502 b/tests/generic/502
new file mode 100755
index ..a60ac9a7
--- /dev/null
+++ b/tests/generic/502
@@ -0,0 +1,79 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 502
+#
+# Test that if we have a file with 2 (or more) hard links in the same parent
+# directory, rename one of the hard links, rename one of the other hard links
+# to the old name of the hard link we renamed before, create a new file in the
+# same parent directory with the old name of the second hard link we renamed,
+# fsync this new file and power fail, we will be able to mount again the
+# filesystem and the new file and all hard links exist.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch
+_require_dm_target flakey
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+# Create our test file with 2 hard links in the same parent directory.
+mkdir $SCRATCH_MNT/testdir
+touch $SCRATCH_MNT/testdir/foo
+ln $SCRATCH_MNT/testdir/foo $SCRATCH_MNT/testdir/bar
+
+# Make sure everything done so far is durably persisted.
+sync
+
+# Now rename the first hard link (foo) to a new name and rename the second hard
+# link (bar) to the old name of the first hard link (foo).
+mv $SCRATCH_MNT/testdir/foo $SCRATCH_MNT/testdir/qwerty
+mv $SCRATCH_MNT/testdir/bar $SCRATCH_MNT/testdir/foo
+
+# Create a new file, in the same parent directory, with the old name of the
+# second hard link (bar) and fsync this new file.
+touch $SCRATCH_MNT/testdir/bar
+$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir/bar
+
+echo "Contents of test directory before the power failure:"
+ls -R $SCRATCH_MNT/testdir | _filter_scratch
+
+# Simulate a power failure and mount the filesystem to check that we are able
+# to mount it and we have the same files, with the same hard links, that we had
+# before the power failure and in the same order.
+_flakey_drop_and_remount
+
+echo "Contents of test directory after the power failure:"
+ls -R $SCRATCH_MNT/testdir | _filter_scratch
+
+_unmount_flakey
+_cleanup_flakey
+
+status=0
+exit
diff --git a/tests/generic/502.out b/tests/generic/502.out
new file mode 100644
index ..0f43f0fb
--- /dev/null
+++ b/tests/generic/502.out
@@ -0,0 +1,11 @@
+QA output created by 502
+Contents of test directory before the power failure:
+SCRATCH_MNT/testdir:
+bar
+foo
+qwerty
+Contents of test directory after the power failure:
+SCRATCH_MNT/testdir:
+bar
+foo
+qwerty
diff --git a/tests/generic/group b/tests/generic/group
index 029c002c..d0b7dcf6 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -504,3 +504,4 @@
 499 auto quick rw collapse zero
 500 auto thin trim
 501 auto quick clone log
+502 auto quick log
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: fix mount failure after fsync due to hard link recreation

2018-07-19 Thread fdmanana
From: Filipe Manana 

If we end up with logging an inode reference item which has the same name
but different index from the one we have persisted, we end up failing when
replaying the log with an errno value of -EEXIST. The error comes from
btrfs_add_link(), which is called from add_inode_ref(), when we are
replaying an inode reference item.

Example scenario where this happens:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ touch /mnt/foo
  $ ln /mnt/foo /mnt/bar

  $ sync

  # Rename the first hard link (foo) to a new name and rename the second
  # hard link (bar) to the old name of the first hard link (foo).
  $ mv /mnt/foo /mnt/qwerty
  $ mv /mnt/bar /mnt/foo

  # Create a new file, in the same parent directory, with the old name of
  # the second hard link (bar) and fsync this new file.
  # We do this instead of calling fsync on foo/qwerty because if we did
  # that the fsync resulted in a full transaction commit, not triggering
  # the problem.
  $ touch /mnt/bar
  $ xfs_io -c "fsync" /mnt/bar

  

  $ mount /dev/sdb /mnt
  mount: mount /dev/sdb on /mnt failed: File exists

So fix this by checking if a conflicting inode reference exists (same
name, same parent but different index), removing it (and the associated
dir index entries from the parent inode) if it exists, before attempting
to add the new reference.

A test case for fstests follows soon.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/tree-log.c | 69 +
 1 file changed, 69 insertions(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f8220ec02036..4275b09200e3 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1291,6 +1291,49 @@ static int unlink_old_inode_refs(struct 
btrfs_trans_handle *trans,
return ret;
 }
 
+static int btrfs_inode_ref_exists(struct inode *inode,
+ struct inode *dir,
+ const u8 ref_type,
+ const char *name,
+ const int namelen)
+{
+   struct btrfs_key key;
+   struct btrfs_path *path;
+   const u64 parent_id = btrfs_ino(BTRFS_I(dir));
+   int ret;
+
+   path = btrfs_alloc_path();
+   if (!path)
+   return -ENOMEM;
+
+   key.objectid = btrfs_ino(BTRFS_I(inode));
+   key.type = ref_type;
+   if (key.type == BTRFS_INODE_REF_KEY)
+   key.offset = parent_id;
+   else
+   key.offset = btrfs_extref_hash(parent_id, name, namelen);
+
+   ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
+   if (ret < 0)
+   goto out;
+   if (ret > 0) {
+   ret = 0;
+   goto out;
+   }
+   if (key.type == BTRFS_INODE_EXTREF_KEY)
+   ret = btrfs_find_name_in_ext_backref(path->nodes[0],
+path->slots[0],
+parent_id, name,
+namelen, NULL);
+   else
+   ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
+name, namelen, NULL);
+
+out:
+   btrfs_free_path(path);
+   return ret;
+}
+
 /*
  * replay one inode back reference item found in the log tree.
  * eb, slot and key refer to the buffer and key found in the log tree.
@@ -1400,6 +1443,32 @@ static noinline int add_inode_ref(struct 
btrfs_trans_handle *trans,
}
}
 
+   /*
+* If a reference item already exists for this inode
+* with the same parent and name, but different index,
+* drop it and the corresponding directory index entries
+* from the parent before adding the new reference item
+* and dir index entries, otherwise we would fail with
+* -EEXIST returned from btrfs_add_link() below.
+*/
+   ret = btrfs_inode_ref_exists(inode, dir, key->type,
+name, namelen);
+   if (ret < 0)
+   goto out;
+   if (ret) {
+   ret = btrfs_unlink_inode(trans, root,
+BTRFS_I(dir),
+BTRFS_I(inode),
+name, namelen);
+   /*
+* If we dropped the link count to 0, bump it so
+* that later the iput() on the inode will not
+* free it. We will fixup the link count later.
+*/
+  

[PATCH v2] generic: add test for fsync after cloning file range

2018-07-12 Thread fdmanana
From: Filipe Manana 

Test that if we do a buffered write to a file, fsync it, clone a range
from another file into our file that overlaps the previously written
range, fsync the file again and then power fail, after we mount again the
filesystem, no file data was lost or corrupted.

This test is motivated by a bug found in btrfs, which is fixed by a patch
for the linux kernel titled:

  "Btrfs: fix file data corruption after cloning a range and fsync"

Signed-off-by: Filipe Manana 
---

V2: Fixed title of referenced btrfs patch.

 tests/generic/500 | 70 +++
 tests/generic/500.out |  5 
 tests/generic/group   |  1 +
 3 files changed, 76 insertions(+)
 create mode 100755 tests/generic/500
 create mode 100644 tests/generic/500.out

diff --git a/tests/generic/500 b/tests/generic/500
new file mode 100755
index ..b7baca34
--- /dev/null
+++ b/tests/generic/500
@@ -0,0 +1,70 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 500
+#
+# Test that if we do a buffered write to a file, fsync it, clone a range from
+# another file into our file that overlaps the previously written range, fsync
+# the file again and then power fail, after we mount again the filesystem, no
+# file data was lost or corrupted.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/reflink
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch_reflink
+_require_dm_target flakey
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+$XFS_IO_PROG -f -c "pwrite -S 0x18 9000K 6908K" $SCRATCH_MNT/foo >>$seqres.full
+$XFS_IO_PROG -f -c "pwrite -S 0x20 2572K 156K" $SCRATCH_MNT/bar >>$seqres.full
+
+# We clone from file foo into a range of file bar that overlaps the existing
+# extent at file bar. The destination offset of the reflink operation matches
+# the eof position of file bar minus 4Kb.
+$XFS_IO_PROG -c "fsync" \
+-c "reflink ${SCRATCH_MNT}/foo 0 2724K 15908K" \
+-c "fsync" \
+$SCRATCH_MNT/bar >>$seqres.full
+
+echo "File bar digest before power failure:"
+md5sum $SCRATCH_MNT/bar | _filter_scratch
+
+# Simulate a power failure and mount the filesystem to check that no file data
+# was lost or corrupted.
+_flakey_drop_and_remount
+
+echo "File bar digest after power failure:"
+md5sum $SCRATCH_MNT/bar | _filter_scratch
+
+_unmount_flakey
+_cleanup_flakey
+
+status=0
+exit
diff --git a/tests/generic/500.out b/tests/generic/500.out
new file mode 100644
index ..f590154e
--- /dev/null
+++ b/tests/generic/500.out
@@ -0,0 +1,5 @@
+QA output created by 500
+File bar digest before power failure:
+95a95813a8c2abc9aa75a6c2914a077e  SCRATCH_MNT/bar
+File bar digest after power failure:
+95a95813a8c2abc9aa75a6c2914a077e  SCRATCH_MNT/bar
diff --git a/tests/generic/group b/tests/generic/group
index b2a093f4..a84321dd 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -502,3 +502,4 @@
 497 auto quick swap collapse
 498 auto quick log
 499 auto quick rw collapse zero
+500 auto quick clone log
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] Btrfs: fix file data corruption after cloning a range and fsync

2018-07-12 Thread fdmanana
From: Filipe Manana 

When we clone a range into a file we can end up dropping existing
extent maps (or trimming them) and replacing them with new ones if the
range to be cloned overlaps with a range in the destination inode.
When that happens we add the new extent maps to the list of modified
extents in the inode's extent map tree, so that a "fast" fsync (the flag
BTRFS_INODE_NEEDS_FULL_SYNC not set in the inode) will see the extent maps
and log corresponding extent items. However, at the end of range cloning
operation we do truncate all the pages in the affected range (in order to
ensure future reads will not get stale data). Sometimes this truncation
will release the corresponding extent maps besides the pages from the page
cache. If this happens, then a "fast" fsync operation will miss logging
some extent items, because it relies exclusively on the extent maps being
present in the inode's extent tree, leading to data loss/corruption if
the fsync ends up using the same transaction used by the clone operation
(that transaction was not committed in the meanwhile). An extent map is
released through the callback btrfs_invalidatepage(), which gets called by
truncate_inode_pages_range(), and it calls __btrfs_releasepage(). The
latter ends up calling try_release_extent_mapping() which will release the
extent map if some conditions are met, like the file size being greater
than 16Mb, gfp flags allow blocking and the range not being locked (which
is the case during the clone operation) nor being the extent map flagged
as pinned (also the case for cloning).

The following example, turned into a test for fstests, reproduces the
issue:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ xfs_io -f -c "pwrite -S 0x18 9000K 6908K" /mnt/foo
  $ xfs_io -f -c "pwrite -S 0x20 2572K 156K" /mnt/bar

  $ xfs_io -c "fsync" /mnt/bar
  # reflink destination offset corresponds to the size of file bar,
  # 2728Kb minus 4Kb.
  $ xfs_io -c "reflink ${SCRATCH_MNT}/foo 0 2724K 15908K" /mnt/bar
  $ xfs_io -c "fsync" /mnt/bar

  $ md5sum /mnt/bar
  95a95813a8c2abc9aa75a6c2914a077e  /mnt/bar

  

  $ mount /dev/sdb /mnt
  $ md5sum /mnt/bar
  207fd8d0b161be8a84b945f0df8d5f8d  /mnt/bar
  # digest should be 95a95813a8c2abc9aa75a6c2914a077e like before the
  # power failure

In the above example, the destination offset of the clone operation
corresponds to the size of the "bar" file minus 4Kb. So during the clone
operation, the extent map covering the range from 2572Kb to 2728Kb gets
trimmed so that it ends at offset 2724Kb, and a new extent map covering
the range from 2724Kb to 11724Kb is created. So at the end of the clone
operation when we ask to truncate the pages in the range from 2724Kb to
2724Kb + 15908Kb, the page invalidation callback ends up removing the new
extent map (through try_release_extent_mapping()) when the page at offset
2724Kb is passed to that callback.

Fix this by setting the bit BTRFS_INODE_NEEDS_FULL_SYNC whenever an extent
map is removed at try_release_extent_mapping(), forcing the next fsync to
search for modified extents in the fs/subvolume tree instead of relying on
the presence of extent maps in memory. This way we can continue doing a
"fast" fsync if the destination range of a clone operation does not
overlap with an existing range or if any of the criteria necessary to
remove an extent map at try_release_extent_mapping() is not met (file
size not bigger then 16Mb or gfp flags do not allow blocking).

CC: sta...@vger.kernel.org # 3.16+
Signed-off-by: Filipe Manana 
---

V2: Added missing "fix" word to subject only.

 fs/btrfs/extent_io.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e55843f536bc..b3e45714d28f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4238,8 +4238,9 @@ int try_release_extent_mapping(struct page *page, gfp_t 
mask)
struct extent_map *em;
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
-   struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
-   struct extent_map_tree *map = &BTRFS_I(page->mapping->host)->extent_tree;
+   struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
+   struct extent_io_tree *tree = &btrfs_inode->io_tree;
+   struct extent_map_tree *map = &btrfs_inode->extent_tree;
 
if (gfpflags_allow_blocking(mask) &&
page->mapping->host->i_size > SZ_16M) {
@@ -4262,6 +4263,8 @@ int try_release_extent_mapping(struct page *page, gfp_t 
mask)
extent_map_end(em) - 1,
EXTENT_LOCKED | EXTENT_WRITEBACK,
0, NULL)) {
+   set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+   &btrfs_inode->runtime_flags);
remove_extent_mapping(map, em);
/* once for the rb tree */
  

[PATCH] generic: add test for fsync after cloning file range

2018-07-12 Thread fdmanana
From: Filipe Manana 

Test that if we do a buffered write to a file, fsync it, clone a range
from another file into our file that overlaps the previously written
range, fsync the file again and then power fail, after we mount again the
filesystem, no file data was lost or corrupted.

This test is motivated by a bug found in btrfs, which is fixed by a patch
for the linux kernel titled:

  "Btrfs: file data corruption after cloning a range and fsync"

Signed-off-by: Filipe Manana 
---
 tests/generic/500 | 70 +++
 tests/generic/500.out |  5 
 tests/generic/group   |  1 +
 3 files changed, 76 insertions(+)
 create mode 100755 tests/generic/500
 create mode 100644 tests/generic/500.out

diff --git a/tests/generic/500 b/tests/generic/500
new file mode 100755
index ..b7baca34
--- /dev/null
+++ b/tests/generic/500
@@ -0,0 +1,70 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 500
+#
+# Test that if we do a buffered write to a file, fsync it, clone a range from
+# another file into our file that overlaps the previously written range, fsync
+# the file again and then power fail, after we mount again the filesystem, no
+# file data was lost or corrupted.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/reflink
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch_reflink
+_require_dm_target flakey
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+$XFS_IO_PROG -f -c "pwrite -S 0x18 9000K 6908K" $SCRATCH_MNT/foo >>$seqres.full
+$XFS_IO_PROG -f -c "pwrite -S 0x20 2572K 156K" $SCRATCH_MNT/bar >>$seqres.full
+
+# We clone from file foo into a range of file bar that overlaps the existing
+# extent at file bar. The destination offset of the reflink operation matches
+# the eof position of file bar minus 4Kb.
+$XFS_IO_PROG -c "fsync" \
+-c "reflink ${SCRATCH_MNT}/foo 0 2724K 15908K" \
+-c "fsync" \
+$SCRATCH_MNT/bar >>$seqres.full
+
+echo "File bar digest before power failure:"
+md5sum $SCRATCH_MNT/bar | _filter_scratch
+
+# Simulate a power failure and mount the filesystem to check that no file data
+# was lost or corrupted.
+_flakey_drop_and_remount
+
+echo "File bar digest after power failure:"
+md5sum $SCRATCH_MNT/bar | _filter_scratch
+
+_unmount_flakey
+_cleanup_flakey
+
+status=0
+exit
diff --git a/tests/generic/500.out b/tests/generic/500.out
new file mode 100644
index ..f590154e
--- /dev/null
+++ b/tests/generic/500.out
@@ -0,0 +1,5 @@
+QA output created by 500
+File bar digest before power failure:
+95a95813a8c2abc9aa75a6c2914a077e  SCRATCH_MNT/bar
+File bar digest after power failure:
+95a95813a8c2abc9aa75a6c2914a077e  SCRATCH_MNT/bar
diff --git a/tests/generic/group b/tests/generic/group
index b2a093f4..a84321dd 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -502,3 +502,4 @@
 497 auto quick swap collapse
 498 auto quick log
 499 auto quick rw collapse zero
+500 auto quick clone log
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: file data corruption after cloning a range and fsync

2018-07-12 Thread fdmanana
From: Filipe Manana 

When we clone a range into a file we can end up dropping existing
extent maps (or trimming them) and replacing them with new ones if the
range to be cloned overlaps with a range in the destination inode.
When that happens we add the new extent maps to the list of modified
extents in the inode's extent map tree, so that a "fast" fsync (the flag
BTRFS_INODE_NEEDS_FULL_SYNC not set in the inode) will see the extent maps
and log corresponding extent items. However, at the end of range cloning
operation we do truncate all the pages in the affected range (in order to
ensure future reads will not get stale data). Sometimes this truncation
will release the corresponding extent maps besides the pages from the page
cache. If this happens, then a "fast" fsync operation will miss logging
some extent items, because it relies exclusively on the extent maps being
present in the inode's extent tree, leading to data loss/corruption if
the fsync ends up using the same transaction used by the clone operation
(that transaction was not committed in the meanwhile). An extent map is
released through the callback btrfs_invalidatepage(), which gets called by
truncate_inode_pages_range(), and it calls __btrfs_releasepage(). The
latter ends up calling try_release_extent_mapping() which will release the
extent map if some conditions are met, like the file size being greater
than 16Mb, gfp flags allow blocking and the range not being locked (which
is the case during the clone operation) nor being the extent map flagged
as pinned (also the case for cloning).

The following example, turned into a test for fstests, reproduces the
issue:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ xfs_io -f -c "pwrite -S 0x18 9000K 6908K" /mnt/foo
  $ xfs_io -f -c "pwrite -S 0x20 2572K 156K" /mnt/bar

  $ xfs_io -c "fsync" /mnt/bar
  # reflink destination offset corresponds to the size of file bar,
  # 2728Kb minus 4Kb.
  $ xfs_io -c "reflink ${SCRATCH_MNT}/foo 0 2724K 15908K" /mnt/bar
  $ xfs_io -c "fsync" /mnt/bar

  $ md5sum /mnt/bar
  95a95813a8c2abc9aa75a6c2914a077e  /mnt/bar

  

  $ mount /dev/sdb /mnt
  $ md5sum /mnt/bar
  207fd8d0b161be8a84b945f0df8d5f8d  /mnt/bar
  # digest should be 95a95813a8c2abc9aa75a6c2914a077e like before the
  # power failure

In the above example, the destination offset of the clone operation
corresponds to the size of the "bar" file minus 4Kb. So during the clone
operation, the extent map covering the range from 2572Kb to 2728Kb gets
trimmed so that it ends at offset 2724Kb, and a new extent map covering
the range from 2724Kb to 11724Kb is created. So at the end of the clone
operation when we ask to truncate the pages in the range from 2724Kb to
2724Kb + 15908Kb, the page invalidation callback ends up removing the new
extent map (through try_release_extent_mapping()) when the page at offset
2724Kb is passed to that callback.

Fix this by setting the bit BTRFS_INODE_NEEDS_FULL_SYNC whenever an extent
map is removed at try_release_extent_mapping(), forcing the next fsync to
search for modified extents in the fs/subvolume tree instead of relying on
the presence of extent maps in memory. This way we can continue doing a
"fast" fsync if the destination range of a clone operation does not
overlap with an existing range or if any of the criteria necessary to
remove an extent map at try_release_extent_mapping() is not met (file
size not bigger then 16Mb or gfp flags do not allow blocking).

CC: sta...@vger.kernel.org # 3.16+
Signed-off-by: Filipe Manana 
---
 fs/btrfs/extent_io.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e55843f536bc..b3e45714d28f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4238,8 +4238,9 @@ int try_release_extent_mapping(struct page *page, gfp_t 
mask)
struct extent_map *em;
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
-   struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
-   struct extent_map_tree *map = &BTRFS_I(page->mapping->host)->extent_tree;
+   struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
+   struct extent_io_tree *tree = &btrfs_inode->io_tree;
+   struct extent_map_tree *map = &btrfs_inode->extent_tree;
 
if (gfpflags_allow_blocking(mask) &&
page->mapping->host->i_size > SZ_16M) {
@@ -4262,6 +4263,8 @@ int try_release_extent_mapping(struct page *page, gfp_t 
mask)
extent_map_end(em) - 1,
EXTENT_LOCKED | EXTENT_WRITEBACK,
0, NULL)) {
+   set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+   &btrfs_inode->runtime_flags);
remove_extent_mapping(map, em);
/* once for the rb tree */
free_extent_map(em);

[PATCH] fstests: test power failure on btrfs while qgroups rescan is in progress

2018-06-27 Thread fdmanana
From: Filipe Manana 

Test that if a power failure happens on a filesystem with quotas (qgroups)
enabled while the quota rescan kernel thread is running, we will be able
to mount the filesystem after the power failure.

This test is motivated by a recent regression introduced in the linux
kernel's 4.18 merge window and is fixed by a patch with the title:

  "Btrfs: fix mount failure when qgroup rescan is in progress"

Signed-off-by: Filipe Manana 
---
 tests/btrfs/166 | 57 +
 tests/btrfs/166.out |  2 ++
 tests/btrfs/group   |  1 +
 3 files changed, 60 insertions(+)
 create mode 100755 tests/btrfs/166
 create mode 100644 tests/btrfs/166.out

diff --git a/tests/btrfs/166 b/tests/btrfs/166
new file mode 100755
index ..c74b0861
--- /dev/null
+++ b/tests/btrfs/166
@@ -0,0 +1,57 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FSQA Test No. 166
+#
+# Test that if a power failure happens on a filesystem with quotas (qgroups)
+# enabled while the quota rescan kernel thread is running, we will be able
+# to mount the filesystem after the power failure.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+_require_dm_target flakey
+
+rm -f $seqres.full
+
+_scratch_mkfs  >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+# Enable qgroups on the filesystem. This will start the qgroup rescan kernel
+# thread.
+_run_btrfs_util_prog quota enable $SCRATCH_MNT
+
+# Simulate a power failure, while the qgroup rescan kernel thread is running,
+# and then mount the filesystem to check that mounting the filesystem does not
+# fail.
+_flakey_drop_and_remount
+
+_unmount_flakey
+_cleanup_flakey
+
+echo "Silence is golden"
+status=0
+exit
diff --git a/tests/btrfs/166.out b/tests/btrfs/166.out
new file mode 100644
index ..1b1db1f8
--- /dev/null
+++ b/tests/btrfs/166.out
@@ -0,0 +1,2 @@
+QA output created by 166
+Silence is golden
diff --git a/tests/btrfs/group b/tests/btrfs/group
index 9988cedd..e68aa1b7 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -168,3 +168,4 @@
 163 auto quick volume
 164 auto quick volume
 165 auto quick subvol
+166 auto quick qgroup
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: fix mount failure when qgroup rescan is in progress

2018-06-27 Thread fdmanana
From: Filipe Manana 

If a power failure happens while the qgroup rescan kthread is running,
the next mount operation will always fail. This is because of a recent
regression that makes qgroup_rescan_init() incorrectly return -EINVAL
when we are mounting the filesystem (through btrfs_read_qgroup_config()).
This causes the -EINVAL error to be returned regardless of any qgroup
flags being set instead of returning the error only when neither of
the flags BTRFS_QGROUP_STATUS_FLAG_RESCAN nor BTRFS_QGROUP_STATUS_FLAG_ON
are set.

A test case for fstests follows up soon.

Fixes: 9593bf49675e ("btrfs: qgroup: show more meaningful qgroup_rescan_init 
error message")
Signed-off-by: Filipe Manana 
---
 fs/btrfs/qgroup.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 1874a6d2e6f5..d4171de93087 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2784,13 +2784,20 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 
progress_objectid,
 
if (!init_flags) {
/* we're resuming qgroup rescan at mount time */
-   if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN))
+   if (!(fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
btrfs_warn(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
-   else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+   ret = -EINVAL;
+   } else if (!(fs_info->qgroup_flags &
+BTRFS_QGROUP_STATUS_FLAG_ON)) {
btrfs_warn(fs_info,
"qgroup rescan init failed, qgroup rescan is not 
queued");
-   return -EINVAL;
+   ret = -EINVAL;
+   }
+
+   if (ret)
+   return ret;
}
 
 mutex_lock(&fs_info->qgroup_rescan_lock);
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] Btrfs: fix physical offset reported by fiemap for inline extents

2018-06-20 Thread fdmanana
From: Filipe Manana 

Commit 9d311e11fc1f ("Btrfs: fiemap: pass correct bytenr when
fm_extent_count is zero") introduced a regression where we no longer
report 0 as the physical offset for inline extents (and other extents
with a special block_start value). This is because it always sets the
variable used to report the physical offset ("disko") as em->block_start
plus some offset, and em->block_start has the value 18446744073709551614
((u64) -2) for inline extents.

This made the btrfs test 004 (from fstests) often fail, for example, for
a file with an inline extent we have the following items in the subvolume
tree:

item 101 key (418 INODE_ITEM 0) itemoff 11029 itemsize 160
   generation 25 transid 38 size 1525 nbytes 1525
   block group 0 mode 100666 links 1 uid 0 gid 0 rdev 0
   sequence 0 flags 0x2(none)
   atime 1529342058.461891730 (2018-06-18 18:14:18)
   ctime 1529342058.461891730 (2018-06-18 18:14:18)
   mtime 1529342058.461891730 (2018-06-18 18:14:18)
   otime 1529342055.869892885 (2018-06-18 18:14:15)
item 102 key (418 INODE_REF 264) itemoff 11016 itemsize 13
   index 25 namelen 3 name: fc7
item 103 key (418 EXTENT_DATA 0) itemoff 9470 itemsize 1546
   generation 38 type 0 (inline)
   inline extent data size 1525 ram_bytes 1525 compression 0 (none)

Then when test 004 invoked fiemap against the file it got a non-zero
physical offset:

 $ filefrag -v /mnt/p0/d4/d7/fc7
 Filesystem type is: 9123683e
 File size of /mnt/p0/d4/d7/fc7 is 1525 (1 block of 4096 bytes)
  ext: logical_offset:physical_offset: length:   expected: flags:
0:0..4095: 18446744073709551614..  4093:   4096:
 last,not_aligned,inline,eof
 /mnt/p0/d4/d7/fc7: 1 extent found

This resulted in the test failing like this:

btrfs/004 49s ... [failed, exit status 1]- output mismatch (see 
/home/fdmanana/git/hub/xfstests/results//btrfs/004.out.bad)
--- tests/btrfs/004.out 2016-08-23 10:17:35.027012095 +0100
+++ /home/fdmanana/git/hub/xfstests/results//btrfs/004.out.bad  
2018-06-18 18:15:02.385872155 +0100
@@ -1,3 +1,10 @@
 QA output created by 004
 *** test backref walking
-*** done
+./tests/btrfs/004: line 227: [: 7.55578637259143e+22: integer expression 
expected
+ERROR: 7.55578637259143e+22 is not a valid numeric value.
+unexpected output from
+   /home/fdmanana/git/hub/btrfs-progs/btrfs inspect-internal 
logical-resolve -s 65536 -P 7.55578637259143e+22 
/home/fdmanana/btrfs-tests/scratch_1
...
(Run 'diff -u tests/btrfs/004.out 
/home/fdmanana/git/hub/xfstests/results//btrfs/004.out.bad'  to see the entire 
diff)
Ran: btrfs/004

The large number in scientific notation reported as an invalid numeric
value is the result from the filter passed to perl which multiplies the
physical offset by the block size reported by fiemap.

So fix this by ensuring the physical offset is always set to 0 when we
are processing an extent with a special block_start value.

Fixes: 9d311e11fc1f ("Btrfs: fiemap: pass correct bytenr when fm_extent_count 
is zero")
Signed-off-by: Filipe Manana 
---

v2: Set the physical offset to 0 for other extent maps with a special
block_start value as well.

 fs/btrfs/extent_io.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8e4a7cdbc9f5..1aa91d57404a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4545,8 +4545,11 @@ int extent_fiemap(struct inode *inode, struct 
fiemap_extent_info *fieinfo,
offset_in_extent = em_start - em->start;
em_end = extent_map_end(em);
em_len = em_end - em_start;
-   disko = em->block_start + offset_in_extent;
flags = 0;
+   if (em->block_start < EXTENT_MAP_LAST_BYTE)
+   disko = em->block_start + offset_in_extent;
+   else
+   disko = 0;
 
/*
 * bump off for our next call to get_extent
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: fix physical offset reported by fiemap for inline extents

2018-06-19 Thread fdmanana
From: Filipe Manana 

Commit 9d311e11fc1f ("Btrfs: fiemap: pass correct bytenr when
fm_extent_count is zero") introduced a regression where we no longer
report 0 as the physical offset for inline extents. This is because it
always sets the variable used to report the physical offset ("disko")
as em->block_start plus some offset, and em->block_start has the value
18446744073709551614 ((u64) -2) for inline extents.

This made the btrfs test 004 (from fstests) often fail, for example, for
a file with an inline extent we have the following items in the subvolume
tree:

item 101 key (418 INODE_ITEM 0) itemoff 11029 itemsize 160
   generation 25 transid 38 size 1525 nbytes 1525
   block group 0 mode 100666 links 1 uid 0 gid 0 rdev 0
   sequence 0 flags 0x2(none)
   atime 1529342058.461891730 (2018-06-18 18:14:18)
   ctime 1529342058.461891730 (2018-06-18 18:14:18)
   mtime 1529342058.461891730 (2018-06-18 18:14:18)
   otime 1529342055.869892885 (2018-06-18 18:14:15)
item 102 key (418 INODE_REF 264) itemoff 11016 itemsize 13
   index 25 namelen 3 name: fc7
item 103 key (418 EXTENT_DATA 0) itemoff 9470 itemsize 1546
   generation 38 type 0 (inline)
   inline extent data size 1525 ram_bytes 1525 compression 0 (none)

Then when test 004 invoked fiemap against the file it got a non-zero
physical offset:

 $ filefrag -v /mnt/p0/d4/d7/fc7
 Filesystem type is: 9123683e
 File size of /mnt/p0/d4/d7/fc7 is 1525 (1 block of 4096 bytes)
  ext: logical_offset:physical_offset: length:   expected: flags:
0:0..4095: 18446744073709551614..  4093:   4096:
 last,not_aligned,inline,eof
 /mnt/p0/d4/d7/fc7: 1 extent found

This resulted in the test failing like this:

btrfs/004 49s ... [failed, exit status 1]- output mismatch (see 
/home/fdmanana/git/hub/xfstests/results//btrfs/004.out.bad)
--- tests/btrfs/004.out 2016-08-23 10:17:35.027012095 +0100
+++ /home/fdmanana/git/hub/xfstests/results//btrfs/004.out.bad  
2018-06-18 18:15:02.385872155 +0100
@@ -1,3 +1,10 @@
 QA output created by 004
 *** test backref walking
-*** done
+./tests/btrfs/004: line 227: [: 7.55578637259143e+22: integer expression 
expected
+ERROR: 7.55578637259143e+22 is not a valid numeric value.
+unexpected output from
+   /home/fdmanana/git/hub/btrfs-progs/btrfs inspect-internal 
logical-resolve -s 65536 -P 7.55578637259143e+22 
/home/fdmanana/btrfs-tests/scratch_1
...
(Run 'diff -u tests/btrfs/004.out 
/home/fdmanana/git/hub/xfstests/results//btrfs/004.out.bad'  to see the entire 
diff)
Ran: btrfs/004

The large number in scientific notation reported as an invalid numeric
value is the result from the filter passed to perl which multiplies the
physical offset by the block size reported by fiemap.

So fix this by ensuring the physical offset is always set to 0 when we
are processing an inline extent.

Fixes: 9d311e11fc1f ("Btrfs: fiemap: pass correct bytenr when fm_extent_count 
is zero")
Signed-off-by: Filipe Manana 
---
 fs/btrfs/extent_io.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8e4a7cdbc9f5..978327d98fc5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4559,6 +4559,7 @@ int extent_fiemap(struct inode *inode, struct 
fiemap_extent_info *fieinfo,
end = 1;
flags |= FIEMAP_EXTENT_LAST;
} else if (em->block_start == EXTENT_MAP_INLINE) {
+   disko = 0;
flags |= (FIEMAP_EXTENT_DATA_INLINE |
  FIEMAP_EXTENT_NOT_ALIGNED);
} else if (em->block_start == EXTENT_MAP_DELALLOC) {
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] generic: add test for fsync of directory after creating hard link

2018-06-11 Thread fdmanana
From: Filipe Manana 

Test that if we create a new hard link for a file which was previously
fsync'ed, fsync a parent directory of the new hard link and power fail,
the parent directory exists after mounting the filesystem again. The
parent directory must be a new directory, not yet persisted.

This test is motivated by a bug found in btrfs, where the fsync'ed parent
directory was lost after a power failure. The bug in btrfs is fixed by a
patch for the linux kernel titled:

 "Btrfs: sync log after logging new name"

Signed-off-by: Filipe Manana 
---
 tests/generic/498 | 65 +++
 tests/generic/498.out |  2 ++
 tests/generic/group   |  1 +
 3 files changed, 68 insertions(+)
 create mode 100755 tests/generic/498
 create mode 100644 tests/generic/498.out

diff --git a/tests/generic/498 b/tests/generic/498
new file mode 100755
index ..1cf73bda
--- /dev/null
+++ b/tests/generic/498
@@ -0,0 +1,65 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 498
+#
+# Test that if we create a new hard link for a file which was previously
+# fsync'ed, fsync a parent directory of the new hard link and power fail,
+# the parent directory exists after mounting the filesystem again.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch
+_require_dm_target flakey
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+mkdir $SCRATCH_MNT/A
+mkdir $SCRATCH_MNT/B
+mkdir $SCRATCH_MNT/A/C
+touch $SCRATCH_MNT/B/foo
+$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/B/foo
+# It is important the new hard link is located in a hierarchy of new 
directories
+# (not yet persisted).
+ln $SCRATCH_MNT/B/foo $SCRATCH_MNT/A/C/foo
+$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/A
+
+# Simulate a power failure and mount the filesystem to check that what we
+# explicitly fsync'ed exists.
+_flakey_drop_and_remount
+
+[ -d $SCRATCH_MNT/A ] || echo "directory A missing"
+[ -f $SCRATCH_MNT/B/foo ] || echo "file B/foo is missing"
+
+_unmount_flakey
+_cleanup_flakey
+
+echo "Silence is golden"
+status=0
+exit
diff --git a/tests/generic/498.out b/tests/generic/498.out
new file mode 100644
index ..31a5ff40
--- /dev/null
+++ b/tests/generic/498.out
@@ -0,0 +1,2 @@
+QA output created by 498
+Silence is golden
diff --git a/tests/generic/group b/tests/generic/group
index 397f9c1c..83a6fdab 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -500,3 +500,4 @@
 495 auto quick swap
 496 auto quick swap
 497 auto quick swap collapse
+498 auto quick log
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] Btrfs: fix return value on rename exchange failure

2018-06-11 Thread fdmanana
From: Filipe Manana 

If we failed during a rename exchange operation after starting/joining a
transaction, we would end up replacing the return value, stored in the
local 'ret' variable, with the return value from btrfs_end_transaction().
So this could end up returning 0 (success) to user space despite the
operation having failed and aborted the transaction, because if there are
multiple tasks having a reference on the transaction at the time
btrfs_end_transaction() is called by the rename exchange, that function
returns 0 (otherwise it returns -EIO and not the original error value).
So fix this by not overwriting the return value on error after getting
a transaction handle.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 89b208201783..2bb893aa54da 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9445,6 +9445,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
u64 new_idx = 0;
u64 root_objectid;
int ret;
+   int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
 
@@ -9641,7 +9642,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
dest_log_pinned = false;
}
}
-   ret = btrfs_end_transaction(trans);
+   ret2 = btrfs_end_transaction(trans);
+   ret = ret ? ret : ret2;
 out_notrans:
if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
 up_read(&fs_info->subvol_sem);
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] Btrfs: sync log after logging new name

2018-06-11 Thread fdmanana
From: Filipe Manana 

When we add a new name for an inode which was logged in the current
transaction, we update the inode in the log so that its new name and
ancestors are added to the log. However when we do this we do not persist
the log, so the changes remain in memory only, and as a consequence, any
ancestors that were created in the current transaction are updated such
that future calls to btrfs_inode_in_log() return true. This leads to a
subsequent fsync against such new ancestor directories returning
immediately, without persisting the log, therefore after a power failure
the new ancestor directories do not exist, despite fsync being called
against them explicitly.

Example:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ mkdir /mnt/A
  $ mkdir /mnt/B
  $ mkdir /mnt/A/C
  $ touch /mnt/B/foo
  $ xfs_io -c "fsync" /mnt/B/foo
  $ ln /mnt/B/foo /mnt/A/C/foo
  $ xfs_io -c "fsync" /mnt/A
  

After the power failure, directory "A" does not exist, despite the explicit
fsync on it.

Instead of fixing this by changing the behaviour of the explicit fsync on
directory "A" to persist the log instead of doing nothing, make the logging
of the new file name (which happens when creating a hard link or renaming)
persist the log. This approach not only is simpler, not requiring addition
of new fields to the inode in memory structure, but also gives us the same
behaviour as ext4, xfs and f2fs (possibly other filesystems too).

A test case for fstests follows soon.

Fixes: 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes")
Reported-by: Vijay Chidambaram 
Signed-off-by: Filipe Manana 
---
 fs/btrfs/inode.c| 92 ++---
 fs/btrfs/tree-log.c | 48 
 fs/btrfs/tree-log.h | 10 +-
 3 files changed, 131 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bb893aa54da..e84bf9d93ed4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6692,6 +6692,8 @@ static int btrfs_link(struct dentry *old_dentry, struct 
inode *dir,
drop_inode = 1;
} else {
struct dentry *parent = dentry->d_parent;
+   int ret;
+
err = btrfs_update_inode(trans, root, inode);
if (err)
goto fail;
@@ -6705,7 +6707,12 @@ static int btrfs_link(struct dentry *old_dentry, struct 
inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
-   btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
+   ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
+true, NULL);
+   if (ret == BTRFS_NEED_TRANS_COMMIT) {
+   err = btrfs_commit_transaction(trans);
+   trans = NULL;
+   }
}
 
 fail:
@@ -9445,14 +9452,21 @@ static int btrfs_rename_exchange(struct inode *old_dir,
u64 new_idx = 0;
u64 root_objectid;
int ret;
-   int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
+   struct btrfs_log_ctx ctx_root;
+   struct btrfs_log_ctx ctx_dest;
+   bool sync_log_root = false;
+   bool sync_log_dest = false;
+   bool commit_transaction = false;
 
/* we only allow rename subvolume link between subvolumes */
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
return -EXDEV;
 
+   btrfs_init_log_ctx(&ctx_root, old_inode);
+   btrfs_init_log_ctx(&ctx_dest, new_inode);
+
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
 down_read(&fs_info->subvol_sem);
@@ -9601,15 +9615,29 @@ static int btrfs_rename_exchange(struct inode *old_dir,
 
if (root_log_pinned) {
parent = new_dentry->d_parent;
-   btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
-   parent);
+   ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
+BTRFS_I(old_dir), parent,
+false, &ctx_root);
+   if (ret == BTRFS_NEED_LOG_SYNC)
+   sync_log_root = true;
+   else if (ret == BTRFS_NEED_TRANS_COMMIT)
+   commit_transaction = true;
+   ret = 0;
btrfs_end_log_trans(root);
root_log_pinned = false;
}
if (dest_log_pinned) {
-   parent = old_dentry->d_parent;
-   btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
-   parent);
+   if (!commit_transaction) {
+   parent = old_dentry->d_parent;
+   ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
+

[PATCH] fstests: generic test for fsync of file with xattrs

2018-05-11 Thread fdmanana
From: Filipe Manana 

Test that xattrs are not lost after calling fsync multiple times with a
filesystem commit in between the fsync calls.

This test is motivated by a bug found in btrfs which is fixed by a patch
for the linux kernel titled:

  Btrfs: fix xattr loss after power failure

Signed-off-by: Filipe Manana 
---
 tests/generic/487 | 86 +++
 tests/generic/487.out | 11 +++
 tests/generic/group   |  1 +
 3 files changed, 98 insertions(+)
 create mode 100755 tests/generic/487
 create mode 100644 tests/generic/487.out

diff --git a/tests/generic/487 b/tests/generic/487
new file mode 100755
index ..328b5378
--- /dev/null
+++ b/tests/generic/487
@@ -0,0 +1,86 @@
+#! /bin/bash
+# FSQA Test No. 487
+#
+# Test that xattrs are not lost after calling fsync multiple times with a
+# filesystem commit in between the fsync calls.
+#
+#---
+#
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana 
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+. ./common/attr
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch
+_require_dm_target flakey
+_require_attrs
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+touch $SCRATCH_MNT/foobar
+$SETFATTR_PROG -n user.xa1 -v qwerty $SCRATCH_MNT/foobar
+$SETFATTR_PROG -n user.xa2 -v 'hello world' $SCRATCH_MNT/foobar
+$SETFATTR_PROG -n user.xa3 -v test $SCRATCH_MNT/foobar
+$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foobar
+
+# Call sync to commit all filesystem metadata.
+sync
+
+$XFS_IO_PROG -c "pwrite -S 0xea 0 64K" \
+-c "fsync" \
+$SCRATCH_MNT/foobar >>$seqres.full
+
+# Simulate a power failure and mount the filesystem to check that the xattrs
+# were not lost and neither was the data we wrote.
+_flakey_drop_and_remount
+echo "File xattrs after power failure:"
+$GETFATTR_PROG --absolute-names --dump $SCRATCH_MNT/foobar | _filter_scratch
+echo "File data after power failure:"
+od -t x1 $SCRATCH_MNT/foobar
+
+_unmount_flakey
+_cleanup_flakey
+
+status=0
+exit
diff --git a/tests/generic/487.out b/tests/generic/487.out
new file mode 100644
index ..44a119f8
--- /dev/null
+++ b/tests/generic/487.out
@@ -0,0 +1,11 @@
+QA output created by 487
+File xattrs after power failure:
+# file: SCRATCH_MNT/foobar
+user.xa1="qwerty"
+user.xa2="hello world"
+user.xa3="test"
+
+File data after power failure:
+000 ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea
+*
+020
diff --git a/tests/generic/group b/tests/generic/group
index 505383f7..c8f51ec2 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -489,3 +489,4 @@
 484 auto quick
 485 auto quick insert
 486 auto quick attr
+487 auto quick attr
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: fix xattr loss after power failure

2018-05-11 Thread fdmanana
From: Filipe Manana 

If a file has xattrs, we fsync it, to ensure we clear the flags
BTRFS_INODE_NEEDS_FULL_SYNC and BTRFS_INODE_COPY_EVERYTHING from its
inode, the current transaction commits and then we fsync it (without
either of those bits being set in its inode), we end up not logging
all its xattrs. This results in deleting all xattrs when replaying the
log after a power failure.

Trivial reproducer

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ touch /mnt/foobar
  $ setfattr -n user.xa -v qwerty /mnt/foobar
  $ xfs_io -c "fsync" /mnt/foobar

  $ sync

  $ xfs_io -c "pwrite -S 0xab 0 64K" /mnt/foobar
  $ xfs_io -c "fsync" /mnt/foobar
  

  $ mount /dev/sdb /mnt
  $ getfattr --absolute-names --dump /mnt/foobar
  
  $

So fix this by making sure all xattrs are logged if we log a file's inode
item and neither the flags BTRFS_INODE_NEEDS_FULL_SYNC nor
BTRFS_INODE_COPY_EVERYTHING were set in the inode.

Fixes: 36283bf777d9 ("Btrfs: fix fsync xattr loss in the fast fsync path")
Cc: 
Signed-off-by: Filipe Manana 
---
 fs/btrfs/tree-log.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 25b888df00c9..d656de8bec52 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4916,6 +4916,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle 
*trans,
 struct extent_map_tree *em_tree = &inode->extent_tree;
u64 logged_isize = 0;
bool need_log_inode_item = true;
+   bool xattrs_logged = false;
 
path = btrfs_alloc_path();
if (!path)
@@ -5217,6 +5218,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle 
*trans,
err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
if (err)
goto out_unlock;
+   xattrs_logged = true;
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
@@ -5229,6 +5231,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle 
*trans,
btrfs_release_path(dst_path);
if (need_log_inode_item) {
err = log_inode_item(trans, log, dst_path, inode);
+   if (!err && !xattrs_logged) {
+   err = btrfs_log_all_xattrs(trans, root, inode, path,
+  dst_path);
+   btrfs_release_path(path);
+   }
if (err)
goto out_unlock;
}
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: fix duplicate extents after fsync of file with prealloc extents

2018-05-09 Thread fdmanana
From: Filipe Manana 

In commit 471d557afed1 ("Btrfs: fix loss of prealloc extents past i_size
after fsync log replay"), on fsync,  we started to always log all prealloc
extents beyond an inode's i_size in order to avoid losing them after a
power failure. However under some cases this can lead to the log replay
code to create duplicate extent items, with different lengths, in the
extent tree. That happens because, as of that commit, we can now log
extent items based on extent maps that are not on the "modified" list
of extent maps of the inode's extent map tree. Logging extent items based
on extent maps is used during the fast fsync path to save time and for
this to work reliably it requires that the extent maps are not merged
with other adjacent extent maps - having the extent maps in the list
of modified extents gives such guarantee.

Consider the following example, captured during a long run of fsstress,
which illustrates this problem.

We have inode 271, in the filesystem tree (root 5), for which all of the
following operations and discussion apply to.

A buffered write starts at offset 312391 with a length of 933471 bytes
(end offset at 1245862). At this point we have, for this inode, the
following extent maps with their field values:

em A, start 0, orig_start 0, len 40960, block_start 18446744073709551613,
  block_len 0, orig_block_len 0
em B, start 40960, orig_start 40960, len 376832, block_start 1106399232,
  block_len 376832, orig_block_len 376832
em C, start 417792, orig_start 417792, len 782336, block_start
  18446744073709551613, block_len 0, orig_block_len 0
em D, start 1200128, orig_start 1200128, len 835584, block_start
  1106776064, block_len 835584, orig_block_len 835584
em E, start 2035712, orig_start 2035712, len 245760, block_start
  1107611648, block_len 245760, orig_block_len 245760

Extent map A corresponds to a hole and extent maps D and E correspond to
preallocated extents.

Extent map D ends where extent map E begins (1106776064 + 835584 =
1107611648), but these extent maps were not merged because they are in
the inode's list of modified extent maps.

An fsync against this inode is made, which triggers the fast path
(BTRFS_INODE_NEEDS_FULL_SYNC is not set). This fsync triggers writeback
of the data previously written using buffered IO, and when the respective
ordered extent finishes, btrfs_drop_extents() is called against the
(aligned) range 311296..1249279. This causes a split of extent map D at
btrfs_drop_extent_cache(), replacing extent map D with a new extent map
D', also added to the list of modified extents,  with the following
values:

em D', start 1249280, orig_start of 1200128,
   block_start 1106825216 (= 1106776064 + 1249280 - 1200128),
   orig_block_len 835584,
   block_len 786432 (835584 - (1249280 - 1200128))

Then, during the fast fsync, btrfs_log_changed_extents() is called and
extent maps D' and E are removed from the list of modified extents. The
flag EXTENT_FLAG_LOGGING is also set on them. After the extents are logged
clear_em_logging() is called on each of them, and that makes extent map E
to be merged with extent map D' (try_merge_map()), resulting in D' being
deleted and E adjusted to:

em E, start 1249280, orig_start 1200128, len 1032192,
  block_start 1106825216, block_len 1032192,
  orig_block_len 245760

A direct IO write at offset 1847296 and length of 360448 bytes (end offset
at 2207744) starts, and at that moment the following extent maps exist for
our inode:

em A, start 0, orig_start 0, len 40960, block_start 18446744073709551613,
  block_len 0, orig_block_len 0
em B, start 40960, orig_start 40960, len 270336, block_start 1106399232,
  block_len 270336, orig_block_len 376832
em C, start 311296, orig_start 311296, len 937984, block_start 1112842240,
  block_len 937984, orig_block_len 937984
em E (prealloc), start 1249280, orig_start 1200128, len 1032192,
  block_start 1106825216, block_len 1032192, orig_block_len 245760

The dio write results in drop_extent_cache() being called twice. The first
time for a range that starts at offset 1847296 and ends at offset 2035711
(length of 188416), which results in a double split of extent map E,
replacing it with two new extent maps:

em F, start 1249280, orig_start 1200128, block_start 1106825216,
  block_len 598016, orig_block_len 598016
em G, start 2035712, orig_start 1200128, block_start 1107611648,
  block_len 245760, orig_block_len 1032192

It also creates a new extent map that represents a part of the requested
IO (through create_io_em()):

em H, start 1847296, len 188416, block_start 1107423232, block_len 188416

The second call to drop_extent_cache() has a range with a start offset of
2035712 and end offset of 2207743 (length of 172032). This leads to
replacing extent map G with a new extent map I with the following values:

em I, start 2207744, orig_start 1200128, block_start 1107783680,
  block_len 73728, 

[PATCH] Btrfs: send, fix missing truncate for inode with prealloc extent past eof

2018-04-30 Thread fdmanana
From: Filipe Manana 

An incremental send operation can miss a truncate operation when an inode
has an increased size in the send snapshot and a prealloc extent beyond
its size.

Consider the following scenario where a necessary truncate operation is
missing in the incremental send stream:

1) In the parent snapshot an inode has a size of 1282957 bytes and it has
   no prealloc extents beyond its size;

2) In the send snapshot it has a size of 5738496 bytes and has a new
   extent at offsets 1884160 (length of 106496 bytes) and a prealloc
   extent beyond eof at offset 6729728 (and a length of 339968 bytes);

3) When processing the prealloc extent, at offset 6729728, we end up at
   send.c:send_write_or_clone() and set the @len variable to a value of
   18446744073708560384 because @offset plus the original @len value is
   larger than the inode's size (6729728 + 339968 > 5738496). We then
   call send_extent_data(), with that @offset and @len, which in turn
   calls send_write(), and then the later calls fill_read_buf(). Because
   the offset passed to fill_read_buf() is greater than the inode's i_size,
   this function returns 0 immediately, which makes send_write() and
   send_extent_data() do nothing and return immediately as well. When
   we get back to send.c:send_write_or_clone() we adjust the value
   of sctx->cur_inode_next_write_offset to @offset plus @len, which
   corresponds to 6729728 + 18446744073708560384 = 5738496, which is
   precisely the size of the inode in the send snapshot;

4) Later when at send.c:finish_inode_if_needed() we determine that
   we don't need to issue a truncate operation because the value of
   sctx->cur_inode_next_write_offset corresponds to the inode's new
   size, 5738496 bytes. This is wrong because the last write operation
   that was issued started at offset 1884160 with a length of 106496
   bytes, so the correct value for sctx->cur_inode_next_write_offset
   should be 1990656 (1884160 + 106496), so that a truncate operation
   with a value of 5738496 bytes would have been sent to insert a
   trailing hole at the destination.

So fix the issue by making send.c:send_write_or_clone() not attempt
to send write or clone operations for extents that start beyond the
inode's size, since such attempts do nothing but waste time by
calling helper functions and allocating path structures, and send
currently has no fallocate command in order to create prealloc extents
at the destination (either beyond a file's eof or not).

The issue was found running the test btrfs/007 from fstests using a seed
value of 1524346151 for fsstress.

Reported-by: Gu, Jinxiang 
Fixes: ffa7c4296e93 ("Btrfs: send, do not issue unnecessary truncate 
operations")
Signed-off-by: Filipe Manana 
---
 fs/btrfs/send.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 1f5748c7d1c7..fff44ed15927 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5249,6 +5249,10 @@ static int send_write_or_clone(struct send_ctx *sctx,
len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
}
 
+   if (offset >= sctx->cur_inode_size) {
+   ret = 0;
+   goto out;
+   }
if (offset + len > sctx->cur_inode_size)
len = sctx->cur_inode_size - offset;
if (len == 0) {
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] fstests: generic test for fsync after fallocate

2018-04-09 Thread fdmanana
From: Filipe Manana 

Test that fsync operations preserve extents allocated with fallocate(2)
that are placed beyond a file's size.

This test is motivated by a bug found in btrfs where unwritten extents
beyond the inode's i_size were not preserved after a fsync and power
failure. The btrfs bug is fixed by the following patch for the linux
kernel:

 "Btrfs: fix loss of prealloc extents past i_size after fsync log replay"

Signed-off-by: Filipe Manana 
---

V2: Use different fiemap filter to not rely on written vs unwritten
differences in extents beyond eof, since xfs has a specific but
valid behaviour. Also check file sizes after the power failure
since the new fiemap filter now merges written and unwritten
extents.

 tests/generic/482 | 124 ++
 tests/generic/482.out |  13 ++
 tests/generic/group   |   1 +
 3 files changed, 138 insertions(+)
 create mode 100755 tests/generic/482
 create mode 100644 tests/generic/482.out

diff --git a/tests/generic/482 b/tests/generic/482
new file mode 100755
index ..a1947693
--- /dev/null
+++ b/tests/generic/482
@@ -0,0 +1,124 @@
+#! /bin/bash
+# FSQA Test No. 482
+#
+# Test that fsync operations preserve extents allocated with fallocate(2) that
+# are placed beyond a file's size.
+#
+#---
+#
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana 
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+. ./common/punch
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch
+_require_dm_target flakey
+_require_xfs_io_command "falloc" "-k"
+_require_xfs_io_command "fiemap"
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+# Create our test files.
+$XFS_IO_PROG -f -c "pwrite -S 0xea 0 256K" $SCRATCH_MNT/foo >/dev/null
+
+# Create a file with many extents. We later want to shrink truncate it and
+# add a prealloc extent beyond its new size.
+for ((i = 1; i <= 500; i++)); do
+   offset=$(((i - 1) * 4 * 1024))
+   $XFS_IO_PROG -f -s -c "pwrite -S 0xcf $offset 4K" \
+   $SCRATCH_MNT/bar >/dev/null
+done
+
+# A file which already has a prealloc extent beyond its size.
+# The fsync done on it is motivated by differences in the btrfs implementation
+# of fsync (first fsync has different logic from subsequent fsyncs).
+$XFS_IO_PROG -f -c "pwrite -S 0xf1 0 256K" \
+-c "falloc -k 256K 768K" \
+-c "fsync" \
+$SCRATCH_MNT/baz >/dev/null
+
+# Make sure everything done so far is durably persisted.
+sync
+
+# Allocate an extent beyond the size of the first test file and fsync it.
+$XFS_IO_PROG -c "falloc -k 256K 1M"\
+-c "fsync" \
+$SCRATCH_MNT/foo
+
+# Do a shrinking truncate of our test file, add a prealloc extent to it after
+# its new size and fsync it.
+$XFS_IO_PROG -c "truncate 256K" \
+-c "falloc -k 256K 1M"\
+-c "fsync" \
+$SCRATCH_MNT/bar
+
+# Allocate another extent beyond the size of file baz.
+$XFS_IO_PROG -c "falloc -k 1M 2M"\
+-c "fsync" \
+$SCRATCH_MNT/baz
+
+# Simulate a power failure and mount the filesystem to check that the extents
+# previously allocated were not lost and the file sizes are correct.
+_flakey_drop_and_remount
+
+echo "File foo fiemap:"
+$XFS_IO_PROG -c "fiemap -v" $SCRATCH_MNT/foo | _filter_hole_fiemap
+echo "File foo size:"
+stat --format %s $SCRATCH_MNT/foo
+
+echo "File bar fiemap:"
+$XFS_IO_PROG -c "fiemap -v" $SCRATCH_MNT/bar | _filter_hole_fiemap
+echo "File bar size:"
+stat --format %s $SCRATCH_MNT/bar
+
+echo "File baz fiemap:"
+$XFS_IO_PROG -c "fiemap -v" $SCRATCH_MNT/baz | _filter_hole_fiemap
+echo "File baz size:"
+stat --format %s 

[PATCH] fstests: generic test for fsync after fallocate

2018-04-06 Thread fdmanana
From: Filipe Manana 

Test that fsync operations preserve extents allocated with fallocate(2)
that are placed beyond a file's size.

This test is motivated by a bug found in btrfs where unwritten extents
beyond the inode's i_size were not preserved after a fsync and power
failure. The btrfs bug is fixed by the following patch for the linux
kernel:

 "Btrfs: fix loss of prealloc extents past i_size after fsync log replay"

Signed-off-by: Filipe Manana 
---
 tests/generic/482 | 118 ++
 tests/generic/482.out |  10 +
 tests/generic/group   |   1 +
 3 files changed, 129 insertions(+)
 create mode 100755 tests/generic/482
 create mode 100644 tests/generic/482.out

diff --git a/tests/generic/482 b/tests/generic/482
new file mode 100755
index ..43bbc913
--- /dev/null
+++ b/tests/generic/482
@@ -0,0 +1,118 @@
+#! /bin/bash
+# FSQA Test No. 482
+#
+# Test that fsync operations preserve extents allocated with fallocate(2) that
+# are placed beyond a file's size.
+#
+#---
+#
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana 
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+. ./common/punch
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch
+_require_dm_target flakey
+_require_xfs_io_command "falloc" "-k"
+_require_xfs_io_command "fiemap"
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+# Create our test files.
+$XFS_IO_PROG -f -c "pwrite -S 0xea 0 256K" $SCRATCH_MNT/foo >/dev/null
+
+# Create a file with many extents. We later want to shrink truncate it and
+# add a prealloc extent beyond its new size.
+for ((i = 1; i <= 500; i++)); do
+   offset=$(((i - 1) * 4 * 1024))
+   $XFS_IO_PROG -f -s -c "pwrite -S 0xcf $offset 4K" \
+   $SCRATCH_MNT/bar >/dev/null
+done
+
+# A file which already has a prealloc extent beyond its size.
+# The fsync done on it is motivated by differences in the btrfs implementation
+# of fsync (first fsync has different logic from subsequent fsyncs).
+$XFS_IO_PROG -f -c "pwrite -S 0xf1 0 256K" \
+-c "falloc -k 256K 768K" \
+-c "fsync" \
+$SCRATCH_MNT/baz >/dev/null
+
+# Make sure everything done so far is durably persisted.
+sync
+
+# Allocate an extent beyond the size of the first test file and fsync it.
+$XFS_IO_PROG -c "falloc -k 256K 1M"\
+-c "fsync" \
+$SCRATCH_MNT/foo
+
+# Do a shrinking truncate of our test file, add a prealloc extent to it after
+# its new size and fsync it.
+$XFS_IO_PROG -c "truncate 256K" \
+-c "falloc -k 256K 1M"\
+-c "fsync" \
+$SCRATCH_MNT/bar
+
+# Allocate another extent beyond the size of file baz.
+$XFS_IO_PROG -c "falloc -k 1M 2M"\
+-c "fsync" \
+$SCRATCH_MNT/baz
+
+# Simulate a power failure and mount the filesystem to check that the extents
+# previously allocated were not lost.
+_flakey_drop_and_remount
+
+echo "File foo fiemap:"
+$XFS_IO_PROG -c "fiemap -v" $SCRATCH_MNT/foo | _filter_fiemap
+
+echo "File bar fiemap:"
+$XFS_IO_PROG -c "fiemap -v" $SCRATCH_MNT/bar | _filter_fiemap
+
+echo "File baz fiemap:"
+$XFS_IO_PROG -c "fiemap -v" $SCRATCH_MNT/baz | _filter_fiemap
+
+_unmount_flakey
+_cleanup_flakey
+
+status=0
+exit
diff --git a/tests/generic/482.out b/tests/generic/482.out
new file mode 100644
index ..7e3ed139
--- /dev/null
+++ b/tests/generic/482.out
@@ -0,0 +1,10 @@
+QA output created by 482
+File foo fiemap:
+0: [0..511]: data
+1: [512..2559]: unwritten
+File bar fiemap:
+0: [0..511]: data
+1: [512..2559]: unwritten
+File baz fiemap:
+0: [0..511]: data
+1: [512..6143]: unwritten
diff --git a/tests/generic/group b/tests/generic/group
index 

[PATCH] Btrfs: fix loss of prealloc extents past i_size after fsync log replay

2018-04-06 Thread fdmanana
From: Filipe Manana 

Currently if we allocate extents beyond an inode's i_size (through the
fallocate system call) and then fsync the file, we log the extents but
after a power failure we replay them and then immediately drop them.
This behaviour happens since about 2009, commit c71bf099abdd ("Btrfs:
Avoid orphan inodes cleanup while replaying log"), because it marks
the inode as an orphan instead of dropping any extents beyond i_size
before replaying logged extents, so after the log replay, and while
the mount operation is still ongoing, we find the inode marked as an
orphan and then perform a truncation (drop extents beyond the inode's
i_size). Because the processing of orphan inodes is still done
right after replaying the log and before the mount operation finishes,
the intention of that commit does not make any sense (at least as
of today). However reverting that behaviour is not enough, because
we can not simply discard all extents beyond i_size and then replay
logged extents, because we risk dropping extents beyond i_size created
in past transactions, for example:

  add prealloc extent beyond i_size
  fsync - clears the flag BTRFS_INODE_NEEDS_FULL_SYNC from the inode
  transaction commit
  add another prealloc extent beyond i_size
  fsync - triggers the fast fsync path
  power failure

In that scenario, we would drop the first extent and then replay the
second one. To fix this just make sure that all prealloc extents
beyond i_size are logged, and if we find too many (which is far from
a common case), fallback to a full transaction commit (like we do when
logging regular extents in the fast fsync path).

Trivial reproducer:

 $ mkfs.btrfs -f /dev/sdb
 $ mount /dev/sdb /mnt
 $ xfs_io -f -c "pwrite -S 0xab 0 256K" /mnt/foo
 $ sync
 $ xfs_io -c "falloc -k 256K 1M" /mnt/foo
 $ xfs_io -c "fsync" /mnt/foo
 

 # mount to replay log
 $ mount /dev/sdb /mnt
 # at this point the file only has one extent, at offset 0, size 256K

A test case for fstests follows soon, covering multiple scenarios that
involve adding prealloc extents with previous shrinking truncates and
without such truncates.

Fixes: c71bf099abdd ("Btrfs: Avoid orphan inodes cleanup while replaying log")
Signed-off-by: Filipe Manana 
---
 fs/btrfs/tree-log.c | 63 -
 1 file changed, 58 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 70afd1085033..eb3a41269b0e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2457,13 +2457,41 @@ static int replay_one_buffer(struct btrfs_root *log, 
struct extent_buffer *eb,
if (ret)
break;
 
-   /* for regular files, make sure corresponding
-* orphan item exist. extents past the new EOF
-* will be truncated later by orphan cleanup.
+   /*
+* Before replaying extents, truncate the inode to its
+* size. We need to do it now and not after log replay
+* because before an fsync we can have prealloc extents
+* added beyond the inode's i_size. If we did it after,
+* through orphan cleanup for example, we would drop
+* those prealloc extents just after replaying them.
 */
if (S_ISREG(mode)) {
-   ret = insert_orphan_item(wc->trans, root,
-key.objectid);
+   struct inode *inode;
+   u64 from;
+
+   inode = read_one_inode(root, key.objectid);
+   if (!inode) {
+   ret = -EIO;
+   break;
+   }
+   from = ALIGN(i_size_read(inode),
+root->fs_info->sectorsize);
+   ret = btrfs_drop_extents(wc->trans, root, inode,
+from, (u64)-1, 1);
+   /*
+* If the nlink count is zero here, the iput
+* will free the inode.  We bump it to make
+* sure it doesn't get freed until the link
+* count fixup is done.
+*/
+   if (!ret) {
+   if (inode->i_nlink == 0)
+   inc_nlink(inode);
+   /* Update link count and nbytes. */
+   ret = btrfs_update_inode(wc->trans,
+

[PATCH v2] fstests: test btrfs fsync after hole punching with no-holes mode

2018-04-02 Thread fdmanana
From: Filipe Manana 

Test that when we have the no-holes mode enabled and a specific metadata
layout, if we punch a hole and fsync the file, at replay time the whole
hole was preserved.

This issue is fixed by the following btrfs patch for the linux kernel:

  "Btrfs: fix fsync after hole punching when using no-holes feature"

Signed-off-by: Filipe Manana 
---

V2: Made the test work when selinux is enabled, and made it use direct IO
writes to ensure 256K extents.

 tests/btrfs/159 | 115 
 tests/btrfs/159.out |   9 
 tests/btrfs/group   |   1 +
 3 files changed, 125 insertions(+)
 create mode 100755 tests/btrfs/159
 create mode 100644 tests/btrfs/159.out

diff --git a/tests/btrfs/159 b/tests/btrfs/159
new file mode 100755
index ..eb667692
--- /dev/null
+++ b/tests/btrfs/159
@@ -0,0 +1,115 @@
+#! /bin/bash
+# FSQA Test No. 159
+#
+# Test that when we have the no-holes mode enabled and a specific metadata
+# layout, if we punch a hole and fsync the file, at replay time the whole
+# hole was preserved.
+#
+#---
+#
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana 
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+_require_dm_target flakey
+_require_xfs_io_command "fpunch"
+_require_odirect
+
+rm -f $seqres.full
+
+run_test()
+{
+   local punch_offset=$1
+
+   # We create the filesystem with a node size of 64Kb because we need to
+   # create a specific metadata layout in order to trigger the bug we are
+   # testing. At the moment the node size can not be smaller than the
+   # system's page size, so given that the largest possible page size is
+   # 64Kb and by default the node size is set to the system's page size
+   # value, we explicitly create a filesystem with a 64Kb node size.
+   _scratch_mkfs -O no-holes -n $((64 * 1024)) >>$seqres.full 2>&1
+   _require_metadata_journaling $SCRATCH_DEV
+   _init_flakey
+   _mount_flakey
+
+   # Create our test file with 832 extents of 256Kb each. Before each
+   # extent, there is a 256Kb hole (except for the first extent, which
+   # starts at offset 0). This creates two leafs in the filesystem tree.
+   # We use direct IO to ensure we get exactly 256K extents (with buffered
+   # IO we can get writeback triggered at any time and therefore get
+   # extents smaller than 256K).
+   for ((i = 0; i <= 831; i++)); do
+   local offset=$((i * 2 * 256 * 1024))
+   $XFS_IO_PROG -f -d -c "pwrite -S 0xab -b 256K $offset 256K" \
+   $SCRATCH_MNT/foobar >/dev/null
+   done
+
+   # Make sure everything done so far is durably persisted.
+   sync
+
+   # Now punch a hole that covers part of the extent at offset
+   # "$punch_offset".
+   # We want to punch a hole that starts in the middle of the last extent
+   # item in the first leaf. On a system without selinux enabled that is
+   # the extent that starts at offset 216530944, while on a system with it
+   # enabled it is the extent that starts at offset 216006656 (because
+   # selinux causes a xattr item to be added to our test file).
+   $XFS_IO_PROG -c "fpunch $((punch_offset + 128 * 1024 - 4000)) 256K" \
+-c "fsync" \
+$SCRATCH_MNT/foobar
+
+   echo "File digest before power failure:"
+   md5sum $SCRATCH_MNT/foobar | _filter_scratch
+   # Simulate a power failure and mount the filesystem to check that
+   # replaying the fsync log/journal succeeds and our test file has the
+   # expected content.
+   _flakey_drop_and_remount
+   echo "File digest after power 

[PATCH] fstests: test btrfs fsync after hole punching with no-holes mode

2018-03-27 Thread fdmanana
From: Filipe Manana 

Test that when we have the no-holes mode enabled and a specific metadata
layout, if we punch a hole and fsync the file, at replay time the whole
hole was preserved.

This issue is fixed by the following btrfs patch for the linux kernel:

  "Btrfs: fix fsync after hole punching when using no-holes feature"

Signed-off-by: Filipe Manana 
---
 tests/btrfs/159 | 100 
 tests/btrfs/159.out |   5 +++
 tests/btrfs/group   |   1 +
 3 files changed, 106 insertions(+)
 create mode 100755 tests/btrfs/159
 create mode 100644 tests/btrfs/159.out

diff --git a/tests/btrfs/159 b/tests/btrfs/159
new file mode 100755
index ..6083975a
--- /dev/null
+++ b/tests/btrfs/159
@@ -0,0 +1,100 @@
+#! /bin/bash
+# FSQA Test No. 159
+#
+# Test that when we have the no-holes mode enabled and a specific metadata
+# layout, if we punch a hole and fsync the file, at replay time the whole
+# hole was preserved.
+#
+#---
+#
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana 
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch
+_require_dm_target flakey
+_require_xfs_io_command "fpunch"
+
+rm -f $seqres.full
+
+# We create the filesystem with a node size of 64Kb because we need to create a
+# specific metadata layout in order to trigger the bug we are testing. At the
+# moment the node size can not be smaller than the system's page size, so given
+# that the largest possible page size is 64Kb and by default the node size is
+# set to the system's page size value, we explicitly create a filesystem with a
+# 64Kb node size.
+_scratch_mkfs -O no-holes -n $((64 * 1024)) >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+# Create our test file with 831 extents of 256Kb each. Before each extent, 
there
+# is a 256Kb hole (except for the first extent, which starts at offset 0). This
+# creates two leafs in the filesystem tree, with the extent at offset 216530944
+# being the last item in the first leaf and the extent at offset 217055232 
being
+# the first item in the second leaf.
+for ((i = 0; i <= 831; i++)); do
+   offset=$((i * 2 * 256 * 1024))
+   $XFS_IO_PROG -f -c "pwrite -S 0xab -b 256K $offset 256K" \
+   $SCRATCH_MNT/foobar >/dev/null
+done
+
+# Now persist everything done so far.
+sync
+
+# Now punch a hole that covers part of the extent at offset 216530944.
+$XFS_IO_PROG -c "fpunch $((216530944 + 128 * 1024 - 4000)) 256K" \
+-c "fsync" \
+$SCRATCH_MNT/foobar
+
+echo "File digest before power failure:"
+md5sum $SCRATCH_MNT/foobar | _filter_scratch
+
+# Simulate a power failure and mount the filesystem to check that replaying the
+# fsync log/journal succeeds and our test file has the expected content.
+_flakey_drop_and_remount
+
+echo "File digest after power failure and log replay:"
+md5sum $SCRATCH_MNT/foobar | _filter_scratch
+
+_unmount_flakey
+_cleanup_flakey
+
+status=0
+exit
diff --git a/tests/btrfs/159.out b/tests/btrfs/159.out
new file mode 100644
index ..3317e516
--- /dev/null
+++ b/tests/btrfs/159.out
@@ -0,0 +1,5 @@
+QA output created by 159
+File digest before power failure:
+c5c0a13588a639529c979c57c336f441  SCRATCH_MNT/foobar
+File digest after power failure and log replay:
+c5c0a13588a639529c979c57c336f441  SCRATCH_MNT/foobar
diff --git a/tests/btrfs/group b/tests/btrfs/group
index 8007e07e..ba766f6b 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -161,3 +161,4 @@
 156 auto quick trim
 157 auto quick raid
 158 auto quick raid scrub
+159 auto quick
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to 

[PATCH 2/2] Btrfs: fix copy_items() return value when logging an inode

2018-03-27 Thread fdmanana
From: Filipe Manana 

When logging an inode, at tree-log.c:copy_items(), if we call
btrfs_next_leaf() at the loop which checks for the need to log holes, we
need to make sure copy_items() returns the value 1 to its caller and
not 0 (on success). This is because the path the caller passed was
released and is now different from what is was before, and the caller
expects a return value of 0 to mean both success and that the path
has not changed, while a return value of 1 means both success and
signals the caller that it can not reuse the path, it has to perform
another tree search.

Even though this is a case that should not be triggered on normal
circumstances or very rare at least, its consequences can be very
unpredictable (especially when replaying a log tree).

Fixes: 16e7549f045d ("Btrfs: incompatible format change to remove hole extents")
Signed-off-by: Filipe Manana 
---
 fs/btrfs/tree-log.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1d738ff5c41b..abd9fd81cd1c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3960,6 +3960,7 @@ static noinline int copy_items(struct btrfs_trans_handle 
*trans,
ASSERT(ret == 0);
src = src_path->nodes[0];
i = 0;
+   need_find_last_extent = true;
}
 
btrfs_item_key_to_cpu(src, , i);
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] Btrfs: fix fsync after hole punching when using no-holes feature

2018-03-27 Thread fdmanana
From: Filipe Manana 

When we have the no-holes mode enabled and fsync a file after punching a
hole in it, we can end up not logging the whole hole range in the log tree.
This happens if the file has extent items that span more than one leaf and
we punch a hole that covers a range that starts in a leaf but does not go
beyond the offset of the first extent in the next leaf.

Example:

  $ mkfs.btrfs -f -O no-holes -n 65536 /dev/sdb
  $ mount /dev/sdb /mnt
  $ for ((i = 0; i <= 831; i++)); do
offset=$((i * 2 * 256 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 256K $offset 256K" \
/mnt/foobar >/dev/null
done
  $ sync

  # We now have 2 leafs in our filesystem fs tree, the first leaf has an
  # item corresponding the extent at file offset 216530944 and the second
  # leaf has a first item corresponding to the extent at offset 217055232.
  # Now we punch a hole that partially covers the range of the extent at
  # offset 216530944 but does not go beyond the offset 217055232.

  $ xfs_io -c "fpunch $((216530944 + 128 * 1024 - 4000)) 256K" /mnt/foobar
  $ xfs_io -c "fsync" /mnt/foobar

  

  # mount to replay the log
  $ mount /dev/sdb /mnt

  # Before this patch, only the subrange [216658016, 216662016[ (length of
  # 4000 bytes) was logged, leaving an incorrect file layout after log
  # replay.

Fix this by checking if there is a hole between the last extent item that
we processed and the first extent item in the next leaf, and if there is
one, log an explicit hole extent item.

Fixes: 16e7549f045d ("Btrfs: incompatible format change to remove hole extents")
Signed-off-by: Filipe Manana 
---
 fs/btrfs/tree-log.c | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fd573816f461..1d738ff5c41b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3994,6 +3994,36 @@ static noinline int copy_items(struct btrfs_trans_handle 
*trans,
break;
*last_extent = extent_end;
}
+
+   /*
+* Check if there is a hole between the last extent found in our leaf
+* and the first extent in the next leaf. If there is one, we need to
+* log an explicit hole so that at replay time we can punch the hole.
+*/
+   if (ret == 0 &&
+   key.objectid == btrfs_ino(inode) &&
+   key.type == BTRFS_EXTENT_DATA_KEY &&
+   i == btrfs_header_nritems(src_path->nodes[0])) {
+   ret = btrfs_next_leaf(inode->root, src_path);
+   need_find_last_extent = true;
+   if (ret > 0) {
+   ret = 0;
+   } else if (ret == 0) {
+   btrfs_item_key_to_cpu(src_path->nodes[0], ,
+ src_path->slots[0]);
+   if (key.objectid == btrfs_ino(inode) &&
+   key.type == BTRFS_EXTENT_DATA_KEY &&
+   *last_extent < key.offset) {
+   const u64 len = key.offset - *last_extent;
+
+   ret = btrfs_insert_file_extent(trans, log,
+  btrfs_ino(inode),
+  *last_extent, 0,
+  0, len, 0, len,
+  0, 0, 0);
+   }
+   }
+   }
/*
 * Need to let the callers know we dropped the path so they should
 * re-search.
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2 v3] Btrfs-progs: add fsck test for filesystem with shared prealloc extents

2018-03-15 Thread fdmanana
From: Filipe Manana 

Verify that a filesystem check operation (fsck) does not report the
following scenario as an error:

An extent is shared between two inodes, as a result of clone/reflink
operation, and for one of the inodes, lets call it inode A, the extent is
referenced through a file extent item as a prealloc extent, while for the
other inode, call it inode B, the extent is referenced through a regular
file extent item, that is, it was written to. The goal of this test is to
make sure a filesystem check operation will not report "odd csum items"
errors for the prealloc extent at inode A, because this scenario is valid
since the extent was written through inode B and therefore it is expected
to have checksum items in the filesystem's checksum btree for that shared
extent.

Such scenario can be created with the following steps for example:

 mkfs.btrfs -f /dev/sdb
 mount /dev/sdb /mnt

 touch /mnt/foo
 xfs_io -c "falloc 0 256K" /mnt/foo
 sync

 xfs_io -c "pwrite -S 0xab 0 256K" /mnt/foo
 touch /mnt/bar
 xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
 xfs_io -c "fsync" /mnt/bar

 
 mount /dev/sdb /mnt
 umount /mnt

This scenario is fixed by the following patch for the filesystem checker:

 "Btrfs-progs: check, fix false error reports for shared prealloc extents"

Signed-off-by: Filipe Manana 
Reviewed-by: Qu Wenruo 
---

V3: No changes.
V2: No changes, only added Qu's reviewed-by tag.

 .../reflinked-prealloc-extents.img.xz  | Bin 0 -> 3244 bytes
 .../030-reflinked-prealloc-extents/test.sh |  42 +
 2 files changed, 42 insertions(+)
 create mode 100644 
tests/fsck-tests/030-reflinked-prealloc-extents/reflinked-prealloc-extents.img.xz
 create mode 100755 tests/fsck-tests/030-reflinked-prealloc-extents/test.sh

diff --git 
a/tests/fsck-tests/030-reflinked-prealloc-extents/reflinked-prealloc-extents.img.xz
 
b/tests/fsck-tests/030-reflinked-prealloc-extents/reflinked-prealloc-extents.img.xz
new file mode 100644
index 
..8adf0071328806fa6981f6ef225084e517d1bc3e
GIT binary patch
literal 3244
zcmV;d3{&&{H+ooF000E$*0e?f03iV!G>wRyj;C3^v%$$4d1wo3
zjjaF1$3n~+-*XiD+@YGxDtdHe$qq4$Wo>7O_CGnuIn8OLnT=x4IGVZ-w??mVmRMZ7
z5Ay+V;mHSAoDkyM-tpo^bS;x+v}A`sNQY^!9&~{eDz@XZKXO?8OF)IjGm%(
zDUT1JIsHjj>>}y5qdSJT79-V@5Qr*LP$DwA@XNl^>?ShU!y!KD38l^=LE*nzF2}qs
zeB4zmhwleI8r9LN(p#cQeLEqdB4_Wgl7Qx-M>xaL`0TU`9k;qs6+KP*H>U4>=}6F*5_MhCaIzBlfo8uAWSd+hP%WEflaqMvG9Yi+k#=?=X1;)RqEp2fES1y0!CcOxHBNudB4H~
zg^Q=pEd>6f!;l~S^outD;V9&@RLa{&{9spkk|mHXS%NxkxrGeraxy^{Cx+(
z$hvBW#`Zhts1td;TLkX1RsNg`CWT*I)*I@mk(hGjz?_yyPd-M$ua7B`xni7MSs
zWyl`7wxZ%$wBM$)P<1(dhx?%fv+}{l<|u?L$$QGeT(rUdBI{pRI`}U)a7Fj|kXDbn-0nezRe
zyo)Z$_6XM$t$LFEKukadE#*)^!)pQ&8mYw_kMjNgxd*nQI${ow>pAWyg-
zeY~@@vvyi{r6BFgsls*62dIJ6pHyrL+V9bwi^Y3AmVbJ-rj~|0%3qm%C-loCfyP>s
z$_FFfC0=|!1xk(2zP?u6Z!@qsjy-cb?f8VOdA*CL8lOP3H25+rghFiC_`b!X$
z>P+>~7sXasAE_)+#HFn-4WN^kg_vvy*M&~#=`LtcOqgLlop`KKZ3Jgg1KVc>0+
z<&7l#_?5OdUbH9q)rI1J($!(YD81|JB{3iNd4fYKY~4PtyP7Z3#>vE
zLbdE%`v*0S%thI+abigpJ5|RLJpSeZubQp{B{+2vJAP~H{7!6?%0k~ao{0;mLf$)D
z#8FMC2d0|gj@k<#OyA%!|kb9cc7H-YKcGH#V8vGFVvS$6Fd)cwk
ze|9UrRDX1SMKK2wwAIrYlX^SQ^Gd1TmT8TquvY#QgwM?S~ZIw4!K(Esm|Uq
z`*nSS2CYP0d>BeDe@8%d7RbS$9;pJuD7(EMB+vmZ%gK70s>Oyw9^pFXUsEVku?z;W
zEAMEG2zVPq|IQkY!446b#zBa;-T;gc5rz(FDUjCpB9(^S_qi=4Lhcn8?1nXaN5b_i
z6}`_lNEJMtzqR%|caxD_PZ1dBV+a)jInRU=V%d);03w3e2LWwuKn=GMU>w=2+NG32Rn!YaXN2+v~(kl
zo^@2=_Kn$=vP0Dp_k^q>8Q{e{(7kB%9_mJ|<39=H&0Li1=wcrd{hN7URQks)E8JY=
zgM5#Nt%rY;GY*s%)x8WSes6E8vtAxZ0RAS+(V%}@kdj~mJ5{x#s
zou?Fb;e7HrfU;p4?0sY877;Z1H?}-Ygcc31-G9Y(;OpjA_g*cpG4-G%Xr76TlW!S!
zD_4#GiJ9d6}O$qGTb3Ue-WLHSzqdibPv^Qgkk
zF!gSYFP0o14elE{;)bNl2Z@l^s_=eh+ez))ZImHa$*Ti;e$`XkLqg
zSP(U&{f0R?H;DC~3y7mGE!{R^vpI=_uSE%tjI~g#ih?Ln6gz3n`*V}gKOyr)vUi}w

[PATCH 1/2 v3] Btrfs-progs: check, fix false error reports for shared prealloc extents

2018-03-15 Thread fdmanana
From: Filipe Manana 

Under some cases the filesystem checker reports an error when it finds
checksum items for an extent that is referenced by an inode as a prealloc
extent. Such cases are not an error when the extent is actually shared
(was cloned/reflinked) with other inodes and was written through one of
those other inodes.

Example:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ touch /mnt/foo
  $ xfs_io -c "falloc 0 256K" /mnt/foo
  $ sync

  $ xfs_io -c "pwrite -S 0xab 0 256K" /mnt/foo
  $ touch /mnt/bar
  $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
  $ xfs_io -c "fsync" /mnt/bar

  
  $ mount /dev/sdb /mnt
  $ umount /mnt

  $ btrfs check /dev/sdb
  Checking filesystem on /dev/sdb
  UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
  checking extents
  checking free space cache
  checking fs roots
  root 5 inode 257 errors 800, odd csum item
  ERROR: errors found in fs roots
  found 688128 bytes used, error(s) found
  total csum bytes: 256
  total tree bytes: 163840
  total fs tree bytes: 65536
  total extent tree bytes: 16384
  btree space waste bytes: 138819
  file data blocks allocated: 10747904
   referenced 10747904
  $ echo $?
  1

So teach check to not report such cases as errors by checking if the
extent is shared with other inodes and if so, consider it an error the
existence of checksum items only if all those other inodes are referencing
the extent as a prealloc extent.
This case can be hit often when running the generic/475 testcase from
fstests.

A test case will follow in a separate patch.

Signed-off-by: Filipe Manana 
---

V3: Replaced incorrect argument to a btrfs_next_leaf() call (extent_root -> 
root).

V2: Made stuff work with lowmem mode as well.
Added a comment about the limitations of the current check.
Removed filtering by inode number since it was unreliable as we can
have different inodes with same inode number but in different roots
(so opted to drop the filtering instead of filtering by root as well,
to keep it simpler).


 check/main.c|  11 ++-
 check/mode-common.c | 258 
 check/mode-common.h |   3 +
 check/mode-lowmem.c |  14 ++-
 4 files changed, 281 insertions(+), 5 deletions(-)

diff --git a/check/main.c b/check/main.c
index 392195ca..88e6c1e9 100644
--- a/check/main.c
+++ b/check/main.c
@@ -1515,8 +1515,15 @@ static int process_file_extent(struct btrfs_root *root,
if (found < num_bytes)
rec->some_csum_missing = 1;
} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
-   if (found > 0)
-   rec->errors |= I_ERR_ODD_CSUM_ITEM;
+   if (found > 0) {
+   ret = 
check_prealloc_extent_written(root->fs_info,
+   disk_bytenr,
+   num_bytes);
+   if (ret < 0)
+   return ret;
+   if (ret == 0)
+   rec->errors |= I_ERR_ODD_CSUM_ITEM;
+   }
}
}
return 0;
diff --git a/check/mode-common.c b/check/mode-common.c
index 1b56a968..8a59e8a4 100644
--- a/check/mode-common.c
+++ b/check/mode-common.c
@@ -24,6 +24,264 @@
 #include "check/mode-common.h"
 
 /*
+ * Check if the inode referenced by the given data reference uses the extent
+ * at disk_bytenr as a non-prealloc extent.
+ *
+ * Returns 1 if true, 0 if false and < 0 on error.
+ */
+static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
+  u64 disk_bytenr,
+  struct btrfs_extent_data_ref *dref,
+  struct extent_buffer *eb)
+{
+   u64 rootid = btrfs_extent_data_ref_root(eb, dref);
+   u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
+   u64 offset = btrfs_extent_data_ref_offset(eb, dref);
+   struct btrfs_root *root;
+   struct btrfs_key key;
+   struct btrfs_path path;
+   int ret;
+
+   btrfs_init_path();
+   key.objectid = rootid;
+   key.type = BTRFS_ROOT_ITEM_KEY;
+   key.offset = (u64)-1;
+   root = btrfs_read_fs_root(fs_info, );
+   if (IS_ERR(root)) {
+   ret = PTR_ERR(root);
+   goto out;
+   }
+
+   key.objectid = objectid;
+   key.type = BTRFS_EXTENT_DATA_KEY;
+   key.offset = offset;
+   ret = btrfs_search_slot(NULL, root, , , 0, 0);
+   if (ret > 0) {
+   fprintf(stderr,
+   "Missing file extent item for inode %llu, root %llu, 
offset %llu",
+   objectid, rootid, offset);
+   ret = -ENOENT;
+   }
+   if (ret < 0)
+   goto out;
+
+

[PATCH 2/2 v2] Btrfs-progs: add fsck test for filesystem with shared prealloc extents

2018-03-14 Thread fdmanana
From: Filipe Manana 

Verify that a filesystem check operation (fsck) does not report the
following scenario as an error:

An extent is shared between two inodes, as a result of clone/reflink
operation, and for one of the inodes, lets call it inode A, the extent is
referenced through a file extent item as a prealloc extent, while for the
other inode, call it inode B, the extent is referenced through a regular
file extent item, that is, it was written to. The goal of this test is to
make sure a filesystem check operation will not report "odd csum items"
errors for the prealloc extent at inode A, because this scenario is valid
since the extent was written through inode B and therefore it is expected
to have checksum items in the filesystem's checksum btree for that shared
extent.

Such scenario can be created with the following steps for example:

 mkfs.btrfs -f /dev/sdb
 mount /dev/sdb /mnt

 touch /mnt/foo
 xfs_io -c "falloc 0 256K" /mnt/foo
 sync

 xfs_io -c "pwrite -S 0xab 0 256K" /mnt/foo
 touch /mnt/bar
 xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
 xfs_io -c "fsync" /mnt/bar

 
 mount /dev/sdb /mnt
 umount /mnt

This scenario is fixed by the following patch for the filesystem checker:

 "Btrfs-progs: check, fix false error reports for shared prealloc extents"

Signed-off-by: Filipe Manana 
Reviewed-by: Qu Wenruo 
---

V2: No changes, only added Qu's reviewed-by tag.

 .../reflinked-prealloc-extents.img.xz  | Bin 0 -> 3244 bytes
 .../030-reflinked-prealloc-extents/test.sh |  42 +
 2 files changed, 42 insertions(+)
 create mode 100644 
tests/fsck-tests/030-reflinked-prealloc-extents/reflinked-prealloc-extents.img.xz
 create mode 100755 tests/fsck-tests/030-reflinked-prealloc-extents/test.sh

diff --git 
a/tests/fsck-tests/030-reflinked-prealloc-extents/reflinked-prealloc-extents.img.xz
 
b/tests/fsck-tests/030-reflinked-prealloc-extents/reflinked-prealloc-extents.img.xz
new file mode 100644
index 
..8adf0071328806fa6981f6ef225084e517d1bc3e
GIT binary patch
literal 3244
zcmV;d3{&&{H+ooF000E$*0e?f03iV!G>wRyj;C3^v%$$4d1wo3
zjjaF1$3n~+-*XiD+@YGxDtdHe$qq4$Wo>7O_CGnuIn8OLnT=x4IGVZ-w??mVmRMZ7
z5Ay+V;mHSAoDkyM-tpo^bS;x+v}A`sNQY^!9&~{eDz@XZKXO?8OF)IjGm%(
zDUT1JIsHjj>>}y5qdSJT79-V@5Qr*LP$DwA@XNl^>?ShU!y!KD38l^=LE*nzF2}qs
zeB4zmhwleI8r9LN(p#cQeLEqdB4_Wgl7Qx-M>xaL`0TU`9k;qs6+KP*H>U4>=}6F*5_MhCaIzBlfo8uAWSd+hP%WEflaqMvG9Yi+k#=?=X1;)RqEp2fES1y0!CcOxHBNudB4H~
zg^Q=pEd>6f!;l~S^outD;V9&@RLa{&{9spkk|mHXS%NxkxrGeraxy^{Cx+(
z$hvBW#`Zhts1td;TLkX1RsNg`CWT*I)*I@mk(hGjz?_yyPd-M$ua7B`xni7MSs
zWyl`7wxZ%$wBM$)P<1(dhx?%fv+}{l<|u?L$$QGeT(rUdBI{pRI`}U)a7Fj|kXDbn-0nezRe
zyo)Z$_6XM$t$LFEKukadE#*)^!)pQ&8mYw_kMjNgxd*nQI${ow>pAWyg-
zeY~@@vvyi{r6BFgsls*62dIJ6pHyrL+V9bwi^Y3AmVbJ-rj~|0%3qm%C-loCfyP>s
z$_FFfC0=|!1xk(2zP?u6Z!@qsjy-cb?f8VOdA*CL8lOP3H25+rghFiC_`b!X$
z>P+>~7sXasAE_)+#HFn-4WN^kg_vvy*M&~#=`LtcOqgLlop`KKZ3Jgg1KVc>0+
z<&7l#_?5OdUbH9q)rI1J($!(YD81|JB{3iNd4fYKY~4PtyP7Z3#>vE
zLbdE%`v*0S%thI+abigpJ5|RLJpSeZubQp{B{+2vJAP~H{7!6?%0k~ao{0;mLf$)D
z#8FMC2d0|gj@k<#OyA%!|kb9cc7H-YKcGH#V8vGFVvS$6Fd)cwk
ze|9UrRDX1SMKK2wwAIrYlX^SQ^Gd1TmT8TquvY#QgwM?S~ZIw4!K(Esm|Uq
z`*nSS2CYP0d>BeDe@8%d7RbS$9;pJuD7(EMB+vmZ%gK70s>Oyw9^pFXUsEVku?z;W
zEAMEG2zVPq|IQkY!446b#zBa;-T;gc5rz(FDUjCpB9(^S_qi=4Lhcn8?1nXaN5b_i
z6}`_lNEJMtzqR%|caxD_PZ1dBV+a)jInRU=V%d);03w3e2LWwuKn=GMU>w=2+NG32Rn!YaXN2+v~(kl
zo^@2=_Kn$=vP0Dp_k^q>8Q{e{(7kB%9_mJ|<39=H&0Li1=wcrd{hN7URQks)E8JY=
zgM5#Nt%rY;GY*s%)x8WSes6E8vtAxZ0RAS+(V%}@kdj~mJ5{x#s
zou?Fb;e7HrfU;p4?0sY877;Z1H?}-Ygcc31-G9Y(;OpjA_g*cpG4-G%Xr76TlW!S!
zD_4#GiJ9d6}O$qGTb3Ue-WLHSzqdibPv^Qgkk
zF!gSYFP0o14elE{;)bNl2Z@l^s_=eh+ez))ZImHa$*Ti;e$`XkLqg
zSP(U&{f0R?H;DC~3y7mGE!{R^vpI=_uSE%tjI~g#ih?Ln6gz3n`*V}gKOyr)vUi}w

[PATCH 1/2 v2] Btrfs-progs: check, fix false error reports for shared prealloc extents

2018-03-14 Thread fdmanana
From: Filipe Manana 

Under some cases the filesystem checker reports an error when it finds
checksum items for an extent that is referenced by an inode as a prealloc
extent. Such cases are not an error when the extent is actually shared
(was cloned/reflinked) with other inodes and was written through one of
those other inodes.

Example:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ touch /mnt/foo
  $ xfs_io -c "falloc 0 256K" /mnt/foo
  $ sync

  $ xfs_io -c "pwrite -S 0xab 0 256K" /mnt/foo
  $ touch /mnt/bar
  $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
  $ xfs_io -c "fsync" /mnt/bar

  
  $ mount /dev/sdb /mnt
  $ umount /mnt

  $ btrfs check /dev/sdb
  Checking filesystem on /dev/sdb
  UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
  checking extents
  checking free space cache
  checking fs roots
  root 5 inode 257 errors 800, odd csum item
  ERROR: errors found in fs roots
  found 688128 bytes used, error(s) found
  total csum bytes: 256
  total tree bytes: 163840
  total fs tree bytes: 65536
  total extent tree bytes: 16384
  btree space waste bytes: 138819
  file data blocks allocated: 10747904
   referenced 10747904
  $ echo $?
  1

So teach check to not report such cases as errors by checking if the
extent is shared with other inodes and if so, consider it an error the
existence of checksum items only if all those other inodes are referencing
the extent as a prealloc extent.
This case can be hit often when running the generic/475 testcase from
fstests.

A test case will follow in a separate patch.

Signed-off-by: Filipe Manana 
---

V2: Made stuff work with lowmem mode as well.
Added a comment about the limitations of the current check.
Removed filtering by inode number since it was unreliable as we can
have different inodes with same inode number but in different roots
(so opted to drop the filtering instead of filtering by root as well,
to keep it simpler).

 check/main.c|  11 ++-
 check/mode-common.c | 258 
 check/mode-common.h |   3 +
 check/mode-lowmem.c |  14 ++-
 4 files changed, 281 insertions(+), 5 deletions(-)

diff --git a/check/main.c b/check/main.c
index 392195ca..88e6c1e9 100644
--- a/check/main.c
+++ b/check/main.c
@@ -1515,8 +1515,15 @@ static int process_file_extent(struct btrfs_root *root,
if (found < num_bytes)
rec->some_csum_missing = 1;
} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
-   if (found > 0)
-   rec->errors |= I_ERR_ODD_CSUM_ITEM;
+   if (found > 0) {
+   ret = 
check_prealloc_extent_written(root->fs_info,
+   disk_bytenr,
+   num_bytes);
+   if (ret < 0)
+   return ret;
+   if (ret == 0)
+   rec->errors |= I_ERR_ODD_CSUM_ITEM;
+   }
}
}
return 0;
diff --git a/check/mode-common.c b/check/mode-common.c
index 1b56a968..559cd11d 100644
--- a/check/mode-common.c
+++ b/check/mode-common.c
@@ -24,6 +24,264 @@
 #include "check/mode-common.h"
 
 /*
+ * Check if the inode referenced by the given data reference uses the extent
+ * at disk_bytenr as a non-prealloc extent.
+ *
+ * Returns 1 if true, 0 if false and < 0 on error.
+ */
+static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
+  u64 disk_bytenr,
+  struct btrfs_extent_data_ref *dref,
+  struct extent_buffer *eb)
+{
+   u64 rootid = btrfs_extent_data_ref_root(eb, dref);
+   u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
+   u64 offset = btrfs_extent_data_ref_offset(eb, dref);
+   struct btrfs_root *root;
+   struct btrfs_key key;
+   struct btrfs_path path;
+   int ret;
+
+   btrfs_init_path();
+   key.objectid = rootid;
+   key.type = BTRFS_ROOT_ITEM_KEY;
+   key.offset = (u64)-1;
+   root = btrfs_read_fs_root(fs_info, );
+   if (IS_ERR(root)) {
+   ret = PTR_ERR(root);
+   goto out;
+   }
+
+   key.objectid = objectid;
+   key.type = BTRFS_EXTENT_DATA_KEY;
+   key.offset = offset;
+   ret = btrfs_search_slot(NULL, root, , , 0, 0);
+   if (ret > 0) {
+   fprintf(stderr,
+   "Missing file extent item for inode %llu, root %llu, 
offset %llu",
+   objectid, rootid, offset);
+   ret = -ENOENT;
+   }
+   if (ret < 0)
+   goto out;
+
+   while (true) {
+   struct btrfs_file_extent_item *fi;
+   

[PATCH 1/2] Btrfs-progs: check, fix false error reports for shared prealloc extents

2018-03-13 Thread fdmanana
From: Filipe Manana 

Under some cases the filesystem checker reports an error when it finds
checksum items for an extent that is referenced by an inode as a prealloc
extent. Such cases are not an error when the extent is actually shared
(was cloned/reflinked) with other inodes and was written through one of
those other inodes.

Example:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ touch /mnt/foo
  $ xfs_io -c "falloc 0 256K" /mnt/foo
  $ sync

  $ xfs_io -c "pwrite -S 0xab 0 256K" /mnt/foo
  $ touch /mnt/bar
  $ xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
  $ xfs_io -c "fsync" /mnt/bar

  
  $ mount /dev/sdb /mnt
  $ umount /mnt

  $ btrfs check /dev/sdb
  Checking filesystem on /dev/sdb
  UUID: 52d3006e-ee3b-40eb-aa21-e56253a03d39
  checking extents
  checking free space cache
  checking fs roots
  root 5 inode 257 errors 800, odd csum item
  ERROR: errors found in fs roots
  found 688128 bytes used, error(s) found
  total csum bytes: 256
  total tree bytes: 163840
  total fs tree bytes: 65536
  total extent tree bytes: 16384
  btree space waste bytes: 138819
  file data blocks allocated: 10747904
   referenced 10747904
  $ echo $?
  1

So teach check to not report such cases as errors by checking if the
extent is shared with other inodes and if so, consider it an error the
existence of checksum items only if all those other inodes are referencing
the extent as a prealloc extent.
This case can be hit often when running the generic/475 testcase from
fstests.

A test case will follow in a separate patch.

Signed-off-by: Filipe Manana 
---
 check/main.c | 270 ++-
 1 file changed, 268 insertions(+), 2 deletions(-)

diff --git a/check/main.c b/check/main.c
index 392195ca..bb816311 100644
--- a/check/main.c
+++ b/check/main.c
@@ -1424,6 +1424,264 @@ static int process_inode_extref(struct extent_buffer 
*eb,
 
 }
 
+/*
+ * Check if the inode referenced by the given data reference uses the extent
+ * at disk_bytenr as a non-prealloc extent.
+ *
+ * Returns 1 if true, 0 if false and < 0 on error.
+ */
+static int check_prealloc_data_ref(struct btrfs_fs_info *fs_info,
+  u64 ino,
+  u64 disk_bytenr,
+  struct btrfs_extent_data_ref *dref,
+  struct extent_buffer *eb)
+{
+   u64 rootid = btrfs_extent_data_ref_root(eb, dref);
+   u64 objectid = btrfs_extent_data_ref_objectid(eb, dref);
+   u64 offset = btrfs_extent_data_ref_offset(eb, dref);
+   struct btrfs_root *root;
+   struct btrfs_key key;
+   struct btrfs_path path;
+   int ret;
+
+   if (objectid == ino)
+   return 0;
+
+   btrfs_init_path();
+   key.objectid = rootid;
+   key.type = BTRFS_ROOT_ITEM_KEY;
+   key.offset = (u64)-1;
+   root = btrfs_read_fs_root(fs_info, );
+   if (IS_ERR(root)) {
+   ret = PTR_ERR(root);
+   goto out;
+   }
+
+   key.objectid = objectid;
+   key.type = BTRFS_EXTENT_DATA_KEY;
+   key.offset = offset;
+   ret = btrfs_search_slot(NULL, root, , , 0, 0);
+   if (ret > 0) {
+   fprintf(stderr,
+   "Missing file extent item for inode %llu, root %llu, 
offset %llu",
+   objectid, rootid, offset);
+   ret = -ENOENT;
+   }
+   if (ret < 0)
+   goto out;
+
+   while (true) {
+   struct btrfs_file_extent_item *fi;
+   int extent_type;
+
+   if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+   ret = btrfs_next_leaf(fs_info->extent_root, );
+   if (ret < 0)
+   goto out;
+   if (ret > 0)
+   break;
+   }
+
+   btrfs_item_key_to_cpu(path.nodes[0], , path.slots[0]);
+   if (key.objectid != objectid ||
+   key.type != BTRFS_EXTENT_DATA_KEY)
+   break;
+
+   fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+   struct btrfs_file_extent_item);
+   extent_type = btrfs_file_extent_type(path.nodes[0], fi);
+   if (extent_type != BTRFS_FILE_EXTENT_REG &&
+   extent_type != BTRFS_FILE_EXTENT_PREALLOC)
+   goto next;
+
+   if (btrfs_file_extent_disk_bytenr(path.nodes[0], fi) !=
+   disk_bytenr)
+   break;
+
+   if (extent_type == BTRFS_FILE_EXTENT_REG) {
+   ret = 1;
+   goto out;
+   }
+next:
+   path.slots[0]++;
+   }
+   ret = 0;
+ out:
+   btrfs_release_path();
+   return ret;
+}
+
+/*
+ * Check if a shared data reference points to a node that 

[PATCH 2/2] Btrfs-progs: add fsck test for filesystem with shared prealloc extents

2018-03-13 Thread fdmanana
From: Filipe Manana 

Verify that a filesystem check operation (fsck) does not report the
following scenario as an error:

An extent is shared between two inodes, as a result of clone/reflink
operation, and for one of the inodes, lets call it inode A, the extent is
referenced through a file extent item as a prealloc extent, while for the
other inode, call it inode B, the extent is referenced through a regular
file extent item, that is, it was written to. The goal of this test is to
make sure a filesystem check operation will not report "odd csum items"
errors for the prealloc extent at inode A, because this scenario is valid
since the extent was written through inode B and therefore it is expected
to have checksum items in the filesystem's checksum btree for that shared
extent.

Such scenario can be created with the following steps for example:

 mkfs.btrfs -f /dev/sdb
 mount /dev/sdb /mnt

 touch /mnt/foo
 xfs_io -c "falloc 0 256K" /mnt/foo
 sync

 xfs_io -c "pwrite -S 0xab 0 256K" /mnt/foo
 touch /mnt/bar
 xfs_io -c "reflink /mnt/foo 0 0 256K" /mnt/bar
 xfs_io -c "fsync" /mnt/bar

 
 mount /dev/sdb /mnt
 umount /mnt

This scenario is fixed by the following patch for the filesystem checker:

 "Btrfs-progs: check, fix false error reports for shared prealloc extents"

Signed-off-by: Filipe Manana 
---
 .../reflinked-prealloc-extents.img.xz  | Bin 0 -> 3244 bytes
 .../030-reflinked-prealloc-extents/test.sh |  42 +
 2 files changed, 42 insertions(+)
 create mode 100644 
tests/fsck-tests/030-reflinked-prealloc-extents/reflinked-prealloc-extents.img.xz
 create mode 100755 tests/fsck-tests/030-reflinked-prealloc-extents/test.sh

diff --git 
a/tests/fsck-tests/030-reflinked-prealloc-extents/reflinked-prealloc-extents.img.xz
 
b/tests/fsck-tests/030-reflinked-prealloc-extents/reflinked-prealloc-extents.img.xz
new file mode 100644
index 
..8adf0071328806fa6981f6ef225084e517d1bc3e
GIT binary patch
literal 3244
zcmV;d3{&&{H+ooF000E$*0e?f03iV!G>wRyj;C3^v%$$4d1wo3
zjjaF1$3n~+-*XiD+@YGxDtdHe$qq4$Wo>7O_CGnuIn8OLnT=x4IGVZ-w??mVmRMZ7
z5Ay+V;mHSAoDkyM-tpo^bS;x+v}A`sNQY^!9&~{eDz@XZKXO?8OF)IjGm%(
zDUT1JIsHjj>>}y5qdSJT79-V@5Qr*LP$DwA@XNl^>?ShU!y!KD38l^=LE*nzF2}qs
zeB4zmhwleI8r9LN(p#cQeLEqdB4_Wgl7Qx-M>xaL`0TU`9k;qs6+KP*H>U4>=}6F*5_MhCaIzBlfo8uAWSd+hP%WEflaqMvG9Yi+k#=?=X1;)RqEp2fES1y0!CcOxHBNudB4H~
zg^Q=pEd>6f!;l~S^outD;V9&@RLa{&{9spkk|mHXS%NxkxrGeraxy^{Cx+(
z$hvBW#`Zhts1td;TLkX1RsNg`CWT*I)*I@mk(hGjz?_yyPd-M$ua7B`xni7MSs
zWyl`7wxZ%$wBM$)P<1(dhx?%fv+}{l<|u?L$$QGeT(rUdBI{pRI`}U)a7Fj|kXDbn-0nezRe
zyo)Z$_6XM$t$LFEKukadE#*)^!)pQ&8mYw_kMjNgxd*nQI${ow>pAWyg-
zeY~@@vvyi{r6BFgsls*62dIJ6pHyrL+V9bwi^Y3AmVbJ-rj~|0%3qm%C-loCfyP>s
z$_FFfC0=|!1xk(2zP?u6Z!@qsjy-cb?f8VOdA*CL8lOP3H25+rghFiC_`b!X$
z>P+>~7sXasAE_)+#HFn-4WN^kg_vvy*M&~#=`LtcOqgLlop`KKZ3Jgg1KVc>0+
z<&7l#_?5OdUbH9q)rI1J($!(YD81|JB{3iNd4fYKY~4PtyP7Z3#>vE
zLbdE%`v*0S%thI+abigpJ5|RLJpSeZubQp{B{+2vJAP~H{7!6?%0k~ao{0;mLf$)D
z#8FMC2d0|gj@k<#OyA%!|kb9cc7H-YKcGH#V8vGFVvS$6Fd)cwk
ze|9UrRDX1SMKK2wwAIrYlX^SQ^Gd1TmT8TquvY#QgwM?S~ZIw4!K(Esm|Uq
z`*nSS2CYP0d>BeDe@8%d7RbS$9;pJuD7(EMB+vmZ%gK70s>Oyw9^pFXUsEVku?z;W
zEAMEG2zVPq|IQkY!446b#zBa;-T;gc5rz(FDUjCpB9(^S_qi=4Lhcn8?1nXaN5b_i
z6}`_lNEJMtzqR%|caxD_PZ1dBV+a)jInRU=V%d);03w3e2LWwuKn=GMU>w=2+NG32Rn!YaXN2+v~(kl
zo^@2=_Kn$=vP0Dp_k^q>8Q{e{(7kB%9_mJ|<39=H&0Li1=wcrd{hN7URQks)E8JY=
zgM5#Nt%rY;GY*s%)x8WSes6E8vtAxZ0RAS+(V%}@kdj~mJ5{x#s
zou?Fb;e7HrfU;p4?0sY877;Z1H?}-Ygcc31-G9Y(;OpjA_g*cpG4-G%Xr76TlW!S!
zD_4#GiJ9d6}O$qGTb3Ue-WLHSzqdibPv^Qgkk
zF!gSYFP0o14elE{;)bNl2Z@l^s_=eh+ez))ZImHa$*Ti;e$`XkLqg
zSP(U&{f0R?H;DC~3y7mGE!{R^vpI=_uSE%tjI~g#ih?Ln6gz3n`*V}gKOyr)vUi}w
z8AcDrm${MstR=4MxubM%QS6^8F9MtEA!AVMExun)-S{WhkF+ed-#7js)&$Fm9l>Et
z>G~Km`H;$$RF;JKpJ_{*56=Se53(~Yv`{#DSO;w=e6G4}ivFf?PPm}qFweC>2lA

[PATCH] generic: add test for fsync after renaming and linking special file

2018-02-28 Thread fdmanana
From: Filipe Manana 

Test that when a fsync journal/log exists, if we rename a special file
(fifo, symbolic link or device), create a hard link for it with its old
name and then commit the journal/log, if a power loss happens the
filesystem will not fail to replay the journal/log when it is mounted
the next time.

This test is motivated by a bug found in btrfs, which is fixed by the
following patch for the linux kernel:

  "Btrfs: fix log replay failure after linking special file and fsync"

Signed-off-by: Filipe Manana 
---
 tests/generic/479 | 112 ++
 tests/generic/479.out |   2 +
 tests/generic/group   |   1 +
 3 files changed, 115 insertions(+)
 create mode 100644 tests/generic/479
 create mode 100644 tests/generic/479.out

diff --git a/tests/generic/479 b/tests/generic/479
new file mode 100644
index ..7e4ba7d0
--- /dev/null
+++ b/tests/generic/479
@@ -0,0 +1,112 @@
+#! /bin/bash
+# FSQA Test No. 479
+#
+# Test that when a fsync journal/log exists, if we rename a special file (fifo,
+# symbolic link or device), create a hard link for it with its old name and 
then
+# commit the journal/log, if a power loss happens the filesystem will not fail
+# to replay the journal/log when it is mounted the next time.
+#
+#---
+#
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana 
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch
+_require_dm_target flakey
+
+rm -f $seqres.full
+
+run_test()
+{
+   local file_type=$1
+
+   _scratch_mkfs >>$seqres.full 2>&1
+   _require_metadata_journaling $SCRATCH_DEV
+   _init_flakey
+   _mount_flakey
+
+   mkdir $SCRATCH_MNT/testdir
+   case $file_type in
+   symlink)
+   ln -s xxx $SCRATCH_MNT/testdir/foo
+   ;;
+   fifo)
+   mkfifo $SCRATCH_MNT/testdir/foo
+   ;;
+   dev)
+   mknod $SCRATCH_MNT/testdir/foo c 0 0
+   ;;
+   *)
+   _fail "Invalid file type argument: $file_type"
+   esac
+   # Make sure everything done so far is durably persisted.
+   sync
+
+   # Create a file and fsync it just to create a journal/log. This file
+   # must be in the same directory as our special file "foo".
+   touch $SCRATCH_MNT/testdir/f1
+   $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir/f1
+
+   # Rename our special file and then create link that has its old name.
+   mv $SCRATCH_MNT/testdir/foo $SCRATCH_MNT/testdir/bar
+   ln $SCRATCH_MNT/testdir/bar $SCRATCH_MNT/testdir/foo
+
+   # Create a second file and fsync it. This is just to durably persist the
+   # fsync journal/log which is typically modified by the previous rename
+   # and link operations. This file does not need to be placed in the same
+   # directory as our special file.
+   touch $SCRATCH_MNT/f2
+   $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/f2
+
+   # Simulate a power failure and mount the filesystem to check that
+   # replaying the fsync log/journal succeeds, that is the mount operation
+   # does not fail.
+   _flakey_drop_and_remount
+   _unmount_flakey
+   _cleanup_flakey
+}
+
+run_test symlink
+run_test fifo
+run_test dev
+
+echo "Silence is golden"
+status=0
+exit
diff --git a/tests/generic/479.out b/tests/generic/479.out
new file mode 100644
index ..290f18b3
--- /dev/null
+++ b/tests/generic/479.out
@@ -0,0 +1,2 @@
+QA output created by 479
+Silence is golden
diff --git a/tests/generic/group b/tests/generic/group
index 1e808865..3b9b47e3 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -481,3 +481,4 @@
 476 auto rw
 477 auto quick exportfs
 478 auto quick
+479 auto quick metadata

[PATCH] generic: test fsync new file after removing hard link

2018-02-28 Thread fdmanana
From: Filipe Manana 

Test that if we have a file with two hard links in the same parent
directory, then remove one of the links, create a new file in the same
parent directory and with the name of the link removed, fsync the new
file and have a power loss, mounting the filesystem succeeds.

This test is motivated by a bug found in btrfs, which is fixed by
the linux kernel patch titled:

  "Btrfs: fix log replay failure after unlink and link combination"

Signed-off-by: Filipe Manana 
---
 tests/generic/480 | 83 +++
 tests/generic/480.out |  2 ++
 tests/generic/group   |  1 +
 3 files changed, 86 insertions(+)
 create mode 100755 tests/generic/480
 create mode 100644 tests/generic/480.out

diff --git a/tests/generic/480 b/tests/generic/480
new file mode 100755
index ..a287684b
--- /dev/null
+++ b/tests/generic/480
@@ -0,0 +1,83 @@
+#! /bin/bash
+# FSQA Test No. 480
+#
+# Test that if we have a file with two hard links in the same parent directory,
+# then remove one of the links, create a new file in the same parent directory and
+# with the name of the link removed, fsync the new file and have a power loss,
+# mounting the filesystem succeeds.
+#
+#---
+#
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana 
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch
+_require_dm_target flakey
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+mkdir $SCRATCH_MNT/testdir
+touch $SCRATCH_MNT/testdir/foo
+ln $SCRATCH_MNT/testdir/foo $SCRATCH_MNT/testdir/bar
+
+# Make sure everything done so far is durably persisted.
+sync
+
+# Now remove one of the links of our file and create a new file with the same name
+# and in the same parent directory, and finally fsync this new file.
+unlink $SCRATCH_MNT/testdir/bar
+touch $SCRATCH_MNT/testdir/bar
+$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir/bar
+
+# Simulate a power failure and mount the filesystem to check that replaying
+# the fsync log/journal succeeds, that is the mount operation does not fail.
+_flakey_drop_and_remount
+
+_unmount_flakey
+_cleanup_flakey
+
+echo "Silence is golden"
+status=0
+exit
diff --git a/tests/generic/480.out b/tests/generic/480.out
new file mode 100644
index ..a40a718e
--- /dev/null
+++ b/tests/generic/480.out
@@ -0,0 +1,2 @@
+QA output created by 480
+Silence is golden
diff --git a/tests/generic/group b/tests/generic/group
index 3b9b47e3..ea2056b1 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -482,3 +482,4 @@
 477 auto quick exportfs
 478 auto quick
 479 auto quick metadata
+480 auto quick metadata
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] Btrfs: fix log replay failure after linking special file and fsync

2018-02-28 Thread fdmanana
From: Filipe Manana 

If in the same transaction we rename a special file (fifo, character/block
device or symbolic link), create a hard link for it having its old name
then sync the log, we will end up with a log that can not be replayed and,
when attempting to replay it, an EEXIST error is returned and mounting
the filesystem fails. Example scenario:

  $ mkfs.btrfs -f /dev/sdc
  $ mount /dev/sdc /mnt
  $ mkdir /mnt/testdir
  $ mkfifo /mnt/testdir/foo
  # Make sure everything done so far is durably persisted.
  $ sync

  # Create some unrelated file and fsync it, this is just to create a log
  # tree. The file must be in the same directory as our special file.
  $ touch /mnt/testdir/f1
  $ xfs_io -c "fsync" /mnt/testdir/f1

  # Rename our special file and then create a hard link with its old name.
  $ mv /mnt/testdir/foo /mnt/testdir/bar
  $ ln /mnt/testdir/bar /mnt/testdir/foo

  # Create some other unrelated file and fsync it, this is just to persist
  # the log tree which was modified by the previous rename and link
  # operations. Alternatively we could have modified file f1 and fsync it.
  $ touch /mnt/f2
  $ xfs_io -c "fsync" /mnt/f2

  

  $ mount /dev/sdc /mnt
  mount: mount /dev/sdc on /mnt failed: File exists

This happens because when both the log tree and the subvolume's tree have
an entry in the directory "testdir" with the same name, that is, there
is one key (258 INODE_REF 257) in the subvolume tree and another one in
the log tree (where 258 is the inode number of our special file and 257
is the inode for directory "testdir"). Only the data of those two keys
differs: in the subvolume tree the index field of the inode reference has
a value of 3 while in the log tree it has a value of 5. Because the same
key exists in both trees but with a different index, the log replay fails with
an -EEXIST error when attempting to replay the inode reference from the
log tree.

Fix this by setting the last_unlink_trans field of the inode (our special
file) to the current transaction id when a hard link is created, as this
forces logging the parent directory inode, solving the conflict at log
replay time.

A new generic test case for fstests was also submitted.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/tree-log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 28d0de199b05..411a022489e4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5841,7 +5841,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
 * this will force the logging code to walk the dentry chain
 * up for the file
 */
-   if (S_ISREG(inode->vfs_inode.i_mode))
+   if (!S_ISDIR(inode->vfs_inode.i_mode))
inode->last_unlink_trans = trans->transid;
 
/*
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: fix log replay failure after unlink and link combination

2018-02-28 Thread fdmanana
From: Filipe Manana 

If we have a file with 2 (or more) hard links in the same directory,
remove one of the hard links, create a new file (or link an existing file)
in the same directory with the name of the removed hard link, and then
finally fsync the new file, we end up with a log that fails to replay,
causing a mount failure.

Example:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ mkdir /mnt/testdir
  $ touch /mnt/testdir/foo
  $ ln /mnt/testdir/foo /mnt/testdir/bar

  $ sync

  $ unlink /mnt/testdir/bar
  $ touch /mnt/testdir/bar
  $ xfs_io -c "fsync" /mnt/testdir/bar

  

  $ mount /dev/sdb /mnt
  mount: mount(2) failed: /mnt: No such file or directory

When replaying the log, for that example, we also see the following in
dmesg/syslog:

  [71813.671307] BTRFS info (device dm-0): failed to delete reference to bar, 
inode 258 parent 257
  [71813.674204] [ cut here ]
  [71813.675694] BTRFS: Transaction aborted (error -2)
  [71813.677236] WARNING: CPU: 1 PID: 13231 at fs/btrfs/inode.c:4128 
__btrfs_unlink_inode+0x17b/0x355 [btrfs]
  [71813.679669] Modules linked in: btrfs xfs f2fs dm_flakey dm_mod dax 
ghash_clmulni_intel ppdev pcbc aesni_intel aes_x86_64 crypto_simd cryptd 
glue_helper evdev psmouse i2c_piix4 parport_pc i2c_core pcspkr sg serio_raw 
parport button sunrpc loop autofs4 ext4 crc16 mbcache jbd2 zstd_decompress 
zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq 
async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath 
linear md_mod ata_generic sd_mod virtio_scsi ata_piix libata virtio_pci 
virtio_ring crc32c_intel floppy virtio e1000 scsi_mod [last unloaded: btrfs]
  [71813.679669] CPU: 1 PID: 13231 Comm: mount Tainted: GW
4.15.0-rc9-btrfs-next-56+ #1
  [71813.679669] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
  [71813.679669] RIP: 0010:__btrfs_unlink_inode+0x17b/0x355 [btrfs]
  [71813.679669] RSP: 0018:c90001cef738 EFLAGS: 00010286
  [71813.679669] RAX: 0025 RBX: 880217ce4708 RCX: 
0001
  [71813.679669] RDX:  RSI: 81c14bae RDI: 

  [71813.679669] RBP: c90001cef7c0 R08: 0001 R09: 
0001
  [71813.679669] R10: c90001cef5e0 R11: 8343f007 R12: 
880217d474c8
  [71813.679669] R13: fffe R14: 88021ccf1548 R15: 
0101
  [71813.679669] FS:  7f7cee84c480() GS:88023fc8() 
knlGS:
  [71813.679669] CS:  0010 DS:  ES:  CR0: 80050033
  [71813.679669] CR2: 7f7cedc1abf9 CR3: 0002354b4003 CR4: 
001606e0
  [71813.679669] Call Trace:
  [71813.679669]  btrfs_unlink_inode+0x17/0x41 [btrfs]
  [71813.679669]  drop_one_dir_item+0xfa/0x131 [btrfs]
  [71813.679669]  add_inode_ref+0x71e/0x851 [btrfs]
  [71813.679669]  ? __lock_is_held+0x39/0x71
  [71813.679669]  ? replay_one_buffer+0x53/0x53a [btrfs]
  [71813.679669]  replay_one_buffer+0x4a4/0x53a [btrfs]
  [71813.679669]  ? rcu_read_unlock+0x3a/0x57
  [71813.679669]  ? __lock_is_held+0x39/0x71
  [71813.679669]  walk_up_log_tree+0x101/0x1d2 [btrfs]
  [71813.679669]  walk_log_tree+0xad/0x188 [btrfs]
  [71813.679669]  btrfs_recover_log_trees+0x1fa/0x31e [btrfs]
  [71813.679669]  ? replay_one_extent+0x544/0x544 [btrfs]
  [71813.679669]  open_ctree+0x1cf6/0x2209 [btrfs]
  [71813.679669]  btrfs_mount_root+0x368/0x482 [btrfs]
  [71813.679669]  ? trace_hardirqs_on_caller+0x14c/0x1a6
  [71813.679669]  ? __lockdep_init_map+0x176/0x1c2
  [71813.679669]  ? mount_fs+0x64/0x10b
  [71813.679669]  mount_fs+0x64/0x10b
  [71813.679669]  vfs_kern_mount+0x68/0xce
  [71813.679669]  btrfs_mount+0x13e/0x772 [btrfs]
  [71813.679669]  ? trace_hardirqs_on_caller+0x14c/0x1a6
  [71813.679669]  ? __lockdep_init_map+0x176/0x1c2
  [71813.679669]  ? mount_fs+0x64/0x10b
  [71813.679669]  mount_fs+0x64/0x10b
  [71813.679669]  vfs_kern_mount+0x68/0xce
  [71813.679669]  do_mount+0x6e5/0x973
  [71813.679669]  ? memdup_user+0x3e/0x5c
  [71813.679669]  SyS_mount+0x72/0x98
  [71813.679669]  entry_SYSCALL_64_fastpath+0x1e/0x8b
  [71813.679669] RIP: 0033:0x7f7cedf150ba
  [71813.679669] RSP: 002b:7ffca71da688 EFLAGS: 0206
  [71813.679669] Code: 7f a0 e8 51 0c fd ff 48 8b 43 50 f0 0f ba a8 30 2c 00 00 
02 72 17 41 83 fd fb 74 11 44 89 ee 48 c7 c7 7d 11 7f a0 e8 38 f5 8d e0 <0f> ff 
44 89 e9 ba 20 10 00 00 eb 4d 48 8b 4d b0 48 8b 75 88 4c
  [71813.679669] ---[ end trace 83bd473fc5b4663b ]---
  [71813.854764] BTRFS: error (device dm-0) in __btrfs_unlink_inode:4128: 
errno=-2 No such entry
  [71813.886994] BTRFS: error (device dm-0) in btrfs_replay_log:2307: errno=-2 
No such entry (Failed to recover log tree)
  [71813.903357] BTRFS error (device dm-0): cleaner transaction attach returned 
-30
  [71814.128078] BTRFS error (device dm-0): open_ctree failed

This happens because the log has inode reference 

[PATCH] Btrfs: send, do not issue unnecessary truncate operations

2018-02-07 Thread fdmanana
From: Filipe Manana 

When send finishes processing an inode representing a regular file, it
always issues a truncate operation for that file, even if its size did
not change or the last write sets the file size correctly. In the most
common cases, the issued write operations set the file to correct size
(either full or incremental sends) or the file size did not change (for
incremental sends), so the only case where a truncate operation is needed
is when a file size becomes smaller in the send snapshot when compared
to the parent snapshot.

By not issuing unnecessary truncate operations we reduce the stream size
and save time in the receiver. Currently truncating a file to the same
size triggers writeback of its last page (if it's dirty) and waits for it
to complete (only if the file size is not aligned with the filesystem's
sector size). This is being fixed by another patch and is independent of
this change (that patch's title is "Btrfs: skip writeback of last page
when truncating file to same size").

The following script was used to measure time spent by a receiver without
this change applied, with this change applied, and without this change and
with the truncate fix applied (the fix to not make it start and wait for
writeback to complete).

  $ cat test_send.sh
  #!/bin/bash

  SRC_DEV=/dev/sdc
  DST_DEV=/dev/sdd
  SRC_MNT=/mnt/sdc
  DST_MNT=/mnt/sdd

  mkfs.btrfs -f $SRC_DEV >/dev/null
  mkfs.btrfs -f $DST_DEV >/dev/null
  mount $SRC_DEV $SRC_MNT
  mount $DST_DEV $DST_MNT

  echo "Creating source filesystem"
  for ((t = 0; t < 10; t++)); do
  (
  for ((i = 1; i <= 2; i++)); do
  xfs_io -f -c "pwrite -S 0xab 0 5000" \
  $SRC_MNT/file_$i > /dev/null
  done
  ) &
 worker_pids[$t]=$!
  done
  wait ${worker_pids[@]}

  echo "Creating and sending snapshot"
  btrfs subvolume snapshot -r $SRC_MNT $SRC_MNT/snap1 >/dev/null
  /usr/bin/time -f "send took %e seconds"\
 btrfs send -f $SRC_MNT/send_file $SRC_MNT/snap1
  /usr/bin/time -f "receive took %e seconds" \
 btrfs receive -f $SRC_MNT/send_file $DST_MNT

  umount $SRC_MNT
  umount $DST_MNT

The results, which are averages for 5 runs for each case, were the
following:

* Without this change

average receive time was 26.49 seconds
standard deviation of 2.53 seconds

* Without this change and with the truncate fix

average receive time was 12.51 seconds
standard deviation of 0.32 seconds

* With this change and without the truncate fix

average receive time was 10.02 seconds
standard deviation of 1.11 seconds

Signed-off-by: Filipe Manana 
---
 fs/btrfs/send.c | 26 +-
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 484e2af793de..5df50d67d319 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -112,6 +112,7 @@ struct send_ctx {
u64 cur_inode_mode;
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
+   u64 cur_inode_next_write_offset;
 
u64 send_progress;
 
@@ -5029,6 +5030,7 @@ static int send_hole(struct send_ctx *sctx, u64 end)
break;
offset += len;
}
+   sctx->cur_inode_next_write_offset = offset;
 tlv_put_failure:
fs_path_free(p);
return ret;
@@ -5264,6 +5266,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
} else {
ret = send_extent_data(sctx, offset, len);
}
+   sctx->cur_inode_next_write_offset = offset + len;
 out:
return ret;
 }
@@ -5788,6 +5791,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, 
int at_end)
u64 right_gid;
int need_chmod = 0;
int need_chown = 0;
+   int need_truncate = 1;
int pending_move = 0;
int refs_processed = 0;
 
@@ -5825,9 +5829,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, 
int at_end)
need_chown = 1;
if (!S_ISLNK(sctx->cur_inode_mode))
need_chmod = 1;
+   if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
+   need_truncate = 0;
} else {
+   u64 old_size;
+
ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
-   NULL, NULL, &right_mode, &right_uid,
+   &old_size, NULL, &right_mode, &right_uid,
&right_gid, NULL);
if (ret < 0)
goto out;
@@ -5836,6 +5844,10 @@ static int finish_inode_if_needed(struct send_ctx *sctx, 
int at_end)
need_chown = 1;
if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
need_chmod = 1;
+   if ((old_size == sctx->cur_inode_size) ||
+   (sctx->cur_inode_size > old_size &&
+sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
+   

[PATCH] Btrfs: skip writeback of last page when truncating file to same size

2018-02-07 Thread fdmanana
From: Filipe Manana 

When we truncate a file to the same size and that size is not aligned
with the sector size, we end up triggering writeback (and wait for it to
complete) of the last page. This is unnecessary as we can not have delayed
allocation beyond the inode's i_size and the goal of truncating a file
to its own size is to discard prealloc extents (allocated via the
fallocate(2) system call). Besides the unnecessary IO start and wait, it
also breaks the opportunity for larger contiguous extents on disk, as
before the last dirty page there might be other dirty pages.

This scenario is probably not very common in general, however it is
common for btrfs receive implementations because currently the send
stream always issues a truncate operation for each processed inode as
the last operation for that inode (this truncate operation is not
always needed and the send implementation will be addressed to avoid
them).

So improve this by not starting and waiting for writeback of the inode's
last page when we are truncating to exactly the same size.

The following script was used to quickly measure the time a receive
operation takes:

 $ cat test_send.sh
 #!/bin/bash

 SRC_DEV=/dev/sdc
 DST_DEV=/dev/sdd
 SRC_MNT=/mnt/sdc
 DST_MNT=/mnt/sdd

 mkfs.btrfs -f $SRC_DEV >/dev/null
 mkfs.btrfs -f $DST_DEV >/dev/null
 mount $SRC_DEV $SRC_MNT
 mount $DST_DEV $DST_MNT

 echo "Creating source filesystem"
 for ((t = 0; t < 10; t++)); do
 (
 for ((i = 1; i <= 2; i++)); do
 xfs_io -f -c "pwrite -S 0xab 0 5000" \
$SRC_MNT/file_$i > /dev/null
 done
 ) &
 worker_pids[$t]=$!
 done
 wait ${worker_pids[@]}

 echo "Creating and sending snapshot"
 btrfs subvolume snapshot -r $SRC_MNT $SRC_MNT/snap1 >/dev/null
 /usr/bin/time -f "send took %e seconds"\
 btrfs send -f $SRC_MNT/send_file $SRC_MNT/snap1
 /usr/bin/time -f "receive took %e seconds" \
 btrfs receive -f $SRC_MNT/send_file $DST_MNT

 umount $SRC_MNT
 umount $DST_MNT

The results for 5 runs were the following:

* Without this change

average receive time was 26.49 seconds
standard deviation of 2.53 seconds

* With this change

average receive time was 12.51 seconds
standard deviation of 0.32 seconds

Reported-by: Robbie Ko 
Signed-off-by: Filipe Manana 
---
 fs/btrfs/inode.c | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2a19413a7868..dae631ab5cb2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -101,7 +101,7 @@ static const unsigned char btrfs_type_by_mode[S_IFMT >> 
S_SHIFT] = {
 };
 
 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
-static int btrfs_truncate(struct inode *inode);
+static int btrfs_truncate(struct inode *inode, bool skip_writeback);
 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent 
*ordered_extent);
 static noinline int cow_file_range(struct inode *inode,
   struct page *locked_page,
@@ -3625,7 +3625,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
goto out;
}
 
-   ret = btrfs_truncate(inode);
+   ret = btrfs_truncate(inode, false);
if (ret)
btrfs_orphan_del(NULL, BTRFS_I(inode));
} else {
@@ -5109,7 +5109,7 @@ static int btrfs_setsize(struct inode *inode, struct 
iattr *attr)
inode_dio_wait(inode);
btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
 
-   ret = btrfs_truncate(inode);
+   ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
int err;
 
@@ -9087,7 +9087,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
return ret;
 }
 
-static int btrfs_truncate(struct inode *inode)
+static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 {
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -9098,10 +9098,12 @@ static int btrfs_truncate(struct inode *inode)
u64 mask = fs_info->sectorsize - 1;
u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
 
-   ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
-  (u64)-1);
-   if (ret)
-   return ret;
+   if (!skip_writeback) {
+   ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+  (u64)-1);
+   if (ret)
+   return ret;
+   }
 
/*
 * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More 

[PATCH] Btrfs: send, fix issuing write op when processing hole in no data mode

2018-02-07 Thread fdmanana
From: Filipe Manana 

When doing an incremental send of a filesystem with the no-holes feature
enabled, we end up issuing a write operation when using the no data mode
send flag, instead of issuing an update extent operation. Fix this by
issuing the update extent operation instead.

Trivial reproducer:

  $ mkfs.btrfs -f -O no-holes /dev/sdc
  $ mkfs.btrfs -f /dev/sdd
  $ mount /dev/sdc /mnt/sdc
  $ mount /dev/sdd /mnt/sdd

  $ xfs_io -f -c "pwrite -S 0xab 0 32K" /mnt/sdc/foobar
  $ btrfs subvolume snapshot -r /mnt/sdc /mnt/sdc/snap1

  $ xfs_io -c "fpunch 8K 8K" /mnt/sdc/foobar
  $ btrfs subvolume snapshot -r /mnt/sdc /mnt/sdc/snap2

  $ btrfs send /mnt/sdc/snap1 | btrfs receive /mnt/sdd
  $ btrfs send --no-data -p /mnt/sdc/snap1 /mnt/sdc/snap2 \
   | btrfs receive -vv /mnt/sdd

Before this change the output of the second receive command is:

  receiving snapshot snap2 uuid=f6922049-8c22-e544-9ff9-fc6755918447...
  utimes
  write foobar, offset 8192, len 8192
  utimes foobar
  BTRFS_IOC_SET_RECEIVED_SUBVOL uuid=f6922049-8c22-e544-9ff9-...

After this change it is:

  receiving snapshot snap2 uuid=564d36a3-ebc8-7343-aec9-bf6fda278e64...
  utimes
  update_extent foobar: offset=8192, len=8192
  utimes foobar
  BTRFS_IOC_SET_RECEIVED_SUBVOL uuid=564d36a3-ebc8-7343-aec9-bf6fda278e64...

Signed-off-by: Filipe Manana 
---
 fs/btrfs/send.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f306c608dc28..484e2af793de 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5005,6 +5005,9 @@ static int send_hole(struct send_ctx *sctx, u64 end)
u64 len;
int ret = 0;
 
+   if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
+   return send_update_extent(sctx, offset, end - offset);
+
p = fs_path_alloc();
if (!p)
return -ENOMEM;
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: fix null pointer dereference when replacing missing device

2018-01-30 Thread fdmanana
From: Filipe Manana 

When we are replacing a missing device we mount the filesystem with the
degraded mode option in which case we are allowed to have a btrfs device
structure without a backing device member (its bdev member is NULL) and
therefore we can't dereference that member. Commit 38b5f68e9811
("btrfs: drop btrfs_device::can_discard to query directly") started to
dereference that member when discarding extents, resulting in a null
pointer dereference:

 [ 3145.322257] BTRFS warning (device sdf): devid 2 uuid 
4d922414-58eb-4880-8fed-9c3840f6c5d5 is missing
 [ 3145.364116] BTRFS info (device sdf): dev_replace from <missing disk> (devid 
2) to /dev/sdg started
 [ 3145.413489] BUG: unable to handle kernel NULL pointer dereference at 
00e0
 [ 3145.415085] IP: btrfs_discard_extent+0x6a/0xf8 [btrfs]
 [ 3145.415085] PGD 0 P4D 0
 [ 3145.415085] Oops:  [#1] PREEMPT SMP PTI
 [ 3145.415085] Modules linked in: ppdev ghash_clmulni_intel pcbc aesni_intel 
aes_x86_64 crypto_simd cryptd glue_helper evdev psmouse parport_pc serio_raw 
i2c_piix4 i2
 [ 3145.415085] CPU: 0 PID: 11989 Comm: btrfs Tainted: GW
4.15.0-rc9-btrfs-next-55+ #1
 [ 3145.415085] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
 [ 3145.415085] RIP: 0010:btrfs_discard_extent+0x6a/0xf8 [btrfs]
 [ 3145.415085] RSP: 0018:c90004813c60 EFLAGS: 00010293
 [ 3145.415085] RAX: 88020d39cc00 RBX: 88020c4ea2a0 RCX: 
0002
 [ 3145.415085] RDX:  RSI: 88020c4ea240 RDI: 

 [ 3145.415085] RBP:  R08: 4000 R09: 

 [ 3145.415085] R10: c90004813ae8 R11:  R12: 

 [ 3145.415085] R13: 88020c418000 R14:  R15: 

 [ 3145.415085] FS:  7f565681f8c0() GS:88023fc0() 
knlGS:
 [ 3145.415085] CS:  0010 DS:  ES:  CR0: 80050033
 [ 3145.415085] CR2: 00e0 CR3: 00020d208006 CR4: 
001606f0
 [ 3145.415085] Call Trace:
 [ 3145.415085]  btrfs_finish_extent_commit+0x9a/0x1be [btrfs]
 [ 3145.415085]  btrfs_commit_transaction+0x649/0x7a0 [btrfs]
 [ 3145.415085]  ? start_transaction+0x2b0/0x3b3 [btrfs]
 [ 3145.415085]  btrfs_dev_replace_start+0x274/0x30c [btrfs]
 [ 3145.415085]  btrfs_dev_replace_by_ioctl+0x45/0x59 [btrfs]
 [ 3145.415085]  btrfs_ioctl+0x1a91/0x1d62 [btrfs]
 [ 3145.415085]  ? lock_acquire+0x16a/0x1af
 [ 3145.415085]  ? vfs_ioctl+0x1b/0x28
 [ 3145.415085]  ? trace_hardirqs_on_caller+0x14c/0x1a6
 [ 3145.415085]  vfs_ioctl+0x1b/0x28
 [ 3145.415085]  do_vfs_ioctl+0x5a9/0x5e0
 [ 3145.415085]  ? _raw_spin_unlock_irq+0x34/0x46
 [ 3145.415085]  ? entry_SYSCALL_64_fastpath+0x5/0x8b
 [ 3145.415085]  ? trace_hardirqs_on_caller+0x14c/0x1a6
 [ 3145.415085]  SyS_ioctl+0x52/0x76
 [ 3145.415085]  entry_SYSCALL_64_fastpath+0x1e/0x8b
 [ 3145.415085] RIP: 0033:0x7f56558b3c47
 [ 3145.415085] RSP: 002b:7ffdcfac4c58 EFLAGS: 0202
 [ 3145.415085] Code: be 02 00 00 00 4c 89 ef e8 b9 e7 03 00 85 c0 89 c5 75 75 
48 8b 44 24 08 45 31 f6 48 8d 58 60 eb 52 48 8b 03 48 8b b8 a0 00 00 00 <48> 8b 
87 e0 00
 [ 3145.415085] RIP: btrfs_discard_extent+0x6a/0xf8 [btrfs] RSP: 
c90004813c60
 [ 3145.415085] CR2: 00e0
 [ 3145.458185] ---[ end trace 06302e7ac31902bf ]---

This is trivially reproduced by running the test btrfs/027 from fstests
like this:

  $ MOUNT_OPTIONS="-o discard" ./check btrfs/027

Fix this by skipping devices without a backing device before attempting
to discard.

Fixes: 38b5f68e9811 ("btrfs: drop btrfs_device::can_discard to query directly")
Signed-off-by: Filipe Manana 
---
 fs/btrfs/extent-tree.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9d220b276c8f..d59ee24645e3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2147,6 +2147,10 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, 
u64 bytenr,
u64 bytes;
struct request_queue *req_q;
 
+   if (!stripe->dev->bdev) {
+   ASSERT(btrfs_test_opt(fs_info, DEGRADED));
+   continue;
+   }
req_q = bdev_get_queue(stripe->dev->bdev);
if (!blk_queue_discard(req_q))
continue;
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] Btrfs: fix missing inode i_size update after zero range operation

2018-01-18 Thread fdmanana
From: Filipe Manana 

For a fallocate's zero range operation that targets a range with an end
that is not aligned to the sector size, we can end up not updating the
inode's i_size. This happens when the last page of the range maps to an
unwritten (prealloc) extent and before that last page we have either a
hole or a written extent. This is because in this scenario we relied
on a call to btrfs_prealloc_file_range() to update the inode's i_size,
however it can only update the i_size to the "down aligned" end of the
range.

Example:

 $ mkfs.btrfs -f /dev/sdc
 $ mount /dev/sdc /mnt
 $ xfs_io -f -c "pwrite -S 0xff 0 428K" /mnt/foobar
 $ xfs_io -c "falloc -k 428K 4K" /mnt/foobar
 $ xfs_io -c "fzero 0 430K" /mnt/foobar
 $ du --bytes /mnt/foobar
 438272 /mnt/foobar

The inode's i_size was left as 428Kb (438272 bytes) when it should have
been updated to 430Kb (440320 bytes).
Fix this by always updating the inode's i_size explicitly after zeroing
the range.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/file.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dc95d9590d2d..9ad0465d2e8e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3026,9 +3026,12 @@ static int btrfs_zero_range(struct inode *inode,
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
 lockend, &cached_state, GFP_KERNEL);
/* btrfs_prealloc_file_range releases reserved space on error */
-   if (ret)
+   if (ret) {
space_reserved = false;
+   goto out;
+   }
}
+   ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
  out:
if (ret && space_reserved)
btrfs_free_reserved_data_space(inode, data_reserved,
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] Btrfs: fix space leak after fallocate and zero range operations

2018-01-18 Thread fdmanana
From: Filipe Manana 

If we do a buffered write after a zero range operation that has an
unaligned (with the filesystem's sector size) end which also falls within
an unwritten (prealloc) extent that is currently beyond the inode's
i_size, and the zero range operation has the flag FALLOC_FL_KEEP_SIZE,
we end up leaking data and metadata space. This happens because when
zeroing a range we call btrfs_truncate_block(), which does delalloc
(loads the page and partially zeroes its content), and in the buffered
write path we only clear existing delalloc space reservation for the
range we are writing into if that range starts at an offset smaller than
the inode's i_size, which makes sense since we can not have delalloc
extents beyond the i_size, only unwritten extents are allowed.

Example reproducer:

 $ mkfs.btrfs -f /dev/sdb
 $ mount /dev/sdb /mnt
 $ xfs_io -f -c "falloc -k 428K 4K" /mnt/foobar
 $ xfs_io -c "fzero -k 0 430K" /mnt/foobar
 $ xfs_io -c "pwrite -S 0xaa 428K 4K" /mnt/foobar
 $ umount /mnt

After the unmount we get the metadata and data space leaks reported in
dmesg/syslog:

 [95794.602253] [ cut here ]
 [95794.603322] WARNING: CPU: 0 PID: 31496 at fs/btrfs/inode.c:9561 
btrfs_destroy_inode+0x4e/0x206 [btrfs]
 [95794.605167] Modules linked in: btrfs xfs ppdev ghash_clmulni_intel pcbc 
aesni_intel aes_x86_64 crypto_simd cryptd glue_helper parport_pc psmouse sg 
i2c_piix4 parport i2c_core evdev pcspkr button serio_raw sunrpc loop autofs4 
ext4 crc16 mbcache jbd2 zstd_decompress zstd_compress xxhash raid10 raid456 
async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq 
libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod sd_mod virtio_scsi 
ata_generic crc32c_intel ata_piix floppy virtio_pci virtio_ring virtio libata 
scsi_mod e1000 [last unloaded: btrfs]
 [95794.613000] CPU: 0 PID: 31496 Comm: umount Tainted: GW   
4.14.0-rc6-btrfs-next-54+ #1
 [95794.614448] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
 [95794.615972] task: 880075aa0240 task.stack: c90001734000
 [95794.617114] RIP: 0010:btrfs_destroy_inode+0x4e/0x206 [btrfs]
 [95794.618001] RSP: 0018:c90001737d00 EFLAGS: 00010202
 [95794.618721] RAX:  RBX: 880070fa1418 RCX: 
c90001737c7c
 [95794.619645] RDX: 000175aa0240 RSI: 0001 RDI: 
880070fa1418
 [95794.620711] RBP: c90001737d38 R08:  R09: 

 [95794.621932] R10: c90001737c48 R11: 88007123e158 R12: 
880075b6a000
 [95794.623124] R13: 88006145c000 R14: 880070fa1418 R15: 
880070c3b4a0
 [95794.624188] FS:  7fa6793c92c0() GS:88023fc0() 
knlGS:
 [95794.625578] CS:  0010 DS:  ES:  CR0: 80050033
 [95794.626522] CR2: 56338670d048 CR3: 610dc005 CR4: 
001606f0
 [95794.627647] Call Trace:
 [95794.628128]  destroy_inode+0x3d/0x55
 [95794.628573]  evict+0x177/0x17e
 [95794.629010]  dispose_list+0x50/0x71
 [95794.629478]  evict_inodes+0x132/0x141
 [95794.630289]  generic_shutdown_super+0x3f/0x10b
 [95794.630864]  kill_anon_super+0x12/0x1c
 [95794.631383]  btrfs_kill_super+0x16/0x21 [btrfs]
 [95794.631930]  deactivate_locked_super+0x30/0x68
 [95794.632539]  deactivate_super+0x36/0x39
 [95794.633200]  cleanup_mnt+0x49/0x67
 [95794.633818]  __cleanup_mnt+0x12/0x14
 [95794.634416]  task_work_run+0x82/0xa6
 [95794.634902]  prepare_exit_to_usermode+0xe1/0x10c
 [95794.635525]  syscall_return_slowpath+0x18c/0x1af
 [95794.636122]  entry_SYSCALL_64_fastpath+0xab/0xad
 [95794.636834] RIP: 0033:0x7fa678cb99a7
 [95794.637370] RSP: 002b:7ffccf0aaed8 EFLAGS: 0246 ORIG_RAX: 
00a6
 [95794.638672] RAX:  RBX: 563386706030 RCX: 
7fa678cb99a7
 [95794.639596] RDX: 0001 RSI:  RDI: 
56338670ca90
 [95794.640703] RBP: 56338670ca90 R08: 56338670c740 R09: 
0015
 [95794.641773] R10: 06b4 R11: 0246 R12: 
7fa6791bae64
 [95794.643150] R13:  R14: 563386706210 R15: 
7ffccf0ab160
 [95794.644249] Code: ff 4c 8b a8 80 06 00 00 48 8b 87 c0 01 00 00 48 85 c0 74 
02 0f ff 48 83 bb e0 02 00 00 00 74 02 0f ff 83 bb 3c ff ff ff 00 74 02 <0f> ff 
83 bb 40 ff ff ff 00 74 02 0f ff 48 83 bb f8 fe ff ff 00
 [95794.646929] ---[ end trace e95877675c6ec007 ]---
 [95794.647751] [ cut here ]
 [95794.648509] WARNING: CPU: 0 PID: 31496 at fs/btrfs/inode.c:9562 
btrfs_destroy_inode+0x59/0x206 [btrfs]
 [95794.649842] Modules linked in: btrfs xfs ppdev ghash_clmulni_intel pcbc 
aesni_intel aes_x86_64 crypto_simd cryptd glue_helper parport_pc psmouse sg 
i2c_piix4 parport i2c_core evdev pcspkr button serio_raw sunrpc loop autofs4 
ext4 crc16 mbcache jbd2 zstd_decompress zstd_compress xxhash raid10 raid456 
async_raid6_recov async_memcpy async_pq async_xor async_tx 

[PATCH] btrfs: test send for files with multiple hard links renamed

2017-11-24 Thread fdmanana
From: Filipe Manana 

Test that an incremental send operation works if a file that has multiple
hard links has some of its hard links renamed in the send snapshot, with
one of them getting the same path that some other inode had in the send
snapshot.

At the moment this test fails on btrfs and a fix is provided by a linux
kernel patch titled:

  "Btrfs: incremental send, fix wrong unlink path after renaming file"

Signed-off-by: Filipe Manana 
---
 tests/btrfs/155 | 147 
 tests/btrfs/155.out |   6 +++
 tests/btrfs/group   |   1 +
 3 files changed, 154 insertions(+)
 create mode 100755 tests/btrfs/155
 create mode 100644 tests/btrfs/155.out

diff --git a/tests/btrfs/155 b/tests/btrfs/155
new file mode 100755
index ..37c23260
--- /dev/null
+++ b/tests/btrfs/155
@@ -0,0 +1,147 @@
+#! /bin/bash
+# FS QA Test No. btrfs/155
+#
+# Test that an incremental send operation works if a file that has multiple
+# hard links has some of its hard links renamed in the send snapshot, with one
+# of them getting the same path that some other inode had in the parent snapshot.
+#
+#---
+#
+# Copyright (C) 2017 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana 
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -fr $send_files_dir
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_test
+_require_scratch
+_require_fssum
+
+send_files_dir=$TEST_DIR/btrfs-test-$seq
+
+rm -f $seqres.full
+rm -fr $send_files_dir
+mkdir $send_files_dir
+
+_scratch_mkfs >>$seqres.full 2>&1
+_scratch_mount
+
+mkdir $SCRATCH_MNT/a
+touch $SCRATCH_MNT/a/f1
+mkdir -p $SCRATCH_MNT/a/b/c
+ln $SCRATCH_MNT/a/f1 $SCRATCH_MNT/a/b/c/f1l1
+touch $SCRATCH_MNT/a/b/f2
+mkdir $SCRATCH_MNT/d
+mv $SCRATCH_MNT/a/b/c/f1l1 $SCRATCH_MNT/d/f1l1_2
+ln $SCRATCH_MNT/a/b/f2 $SCRATCH_MNT/a/f2l1
+ln $SCRATCH_MNT/a/b/f2 $SCRATCH_MNT/d/f2l2
+mv $SCRATCH_MNT/a/f1 $SCRATCH_MNT/d/f1_2
+
+# Filesystem looks like:
+#
+# .  (ino 256)
+# | a/   (ino 257)
+# | | b/ (ino 259)
+# | | | c/   (ino 260)
+# | | | f2   (ino 261)
+# | |
+# | | f2l1   (ino 261)
+# |
+# | d/   (ino 262)
+#   | f1l1_2 (ino 258)
+#   | f2l2   (ino 261)
+#   | f1_2   (ino 258)
+#
+$BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT \
+   $SCRATCH_MNT/mysnap1 > /dev/null
+
+$BTRFS_UTIL_PROG send -f $send_files_dir/1.snap \
+   $SCRATCH_MNT/mysnap1 2>&1 1>/dev/null | _filter_scratch
+
+mv $SCRATCH_MNT/d $SCRATCH_MNT/a/b/d2
+mv $SCRATCH_MNT/a/f2l1 $SCRATCH_MNT/d
+mkdir $SCRATCH_MNT/a/f2l1
+ln $SCRATCH_MNT/a/b/d2/f1_2 $SCRATCH_MNT/a/b/f1l2
+mv $SCRATCH_MNT/a/b $SCRATCH_MNT/a/f2l1/b2
+mv $SCRATCH_MNT/a/f2l1/b2/d2 $SCRATCH_MNT/a/f2l1/b2/c/d3
+mv $SCRATCH_MNT/a/f2l1/b2/c/d3/f2l2 $SCRATCH_MNT/a/f2l1/b2/c/d3/f2l2_2
+
+# Filesystem now looks like:
+#
+# .  (ino 256)
+# | a/   (ino 257)
+# | | f2l1/  (ino 263)
+# | | b2/(ino 259)
+# |   | c/   (ino 260)
+# |   | | d3 (ino 262)
+# |   |   | f1l1_2   (ino 258)
+# |   |   | f2l2_2   (ino 261)
+# |   |   | f1_2 (ino 258)
+# 

  1   2   3   4   5   >