Re: [PATCH 9/9] btrfs: add explicit check for replace result no error

2018-11-07 Thread Anand Jain




On 11/07/2018 08:15 PM, Nikolay Borisov wrote:



On 7.11.18 г. 13:43 ч., Anand Jain wrote:

We recast the replace return status
BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS to 0, to indicate no
error.
And since BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR should also return 0,
which is also declared as 0, so we just return. Instead add it to the if
statement so that there is enough clarity while reading the code.

Signed-off-by: Anand Jain 
---
  fs/btrfs/dev-replace.c | 5 +++--
  1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index cf3554554616..ca44998189c7 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -533,8 +533,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info 
*fs_info,

args->start.cont_reading_from_srcdev_mode);
args->result = ret;
/* don't warn if EINPROGRESS, someone else might be running scrub */
-   if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS)
-   ret = 0;
+   if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
+   ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)


BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR can only be returned from
btrfs_dev_replace_cancel,


 It can return 0 and which is also
 BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR which this patch makes it
 explicit.

 Looking at this again tells me that, as btrfs_dev_replace_start()
 is internal helper function, its better if it free from all usage
 of BTRFS_IOCTL_DEV_REPLACE_RESULT*.

Thanks, Anand


so what you are doing with checking ret ==
BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR is redundant. So using
BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR is at the very least misleading
here.



I suggest you drop this patch.


+   return 0;
  
  	return ret;

  }



Re: [PATCH -next] btrfs: compression: remove set but not used variable 'tree'

2018-11-07 Thread Nikolay Borisov



On 8.11.18 г. 4:15 ч., YueHaibing wrote:
> Fixes gcc '-Wunused-but-set-variable' warning:
> 
> fs/btrfs/compression.c: In function 'end_compressed_bio_write':
> fs/btrfs/compression.c:232:25: warning:
>  variable 'tree' set but not used [-Wunused-but-set-variable]
> 
> It not used any more after
> commit 2922040236f9 ("btrfs: Remove extent_io_ops::writepage_end_io_hook")
> 
> Signed-off-by: YueHaibing 

Reviewed-by: Nikolay Borisov 

> ---
>  fs/btrfs/compression.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
> index bde8d04..088570c 100644
> --- a/fs/btrfs/compression.c
> +++ b/fs/btrfs/compression.c
> @@ -229,7 +229,6 @@ static noinline void end_compressed_writeback(struct 
> inode *inode,
>   */
>  static void end_compressed_bio_write(struct bio *bio)
>  {
> - struct extent_io_tree *tree;
>   struct compressed_bio *cb = bio->bi_private;
>   struct inode *inode;
>   struct page *page;
> @@ -248,7 +247,6 @@ static void end_compressed_bio_write(struct bio *bio)
>* call back into the FS and do all the end_io operations
>*/
>   inode = cb->inode;
> - tree = _I(inode)->io_tree;
>   cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
>   btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
>   cb->start, cb->start + cb->len - 1, NULL,
> 
> 
> 
> 


Re: [PATCH -next] btrfs: remove set but not used variable 'tree'

2018-11-07 Thread Nikolay Borisov



On 8.11.18 г. 4:14 ч., YueHaibing wrote:
> Fixes gcc '-Wunused-but-set-variable' warning:
> 
> fs/btrfs/extent_io.c: In function 'end_extent_writepage':
> fs/btrfs/extent_io.c:2406:25: warning:
>  variable 'tree' set but not used [-Wunused-but-set-variable]
> 
> It not used any more after
> commit 2922040236f9 ("btrfs: Remove extent_io_ops::writepage_end_io_hook")
> 
> Signed-off-by: YueHaibing 

Reviewed-by: Nikolay Borisov 

> ---
>  fs/btrfs/extent_io.c | 3 ---
>  1 file changed, 3 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 0f8f9c0..17a15cc 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -2403,11 +2403,8 @@ static int bio_readpage_error(struct bio *failed_bio, 
> u64 phy_offset,
>  void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
>  {
>   int uptodate = (err == 0);
> - struct extent_io_tree *tree;
>   int ret = 0;
>  
> - tree = _I(page->mapping->host)->io_tree;
> -
>   btrfs_writepage_endio_finish_ordered(page, start, end, NULL, uptodate);
>  
>   if (!uptodate) {
> 
> 
> 
> 


[PATCH v2 5/6] btrfs: qgroup: Use delayed subtree rescan for balance

2018-11-07 Thread Qu Wenruo
Before this patch, qgroup code trace the whole subtree of file and reloc
trees unconditionally.

This makes qgroup numbers consistent, but it could cause tons of
unnecessary extent trace, which cause a lot of overhead.

However for subtree swap of balance, since both subtree contains the
same content and tree structures, just swap them won't change qgroup
numbers.

It's the race window between subtree swap and transaction commit could
cause qgroup number change.

This patch will delay the qgroup subtree scan until CoW happens for the
subtree root.

So if there is no other operations for the fs, balance won't cause extra
qgroup overhead. (best case scenario)
And depends on the workload, most of the subtree scan can still be
avoided.

Only for worst case scenario, it will fall back to old subtree swap
overhead. (scan all swapped subtrees)

[[Benchmark]]
Hardware:
VM 4G vRAM, 8 vCPUs,
disk is using 'unsafe' cache mode,
backing device is SAMSUNG 850 evo SSD.
Host has 16G ram.

Mkfs parameter:
--nodesize 4K (To bump up tree size)

Initial subvolume contents:
4G data copied from /usr and /lib.
(With enough regular small files)

Snapshots:
16 snapshots of the original subvolume.
each snapshot has 3 random files modified.

balance parameter:
-m

So the content should be pretty similar to a real world root fs layout.

And after file system population, there is no other activity, so it
should be the best case scenario.

 | v4.20-rc1| w/ patchset| diff
---
relocated extents| 22615| 22457  | -0.1%
qgroup dirty extents | 163457   | 121606 | -25.6%
time (sys)   | 22.884s  | 18.842s| -17.6%
time (real)  | 27.724s  | 22.884s| -17.5%

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/ctree.c  |  8 
 fs/btrfs/qgroup.c | 87 +++
 fs/btrfs/qgroup.h |  2 +
 fs/btrfs/relocation.c | 14 +++
 4 files changed, 102 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 539901fb5165..f4b1f73ecb71 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -12,6 +12,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 #include "locking.h"
+#include "qgroup.h"
 
 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
  *root, struct btrfs_path *path, int level);
@@ -1462,6 +1463,13 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle 
*trans,
btrfs_set_lock_blocking(parent);
btrfs_set_lock_blocking(buf);
 
+   /*
+* Before CoWing this block for later modification, check if it's
+* the subtree root and do the delayed subtree trace if needed.
+*
+* Also We don't care about the error, as it's handled internally.
+*/
+   btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
 parent_slot, cow_ret, search_start, 0);
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 461895af512b..58ba106abad9 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3982,3 +3982,90 @@ int btrfs_qgroup_add_swapped_blocks(struct 
btrfs_trans_handle *trans,
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
return ret;
 }
+
+/*
+ * Check if the tree block is a subtree root, and if so do the needed
+ * delayed subtree trace for qgroup.
+ *
+ * This is called during btrfs_cow_block().
+ */
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+   struct btrfs_root *root, struct extent_buffer *file_eb)
+{
+   struct btrfs_fs_info *fs_info = root->fs_info;
+   struct btrfs_qgroup_swapped_blocks *blocks = >swapped_blocks;
+   struct btrfs_qgroup_swapped_block *block;
+   struct extent_buffer *reloc_eb = NULL;
+   struct rb_node *n;
+   bool found = false;
+   bool swapped = false;
+   int level = btrfs_header_level(file_eb);
+   int ret = 0;
+   int i;
+
+   if (!test_bit(BTRFS_FS_QUOTA_ENABLED, _info->flags))
+   return 0;
+   if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
+   return 0;
+
+   spin_lock(>lock);
+   if (!blocks->swapped) {
+   spin_unlock(>lock);
+   goto out;
+   }
+   n = blocks->blocks[level].rb_node;
+
+   while (n) {
+   block = rb_entry(n, struct btrfs_qgroup_swapped_block, node);
+   if (block->file_bytenr < file_eb->start)
+   n = n->rb_left;
+   else if (block->file_bytenr > file_eb->start)
+   n = n->rb_right;
+   else {
+   found = true;
+   break;
+ 

[PATCH v2 2/6] btrfs: relocation: Delay reloc tree deletion after merge_reloc_roots()

2018-11-07 Thread Qu Wenruo
Relocation code will drop btrfs_root::reloc_root as soon as
merge_reloc_root() finishes.

However later qgroup code will need to access btrfs_root::reloc_root
after merge_reloc_root() for delayed subtree rescan.

So alter the timming of resetting btrfs_root:::reloc_root, make it
happens after transaction commit.

With this patch, we will introduce a new btrfs_root::state,
BTRFS_ROOT_DEAD_RELOC_TREE, to info part of btrfs_root::reloc_tree user
that although btrfs_root::reloc_tree is still non-NULL, but still it's
not used any more.

The lifespan of btrfs_root::reloc tree will become:
  Old behavior|  New

btrfs_init_reloc_root()  ---  | btrfs_init_reloc_root()  ---
  set reloc_root  |   |   set reloc_root  |
  |   |   |
  |   |   |
merge_reloc_root()|   | merge_reloc_root()|
|- btrfs_update_reloc_root() ---  | |- btrfs_update_reloc_root() -+-
 clear btrfs_root::reloc_root |  set ROOT_DEAD_RELOC_TREE |
  |  record root into dirty   |
  |  roots rbtree |
  |   |
  | reloc_block_group() Or|
  | btrfs_recover_relocation()|
  | | After transaction commit|
  | |- clean_dirty_root()---
  | clear btrfs_root::reloc_root

During ROOT_DEAD_RELOC_TREE set lifespan, the only user of
btrfs_root::reloc_tree should be qgroup.

And to co-operate this, also delayed btrfs_drop_snapshot() call on reloc
tree, btrfs_drop_snapshot() call will also be delayed to
clean_dirty_root().

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/ctree.h  |   1 +
 fs/btrfs/relocation.c | 125 --
 2 files changed, 109 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 80953528572d..2c33506bdaaa 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1149,6 +1149,7 @@ struct btrfs_subvolume_writers {
 #define BTRFS_ROOT_FORCE_COW   6
 #define BTRFS_ROOT_MULTI_LOG_TASKS 7
 #define BTRFS_ROOT_DIRTY   8
+#define BTRFS_ROOT_DEAD_RELOC_TREE 9
 
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 924116f654a1..6f1f11b5d8f6 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -143,6 +143,20 @@ struct file_extent_cluster {
unsigned int nr;
 };
 
+/*
+ * Helper structure to keep record of a file tree whose reloc
+ * root needs to be cleaned up.
+ *
+ * Since reloc_control is used less frequently than btrfs_root, this should
+ * prevent us to add another structure in btrfs_root.
+ */
+struct dirty_source_root {
+   struct rb_node node;
+
+   /* Root must be file tree */
+   struct btrfs_root *root;
+};
+
 struct reloc_control {
/* block group to relocate */
struct btrfs_block_group_cache *block_group;
@@ -172,6 +186,9 @@ struct reloc_control {
u64 search_start;
u64 extents_found;
 
+   /* dirty source roots, whose reloc root needs to be cleaned up */
+   struct rb_root dirty_roots;
+
unsigned int stage:8;
unsigned int create_reloc_tree:1;
unsigned int merge_reloc_tree:1;
@@ -1467,15 +1484,17 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle 
*trans,
struct btrfs_root_item *root_item;
int ret;
 
-   if (!root->reloc_root)
+   if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, >state) ||
+   !root->reloc_root)
goto out;
 
reloc_root = root->reloc_root;
root_item = _root->root_item;
 
+   /* root->reloc_root will stay until current relocation finished */
if (fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
-   root->reloc_root = NULL;
+   set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, >state);
__del_reloc_root(reloc_root);
}
 
@@ -2120,6 +2139,84 @@ static int find_next_key(struct btrfs_path *path, int 
level,
return 1;
 }
 
+/*
+ * Helper to insert current root into reloc_control::dirty_roots
+ */
+static int insert_dirty_root(struct btrfs_trans_handle *trans,
+struct reloc_control *rc,
+struct btrfs_root *root)
+{
+   struct rb_node **p = >dirty_roots.rb_node;
+   struct rb_node *parent = NULL;
+   struct dirty_source_root *entry;
+   struct btrfs_root *reloc_root = root->reloc_root;
+   struct btrfs_root_item *reloc_root_item;
+   u64 

[PATCH v2 0/6] btrfs: qgroup: Delay subtree scan to reduce overhead

2018-11-07 Thread Qu Wenruo
This patchset can be fetched from github:
https://github.com/adam900710/linux/tree/qgroup_delayed_subtree_rebased

Which is based on v4.20-rc1.

This patch address the heavy load subtree scan, but delaying it until
we're going to modify the swapped tree block.

The overall workflow is:

1) Record the subtree root block get swapped.

   During subtree swap:
   O = Old tree blocks
   N = New tree blocks
 reloc tree file tree X
Root   Root
   /\ /\
 NA OB  OA  OB
   /  | |  \  /  |  |  \
 NC  ND OE  OF   OC  OD OE  OF

  In these case, NA and OA is going to be swapped, record (NA, OA) into
  file tree X.

2) After subtree swap.
 reloc tree file tree X
Root   Root
   /\ /\
 OA OB  NA  OB
   /  | |  \  /  |  |  \
 OC  OD OE  OF   NC  ND OE  OF

3a) CoW happens for OB
If we are going to CoW tree block OB, we check OB's bytenr against
tree X's swapped_blocks structure.
It doesn't fit any one, nothing will happen.

3b) CoW happens for NA
Check NA's bytenr against tree X's swapped_blocks, and get a hit.
Then we do subtree scan on both subtree OA and NA.
Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).

Then no matter what we do to file tree X, qgroup numbers will
still be correct.
Then NA's record get removed from X's swapped_blocks.

4)  Transaction commit
Any record in X's swapped_blocks get removed, since there is no
modification to swapped subtrees, no need to trigger heavy qgroup
subtree rescan for them.

[[Benchmark]]
Hardware:
VM 4G vRAM, 8 vCPUs,
disk is using 'unsafe' cache mode,
backing device is SAMSUNG 850 evo SSD.
Host has 16G ram.

Mkfs parameter:
--nodesize 4K (To bump up tree size)

Initial subvolume contents:
4G data copied from /usr and /lib.
(With enough regular small files)

Snapshots:
16 snapshots of the original subvolume.
each snapshot has 3 random files modified.

balance parameter:
-m

So the content should be pretty similar to a real world root fs layout.

And after file system population, there is no other activity, so it
should be the best case scenario.

 | v4.20-rc1| w/ patchset| diff
---
relocated extents| 22615| 22457  | -0.1%
qgroup dirty extents | 163457   | 121606 | -25.6%
time (sys)   | 22.884s  | 18.842s| -17.6%
time (real)  | 27.724s  | 22.884s| -17.5%

changelog:
v2:
  Rebase to v4.20-rc1.
  Instead commit transaction after each reloc tree merge, delay it until
  merge_reloc_roots() finishes.
  This provides a more natural behavior, and reduce the unnecessary
  transaction commits.

Qu Wenruo (6):
  btrfs: qgroup: Allow btrfs_qgroup_extent_record::old_roots unpopulated
at insert time
  btrfs: relocation: Delay reloc tree deletion after merge_reloc_roots()
  btrfs: qgroup: Refactor btrfs_qgroup_trace_subtree_swap()
  btrfs: qgroup: Introduce per-root swapped blocks infrastructure
  btrfs: qgroup: Use delayed subtree rescan for balance
  btrfs: qgroup: Cleanup old subtree swap code

 fs/btrfs/ctree.c   |   8 +
 fs/btrfs/ctree.h   |  14 ++
 fs/btrfs/disk-io.c |   1 +
 fs/btrfs/qgroup.c  | 376 +++--
 fs/btrfs/qgroup.h  | 107 +++-
 fs/btrfs/relocation.c  | 140 ---
 fs/btrfs/transaction.c |   1 +
 7 files changed, 527 insertions(+), 120 deletions(-)

-- 
2.19.1



[PATCH v2 6/6] btrfs: qgroup: Cleanup old subtree swap code

2018-11-07 Thread Qu Wenruo
Since it's replaced by new delayed subtree swap code, remove the
original code.

The cleanup is small since most of its core function is still used by
delayed subtree swap trace.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 94 ---
 fs/btrfs/qgroup.h |  6 ---
 2 files changed, 100 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 58ba106abad9..b662be1e35cc 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2099,100 +2099,6 @@ static int qgroup_trace_subtree_swap(struct 
btrfs_trans_handle *trans,
return ret;
 }
 
-/*
- * Inform qgroup to trace subtree swap used in balance.
- *
- * Unlike btrfs_qgroup_trace_subtree(), this function will only trace
- * new tree blocks whose generation is equal to (or larger than) 
@last_snapshot.
- *
- * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
- * @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
- * and then go down @src_eb (pointed by @src_parent and @src_slot) to find
- * the conterpart of the tree block, then mark both tree blocks as qgroup 
dirty,
- * and skip all tree blocks whose generation is smaller than last_snapshot.
- *
- * This would skip tons of tree blocks of original 
btrfs_qgroup_trace_subtree(),
- * which could be the cause of very slow balance if the file tree is large.
- *
- * @src_parent, @src_slot: pointer to src (file tree) eb.
- * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
- */
-int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
-   struct btrfs_block_group_cache *bg_cache,
-   struct extent_buffer *src_parent, int src_slot,
-   struct extent_buffer *dst_parent, int dst_slot,
-   u64 last_snapshot)
-{
-   struct btrfs_fs_info *fs_info = trans->fs_info;
-   struct btrfs_key first_key;
-   struct extent_buffer *src_eb = NULL;
-   struct extent_buffer *dst_eb = NULL;
-   bool trace_leaf = false;
-   u64 child_gen;
-   u64 child_bytenr;
-   int ret;
-
-   if (!test_bit(BTRFS_FS_QUOTA_ENABLED, _info->flags))
-   return 0;
-
-   /* Check parameter order */
-   if (btrfs_node_ptr_generation(src_parent, src_slot) >
-   btrfs_node_ptr_generation(dst_parent, dst_slot)) {
-   btrfs_err_rl(fs_info,
-   "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
-   btrfs_node_ptr_generation(src_parent, src_slot),
-   btrfs_node_ptr_generation(dst_parent, dst_slot));
-   return -EUCLEAN;
-   }
-
-   /*
-* Only trace leaf if we're relocating data block groups, this could
-* reduce tons of data extents tracing for meta/sys bg relocation.
-*/
-   if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
-   trace_leaf = true;
-   /* Read out real @src_eb, pointed by @src_parent and @src_slot */
-   child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
-   child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
-   btrfs_node_key_to_cpu(src_parent, _key, src_slot);
-
-   src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
-   btrfs_header_level(src_parent) - 1, _key);
-   if (IS_ERR(src_eb)) {
-   ret = PTR_ERR(src_eb);
-   goto out;
-   }
-
-   /* Read out real @dst_eb, pointed by @src_parent and @src_slot */
-   child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
-   child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
-   btrfs_node_key_to_cpu(dst_parent, _key, dst_slot);
-
-   dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
-   btrfs_header_level(dst_parent) - 1, _key);
-   if (IS_ERR(dst_eb)) {
-   ret = PTR_ERR(dst_eb);
-   goto out;
-   }
-
-   if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) 
{
-   ret = -EINVAL;
-   goto out;
-   }
-
-   /* Do the generation aware breadth-first search */
-   ret = qgroup_trace_subtree_swap(trans, src_eb, dst_eb, last_snapshot,
-   trace_leaf, true);
-   if (ret < 0)
-   goto out;
-   ret = 0;
-
-out:
-   free_extent_buffer(src_eb);
-   free_extent_buffer(dst_eb);
-   return ret;
-}
-
 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
   struct extent_buffer *root_eb,
   u64 root_gen, int root_level)
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 9f941421c405..3254add3c340 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -316,12 +316,6 @@ int btrfs_qgroup_trace_leaf_items(struct 
btrfs_trans_handle *trans,
 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle 

[PATCH v2 3/6] btrfs: qgroup: Refactor btrfs_qgroup_trace_subtree_swap()

2018-11-07 Thread Qu Wenruo
Refactor btrfs_qgroup_trace_subtree_swap() into
qgroup_trace_subtree_swap(), which only needs two extent buffer and some
other bool to control the behavior.

Also, allow depending functions to accept parameter @exec_post to
determine whether we need to trigger backref walk.

This provides the basis for later delayed subtree scan work.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 104 --
 1 file changed, 72 insertions(+), 32 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 6c674ac29b90..c50c369d5f16 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1793,7 +1793,7 @@ static int qgroup_trace_extent_swap(struct 
btrfs_trans_handle* trans,
struct extent_buffer *src_eb,
struct btrfs_path *dst_path,
int dst_level, int root_level,
-   bool trace_leaf)
+   bool trace_leaf, bool exec_post)
 {
struct btrfs_key key;
struct btrfs_path *src_path;
@@ -1884,22 +1884,23 @@ static int qgroup_trace_extent_swap(struct 
btrfs_trans_handle* trans,
 * Now both @dst_path and @src_path have been populated, record the tree
 * blocks for qgroup accounting.
 */
-   ret = btrfs_qgroup_trace_extent(trans, 
src_path->nodes[dst_level]->start,
-   nodesize, GFP_NOFS);
+   ret = qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
+ nodesize, GFP_NOFS, exec_post);
if (ret < 0)
goto out;
-   ret = btrfs_qgroup_trace_extent(trans,
-   dst_path->nodes[dst_level]->start,
-   nodesize, GFP_NOFS);
+   ret = qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start,
+ nodesize, GFP_NOFS, exec_post);
if (ret < 0)
goto out;
 
/* Record leaf file extents */
if (dst_level == 0 && trace_leaf) {
-   ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
+   ret = qgroup_trace_leaf_items(trans, src_path->nodes[0],
+ exec_post);
if (ret < 0)
goto out;
-   ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
+   ret = qgroup_trace_leaf_items(trans, dst_path->nodes[0],
+ exec_post);
}
 out:
btrfs_free_path(src_path);
@@ -1932,7 +1933,8 @@ static int qgroup_trace_new_subtree_blocks(struct 
btrfs_trans_handle* trans,
   struct extent_buffer *src_eb,
   struct btrfs_path *dst_path,
   int cur_level, int root_level,
-  u64 last_snapshot, bool trace_leaf)
+  u64 last_snapshot, bool trace_leaf,
+  bool exec_post)
 {
struct btrfs_fs_info *fs_info = trans->fs_info;
struct extent_buffer *eb;
@@ -2004,7 +2006,7 @@ static int qgroup_trace_new_subtree_blocks(struct 
btrfs_trans_handle* trans,
 
/* Now record this tree block and its counter part for qgroups */
ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
-  root_level, trace_leaf);
+  root_level, trace_leaf, exec_post);
if (ret < 0)
goto cleanup;
 
@@ -2021,7 +2023,7 @@ static int qgroup_trace_new_subtree_blocks(struct 
btrfs_trans_handle* trans,
/* Recursive call (at most 7 times) */
ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
dst_path, cur_level - 1, root_level,
-   last_snapshot, trace_leaf);
+   last_snapshot, trace_leaf, exec_post);
if (ret < 0)
goto cleanup;
}
@@ -2041,6 +2043,62 @@ static int qgroup_trace_new_subtree_blocks(struct 
btrfs_trans_handle* trans,
return ret;
 }
 
+static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+   struct extent_buffer *src_eb,
+   struct extent_buffer *dst_eb,
+   u64 last_snapshot, bool trace_leaf,
+   bool exec_post)
+{
+   struct btrfs_fs_info *fs_info = trans->fs_info;
+   struct btrfs_path *dst_path = NULL;
+   int level;
+   int ret;
+
+   if (!test_bit(BTRFS_FS_QUOTA_ENABLED, _info->flags))
+   return 0;
+
+   /* Wrong parameter order */
+   if 

[PATCH v2 4/6] btrfs: qgroup: Introduce per-root swapped blocks infrastructure

2018-11-07 Thread Qu Wenruo
To allow delayed subtree swap rescan, btrfs needs to record per-root
info about which tree blocks get swapped.

So this patch introduces per-root btrfs_qgroup_swapped_blocks structure,
which records which tree blocks get swapped.

The designed workflow will be:

1) Record the subtree root block get swapped.

   During subtree swap:
   O = Old tree blocks
   N = New tree blocks
 reloc tree file tree X
Root   Root
   /\ /\
 NA OB  OA  OB
   /  | |  \  /  |  |  \
 NC  ND OE  OF   OC  OD OE  OF

  In these case, NA and OA is going to be swapped, record (NA, OA) into
  file tree X.

2) After subtree swap.
 reloc tree file tree X
Root   Root
   /\ /\
 OA OB  NA  OB
   /  | |  \  /  |  |  \
 OC  OD OE  OF   NC  ND OE  OF

3a) CoW happens for OB
If we are going to CoW tree block OB, we check OB's bytenr against
tree X's swapped_blocks structure.
It doesn't fit any one, nothing will happen.

3b) CoW happens for NA
Check NA's bytenr against tree X's swapped_blocks, and get a hit.
Then we do subtree scan on both subtree OA and NA.
Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).

Then no matter what we do to file tree X, qgroup numbers will
still be correct.
Then NA's record get removed from X's swapped_blocks.

4)  Transaction commit
Any record in X's swapped_blocks get removed, since there is no
modification to swapped subtrees, no need to trigger heavy qgroup
subtree rescan for them.

This will introduce 128 bytes overhead for each btrfs_root even qgroup
is not enabled.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/ctree.h   |  13 +
 fs/btrfs/disk-io.c |   1 +
 fs/btrfs/qgroup.c  | 130 +
 fs/btrfs/qgroup.h  |  99 +++
 fs/btrfs/relocation.c  |   7 +++
 fs/btrfs/transaction.c |   1 +
 6 files changed, 251 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c33506bdaaa..e32fcf211c8a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1151,6 +1151,17 @@ struct btrfs_subvolume_writers {
 #define BTRFS_ROOT_DIRTY   8
 #define BTRFS_ROOT_DEAD_RELOC_TREE 9
 
+/*
+ * Record swapped tree blocks of a file/subvolume tree for delayed subtree
+ * trace code. For detail check comment in fs/btrfs/qgroup.c.
+ */
+struct btrfs_qgroup_swapped_blocks {
+   spinlock_t lock;
+   struct rb_root blocks[BTRFS_MAX_LEVEL];
+   /* RM_EMPTY_ROOT() of above blocks[] */
+   bool swapped;
+};
+
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.
@@ -1275,6 +1286,8 @@ struct btrfs_root {
u64 qgroup_meta_rsv_pertrans;
u64 qgroup_meta_rsv_prealloc;
 
+   struct btrfs_qgroup_swapped_blocks swapped_blocks;
+
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0ab41da91d1..bd37c3ee2fa9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1204,6 +1204,7 @@ static void __setup_root(struct btrfs_root *root, struct 
btrfs_fs_info *fs_info,
root->anon_dev = 0;
 
spin_lock_init(>root_item_lock);
+   btrfs_qgroup_init_swapped_blocks(>swapped_blocks);
 }
 
 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c50c369d5f16..461895af512b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3852,3 +3852,133 @@ void btrfs_qgroup_check_reserved_leak(struct inode 
*inode)
}
extent_changeset_release();
 }
+
+/*
+ * Delete all swapped blocks record of @root.
+ * Every record here means we skipped a full subtree scan for qgroup.
+ *
+ * Get called when commit one transaction.
+ */
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
+{
+   struct btrfs_qgroup_swapped_blocks *swapped_blocks;
+   int i;
+
+   swapped_blocks = >swapped_blocks;
+
+   spin_lock(_blocks->lock);
+   if (!swapped_blocks->swapped)
+   goto out;
+   for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+   struct rb_root *cur_root = _blocks->blocks[i];
+   struct btrfs_qgroup_swapped_block *entry;
+   struct btrfs_qgroup_swapped_block *next;
+
+   rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
+node)
+   kfree(entry);
+   swapped_blocks->blocks[i] = RB_ROOT;
+   }
+   

[PATCH v2 1/6] btrfs: qgroup: Allow btrfs_qgroup_extent_record::old_roots unpopulated at insert time

2018-11-07 Thread Qu Wenruo
Commit fb235dc06fac ("btrfs: qgroup: Move half of the qgroup accounting
time out of commit trans") makes btrfs_qgroup_extent_record::old_roots
populated at insert time.

It's OK for most cases as btrfs_qgroup_extent_record is inserted at
delayed ref head insert time, which has a less restrict lock context.

But later delayed subtree scan optimization will need to insert
btrfs_qgroup_extent_record with path write lock hold, where triggering a
backref walk can easily lead to dead lock.

So this patch introduces two new internal functions,
qgroup_trace_extent() and qgroup_trace_leaf_items(), with new @exec_post
parameter to info whether we need to initialize the backref walk right
now.

Also modifies btrfs_qgroup_account_extents() not to trigger kernel
warning.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 51 +--
 1 file changed, 41 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 45868fd76209..6c674ac29b90 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1580,8 +1580,16 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info 
*fs_info,
return 0;
 }
 
-int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
- u64 num_bytes, gfp_t gfp_flag)
+/*
+ * Insert qgroup extent record for extent at @bytenr, @num_bytes.
+ *
+ * @bytenr:bytenr of the extent
+ * @num_bytes: length of the extent
+ * @exec_post: whether to exec the post insert work
+ * will init backref walk if set to true.
+ */
+static int qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
+  u64 num_bytes, gfp_t gfp_flag, bool exec_post)
 {
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup_extent_record *record;
@@ -1607,11 +1615,27 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle 
*trans, u64 bytenr,
kfree(record);
return 0;
}
-   return btrfs_qgroup_trace_extent_post(fs_info, record);
+   if (exec_post)
+   return btrfs_qgroup_trace_extent_post(fs_info, record);
+   return 0;
 }
 
-int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
- struct extent_buffer *eb)
+int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
+ u64 num_bytes, gfp_t gfp_flag)
+{
+   return qgroup_trace_extent(trans, bytenr, num_bytes, gfp_flag, true);
+}
+
+/*
+ * Insert qgroup extent record for leaf and all file extents in it
+ *
+ * @bytenr:bytenr of the leaf
+ * @num_bytes: length of the leaf
+ * @exec_post: whether to exec the post insert work
+ * will init backref walk if set to true.
+ */
+static int qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
+  struct extent_buffer *eb, bool exec_post)
 {
struct btrfs_fs_info *fs_info = trans->fs_info;
int nr = btrfs_header_nritems(eb);
@@ -1643,8 +1667,8 @@ int btrfs_qgroup_trace_leaf_items(struct 
btrfs_trans_handle *trans,
 
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
 
-   ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
-   GFP_NOFS);
+   ret = qgroup_trace_extent(trans, bytenr, num_bytes, GFP_NOFS,
+ exec_post);
if (ret)
return ret;
}
@@ -1652,6 +1676,12 @@ int btrfs_qgroup_trace_leaf_items(struct 
btrfs_trans_handle *trans,
return 0;
 }
 
+int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
+ struct extent_buffer *eb)
+{
+   return qgroup_trace_leaf_items(trans, eb, true);
+}
+
 /*
  * Walk up the tree from the bottom, freeing leaves and any interior
  * nodes which have had all slots visited. If a node (leaf or
@@ -2558,10 +2588,11 @@ int btrfs_qgroup_account_extents(struct 
btrfs_trans_handle *trans)
 
if (!ret) {
/*
-* Old roots should be searched when inserting qgroup
-* extent record
+* Most record->old_roots should have been populated at
+* insert time. Although we still allow some records
+* without old_roots populated.
 */
-   if (WARN_ON(!record->old_roots)) {
+   if (!record->old_roots) {
/* Search commit root to find old_roots */
ret = btrfs_find_all_roots(NULL, fs_info,
record->bytenr, 0,
-- 
2.19.1



[PATCH -next] btrfs: compression: remove set but not used variable 'tree'

2018-11-07 Thread YueHaibing
Fixes gcc '-Wunused-but-set-variable' warning:

fs/btrfs/compression.c: In function 'end_compressed_bio_write':
fs/btrfs/compression.c:232:25: warning:
 variable 'tree' set but not used [-Wunused-but-set-variable]

It not used any more after
commit 2922040236f9 ("btrfs: Remove extent_io_ops::writepage_end_io_hook")

Signed-off-by: YueHaibing 
---
 fs/btrfs/compression.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index bde8d04..088570c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -229,7 +229,6 @@ static noinline void end_compressed_writeback(struct inode 
*inode,
  */
 static void end_compressed_bio_write(struct bio *bio)
 {
-   struct extent_io_tree *tree;
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
struct page *page;
@@ -248,7 +247,6 @@ static void end_compressed_bio_write(struct bio *bio)
 * call back into the FS and do all the end_io operations
 */
inode = cb->inode;
-   tree = _I(inode)->io_tree;
cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
cb->start, cb->start + cb->len - 1, NULL,





[PATCH -next] btrfs: remove set but not used variable 'tree'

2018-11-07 Thread YueHaibing
Fixes gcc '-Wunused-but-set-variable' warning:

fs/btrfs/extent_io.c: In function 'end_extent_writepage':
fs/btrfs/extent_io.c:2406:25: warning:
 variable 'tree' set but not used [-Wunused-but-set-variable]

It not used any more after
commit 2922040236f9 ("btrfs: Remove extent_io_ops::writepage_end_io_hook")

Signed-off-by: YueHaibing 
---
 fs/btrfs/extent_io.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0f8f9c0..17a15cc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2403,11 +2403,8 @@ static int bio_readpage_error(struct bio *failed_bio, 
u64 phy_offset,
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
 {
int uptodate = (err == 0);
-   struct extent_io_tree *tree;
int ret = 0;
 
-   tree = _I(page->mapping->host)->io_tree;
-
btrfs_writepage_endio_finish_ordered(page, start, end, NULL, uptodate);
 
if (!uptodate) {





Re: [PATCH] Btrfs: fix data corruption due to cloning of eof block

2018-11-07 Thread Josef Bacik
On Mon, Nov 05, 2018 at 11:14:17AM +, fdman...@kernel.org wrote:
> From: Filipe Manana 
> 
> We currently allow cloning a range from a file which includes the last
> block of the file even if the file's size is not aligned to the block
> size. This is fine and useful when the destination file has the same size,
> but when it does not and the range ends somewhere in the middle of the
> destination file, it leads to corruption because the bytes between the EOF
> and the end of the block have undefined data (when there is support for
> discard/trimming they have a value of 0x00).
> 
> Example:
> 
>  $ mkfs.btrfs -f /dev/sdb
>  $ mount /dev/sdb /mnt
> 
>  $ export foo_size=$((256 * 1024 + 100))
>  $ xfs_io -f -c "pwrite -S 0x3c 0 $foo_size" /mnt/foo
>  $ xfs_io -f -c "pwrite -S 0xb5 0 1M" /mnt/bar
> 
>  $ xfs_io -c "reflink /mnt/foo 0 512K $foo_size" /mnt/bar
> 
>  $ od -A d -t x1 /mnt/bar
>  000 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5
>  *
>  0524288 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c
>  *
>  0786528 3c 3c 3c 3c 00 00 00 00 00 00 00 00 00 00 00 00
>  0786544 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>  *
>  0790528 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5
>  *
>  1048576
> 
> The bytes in the range from 786532 (512Kb + 256Kb + 100 bytes) to 790527
> (512Kb + 256Kb + 4Kb - 1) got corrupted, having now a value of 0x00 instead
> of 0xb5.
> 
> This is similar to the problem we had for deduplication that got recently
> fixed by commit de02b9f6bb65 ("Btrfs: fix data corruption when
> deduplicating between different files").
> 
> Fix this by not allowing such operations to be performed and return the
> errno -EINVAL to user space. This is what XFS is doing as well at the VFS
> level. This change however now makes us return -EINVAL instead of
> -EOPNOTSUPP for cases where the source range maps to an inline extent and
> the destination range's end is smaller then the destination file's size,
> since the detection of inline extents is done during the actual process of
> dropping file extent items (at __btrfs_drop_extents()). Returning the
> -EINVAL error is done early on and solely based on the input parameters
> (offsets and length) and destination file's size. This makes us consistent
> with XFS and anyone else supporting cloning since this case is now checked
> at a higher level in the VFS and is where the -EINVAL will be returned
> from starting with kernel 4.20 (the VFS changed was introduced in 4.20-rc1
> by commit 07d19dc9fbe9 ("vfs: avoid problematic remapping requests into
> partial EOF block"). So this change is more geared towards stable kernels,
> as it's unlikely the new VFS checks get removed intentionally.
> 
> A test case for fstests follows soon, as well as an update to filter
> existing tests that expect -EOPNOTSUPP to accept -EINVAL as well.
> 
> CC:  # 4.4+
> Signed-off-by: Filipe Manana 

Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-07 Thread David Sterba
On Wed, Oct 31, 2018 at 10:06:08AM -0700, Omar Sandoval wrote:
> From: Omar Sandoval 
> 
> There's a race between close_ctree() and cleaner_kthread().
> close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it
> sees it set, but this is racy; the cleaner might have already checked
> the bit and could be cleaning stuff. In particular, if it deletes unused
> block groups, it will create delayed iputs for the free space cache
> inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
> longer running delayed iputs after a commit. Therefore, if the cleaner
> creates more delayed iputs after delayed iputs are run in
> btrfs_commit_super(), we will leak inodes on unmount and get a busy
> inode crash from the VFS.
> 
> Fix it by parking the cleaner before we actually close anything. Then,
> any remaining delayed iputs will always be handled in
> btrfs_commit_super(). This also ensures that the commit in close_ctree()
> is really the last commit, so we can get rid of the commit in
> cleaner_kthread().
> 
> Fixes: 30928e9baac2 ("btrfs: don't run delayed_iputs in commit")
> Signed-off-by: Omar Sandoval 

I'll queue this patch for rc2 as it fixes crashes I see during testing.
My version does more changes and would be more suitable for a series,
that could actually document the shutdown sequence and add a few
assertions on top.

Reviewed-by: David Sterba 


Re: [PATCH v9 0/6] Btrfs: implement swap file support

2018-11-07 Thread David Sterba
On Wed, Nov 07, 2018 at 05:07:00PM +0200, Nikolay Borisov wrote:
> 
> 
> On 7.11.18 г. 16:49 ч., David Sterba wrote:
> > On Tue, Nov 06, 2018 at 10:54:51AM +0100, David Sterba wrote:
> >> On Thu, Sep 27, 2018 at 11:17:32AM -0700, Omar Sandoval wrote:
> >>> From: Omar Sandoval 
> >>> This series implements swap file support for Btrfs.
> >>>
> >>> Changes from v8 [1]:
> >>>
> >>> - Fixed a bug in btrfs_swap_activate() which would cause us to miss some
> >>>   file extents if they were merged into one extent map entry.
> >>> - Fixed build for !CONFIG_SWAP.
> >>> - Changed all error messages to KERN_WARN.
> >>> - Unindented long error messages.
> >>>
> >>> I've Cc'd Jon and Al on patch 3 this time, so hopefully we can get an
> >>> ack for that one, too.
> >>>
> >>> Thanks!
> >>>
> >>> 1: https://www.spinics.net/lists/linux-btrfs/msg82267.html
> >>>
> >>> Omar Sandoval (6):
> >>>   mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS
> >>>   mm: export add_swap_extent()
> >>>   vfs: update swap_{,de}activate documentation
> >>>   Btrfs: prevent ioctls from interfering with a swap file
> >>>   Btrfs: rename get_chunk_map() and make it non-static
> >>>   Btrfs: support swap files
> >>
> >> fstest generic/472 reports an assertion failure. This is on the updated 
> >> fstests
> >> git (70c4067285b0bc076), though it should not matter:
> >>
> >> [16597.002190] assertion failed: IS_ALIGNED(start, fs_info->sectorsize) && 
> >> IS_ALIGNED(end + 1, fs_info->sectorsize), file: fs/btrfs/file-item.c, 
> >> line: 319
> > 
> > I have to revert the patch for now as it kills the testing machines.
> 
> The reason is that the isize is not aligned to a sectorsize. Ie it
> should be:
> 
> +   u64 isize = ALIGN_DOWN(i_size_read(inode), fs_info->sectorsize);
> 
> With this fixlet generic/472 succeeds.

Thanks for the fix, I'll fold it in.


Re: [PATCH v9 0/6] Btrfs: implement swap file support

2018-11-07 Thread Nikolay Borisov



On 7.11.18 г. 16:49 ч., David Sterba wrote:
> On Tue, Nov 06, 2018 at 10:54:51AM +0100, David Sterba wrote:
>> On Thu, Sep 27, 2018 at 11:17:32AM -0700, Omar Sandoval wrote:
>>> From: Omar Sandoval 
>>> This series implements swap file support for Btrfs.
>>>
>>> Changes from v8 [1]:
>>>
>>> - Fixed a bug in btrfs_swap_activate() which would cause us to miss some
>>>   file extents if they were merged into one extent map entry.
>>> - Fixed build for !CONFIG_SWAP.
>>> - Changed all error messages to KERN_WARN.
>>> - Unindented long error messages.
>>>
>>> I've Cc'd Jon and Al on patch 3 this time, so hopefully we can get an
>>> ack for that one, too.
>>>
>>> Thanks!
>>>
>>> 1: https://www.spinics.net/lists/linux-btrfs/msg82267.html
>>>
>>> Omar Sandoval (6):
>>>   mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS
>>>   mm: export add_swap_extent()
>>>   vfs: update swap_{,de}activate documentation
>>>   Btrfs: prevent ioctls from interfering with a swap file
>>>   Btrfs: rename get_chunk_map() and make it non-static
>>>   Btrfs: support swap files
>>
>> fstest generic/472 reports an assertion failure. This is on the updated 
>> fstests
>> git (70c4067285b0bc076), though it should not matter:
>>
>> [16597.002190] assertion failed: IS_ALIGNED(start, fs_info->sectorsize) && 
>> IS_ALIGNED(end + 1, fs_info->sectorsize), file: fs/btrfs/file-item.c, line: 
>> 319
> 
> I have to revert the patch for now as it kills the testing machines.

The reason is that the isize is not aligned to a sectorsize. Ie it
should be:

+   u64 isize = ALIGN_DOWN(i_size_read(inode), fs_info->sectorsize);

With this fixlet generic/472 succeeds.

> 


Re: [PATCH v9 0/6] Btrfs: implement swap file support

2018-11-07 Thread David Sterba
On Tue, Nov 06, 2018 at 10:54:51AM +0100, David Sterba wrote:
> On Thu, Sep 27, 2018 at 11:17:32AM -0700, Omar Sandoval wrote:
> > From: Omar Sandoval 
> > This series implements swap file support for Btrfs.
> > 
> > Changes from v8 [1]:
> > 
> > - Fixed a bug in btrfs_swap_activate() which would cause us to miss some
> >   file extents if they were merged into one extent map entry.
> > - Fixed build for !CONFIG_SWAP.
> > - Changed all error messages to KERN_WARN.
> > - Unindented long error messages.
> > 
> > I've Cc'd Jon and Al on patch 3 this time, so hopefully we can get an
> > ack for that one, too.
> > 
> > Thanks!
> > 
> > 1: https://www.spinics.net/lists/linux-btrfs/msg82267.html
> > 
> > Omar Sandoval (6):
> >   mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS
> >   mm: export add_swap_extent()
> >   vfs: update swap_{,de}activate documentation
> >   Btrfs: prevent ioctls from interfering with a swap file
> >   Btrfs: rename get_chunk_map() and make it non-static
> >   Btrfs: support swap files
> 
> fstest generic/472 reports an assertion failure. This is on the updated 
> fstests
> git (70c4067285b0bc076), though it should not matter:
> 
> [16597.002190] assertion failed: IS_ALIGNED(start, fs_info->sectorsize) && 
> IS_ALIGNED(end + 1, fs_info->sectorsize), file: fs/btrfs/file-item.c, line: 
> 319

I have to revert the patch for now as it kills the testing machines.


Re: [PATCH 2/9] btrfs: replace go back to suspended if target missing

2018-11-07 Thread Nikolay Borisov



On 7.11.18 г. 13:43 ч., Anand Jain wrote:
> At the time of forced unmount we place the running replace to
> BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state, so when the system comes
> back and suppose the target device is missing, then let the replace
> state continue to be in BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state
> instead of BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED as there isn't any
> matching scrub running as part of replace.
> 
> Signed-off-by: Anand Jain 
> ---
>  fs/btrfs/dev-replace.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
> index 59991165e126..47d6768a9cde 100644
> --- a/fs/btrfs/dev-replace.c
> +++ b/fs/btrfs/dev-replace.c
> @@ -884,6 +884,9 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info 
> *fs_info)
>  "cannot continue dev_replace, tgtdev is missing");
>   btrfs_info(fs_info,
>  "you may cancel the operation after 'mount -o 
> degraded'");
> + dev_replace->replace_state =
> + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
> + dev_replace->item_needs_writeback = 1;

Why do we need items_needs_writeback = 1 here, nothing is changed w.r.t
on-disk data?

>   btrfs_dev_replace_write_unlock(dev_replace);
>   return 0;
>   }
> 


Re: [PATCH 5/9] btrfs: replace cancel is successful if scrub cancel is successful

2018-11-07 Thread Nikolay Borisov



On 7.11.18 г. 14:25 ч., Nikolay Borisov wrote:
> 
> 
> On 7.11.18 г. 13:43 ч., Anand Jain wrote:
>> In btrfs_dev_replace_cancel() we should check if the
>> btrfs_scrub_cancel() is successful. If the btrfs_scrub_cancel() fails, return
>> BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED so that user can try to
>> cancel the replace again.
>>
>> Signed-off-by: Anand Jain 
>> ---
>>  fs/btrfs/dev-replace.c | 22 +-
>>  1 file changed, 13 insertions(+), 9 deletions(-)
>>
>> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
>> index 90c124edec76..c092ed559714 100644
>> --- a/fs/btrfs/dev-replace.c
>> +++ b/fs/btrfs/dev-replace.c
>> @@ -799,18 +799,22 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info 
>> *fs_info)
>>  btrfs_dev_replace_write_unlock(dev_replace);
>>  break;
>>  case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
>> -result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
>>  tgt_device = dev_replace->tgtdev;
>>  src_device = dev_replace->srcdev;
>>  btrfs_dev_replace_write_unlock(dev_replace);
>> -btrfs_scrub_cancel(fs_info);
>> -/*
>> - * btrfs_dev_replace_finishing() will handle the cleanup part
>> - */
>> -btrfs_info_in_rcu(fs_info,
>> -"dev_replace from %s (devid %llu) to %s canceled",
>> -btrfs_dev_name(src_device), src_device->devid,
>> -btrfs_dev_name(tgt_device));
>> +ret = btrfs_scrub_cancel(fs_info);
>> +if (ret) {
>> +result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
>> +} else {
>> +result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
>> +/*
>> + * btrfs_dev_replace_finishing() will handle the 
>> cleanup part
>> + */
>> +btrfs_info_in_rcu(fs_info,
>> +"dev_replace from %s (devid %llu) to %s 
>> canceled",
>> +btrfs_dev_name(src_device), src_device->devid,
>> +btrfs_dev_name(tgt_device));
> 
> This is identical to the btrfs_info_in_rcu several lines further down.
> So if btrfs_scrub_cancel is successful you will print this messages
> twice. Furthermore, there is already an unconditinal call to
> btrfs_scrub_cancel. You are duplicating this function, this definitely
> needs another look...

Ah, it builds on top of the previous patch which I still haven't
reviewed. So ignore this, my bad.
> 
>> +}
>>  break;
>>  case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
>>  /*
>>


Re: [PATCH 5/9] btrfs: replace cancel is successful if scrub cancel is successful

2018-11-07 Thread Nikolay Borisov



On 7.11.18 г. 13:43 ч., Anand Jain wrote:
> In btrfs_dev_replace_cancel() we should check if the
> btrfs_scrub_cancel() is successful. If the btrfs_scrub_cancel() fails, return
> BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED so that user can try to
> cancel the replace again.
> 
> Signed-off-by: Anand Jain 
> ---
>  fs/btrfs/dev-replace.c | 22 +-
>  1 file changed, 13 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
> index 90c124edec76..c092ed559714 100644
> --- a/fs/btrfs/dev-replace.c
> +++ b/fs/btrfs/dev-replace.c
> @@ -799,18 +799,22 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info 
> *fs_info)
>   btrfs_dev_replace_write_unlock(dev_replace);
>   break;
>   case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
> - result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
>   tgt_device = dev_replace->tgtdev;
>   src_device = dev_replace->srcdev;
>   btrfs_dev_replace_write_unlock(dev_replace);
> - btrfs_scrub_cancel(fs_info);
> - /*
> -  * btrfs_dev_replace_finishing() will handle the cleanup part
> -  */
> - btrfs_info_in_rcu(fs_info,
> - "dev_replace from %s (devid %llu) to %s canceled",
> - btrfs_dev_name(src_device), src_device->devid,
> - btrfs_dev_name(tgt_device));
> + ret = btrfs_scrub_cancel(fs_info);
> + if (ret) {
> + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
> + } else {
> + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
> + /*
> +  * btrfs_dev_replace_finishing() will handle the 
> cleanup part
> +  */
> + btrfs_info_in_rcu(fs_info,
> + "dev_replace from %s (devid %llu) to %s 
> canceled",
> + btrfs_dev_name(src_device), src_device->devid,
> + btrfs_dev_name(tgt_device));

This is identical to the btrfs_info_in_rcu several lines further down.
So if btrfs_scrub_cancel is successful you will print this messages
twice. Furthermore, there is already an unconditinal call to
btrfs_scrub_cancel. You are duplicating this function, this definitely
needs another look...

> + }
>   break;
>   case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
>   /*
> 


Re: [PATCH 6/9] btrfs: replace's scrub must not be running in replace suspended state

2018-11-07 Thread Nikolay Borisov



On 7.11.18 г. 13:43 ч., Anand Jain wrote:
> + /* scrub for replace must not be running in suspended state */
> + if (btrfs_scrub_cancel(fs_info) != -ENOTCONN)
> + ASSERT(0);

ASSERT(btrfs_scrub_cancel(fs_info) == -ENOTCONN)


Re: [PATCH 7/9] btrfs: quiten warn if the replace is canceled at finish

2018-11-07 Thread Nikolay Borisov



On 7.11.18 г. 13:43 ч., Anand Jain wrote:
> - WARN_ON(ret);
> + if (ret != -ECANCELED)
> + WARN_ON(ret);

WARN_ON(ret && ret != -ECANCELED)


Re: [PATCH 9/9] btrfs: add explicit check for replace result no error

2018-11-07 Thread Nikolay Borisov



On 7.11.18 г. 13:43 ч., Anand Jain wrote:
> We recast the replace return status
> BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS to 0, to indicate no
> error.
> And since BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR should also return 0,
> which is also declared as 0, so we just return. Instead add it to the if
> statement so that there is enough clarity while reading the code.
> 
> Signed-off-by: Anand Jain 
> ---
>  fs/btrfs/dev-replace.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
> index cf3554554616..ca44998189c7 100644
> --- a/fs/btrfs/dev-replace.c
> +++ b/fs/btrfs/dev-replace.c
> @@ -533,8 +533,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info 
> *fs_info,
>   
> args->start.cont_reading_from_srcdev_mode);
>   args->result = ret;
>   /* don't warn if EINPROGRESS, someone else might be running scrub */
> - if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS)
> - ret = 0;
> + if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
> + ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)

BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR can only be returned from
btrfs_dev_replace_cancel, so what you are doing with checking ret ==
BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR is redundant. So using
BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR is at the very least misleading
here.

I suggest you drop this patch.

> + return 0;
>  
>   return ret;
>  }
> 


Re: [PATCH 1/9] btrfs: mark btrfs_dev_replace_start() as static

2018-11-07 Thread Nikolay Borisov



On 7.11.18 г. 13:43 ч., Anand Jain wrote:
> There isn't any other consumer other than in its own file dev-replace.c.
> 
> Signed-off-by: Anand Jain 

Reviewed-by: Nikolay Borisov 

> ---
>  fs/btrfs/dev-replace.c | 2 +-
>  fs/btrfs/dev-replace.h | 3 ---
>  2 files changed, 1 insertion(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
> index 2aa48aecc52b..59991165e126 100644
> --- a/fs/btrfs/dev-replace.c
> +++ b/fs/btrfs/dev-replace.c
> @@ -390,7 +390,7 @@ static char* btrfs_dev_name(struct btrfs_device *device)
>   return rcu_str_deref(device->name);
>  }
>  
> -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
> +static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
>   const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
>   int read_src)
>  {
> diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
> index 795c551f5b5e..27e3bb0cca11 100644
> --- a/fs/btrfs/dev-replace.h
> +++ b/fs/btrfs/dev-replace.h
> @@ -13,9 +13,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
> struct btrfs_fs_info *fs_info);
>  int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
>   struct btrfs_ioctl_dev_replace_args *args);
> -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
> - const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
> - int read_src);
>  void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
> struct btrfs_ioctl_dev_replace_args *args);
>  int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
> 


[PATCH 7/9] btrfs: quiten warn if the replace is canceled at finish

2018-11-07 Thread Anand Jain
When we successfully cancel the replace its scrub returns -ECANCELED,
which then passed to btrfs_dev_replace_finishing(), it cleans up based
on the scrub returned status and propagates the same -ECANCELED back
the parent function. So skip the -ECANCELED error to log the WARN.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index dae2b920f1a9..c14c41b70287 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -497,7 +497,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info 
*fs_info,
ret = btrfs_dev_replace_finishing(fs_info, ret);
if (ret == -EINPROGRESS) {
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
-   } else {
+   } else if (ret != -ECANCELED) {
WARN_ON(ret);
}
 
@@ -956,7 +956,8 @@ static int btrfs_dev_replace_kthread(void *data)
  btrfs_device_get_total_bytes(dev_replace->srcdev),
  _replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
-   WARN_ON(ret);
+   if (ret != -ECANCELED)
+   WARN_ON(ret);
 
clear_bit(BTRFS_FS_EXCL_OP, _info->flags);
return 0;
-- 
1.8.3.1



[PATCH 8/9] btrfs: user requsted replace cancel is not an error

2018-11-07 Thread Anand Jain
As of now only user requested replace cancel can cancel the replace-scrub
so no need to log error for it.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index c14c41b70287..cf3554554616 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -622,7 +622,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info 
*fs_info,
src_device,
tgt_device);
} else {
-   btrfs_err_in_rcu(fs_info,
+   if (scrub_ret != -ECANCELED)
+   btrfs_err_in_rcu(fs_info,
 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
 btrfs_dev_name(src_device),
 src_device->devid,
-- 
1.8.3.1



[PATCH 6/9] btrfs: replace's scrub must not be running in replace suspended state

2018-11-07 Thread Anand Jain
When the replace state is placed in the suspended state,
btrfs_scrub_cancel() should fail with -ENOTCONN as there is no
scrub running, as a safety catch check if btrfs_scrub_cancel()
returns -ENOTCONN and assert if it doesn't.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index c092ed559714..dae2b920f1a9 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -831,7 +831,9 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 
btrfs_dev_replace_write_unlock(dev_replace);
 
-   btrfs_scrub_cancel(fs_info);
+   /* scrub for replace must not be running in suspended state */
+   if (btrfs_scrub_cancel(fs_info) != -ENOTCONN)
+   ASSERT(0);
 
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
-- 
1.8.3.1



[PATCH 9/9] btrfs: add explicit check for replace result no error

2018-11-07 Thread Anand Jain
We recast the replace return status
BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS to 0, to indicate no
error.
And since BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR should also return 0,
which is also declared as 0, so we just return. Instead add it to the if
statement so that there is enough clarity while reading the code.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index cf3554554616..ca44998189c7 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -533,8 +533,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info 
*fs_info,

args->start.cont_reading_from_srcdev_mode);
args->result = ret;
/* don't warn if EINPROGRESS, someone else might be running scrub */
-   if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS)
-   ret = 0;
+   if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
+   ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
+   return 0;
 
return ret;
 }
-- 
1.8.3.1



[PATCH 0/9] fix replace-start and replace-cancel racing

2018-11-07 Thread Anand Jain
Replace-start and replace-cancel threads can race to create a messy
situation leading to UAF. We use the scrub code to write
the blocks on the replace target. So if we haven't have set the
replace-scrub-running yet, without this patch we just ignore the error
and free the target device. When this happens the system panics with
UAF error.

Its nice to see that btrfs_dev_replace_finishing() already handles
the ECANCELED (replace canceled) situation, but for an unknown reason
we aren't using it to cleanup the replace cancel situation, instead
we just let the replace cancel ioctl thread to cleanup the target
device and return and out of synchronous with the scrub code.

This patch 4/9, 5/9 and 6/9 uses the return code of btrfs_scrub_cancel()
to check if the scrub was really running. And if its not then shall
return an error to the user (replace not started error) so that user
can retry replace cancel. And uses btrfs_dev_replace_finishing() code
to cleanup after successful cancel of the replace scrub.

Further, a suspended replace, when tries to restart, and if it fails
(for example target device missing, or excl ops running) it goes to the
started state, and so the cli 'btrfs replace status /mnt' hangs with no
progress. So patches 2/9 and 3/9 fixes that.

As the originals code idea of ECANCELED was limited to the situation of
the error only and not user requested, there are unnecessary error log
and warn log which 7/9 and 8/9 patches fixes.

Patches 1/9 and 9/9 are good to have fixes. Makes a function static and
code readability good.

Testing: (I did some attempt to convert these into xfstests but need a
mechanism where kernel thread can wait for user land script. I thought
I could do it using ebfp, but needs more digging on how).
As of now hand tested with using procfs to hold kernel thread at
(wait_for_user(..)) until user land issues go.


1.
umount /btrfs; wipefs -a /dev/sd[b-f] && mkfs.btrfs -fq /dev/sdb && mount 
/dev/sdb /btrfs && fillfs /btrfs 1
btrfs replace start /dev/sdb /dev/sdc /btrfs
  wait_for_user("scrub running is set..waiting"); AND OR
  wait_for_user("scrub running is NOT set..waiting");
btrfs replace cancel /btrfs
  wait_for_user_go();
btrfs replace status /btrfs

2.
umount /btrfs; wipefs -a /dev/sd[b-f] && mkfs.btrfs -fq /dev/sdb && mount 
/dev/sdb /btrfs && fillfs /btrfs 1
btrfs replace start /dev/sdb /dev/sdc /btrfs
  wait_for_user("scrub running is set..waiting"); AND OR
  wait_for_user("scrub running is NOT set..waiting");
reboot
mount -o device=/dev/sdc /dev/sdb /btrfs
  wait_for_user_go();
btrfs replace status /btrfs
btrfs replace cancel /btrfs
btrfs replace status /btrfs

3.
umount /btrfs; wipefs -a /dev/sd[b-f] && mkfs.btrfs -fq /dev/sdb && mount 
/dev/sdb /btrfs && fillfs /btrfs 1
btrfs replace start /dev/sdb /dev/sdc /btrfs
  wait_for_user("scrub running is set..waiting"); AND OR
  wait_for_user("scrub running is NOT set..waiting");
reboot
mount -o degraded /dev/sdb /btrfs
btrfs replace status /btrfs
btrfs replace cancel /btrfs
btrfs replace status /btrfs
umount /btrfs
mount /dev/sdb /btrfs


Anand Jain (9):
  btrfs: mark btrfs_dev_replace_start() as static
  btrfs: replace go back to suspended if target missing
  btrfs: replace back to suspend state if EXCL OP is running
  btrfs: fix UAF due to race between replace start and cancel
  btrfs: replace cancel is successful if scrub cancel is successful
  btrfs: replace's scrub must not be running in replace suspended state
  btrfs: quiten warn if the replace is canceled at finish
  btrfs: user requsted replace cancel is not an error
  btrfs: add explicit check for replace result no error

 fs/btrfs/dev-replace.c | 90 ++
 fs/btrfs/dev-replace.h |  3 --
 2 files changed, 62 insertions(+), 31 deletions(-)

-- 
1.8.3.1



[PATCH 5/9] btrfs: replace cancel is successful if scrub cancel is successful

2018-11-07 Thread Anand Jain
In btrfs_dev_replace_cancel() we should check if the
btrfs_scrub_cancel() is successful. If the btrfs_scrub_cancel() fails, return
BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED so that user can try to
cancel the replace again.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 22 +-
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 90c124edec76..c092ed559714 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -799,18 +799,22 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info 
*fs_info)
btrfs_dev_replace_write_unlock(dev_replace);
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
-   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
btrfs_dev_replace_write_unlock(dev_replace);
-   btrfs_scrub_cancel(fs_info);
-   /*
-* btrfs_dev_replace_finishing() will handle the cleanup part
-*/
-   btrfs_info_in_rcu(fs_info,
-   "dev_replace from %s (devid %llu) to %s canceled",
-   btrfs_dev_name(src_device), src_device->devid,
-   btrfs_dev_name(tgt_device));
+   ret = btrfs_scrub_cancel(fs_info);
+   if (ret) {
+   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+   } else {
+   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+   /*
+* btrfs_dev_replace_finishing() will handle the 
cleanup part
+*/
+   btrfs_info_in_rcu(fs_info,
+   "dev_replace from %s (devid %llu) to %s 
canceled",
+   btrfs_dev_name(src_device), src_device->devid,
+   btrfs_dev_name(tgt_device));
+   }
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
/*
-- 
1.8.3.1



[PATCH 2/9] btrfs: replace go back to suspended if target missing

2018-11-07 Thread Anand Jain
At the time of forced unmount we place the running replace to
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state, so when the system comes
back and suppose the target device is missing, then let the replace
state continue to be in BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state
instead of BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED as there isn't any
matching scrub running as part of replace.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 59991165e126..47d6768a9cde 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -884,6 +884,9 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info 
*fs_info)
   "cannot continue dev_replace, tgtdev is missing");
btrfs_info(fs_info,
   "you may cancel the operation after 'mount -o 
degraded'");
+   dev_replace->replace_state =
+   BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+   dev_replace->item_needs_writeback = 1;
btrfs_dev_replace_write_unlock(dev_replace);
return 0;
}
-- 
1.8.3.1



[PATCH 1/9] btrfs: mark btrfs_dev_replace_start() as static

2018-11-07 Thread Anand Jain
There isn't any other consumer other than in its own file dev-replace.c.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 2 +-
 fs/btrfs/dev-replace.h | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2aa48aecc52b..59991165e126 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -390,7 +390,7 @@ static char* btrfs_dev_name(struct btrfs_device *device)
return rcu_str_deref(device->name);
 }
 
-int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
+static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
int read_src)
 {
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 795c551f5b5e..27e3bb0cca11 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -13,9 +13,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
  struct btrfs_fs_info *fs_info);
 int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
-int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
-   const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
-   int read_src);
 void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
  struct btrfs_ioctl_dev_replace_args *args);
 int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
-- 
1.8.3.1



[PATCH 4/9] btrfs: fix UAF due to race between replace start and cancel

2018-11-07 Thread Anand Jain
replace cancel thread can race with the replace start thread and if
fs_info::scrubs_running is not yet set the btrfs_scrub_cancel() will fail
to stop the scrub thread, so the scrub thread continue with the scrub for
replace which then shall try to write to the target device and which is
already freed by the cancel thread. Please ref to the logs below.

So scrub_setup_ctx() warns as tgtdev is null.

static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int
is_dev_replace)
{
::
if (is_dev_replace) {
WARN_ON(!fs_info->dev_replace.tgtdev);  <===
sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
sctx->flush_all_writes = false;
}

[ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to 
/dev/sdc started
[ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to 
/dev/sdc canceled
[ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 
scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
::
[ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
::
[ 6852.432970] Call Trace:
[ 6852.433202]  btrfs_scrub_dev+0x19b/0x5c0 [btrfs]
[ 6852.433471]  btrfs_dev_replace_start+0x48c/0x6a0 [btrfs]
[ 6852.433800]  btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs]
[ 6852.434097]  btrfs_ioctl+0x2476/0x2d20 [btrfs]
[ 6852.434365]  ? do_sigaction+0x7d/0x1e0
[ 6852.434623]  do_vfs_ioctl+0xa9/0x6c0
[ 6852.434865]  ? syscall_trace_enter+0x1c8/0x310
[ 6852.435124]  ? syscall_trace_enter+0x1c8/0x310
[ 6852.435387]  ksys_ioctl+0x60/0x90
[ 6852.435663]  __x64_sys_ioctl+0x16/0x20
[ 6852.435907]  do_syscall_64+0x50/0x180
[ 6852.436150]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Further, as the replace thread enters the
scrub_write_page_to_dev_replace() without the target device it panics

static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage)
{
::
  bio_set_dev(bio, sbio->dev->bdev); <==

[ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 
00a0
::
[ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs]
[ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260
[btrfs]
::
[ 6929.721430] Call Trace:
[ 6929.721663]  scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs]
[ 6929.721975]  scrub_bio_end_io_worker+0x1af/0x490 [btrfs]
[ 6929.722277]  normal_work_helper+0xf0/0x4c0 [btrfs]
[ 6929.722552]  process_one_work+0x1f4/0x520
[ 6929.722805]  ? process_one_work+0x16e/0x520
[ 6929.723063]  worker_thread+0x46/0x3d0
[ 6929.723313]  kthread+0xf8/0x130
[ 6929.723544]  ? process_one_work+0x520/0x520
[ 6929.723800]  ? kthread_delayed_work_timer_fn+0x80/0x80
[ 6929.724081]  ret_from_fork+0x3a/0x50

Fix this by letting the btrfs_dev_replace_finishing() to do the job of
cleaning after the cancel, including freeing of the target device.
btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns
along with the scrub return status.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 61 --
 1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e001c2418940..90c124edec76 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -797,39 +797,56 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info 
*fs_info)
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
btrfs_dev_replace_write_unlock(dev_replace);
-   goto leave;
+   break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+   tgt_device = dev_replace->tgtdev;
+   src_device = dev_replace->srcdev;
+   btrfs_dev_replace_write_unlock(dev_replace);
+   btrfs_scrub_cancel(fs_info);
+   /*
+* btrfs_dev_replace_finishing() will handle the cleanup part
+*/
+   btrfs_info_in_rcu(fs_info,
+   "dev_replace from %s (devid %llu) to %s canceled",
+   btrfs_dev_name(src_device), src_device->devid,
+   btrfs_dev_name(tgt_device));
+   break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+   /*
+* scrub doing the replace isn't running so do the cleanup here
+*/
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
dev_replace->tgtdev = NULL;
dev_replace->srcdev = NULL;
-   break;
-   }
-   dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
-   dev_replace->time_stopped = 

[PATCH 3/9] btrfs: replace back to suspend state if EXCL OP is running

2018-11-07 Thread Anand Jain
In a secnario where balance and replace co-exists as below,
  start balance; pause balance; start replace; reboot
and when system restarts balance restarts first and the
replace restart will fail as EXCL OP lock is already held by
the balance. If so place the replace state back to
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 47d6768a9cde..e001c2418940 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -898,6 +898,11 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info 
*fs_info)
 * dev-replace to start anyway.
 */
if (test_and_set_bit(BTRFS_FS_EXCL_OP, _info->flags)) {
+   btrfs_dev_replace_write_lock(dev_replace);
+   dev_replace->replace_state =
+   BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+   dev_replace->item_needs_writeback = 1;
+   btrfs_dev_replace_write_unlock(dev_replace);
btrfs_info(fs_info,
"cannot resume dev-replace, other exclusive operation running");
return 0;
-- 
1.8.3.1



Re: [PATCH 09/13] btrfs-progs: Revert "btrfs-progs: Record orphan data extent ref to corresponding root."

2018-11-07 Thread Qu Wenruo



On 2018/11/7 下午5:09, Su Yanjun  wrote:
> 
> 
> On 10/24/2018 8:29 AM, Qu Wenruo wrote:
>>
>> On 2018/10/23 下午5:41, Su Yue wrote:
>>> From: Su Yanjun 
>>>
>>> The reason for revert is that according to the existing situation, the
>>> probability of problem in the extent tree is higher than in the fs Tree.
>>> So this feature should be removed.
>>>
>> The same problem as previous patch.
>>
>> We need an example on how such repair could lead to further corruption.
>>
>> Thanks,
>> Qu
> 
> Firstly In record_orphan_data_extents() function:
> 
> key.objectid = dback->owner;
> key.type = BTRFS_EXTENT_DATA_KEY;
> key.offset = dback->offset;//
> ret = btrfs_search_slot(NULL, dest_root, , , 0, 0);
> 
> 'dback->offset' is wrong use here.
> 
> Secondly when disk bytenr in file extent item is wrong, the file extent
> data is always inserted to fs tree which will lead to fs check failed.

What about an example like:

Before:
 ( EXTENT_DATA )
   #And some description about missing INODE_ITEM

After repair:
 ( EXTENT_DATA )
  # Then describe what's wrong.

IMHO, with a format similar to inspect tree, along with some
description, it will be much easier for us to understand why the old
repair is causing more problem.

Thanks,
Qu

> 
> Thanks,
> Su
> 
>>> Signed-off-by: Su Yanjun 
>>> ---
>>>   check/main.c  | 120 +-
>>>   check/mode-original.h |  17 --
>>>   ctree.h   |  10 
>>>   disk-io.c |   1 -
>>>   4 files changed, 1 insertion(+), 147 deletions(-)
>>>
>>> diff --git a/check/main.c b/check/main.c
>>> index 268de5dd5f26..bd1f322e0f12 100644
>>> --- a/check/main.c
>>> +++ b/check/main.c
>>> @@ -511,22 +511,6 @@ cleanup:
>>>   return ERR_PTR(ret);
>>>   }
>>>   -static void print_orphan_data_extents(struct list_head
>>> *orphan_extents,
>>> -  u64 objectid)
>>> -{
>>> -    struct orphan_data_extent *orphan;
>>> -
>>> -    if (list_empty(orphan_extents))
>>> -    return;
>>> -    printf("The following data extent is lost in tree %llu:\n",
>>> -   objectid);
>>> -    list_for_each_entry(orphan, orphan_extents, list) {
>>> -    printf("\tinode: %llu, offset:%llu, disk_bytenr: %llu,
>>> disk_len: %llu\n",
>>> -   orphan->objectid, orphan->offset, orphan->disk_bytenr,
>>> -   orphan->disk_len);
>>> -    }
>>> -}
>>> -
>>>   static void print_inode_error(struct btrfs_root *root, struct
>>> inode_record *rec)
>>>   {
>>>   u64 root_objectid = root->root_key.objectid;
>>> @@ -578,9 +562,6 @@ static void print_inode_error(struct btrfs_root
>>> *root, struct inode_record *rec)
>>>   if (errors & I_ERR_INLINE_RAM_BYTES_WRONG)
>>>   fprintf(stderr, ", invalid inline ram bytes");
>>>   fprintf(stderr, "\n");
>>> -    /* Print the orphan extents if needed */
>>> -    if (errors & I_ERR_FILE_EXTENT_ORPHAN)
>>> -    print_orphan_data_extents(>orphan_extents,
>>> root->objectid);
>>>     /* Print the holes if needed */
>>>   if (errors & I_ERR_FILE_EXTENT_DISCOUNT) {
>>> @@ -683,18 +664,6 @@ static struct inode_record *get_inode_rec(struct
>>> cache_tree *inode_cache,
>>>   return rec;
>>>   }
>>>   -static void free_orphan_data_extents(struct list_head
>>> *orphan_extents)
>>> -{
>>> -    struct orphan_data_extent *orphan;
>>> -
>>> -    while (!list_empty(orphan_extents)) {
>>> -    orphan = list_entry(orphan_extents->next,
>>> -    struct orphan_data_extent, list);
>>> -    list_del(>list);
>>> -    free(orphan);
>>> -    }
>>> -}
>>> -
>>>   static void free_inode_rec(struct inode_record *rec)
>>>   {
>>>   struct inode_backref *backref;
>>> @@ -707,7 +676,6 @@ static void free_inode_rec(struct inode_record *rec)
>>>   list_del(>list);
>>>   free(backref);
>>>   }
>>> -    free_orphan_data_extents(>orphan_extents);
>>>   free_file_extent_holes(>holes);
>>>   free(rec);
>>>   }
>>> @@ -3286,7 +3254,6 @@ skip_walking:
>>>     free_corrupt_blocks_tree(_blocks);
>>>   root->fs_info->corrupt_blocks = NULL;
>>> -    free_orphan_data_extents(>orphan_data_extents);
>>>   return ret;
>>>   }
>>>   @@ -7137,88 +7104,6 @@ static int find_possible_backrefs(struct
>>> btrfs_fs_info *info,
>>>   return 0;
>>>   }
>>>   -/*
>>> - * Record orphan data ref into corresponding root.
>>> - *
>>> - * Return 0 if the extent item contains data ref and recorded.
>>> - * Return 1 if the extent item contains no useful data ref
>>> - *   On that case, it may contains only shared_dataref or metadata
>>> backref
>>> - *   or the file extent exists(this should be handled by the extent
>>> bytenr
>>> - *   recovery routine)
>>> - * Return <0 if something goes wrong.
>>> - */
>>> -static int record_orphan_data_extents(struct btrfs_fs_info *fs_info,
>>> -  struct extent_record *rec)
>>> -{
>>> -    struct btrfs_key key;
>>> -    struct btrfs_root *dest_root;

Re: [PATCH 09/13] btrfs-progs: Revert "btrfs-progs: Record orphan data extent ref to corresponding root."

2018-11-07 Thread Su Yanjun




On 10/24/2018 8:29 AM, Qu Wenruo wrote:


On 2018/10/23 下午5:41, Su Yue wrote:

From: Su Yanjun 

The reason for revert is that according to the existing situation, the
probability of problem in the extent tree is higher than in the fs Tree.
So this feature should be removed.


The same problem as previous patch.

We need an example on how such repair could lead to further corruption.

Thanks,
Qu


Firstly In record_orphan_data_extents() function:

key.objectid = dback->owner;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = dback->offset;//
ret = btrfs_search_slot(NULL, dest_root, , , 0, 0);

'dback->offset' is wrong use here.

Secondly when disk bytenr in file extent item is wrong, the file extent data is 
always inserted to fs tree which will lead to fs check failed.

Thanks,
Su


Signed-off-by: Su Yanjun 
---
  check/main.c  | 120 +-
  check/mode-original.h |  17 --
  ctree.h   |  10 
  disk-io.c |   1 -
  4 files changed, 1 insertion(+), 147 deletions(-)

diff --git a/check/main.c b/check/main.c
index 268de5dd5f26..bd1f322e0f12 100644
--- a/check/main.c
+++ b/check/main.c
@@ -511,22 +511,6 @@ cleanup:
return ERR_PTR(ret);
  }
  
-static void print_orphan_data_extents(struct list_head *orphan_extents,

- u64 objectid)
-{
-   struct orphan_data_extent *orphan;
-
-   if (list_empty(orphan_extents))
-   return;
-   printf("The following data extent is lost in tree %llu:\n",
-  objectid);
-   list_for_each_entry(orphan, orphan_extents, list) {
-   printf("\tinode: %llu, offset:%llu, disk_bytenr: %llu, disk_len: 
%llu\n",
-  orphan->objectid, orphan->offset, orphan->disk_bytenr,
-  orphan->disk_len);
-   }
-}
-
  static void print_inode_error(struct btrfs_root *root, struct inode_record 
*rec)
  {
u64 root_objectid = root->root_key.objectid;
@@ -578,9 +562,6 @@ static void print_inode_error(struct btrfs_root *root, 
struct inode_record *rec)
if (errors & I_ERR_INLINE_RAM_BYTES_WRONG)
fprintf(stderr, ", invalid inline ram bytes");
fprintf(stderr, "\n");
-   /* Print the orphan extents if needed */
-   if (errors & I_ERR_FILE_EXTENT_ORPHAN)
-   print_orphan_data_extents(>orphan_extents, root->objectid);
  
  	/* Print the holes if needed */

if (errors & I_ERR_FILE_EXTENT_DISCOUNT) {
@@ -683,18 +664,6 @@ static struct inode_record *get_inode_rec(struct 
cache_tree *inode_cache,
return rec;
  }
  
-static void free_orphan_data_extents(struct list_head *orphan_extents)

-{
-   struct orphan_data_extent *orphan;
-
-   while (!list_empty(orphan_extents)) {
-   orphan = list_entry(orphan_extents->next,
-   struct orphan_data_extent, list);
-   list_del(>list);
-   free(orphan);
-   }
-}
-
  static void free_inode_rec(struct inode_record *rec)
  {
struct inode_backref *backref;
@@ -707,7 +676,6 @@ static void free_inode_rec(struct inode_record *rec)
list_del(>list);
free(backref);
}
-   free_orphan_data_extents(>orphan_extents);
free_file_extent_holes(>holes);
free(rec);
  }
@@ -3286,7 +3254,6 @@ skip_walking:
  
  	free_corrupt_blocks_tree(_blocks);

root->fs_info->corrupt_blocks = NULL;
-   free_orphan_data_extents(>orphan_data_extents);
return ret;
  }
  
@@ -7137,88 +7104,6 @@ static int find_possible_backrefs(struct btrfs_fs_info *info,

return 0;
  }
  
-/*

- * Record orphan data ref into corresponding root.
- *
- * Return 0 if the extent item contains data ref and recorded.
- * Return 1 if the extent item contains no useful data ref
- *   On that case, it may contains only shared_dataref or metadata backref
- *   or the file extent exists(this should be handled by the extent bytenr
- *   recovery routine)
- * Return <0 if something goes wrong.
- */
-static int record_orphan_data_extents(struct btrfs_fs_info *fs_info,
- struct extent_record *rec)
-{
-   struct btrfs_key key;
-   struct btrfs_root *dest_root;
-   struct extent_backref *back, *tmp;
-   struct data_backref *dback;
-   struct orphan_data_extent *orphan;
-   struct btrfs_path path;
-   int recorded_data_ref = 0;
-   int ret = 0;
-
-   if (rec->metadata)
-   return 1;
-   btrfs_init_path();
-   rbtree_postorder_for_each_entry_safe(back, tmp,
->backref_tree, node) {
-   if (back->full_backref || !back->is_data ||
-   !back->found_extent_tree)
-   continue;
-   dback = to_data_backref(back);
-   if (dback->found_ref)
-