Re: [PATCH RFC ver.B] btrfs: scrub: Don't use inode pages for device replace

2018-06-08 Thread David Sterba
On Tue, Jun 05, 2018 at 10:14:51PM +0800, Qu Wenruo wrote:
> 
> 
> On 2018年06月05日 22:07, David Sterba wrote:
> > On Tue, Jun 05, 2018 at 09:47:46PM +0800, Qu Wenruo wrote:
> >>
> >>
> >> On 2018年06月05日 21:42, David Sterba wrote:
> >>> On Tue, Jun 05, 2018 at 01:34:03PM +0800, Qu Wenruo wrote:
> >>>> Hi David,
> >>>>
> >>>> It would be pretty nice if we could get this fix (or previous RFC patch)
> >>>> into the current release cycle.
> >>>>
> >>>> As it's an unrecoverable data corruption, it would be better to get it
> >>>> fixed as soon as possible.
> >>>
> >>> That we can do, I'm planning to send 2nd pull by the end of the next
> >>> week as there's at least one patch in the queue now.
> >>>
> >>> This patch seems too big, can you please prepare a minimal version?
> >>
> >> The previous version (a completely different direction though) is much
> >> smaller.
> >> https://patchwork.kernel.org/patch/10440541/
> >>
> >> However personally speaking, I still prefer this one, as it's much simpler.
> > 
> > As this will go to older stable kernels, I'd rather split that to more
> > patches where the first one is
> > 
> > --- a/fs/btrfs/scrub.c
> > +++ b/fs/btrfs/scrub.c
> > @@ -2799,7 +2799,7 @@ static int scrub_extent(struct scrub_ctx *sctx, 
> > struct map_lookup *map,
> > have_csum = scrub_find_csum(sctx, logical, csum);
> > if (have_csum == 0)
> > ++sctx->stat.no_csum;
> > -   if (sctx->is_dev_replace && !have_csum) {
> > +   if (0 && sctx->is_dev_replace && !have_csum) {
> > ret = copy_nocow_pages(sctx, logical, l,
> >mirror_num,
> >   
> > physical_for_dev_replace);
> > ---
> > 
> > and then the whole callchain of copy_nocow_pages continues.
> 
> Understood.
> I could go with this method.

FYI, I'd need to send the 2nd pull request on Tuesday so I'm adding the
proposed fix with the current changelog to the queue now.

https://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git/commit/?h=next-fixes&id=8c83e0b1b20b094491bec6c52839aa3596a87f03
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] btrfs-progs: Add delayed refs infrastructure

2018-06-08 Thread Nikolay Borisov
This commit pulls those portions of the kernel implementation of
delayed refs which are necessary to have them working in user-space.
I've done the following modifications:

1. Replaced all kmem_cache_alloc calls with kmalloc.

2. Removed all locking-related code, since we are single threaded in
userspace.

3. Removed code which deals with data refs - delayed refs in user space
are going to be used only for cowonly trees.

Signed-off-by: Nikolay Borisov 
---

V2: 
 * removed definitions of delayed data ref structure. 

 Makefile  |   3 +-
 ctree.h   |   3 +
 delayed-ref.c | 608 ++
 delayed-ref.h | 210 
 extent-tree.c | 228 ++
 kerncompat.h  |   8 +
 transaction.h |   4 +
 7 files changed, 1063 insertions(+), 1 deletion(-)
 create mode 100644 delayed-ref.c
 create mode 100644 delayed-ref.h

diff --git a/Makefile b/Makefile
index 544410e6440c..9508ad4f11e6 100644
--- a/Makefile
+++ b/Makefile
@@ -116,7 +116,8 @@ objects = ctree.o disk-io.o kernel-lib/radix-tree.o 
extent-tree.o print-tree.o \
  qgroup.o free-space-cache.o kernel-lib/list_sort.o props.o \
  kernel-shared/ulist.o qgroup-verify.o backref.o string-table.o 
task-utils.o \
  inode.o file.o find-root.o free-space-tree.o help.o send-dump.o \
- fsfeatures.o kernel-lib/tables.o kernel-lib/raid56.o transaction.o
+ fsfeatures.o kernel-lib/tables.o kernel-lib/raid56.o transaction.o \
+ delayed-ref.o
 cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \
   cmds-inspect.o cmds-balance.o cmds-send.o cmds-receive.o \
   cmds-quota.o cmds-qgroup.o cmds-replace.o check/main.o \
diff --git a/ctree.h b/ctree.h
index b30a946658ce..d1ea45571d1e 100644
--- a/ctree.h
+++ b/ctree.h
@@ -2812,4 +2812,7 @@ int btrfs_punch_hole(struct btrfs_trans_handle *trans,
 int btrfs_read_file(struct btrfs_root *root, u64 ino, u64 start, int len,
char *dest);
 
+
+/* extent-tree.c */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long nr);
 #endif
diff --git a/delayed-ref.c b/delayed-ref.c
new file mode 100644
index ..f3fa50239380
--- /dev/null
+++ b/delayed-ref.c
@@ -0,0 +1,608 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ */
+
+#include "ctree.h"
+#include "btrfs-list.h"
+#include "delayed-ref.h"
+#include "transaction.h"
+
+/*
+ * delayed back reference update tracking.  For subvolume trees
+ * we queue up extent allocations and backref maintenance for
+ * delayed processing.   This avoids deep call chains where we
+ * add extents in the middle of btrfs_search_slot, and it allows
+ * us to buffer up frequently modified backrefs in an rb tree instead
+ * of hammering updates on the extent allocation tree.
+ */
+
+/*
+ * compare two delayed tree backrefs with same bytenr and type
+ */
+static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
+ struct btrfs_delayed_tree_ref *ref2)
+{
+   if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
+   if (ref1->root < ref2->root)
+   return -1;
+   if (ref1->root > ref2->root)
+   return 1;
+   } else {
+   if (ref1->parent < ref2->parent)
+   return -1;
+   if (ref1->parent > ref2->parent)
+   return 1;
+   }
+   return 0;
+}
+
+static int comp_refs(struct btrfs_delayed_ref_node *ref1,
+struct btrfs_delayed_ref_node *ref2,
+bool check_seq)
+{
+   int ret = 0;
+
+   if (ref1->type < ref2->type)
+   return -1;
+   if (ref1->type > ref2->type)
+   return 1;
+   if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
+   ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
+   ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
+btrfs_delayed_node_to_tree_ref(ref2));
+   else
+   BUG();
+
+   if (ret)
+   return ret;
+   if (check_seq) {
+   if (ref1->seq < ref2->seq)
+   return -1;
+   if (ref1->seq > ref2->seq)
+   return 1;
+   }
+   return 0;
+}
+
+/* insert a new ref to head ref rbtree */
+static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
+  struct rb_node *node)
+{
+   struct rb_node **p = &root->rb_node;
+   struct rb_node *parent_node = NULL;
+   struct btrfs_delayed_ref_head *entry;
+   struct btrfs_delayed_ref_head *ins;
+   u64 bytenr;
+
+   ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
+   bytenr = ins->bytenr;
+   while (*p) {
+   parent_node = *p;
+   entry = rb_entry(parent_node, struct 

[PATCH v2] btrfs-progs: Remove old delayed refs infrastructure

2018-06-08 Thread Nikolay Borisov
Given that the new delayed refs infrastructure is implemented and
wired up, there is no point in keeping the old code. So just remove it.

Signed-off-by: Nikolay Borisov 
---

V2: 

 * Remove fs_info->pending_del references in disk-io.c . This prevented 
 compilation. 

 ctree.h   |   2 -
 disk-io.c |   2 -
 extent-tree.c | 137 --
 3 files changed, 141 deletions(-)

diff --git a/ctree.h b/ctree.h
index d1ea45571d1e..3e9ca2ca8432 100644
--- a/ctree.h
+++ b/ctree.h
@@ -1098,7 +1098,6 @@ struct btrfs_fs_info {
struct extent_io_tree free_space_cache;
struct extent_io_tree block_group_cache;
struct extent_io_tree pinned_extents;
-   struct extent_io_tree pending_del;
struct extent_io_tree extent_ins;
struct extent_io_tree *excluded_extents;
 
@@ -2503,7 +2502,6 @@ int btrfs_fix_block_accounting(struct btrfs_trans_handle 
*trans);
 void btrfs_pin_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 
num_bytes);
 void btrfs_unpin_extent(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes);
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 btrfs_fs_info *info,
 u64 bytenr);
diff --git a/disk-io.c b/disk-io.c
index 4a609a892be7..8da6e3ce5fc8 100644
--- a/disk-io.c
+++ b/disk-io.c
@@ -726,7 +726,6 @@ struct btrfs_fs_info *btrfs_new_fs_info(int writable, u64 
sb_bytenr)
extent_io_tree_init(&fs_info->free_space_cache);
extent_io_tree_init(&fs_info->block_group_cache);
extent_io_tree_init(&fs_info->pinned_extents);
-   extent_io_tree_init(&fs_info->pending_del);
extent_io_tree_init(&fs_info->extent_ins);
fs_info->excluded_extents = NULL;
 
@@ -984,7 +983,6 @@ void btrfs_cleanup_all_caches(struct btrfs_fs_info *fs_info)
extent_io_tree_cleanup(&fs_info->free_space_cache);
extent_io_tree_cleanup(&fs_info->block_group_cache);
extent_io_tree_cleanup(&fs_info->pinned_extents);
-   extent_io_tree_cleanup(&fs_info->pending_del);
extent_io_tree_cleanup(&fs_info->extent_ins);
 }
 
diff --git a/extent-tree.c b/extent-tree.c
index 9d085158f2d8..b9d51b388c9a 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -52,8 +52,6 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 u64 bytenr, u64 num_bytes, u64 parent,
 u64 root_objectid, u64 owner_objectid,
 u64 owner_offset, int refs_to_drop);
-static int finish_current_insert(struct btrfs_trans_handle *trans);
-static int del_pending_extents(struct btrfs_trans_handle *trans);
 static struct btrfs_block_group_cache *
 btrfs_find_block_group(struct btrfs_root *root, struct btrfs_block_group_cache
   *hint, u64 search_start, int data, int owner);
@@ -1422,13 +1420,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle 
*trans,
return err;
 }
 
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans)
-{
-   finish_current_insert(trans);
-   del_pending_extents(trans);
-   return 0;
-}
-
 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 struct btrfs_fs_info *fs_info, u64 bytenr,
 u64 offset, int metadata, u64 *refs, u64 *flags)
@@ -2013,74 +2004,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle 
*trans,
return 0;
 }
 
-static int extent_root_pending_ops(struct btrfs_fs_info *info)
-{
-   u64 start;
-   u64 end;
-   int ret;
-
-   ret = find_first_extent_bit(>extent_ins, 0, ,
-   , EXTENT_LOCKED);
-   if (!ret) {
-   ret = find_first_extent_bit(>pending_del, 0, , ,
-   EXTENT_LOCKED);
-   }
-   return ret == 0;
-
-}
-static int finish_current_insert(struct btrfs_trans_handle *trans)
-{
-   u64 start;
-   u64 end;
-   u64 priv;
-   struct btrfs_fs_info *info = trans->fs_info;
-   struct btrfs_root *extent_root = info->extent_root;
-   struct pending_extent_op *extent_op;
-   struct btrfs_key key;
-   int ret;
-   int skinny_metadata =
-   btrfs_fs_incompat(extent_root->fs_info, SKINNY_METADATA);
-
-
-   while(1) {
-   ret = find_first_extent_bit(>extent_ins, 0, ,
-   , EXTENT_LOCKED);
-   if (ret)
-   break;
-
-   ret = get_state_private(>extent_ins, start, );
-   BUG_ON(ret);
-   extent_op = (struct pending_extent_op *)(unsigned long)priv;
-
-   if (extent_op->type == PENDING_EXTENT_INSERT) {
-   key.objectid = start;
-   if (skinny_metadata) {
-   

Re: [PATCH 00/15] Add delayed-refs support to btrfs-progs

2018-06-08 Thread Qu Wenruo



On 2018年06月08日 22:08, Nikolay Borisov wrote:
> 
> 
> On  8.06.2018 16:50, Qu Wenruo wrote:
>> Would it be possible to explain this in detail?
>> Personally speaking, I'd like to avoid introducing complex delayed-ref
>> into btrfs-progs if possible.
>>
>> And in my (possibly wrong) understanding, the main purpose of
>> delayed-ref is to reduce the race on extent tree, thus to improve
>> performance.
>> However in btrfs-progs, it's the least important aspect.
>>
>> So extra comment on this is appreciated.
> 
> So in order to have freespace tree repair code working I needed to hook
> up its add_to_free_space_tree/remove_from_free_space_tree to
> alloc_reserved_tree_block/__free_extent. In my testing this led to a
> very deep recursion - it crashed on 58k call frames. So the idea was to
> have delayed refs which would record and accumulate modifications and
> then the freespace tree freeing code would piggy back on them to rely on
> correct operation.

In fact, I have a pretty nasty idea on this problem.
Mark one or more metadata chunks without free space tree cache.

Then at least recursion could be easily resolved (although need extra
extent allocation hook to handle fst allocation)

> 
> I guess I could try and debug the freespace code and see why I was going
> into this infinite recursion so to speak.
> 
> Also the delayed refs code in progs is actually a lot simpler than the
> kernel counterpart due to the lack of locking.

Right.
And not needing to do async delayed ref execution should also make
things easier.

> One more benefit of
> having this code in progs is the fact one can go through it with a
> debugger and really inspect/understand how it works

Indeed, this makes a lot of sense.

I'll take some time to do more review on this patchset, and dig deeper
into delayed-ref facility.

Thanks,
Qu

> - i.e addition of
> refs, selection of refs etc. Furthermore, it at least unifies the logic
> between kernel and userspace, since right now there is code which mimics
> the delayed refs - check the code being removed in the last patch.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 00/15] Add delayed-refs support to btrfs-progs

2018-06-08 Thread Nikolay Borisov



On  8.06.2018 16:50, Qu Wenruo wrote:
> Would it be possible to explain this in detail?
> Personally speaking, I'd like to avoid introducing complex delayed-ref
> into btrfs-progs if possible.
> 
> And in my (possibly wrong) understanding, the main purpose of
> delayed-ref is to reduce the race on extent tree, thus to improve
> performance.
> However in btrfs-progs, it's the least important aspect.
> 
> So extra comment on this is appreciated.

So in order to have freespace tree repair code working I needed to hook
up its add_to_free_space_tree/remove_from_free_space_tree to
alloc_reserved_tree_block/__free_extent. In my testing this led to a
very deep recursion - it crashed on 58k call frames. So the idea was to
have delayed refs which would record and accumulate modifications and
then the freespace tree freeing code would piggy back on them to rely on
correct operation.

I guess I could try and debug the freespace code and see why I was going
into this infinite recursion so to speak.

Also the delayed refs code in progs is actually a lot simpler than the
kernel counterpart due to the lack of locking. One more benefit of
having this code in progs is the fact one can go through it with a
debugger and really inspect/understand how it works - i.e. addition of
refs, selection of refs etc. Furthermore, it at least unifies the logic
between kernel and userspace, since right now there is code which mimics
the delayed refs - check the code being removed in the last patch.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 00/15] Add delayed-refs support to btrfs-progs

2018-06-08 Thread Qu Wenruo



On 2018年06月08日 20:47, Nikolay Borisov wrote:
> Hello,
>   
>   
>   
> Here is a series which adds support for delayed refs. This is needed to 
> enable  
> later work on adding freespace tree repair code.

Would it be possible to explain this in detail?
Personally speaking, I'd like to avoid introducing complex delayed-ref
into btrfs-progs if possible.

And in my (possibly wrong) understanding, the main purpose of
delayed-ref is to reduce the race on extent tree, thus to improve
performance.
However in btrfs-progs, it's the least important aspect.

So extra comment on this is appreciated.

Thanks,
Qu

> Additionally, it results in  
> more code sharing between kernel/user space.
> 
> Patches 1-9 are simple prep patches removing some arguments, causing problems
> later. They can go independently of the delayed refs work. They don't 
> introduce
> any functional changes. Next, patches 10-13 introduce the needed 
> infrastructure
> for delayed refs without actually activating it. Patch 14 finally wires it
> up by adding the necessary call outs to btrfs_run_delayed_refs and reworking
> the
> extent addition/freeing functions. With all of this done, patch 15 finally
> removes the old code.
> 
> This series passes all btrfs progs fsck and misc tests + fuzz tests apart from
> fuzz-003/007/009 - but those fail without this series so it's unlikely it's
> caused by it.
> 
> Nikolay Borisov (15):
>   btrfs-progs: Remove root argument from pin_down_bytes
>   btrfs-progs: Remove root argument from btrfs_del_csums
>   btrfs-progs: Add functions to modify the used space by a root
>   btrfs-progs: Refactor the root used bytes are updated
>   btrfs-progs: Make update_block_group take fs_info instead of root
>   btrfs-progs: check: Drop trans/root arguments from free_extent_hook
>   btrfs-progs: Remove root argument from __free_extent
>   btrfs-progs: Remove root argument from alloc_reserved_tree_block
>   btrfs-progs: Always pass 0 for offset when calling btrfs_free_extent
> for btree blocks.
>   btrfs-progs: Add boolean to signal whether we are re-initing extent
> tree
>   btrfs-progs: Add delayed refs infrastructure
>   btrfs-progs: Add __free_extent2 function
>   btrfs-progs: Add alloc_reserved_tree_block2 function
>   btrfs-progs: Wire up delayed refs
>   btrfs-progs: Remove old delayed refs infrastructure
> 
>  Makefile  |   3 +-
>  btrfs-corrupt-block.c |   2 +-
>  check/main.c  |   8 +-
>  ctree.c   |  29 ++-
>  ctree.h   |  11 +-
>  delayed-ref.c | 608 
> ++
>  delayed-ref.h | 225 +++
>  extent-tree.c | 604 +
>  file-item.c   |  20 +-
>  kerncompat.h  |   8 +
>  transaction.c |  25 +++
>  transaction.h |   5 +
>  12 files changed, 1280 insertions(+), 268 deletions(-)
>  create mode 100644 delayed-ref.c
>  create mode 100644 delayed-ref.h
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/15] btrfs-progs: Remove root argument from btrfs_del_csums

2018-06-08 Thread Nikolay Borisov
It's not needed, since we can obtain a reference to fs_info from the
passed transaction handle. This is needed by delayed refs code.

Signed-off-by: Nikolay Borisov 
---
 btrfs-corrupt-block.c |  2 +-
 ctree.h   |  3 +--
 extent-tree.c |  2 +-
 file-item.c   | 20 ++--
 4 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/btrfs-corrupt-block.c b/btrfs-corrupt-block.c
index 4fbea26cda20..3add8e63b7bb 100644
--- a/btrfs-corrupt-block.c
+++ b/btrfs-corrupt-block.c
@@ -926,7 +926,7 @@ static int delete_csum(struct btrfs_root *root, u64 bytenr, 
u64 bytes)
return PTR_ERR(trans);
}
 
-   ret = btrfs_del_csums(trans, root, bytenr, bytes);
+   ret = btrfs_del_csums(trans, bytenr, bytes);
if (ret)
fprintf(stderr, "Error deleting csums %d\n", ret);
btrfs_commit_transaction(trans, root);
diff --git a/ctree.h b/ctree.h
index de4b1b7e6416..082726238b91 100644
--- a/ctree.h
+++ b/ctree.h
@@ -2752,8 +2752,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
u64 ino, u64 parent_ino, u64 *index);
 
 /* file-item.c */
-int btrfs_del_csums(struct btrfs_trans_handle *trans,
-   struct btrfs_root *root, u64 bytenr, u64 len);
+int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 bytenr, u64 len);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 struct btrfs_root *root,
 u64 objectid, u64 pos, u64 offset,
diff --git a/extent-tree.c b/extent-tree.c
index cbc022f6cef6..c6f09b52800f 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -2372,7 +2372,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
 
if (is_data) {
-   ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
+   ret = btrfs_del_csums(trans, bytenr, num_bytes);
BUG_ON(ret);
}
 
diff --git a/file-item.c b/file-item.c
index 7b0ff3585509..71d4e89f78d1 100644
--- a/file-item.c
+++ b/file-item.c
@@ -394,8 +394,7 @@ static noinline int truncate_one_csum(struct btrfs_root 
*root,
  * deletes the csum items from the csum tree for a given
  * range of bytes.
  */
-int btrfs_del_csums(struct btrfs_trans_handle *trans,
-   struct btrfs_root *root, u64 bytenr, u64 len)
+int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 bytenr, u64 len)
 {
struct btrfs_path *path;
struct btrfs_key key;
@@ -403,11 +402,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u64 csum_end;
struct extent_buffer *leaf;
int ret;
-   u16 csum_size =
-   btrfs_super_csum_size(root->fs_info->super_copy);
-   int blocksize = root->fs_info->sectorsize;
+   u16 csum_size = btrfs_super_csum_size(trans->fs_info->super_copy);
+   int blocksize = trans->fs_info->sectorsize;
+   struct btrfs_root *csum_root = trans->fs_info->csum_root;
 
-   root = root->fs_info->csum_root;
 
path = btrfs_alloc_path();
if (!path)
@@ -418,7 +416,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
key.offset = end_byte - 1;
key.type = BTRFS_EXTENT_CSUM_KEY;
 
-   ret = btrfs_search_slot(trans, root, , path, -1, 1);
+   ret = btrfs_search_slot(trans, csum_root, , path, -1, 1);
if (ret > 0) {
if (path->slots[0] == 0)
goto out;
@@ -445,7 +443,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 
/* delete the entire item, it is inside our range */
if (key.offset >= bytenr && csum_end <= end_byte) {
-   ret = btrfs_del_item(trans, root, path);
+   ret = btrfs_del_item(trans, csum_root, path);
BUG_ON(ret);
} else if (key.offset < bytenr && csum_end > end_byte) {
unsigned long offset;
@@ -485,12 +483,14 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 * btrfs_split_item returns -EAGAIN when the
 * item changed size or key
 */
-   ret = btrfs_split_item(trans, root, path, , offset);
+   ret = btrfs_split_item(trans, csum_root, path, ,
+  offset);
BUG_ON(ret && ret != -EAGAIN);
 
key.offset = end_byte - 1;
} else {
-   ret = truncate_one_csum(root, path, , bytenr, len);
+   ret = truncate_one_csum(csum_root, path, , bytenr,
+   len);
BUG_ON(ret);
}
btrfs_release_path(path);
-- 
2.7.4

--
To 

[PATCH 09/15] btrfs-progs: Always pass 0 for offset when calling btrfs_free_extent for btree blocks.

2018-06-08 Thread Nikolay Borisov
Currently some instances of btrfs_free_extent are called with the
last parameter ("offset") being set to 1. This makes no sense, since
offset is used for data extents. I suspect this is a left-over from
95d3f20b51e9 ("Mixed back reference  (FORWARD ROLLING FORMAT CHANGE)")
since this commit changed the signature of the function from :

-int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 ref_generation,
- u64 owner_objectid, int pin);

to

+int btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 owner, u64 offset);

I.e. the last parameter was "pin" and not offset. So these are just
leftovers with no semantic meaning. Fix this by passing 0.

Signed-off-by: Nikolay Borisov 
---
 ctree.c   | 4 ++--
 extent-tree.c | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/ctree.c b/ctree.c
index 8f3338b4693a..d8a6883aa85f 100644
--- a/ctree.c
+++ b/ctree.c
@@ -334,7 +334,7 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(btrfs_header_generation(parent) != trans->transid);
 
btrfs_free_extent(trans, root, buf->start, buf->len,
- 0, root->root_key.objectid, level, 1);
+ 0, root->root_key.objectid, level, 0);
}
if (!list_empty(>recow)) {
list_del_init(>recow);
@@ -738,7 +738,7 @@ static int balance_level(struct btrfs_trans_handle *trans,
 
ret = btrfs_free_extent(trans, root, mid->start, mid->len,
0, root->root_key.objectid,
-   level, 1);
+   level, 0);
/* once for the root ptr */
free_extent_buffer(mid);
return ret;
diff --git a/extent-tree.c b/extent-tree.c
index 079204ed290f..ab57c20d9dee 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -2961,7 +2961,7 @@ static int noinline walk_down_tree(struct 
btrfs_trans_handle *trans,
path->slots[*level]++;
ret = btrfs_free_extent(trans, root, bytenr, blocksize,
parent->start, root_owner,
-   root_gen, *level - 1, 1);
+   root_gen, *level - 1, 0);
BUG_ON(ret);
continue;
}
@@ -3003,7 +3003,7 @@ static int noinline walk_down_tree(struct 
btrfs_trans_handle *trans,
root_gen = btrfs_header_generation(parent);
ret = btrfs_free_extent(trans, root, path->nodes[*level]->start,
path->nodes[*level]->len, parent->start,
-   root_owner, root_gen, *level, 1);
+   root_owner, root_gen, *level, 0);
free_extent_buffer(path->nodes[*level]);
path->nodes[*level] = NULL;
*level += 1;
@@ -3054,7 +3054,7 @@ static int noinline walk_up_tree(struct 
btrfs_trans_handle *trans,
path->nodes[*level]->start,
path->nodes[*level]->len,
parent->start, root_owner,
-   root_gen, *level, 1);
+   root_gen, *level, 0);
BUG_ON(ret);
free_extent_buffer(path->nodes[*level]);
path->nodes[*level] = NULL;
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 14/15] btrfs-progs: Wire up delayed refs

2018-06-08 Thread Nikolay Borisov
This commit enables the delayed refs infrastructure. This entails doing
the following:

1. Replacing existing calls of btrfs_extent_post_op (which is the
equivalent of delayed refs) with the proper btrfs_run_delayed_refs.
As well as eliminating open-coded calls to finish_current_insert and
del_pending_extents which execute the delayed ops.

2. Wiring up the addition of delayed refs when freeing extents
(btrfs_free_extent) and when adding new extents (alloc_tree_block).

3. Adding calls to btrfs_run_delayed_refs in the transaction commit
path alongside comments explaining why every call is needed, since it's not always
obvious (those call sites were derived empirically by running and
debugging existing tests)

4. Correctly flagging the transaction in which we are reinitialising
the extent tree.

Signed-off-by: Nikolay Borisov 
---
 check/main.c  |   3 +-
 extent-tree.c | 166 ++
 transaction.c |  24 +
 3 files changed, 111 insertions(+), 82 deletions(-)

diff --git a/check/main.c b/check/main.c
index b84903acdb25..7c9689f29fd3 100644
--- a/check/main.c
+++ b/check/main.c
@@ -8634,7 +8634,7 @@ static int reinit_extent_tree(struct btrfs_trans_handle 
*trans,
fprintf(stderr, "Error adding block group\n");
return ret;
}
-   btrfs_extent_post_op(trans);
+   btrfs_run_delayed_refs(trans, -1);
}
 
ret = reset_balance(trans, fs_info);
@@ -9682,6 +9682,7 @@ int cmd_check(int argc, char **argv)
goto close_out;
}
 
+   trans->reinit_extent_tree = true;
if (init_extent_tree) {
printf("Creating a new extent tree\n");
ret = reinit_extent_tree(trans, info,
diff --git a/extent-tree.c b/extent-tree.c
index 3208ed11cb91..9d085158f2d8 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -1418,8 +1418,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
err = ret;
 out:
btrfs_free_path(path);
-   finish_current_insert(trans);
-   del_pending_extents(trans);
BUG_ON(err);
return err;
 }
@@ -1602,8 +1600,6 @@ int btrfs_set_block_flags(struct btrfs_trans_handle 
*trans, u64 bytenr,
btrfs_set_extent_flags(l, item, flags);
 out:
btrfs_free_path(path);
-   finish_current_insert(trans);
-   del_pending_extents(trans);
return ret;
 }
 
@@ -1701,7 +1697,6 @@ static int write_one_cache_group(struct 
btrfs_trans_handle *trans,
 struct btrfs_block_group_cache *cache)
 {
int ret;
-   int pending_ret;
struct btrfs_root *extent_root = trans->fs_info->extent_root;
unsigned long bi;
struct extent_buffer *leaf;
@@ -1717,12 +1712,8 @@ static int write_one_cache_group(struct 
btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
 fail:
-   finish_current_insert(trans);
-   pending_ret = del_pending_extents(trans);
if (ret)
return ret;
-   if (pending_ret)
-   return pending_ret;
return 0;
 
 }
@@ -2050,6 +2041,7 @@ static int finish_current_insert(struct 
btrfs_trans_handle *trans)
int skinny_metadata =
btrfs_fs_incompat(extent_root->fs_info, SKINNY_METADATA);
 
+
while(1) {
ret = find_first_extent_bit(>extent_ins, 0, ,
, EXTENT_LOCKED);
@@ -2081,6 +2073,8 @@ static int finish_current_insert(struct 
btrfs_trans_handle *trans)
BUG_ON(1);
}
 
+
+   printf("shouldn't be executed\n");
clear_extent_bits(>extent_ins, start, end, EXTENT_LOCKED);
kfree(extent_op);
}
@@ -2380,7 +2374,6 @@ static int __free_extent(struct btrfs_trans_handle *trans,
}
 fail:
btrfs_free_path(path);
-   finish_current_insert(trans);
return ret;
 }
 
@@ -2463,33 +2456,30 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
  u64 bytenr, u64 num_bytes, u64 parent,
  u64 root_objectid, u64 owner, u64 offset)
 {
-   struct btrfs_root *extent_root = root->fs_info->extent_root;
-   int pending_ret;
int ret;
 
WARN_ON(num_bytes < root->fs_info->sectorsize);
-   if (root == extent_root) {
-   struct pending_extent_op *extent_op;
-
-   extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-   BUG_ON(!extent_op);
-
-   extent_op->type = PENDING_EXTENT_DELETE;
-   extent_op->bytenr = bytenr;
-   extent_op->num_bytes = num_bytes;
-   extent_op->level = (int)owner;
-
-   set_extent_bits(>fs_info->pending_del,
-   bytenr, bytenr + num_bytes - 1,
- 

[PATCH 07/15] btrfs-progs: Remove root argument from __free_extent

2018-06-08 Thread Nikolay Borisov
This argument is no longer used in this function so remove it.

Signed-off-by: Nikolay Borisov 
---
 extent-tree.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/extent-tree.c b/extent-tree.c
index 9132cb3f8e15..c16bd85e92be 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -50,7 +50,6 @@ static int alloc_reserved_tree_block(struct 
btrfs_trans_handle *trans,
 u64 flags, struct btrfs_disk_key *key,
 int level, struct btrfs_key *ins);
 static int __free_extent(struct btrfs_trans_handle *trans,
-struct btrfs_root *root,
 u64 bytenr, u64 num_bytes, u64 parent,
 u64 root_objectid, u64 owner_objectid,
 u64 owner_offset, int refs_to_drop);
@@ -2141,7 +2140,6 @@ void btrfs_unpin_extent(struct btrfs_fs_info *fs_info,
  * remove an extent from the root, returns 0 on success
  */
 static int __free_extent(struct btrfs_trans_handle *trans,
-struct btrfs_root *root,
 u64 bytenr, u64 num_bytes, u64 parent,
 u64 root_objectid, u64 owner_objectid,
 u64 owner_offset, int refs_to_drop)
@@ -2149,7 +2147,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 
struct btrfs_key key;
struct btrfs_path *path;
-   struct btrfs_root *extent_root = root->fs_info->extent_root;
+   struct btrfs_root *extent_root = trans->fs_info->extent_root;
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
@@ -2409,8 +2407,7 @@ static int del_pending_extents(struct btrfs_trans_handle 
*trans)
 
if (!test_range_bit(extent_ins, start, end,
EXTENT_LOCKED, 0)) {
-   ret = __free_extent(trans, extent_root,
-   start, end + 1 - start, 0,
+   ret = __free_extent(trans, start, end + 1 - start, 0,
extent_root->root_key.objectid,
extent_op->level, 0, 1);
kfree(extent_op);
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 15/15] btrfs-progs: Remove old delayed refs infrastructure

2018-06-08 Thread Nikolay Borisov
Given that the new delayed refs infrastructure is implemented and
wired up, there is no point in keeping the old code. So just remove it.

Signed-off-by: Nikolay Borisov 
---
 ctree.h   |   2 -
 extent-tree.c | 137 --
 2 files changed, 139 deletions(-)

diff --git a/ctree.h b/ctree.h
index d1ea45571d1e..3e9ca2ca8432 100644
--- a/ctree.h
+++ b/ctree.h
@@ -1098,7 +1098,6 @@ struct btrfs_fs_info {
struct extent_io_tree free_space_cache;
struct extent_io_tree block_group_cache;
struct extent_io_tree pinned_extents;
-   struct extent_io_tree pending_del;
struct extent_io_tree extent_ins;
struct extent_io_tree *excluded_extents;
 
@@ -2503,7 +2502,6 @@ int btrfs_fix_block_accounting(struct btrfs_trans_handle 
*trans);
 void btrfs_pin_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 
num_bytes);
 void btrfs_unpin_extent(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes);
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 btrfs_fs_info *info,
 u64 bytenr);
diff --git a/extent-tree.c b/extent-tree.c
index 9d085158f2d8..b9d51b388c9a 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -52,8 +52,6 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 u64 bytenr, u64 num_bytes, u64 parent,
 u64 root_objectid, u64 owner_objectid,
 u64 owner_offset, int refs_to_drop);
-static int finish_current_insert(struct btrfs_trans_handle *trans);
-static int del_pending_extents(struct btrfs_trans_handle *trans);
 static struct btrfs_block_group_cache *
 btrfs_find_block_group(struct btrfs_root *root, struct btrfs_block_group_cache
   *hint, u64 search_start, int data, int owner);
@@ -1422,13 +1420,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle 
*trans,
return err;
 }
 
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans)
-{
-   finish_current_insert(trans);
-   del_pending_extents(trans);
-   return 0;
-}
-
 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 struct btrfs_fs_info *fs_info, u64 bytenr,
 u64 offset, int metadata, u64 *refs, u64 *flags)
@@ -2013,74 +2004,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle 
*trans,
return 0;
 }
 
-static int extent_root_pending_ops(struct btrfs_fs_info *info)
-{
-   u64 start;
-   u64 end;
-   int ret;
-
-   ret = find_first_extent_bit(&info->extent_ins, 0, &start,
-   &end, EXTENT_LOCKED);
-   if (!ret) {
-   ret = find_first_extent_bit(&info->pending_del, 0, &start, &end,
-   EXTENT_LOCKED);
-   }
-   return ret == 0;
-
-}
-static int finish_current_insert(struct btrfs_trans_handle *trans)
-{
-   u64 start;
-   u64 end;
-   u64 priv;
-   struct btrfs_fs_info *info = trans->fs_info;
-   struct btrfs_root *extent_root = info->extent_root;
-   struct pending_extent_op *extent_op;
-   struct btrfs_key key;
-   int ret;
-   int skinny_metadata =
-   btrfs_fs_incompat(extent_root->fs_info, SKINNY_METADATA);
-
-
-   while(1) {
-   ret = find_first_extent_bit(&info->extent_ins, 0, &start,
-   &end, EXTENT_LOCKED);
-   if (ret)
-   break;
-
-   ret = get_state_private(&info->extent_ins, start, &priv);
-   BUG_ON(ret);
-   extent_op = (struct pending_extent_op *)(unsigned long)priv;
-
-   if (extent_op->type == PENDING_EXTENT_INSERT) {
-   key.objectid = start;
-   if (skinny_metadata) {
-   key.offset = extent_op->level;
-   key.type = BTRFS_METADATA_ITEM_KEY;
-   } else {
-   key.offset = extent_op->num_bytes;
-   key.type = BTRFS_EXTENT_ITEM_KEY;
-   }
-
-   ret = alloc_reserved_tree_block(trans,
-   extent_root->root_key.objectid,
-   trans->transid,
-   extent_op->flags,
-   &extent_op->key,
-   extent_op->level, &key);
-   BUG_ON(ret);
-   } else {
-   BUG_ON(1);
-   }
-
-
-   printf("shouldn't be executed\n");
-   clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED);
-   kfree(extent_op);
-   }

[PATCH 01/15] btrfs-progs: Remove root argument from pin_down_bytes

2018-06-08 Thread Nikolay Borisov
This argument is used to obtain a reference to fs_info, which can
already be done from the passed trans handle, so use that instead.
This is in preparation for delayed refs support.

Signed-off-by: Nikolay Borisov 
---
 extent-tree.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/extent-tree.c b/extent-tree.c
index 0643815bd41c..cbc022f6cef6 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -2098,9 +2098,8 @@ static int finish_current_insert(struct 
btrfs_trans_handle *trans)
return 0;
 }
 
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int is_data)
+static int pin_down_bytes(struct btrfs_trans_handle *trans, u64 bytenr,
+ u64 num_bytes, int is_data)
 {
int err = 0;
struct extent_buffer *buf;
@@ -2108,7 +2107,7 @@ static int pin_down_bytes(struct btrfs_trans_handle 
*trans,
if (is_data)
goto pinit;
 
-   buf = btrfs_find_tree_block(root->fs_info, bytenr, num_bytes);
+   buf = btrfs_find_tree_block(trans->fs_info, bytenr, num_bytes);
if (!buf)
goto pinit;
 
@@ -2360,7 +2359,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
}
 
if (pin) {
-   ret = pin_down_bytes(trans, root, bytenr, num_bytes,
+   ret = pin_down_bytes(trans, bytenr, num_bytes,
 is_data);
if (ret > 0)
mark_free = 1;
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04/15] btrfs-progs: Refactor the root used bytes are updated

2018-06-08 Thread Nikolay Borisov
Instead of updating this during update_block_group, move the updating
code to the places where we free/allocate a block. This resembles the
current state of the kernel code. This is in prep for delayed refs.

Signed-off-by: Nikolay Borisov 
---
 ctree.c   | 13 +
 extent-tree.c |  8 
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/ctree.c b/ctree.c
index 7b74716bf92f..8f3338b4693a 100644
--- a/ctree.c
+++ b/ctree.c
@@ -734,6 +734,8 @@ static int balance_level(struct btrfs_trans_handle *trans,
/* once for the path */
free_extent_buffer(mid);
 
+   root_sub_used(root, mid->len);
+
ret = btrfs_free_extent(trans, root, mid->start, mid->len,
0, root->root_key.objectid,
level, 1);
@@ -789,6 +791,8 @@ static int balance_level(struct btrfs_trans_handle *trans,
wret = btrfs_del_ptr(root, path, level + 1, pslot + 1);
if (wret)
ret = wret;
+
+   root_sub_used(root, right->len);
wret = btrfs_free_extent(trans, root, bytenr,
 blocksize, 0,
 root->root_key.objectid,
@@ -835,6 +839,8 @@ static int balance_level(struct btrfs_trans_handle *trans,
wret = btrfs_del_ptr(root, path, level + 1, pslot);
if (wret)
ret = wret;
+
+   root_sub_used(root, blocksize);
wret = btrfs_free_extent(trans, root, bytenr, blocksize,
 0, root->root_key.objectid,
 level, 0);
@@ -1466,6 +1472,8 @@ static int noinline insert_new_root(struct 
btrfs_trans_handle *trans,
btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(c, root->root_key.objectid);
 
+   root_add_used(root, root->fs_info->nodesize);
+
write_extent_buffer(c, root->fs_info->fsid,
btrfs_header_fsid(), BTRFS_FSID_SIZE);
 
@@ -1593,6 +1601,7 @@ static int split_node(struct btrfs_trans_handle *trans, 
struct btrfs_root
btrfs_header_chunk_tree_uuid(split),
BTRFS_UUID_SIZE);
 
+   root_add_used(root, root->fs_info->nodesize);
 
copy_extent_buffer(split, c,
   btrfs_node_key_ptr_offset(0),
@@ -2175,6 +2184,8 @@ static noinline int split_leaf(struct btrfs_trans_handle 
*trans,
btrfs_header_chunk_tree_uuid(right),
BTRFS_UUID_SIZE);
 
+   root_add_used(root, root->fs_info->nodesize);
+
if (split == 0) {
if (mid <= slot) {
btrfs_set_header_nritems(right, 0);
@@ -2694,6 +2705,8 @@ static noinline int btrfs_del_leaf(struct 
btrfs_trans_handle *trans,
if (ret)
return ret;
 
+   root_sub_used(root, leaf->len);
+
ret = btrfs_free_extent(trans, root, leaf->start, leaf->len,
0, root->root_key.objectid, 0, 0);
return ret;
diff --git a/extent-tree.c b/extent-tree.c
index c6f09b52800f..07b5fb99e8cf 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -1932,14 +1932,6 @@ static int update_block_group(struct btrfs_root *root,
old_val -= num_bytes;
btrfs_set_super_bytes_used(info->super_copy, old_val);
 
-   /* block accounting for root item */
-   old_val = btrfs_root_used(&root->root_item);
-   if (alloc)
-   old_val += num_bytes;
-   else
-   old_val -= num_bytes;
-   btrfs_set_root_used(&root->root_item, old_val);
-
while(total) {
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache) {
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 13/15] btrfs-progs: Add alloc_reserved_tree_block2 function

2018-06-08 Thread Nikolay Borisov
This is a simple adapter function to convert the delayed-refs structures
to the current arguments of alloc_reserved_tree_block.

Signed-off-by: Nikolay Borisov 
---
 extent-tree.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/extent-tree.c b/extent-tree.c
index 8789a43c7fea..3208ed11cb91 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -2687,6 +2687,30 @@ int btrfs_reserve_extent(struct btrfs_trans_handle 
*trans,
return ret;
 }
 
+static int alloc_reserved_tree_block2(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+
+   struct btrfs_delayed_tree_ref *ref = 
btrfs_delayed_node_to_tree_ref(node);
+   struct btrfs_key ins;
+   bool skinny_metadata = btrfs_fs_incompat(trans->fs_info, 
SKINNY_METADATA);
+
+   ins.objectid = node->bytenr;
+   if (skinny_metadata) {
+   ins.offset = ref->level;
+   ins.type = BTRFS_METADATA_ITEM_KEY;
+   } else {
+   ins.offset = node->num_bytes;
+   ins.type = BTRFS_EXTENT_ITEM_KEY;
+   }
+
+   return alloc_reserved_tree_block(trans, ref->root, trans->transid,
+extent_op->flags_to_set,
+&extent_op->key, ref->level, &ins);
+
+}
+
 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 u64 root_objectid, u64 generation,
 u64 flags, struct btrfs_disk_key *key,
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 12/15] btrfs-progs: Add __free_extent2 function

2018-06-08 Thread Nikolay Borisov
This is a simple adapter to convert the delayed ref arguments
to the existing arguments of __free_extent.

Signed-off-by: Nikolay Borisov 
---
 extent-tree.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/extent-tree.c b/extent-tree.c
index aff00e536c9c..8789a43c7fea 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -2136,6 +2136,17 @@ void btrfs_unpin_extent(struct btrfs_fs_info *fs_info,
update_pinned_extents(fs_info, bytenr, num_bytes, 0);
 }
 
+static int __free_extent2(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *node,
+ struct btrfs_delayed_extent_op *extent_op)
+{
+
+   struct btrfs_delayed_tree_ref *ref = 
btrfs_delayed_node_to_tree_ref(node);
+
+   return __free_extent(trans, node->bytenr, node->num_bytes,
+ref->parent, ref->root, ref->level, 0, 1);
+}
+
 /*
  * remove an extent from the root, returns 0 on success
  */
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/15] btrfs-progs: Add functions to modify the used space by a root

2018-06-08 Thread Nikolay Borisov
Pull in the necessary functions, excluding locking. Required to enable
integration of delayed refs.

Signed-off-by: Nikolay Borisov 
---
 ctree.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/ctree.c b/ctree.c
index 2c51580fec65..7b74716bf92f 100644
--- a/ctree.c
+++ b/ctree.c
@@ -76,6 +76,18 @@ void add_root_to_dirty_list(struct btrfs_root *root)
}
 }
 
+static void root_add_used(struct btrfs_root *root, u32 size)
+{
+btrfs_set_root_used(&root->root_item,
+btrfs_root_used(&root->root_item) + size);
+}
+
+static void root_sub_used(struct btrfs_root *root, u32 size)
+{
+btrfs_set_root_used(&root->root_item,
+btrfs_root_used(&root->root_item) - size);
+}
+
 int btrfs_copy_root(struct btrfs_trans_handle *trans,
  struct btrfs_root *root,
  struct extent_buffer *buf,
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 10/15] btrfs-progs: Add boolean to signal whether we are re-initing extent tree

2018-06-08 Thread Nikolay Borisov
Add a boolean to record whether the extent tree is being re-initialised
in the current transaction. This is going to be needed by the
delayed refs code.

Signed-off-by: Nikolay Borisov 
---
 transaction.c | 1 +
 transaction.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/transaction.c b/transaction.c
index 9619265ef6e8..ecafbb156610 100644
--- a/transaction.c
+++ b/transaction.c
@@ -46,6 +46,7 @@ struct btrfs_trans_handle* btrfs_start_transaction(struct 
btrfs_root *root,
fs_info->generation++;
h->transid = fs_info->generation;
h->blocks_reserved = num_blocks;
+   h->reinit_extent_tree = false;
root->last_trans = h->transid;
root->commit_root = root->node;
extent_buffer_get(root->node);
diff --git a/transaction.h b/transaction.h
index 470ee3de1358..750e329e1ba8 100644
--- a/transaction.h
+++ b/transaction.h
@@ -27,6 +27,7 @@ struct btrfs_trans_handle {
u64 transid;
u64 alloc_exclude_start;
u64 alloc_exclude_nr;
+   bool reinit_extent_tree;
unsigned long blocks_reserved;
unsigned long blocks_used;
struct btrfs_block_group_cache *block_group;
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 05/15] btrfs-progs: Make update_block_group take fs_info instead of root

2018-06-08 Thread Nikolay Borisov
This is in preparation of delayed refs code.

Signed-off-by: Nikolay Borisov 
---
 extent-tree.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/extent-tree.c b/extent-tree.c
index 07b5fb99e8cf..6e7a19323efc 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -1912,12 +1912,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle 
*trans,
return 0;
 }
 
-static int update_block_group(struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int alloc,
- int mark_free)
+static int update_block_group(struct btrfs_fs_info *info, u64 bytenr,
+ u64 num_bytes, int alloc, int mark_free)
 {
struct btrfs_block_group_cache *cache;
-   struct btrfs_fs_info *info = root->fs_info;
u64 total = num_bytes;
u64 old_val;
u64 byte_in_group;
@@ -2368,7 +2366,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
BUG_ON(ret);
}
 
-   update_block_group(root, bytenr, num_bytes, 0, mark_free);
+   update_block_group(trans->fs_info, bytenr, num_bytes, 0,
+  mark_free);
}
 fail:
btrfs_free_path(path);
@@ -2730,7 +2729,7 @@ static int alloc_reserved_tree_block(struct 
btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
 
-   ret = update_block_group(root, ins->objectid, fs_info->nodesize,
+   ret = update_block_group(fs_info, ins->objectid, fs_info->nodesize,
 1, 0);
return ret;
 }
@@ -3413,7 +3412,7 @@ int btrfs_update_block_group(struct btrfs_root *root,
 u64 bytenr, u64 num_bytes, int alloc,
 int mark_free)
 {
-   return update_block_group(root, bytenr, num_bytes,
+   return update_block_group(root->fs_info, bytenr, num_bytes,
  alloc, mark_free);
 }
 
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 06/15] btrfs-progs: check: Drop trans/root arguments from free_extent_hook

2018-06-08 Thread Nikolay Borisov
They are not really needed, what free_extent_hook wants is really a
pointer to fs_info so give it to it directly. This is in preparation
of delayed refs code.

Signed-off-by: Nikolay Borisov 
---
 check/main.c  | 5 ++---
 ctree.h   | 3 +--
 extent-tree.c | 4 ++--
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/check/main.c b/check/main.c
index 9a1f238800b0..b84903acdb25 100644
--- a/check/main.c
+++ b/check/main.c
@@ -6234,8 +6234,7 @@ static int add_root_to_pending(struct extent_buffer *buf,
  * we're tracking for repair.  This hook makes sure we
  * remove any backrefs for blocks as we are fixing them.
  */
-static int free_extent_hook(struct btrfs_trans_handle *trans,
-   struct btrfs_root *root,
+static int free_extent_hook(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner, u64 offset,
int refs_to_drop)
@@ -6243,7 +6242,7 @@ static int free_extent_hook(struct btrfs_trans_handle 
*trans,
struct extent_record *rec;
struct cache_extent *cache;
int is_data;
-   struct cache_tree *extent_cache = root->fs_info->fsck_extent_cache;
+   struct cache_tree *extent_cache = fs_info->fsck_extent_cache;
 
is_data = owner >= BTRFS_FIRST_FREE_OBJECTID;
cache = lookup_cache_extent(extent_cache, bytenr, num_bytes);
diff --git a/ctree.h b/ctree.h
index 082726238b91..b30a946658ce 100644
--- a/ctree.h
+++ b/ctree.h
@@ -1143,8 +1143,7 @@ struct btrfs_fs_info {
 
int transaction_aborted;
 
-   int (*free_extent_hook)(struct btrfs_trans_handle *trans,
-   struct btrfs_root *root,
+   int (*free_extent_hook)(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner, u64 offset,
int refs_to_drop);
diff --git a/extent-tree.c b/extent-tree.c
index 6e7a19323efc..9132cb3f8e15 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -2163,8 +2163,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
int skinny_metadata =
btrfs_fs_incompat(extent_root->fs_info, SKINNY_METADATA);
 
-   if (root->fs_info->free_extent_hook) {
-   root->fs_info->free_extent_hook(trans, root, bytenr, num_bytes,
+   if (trans->fs_info->free_extent_hook) {
+   trans->fs_info->free_extent_hook(trans->fs_info, bytenr, 
num_bytes,
parent, root_objectid, 
owner_objectid,
owner_offset, refs_to_drop);
 
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 11/15] btrfs-progs: Add delayed refs infrastructure

2018-06-08 Thread Nikolay Borisov
This commit pulls those portions of the kernel implementation of
delayed refs which are necessary to have them working in user-space.
I've done the following modifications:

1. Replaced all kmem_cache_alloc calls to kmalloc.

2. Removed all locking-related code, since we are single threaded in
userspace.

3. Removed code which deals with data refs - delayed refs in user space
are going to be used only for cowonly trees.

Signed-off-by: Nikolay Borisov 
---
 Makefile  |   3 +-
 ctree.h   |   3 +
 delayed-ref.c | 608 ++
 delayed-ref.h | 225 ++
 extent-tree.c | 228 ++
 kerncompat.h  |   8 +
 transaction.h |   4 +
 7 files changed, 1078 insertions(+), 1 deletion(-)
 create mode 100644 delayed-ref.c
 create mode 100644 delayed-ref.h

diff --git a/Makefile b/Makefile
index 544410e6440c..9508ad4f11e6 100644
--- a/Makefile
+++ b/Makefile
@@ -116,7 +116,8 @@ objects = ctree.o disk-io.o kernel-lib/radix-tree.o 
extent-tree.o print-tree.o \
  qgroup.o free-space-cache.o kernel-lib/list_sort.o props.o \
  kernel-shared/ulist.o qgroup-verify.o backref.o string-table.o 
task-utils.o \
  inode.o file.o find-root.o free-space-tree.o help.o send-dump.o \
- fsfeatures.o kernel-lib/tables.o kernel-lib/raid56.o transaction.o
+ fsfeatures.o kernel-lib/tables.o kernel-lib/raid56.o transaction.o \
+ delayed-ref.o
 cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \
   cmds-inspect.o cmds-balance.o cmds-send.o cmds-receive.o \
   cmds-quota.o cmds-qgroup.o cmds-replace.o check/main.o \
diff --git a/ctree.h b/ctree.h
index b30a946658ce..d1ea45571d1e 100644
--- a/ctree.h
+++ b/ctree.h
@@ -2812,4 +2812,7 @@ int btrfs_punch_hole(struct btrfs_trans_handle *trans,
 int btrfs_read_file(struct btrfs_root *root, u64 ino, u64 start, int len,
char *dest);
 
+
+/* extent-tree.c */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long nr);
 #endif
diff --git a/delayed-ref.c b/delayed-ref.c
new file mode 100644
index ..f3fa50239380
--- /dev/null
+++ b/delayed-ref.c
@@ -0,0 +1,608 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ */
+
+#include "ctree.h"
+#include "btrfs-list.h"
+#include "delayed-ref.h"
+#include "transaction.h"
+
+/*
+ * delayed back reference update tracking.  For subvolume trees
+ * we queue up extent allocations and backref maintenance for
+ * delayed processing.   This avoids deep call chains where we
+ * add extents in the middle of btrfs_search_slot, and it allows
+ * us to buffer up frequently modified backrefs in an rb tree instead
+ * of hammering updates on the extent allocation tree.
+ */
+
+/*
+ * compare two delayed tree backrefs with same bytenr and type
+ */
+static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
+ struct btrfs_delayed_tree_ref *ref2)
+{
+   if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
+   if (ref1->root < ref2->root)
+   return -1;
+   if (ref1->root > ref2->root)
+   return 1;
+   } else {
+   if (ref1->parent < ref2->parent)
+   return -1;
+   if (ref1->parent > ref2->parent)
+   return 1;
+   }
+   return 0;
+}
+
+static int comp_refs(struct btrfs_delayed_ref_node *ref1,
+struct btrfs_delayed_ref_node *ref2,
+bool check_seq)
+{
+   int ret = 0;
+
+   if (ref1->type < ref2->type)
+   return -1;
+   if (ref1->type > ref2->type)
+   return 1;
+   if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
+   ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
+   ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
+btrfs_delayed_node_to_tree_ref(ref2));
+   else
+   BUG();
+
+   if (ret)
+   return ret;
+   if (check_seq) {
+   if (ref1->seq < ref2->seq)
+   return -1;
+   if (ref1->seq > ref2->seq)
+   return 1;
+   }
+   return 0;
+}
+
+/* insert a new ref to head ref rbtree */
+static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
+  struct rb_node *node)
+{
+   struct rb_node **p = &root->rb_node;
+   struct rb_node *parent_node = NULL;
+   struct btrfs_delayed_ref_head *entry;
+   struct btrfs_delayed_ref_head *ins;
+   u64 bytenr;
+
+   ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
+   bytenr = ins->bytenr;
+   while (*p) {
+   parent_node = *p;
+   entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
+

[PATCH 00/15] Add delayed-refs support to btrfs-progs

2018-06-08 Thread Nikolay Borisov
Hello,  

Here is a series which adds support for delayed refs. This is needed to enable  
later work on adding freespace tree repair code. Additionally, it results in  
more code sharing between kernel/user space.

Patches 1-9 are simple prep patches removing some arguments, causing problems
later. They can go independently of the delayed refs work. They don't introduce
any functional changes. Next, patches 10-13 introduce the needed infrastructure
for delayed refs without actually activating it. Patch 14 finally wires it
up by adding the necessary call outs to btrfs_run_delayed_refs and reworking the
extent addition/freeing functions. With all of this done, patch 15 finally
removes the old code.

This series passes all btrfs progs fsck and misc tests + fuzz tests apart from
fuzz-003/007/009 - but those fail without this series so it's unlikely it's
caused by it.

Nikolay Borisov (15):
  btrfs-progs: Remove root argument from pin_down_bytes
  btrfs-progs: Remove root argument from btrfs_del_csums
  btrfs-progs: Add functions to modify the used space by a root
  btrfs-progs: Refactor the root used bytes are updated
  btrfs-progs: Make update_block_group take fs_info instead of root
  btrfs-progs: check: Drop trans/root arguments from free_extent_hook
  btrfs-progs: Remove root argument from __free_extent
  btrfs-progs: Remove root argument from alloc_reserved_tree_block
  btrfs-progs: Always pass 0 for offset when calling btrfs_free_extent
for btree blocks.
  btrfs-progs: Add boolean to signal whether we are re-initing extent
tree
  btrfs-progs: Add delayed refs infrastructure
  btrfs-progs: Add __free_extent2 function
  btrfs-progs: Add alloc_reserved_tree_block2 function
  btrfs-progs: Wire up delayed refs
  btrfs-progs: Remove old delayed refs infrastructure

 Makefile  |   3 +-
 btrfs-corrupt-block.c |   2 +-
 check/main.c  |   8 +-
 ctree.c   |  29 ++-
 ctree.h   |  11 +-
 delayed-ref.c | 608 ++
 delayed-ref.h | 225 +++
 extent-tree.c | 604 +
 file-item.c   |  20 +-
 kerncompat.h  |   8 +
 transaction.c |  25 +++
 transaction.h |   5 +
 12 files changed, 1280 insertions(+), 268 deletions(-)
 create mode 100644 delayed-ref.c
 create mode 100644 delayed-ref.h

-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: check: Initialize all filed of btrfs_inode_item in insert_inode_item()

2018-06-08 Thread Su Yue




On 06/08/2018 08:43 AM, Misono Tomohiro wrote:

On 2018/06/07 21:22, David Sterba wrote:

On Thu, Jun 07, 2018 at 11:49:58AM +0900, Misono Tomohiro wrote:

Initialize all fields of btrfs_inode_item to zero in order to prevent
having some garbage, especially for flags field.


Have you observed in practice or is it a matter of precaution?


I saw failure of fsck-test/010 in yesterday's devel branch and
made this patch. It turned out that root cause was wrong flag comparison
in btrfs check.
(https://www.mail-archive.com/linux-btrfs@vger.kernel.org/msg77758.html)

With Su's fix, failure of fsck-test/010 is also gone without this patch,
but it is better to initialize the variables anyway.


Agreed. I saw odd flags reported by btrfs check too.
The callers of insert_inode_item() don't set inode flags manually.


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html





--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Bug 199931] New: systemd/rtorrent file data corruption when using echo 3 >/proc/sys/vm/drop_caches

2018-06-08 Thread Duncan
Marc Lehmann posted on Wed, 06 Jun 2018 21:06:35 +0200 as excerpted:

> Not sure what exactly you mean with btrfs mirroring (there are many
> btrfs features this could refer to), but the closest thing to that that
> I use is dup for metadata (which is always checksummed), data is always
> single. All btrfs filesystems are on lvm (not mirrored), and most (but
> not all) are encrypted. One affected fs is on a hardware raid
> controller, one is on an ssd. I have a single btrfs fs in that box with
> raid1 for metadata, as an experiment, but I haven't used it for testing
> yet.

On the off chance, tho it doesn't sound like it from your description...

You're not doing LVM snapshots of the volumes with btrfs on them, 
correct?  Because btrfs depends on filesystem GUIDs being just that, 
globally unique, using them to find the possible multiple devices of a 
multi-device btrfs (normal single-device filesystems don't have the issue 
as they don't have to deal with multi-device as btrfs does), and btrfs 
can get very confused, with data-loss potential, if it sees multiple 
copies of a device with the same filesystem GUID, as can happen if lvm 
snapshots (which obviously have the same filesystem GUID as the original) 
are taken and both the snapshot and the source are exposed to btrfs 
device scan (which is auto-triggered by udev when the new device 
appears), with one of them mounted.

Presumably you'd consider lvm snapshotting a form of mirroring and you've 
already said you're not doing that in any form, but just in case, because 
this is a rather obscure trap people using lvm could find themselves in, 
without a clue as to the danger, and the resulting symptoms could be 
rather hard to troubleshoot if this possibility wasn't considered.

-- 
Duncan - List replies preferred.   No HTML msgs.
"Every nonfree program has a lord, a master --
and if you use the program, he is your master."  Richard Stallman

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Exporting a unique ino/dev pair to user space

2018-06-08 Thread Amir Goldstein
On Fri, Jun 8, 2018 at 12:06 AM, Mark Fasheh  wrote:
[...]

>> >  2c) I don't think we can really use a dedicated callback without
>> >  passing the vfsmount through since Overlayfs ->getattr might call
>> >  the lower fs ->getattr. At that point we might as well use getattr.
>> >
>>
>> Didn't get the Overlayfs lower fs getattr argument.
>> Overlayfs doesn't use the vfsmount passed into getattr
>> and it could very well pass a dentry to lower fs getattr.
>
> My main point in 2c) is that, from my understanding, Overlayfs may need to
> call down to one of the filesystem ->getattr() calls. Since those take a
> vfsmount we don't gain anything by having a unique callback from this - the
> plumbing work would be the same.
>

I guess I don't understand what you mean by "dedicated callback", but I
think we are both in agreement that changing fs getattr() to take dentry is
preferred.

>
>> As a matter of fact, out of 35 getattr implementations in the kernel:
>> (git grep "\s\.getattr\s" fs| awk '{print $4}'| sort -u|grep -v
>> "nfs.*_proc_getattr"|wc -l)
>> there is only one using the vfsmount - nfs_getattr() for MNT_NOATIME
>> check and most of them only ever use d_inode(path->dentry).
>>
>> This API seems quite odd.
>> Maybe it should be fixed so more in kernel call sites could call getattr
>> without a vfsmount.
>> Not sure what would be the best way to handle nfs_getattr().
>
> Yeah I saw that nfs_getattr() is the only user of the vfsmount. I totally
> agree that this would be tons easier if we didn't have to pass the vfsmount
> (and like you said, there's only the ONE user).
>
> This is a bit hacky, but I wonder if we could blow the function signature
> back out to a dentry + vfsmount and make the vfsmount optional when getattr
> is called only for ino/dev. It's a bit ugly to have optional arguments like
> that but nfs would work with just a line or two change and the other fs
> would never even care.

OR.. change the signature of fs getattr() and vfs_getattr_nosec() to dentry
and set a kernel query_flag AT_STATX_NO_REMOTE_ATIME from
vfs_getattr(). No need to pass vfsmount to nfs.

Then users that access i_ino/s_dev can call vfs_getattr_nosec() with a
bonus of not having to go through security modules (which now they don't).

The only current user of vfs_getattr_nosec() expfs.c queries STATX_INO,
so change is safe.

Overlayfs doesn't need to get a vfsmount in ovl_getattr() - it knows the
lower fs vfsmount for calling vfs_getattr() regardless - and in other
places overlayfs just needs to query STATX_INO, so it may also
use the light vfs_getattr_nosec() callback.

Thanks,
Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] fstests: btrfs: Test if btrfs will corrupt nodatasum compressed extent when replacing device

2018-06-08 Thread Qu Wenruo
This is a long-standing bug (from 2012) that was only recently exposed by a
reporter: when a compressed extent without data csum gets written to the
device-replace target device, the written data is in fact uncompressed data
rather than the original compressed data.

And since btrfs still considers the data to be compressed and will try to read
it as compressed, it can cause a read error.

The root cause has been located, and one RFC patch has already been sent to
fix it, titled "[PATCH] btrfs: scrub: Don't use inode pages for device replace".
(The RFC tag is only there because of another possible way to fix the bug;
the fix itself should work without problems.)

Reported-by: James Harvey 
Signed-off-by: Qu Wenruo 
---
changelog:
v2:
  Now the fix patch is no longer RFC.
  Remove _require_test as we don't really touch it.
  Add comment on the mount cycle.
  Add the test to group 'volume'.
---
 tests/btrfs/161 | 91 +
 tests/btrfs/161.out |  2 +
 tests/btrfs/group   |  1 +
 3 files changed, 94 insertions(+)
 create mode 100755 tests/btrfs/161
 create mode 100644 tests/btrfs/161.out

diff --git a/tests/btrfs/161 b/tests/btrfs/161
new file mode 100755
index 00000000..ce1b0e04
--- /dev/null
+++ b/tests/btrfs/161
@@ -0,0 +1,91 @@
+#! /bin/bash
+# FS QA Test 161
+#
+# Test if btrfs will corrupt a compressed data extent without data csum
+# by replacing it with uncompressed data, when doing a device replace.
+#
+# This could be fixed by the following patch:
+# "[PATCH] btrfs: scrub: Don't use inode pages for device replace"
+#
+#---
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+
+# Modify as appropriate.
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch_dev_pool 2
+_require_scratch_dev_pool_equal_size
+
+
+_scratch_dev_pool_get 1
+_spare_dev_get
+_scratch_pool_mkfs >> $seqres.full 2>&1
+
+# Create nodatasum inode
+_scratch_mount "-o nodatasum"
+touch $SCRATCH_MNT/nodatasum_file
+_scratch_remount "datasum,compress"
+_pwrite_byte 0xcd 0 128K $SCRATCH_MNT/nodatasum_file > /dev/null
+
+# Write the compressed data back to disk
+sync
+
+# Replace the device
+_run_btrfs_util_prog replace start -Bf 1 $SPARE_DEV $SCRATCH_MNT
+
+# Unmount to drop all cache so next read will read from disk
+_scratch_unmount
+_mount $SPARE_DEV $SCRATCH_MNT
+
+# Now the EXTENT_DATA item still marks the extent as compressed,
+# but the on-disk data is uncompressed, thus reading it as compressed
+# will definitely cause EIO.
+cat $SCRATCH_MNT/nodatasum_file > /dev/null
+
+_scratch_unmount
+_spare_dev_put
+_scratch_dev_pool_put
+
+echo "Silence is golden"
+# success, all done
+status=0
+exit
diff --git a/tests/btrfs/161.out b/tests/btrfs/161.out
new file mode 100644
index 00000000..1752a243
--- /dev/null
+++ b/tests/btrfs/161.out
@@ -0,0 +1,2 @@
+QA output created by 161
+Silence is golden
diff --git a/tests/btrfs/group b/tests/btrfs/group
index f04ee8d5..9195b368 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -163,3 +163,4 @@
 158 auto quick raid scrub
 159 auto quick
 160 auto quick
+161 auto quick replace volume
-- 
2.17.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html