[PATCH] btrfs: skip file_extent generation check for free_space_inode in run_delalloc_nocow

2018-11-29 Thread Lu Fengqi
Running btrfs/001 with the inode_cache mount option triggers the following
warning:

WARNING: CPU: 1 PID: 23700 at fs/btrfs/inode.c:956 cow_file_range.isra.19+0x32b/0x430 [btrfs]
CPU: 1 PID: 23700 Comm: btrfs Kdump: loaded Tainted: GW  O  4.20.0-rc4-custom+ #30
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
RIP: 0010:cow_file_range.isra.19+0x32b/0x430 [btrfs]
Call Trace:
 ? free_extent_buffer+0x46/0x90 [btrfs]
 run_delalloc_nocow+0x455/0x900 [btrfs]
 btrfs_run_delalloc_range+0x1a7/0x360 [btrfs]
 writepage_delalloc+0xf9/0x150 [btrfs]
 __extent_writepage+0x125/0x3e0 [btrfs]
 extent_write_cache_pages+0x1b6/0x3e0 [btrfs]
 ? __wake_up_common_lock+0x63/0xc0
 extent_writepages+0x50/0x80 [btrfs]
 do_writepages+0x41/0xd0
 ? __filemap_fdatawrite_range+0x9e/0xf0
 __filemap_fdatawrite_range+0xbe/0xf0
 btrfs_fdatawrite_range+0x1b/0x50 [btrfs]
 __btrfs_write_out_cache+0x42c/0x480 [btrfs]
 btrfs_write_out_ino_cache+0x84/0xd0 [btrfs]
 btrfs_save_ino_cache+0x551/0x660 [btrfs]
 commit_fs_roots+0xc5/0x190 [btrfs]
 btrfs_commit_transaction+0x2bf/0x8d0 [btrfs]
 btrfs_mksubvol+0x48d/0x4d0 [btrfs]
 btrfs_ioctl_snap_create_transid+0x170/0x180 [btrfs]
 btrfs_ioctl_snap_create_v2+0x124/0x180 [btrfs]
 btrfs_ioctl+0x123f/0x3030 [btrfs]

The file extent generation of the free space inode is equal to the last
snapshot of the file root, so the inode is passed to cow_file_range. But
the inode was created and its extents were preallocated in
btrfs_save_ino_cache; there are no COW copies on disk.

The preallocated extents are not present on disk, and
btrfs_cross_ref_exist will ignore the -ENOENT returned by
check_committed_ref, so we can directly write the inode to disk.
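
For reference, the tolerance for -ENOENT looks roughly like this (a
simplified sketch of the btrfs_cross_ref_exist() call chain in
fs/btrfs/extent-tree.c, not the verbatim kernel code):

    /* check_committed_ref() returns -ENOENT when the extent item is not
     * found in the commit root, e.g. for a preallocated extent that
     * never hit disk */
    ret = check_committed_ref(root, path, objectid, offset, bytenr);
    if (ret && ret != -ENOENT)
        goto out;               /* a real error */
    /* -ENOENT is tolerated here, so the range stays nocow-eligible */
    ret2 = check_delayed_ref(root, path, objectid, offset, bytenr);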

Fixes: 78d4295b1eee ("btrfs: lift some btrfs_cross_ref_exist checks in nocow path")
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d54bdef16d8d..9c5e9629eb6c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1369,7 +1369,8 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 * Do the same check as in btrfs_cross_ref_exist but
 * without the unnecessary search.
 */
-   if (btrfs_file_extent_generation(leaf, fi) <=
+   if (!nolock &&
+   btrfs_file_extent_generation(leaf, fi) <=
btrfs_root_last_snapshot(&root->root_item))
goto out_check;
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
-- 
2.19.2





[PATCH v2 1/3] btrfs: remove always true if branch in find_delalloc_range

2018-11-28 Thread Lu Fengqi
The @found is always false when the if branch is reached. Besides, the
bool type is more suitable for @found. Change the return value of the
function and its caller to bool as well.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/extent_io.c | 31 +++
 fs/btrfs/extent_io.h |  2 +-
 fs/btrfs/tests/extent-io-tests.c |  2 +-
 3 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b2769e92b556..4b6b87e63b4a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1452,16 +1452,16 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
  * find a contiguous range of bytes in the file marked as delalloc, not
  * more than 'max_bytes'.  start and end are used to return the range,
  *
- * 1 is returned if we find something, 0 if nothing was in the tree
+ * true is returned if we find something, false if nothing was in the tree
  */
-static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+static noinline bool find_delalloc_range(struct extent_io_tree *tree,
u64 *start, u64 *end, u64 max_bytes,
struct extent_state **cached_state)
 {
struct rb_node *node;
struct extent_state *state;
u64 cur_start = *start;
-   u64 found = 0;
+   bool found = false;
u64 total_bytes = 0;
 
spin_lock(&tree->lock);
@@ -1472,8 +1472,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 */
node = tree_search(tree, cur_start);
if (!node) {
-   if (!found)
-   *end = (u64)-1;
+   *end = (u64)-1;
goto out;
}
 
@@ -1493,7 +1492,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
*cached_state = state;
refcount_inc(&state->refs);
}
-   found++;
+   found = true;
*end = state->end;
cur_start = state->end + 1;
node = rb_next(node);
@@ -1551,13 +1550,13 @@ static noinline int lock_delalloc_pages(struct inode *inode,
 }
 
 /*
- * find a contiguous range of bytes in the file marked as delalloc, not
- * more than 'max_bytes'.  start and end are used to return the range,
+ * find and lock a contiguous range of bytes in the file marked as delalloc,
+ * not more than 'max_bytes'.  start and end are used to return the range,
  *
- * 1 is returned if we find something, 0 if nothing was in the tree
+ * true is returned if we find something, false if nothing was in the tree
  */
 EXPORT_FOR_TESTS
-noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
+noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end)
@@ -1565,7 +1564,7 @@ noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
-   u64 found;
+   bool found;
struct extent_state *cached_state = NULL;
int ret;
int loops = 0;
@@ -1580,7 +1579,7 @@ noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
*start = delalloc_start;
*end = delalloc_end;
free_extent_state(cached_state);
-   return 0;
+   return false;
}
 
/*
@@ -1612,7 +1611,7 @@ noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
loops = 1;
goto again;
} else {
-   found = 0;
+   found = false;
goto out_failed;
}
}
@@ -3195,7 +3194,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
 {
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
u64 page_end = delalloc_start + PAGE_SIZE - 1;
-   u64 nr_delalloc;
+   bool found;
u64 delalloc_to_write = 0;
u64 delalloc_end = 0;
int ret;
@@ -3203,11 +3202,11 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
 
 
while (delalloc_end < page_end) {
-   nr_delalloc = find_lock_delalloc_range(inode, tree,
+   found = find_lock_delalloc_range(inode, tree,
   page,
   &delalloc_start,
   &delalloc_end);
-   if (nr_delalloc == 0) {
+   if (!found) {
delalloc_start = delalloc_end + 1;
continue;
}
diff --git a/fs/btrfs/extent_io.h b/fs

Re: [PATCH 1/3] btrfs: remove always true if branch in find_delalloc_range

2018-11-28 Thread Lu Fengqi
On Wed, Nov 28, 2018 at 09:01:42AM +0200, Nikolay Borisov wrote:
>
>
>On 28.11.18 г. 5:21 ч., Lu Fengqi wrote:
>> The @found is always false when it comes to the if branch. Besides, the
>> bool type is more suitable for @found.
>
>Well if you are changing the type of the found variable it also makes sense
>to change the return value of the function to bool as well.

Good catch.

-- 
Thanks,
Lu

>
>> 
>> Signed-off-by: Lu Fengqi 
>> ---
>>  fs/btrfs/extent_io.c | 7 +++
>>  1 file changed, 3 insertions(+), 4 deletions(-)
>> 
>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>> index 582b4b1c41e0..b4ee3399be96 100644
>> --- a/fs/btrfs/extent_io.c
>> +++ b/fs/btrfs/extent_io.c
>> @@ -1461,7 +1461,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
>>  struct rb_node *node;
>>  struct extent_state *state;
>>  u64 cur_start = *start;
>> -u64 found = 0;
>> +bool found = false;
>>  u64 total_bytes = 0;
>>  
>>  spin_lock(&tree->lock);
>> @@ -1472,8 +1472,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
>>   */
>>  node = tree_search(tree, cur_start);
>>  if (!node) {
>> -if (!found)
>> -*end = (u64)-1;
>> +*end = (u64)-1;
>>  goto out;
>>  }
>>  
>> @@ -1493,7 +1492,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
>>  *cached_state = state;
>>  refcount_inc(&state->refs);
>>  }
>> -found++;
>> +found = true;
>>  *end = state->end;
>>  cur_start = state->end + 1;
>>  node = rb_next(node);
>> 
>
>




Re: [RFC PATCH] btrfs: drop file privileges in btrfs_clone_files

2018-11-28 Thread Lu Fengqi
On Wed, Nov 28, 2018 at 09:48:07AM +0200, Nikolay Borisov wrote:
>
>
>On 28.11.18 г. 9:46 ч., Christoph Hellwig wrote:
>> On Wed, Nov 28, 2018 at 09:44:59AM +0200, Nikolay Borisov wrote:
>>>
>>>
>>> On 28.11.18 г. 5:07 ч., Lu Fengqi wrote:
>>>> generic/513 tells that cloning into a file does not strip security
>>>> privileges (suid, capabilities) the way a regular write would.
>>>>
>>>> Signed-off-by: Lu Fengqi 
>>>> ---
>>>> The xfs and ocfs2 call generic_remap_file_range_prep to drop file
>>>> privileges, I'm not sure whether btrfs should do the same thing.
>>>
>>> Why do you think btrfs shouldn't do the same thing. Looking at

I'm not sure whether btrfs avoids the generic checks intentionally for some reason.

>>> remap_file_range_prep it seems that btrfs is missing a ton of checks
>>> that are useful i.e immutable files/aligned offsets etc.

It is indeed.

In addition, generic_remap_file_range_prep will invoke inode_dio_wait and
filemap_write_and_wait_range for the source and destination inode/range.
For the dedupe case, it will call vfs_dedupe_file_range_compare.

I still can't judge whether these operations are acceptable for btrfs. I
will dig deeper into the code.

>> 
>> Any chance we could move btrfs over to use remap_file_range_prep so that
>> all file systems share the exact same checks?

In theory we can call generic_remap_file_range_prep in
btrfs_remap_file_range, which would give us the opportunity to clean up the
duplicate check code in btrfs_extent_same and btrfs_clone_files.

>
>I'm not very familiar with the code, Filipe is more familiar so adding to CC.
>But IMO we should do that provided there are no blockers.
>
>Filipe, what do you think, is it feasible?

I'm all ears for the suggestions.

-- 
Thanks,
Lu




[PATCH 3/3] btrfs: remove redundant nowait check for buffered_write

2018-11-27 Thread Lu Fengqi
generic_write_checks() already rejects the combination of IOCB_NOWAIT
and !IOCB_DIRECT, so the open-coded check in btrfs_file_write_iter() is
redundant.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/file.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3835bb8c146d..190db9a685a2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1889,10 +1889,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
loff_t oldsize;
int clean_page = 0;
 
-   if (!(iocb->ki_flags & IOCB_DIRECT) &&
-   (iocb->ki_flags & IOCB_NOWAIT))
-   return -EOPNOTSUPP;
-
if (!inode_trylock(inode)) {
if (iocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
-- 
2.19.2





[PATCH 2/3] btrfs: cleanup the useless DEFINE_WAIT in cleanup_transaction

2018-11-27 Thread Lu Fengqi
The DEFINE_WAIT(wait) has been unused ever since it was introduced in
commit f094ac32aba3 ("Btrfs: fix NULL pointer after aborting a
transaction"), so remove it.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/transaction.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f92c0a88c4ad..67e84939b758 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1840,7 +1840,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
 {
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
-   DEFINE_WAIT(wait);
 
WARN_ON(refcount_read(&cur_trans->use_count) > 1);
 
-- 
2.19.2





[PATCH 1/3] btrfs: remove always true if branch in find_delalloc_range

2018-11-27 Thread Lu Fengqi
The @found is always false when the if branch is reached. Besides, the
bool type is more suitable for @found.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/extent_io.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 582b4b1c41e0..b4ee3399be96 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1461,7 +1461,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
struct rb_node *node;
struct extent_state *state;
u64 cur_start = *start;
-   u64 found = 0;
+   bool found = false;
u64 total_bytes = 0;
 
spin_lock(&tree->lock);
@@ -1472,8 +1472,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 */
node = tree_search(tree, cur_start);
if (!node) {
-   if (!found)
-   *end = (u64)-1;
+   *end = (u64)-1;
goto out;
}
 
@@ -1493,7 +1492,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
*cached_state = state;
refcount_inc(&state->refs);
}
-   found++;
+   found = true;
*end = state->end;
cur_start = state->end + 1;
node = rb_next(node);
-- 
2.19.2





[RFC PATCH] btrfs: drop file privileges in btrfs_clone_files

2018-11-27 Thread Lu Fengqi
generic/513 tells that cloning into a file does not strip security
privileges (suid, capabilities) the way a regular write would.

Signed-off-by: Lu Fengqi 
---
xfs and ocfs2 call generic_remap_file_range_prep to drop file
privileges; I'm not sure whether btrfs should do the same thing.

Any suggestion?

 fs/btrfs/ioctl.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 410c7e007ba8..bc33c480603b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4312,6 +4312,10 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
goto out_unlock;
}
 
+   ret = file_remove_privs(file);
+   if (ret)
+   goto out_unlock;
+
if (destoff > inode->i_size) {
ret = btrfs_cont_expand(inode, inode->i_size, destoff);
if (ret)
-- 
2.19.2
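

A rough stand-alone version of the generic/513 check, for reproducing this
without fstests (a hypothetical test program, not part of the patch; it needs
root so the suid bit can be set, and both paths must be on the same btrfs):

    #include <fcntl.h>
    #include <linux/fs.h>           /* FICLONE */
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            char buf[4096];
            struct stat st;
            int src, dst;

            if (argc != 3) {
                    fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
                    return 1;
            }
            src = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, 0644);
            dst = open(argv[2], O_RDWR | O_CREAT | O_TRUNC, 0644);
            memset(buf, 'x', sizeof(buf));
            if (src < 0 || dst < 0 || write(src, buf, sizeof(buf)) < 0)
                    return 1;
            /* mark the destination suid; a regular write would clear it */
            if (fchmod(dst, 04755) < 0 || ioctl(dst, FICLONE, src) < 0) {
                    perror("setup/FICLONE");
                    return 1;
            }
            if (fstat(dst, &st) < 0)
                    return 1;
            /* without the fix, the suid bit survives the clone */
            printf("suid after clone: %s\n",
                   (st.st_mode & S_ISUID) ? "still set (bad)" : "cleared (good)");
            return 0;
    }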





Re: [PATCH 3/6] btrfs: cleanup extent_op handling

2018-11-22 Thread Lu Fengqi
On Wed, Nov 21, 2018 at 01:59:09PM -0500, Josef Bacik wrote:
>From: Josef Bacik 
>
>The cleanup_extent_op function actually would run the extent_op if it
>needed running, which made the name sort of a misnomer.  Change it to
>run_and_cleanup_extent_op, and move the actual cleanup work to
>cleanup_extent_op so it can be used by check_ref_cleanup() in order to
>unify the extent op handling.
>
>Signed-off-by: Josef Bacik 

One nitpick below.

Reviewed-by: Lu Fengqi 

>---
> fs/btrfs/extent-tree.c | 36 +++-
> 1 file changed, 23 insertions(+), 13 deletions(-)
>
>diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>index e3ed3507018d..8a776dc9cb38 100644
>--- a/fs/btrfs/extent-tree.c
>+++ b/fs/btrfs/extent-tree.c
>@@ -2424,19 +2424,33 @@ static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_ref
>   btrfs_delayed_ref_unlock(head);
> }
> 
>-static int cleanup_extent_op(struct btrfs_trans_handle *trans,
>-   struct btrfs_delayed_ref_head *head)
>+static struct btrfs_delayed_extent_op *
>+cleanup_extent_op(struct btrfs_trans_handle *trans,

The trans parameter seems useless.

-- 
Thanks,
Lu

>+struct btrfs_delayed_ref_head *head)
> {
>   struct btrfs_delayed_extent_op *extent_op = head->extent_op;
>-  int ret;
> 
>   if (!extent_op)
>-  return 0;
>-  head->extent_op = NULL;
>+  return NULL;
>+
>   if (head->must_insert_reserved) {
>+  head->extent_op = NULL;
>   btrfs_free_delayed_extent_op(extent_op);
>-  return 0;
>+  return NULL;
>   }
>+  return extent_op;
>+}
>+
>+static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
>+   struct btrfs_delayed_ref_head *head)
>+{
>+  struct btrfs_delayed_extent_op *extent_op =
>+  cleanup_extent_op(trans, head);
>+  int ret;
>+
>+  if (!extent_op)
>+  return 0;
>+  head->extent_op = NULL;
>   spin_unlock(&head->lock);
>   ret = run_delayed_extent_op(trans, head, extent_op);
>   btrfs_free_delayed_extent_op(extent_op);
>@@ -2488,7 +2502,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
> 
>   delayed_refs = &trans->transaction->delayed_refs;
> 
>-  ret = cleanup_extent_op(trans, head);
>+  ret = run_and_cleanup_extent_op(trans, head);
>   if (ret < 0) {
>   unselect_delayed_ref_head(delayed_refs, head);
>   btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
>@@ -6977,12 +6991,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
>   if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
>   goto out;
> 
>-  if (head->extent_op) {
>-  if (!head->must_insert_reserved)
>-  goto out;
>-  btrfs_free_delayed_extent_op(head->extent_op);
>-  head->extent_op = NULL;
>-  }
>+  if (cleanup_extent_op(trans, head) != NULL)
>+  goto out;
> 
>   /*
>* waiting for the lock here would deadlock.  If someone else has it
>-- 
>2.14.3
>
>
>




Re: [PATCH] btrfs: Fix suspicious RCU usage warning in device_list_add

2018-11-14 Thread Lu Fengqi
On Wed, Nov 14, 2018 at 05:05:48PM +0100, David Sterba wrote:
>On Wed, Nov 14, 2018 at 03:24:56PM +0800, Lu Fengqi wrote:
>> =
>> WARNING: suspicious RCU usage
>> 4.20.0-rc2+ #23 Tainted: G   O
>> -
>> fs/btrfs/volumes.c:886 suspicious rcu_dereference_check() usage!
>> 
>> Use btrfs_info_in_rcu instead of pr_info for the required lock/unlock of
>> RCU string.
>> 
>> Fixes: 1f265fc6f58b ("btrfs: harden agaist duplicate fsid on scanned devices")
>
>Thanks for the fix.
>
>Please note that the patch is still in the devel queue (misc-next) so
>the commit id is unstable, and such fixups get folded to the patch.
>
>You may also reply to the original mail with patch, but sending a bare
>code change without a full changelog is also fine if the original patch
>was sent long time ago and the fixup could get lost.

Got it.

-- 
Thanks,
Lu

>
>> Signed-off-by: Lu Fengqi 
>> ---
>>  fs/btrfs/volumes.c | 8 
>>  1 file changed, 4 insertions(+), 4 deletions(-)
>> 
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 2186300bab91..6039ae5c549e 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -873,15 +873,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
>>  if (device->bdev != path_bdev) {
>>  bdput(path_bdev);
>>  mutex_unlock(&fs_devices->device_list_mutex);
>> -pr_warn(
>> -"BTRFS: duplicate device fsid:devid for %pU:%llu old:%s new:%s\n",
>> +btrfs_warn_in_rcu(device->fs_info,
>> +"duplicate device fsid:devid for %pU:%llu old:%s new:%s\n",
>
>The trailing newline is appended by all btrfs_* message helpers, removed
>in the commit.
>
>




[PATCH] btrfs: Fix suspicious RCU usage warning in device_list_add

2018-11-13 Thread Lu Fengqi
=
WARNING: suspicious RCU usage
4.20.0-rc2+ #23 Tainted: G   O
-
fs/btrfs/volumes.c:886 suspicious rcu_dereference_check() usage!

Use btrfs_info_in_rcu instead of pr_info for the required lock/unlock of
RCU string.
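
The helper takes the RCU read lock around the rcu_str_deref() of the device
name; roughly what the btrfs_*_in_rcu() macros from fs/btrfs/ctree.h expand
to (a simplified sketch, not the exact macro text):

    rcu_read_lock();
    btrfs_info(fs_info, "device fsid %pU devid %llu moved old:%s new:%s",
               disk_super->fsid, devid, rcu_str_deref(device->name), path);
    rcu_read_unlock();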

Fixes: 1f265fc6f58b ("btrfs: harden agaist duplicate fsid on scanned devices")
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/volumes.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2186300bab91..6039ae5c549e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -873,15 +873,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (device->bdev != path_bdev) {
bdput(path_bdev);
mutex_unlock(&fs_devices->device_list_mutex);
-   pr_warn(
-   "BTRFS: duplicate device fsid:devid for %pU:%llu old:%s new:%s\n",
+   btrfs_warn_in_rcu(device->fs_info,
+   "duplicate device fsid:devid for %pU:%llu old:%s new:%s\n",
disk_super->fsid, devid,
rcu_str_deref(device->name), path);
return ERR_PTR(-EEXIST);
}
bdput(path_bdev);
-   pr_info(
-   "BTRFS: device fsid %pU devid %llu moved old:%s new:%s\n",
+   btrfs_info_in_rcu(device->fs_info,
+   "device fsid %pU devid %llu moved old:%s new:%s\n",
disk_super->fsid, devid,
rcu_str_deref(device->name), path);
}
-- 
2.19.1





Re: [PATCH v15.1 00/13] Btrfs In-band De-duplication

2018-11-13 Thread Lu Fengqi
On Tue, Nov 13, 2018 at 02:45:45PM +0100, David Sterba wrote:
>On Tue, Nov 06, 2018 at 02:41:09PM +0800, Lu Fengqi wrote:
>> This patchset can be fetched from github:
>> https://github.com/littleroad/linux.git dedupe_latest
>> 
>> Now the new base is v4.20-rc1.
>
>Before anybody spends more time with this patchset: this is a big
>feature and quite intrusive to several btrfs subsystems. Currently it's
>on hold as it requires finishing the design phase, it's still only the
>in-memory backend and before we claim in-band dedupe, the persistent
>hash tree needs to be at least drafted or prototyped.

Thanks for your explanation. However, I'm not sure why we need to draft
a prototype of the persistent hash tree first when we are talking about
the memory backend.

-- 
Thanks,
Lu

>
>At this point there are several features that are in a more complete
>state so they get preferred when it comes to merging. I would have to
>look up what was agreed long time ago as merging plan, but at this point
>this series would require a lot of work.
>
>




[PATCH v10.6 3/5] btrfs-progs: dedupe: Add disable support for inband deduplication

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Add disable subcommand for dedupe command group.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/btrfs-dedupe-inband.asciidoc |  5 +++
 btrfs-completion   |  2 +-
 cmds-dedupe-ib.c   | 41 ++
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc b/Documentation/btrfs-dedupe-inband.asciidoc
index d895aafbcf45..3452f690e3e5 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -22,6 +22,11 @@ use with caution.
 
 SUBCOMMAND
 --
+*disable* <path>::
+Disable in-band de-duplication for a filesystem.
++
+This will trash all stored dedupe hashes.
++
*enable* [options] <path>::
 Enable in-band de-duplication for a filesystem.
 +
diff --git a/btrfs-completion b/btrfs-completion
index 621801cf12fb..e6ec785bf849 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -34,7 +34,7 @@ _btrfs()
commands_quota='enable disable rescan'
commands_qgroup='assign remove create destroy show limit'
commands_replace='start status cancel'
-   commands_dedupe_inband='enable'
+   commands_dedupe_inband='enable disable'
 
if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then
COMPREPLY=( $( compgen -W '--help' -- "$cur" ) )
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
index 4d499677d9ae..91b6fe234043 100644
--- a/cmds-dedupe-ib.c
+++ b/cmds-dedupe-ib.c
@@ -259,10 +259,51 @@ out:
return ret;
 }
 
+static const char * const cmd_dedupe_ib_disable_usage[] = {
+   "btrfs dedupe-inband disable ",
+   "Disable in-band(write time) de-duplication of a btrfs.",
+   NULL
+};
+
+static int cmd_dedupe_ib_disable(int argc, char **argv)
+{
+   struct btrfs_ioctl_dedupe_args dargs;
+   DIR *dirstream;
+   char *path;
+   int fd;
+   int ret;
+
+   if (check_argc_exact(argc, 2))
+   usage(cmd_dedupe_ib_disable_usage);
+
+   path = argv[1];
+   fd = open_file_or_dir(path, &dirstream);
+   if (fd < 0) {
+   error("failed to open file or directory: %s", path);
+   return 1;
+   }
+   memset(&dargs, 0, sizeof(dargs));
+   dargs.cmd = BTRFS_DEDUPE_CTL_DISABLE;
+
+   ret = ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs);
+   if (ret < 0) {
+   error("failed to disable inband deduplication: %m");
+   ret = 1;
+   goto out;
+   }
+   ret = 0;
+
+out:
+   close_file_or_dir(fd, dirstream);
+   return ret;
+}
+
 const struct cmd_group dedupe_ib_cmd_group = {
dedupe_ib_cmd_group_usage, dedupe_ib_cmd_group_info, {
{ "enable", cmd_dedupe_ib_enable, cmd_dedupe_ib_enable_usage,
  NULL, 0},
+   { "disable", cmd_dedupe_ib_disable, cmd_dedupe_ib_disable_usage,
+ NULL, 0},
NULL_CMD_STRUCT
}
 };
-- 
2.19.1





[PATCH v10.6 1/5] btrfs-progs: Basic framework for dedupe-inband command group

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Add basic ioctl header and command group framework for later use,
along with basic man page documentation.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/Makefile.in  |  1 +
 Documentation/btrfs-dedupe-inband.asciidoc | 40 ++
 Documentation/btrfs.asciidoc   |  4 +++
 Makefile   |  3 +-
 btrfs.c|  2 ++
 cmds-dedupe-ib.c   | 35 +++
 commands.h |  2 ++
 dedupe-ib.h| 28 +++
 ioctl.h| 36 +++
 9 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/btrfs-dedupe-inband.asciidoc
 create mode 100644 cmds-dedupe-ib.c
 create mode 100644 dedupe-ib.h

diff --git a/Documentation/Makefile.in b/Documentation/Makefile.in
index afc16980c6d9..c0d797324c25 100644
--- a/Documentation/Makefile.in
+++ b/Documentation/Makefile.in
@@ -28,6 +28,7 @@ MAN8_TXT += btrfs-qgroup.asciidoc
 MAN8_TXT += btrfs-replace.asciidoc
 MAN8_TXT += btrfs-restore.asciidoc
 MAN8_TXT += btrfs-property.asciidoc
+MAN8_TXT += btrfs-dedupe-inband.asciidoc
 
 # Category 5 manual page
 MAN5_TXT += btrfs-man5.asciidoc
diff --git a/Documentation/btrfs-dedupe-inband.asciidoc b/Documentation/btrfs-dedupe-inband.asciidoc
new file mode 100644
index ..83113f5487e2
--- /dev/null
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -0,0 +1,40 @@
+btrfs-dedupe-inband(8)
+==
+
+NAME
+
+btrfs-dedupe-inband - manage in-band (write time) de-duplication of a btrfs
+filesystem
+
+SYNOPSIS
+
+*btrfs dedupe-inband* <subcommand> <args>
+
+DESCRIPTION
+---
+*btrfs dedupe-inband* is used to enable/disable or show current in-band 
de-duplication
+status of a btrfs filesystem.
+
+Kernel support for in-band de-duplication starts from 4.19.
+
+WARNING: In-band de-duplication is still an experimental feature of btrfs,
+use with caution.
+
+SUBCOMMAND
+--
+Nothing yet
+
+EXIT STATUS
+---
+*btrfs dedupe-inband* returns a zero exit status if it succeeds. Non zero is
+returned in case of failure.
+
+AVAILABILITY
+
+*btrfs* is part of btrfs-progs.
+Please refer to the btrfs wiki http://btrfs.wiki.kernel.org for
+further details.
+
+SEE ALSO
+
+`mkfs.btrfs`(8),
diff --git a/Documentation/btrfs.asciidoc b/Documentation/btrfs.asciidoc
index 7316ac094413..1cf5bddec335 100644
--- a/Documentation/btrfs.asciidoc
+++ b/Documentation/btrfs.asciidoc
@@ -50,6 +50,10 @@ COMMANDS
Do off-line check on a btrfs filesystem. +
See `btrfs-check`(8) for details.
 
+*dedupe-inband*::
+   Control btrfs in-band(write time) de-duplication. +
+   See `btrfs-dedupe-inband`(8) for details.
+
 *device*::
Manage devices managed by btrfs, including add/delete/scan and so
on. +
diff --git a/Makefile b/Makefile
index f4ab14ea74c8..f155252c91f1 100644
--- a/Makefile
+++ b/Makefile
@@ -124,7 +124,8 @@ cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \
   cmds-restore.o cmds-rescue.o chunk-recover.o super-recover.o \
   cmds-property.o cmds-fi-usage.o cmds-inspect-dump-tree.o \
   cmds-inspect-dump-super.o cmds-inspect-tree-stats.o cmds-fi-du.o \
-  mkfs/common.o check/mode-common.o check/mode-lowmem.o
+  mkfs/common.o check/mode-common.o check/mode-lowmem.o \
+  cmds-dedupe-ib.o
libbtrfs_objects = send-stream.o send-utils.o kernel-lib/rbtree.o btrfs-list.o \
   kernel-lib/crc32c.o messages.o \
   uuid-tree.o utils-lib.o rbtree-utils.o
diff --git a/btrfs.c b/btrfs.c
index 2d39f2ced3e8..2168f5a8bc7f 100644
--- a/btrfs.c
+++ b/btrfs.c
@@ -255,6 +255,8 @@ static const struct cmd_group btrfs_cmd_group = {
{ "quota", cmd_quota, NULL, _cmd_group, 0 },
{ "qgroup", cmd_qgroup, NULL, _cmd_group, 0 },
{ "replace", cmd_replace, NULL, _cmd_group, 0 },
+   { "dedupe-inband", cmd_dedupe_ib, NULL, _ib_cmd_group,
+   0 },
{ "help", cmd_help, cmd_help_usage, NULL, 0 },
{ "version", cmd_version, cmd_version_usage, NULL, 0 },
NULL_CMD_STRUCT
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
new file mode 100644
index ..73c923a797da
--- /dev/null
+++ b/cmds-dedupe-ib.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2017 Fujitsu.  All rights reserved.
+ */
+
+#include 
+#include 
+#include 
+
+#include "ctree.h"
+#include "ioctl.h"
+
+#include "commands.h"
+#include "utils.h"
+#include "kerncompat.h"
+#include "dedupe-ib.h"
+
+static const char * const dedupe_ib_cmd_gro

[PATCH v10.6 5/5] btrfs-progs: dedupe: introduce reconfigure subcommand

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Introduce the reconfigure subcommand to cooperate with the new kernel
ioctl modification.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/btrfs-dedupe-inband.asciidoc |  7 +++
 btrfs-completion   |  2 +-
 cmds-dedupe-ib.c   | 73 +-
 3 files changed, 66 insertions(+), 16 deletions(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc b/Documentation/btrfs-dedupe-inband.asciidoc
index 6096389cb0b4..78c806f772d6 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -86,6 +86,13 @@ And compression has higher priority than in-band de-duplication, meaning if
 compression and de-duplication are enabled at the same time, only compression
 will work.
 
+*reconfigure* [options] <path>::
+Re-configure in-band de-duplication parameters of a filesystem.
++
+In-band de-duplication must be enabled before re-configuration.
++
+[Options] are the same as for 'btrfs dedupe-inband enable'.
+
*status* <path>::
 Show current in-band de-duplication status of a filesystem.
 
diff --git a/btrfs-completion b/btrfs-completion
index 0808f9a14df9..a3e05b238eda 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -34,7 +34,7 @@ _btrfs()
commands_quota='enable disable rescan'
commands_qgroup='assign remove create destroy show limit'
commands_replace='start status cancel'
-   commands_dedupe_inband='enable disable status'
+   commands_dedupe_inband='enable disable status reconfigure'
 
if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then
COMPREPLY=( $( compgen -W '--help' -- "$cur" ) )
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
index e778457e25a8..e52f939c9ced 100644
--- a/cmds-dedupe-ib.c
+++ b/cmds-dedupe-ib.c
@@ -56,7 +56,6 @@ static const char * const cmd_dedupe_ib_enable_usage[] = {
NULL
 };
 
-
 #define report_fatal_parameter(dargs, old, member, type, err_val, fmt) \
 ({ \
if (dargs->member != old->member && \
@@ -88,6 +87,12 @@ static void report_parameter_error(struct btrfs_ioctl_dedupe_args *dargs,
}
report_option_parameter(dargs, old, flags, u8, -1, x);
}
+
+   if (dargs->status == 0 && old->cmd == BTRFS_DEDUPE_CTL_RECONF) {
+   error("must enable dedupe before reconfiguration");
+   return;
+   }
+
if (report_fatal_parameter(dargs, old, cmd, u16, -1, u) ||
report_fatal_parameter(dargs, old, blocksize, u64, -1, llu) ||
report_fatal_parameter(dargs, old, backend, u16, -1, u) ||
@@ -100,14 +105,17 @@ static void report_parameter_error(struct btrfs_ioctl_dedupe_args *dargs,
old->limit_nr, old->limit_mem);
 }
 
-static int cmd_dedupe_ib_enable(int argc, char **argv)
+static int enable_reconfig_dedupe(int argc, char **argv, int reconf)
 {
int ret;
int fd = -1;
char *path;
u64 blocksize = BTRFS_DEDUPE_BLOCKSIZE_DEFAULT;
+   int blocksize_set = 0;
u16 hash_algo = BTRFS_DEDUPE_HASH_SHA256;
+   int hash_algo_set = 0;
u16 backend = BTRFS_DEDUPE_BACKEND_INMEMORY;
+   int backend_set = 0;
u64 limit_nr = 0;
u64 limit_mem = 0;
u64 sys_mem = 0;
@@ -134,15 +142,17 @@ static int cmd_dedupe_ib_enable(int argc, char **argv)
break;
switch (c) {
case 's':
-   if (!strcasecmp("inmemory", optarg))
+   if (!strcasecmp("inmemory", optarg)) {
backend = BTRFS_DEDUPE_BACKEND_INMEMORY;
-   else {
+   backend_set = 1;
+   } else {
error("unsupported dedupe backend: %s", optarg);
exit(1);
}
break;
case 'b':
blocksize = parse_size(optarg);
+   blocksize_set = 1;
break;
case 'a':
if (strcmp("sha256", optarg)) {
@@ -224,26 +234,40 @@ static int cmd_dedupe_ib_enable(int argc, char **argv)
return 1;
}
memset(&dargs, -1, sizeof(dargs));
-   dargs.cmd = BTRFS_DEDUPE_CTL_ENABLE;
-   dargs.blocksize = blocksize;
-   dargs.hash_algo = hash_algo;
-   dargs.limit_nr = limit_nr;
-   dargs.limit_mem = limit_mem;
-   dargs.backend = backend;
-   if (force)
-   dargs.flags |= BTRFS_DEDUPE_FLAG_FORCE;
-   else
-   dargs.flags = 0;
+   if (reconf) {
+   dargs.cmd = BTRFS_

[PATCH v10.6 2/5] btrfs-progs: dedupe: Add enable command for dedupe command group

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Add enable subcommand for dedupe command group.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/btrfs-dedupe-inband.asciidoc | 114 +-
 btrfs-completion   |   6 +-
 cmds-dedupe-ib.c   | 238 +
 ioctl.h|   2 +
 4 files changed, 358 insertions(+), 2 deletions(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc b/Documentation/btrfs-dedupe-inband.asciidoc
index 83113f5487e2..d895aafbcf45 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -22,7 +22,119 @@ use with caution.
 
 SUBCOMMAND
 --
-Nothing yet
+*enable* [options] <path>::
+Enable in-band de-duplication for a filesystem.
++
+`Options`
++
+-f|--force
+Force the 'enable' command to be executed.
+Will skip the memory limit check and allow 'enable' to be executed even if
+in-band de-duplication is already enabled.
++
+NOTE: If dedupe is re-enabled with the '-f' option, any unspecified parameter
+will be reset to its default value.
+
+-s|--storage-backend <backend>
+Specify the de-duplication hash storage backend.
+Only the 'inmemory' backend is supported yet.
+If not specified, the default value is 'inmemory'.
++
+Refer to the *BACKENDS* section for more information.
+
+-b|--blocksize <blocksize>
+Specify the dedupe block size.
+Supported values are powers of 2 from '16K' to '8M'.
+The default value is '128K'.
++
+Refer to the *DEDUPE BLOCK SIZE* section for more information.
+
+-a|--hash-algorithm <algorithm>
+Specify the hash algorithm.
+Only 'sha256' is supported yet.
+
+-l|--limit-hash <limit>
+Specify the maximum number of hashes stored in memory.
+Only works for the 'inmemory' backend.
+Conflicts with the '-m' option.
++
+Only positive values are valid.
+The default value is '32K'.
+
+-m|--limit-memory <limit>
+Specify the maximum memory used for hashes.
+Only works for the 'inmemory' backend.
+Conflicts with the '-l' option.
++
+Only values larger than or equal to '1024' are valid.
+No default value.
++
+NOTE: The memory limit will be rounded down to the kernel internal hash size,
+so the memory limit shown in 'btrfs dedupe-inband status' may be different
+from the specified <limit>.
+
+WARNING: Too large value for '-l' or '-m' will easily trigger OOM.
+Please use with caution according to system memory.
+
+NOTE: In-band de-duplication is not compatible with compression yet.
+And compression has higher priority than in-band de-duplication, meaning if
+compression and de-duplication are enabled at the same time, only compression
+will work.
+
+BACKENDS
+
+Btrfs in-band de-duplication will support different storage backends, with
+different use case and features.
+
+In-memory backend::
+This backend provides backward-compatibility, and more fine-tuning options.
+But hash pool is non-persistent and may exhaust kernel memory if not setup
+properly.
++
+This backend can be used on old btrfs (without the '-O dedupe' mkfs option).
+When used on old btrfs, this backend needs to be enabled manually after mount.
++
+Designed for fast hash search speed, in-memory backend will keep all dedupe
+hashes in memory. (Although overall performance is still much the same with
+'ondisk' backend if all 'ondisk' hash can be cached in memory)
++
+And only keeps a limited number of hashes in memory to avoid exhausting memory.
+Hashes over the limit will be dropped following least-recently-used (LRU)
+behavior.
+So this backend has a consistent overhead for a given limit but can\'t ensure
+all duplicated blocks will be de-duplicated.
++
+After umount and mount, the in-memory backend needs to refill its hash pool.
+
+On-disk backend::
+This backend provides persistent hash pool, with more smart memory management
+for hash pool.
+But it\'s not backward-compatible, meaning it must be used with the '-O dedupe'
+mkfs option and older kernels can\'t mount it read-write.
++
+Designed for de-duplication rate, hash pool is stored as btrfs B+ tree on disk.
+This behavior may cause extra disk IO for hash search under high memory
+pressure.
++
+After umount and mount, the on-disk backend still has its hashes on disk, so
+there is no need to refill its dedupe hash pool.
+
+Currently, only 'inmemory' backend is supported in btrfs-progs.
+
+DEDUPE BLOCK SIZE
+
+In-band de-duplication is done at dedupe block size.
+Any data smaller than dedupe block size won\'t go through in-band
+de-duplication.
+
+And dedupe block size affects dedupe rate and fragmentation heavily.
+
+Smaller block size will cause more fragments, but higher dedupe rate.
+
+Larger block size will cause less fragments, but lower dedupe rate.
+
+In-band de-duplication rate is highly related to the workload pattern.
+So it\'s highly recommended to align dedupe block size to the workload
+block size to make full use of de-duplication.
 
 EXIT STATUS
 ---
diff --git a/btrfs-completion b/btrfs-completion
index 6ae57d1b752b..621801cf12fb 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -22,7 +22,7 @@ _btrfs()
 
local cmd=${words[1]}
 
-   commands

[PATCH v10.6 0/5] In-band de-duplication for btrfs-progs

2018-11-05 Thread Lu Fengqi
Patchset can be fetched from github:
https://github.com/littleroad/btrfs-progs.git dedupe_latest

Inband dedupe(in-memory backend only) ioctl support for btrfs-progs.

v7 changes:
   Update ctree.h to follow kernel structure change
   Update print-tree to follow kernel structure change
V8 changes:
   Move dedup props and on-disk backend support out of the patchset
   Change command group name to "dedupe-inband", to avoid confusion with
   possible out-of-band dedupe. Suggested by Mark.
   Rebase to latest devel branch.
V9 changes:
   Follow the kernel's ioctl change to support the FORCE flag, the new
   reconf ioctl, and more precise error reporting.
v10 changes:
   Rebase to v4.10.
   Add BUILD_ASSERT for btrfs_ioctl_dedupe_args
v10.1 changes:
   Rebase to v4.14.
v10.2 changes:
   Rebase to v4.16.1.
v10.3 changes:
   Rebase to v4.17.
v10.4 changes:
   Deal with offline reviews from Misono Tomohiro.
   1. s/btrfs-dedupe/btrfs-dedupe-inband
   2. Replace strerror(errno) with %m
   3. Use SZ_* instead of intermediate numbers
   4. update btrfs-completion for reconfigure subcommand
v10.5 changes:
   Rebase to v4.17.1.
v10.6 changes:
   Rebase to v4.19.

Qu Wenruo (5):
  btrfs-progs: Basic framework for dedupe-inband command group
  btrfs-progs: dedupe: Add enable command for dedupe command group
  btrfs-progs: dedupe: Add disable support for inband deduplication
  btrfs-progs: dedupe: Add status subcommand
  btrfs-progs: dedupe: introduce reconfigure subcommand

 Documentation/Makefile.in  |   1 +
 Documentation/btrfs-dedupe-inband.asciidoc | 167 
 Documentation/btrfs.asciidoc   |   4 +
 Makefile   |   3 +-
 btrfs-completion   |   6 +-
 btrfs.c|   2 +
 cmds-dedupe-ib.c   | 437 +
 commands.h |   2 +
 dedupe-ib.h|  28 ++
 ioctl.h|  38 ++
 10 files changed, 686 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/btrfs-dedupe-inband.asciidoc
 create mode 100644 cmds-dedupe-ib.c
 create mode 100644 dedupe-ib.h

-- 
2.19.1





[PATCH v10.6 4/5] btrfs-progs: dedupe: Add status subcommand

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Add status subcommand for dedupe command group.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/btrfs-dedupe-inband.asciidoc |  3 +
 btrfs-completion   |  2 +-
 cmds-dedupe-ib.c   | 80 ++
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc b/Documentation/btrfs-dedupe-inband.asciidoc
index 3452f690e3e5..6096389cb0b4 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -86,6 +86,9 @@ And compression has higher priority than in-band de-duplication, meaning if
 compression and de-duplication are enabled at the same time, only compression
 will work.
 
+*status* <path>::
+Show current in-band de-duplication status of a filesystem.
+
 BACKENDS
 
 Btrfs in-band de-duplication will support different storage backends, with
diff --git a/btrfs-completion b/btrfs-completion
index e6ec785bf849..0808f9a14df9 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -34,7 +34,7 @@ _btrfs()
commands_quota='enable disable rescan'
commands_qgroup='assign remove create destroy show limit'
commands_replace='start status cancel'
-   commands_dedupe_inband='enable disable'
+   commands_dedupe_inband='enable disable status'
 
if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then
COMPREPLY=( $( compgen -W '--help' -- "$cur" ) )
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
index 91b6fe234043..e778457e25a8 100644
--- a/cmds-dedupe-ib.c
+++ b/cmds-dedupe-ib.c
@@ -298,12 +298,92 @@ out:
return 0;
 }
 
+static const char * const cmd_dedupe_ib_status_usage[] = {
+   "btrfs dedupe-inband status ",
+   "Show current in-band(write time) de-duplication status of a btrfs.",
+   NULL
+};
+
+static int cmd_dedupe_ib_status(int argc, char **argv)
+{
+   struct btrfs_ioctl_dedupe_args dargs;
+   DIR *dirstream;
+   char *path;
+   int fd;
+   int ret;
+   int print_limit = 1;
+
+   if (check_argc_exact(argc, 2))
+   usage(cmd_dedupe_ib_status_usage);
+
+   path = argv[1];
+   fd = open_file_or_dir(path, &dirstream);
+   if (fd < 0) {
+   error("failed to open file or directory: %s", path);
+   ret = 1;
+   goto out;
+   }
+   memset(&dargs, 0, sizeof(dargs));
+   dargs.cmd = BTRFS_DEDUPE_CTL_STATUS;
+
+   ret = ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs);
+   if (ret < 0) {
+   error("failed to get inband deduplication status: %m");
+   ret = 1;
+   goto out;
+   }
+   ret = 0;
+   if (dargs.status == 0) {
+   printf("Status: \t\t\tDisabled\n");
+   goto out;
+   }
+   printf("Status:\t\t\tEnabled\n");
+
+   if (dargs.hash_algo == BTRFS_DEDUPE_HASH_SHA256)
+   printf("Hash algorithm:\t\tSHA-256\n");
+   else
+   printf("Hash algorithm:\t\tUnrecognized(%x)\n",
+   dargs.hash_algo);
+
+   if (dargs.backend == BTRFS_DEDUPE_BACKEND_INMEMORY) {
+   printf("Backend:\t\tIn-memory\n");
+   print_limit = 1;
+   } else  {
+   printf("Backend:\t\tUnrecognized(%x)\n",
+   dargs.backend);
+   }
+
+   printf("Dedup Blocksize:\t%llu\n", dargs.blocksize);
+
+   if (print_limit) {
+   u64 cur_mem;
+
+   /* Limit nr may be 0 */
+   if (dargs.limit_nr)
+   cur_mem = dargs.current_nr * (dargs.limit_mem /
+   dargs.limit_nr);
+   else
+   cur_mem = 0;
+
+   printf("Number of hash: \t[%llu/%llu]\n", dargs.current_nr,
+   dargs.limit_nr);
+   printf("Memory usage: \t\t[%s/%s]\n",
+   pretty_size(cur_mem),
+   pretty_size(dargs.limit_mem));
+   }
+out:
+   close_file_or_dir(fd, dirstream);
+   return ret;
+}
+
 const struct cmd_group dedupe_ib_cmd_group = {
dedupe_ib_cmd_group_usage, dedupe_ib_cmd_group_info, {
{ "enable", cmd_dedupe_ib_enable, cmd_dedupe_ib_enable_usage,
  NULL, 0},
{ "disable", cmd_dedupe_ib_disable, cmd_dedupe_ib_disable_usage,
  NULL, 0},
+   { "status", cmd_dedupe_ib_status, cmd_dedupe_ib_status_usage,
+ NULL, 0},
NULL_CMD_STRUCT
}
 };
-- 
2.19.1
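

A minimal stand-alone way to exercise the same ioctl from userspace, assuming
the patched btrfs-progs ioctl.h from this series (BTRFS_IOC_DEDUPE_CTL and
struct btrfs_ioctl_dedupe_args are defined by this out-of-tree patchset, not
by mainline):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    #include "ioctl.h"      /* patched btrfs-progs header */

    int main(int argc, char **argv)
    {
            struct btrfs_ioctl_dedupe_args dargs;
            int fd;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <btrfs mount>\n", argv[0]);
                    return 1;
            }
            fd = open(argv[1], O_RDONLY);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            memset(&dargs, 0, sizeof(dargs));
            dargs.cmd = BTRFS_DEDUPE_CTL_STATUS;
            if (ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs) < 0) {
                    perror("BTRFS_IOC_DEDUPE_CTL");
                    close(fd);
                    return 1;
            }
            printf("status=%u blocksize=%llu hashes=%llu/%llu\n",
                   (unsigned)dargs.status,
                   (unsigned long long)dargs.blocksize,
                   (unsigned long long)dargs.current_nr,
                   (unsigned long long)dargs.limit_nr);
            close(fd);
            return 0;
    }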





[PATCH v15.1 09/13] btrfs: introduce type based delalloc metadata reserve

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce type based metadata reserve parameter for delalloc space
reservation/freeing function.

The problem we are going to solve is that btrfs uses different max
extent sizes for different mount options.

For de-duplication, the max extent size can be set by the dedupe ioctl,
while for normal writes it's 128M.
Furthermore, the split/merge extent hooks depend heavily on that max
extent size.

Such a situation contributes to quite a lot of false ENOSPC errors.

So this patch introduces the facility to help solve these false ENOSPC
errors related to differing max extent sizes.

Currently, only the normal 128M extent size is supported. More types
will follow soon.
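
As a quick illustration of why the type matters: count_max_extents() is a
ceiling division by the (now variable) max extent size, so the number of
reserved extents changes with the reserve type. A stand-alone rendering
(illustrative values only; 128K used as a hypothetical dedupe blocksize):

    #include <stdint.h>
    #include <stdio.h>

    /* same ceiling-division shape as count_max_extents() in the patch */
    static uint32_t count_max_extents(uint64_t size, uint64_t max_extent_size)
    {
            return (size + max_extent_size - 1) / max_extent_size;
    }

    int main(void)
    {
            uint64_t normal = 128ULL << 20;         /* BTRFS_MAX_EXTENT_SIZE */
            uint64_t dedupe_bs = 128ULL << 10;      /* hypothetical dedupe blocksize */

            /* a 300M delalloc range counts as 3 extents for normal COW ... */
            printf("%u\n", count_max_extents(300ULL << 20, normal));
            /* ... but 2400 extents at a 128K dedupe blocksize */
            printf("%u\n", count_max_extents(300ULL << 20, dedupe_bs));
            return 0;
    }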

Signed-off-by: Wang Xiaoguang 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h |  43 ++---
 fs/btrfs/extent-tree.c   |  48 ---
 fs/btrfs/file.c  |  30 +
 fs/btrfs/free-space-cache.c  |   6 +-
 fs/btrfs/inode-map.c |   9 ++-
 fs/btrfs/inode.c | 115 +--
 fs/btrfs/ioctl.c |  23 +++
 fs/btrfs/ordered-data.c  |   6 +-
 fs/btrfs/ordered-data.h  |   3 +-
 fs/btrfs/relocation.c|  22 ---
 fs/btrfs/tests/inode-tests.c |  15 +++--
 11 files changed, 223 insertions(+), 97 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 910050d904ef..b119a19cbeaf 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -92,11 +92,24 @@ static const int btrfs_csum_sizes[] = { 4 };
 /*
  * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
  */
-static inline u32 count_max_extents(u64 size)
+static inline u32 count_max_extents(u64 size, u64 max_extent_size)
 {
-   return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
+   return div_u64(size + max_extent_size - 1, max_extent_size);
 }
 
+/*
+ * Type based metadata reserve type
+ * This affects how btrfs reserve metadata space for buffered write.
+ *
+ * This is caused by the different max extent size for normal COW
+ * and further in-band dedupe
+ */
+enum btrfs_metadata_reserve_type {
+   BTRFS_RESERVE_NORMAL,
+};
+
+u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type);
+
 struct btrfs_mapping_tree {
struct extent_map_tree map_tree;
 };
@@ -2732,8 +2745,9 @@ int btrfs_check_data_free_space(struct inode *inode,
 void btrfs_free_reserved_data_space(struct inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
 void btrfs_delalloc_release_space(struct inode *inode,
- struct extent_changeset *reserved,
- u64 start, u64 len, bool qgroup_free);
+   struct extent_changeset *reserved,
+   u64 start, u64 len, bool qgroup_free,
+   enum btrfs_metadata_reserve_type reserve_type);
 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@ -2743,13 +2757,17 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
  struct btrfs_block_rsv *rsv);
 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
-   bool qgroup_free);
+   bool qgroup_free,
+   enum btrfs_metadata_reserve_type reserve_type);
 
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+   enum btrfs_metadata_reserve_type reserve_type);
 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
-bool qgroup_free);
+   bool qgroup_free,
+   enum btrfs_metadata_reserve_type reserve_type);
 int btrfs_delalloc_reserve_space(struct inode *inode,
-   struct extent_changeset **reserved, u64 start, u64 len);
+   struct extent_changeset **reserved, u64 start, u64 len,
+   enum btrfs_metadata_reserve_type reserve_type);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
  unsigned short type);
@@ -3152,7 +3170,11 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root);
 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
  unsigned int extra_bits,
- struct extent_state **cached_state, int dedupe

[PATCH v15.1 10/13] btrfs: dedupe: Inband in-memory only de-duplication implement

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Core implementation of inband de-duplication.
It reuses the async_cow_start() facility to calculate the dedupe hash,
and uses the dedupe hash to do inband de-duplication at the extent level.

The workflow is as below:
1) Run delalloc range for an inode
2) Calculate the hash for the delalloc range at the unit of dedupe_bs
3) For the hash match (duplicated) case, just increase the source extent
   ref and insert the file extent.
   For the hash mismatch case, go through the normal cow_file_range()
   fallback, and add the hash into the dedupe_tree.
   Compression for the hash miss case is not supported yet.

The current implementation stores all dedupe hashes in an in-memory
rb-tree, with LRU behavior to enforce the limit.
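
The per-block decision in step 3 boils down to something like the following
(a hedged sketch using the patchset's helper names with simplified
signatures; the real code lives in the run_delalloc/async_cow path):

    btrfs_dedupe_calc_hash(fs_info, inode, start, hash);
    btrfs_dedupe_search(fs_info, inode, start, hash);
    if (btrfs_dedupe_hash_hit(hash)) {
            /* duplicate: point the new file extent at the existing
             * extent and increase its ref, no data write needed */
    } else {
            /* miss: fall back to cow_file_range() and insert the new
             * hash into the in-memory tree */
    }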

Signed-off-by: Wang Xiaoguang 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h   |   4 +-
 fs/btrfs/dedupe.h  |  15 ++
 fs/btrfs/extent-tree.c |  31 +++-
 fs/btrfs/extent_io.c   |   7 +-
 fs/btrfs/extent_io.h   |   1 +
 fs/btrfs/file.c|   4 +
 fs/btrfs/inode.c   | 319 ++---
 fs/btrfs/ioctl.c   |   1 +
 fs/btrfs/relocation.c  |  18 +++
 9 files changed, 343 insertions(+), 57 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b119a19cbeaf..3a8e35b5328a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -106,9 +106,11 @@ static inline u32 count_max_extents(u64 size, u64 max_extent_size)
  */
 enum btrfs_metadata_reserve_type {
BTRFS_RESERVE_NORMAL,
+   BTRFS_RESERVE_DEDUPE,
 };
 
-u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type);
+u64 btrfs_max_extent_size(struct btrfs_inode *inode,
+ enum btrfs_metadata_reserve_type reserve_type);
 
 struct btrfs_mapping_tree {
struct extent_map_tree map_tree;
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index 87f5b7ce7766..8157b17c4d11 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -7,6 +7,7 @@
 #define BTRFS_DEDUPE_H
 
 #include 
+#include "btrfs_inode.h"
 
 /* 32 bytes for SHA256 */
 static const int btrfs_hash_sizes[] = { 32 };
@@ -47,6 +48,20 @@ struct btrfs_dedupe_info {
u64 current_nr;
 };
 
+static inline u64 btrfs_dedupe_blocksize(struct btrfs_inode *inode)
+{
+   struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+   return fs_info->dedupe_info->blocksize;
+}
+
+static inline int inode_need_dedupe(struct inode *inode)
+{
+   struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+
+   return fs_info->dedupe_enabled;
+}
+
 static inline int btrfs_dedupe_hash_hit(struct btrfs_dedupe_hash *hash)
 {
return (hash && hash->bytenr);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2c8992b919ae..fa3654045ba8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -28,6 +28,7 @@
 #include "sysfs.h"
 #include "qgroup.h"
 #include "ref-verify.h"
+#include "dedupe.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
@@ -2492,6 +2493,17 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
btrfs_pin_extent(fs_info, head->bytenr,
 head->num_bytes, 1);
if (head->is_data) {
+   /*
+* If insert_reserved is given, it means
+* a new extent is reserved, then deleted
+* in one transaction, and inc/dec get merged to 0.
+*
+* In this case, we need to remove its dedupe
+* hash.
+*/
+   ret = btrfs_dedupe_del(fs_info, head->bytenr);
+   if (ret < 0)
+   return ret;
ret = btrfs_del_csums(trans, fs_info, head->bytenr,
  head->num_bytes);
}
@@ -5913,13 +5925,15 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
spin_unlock(&block_rsv->lock);
 }
 
-u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type)
+u64 btrfs_max_extent_size(struct btrfs_inode *inode,
+ enum btrfs_metadata_reserve_type reserve_type)
 {
if (reserve_type == BTRFS_RESERVE_NORMAL)
return BTRFS_MAX_EXTENT_SIZE;
-
-   ASSERT(0);
-   return BTRFS_MAX_EXTENT_SIZE;
+   else if (reserve_type == BTRFS_RESERVE_DEDUPE)
+   return btrfs_dedupe_blocksize(inode);
+   else
+   return BTRFS_MAX_EXTENT_SIZE;
 }
 
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
@@ -5930,7 +5944,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
bool delalloc_lock = true;
-   u64 max_extent_size = btrfs_max_extent_size(r

[PATCH v15.1 02/13] btrfs: dedupe: Introduce function to initialize dedupe info

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Add generic function to initialize dedupe info.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/Makefile  |   2 +-
 fs/btrfs/dedupe.c  | 169 +
 fs/btrfs/dedupe.h  |  12 +++
 include/uapi/linux/btrfs.h |   3 +
 4 files changed, 185 insertions(+), 1 deletion(-)
 create mode 100644 fs/btrfs/dedupe.c

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ca693dd554e9..78fdc87dba39 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -10,7 +10,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o 
root-tree.o dir-item.o \
   export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
-  uuid-tree.o props.o free-space-tree.o tree-checker.o
+  uuid-tree.o props.o free-space-tree.o tree-checker.o dedupe.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
new file mode 100644
index ..06523162753d
--- /dev/null
+++ b/fs/btrfs/dedupe.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2016 Fujitsu.  All rights reserved.
+ */
+
+#include "ctree.h"
+#include "dedupe.h"
+#include "btrfs_inode.h"
+#include "delayed-ref.h"
+
+struct inmem_hash {
+   struct rb_node hash_node;
+   struct rb_node bytenr_node;
+   struct list_head lru_list;
+
+   u64 bytenr;
+   u32 num_bytes;
+
+   u8 hash[];
+};
+
+static struct btrfs_dedupe_info *
+init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs)
+{
+   struct btrfs_dedupe_info *dedupe_info;
+
+   dedupe_info = kzalloc(sizeof(*dedupe_info), GFP_NOFS);
+   if (!dedupe_info)
+   return ERR_PTR(-ENOMEM);
+
+   dedupe_info->hash_algo = dargs->hash_algo;
+   dedupe_info->backend = dargs->backend;
+   dedupe_info->blocksize = dargs->blocksize;
+   dedupe_info->limit_nr = dargs->limit_nr;
+
+   /* only support SHA256 yet */
+   dedupe_info->dedupe_driver = crypto_alloc_shash("sha256", 0, 0);
+   if (IS_ERR(dedupe_info->dedupe_driver)) {
+   void *err = ERR_CAST(dedupe_info->dedupe_driver);
+
+   kfree(dedupe_info);
+   return err;
+   }
+
+   dedupe_info->hash_root = RB_ROOT;
+   dedupe_info->bytenr_root = RB_ROOT;
+   dedupe_info->current_nr = 0;
+   INIT_LIST_HEAD(&dedupe_info->lru_list);
+   mutex_init(&dedupe_info->lock);
+
+   return dedupe_info;
+}
+
+/*
+ * Helper to check if parameters are valid.
+ * The first invalid field will be set to (-1), to inform the user which
+ * parameter is invalid.
+ * Except for dargs->limit_nr and dargs->limit_mem; in that case, 0 will be
+ * returned to inform the user, since the user can specify any limit except 0.
+ */
+static int check_dedupe_parameter(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dedupe_args *dargs)
+{
+   u64 blocksize = dargs->blocksize;
+   u64 limit_nr = dargs->limit_nr;
+   u64 limit_mem = dargs->limit_mem;
+   u16 hash_algo = dargs->hash_algo;
+   u8 backend = dargs->backend;
+
+   /*
+* Set all reserved fields to -1, allow user to detect
+* unsupported optional parameters.
+*/
+   memset(dargs->__unused, -1, sizeof(dargs->__unused));
+   if (blocksize > BTRFS_DEDUPE_BLOCKSIZE_MAX ||
+   blocksize < BTRFS_DEDUPE_BLOCKSIZE_MIN ||
+   blocksize < fs_info->sectorsize ||
+   !is_power_of_2(blocksize) ||
+   blocksize < PAGE_SIZE) {
+   dargs->blocksize = (u64)-1;
+   return -EINVAL;
+   }
+   if (hash_algo >= ARRAY_SIZE(btrfs_hash_sizes)) {
+   dargs->hash_algo = (u16)-1;
+   return -EINVAL;
+   }
+   if (backend >= BTRFS_DEDUPE_BACKEND_COUNT) {
+   dargs->backend = (u8)-1;
+   return -EINVAL;
+   }
+
+   /* Backend specific check */
+   if (backend == BTRFS_DEDUPE_BACKEND_INMEMORY) {
+   /* only one limit is accepted for enable*/
+   if (dargs->limit_nr && dargs->limit_mem) {
+   dargs->limit_nr = 0;
+   dargs->limit_mem = 0;
+   return -EINVAL;
+   }
+
+   if (!limit_nr && !limit_mem)
+   dargs->limit_nr = BTRFS_DEDUPE_LIMIT_NR_DEFAULT;
+   else {
+   u64 tmp = (u64)-1;
+
+   if (limit_mem) {
+   tmp = div_u64(limit_mem,
+  
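
The limit_mem to limit_nr conversion (truncated above) is plain arithmetic:
each in-memory hash costs sizeof(struct inmem_hash) plus the digest itself
(32 bytes for SHA-256), matching how btrfs_dedupe_status() computes
limit_mem later in the series. A stand-alone rendering with illustrative
numbers (the real struct size depends on the kernel build):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t per_hash = 88 + 32;            /* assumed struct size + SHA-256 */
            uint64_t limit_mem = 16ULL << 20;       /* e.g. -m 16M */

            printf("limit_nr = %llu\n",
                   (unsigned long long)(limit_mem / per_hash));
            return 0;
    }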

[PATCH v15.1 11/13] btrfs: dedupe: Add ioctl for inband deduplication

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Add ioctl interface for inband deduplication, which includes:
1) enable
2) disable
3) status

And a pseudo RO compat flag, to imply that btrfs now supports inband
dedup.
However we don't add any ondisk format change, it's just a pseudo RO
compat flag.

All these ioctl interfaces are state-less, which means callers don't
need to care about the previous dedupe state before calling them, and
only need to care about the final desired state.

For example, if the user wants to enable dedupe with a specified block
size and limit, just fill the ioctl structure and call the enable ioctl.
No need to check whether dedupe is already running.

These ioctls will handle things like re-configuration or disabling quite
well.

Also, for invalid parameters, the enable ioctl interface will set the
field of the first encountered invalid parameter to (-1) to inform the
caller, while for limit_nr/limit_mem, the value will be (0).

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c  | 50 ++
 fs/btrfs/dedupe.h  | 17 +---
 fs/btrfs/disk-io.c |  3 ++
 fs/btrfs/ioctl.c   | 85 ++
 fs/btrfs/sysfs.c   |  2 +
 include/uapi/linux/btrfs.h | 12 +-
 6 files changed, 163 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 6199215022e6..76a967cca68e 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -29,6 +29,35 @@ static inline struct inmem_hash *inmem_alloc_hash(u16 algo)
GFP_NOFS);
 }
 
+void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
+struct btrfs_ioctl_dedupe_args *dargs)
+{
+   struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+
+   if (!fs_info->dedupe_enabled || !dedupe_info) {
+   dargs->status = 0;
+   dargs->blocksize = 0;
+   dargs->backend = 0;
+   dargs->hash_algo = 0;
+   dargs->limit_nr = 0;
+   dargs->current_nr = 0;
+   memset(dargs->__unused, -1, sizeof(dargs->__unused));
+   return;
+   }
+   mutex_lock(&dedupe_info->lock);
+   dargs->status = 1;
+   dargs->blocksize = dedupe_info->blocksize;
+   dargs->backend = dedupe_info->backend;
+   dargs->hash_algo = dedupe_info->hash_algo;
+   dargs->limit_nr = dedupe_info->limit_nr;
+   dargs->limit_mem = dedupe_info->limit_nr *
+   (sizeof(struct inmem_hash) +
+btrfs_hash_sizes[dedupe_info->hash_algo]);
+   dargs->current_nr = dedupe_info->current_nr;
+   mutex_unlock(&dedupe_info->lock);
+   memset(dargs->__unused, -1, sizeof(dargs->__unused));
+}
+
 static struct btrfs_dedupe_info *
 init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs)
 {
@@ -402,6 +431,27 @@ static void unblock_all_writers(struct btrfs_fs_info 
*fs_info)
percpu_up_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
 }
 
+int btrfs_dedupe_cleanup(struct btrfs_fs_info *fs_info)
+{
+   struct btrfs_dedupe_info *dedupe_info;
+
+   fs_info->dedupe_enabled = 0;
+   /* same as disable */
+   smp_wmb();
+   dedupe_info = fs_info->dedupe_info;
+   fs_info->dedupe_info = NULL;
+
+   if (!dedupe_info)
+   return 0;
+
+   if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY)
+   inmem_destroy(dedupe_info);
+
+   crypto_free_shash(dedupe_info->dedupe_driver);
+   kfree(dedupe_info);
+   return 0;
+}
+
 int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
 {
struct btrfs_dedupe_info *dedupe_info;
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index 8157b17c4d11..fdd00355d6b5 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -90,6 +90,15 @@ static inline struct btrfs_dedupe_hash 
*btrfs_dedupe_alloc_hash(u16 algo)
 int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dedupe_args *dargs);
 
+
+/*
+ * Get inband dedupe info
+ * Since it needs to access the hash sizes of the different backends,
+ * which are not exported, we need such a simple function.
+ */
+void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
+struct btrfs_ioctl_dedupe_args *dargs);
+
 /*
  * Disable dedupe and invalidate all its dedupe data.
  * Called at dedupe disable time.
@@ -101,12 +110,10 @@ int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
 int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info);
 
 /*
- * Get current dedupe status.
- * Return 0 for success
- * No possible error yet
+ * Cleanup current btrfs_dedupe_info
+ * Called at umount time
  */
-void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
-struct btrfs_ioctl_dedupe_args *dargs);
+int btrfs_dedupe_cleanup(struct btrfs_fs_info *fs_info);
 
 /*
  * Calculate hash for dedupe.
diff --git a/fs

[PATCH v15.1 06/13] btrfs: dedupe: Introduce function to search for an existing hash

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce static function inmem_search() to handle the job for in-memory
hash tree.

The trick is, we must ensure the delayed ref head is not being run at
the time we search for the hash.

With inmem_search(), we can implement the btrfs_dedupe_search()
interface.
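
In schematic form, the trick looks roughly like this (a sketch only; the
full version lives in inmem_search() below, and wait_on_ref_head() stands
in for whatever wait mechanism the series uses -- it is not a real helper):

again:
	mutex_lock(&dedupe_info->lock);
	found_hash = inmem_search_hash(dedupe_info, hash->hash);
	if (found_hash) {
		spin_lock(&delayed_refs->lock);
		head = btrfs_find_delayed_ref_head(delayed_refs,
						   found_hash->bytenr);
		if (head && head->processing) {
			/* Head is being run; drop the locks, wait, retry */
			spin_unlock(&delayed_refs->lock);
			mutex_unlock(&dedupe_info->lock);
			wait_on_ref_head(head);		/* hypothetical */
			goto again;
		}
		/* Now safe to add the data ref under delayed_refs->lock */
	}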

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 210 +-
 1 file changed, 209 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 951fefd19fde..03ad41423c01 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -7,6 +7,8 @@
 #include "dedupe.h"
 #include "btrfs_inode.h"
 #include "delayed-ref.h"
+#include "qgroup.h"
+#include "transaction.h"
 
 struct inmem_hash {
struct rb_node hash_node;
@@ -242,7 +244,6 @@ static int inmem_add(struct btrfs_dedupe_info *dedupe_info,
struct inmem_hash *ihash;
 
ihash = inmem_alloc_hash(algo);
-
if (!ihash)
return -ENOMEM;
 
@@ -436,3 +437,210 @@ int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
kfree(dedupe_info);
return 0;
 }
+
+/*
+ * Caller must ensure the corresponding ref head is not being run.
+ */
+static struct inmem_hash *
+inmem_search_hash(struct btrfs_dedupe_info *dedupe_info, u8 *hash)
+{
+   struct rb_node **p = &dedupe_info->hash_root.rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+   u16 hash_algo = dedupe_info->hash_algo;
+   int hash_len = btrfs_hash_sizes[hash_algo];
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, hash_node);
+
+   if (memcmp(hash, entry->hash, hash_len) < 0) {
+   p = &(*p)->rb_left;
+   } else if (memcmp(hash, entry->hash, hash_len) > 0) {
+   p = &(*p)->rb_right;
+   } else {
+   /* Found, need to re-add it to LRU list head */
+   list_del(&entry->lru_list);
+   list_add(&entry->lru_list, &dedupe_info->lru_list);
+   return entry;
+   }
+   }
+   return NULL;
+}
+
+static int inmem_search(struct btrfs_dedupe_info *dedupe_info,
+   struct inode *inode, u64 file_pos,
+   struct btrfs_dedupe_hash *hash)
+{
+   int ret;
+   struct btrfs_root *root = BTRFS_I(inode)->root;
+   struct btrfs_trans_handle *trans;
+   struct btrfs_delayed_ref_root *delayed_refs;
+   struct btrfs_delayed_ref_head *head;
+   struct btrfs_delayed_ref_head *insert_head;
+   struct btrfs_delayed_data_ref *insert_dref;
+   struct btrfs_qgroup_extent_record *insert_qrecord = NULL;
+   struct inmem_hash *found_hash;
+   int free_insert = 1;
+   int qrecord_inserted = 0;
+   u64 ref_root = root->root_key.objectid;
+   u64 bytenr;
+   u32 num_bytes;
+
+   insert_head = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
+   if (!insert_head)
+   return -ENOMEM;
+   insert_head->extent_op = NULL;
+
+   insert_dref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
+   if (!insert_dref) {
+   kmem_cache_free(btrfs_delayed_ref_head_cachep, insert_head);
+   return -ENOMEM;
+   }
+   if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) &&
+   is_fstree(ref_root)) {
+   insert_qrecord = kmalloc(sizeof(*insert_qrecord), GFP_NOFS);
+   if (!insert_qrecord) {
+   kmem_cache_free(btrfs_delayed_ref_head_cachep,
+   insert_head);
+   kmem_cache_free(btrfs_delayed_data_ref_cachep,
+   insert_dref);
+   return -ENOMEM;
+   }
+   }
+
+   trans = btrfs_join_transaction(root);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   goto free_mem;
+   }
+
+again:
+   mutex_lock(&dedupe_info->lock);
+   found_hash = inmem_search_hash(dedupe_info, hash->hash);
+   /* If we don't find a duplicated extent, just return. */
+   if (!found_hash) {
+   ret = 0;
+   goto out;
+   }
+   bytenr = found_hash->bytenr;
+   num_bytes = found_hash->num_bytes;
+
+   btrfs_init_delayed_ref_head(insert_head, insert_qrecord, bytenr,
+   num_bytes, ref_root, 0, BTRFS_ADD_DELAYED_REF, true,
+   false);
+
+   btrfs_init_delayed_ref_common(trans->fs_info, &insert_dref->node,
+   bytenr, num_bytes, ref_root, BTRFS_ADD_DELAYED_REF,
+   BTRFS_EXTENT_DATA_REF_KEY);
+   insert_dref->r

[PATCH v15.1 01/13] btrfs: dedupe: Introduce dedupe framework and its header

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce the header for the btrfs in-band (write time) de-duplication
framework and the needed header changes.

The new de-duplication framework is going to support 2 different dedupe
methods and 1 dedupe hash.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h   |   7 ++
 fs/btrfs/dedupe.h  | 128 -
 fs/btrfs/disk-io.c |   1 +
 include/uapi/linux/btrfs.h |  34 ++
 4 files changed, 168 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 80953528572d..910050d904ef 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1118,6 +1118,13 @@ struct btrfs_fs_info {
spinlock_t ref_verify_lock;
struct rb_root block_tree;
 #endif
+
+   /*
+* Inband de-duplication related structures
+*/
+   unsigned long dedupe_enabled:1;
+   struct btrfs_dedupe_info *dedupe_info;
+   struct mutex dedupe_ioctl_lock;
 };
 
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index 90281a7a35a8..222ce7b4d827 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -6,7 +6,131 @@
 #ifndef BTRFS_DEDUPE_H
 #define BTRFS_DEDUPE_H
 
-/* later in-band dedupe will expand this struct */
-struct btrfs_dedupe_hash;
+#include <crypto/hash.h>
 
+/* 32 bytes for SHA256 */
+static const int btrfs_hash_sizes[] = { 32 };
+
+/*
+ * For caller outside of dedupe.c
+ *
+ * Different dedupe backends should have their own hash structure
+ */
+struct btrfs_dedupe_hash {
+   u64 bytenr;
+   u32 num_bytes;
+
+   /* last field is a variable length array of dedupe hash */
+   u8 hash[];
+};
+
+struct btrfs_dedupe_info {
+   /* dedupe blocksize */
+   u64 blocksize;
+   u16 backend;
+   u16 hash_algo;
+
+   struct crypto_shash *dedupe_driver;
+
+   /*
+	 * Use a mutex to protect both backends.
+	 * Even for the in-memory backend, the rb-tree can be quite large,
+	 * so a mutex is better for such a use case.
+*/
+   struct mutex lock;
+
+   /* following members are only used in in-memory backend */
+   struct rb_root hash_root;
+   struct rb_root bytenr_root;
+   struct list_head lru_list;
+   u64 limit_nr;
+   u64 current_nr;
+};
+
+static inline int btrfs_dedupe_hash_hit(struct btrfs_dedupe_hash *hash)
+{
+   return (hash && hash->bytenr);
+}
+
+/*
+ * Initialize inband dedupe info
+ * Called at dedupe enable time.
+ *
+ * Return 0 for success
+ * Return <0 for any error
+ * (from unsupported param to tree creation error for some backends)
+ */
+int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
+   struct btrfs_ioctl_dedupe_args *dargs);
+
+/*
+ * Disable dedupe and invalidate all its dedupe data.
+ * Called at dedupe disable time.
+ *
+ * Return 0 for success
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ */
+int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info);
+
+/*
+ * Get current dedupe status.
+ * Return 0 for success
+ * No possible error yet
+ */
+void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
+struct btrfs_ioctl_dedupe_args *dargs);
+
+/*
+ * Calculate hash for dedupe.
+ * Caller must ensure [start, start + dedupe_bs) has valid data.
+ *
+ * Return 0 for success
+ * Return <0 for any error
+ * (error from hash codes)
+ */
+int btrfs_dedupe_calc_hash(struct btrfs_fs_info *fs_info,
+  struct inode *inode, u64 start,
+  struct btrfs_dedupe_hash *hash);
+
+/*
+ * Search for duplicated extents by calculated hash
+ * Caller must call btrfs_dedupe_calc_hash() first to get the hash.
+ *
+ * @inode: the inode we are writing to
+ * @file_pos: offset inside the inode
+ * As we will increase extent ref immediately after a hash match,
+ * we need @file_pos and @inode in this case.
+ *
+ * Return > 0 for a hash match, and the extent ref will be
+ * *INCREASED*, and hash->bytenr/num_bytes will record the existing
+ * extent data.
+ * Return 0 for a hash miss. Nothing is done
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ */
+int btrfs_dedupe_search(struct btrfs_fs_info *fs_info,
+   struct inode *inode, u64 file_pos,
+   struct btrfs_dedupe_hash *hash);
+
+/*
+ * Add a dedupe hash into dedupe info
+ * Return 0 for success
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ */
+int btrfs_dedupe_add(struct btrfs_fs_info *fs_info,
+struct btrfs_dedupe_hash *hash);
+
+/*
+ * Remove a dedupe hash from dedupe info
+ * Return 0 for success
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ *
+ * NOTE: if a hash deletion error is not handled well, it will lead
+ * to a corrupted fs, as a later dedupe write can point to non-exist

[PATCH v15.1 04/13] btrfs: dedupe: Introduce function to remove hash from in-memory tree

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce static function inmem_del() to remove hash from in-memory
dedupe tree.
And implement the btrfs_dedupe_del() and btrfs_dedupe_disable() interfaces.

Also for btrfs_dedupe_disable(), add new functions to wait for existing
writers and block incoming writers, to eliminate all possible races.

Cc: Mark Fasheh 
Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 131 +++---
 1 file changed, 125 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 784bb3a8a5ab..951fefd19fde 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -170,12 +170,6 @@ int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
return ret;
 }
 
-int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
-{
-   /* Place holder for bisect, will be implemented in later patches */
-   return 0;
-}
-
 static int inmem_insert_hash(struct rb_root *root,
 struct inmem_hash *hash, int hash_len)
 {
@@ -317,3 +311,128 @@ int btrfs_dedupe_add(struct btrfs_fs_info *fs_info,
return inmem_add(dedupe_info, hash);
return -EINVAL;
 }
+
+static struct inmem_hash *
+inmem_search_bytenr(struct btrfs_dedupe_info *dedupe_info, u64 bytenr)
+{
+   struct rb_node **p = &dedupe_info->bytenr_root.rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, bytenr_node);
+
+   if (bytenr < entry->bytenr)
+   p = &(*p)->rb_left;
+   else if (bytenr > entry->bytenr)
+   p = &(*p)->rb_right;
+   else
+   return entry;
+   }
+
+   return NULL;
+}
+
+/* Delete a hash from in-memory dedupe tree */
+static int inmem_del(struct btrfs_dedupe_info *dedupe_info, u64 bytenr)
+{
+   struct inmem_hash *hash;
+
+   mutex_lock(&dedupe_info->lock);
+   hash = inmem_search_bytenr(dedupe_info, bytenr);
+   if (!hash) {
+   mutex_unlock(&dedupe_info->lock);
+   return 0;
+   }
+
+   __inmem_del(dedupe_info, hash);
+   mutex_unlock(&dedupe_info->lock);
+   return 0;
+}
+
+/* Remove a dedupe hash from dedupe tree */
+int btrfs_dedupe_del(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+   struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+
+   if (!fs_info->dedupe_enabled)
+   return 0;
+
+   if (WARN_ON(dedupe_info == NULL))
+   return -EINVAL;
+
+   if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY)
+   return inmem_del(dedupe_info, bytenr);
+   return -EINVAL;
+}
+
+static void inmem_destroy(struct btrfs_dedupe_info *dedupe_info)
+{
+   struct inmem_hash *entry, *tmp;
+
+   mutex_lock(&dedupe_info->lock);
+   list_for_each_entry_safe(entry, tmp, &dedupe_info->lru_list, lru_list)
+   __inmem_del(dedupe_info, entry);
+   mutex_unlock(&dedupe_info->lock);
+}
+
+/*
+ * Helper function to wait and block all incoming writers
+ *
+ * Use the rw_sem introduced for freeze to wait for/block writers.
+ * So during the blocked time, no new write will happen, so we can
+ * do something quite safe, especially helpful for dedupe disable,
+ * as it affects buffered writes.
+ */
+static void block_all_writers(struct btrfs_fs_info *fs_info)
+{
+   struct super_block *sb = fs_info->sb;
+
+   percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
+   down_write(&sb->s_umount);
+}
+
+static void unblock_all_writers(struct btrfs_fs_info *fs_info)
+{
+   struct super_block *sb = fs_info->sb;
+
+   up_write(&sb->s_umount);
+   percpu_up_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
+}
+
+int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
+{
+   struct btrfs_dedupe_info *dedupe_info;
+   int ret;
+
+   dedupe_info = fs_info->dedupe_info;
+
+   if (!dedupe_info)
+   return 0;
+
+   /* Don't allow disable status change in RO mount */
+   if (fs_info->sb->s_flags & MS_RDONLY)
+   return -EROFS;
+
+   /*
+* Wait for all unfinished writers and block further writers.
+* Then sync the whole fs so all current write will go through
+* dedupe, and all later write won't go through dedupe.
+*/
+   block_all_writers(fs_info);
+   ret = sync_filesystem(fs_info->sb);
+   fs_info->dedupe_enabled = 0;
+   fs_info->dedupe_info = NULL;
+   unblock_all_writers(fs_info);
+   if (ret < 0)
+   return ret;
+
+   /* now we are OK to clean up everything */
+   if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY)
+   inmem_destroy(dedupe_info);
+
+   crypto_free_shash(dedupe_info->dedupe_driver);
+   kfree(dedupe_info);
+   return 0;
+}
-- 
2.19.1





[PATCH v15.1 13/13] btrfs: dedupe: Introduce new reconfigure ioctl

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Introduce new reconfigure ioctl and new FORCE flag for in-band dedupe
ioctls.

Now dedupe enable and reconfigure ioctl are stateful.


-------------------------------------------------
| Current state |   Ioctl    | Next state  |
-------------------------------------------------
| Disabled      |  enable    | Enabled     |
| Enabled       |  enable    | Not allowed |
| Enabled       |  reconf    | Enabled     |
| Enabled       |  disable   | Disabled    |
| Disabled      |  disable   | Disabled    |
| Disabled      |  reconf    | Not allowed |
-------------------------------------------------

(While disable is always stateless)

While for those who prefer stateless ioctls (myself, for example), a new
FORCE flag is introduced.

In FORCE mode, enable/disable is completely stateless.

-------------------------------------------------
| Current state |   Ioctl    | Next state  |
-------------------------------------------------
| Disabled      |  enable    | Enabled     |
| Enabled       |  enable    | Enabled     |
| Enabled       |  disable   | Disabled    |
| Disabled      |  disable   | Disabled    |
-------------------------------------------------


Also, the re-configure ioctl will only modify the specified fields, while
unspecified fields keep their current values; with enable, by contrast,
unspecified fields are filled with default values.

For example:
 # btrfs dedupe enable --block-size 64k /mnt
 # btrfs dedupe reconfigure --limit-hash 1m /mnt
This leads to:
 dedupe blocksize: 64K
 dedupe hash limit nr: 1m

While for enable:
 # btrfs dedupe enable --force --block-size 64k /mnt
 # btrfs dedupe enable --force --limit-hash 1m /mnt
This will reset the blocksize to the default value:
 dedupe blocksize: 128K << reset
 dedupe hash limit nr: 1m
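
In C terms, a reconfigure call that touches only the hash limit might look
like this (a sketch against this series' uapi, reusing the same assumed
names and includes as the enable example; -1 marks a field as "keep the
current value"):

static int dedupe_set_limit(int fd, __u64 nr)
{
	struct btrfs_ioctl_dedupe_args dargs;

	memset(&dargs, -1, sizeof(dargs));	/* all fields "unspecified" */
	dargs.cmd = BTRFS_DEDUPE_CTL_RECONF;
	dargs.flags = 0;
	dargs.limit_nr = nr;		/* the only field being changed */
	dargs.limit_mem = 0;		/* 0 = this limit is not modified */
	return ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs);
}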

Suggested-by: David Sterba 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c  | 132 ++---
 fs/btrfs/dedupe.h  |  13 
 fs/btrfs/ioctl.c   |  13 
 include/uapi/linux/btrfs.h |  11 +++-
 4 files changed, 143 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 76a967cca68e..92152134d3c0 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -29,6 +29,40 @@ static inline struct inmem_hash *inmem_alloc_hash(u16 algo)
GFP_NOFS);
 }
 
+/*
+ * Copy from current dedupe info to fill dargs.
+ * For the reconf case, only fill members which are uninitialized.
+ */
+static void get_dedupe_status(struct btrfs_dedupe_info *dedupe_info,
+ struct btrfs_ioctl_dedupe_args *dargs)
+{
+   int reconf = (dargs->cmd == BTRFS_DEDUPE_CTL_RECONF);
+
+   dargs->status = 1;
+
+   if (!reconf || (reconf && dargs->blocksize == (u64)-1))
+   dargs->blocksize = dedupe_info->blocksize;
+   if (!reconf || (reconf && dargs->backend == (u16)-1))
+   dargs->backend = dedupe_info->backend;
+   if (!reconf || (reconf && dargs->hash_algo == (u16)-1))
+   dargs->hash_algo = dedupe_info->hash_algo;
+
+   /*
+	 * For the re-configure case, if the limit is not being modified,
+	 * the limit fields will be set to 0, unlike the other fields
+*/
+   if (!reconf || !(dargs->limit_nr || dargs->limit_mem)) {
+   dargs->limit_nr = dedupe_info->limit_nr;
+   dargs->limit_mem = dedupe_info->limit_nr *
+   (sizeof(struct inmem_hash) +
+btrfs_hash_sizes[dedupe_info->hash_algo]);
+   }
+
+   /* current_nr doesn't make sense for the reconfigure case */
+   if (!reconf)
+   dargs->current_nr = dedupe_info->current_nr;
+}
+
 void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
 struct btrfs_ioctl_dedupe_args *dargs)
 {
@@ -45,15 +79,7 @@ void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
return;
}
	mutex_lock(&dedupe_info->lock);
-   dargs->status = 1;
-   dargs->blocksize = dedupe_info->blocksize;
-   dargs->backend = dedupe_info->backend;
-   dargs->hash_algo = dedupe_info->hash_algo;
-   dargs->limit_nr = dedupe_info->limit_nr;
-   dargs->limit_mem = dedupe_info->limit_nr *
-   (sizeof(struct inmem_hash) +
-btrfs_hash_sizes[dedupe_info->hash_algo]);
-   dargs->current_nr = dedupe_info->current_nr;
+   get_dedupe_status(dedupe_info, dargs);
	mutex_unlock(&dedupe_info->lock);
memset(dargs->__unused, -1, sizeof(dargs->__unused));
 }
@@ -98,17 +124,50 @@ init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs)
 static int check_dedupe_parameter(struct btrfs_fs_info *fs_info,
  struct btrfs_ioctl_dedupe_args *dargs)
 {
-   u64 blocksize = dargs->blocksize;
-   u64 limit_nr = dargs->limit_nr;
-   u64 limit_mem = dargs->limit_mem;
-   u16 hash_algo = dargs->hash_algo;
-   u8 backend 

[PATCH v15.1 12/13] btrfs: relocation: Enhance error handling to avoid BUG_ON

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Since the introduction of btrfs dedupe tree, it's possible that balance can
race with dedupe disabling.

When this happens, dedupe_enabled will make btrfs_get_fs_root() return
ERR_PTR(-ENOENT).
But due to a bug in the error handling branch, when this happens
backref_cache->nr_nodes is increased while the node is neither added to
the backref_cache nor is nr_nodes decreased, causing a BUG_ON() in
backref_cache_cleanup().

[ 2611.668810] [ cut here ]
[ 2611.669946] kernel BUG at
/home/sat/ktest/linux/fs/btrfs/relocation.c:243!
[ 2611.670572] invalid opcode:  [#1] SMP
[ 2611.686797] Call Trace:
[ 2611.687034]  []
btrfs_relocate_block_group+0x1b3/0x290 [btrfs]
[ 2611.687706]  []
btrfs_relocate_chunk.isra.40+0x47/0xd0 [btrfs]
[ 2611.688385]  [] btrfs_balance+0xb22/0x11e0 [btrfs]
[ 2611.688966]  [] btrfs_ioctl_balance+0x391/0x3a0
[btrfs]
[ 2611.689587]  [] btrfs_ioctl+0x1650/0x2290 [btrfs]
[ 2611.690145]  [] ? lru_cache_add+0x3a/0x80
[ 2611.690647]  [] ?
lru_cache_add_active_or_unevictable+0x4c/0xc0
[ 2611.691310]  [] ? handle_mm_fault+0xcd4/0x17f0
[ 2611.691842]  [] ? cp_new_stat+0x153/0x180
[ 2611.692342]  [] ? __vma_link_rb+0xfd/0x110
[ 2611.692842]  [] ? vma_link+0xb9/0xc0
[ 2611.693303]  [] do_vfs_ioctl+0xa1/0x5a0
[ 2611.693781]  [] ? __do_page_fault+0x1b4/0x400
[ 2611.694310]  [] SyS_ioctl+0x41/0x70
[ 2611.694758]  [] entry_SYSCALL_64_fastpath+0x12/0x71
[ 2611.695331] Code: ff 48 8b 45 bf 49 83 af a8 05 00 00 01 49 89 87 a0
05 00 00 e9 2e fd ff ff b8 f4 ff ff ff e9 e4 fb ff ff 0f 0b 0f 0b 0f 0b
0f 0b <0f> 0b 0f 0b 41 89 c6 e9 b8 fb ff ff e8 9e a6 e8 e0 4c 89 e7 44
[ 2611.697870] RIP  []
relocate_block_group+0x741/0x7a0 [btrfs]
[ 2611.698818]  RSP 

This patch will call remove_backref_node() in the error handling branch,
catch the returned -ENOENT in relocate_tree_blocks(), and continue
balancing.

Reported-by: Satoru Takeuchi 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/relocation.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b7c304c6e741..ee96390d1e42 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -854,6 +854,13 @@ struct backref_node *build_backref_tree(struct 
reloc_control *rc,
root = read_fs_root(rc->extent_root->fs_info, key.offset);
if (IS_ERR(root)) {
err = PTR_ERR(root);
+   /*
+		 * Don't forget to clean up the current node, as it may
+		 * not have been added to backref_cache while nr_nodes
+		 * was increased.
+		 * This would cause a BUG_ON() in backref_cache_cleanup().
+*/
+   remove_backref_node(&rc->backref_cache, cur);
goto out;
}
 
@@ -3021,8 +3028,15 @@ int relocate_tree_blocks(struct btrfs_trans_handle 
*trans,
	node = build_backref_tree(rc, &block->key,
  block->level, block->bytenr);
if (IS_ERR(node)) {
+   /*
+		 * The root (currently only the dedupe tree) of the tree
+		 * block is going to be freed and can't be reached.
+* Just skip it and continue balancing.
+*/
+   if (PTR_ERR(node) == -ENOENT)
+   continue;
err = PTR_ERR(node);
-   goto out;
+   break;
}
 
ret = relocate_tree_block(trans, rc, node, >key,
@@ -3030,10 +3044,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle 
*trans,
if (ret < 0) {
		if (ret != -EAGAIN || &node->rb_node == rb_first(blocks))
err = ret;
-   goto out;
+   break;
}
}
-out:
err = finish_pending_nodes(trans, rc, path, err);
 
 out_free_path:
-- 
2.19.1





[PATCH v15.1 00/13] Btrfs In-band De-duplication

2018-11-05 Thread Lu Fengqi
This patchset can be fetched from github:
https://github.com/littleroad/linux.git dedupe_latest

Now the new base is v4.20-rc1.

Normal test cases from auto group exposes no regression, and ib-dedupe
group can pass without problem.

xfstests ib-dedupe group can be fetched from github:
https://github.com/littleroad/xfstests-dev.git btrfs_dedupe_latest

Changelog:
v2:
  Totally reworked to handle multiple backends
v3:
  Fix a stupid but deadly on-disk backend bug
  Add handle for multiple hash on same bytenr corner case to fix abort
  trans error
  Increase dedup rate by enhancing the delayed ref handler for both backends.
  Move dedup_add() to run_delayed_ref() time, to fix abort trans error.
  Increase dedup block size up limit to 8M.
v4:
  Add dedup prop for disabling dedup for given files/dirs.
  Merge inmem_search() and ondisk_search() into generic_search() to save
  some code
  Fix another delayed_ref related bug.
  Use the same mutex for both inmem and ondisk backend.
  Move dedup_add() back to btrfs_finish_ordered_io() to increase dedup
  rate.
v5:
  Reuse compress routine for much simpler dedup function.
  Slightly improved performance due to above modification.
  Fix race between dedup enable/disable
  Fix for false ENOSPC report
v6:
  Further enable/disable race window fix.
  Minor format change according to checkpatch.
v7:
  Fix one concurrency bug with balance.
  Slightly modify return value from -EINVAL to -EOPNOTSUPP for
  btrfs_dedup_ioctl() to allow progs to distinguish unsupported commands
  and wrong parameters.
  Rebased to integration-4.6.
v8:
  Rename 'dedup' to 'dedupe'.
  Add support to allow dedupe and compression work at the same time.
  Fix several balance related bugs. Special thanks to Satoru Takeuchi,
  who exposed most of them.
  Small dedupe hit case performance improvement.
v9:
  Re-order the patchset to completely separate pure in-memory and any
  on-disk format change.
  Fold bug fixes into its original patch.
v10:
  Adding back missing bug fix patch.
  Reduce on-disk item size.
  Hide dedupe ioctl under CONFIG_BTRFS_DEBUG.
v11:
  Remove other backend and props support to focus on the framework and
  in-memory backend. Suggested by David.
  Better disable and buffered write race protection.
  Comprehensive fix to dedupe metadata ENOSPC problem.
v12:
  Stateful 'enable' ioctl and new 'reconf' ioctl
  New FORCE flag for enable ioctl to allow stateless ioctl
  Precise error report and extendable ioctl structure.
v12.1
  Rebase to David's for-next-20160704 branch
  Add co-ordinate patch for subpage and dedupe patchset.
v12.2
  Rebase to David's for-next-20160715 branch
  Add co-ordinate patch for other patchset.
v13
  Rebase to David's for-next-20160906 branch
  Fix a reserved space leak bug, which only frees quota reserved space
  but not space_info->bytes_may_use.
v13.1
  Rebase to Chris' for-linux-4.9 branch
v14
  Use generic ENOSPC fix for both compression and dedupe.
v14.1
  Further split ENOSPC fix.
v14.2
  Rebase to v4.11-rc2.
  Co-operate with count_max_extent() to calculate num_extents.
  No longer rely on qgroup fixes.
v14.3
  Rebase to v4.12-rc1.
v14.4
  Rebase to kdave/for-4.13-part1.
v14.5
  Rebase to v4.15-rc3.
v14.6
  Rebase to v4.17-rc5.
v14.7
  Replace SHASH_DESC_ON_STACK with kmalloc to remove VLA.
  Fixed the following errors by switching to div_u64.
  ├── arm-allmodconfig
  │   └── ERROR:__aeabi_uldivmod-fs-btrfs-btrfs.ko-undefined
  └── i386-allmodconfig
  └── ERROR:__udivdi3-fs-btrfs-btrfs.ko-undefined
v14.8
  Rebase to v4.18-rc4.
v15
  Rebase to v4.19-rc2.
  Drop "btrfs: Introduce COMPRESS reserve type to fix false enospc for 
compression".
  Remove the ifdef around btrfs inband dedupe ioctl.
v15.1
  Rebase to v4.20-rc1.

Qu Wenruo (4):
  btrfs: delayed-ref: Add support for increasing data ref under spinlock
  btrfs: dedupe: Inband in-memory only de-duplication implement
  btrfs: relocation: Enhance error handling to avoid BUG_ON
  btrfs: dedupe: Introduce new reconfigure ioctl

Wang Xiaoguang (9):
  btrfs: dedupe: Introduce dedupe framework and its header
  btrfs: dedupe: Introduce function to initialize dedupe info
  btrfs: dedupe: Introduce function to add hash into in-memory tree
  btrfs: dedupe: Introduce function to remove hash from in-memory tree
  btrfs: dedupe: Introduce function to search for an existing hash
  btrfs: dedupe: Implement btrfs_dedupe_calc_hash interface
  btrfs: ordered-extent: Add support for dedupe
  btrfs: introduce type based delalloc metadata reserve
  btrfs: dedupe: Add ioctl for inband deduplication

 fs/btrfs/Makefile|   2 +-
 fs/btrfs/ctree.h |  52 ++-
 fs/btrfs/dedupe.c| 828 +++
 fs/btrfs/dedupe.h| 175 +++-
 fs/btrfs/delayed-ref.c   |  53 ++-
 fs/btrfs/delayed-ref.h   |  15 +
 fs/btrfs/disk-io.c   |   4 +
 fs/btrfs/extent-tree.c   |  67 ++-
 fs/btrfs/extent_io.c |   7 +-
 fs/btrfs/extent_io.h |   1 +
 

[PATCH v15.1 03/13] btrfs: dedupe: Introduce function to add hash into in-memory tree

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce static function inmem_add() to add hash into in-memory tree.
And now we can implement the btrfs_dedupe_add() interface.
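
As a rough usage sketch (the caller here is an assumption -- in this series
the hash is added once the new extent is on disk, e.g. from ordered extent
completion):

	/* Only insert the hash if the earlier search did not hit */
	if (!btrfs_dedupe_hash_hit(hash))
		ret = btrfs_dedupe_add(fs_info, hash);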

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 150 ++
 1 file changed, 150 insertions(+)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 06523162753d..784bb3a8a5ab 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -19,6 +19,14 @@ struct inmem_hash {
u8 hash[];
 };
 
+static inline struct inmem_hash *inmem_alloc_hash(u16 algo)
+{
+   if (WARN_ON(algo >= ARRAY_SIZE(btrfs_hash_sizes)))
+   return NULL;
+   return kzalloc(sizeof(struct inmem_hash) + btrfs_hash_sizes[algo],
+   GFP_NOFS);
+}
+
 static struct btrfs_dedupe_info *
 init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs)
 {
@@ -167,3 +175,145 @@ int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
/* Place holder for bisect, will be implemented in later patches */
return 0;
 }
+
+static int inmem_insert_hash(struct rb_root *root,
+struct inmem_hash *hash, int hash_len)
+{
+   struct rb_node **p = &root->rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, hash_node);
+   if (memcmp(hash->hash, entry->hash, hash_len) < 0)
+   p = &(*p)->rb_left;
+   else if (memcmp(hash->hash, entry->hash, hash_len) > 0)
+   p = &(*p)->rb_right;
+   else
+   return 1;
+   }
+   rb_link_node(&hash->hash_node, parent, p);
+   rb_insert_color(&hash->hash_node, root);
+   return 0;
+}
+
+static int inmem_insert_bytenr(struct rb_root *root,
+  struct inmem_hash *hash)
+{
+   struct rb_node **p = &root->rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, bytenr_node);
+   if (hash->bytenr < entry->bytenr)
+   p = &(*p)->rb_left;
+   else if (hash->bytenr > entry->bytenr)
+   p = &(*p)->rb_right;
+   else
+   return 1;
+   }
+   rb_link_node(&hash->bytenr_node, parent, p);
+   rb_insert_color(&hash->bytenr_node, root);
+   return 0;
+}
+
+static void __inmem_del(struct btrfs_dedupe_info *dedupe_info,
+   struct inmem_hash *hash)
+{
+   list_del(&hash->lru_list);
+   rb_erase(&hash->hash_node, &dedupe_info->hash_root);
+   rb_erase(&hash->bytenr_node, &dedupe_info->bytenr_root);
+
+   if (!WARN_ON(dedupe_info->current_nr == 0))
+   dedupe_info->current_nr--;
+
+   kfree(hash);
+}
+
+/*
+ * Insert a hash into in-memory dedupe tree
+ * Will remove the least recently used hash if the limit is exceeded.
+ *
+ * If the hash matches an existing one, we won't insert it, to
+ * save memory
+ */
+static int inmem_add(struct btrfs_dedupe_info *dedupe_info,
+struct btrfs_dedupe_hash *hash)
+{
+   int ret = 0;
+   u16 algo = dedupe_info->hash_algo;
+   struct inmem_hash *ihash;
+
+   ihash = inmem_alloc_hash(algo);
+
+   if (!ihash)
+   return -ENOMEM;
+
+   /* Copy the data out */
+   ihash->bytenr = hash->bytenr;
+   ihash->num_bytes = hash->num_bytes;
+   memcpy(ihash->hash, hash->hash, btrfs_hash_sizes[algo]);
+
+   mutex_lock(&dedupe_info->lock);
+
+   ret = inmem_insert_bytenr(&dedupe_info->bytenr_root, ihash);
+   if (ret > 0) {
+   kfree(ihash);
+   ret = 0;
+   goto out;
+   }
+
+   ret = inmem_insert_hash(&dedupe_info->hash_root, ihash,
+   btrfs_hash_sizes[algo]);
+   if (ret > 0) {
+   /*
+		 * We only keep one hash in the tree to save memory, so if
+		 * the hash conflicts, free the one being inserted.
+*/
+   rb_erase(&ihash->bytenr_node, &dedupe_info->bytenr_root);
+   kfree(ihash);
+   ret = 0;
+   goto out;
+   }
+
+   list_add(&ihash->lru_list, &dedupe_info->lru_list);
+   dedupe_info->current_nr++;
+
+   /* Remove the last dedupe hash if we exceed limit */
+   while (dedupe_info->current_nr > dedupe_info->limit_nr) {
+   struct inmem_hash *last;
+
+   last = list_entry(dedupe_info->lru_list.prev,
+ struct inmem_hash, lru_list);
+   __inmem_del(dedupe_info, last);
+   }
+out:
+   mutex_unlock(&dedupe_info->lock);
+

[PATCH v15.1 05/13] btrfs: delayed-ref: Add support for increasing data ref under spinlock

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

For in-band dedupe, btrfs needs to increase the data ref with the
delayed_refs lock held, so add a new function
btrfs_add_delayed_data_ref_locked() to increase the extent ref with
delayed_refs already locked. Export init_delayed_ref_head and
init_delayed_ref_common for inband dedupe.
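
The expected calling pattern, reconstructed from the inmem_search() hunk
earlier in this series (a sketch, not verbatim kernel code):

	spin_lock(&delayed_refs->lock);
	/* ... re-check that nobody is running the head for this bytenr ... */
	ret = btrfs_add_delayed_data_ref_locked(trans, insert_head,
			insert_qrecord, insert_dref, BTRFS_ADD_DELAYED_REF,
			&qrecord_inserted, &old_ref_mod, &new_ref_mod);
	spin_unlock(&delayed_refs->lock);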

Signed-off-by: Qu Wenruo 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/delayed-ref.c | 53 +-
 fs/btrfs/delayed-ref.h | 15 
 2 files changed, 52 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 9301b3ad9217..ae8968f10ce0 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -533,7 +533,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_root 
*delayed_refs,
	spin_unlock(&existing->lock);
 }
 
-static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
+void btrfs_init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
  struct btrfs_qgroup_extent_record *qrecord,
  u64 bytenr, u64 num_bytes, u64 ref_root,
  u64 reserved, int action, bool is_data,
@@ -661,7 +661,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
 }
 
 /*
- * init_delayed_ref_common - Initialize the structure which represents a
+ * btrfs_init_delayed_ref_common - Initialize the structure which represents a
 *  modification to an extent.
  *
  * @fs_info:Internal to the mounted filesystem mount structure.
@@ -685,7 +685,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
  * when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
  * BTRFS_EXTENT_DATA_REF_KEY when recording data extent
  */
-static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
+void btrfs_init_delayed_ref_common(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes, u64 ref_root,
int action, u8 ref_type)
@@ -758,14 +758,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle 
*trans,
else
ref_type = BTRFS_TREE_BLOCK_REF_KEY;
 
-   init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
-   ref_root, action, ref_type);
+   btrfs_init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+ ref_root, action, ref_type);
ref->root = ref_root;
ref->parent = parent;
ref->level = level;
 
-   init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- ref_root, 0, action, false, is_system);
+   btrfs_init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
+   ref_root, 0, action, false, is_system);
head_ref->extent_op = extent_op;
 
delayed_refs = >transaction->delayed_refs;
@@ -794,6 +794,29 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle 
*trans,
return 0;
 }
 
+/*
+ * Do the real delayed data ref insert.
+ * The caller must hold delayed_refs->lock and have allocated memory
+ * for dref, head_ref and record.
+ */
+int btrfs_add_delayed_data_ref_locked(struct btrfs_trans_handle *trans,
+   struct btrfs_delayed_ref_head *head_ref,
+   struct btrfs_qgroup_extent_record *qrecord,
+   struct btrfs_delayed_data_ref *ref, int action,
+   int *qrecord_inserted_ret, int *old_ref_mod,
+   int *new_ref_mod)
+{
+   struct btrfs_delayed_ref_root *delayed_refs;
+
+   head_ref = add_delayed_ref_head(trans, head_ref, qrecord,
+   action, qrecord_inserted_ret,
+   old_ref_mod, new_ref_mod);
+
+   delayed_refs = &trans->transaction->delayed_refs;
+
+   return insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
+}
+
 /*
  * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
  */
@@ -820,7 +843,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle 
*trans,
ref_type = BTRFS_SHARED_DATA_REF_KEY;
else
ref_type = BTRFS_EXTENT_DATA_REF_KEY;
-   init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+   btrfs_init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
ref_root, action, ref_type);
ref->root = ref_root;
ref->parent = parent;
@@ -845,8 +868,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle 
*trans,
}
}
 
-   init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
- reserved, action, true, false);
+   btrfs_init_delayed_ref_head(head_ref, record, bytenr, num_byte

[PATCH v15.1 08/13] btrfs: ordered-extent: Add support for dedupe

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Add ordered-extent support for dedupe.

Note that the current ordered-extent support only covers non-compressed
source extents.
Support for compressed source extents will be added later.
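
For context, the dedupe write path (added later in the series) is expected
to call the new helper roughly as follows (a sketch; everything besides the
helper itself is an assumption about the caller):

	ret = btrfs_add_ordered_extent_dedupe(inode, file_pos, ins.objectid,
					      num_bytes, ins.offset,
					      BTRFS_ORDERED_REGULAR, hash);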

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
---
 fs/btrfs/ordered-data.c | 46 +
 fs/btrfs/ordered-data.h | 13 
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 0c4ef208b8b9..4b112258a79b 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -12,6 +12,7 @@
 #include "extent_io.h"
 #include "disk-io.h"
 #include "compression.h"
+#include "dedupe.h"
 
 static struct kmem_cache *btrfs_ordered_extent_cache;
 
@@ -170,7 +171,8 @@ static inline struct rb_node *tree_search(struct 
btrfs_ordered_inode_tree *tree,
  */
 static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
  u64 start, u64 len, u64 disk_len,
- int type, int dio, int compress_type)
+ int type, int dio, int compress_type,
+ struct btrfs_dedupe_hash *hash)
 {
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -191,6 +193,33 @@ static int __btrfs_add_ordered_extent(struct inode *inode, 
u64 file_offset,
entry->inode = igrab(inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
+   entry->hash = NULL;
+   /*
+	 * A hash hit means we have already incremented the extent's delayed
+	 * ref.
+* We must handle this even if another process is trying to
+* turn off dedupe, otherwise we will leak a reference.
+*/
+   if (hash && (hash->bytenr || root->fs_info->dedupe_enabled)) {
+   struct btrfs_dedupe_info *dedupe_info;
+
+   dedupe_info = root->fs_info->dedupe_info;
+   if (WARN_ON(dedupe_info == NULL)) {
+   kmem_cache_free(btrfs_ordered_extent_cache,
+   entry);
+   return -EINVAL;
+   }
+   entry->hash = btrfs_dedupe_alloc_hash(dedupe_info->hash_algo);
+   if (!entry->hash) {
+   kmem_cache_free(btrfs_ordered_extent_cache, entry);
+   return -ENOMEM;
+   }
+   entry->hash->bytenr = hash->bytenr;
+   entry->hash->num_bytes = hash->num_bytes;
+   memcpy(entry->hash->hash, hash->hash,
+  btrfs_hash_sizes[dedupe_info->hash_algo]);
+   }
+
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
	set_bit(type, &entry->flags);
 
@@ -245,15 +274,23 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 
file_offset,
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 0,
- BTRFS_COMPRESS_NONE);
+ BTRFS_COMPRESS_NONE, NULL);
 }
 
+int btrfs_add_ordered_extent_dedupe(struct inode *inode, u64 file_offset,
+  u64 start, u64 len, u64 disk_len, int type,
+  struct btrfs_dedupe_hash *hash)
+{
+   return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 0,
+ BTRFS_COMPRESS_NONE, hash);
+}
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
 u64 start, u64 len, u64 disk_len, int type)
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 1,
- BTRFS_COMPRESS_NONE);
+ BTRFS_COMPRESS_NONE, NULL);
 }
 
 int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
@@ -262,7 +299,7 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, 
u64 file_offset,
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 0,
- compress_type);
+ compress_type, NULL);
 }
 
 /*
@@ -444,6 +481,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent 
*entry)
	list_del(&sum->list);
kfree(sum);
}
+   kfree(entry->hash);
kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
 }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 02d813aaa261..08c7ee986bb9 100644
--- 

[PATCH v15.1 07/13] btrfs: dedupe: Implement btrfs_dedupe_calc_hash interface

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Regardless of the dedupe backend (in-memory or on-disk), only the SHA256
hash algorithm is supported so far, so implement the
btrfs_dedupe_calc_hash() interface using SHA256.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 50 +++
 1 file changed, 50 insertions(+)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 03ad41423c01..6199215022e6 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -644,3 +644,53 @@ int btrfs_dedupe_search(struct btrfs_fs_info *fs_info,
}
return ret;
 }
+
+int btrfs_dedupe_calc_hash(struct btrfs_fs_info *fs_info,
+  struct inode *inode, u64 start,
+  struct btrfs_dedupe_hash *hash)
+{
+	int i;
+	int ret;
+	struct page *p;
+	struct shash_desc *shash;
+	struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+	struct crypto_shash *tfm;
+	u64 dedupe_bs;
+	u64 sectorsize = fs_info->sectorsize;
+
+	if (!fs_info->dedupe_enabled || !hash)
+		return 0;
+
+	if (WARN_ON(dedupe_info == NULL))
+		return -EINVAL;
+
+	WARN_ON(!IS_ALIGNED(start, sectorsize));
+
+	/* Allocate after the checks above so the early returns don't leak */
+	tfm = dedupe_info->dedupe_driver;
+	shash = kmalloc(sizeof(*shash) + crypto_shash_descsize(tfm), GFP_NOFS);
+	if (!shash)
+		return -ENOMEM;
+
+	dedupe_bs = dedupe_info->blocksize;
+
+	shash->tfm = tfm;
+	shash->flags = 0;
+	ret = crypto_shash_init(shash);
+	if (ret)
+		goto out;
+	for (i = 0; sectorsize * i < dedupe_bs; i++) {
+		char *d;
+
+		p = find_get_page(inode->i_mapping,
+				  (start >> PAGE_SHIFT) + i);
+		if (WARN_ON(!p)) {
+			ret = -ENOENT;
+			goto out;
+		}
+		d = kmap(p);
+		ret = crypto_shash_update(shash, d, sectorsize);
+		kunmap(p);
+		put_page(p);
+		if (ret)
+			goto out;
+	}
+	ret = crypto_shash_final(shash, hash->hash);
+out:
+	kfree(shash);
+	return ret;
+}
-- 
2.19.1





Re: [PATCH] Btrfs: fix missing delayed iputs on unmount

2018-10-31 Thread Lu Fengqi
On Tue, Oct 30, 2018 at 05:14:42PM -0700, Omar Sandoval wrote:
>From: Omar Sandoval 
>
>There's a race between close_ctree() and cleaner_kthread().
>close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it
>sees it set, but this is racy; the cleaner might have already checked
>the bit and could be cleaning stuff. In particular, if it deletes unused
>block groups, it will create delayed iputs for the free space cache
>inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
>longer running delayed iputs after a commit. Therefore, if the cleaner
>creates more delayed iputs after delayed iputs are run in
>btrfs_commit_super(), we will leak inodes on unmount and get a busy

Since the assert added via commit e187831e1875 ("btrfs: assert on non-empty
delayed iputs") wasn't triggered, it doesn't seem to be the cause of the
inode leak.

-- 
Thanks,
Lu

>inode crash from the VFS.
>
>Fix it by parking the cleaner before we actually close anything. Then,
>any remaining delayed iputs will always be handled in
>btrfs_commit_super(). This also ensures that the commit in close_ctree()
>is really the last commit, so we can get rid of the commit in
>cleaner_kthread().
>
>Fixes: 30928e9baac2 ("btrfs: don't run delayed_iputs in commit")
>Signed-off-by: Omar Sandoval 
>---
>We found this with a stress test that our containers team runs. I'm
>wondering if this same race could have caused any other issues other
>than this new iput thing, but I couldn't identify any.
>
> fs/btrfs/disk-io.c | 40 +++-
> 1 file changed, 7 insertions(+), 33 deletions(-)
>
>diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>index b0ab41da91d1..7c17284ae3c2 100644
>--- a/fs/btrfs/disk-io.c
>+++ b/fs/btrfs/disk-io.c
>@@ -1664,9 +1664,8 @@ static int cleaner_kthread(void *arg)
>   struct btrfs_root *root = arg;
>   struct btrfs_fs_info *fs_info = root->fs_info;
>   int again;
>-  struct btrfs_trans_handle *trans;
> 
>-  do {
>+  while (1) {
>   again = 0;
> 
>   /* Make the cleaner go to sleep early. */
>@@ -1715,42 +1714,16 @@ static int cleaner_kthread(void *arg)
>*/
>   btrfs_delete_unused_bgs(fs_info);
> sleep:
>+  if (kthread_should_park())
>+  kthread_parkme();
>+  if (kthread_should_stop())
>+  return 0;
>   if (!again) {
>   set_current_state(TASK_INTERRUPTIBLE);
>-  if (!kthread_should_stop())
>-  schedule();
>+  schedule();
>   __set_current_state(TASK_RUNNING);
>   }
>-  } while (!kthread_should_stop());
>-
>-  /*
>-   * Transaction kthread is stopped before us and wakes us up.
>-   * However we might have started a new transaction and COWed some
>-   * tree blocks when deleting unused block groups for example. So
>-   * make sure we commit the transaction we started to have a clean
>-   * shutdown when evicting the btree inode - if it has dirty pages
>-   * when we do the final iput() on it, eviction will trigger a
>-   * writeback for it which will fail with null pointer dereferences
>-   * since work queues and other resources were already released and
>-   * destroyed by the time the iput/eviction/writeback is made.
>-   */
>-  trans = btrfs_attach_transaction(root);
>-  if (IS_ERR(trans)) {
>-  if (PTR_ERR(trans) != -ENOENT)
>-  btrfs_err(fs_info,
>-"cleaner transaction attach returned %ld",
>-PTR_ERR(trans));
>-  } else {
>-  int ret;
>-
>-  ret = btrfs_commit_transaction(trans);
>-  if (ret)
>-  btrfs_err(fs_info,
>-"cleaner open transaction commit returned %d",
>-ret);
>   }
>-
>-  return 0;
> }
> 
> static int transaction_kthread(void *arg)
>@@ -3931,6 +3904,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
>   int ret;
> 
>+  set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
>+  kthread_park(fs_info->cleaner_kthread);
> 
>   /* wait for the qgroup rescan worker to stop */
>   btrfs_qgroup_wait_for_completion(fs_info, false);
>-- 
>2.19.1
>
>
>
>




Re: [PATCH] Btrfs: fix missing delayed iputs on unmount

2018-10-31 Thread Lu Fengqi
On Tue, Oct 30, 2018 at 05:14:42PM -0700, Omar Sandoval wrote:
>From: Omar Sandoval 
>
>There's a race between close_ctree() and cleaner_kthread().
>close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it
>sees it set, but this is racy; the cleaner might have already checked
>the bit and could be cleaning stuff. In particular, if it deletes unused
>block groups, it will create delayed iputs for the free space cache
>inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
>longer running delayed iputs after a commit. Therefore, if the cleaner
>creates more delayed iputs after delayed iputs are run in
>btrfs_commit_super(), we will leak inodes on unmount and get a busy
>inode crash from the VFS.
>
>Fix it by parking the cleaner before we actually close anything. Then,
>any remaining delayed iputs will always be handled in
>btrfs_commit_super(). This also ensures that the commit in close_ctree()
>is really the last commit, so we can get rid of the commit in
>cleaner_kthread().
>
>Fixes: 30928e9baac2 ("btrfs: don't run delayed_iputs in commit")
>Signed-off-by: Omar Sandoval 
>---
>We found this with a stress test that our containers team runs. I'm
>wondering if this same race could have caused any other issues other
>than this new iput thing, but I couldn't identify any.

I noticed an inode leak issue in generic/475; whether I drop commit
30928e9baac2 ("btrfs: don't run delayed_iputs in commit") or apply
this patch, the issue still exists.

I have attached the dmesg.

>
> fs/btrfs/disk-io.c | 40 +++-
> 1 file changed, 7 insertions(+), 33 deletions(-)
>
>diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>index b0ab41da91d1..7c17284ae3c2 100644
>--- a/fs/btrfs/disk-io.c
>+++ b/fs/btrfs/disk-io.c
>@@ -1664,9 +1664,8 @@ static int cleaner_kthread(void *arg)
>   struct btrfs_root *root = arg;
>   struct btrfs_fs_info *fs_info = root->fs_info;
>   int again;
>-  struct btrfs_trans_handle *trans;
> 
>-  do {
>+  while (1) {
>   again = 0;
> 
>   /* Make the cleaner go to sleep early. */
>@@ -1715,42 +1714,16 @@ static int cleaner_kthread(void *arg)
>*/
>   btrfs_delete_unused_bgs(fs_info);
> sleep:
>+  if (kthread_should_park())
>+  kthread_parkme();
>+  if (kthread_should_stop())
>+  return 0;
>   if (!again) {
>   set_current_state(TASK_INTERRUPTIBLE);
>-  if (!kthread_should_stop())
>-  schedule();
>+  schedule();
>   __set_current_state(TASK_RUNNING);
>   }
>-  } while (!kthread_should_stop());
>-
>-  /*
>-   * Transaction kthread is stopped before us and wakes us up.
>-   * However we might have started a new transaction and COWed some
>-   * tree blocks when deleting unused block groups for example. So
>-   * make sure we commit the transaction we started to have a clean
>-   * shutdown when evicting the btree inode - if it has dirty pages
>-   * when we do the final iput() on it, eviction will trigger a
>-   * writeback for it which will fail with null pointer dereferences
>-   * since work queues and other resources were already released and
>-   * destroyed by the time the iput/eviction/writeback is made.
>-   */
>-  trans = btrfs_attach_transaction(root);
>-  if (IS_ERR(trans)) {
>-  if (PTR_ERR(trans) != -ENOENT)
>-  btrfs_err(fs_info,
>-"cleaner transaction attach returned %ld",
>-PTR_ERR(trans));
>-  } else {
>-  int ret;
>-
>-  ret = btrfs_commit_transaction(trans);
>-  if (ret)
>-  btrfs_err(fs_info,
>-"cleaner open transaction commit returned %d",
>-ret);
>   }
>-
>-  return 0;
> }
> 
> static int transaction_kthread(void *arg)
>@@ -3931,6 +3904,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
>   int ret;
> 
>+  set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
>+  kthread_park(fs_info->cleaner_kthread);

Since we are not going to call kthread_unpark(), I am not sure why
kthread_park() is used instead of kthread_stop() here. It looks like there
is no significant difference between stopping instantly and a delayed stop.

-- 
Thanks,
Lu

> 
>   /* wait for the qgroup rescan worker to stop */
>   btrfs_qgroup_wait_for_completion(fs_info, false);
>-- 
>2.19.1
>
>
>
>


[  366.955193] run fstests generic/475 at 2018-10-31 15:06:43
[  367.495791] BTRFS: device fsid 812f883c-40b2-4456-9769-b94ddf1cb07e devid 1 
transid 5 /dev/nvme0n1p2
[  367.624469] BTRFS info (device dm-3): disk space caching is enabled
[  367.627305] BTRFS info (device dm-3): has skinny extents
[  

[PATCH 3/3] btrfs: fix pinned underflow after transaction aborted

2018-10-24 Thread Lu Fengqi
When running generic/475, we may get the following warning in the dmesg.

[ 6902.102154] WARNING: CPU: 3 PID: 18013 at fs/btrfs/extent-tree.c:9776 
btrfs_free_block_groups+0x2af/0x3b0 [btrfs]
[ 6902.104886] Modules linked in: btrfs(O) xor zstd_decompress zstd_compress 
xxhash raid6_pq efivarfs xfs nvme nvme_core [last unloaded: btrfs]
[ 6902.109160] CPU: 3 PID: 18013 Comm: umount Tainted: GW  O  
4.19.0-rc8+ #8
[ 6902.110971] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 
02/06/2015
[ 6902.112857] RIP: 0010:btrfs_free_block_groups+0x2af/0x3b0 [btrfs]
[ 6902.114377] Code: c6 48 89 04 24 48 8b 83 50 17 00 00 48 39 c6 0f 84 ab 00 
00 00 4c 8b ab 50 17 00 00 49 83 bd 50 ff ff ff 00 0f 84 b4 00 00 00 <0f> 0b 31 
c9 49 8d b5 f8 fe ff ff 31 d2 48
89 df e8 fc 76 ff ff 49
[ 6902.118921] RSP: 0018:c9000459bdb0 EFLAGS: 00010286
[ 6902.120315] RAX: 880175050bb0 RBX: 8801124a8000 RCX: 00170007
[ 6902.121969] RDX: 0002 RSI: 00170007 RDI: 8125fb74
[ 6902.123716] RBP: 880175055d10 R08:  R09: 
[ 6902.125417] R10:  R11:  R12: 880175055d88
[ 6902.127129] R13: 880175050bb0 R14:  R15: dead0100
[ 6902.129060] FS:  7f4507223780() GS:88017ba0() 
knlGS:
[ 6902.130996] CS:  0010 DS:  ES:  CR0: 80050033
[ 6902.132558] CR2: 5623599cac78 CR3: 00014b71 CR4: 003606e0
[ 6902.134270] DR0:  DR1:  DR2: 
[ 6902.135981] DR3:  DR6: fffe0ff0 DR7: 0400
[ 6902.137836] Call Trace:
[ 6902.138939]  close_ctree+0x171/0x330 [btrfs]
[ 6902.140181]  ? kthread_stop+0x146/0x1f0
[ 6902.141277]  generic_shutdown_super+0x6c/0x100
[ 6902.142517]  kill_anon_super+0x14/0x30
[ 6902.143554]  btrfs_kill_super+0x13/0x100 [btrfs]
[ 6902.144790]  deactivate_locked_super+0x2f/0x70
[ 6902.146014]  cleanup_mnt+0x3b/0x70
[ 6902.147020]  task_work_run+0x9e/0xd0
[ 6902.148036]  do_syscall_64+0x470/0x600
[ 6902.149142]  ? trace_hardirqs_off_thunk+0x1a/0x1c
[ 6902.150375]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 6902.151640] RIP: 0033:0x7f45077a6a7b
[ 6902.152782] Code: 23 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 90 f3 0f 1e fa 
31 f6 e9 05 00 00 00 90 0f 1f 40 00 f3 0f 1e fa b8 a6 00 00 00 0f 05 <48> 3d 01 
f0 ff ff 73 01 c3 48 8b 0d b5 23
0c 00 f7 d8 64 89 01 48
[ 6902.157324] RSP: 002b:7ffd589f3e68 EFLAGS: 0246 ORIG_RAX: 
00a6
[ 6902.159187] RAX:  RBX: 55e8eec732b0 RCX: 7f45077a6a7b
[ 6902.160834] RDX: 0001 RSI:  RDI: 55e8eec73490
[ 6902.162526] RBP:  R08: 55e8eec734b0 R09: 7ffd589f26c0
[ 6902.164141] R10:  R11: 0246 R12: 55e8eec73490
[ 6902.165815] R13: 7f4507ac61a4 R14:  R15: 7ffd589f40d8
[ 6902.167553] irq event stamp: 0
[ 6902.168998] hardirqs last  enabled at (0): [<>]   
(null)
[ 6902.170731] hardirqs last disabled at (0): [] 
copy_process.part.55+0x3b0/0x1f00
[ 6902.172773] softirqs last  enabled at (0): [] 
copy_process.part.55+0x3b0/0x1f00
[ 6902.174671] softirqs last disabled at (0): [<>]   
(null)
[ 6902.176407] ---[ end trace 463138c2986b275c ]---
[ 6902.177636] BTRFS info (device dm-3): space_info 4 has 273465344 free, is 
not full
[ 6902.179453] BTRFS info (device dm-3): space_info total=276824064, 
used=4685824, pinned=18446744073708158976, reserved=0, may_use=0, readonly=65536

^^^

obviously an underflow

When transaction_kthread is running cleanup_transaction(), another
fsstress task may be running btrfs_commit_transaction() at the same time.
btrfs_finish_extent_commit() may then fetch the same extent range that
btrfs_destroy_pinned_extent() already got and unpin it a second time,
which causes the pinned counter to underflow.
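
Schematically, the racy interleaving looks like this:

  transaction_kthread                     fsstress task
  ---------------------------------       ---------------------------------
  cleanup_transaction()                   btrfs_commit_transaction()
    btrfs_destroy_pinned_extent()           btrfs_finish_extent_commit()
      find_first_extent_bit()
                                              find_first_extent_bit()
                                              (returns the same [start, end])
      clear_extent_dirty()
      unpin_extent_range()
        -> pinned -= len                      clear_extent_dirty()
                                              unpin_extent_range()
                                                -> pinned -= len again, underflow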

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/disk-io.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0ab41da91d1..00ee5e37e989 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4359,13 +4359,23 @@ static int btrfs_destroy_pinned_extent(struct 
btrfs_fs_info *fs_info,
unpin = pinned_extents;
 again:
while (1) {
+   /*
+* The btrfs_finish_extent_commit() may get the same range as
+* ours between find_first_extent_bit and clear_extent_dirty.
+	 * Hence, hold the unused_bg_unpin_mutex to avoid double
+	 * unpinning the same extent range.
+*/
+   mutex_lock(&fs_info->unused_bg_unpin_mutex);
	ret = find_first_extent_bit(unpin, 0, &start, &end,
   

[PATCH 0/3] fix pinned underflow in generic/475

2018-10-24 Thread Lu Fengqi
When running generic/475, a pinned underflow may occur. This patchset will
fix the problem, but there are still other warnings that need to be
addressed in this case.

Patches 1-2 introduce a macro and wrappers to help detect underflows
Patch 3 is the fix for the pinned underflow

Lu Fengqi (2):
  btrfs: extent-tree: Detect bytes_pinned underflow earlier
  btrfs: fix pinned underflow after transaction aborted

Qu Wenruo (1):
  btrfs: extent-tree: Detect bytes_may_use underflow earlier

 fs/btrfs/disk-io.c | 12 +-
 fs/btrfs/extent-tree.c | 53 ++
 2 files changed, 44 insertions(+), 21 deletions(-)

-- 
2.19.1





[PATCH 2/3] btrfs: extent-tree: Detect bytes_pinned underflow earlier

2018-10-24 Thread Lu Fengqi
Introduce a new wrapper update_bytes_pinned to replace open coded
bytes_pinned modifiers.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/extent-tree.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c0147a1307e7..bb91db944d21 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -65,6 +65,7 @@ static inline void update_##name(struct btrfs_space_info 
*sinfo,  \
 }
 
 DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
+DECLARE_SPACE_INFO_UPDATE(bytes_pinned);
 
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
   struct btrfs_delayed_ref_node *node, u64 parent,
@@ -6163,7 +6164,7 @@ static int update_block_group(struct btrfs_trans_handle 
*trans,
old_val -= num_bytes;
btrfs_set_block_group_used(>item, old_val);
cache->pinned += num_bytes;
-   cache->space_info->bytes_pinned += num_bytes;
+   update_bytes_pinned(cache->space_info, num_bytes);
cache->space_info->bytes_used -= num_bytes;
cache->space_info->disk_used -= num_bytes * factor;
	spin_unlock(&cache->lock);
@@ -6234,7 +6235,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info,
	spin_lock(&cache->space_info->lock);
	spin_lock(&cache->lock);
cache->pinned += num_bytes;
-   cache->space_info->bytes_pinned += num_bytes;
+   update_bytes_pinned(cache->space_info, num_bytes);
if (reserved) {
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
@@ -6599,7 +6600,7 @@ static int unpin_extent_range(struct btrfs_fs_info 
*fs_info,
	spin_lock(&space_info->lock);
	spin_lock(&cache->lock);
cache->pinned -= len;
-   space_info->bytes_pinned -= len;
+   update_bytes_pinned(space_info, -len);
 
trace_btrfs_space_reservation(fs_info, "pinned",
  space_info->flags, len, 0);
@@ -10710,7 +10711,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info 
*fs_info)
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
 
-   space_info->bytes_pinned -= block_group->pinned;
+   update_bytes_pinned(space_info, -block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
	percpu_counter_add_batch(&space_info->total_bytes_pinned,
   -block_group->pinned,
-- 
2.19.1





[PATCH 1/3] btrfs: extent-tree: Detect bytes_may_use underflow earlier

2018-10-24 Thread Lu Fengqi
From: Qu Wenruo 

Although we have space_info::bytes_may_use underflow detection in
btrfs_free_reserved_data_space_noquota(), there are more callers that
subtract from space_info::bytes_may_use.

So instead of doing underflow detection for every caller, introduce a
new wrapper update_bytes_may_use() to replace the open-coded
bytes_may_use modifiers.

This also introduces a macro to declare more such wrappers, but currently
space_info::bytes_may_use is the most interesting one.
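
For reference, a hand expansion of DECLARE_SPACE_INFO_UPDATE(bytes_may_use)
(derived from the macro in the diff below, not verbatim preprocessor
output):

    static inline void update_bytes_may_use(struct btrfs_space_info *sinfo,
                                            s64 bytes)
    {
            /* catch a subtraction that would take the counter below zero */
            if (bytes < 0 && sinfo->bytes_may_use < -bytes) {
                    WARN_ON(1);
                    sinfo->bytes_may_use = 0;
                    return;
            }
            sinfo->bytes_may_use += bytes;
    }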

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/extent-tree.c | 44 +++---
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a1febf155747..c0147a1307e7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -51,6 +51,21 @@ enum {
CHUNK_ALLOC_FORCE = 2,
 };
 
+/* Helper function to detect various space info bytes underflow */
+#define DECLARE_SPACE_INFO_UPDATE(name)
\
+static inline void update_##name(struct btrfs_space_info *sinfo,   \
+s64 bytes) \
+{  \
+   if (bytes < 0 && sinfo->name < -bytes) {\
+   WARN_ON(1); \
+   sinfo->name = 0;\
+   return; \
+   }   \
+   sinfo->name += bytes;   \
+}
+
+DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
+
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
   struct btrfs_delayed_ref_node *node, u64 parent,
   u64 root_objectid, u64 owner_objectid,
@@ -4256,7 +4271,7 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode 
*inode, u64 bytes)
  data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
-   data_sinfo->bytes_may_use += bytes;
+   update_bytes_may_use(data_sinfo, bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
  data_sinfo->flags, bytes, 1);
	spin_unlock(&data_sinfo->lock);
@@ -4309,10 +4324,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode 
*inode, u64 start,
 
data_sinfo = fs_info->data_sinfo;
	spin_lock(&data_sinfo->lock);
-   if (WARN_ON(data_sinfo->bytes_may_use < len))
-   data_sinfo->bytes_may_use = 0;
-   else
-   data_sinfo->bytes_may_use -= len;
+   update_bytes_may_use(data_sinfo, -len);
trace_btrfs_space_reservation(fs_info, "space_info",
  data_sinfo->flags, len, 0);
	spin_unlock(&data_sinfo->lock);
@@ -5108,7 +5120,7 @@ static int wait_reserve_ticket(struct btrfs_fs_info 
*fs_info,
	list_del_init(&ticket->list);
if (ticket->bytes && ticket->bytes < orig_bytes) {
u64 num_bytes = orig_bytes - ticket->bytes;
-   space_info->bytes_may_use -= num_bytes;
+   update_bytes_may_use(space_info, -num_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
  space_info->flags, num_bytes, 0);
}
@@ -5154,13 +5166,13 @@ static int __reserve_metadata_bytes(struct 
btrfs_fs_info *fs_info,
 * If not things get more complicated.
 */
if (used + orig_bytes <= space_info->total_bytes) {
-   space_info->bytes_may_use += orig_bytes;
+   update_bytes_may_use(space_info, orig_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
  space_info->flags, orig_bytes, 1);
ret = 0;
} else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
  system_chunk)) {
-   space_info->bytes_may_use += orig_bytes;
+   update_bytes_may_use(space_info, orig_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
  space_info->flags, orig_bytes, 1);
ret = 0;
@@ -5223,7 +5235,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info 
*fs_info,
if (ticket.bytes) {
if (ticket.bytes < orig_bytes) {
u64 num_bytes = orig_bytes - ticket.bytes;
-   space_info->bytes_may_use -= num_bytes;
+   update_bytes_may_use(space_info, -num_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
  space_info->flags,
  num_bytes, 0);

[PATCH] btrfs: delayed-ref: extract find_first_ref_head from find_ref_head

2018-10-15 Thread Lu Fengqi
find_ref_head() should not return the first entry when no exact match is
found; move that hidden behavior up to the caller.

Besides, remove the now-useless local variables in btrfs_select_ref_head().
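
A quick sketch of the semantics after this change (the bytenr values are
made up for illustration), with heads at bytenrs 100 and 200:

    find_ref_head(dr, 150, true)  -> head at 200  (next bigger, as before)
    find_ref_head(dr, 250, true)  -> NULL         (no hidden wraparound)
    find_first_ref_head(dr)       -> head at 100  (explicit wraparound)

so btrfs_select_ref_head() now spells out the wraparound instead of
relying on the old hidden fallback in find_ref_head().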

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/delayed-ref.c | 45 +++---
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 6e8be384398e..a92f104cf06f 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -164,14 +164,28 @@ static struct btrfs_delayed_ref_node* tree_insert(struct 
rb_root_cached *root,
return NULL;
 }
 
+static struct btrfs_delayed_ref_head *find_first_ref_head(
+   struct btrfs_delayed_ref_root *dr)
+{
+   struct rb_node *n;
+   struct btrfs_delayed_ref_head *entry;
+
+   n = rb_first_cached(&dr->href_root);
+   if (!n)
+   return NULL;
+
+   entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
+
+   return entry;
+}
+
 /*
  * find an head entry based on bytenr. This returns the delayed ref
  * head if it was able to find one, or NULL if nothing was in that spot.
  * If return_bigger is given, the next bigger entry is returned if no exact
- * match is found. But if no bigger one is found then the first node of the
- * ref head tree will be returned.
+ * match is found.
  */
-static struct btrfs_delayed_ref_head* find_ref_head(
+static struct btrfs_delayed_ref_head *find_ref_head(
struct btrfs_delayed_ref_root *dr, u64 bytenr,
bool return_bigger)
 {
@@ -195,10 +209,9 @@ static struct btrfs_delayed_ref_head* find_ref_head(
if (bytenr > entry->bytenr) {
			n = rb_next(&entry->href_node);
if (!n)
-   n = rb_first_cached(&dr->href_root);
+   return NULL;
entry = rb_entry(n, struct btrfs_delayed_ref_head,
 href_node);
-   return entry;
}
return entry;
}
@@ -358,33 +371,25 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
struct btrfs_delayed_ref_root *delayed_refs)
 {
struct btrfs_delayed_ref_head *head;
-   u64 start;
-   bool loop = false;
 
 again:
-   start = delayed_refs->run_delayed_start;
-   head = find_ref_head(delayed_refs, start, true);
-   if (!head && !loop) {
+   head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
+true);
+   if (!head && delayed_refs->run_delayed_start != 0) {
delayed_refs->run_delayed_start = 0;
-   start = 0;
-   loop = true;
-   head = find_ref_head(delayed_refs, start, true);
-   if (!head)
-   return NULL;
-   } else if (!head && loop) {
-   return NULL;
+   head = find_first_ref_head(delayed_refs);
}
+   if (!head)
+   return NULL;
 
while (head->processing) {
struct rb_node *node;
 
		node = rb_next(&head->href_node);
if (!node) {
-   if (loop)
+   if (delayed_refs->run_delayed_start == 0)
return NULL;
delayed_refs->run_delayed_start = 0;
-   start = 0;
-   loop = true;
goto again;
}
head = rb_entry(node, struct btrfs_delayed_ref_head,
-- 
2.19.1





Re: [PATCH 0/6] Some trivial cleanup about delayed-refs

2018-10-14 Thread Lu Fengqi
On Thu, Oct 11, 2018 at 01:51:37PM +0200, David Sterba wrote:
>On Thu, Oct 11, 2018 at 01:40:32PM +0800, Lu Fengqi wrote:
>> There is no functional change. Just improve readability.
>> 
>> PATCH 1-4 parameter cleanup patches
>> PATCH 5 cleanup about btrfs_select_ref_head
>> PATCH 6 switch int to bool; add some comment
>> 
>> Lu Fengqi (6):
>>   btrfs: delayed-ref: pass delayed_refs directly to
>> btrfs_select_ref_head()
>>   btrfs: delayed-ref: pass delayed_refs directly to
>> btrfs_delayed_ref_lock()
>>   btrfs: remove fs_info from btrfs_check_space_for_delayed_refs
>>   btrfs: remove fs_info from btrfs_should_throttle_delayed_refs
>>   btrfs: simplify btrfs_select_ref_head and cleanup some local variables
>>   btrfs: switch return_bigger to bool in find_ref_head
>
>1-4 and 6 added to misc-next, thanks.

Patch 2 is not in the misc-next branch. Was it forgotten?

-- 
Thanks,
Lu




Re: [PATCH 5/6] btrfs: simplify btrfs_select_ref_head and cleanup some local variables

2018-10-14 Thread Lu Fengqi
On Thu, Oct 11, 2018 at 02:45:04PM +0200, David Sterba wrote:
>On Thu, Oct 11, 2018 at 03:28:15PM +0300, Nikolay Borisov wrote:
>> > I noticed that there is a macro called SCRAMBLE_DELAYED_REFS in the
>> > extent-tree.c. I am a bit curious whether it has been forgotten by
>> > everyone, I have not found any test results about its performance impact.
>> 
>> I guess it was used during testing but nothing currently sets it. I.e. it
>> might make sense to enable it if BTRFS_DEBUG is set.
>
>Agreed, the way the scrambling is supposed to be used does not align
>very well with the typical testing workflow, so adding it to the
>BTRFS_DEBUG set is ok, unless there are severe performance problems.

I will add it to the BTRFS_DEBUG set, and test if it has severe
performance problems.

>
>The part in btrfs_run_delayed_refs would be better hidden in a function
>similar to btrfs_debug_check_extent_io_range or btrfs_leak_debug_check.

Got it.

-- 
Thanks,
Lu




Re: [PATCH 5/6] btrfs: simplify btrfs_select_ref_head and cleanup some local variables

2018-10-14 Thread Lu Fengqi
On Thu, Oct 11, 2018 at 03:28:15PM +0300, Nikolay Borisov wrote:
>
>
>On 11.10.2018 15:15, Lu Fengqi wrote:
>> On Thu, Oct 11, 2018 at 09:40:52AM +0300, Nikolay Borisov wrote:
>>>
>>>
>>> On 11.10.2018 08:40, Lu Fengqi wrote:
>>>> If the return value of find_ref_head() is NULL, the only possibility is
>>>> that delayed_refs' head ref rbtree is empty. Hence, the second
>>>> find_ref_head() is pointless.
>>>> Besides, the local variables loop and start are unnecessary, just remove
>>>> them.
>>>
>>> So the objective of that function is to get a reference to the first
>>> delayed head which is not processed. This is done by essentially keeping
>>> track of the last range that was processed in
>>> delayed_refs->run_delayed_start
>>>>
>>>> Signed-off-by: Lu Fengqi 
>>>> ---
>>>>  fs/btrfs/delayed-ref.c | 17 +++--
>>>>  1 file changed, 3 insertions(+), 14 deletions(-)
>>>>
>>>> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
>>>> index 885581852bea..2726d2fb4bbe 100644
>>>> --- a/fs/btrfs/delayed-ref.c
>>>> +++ b/fs/btrfs/delayed-ref.c
>>>> @@ -354,20 +354,11 @@ struct btrfs_delayed_ref_head *
>>>>  btrfs_select_ref_head(struct btrfs_delayed_ref_root *delayed_refs)
>>>>  {
>>>>struct btrfs_delayed_ref_head *head;
>>>> -  u64 start;
>>>> -  bool loop = false;
>>>>  
>>>>  again:
>>>> -  start = delayed_refs->run_delayed_start;
>>>> -  head = find_ref_head(delayed_refs, start, 1);
>>>> -  if (!head && !loop) {
>>>> +  head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, 1);
>>>> +  if (!head) {
>>>>delayed_refs->run_delayed_start = 0;
>>>> -  start = 0;
>>>> -  loop = true;
>>>> -  head = find_ref_head(delayed_refs, start, 1);
>>>> -  if (!head)
>>>> -  return NULL;
>>>> -  } else if (!head && loop) {
>>>
>>> I believe this will have a negative impact since it actually will
>>> prevent finding a head which was added BEFORE the last processed head.
>>> So when a ref head is selected in btrfs_obtain_ref_head then the
>>> delayed_refs->lock is dropped and the given head is locked and
>>> delayed_refs->run_delayed_start points to the end of the selected range
>>> that the head represents. At this point it's possible that another
>>> thread modifies a different range which is before the one we have
>>> selected so graphically it will be something like:
>>>
>>>
>>> ---[HEAD2]->[HEAD1]--
>>> 0                    N
>>>
>>> Where HEAD1 is the head returned from first invocation of
>>> btrfs_obtain_ref_head. Once  btrfs_obtain_ref_head is called the 2nd
>>> time it will not find HEAD2 so will just reset run_delayed_start to 0
>>> and return. So it will be up to another run of the delayed refs to
>>> actually find head2. Essentially you made btrfs_obtain_ref_head less
>> 
>> Not exactly. In fact, find_ref_head hides such a logic. When
>> return_bigger is set, if there is no larger entry to return, the first
>> entry will be returned. Please see the comment I add in the PATCH 6.
>> 
>> Hence, the 2nd invocation of btrfs_obtain_ref_head still will return
>> HEAD2. There is no functional change here.
>> 
>> However, your question makes me consider whether such hidden logic
>> should be extracted from find_ref_head to btrfs_select_ref_head.
>
>Right, I agree with you. As it stands I would expect that if
>return_bigger is true to specifically return a bigger entry or if
>nothing is found to return null. IMO this behavior is higher level and

This is also exactly what I want. The patch is on the way.

>belongs to btrfs_delayed_ref_head.
>
>> 
>>> greedy. Have you characterized what kind of performance impact this have?
>> 
>> I noticed that there is a macro called SCRAMBLE_DELAYED_REFS in the
>> extent-tree.c. I am a bit curious whether it has been forgotten by
>> everyone, I have not found any test results about its performance impact.
>
>I guess it was used during testing but nothing currently sets it. I.e. it
>might make sense to enable it if BTRFS_DEBUG is set.
>

Makes sense.

-- 
Thanks,
Lu




Re: [PATCH 5/6] btrfs: simplify btrfs_select_ref_head and cleanup some local variables

2018-10-11 Thread Lu Fengqi
On Thu, Oct 11, 2018 at 09:40:52AM +0300, Nikolay Borisov wrote:
>
>
>On 11.10.2018 08:40, Lu Fengqi wrote:
>> If the return value of find_ref_head() is NULL, the only possibility is
>> that delayed_refs' head ref rbtree is empty. Hence, the second
>> find_ref_head() is pointless.
>> Besides, the local variables loop and start are unnecessary, just remove
>> them.
>
>So the objective of that function is to get a reference to the first
>delayed head which is not processed. This is done by essentially keeping
>track of the last range that was processed in
>delayed_refs->run_delayed_start
>> 
>> Signed-off-by: Lu Fengqi 
>> ---
>>  fs/btrfs/delayed-ref.c | 17 +++--
>>  1 file changed, 3 insertions(+), 14 deletions(-)
>> 
>> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
>> index 885581852bea..2726d2fb4bbe 100644
>> --- a/fs/btrfs/delayed-ref.c
>> +++ b/fs/btrfs/delayed-ref.c
>> @@ -354,20 +354,11 @@ struct btrfs_delayed_ref_head *
>>  btrfs_select_ref_head(struct btrfs_delayed_ref_root *delayed_refs)
>>  {
>>  struct btrfs_delayed_ref_head *head;
>> -u64 start;
>> -bool loop = false;
>>  
>>  again:
>> -start = delayed_refs->run_delayed_start;
>> -head = find_ref_head(delayed_refs, start, 1);
>> -if (!head && !loop) {
>> +head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, 1);
>> +if (!head) {
>>  delayed_refs->run_delayed_start = 0;
>> -start = 0;
>> -loop = true;
>> -head = find_ref_head(delayed_refs, start, 1);
>> -if (!head)
>> -return NULL;
>> -} else if (!head && loop) {
>
>I believe this will have a negative impact since it actually will
>prevent finding a head which was added BEFORE the last processed head.
>So when a ref head is selected in btrfs_obtain_ref_head then the
>delayed_refs->lock is dropped and the given head is locked and
>delayed_refs->run_delayed_start points to the end of the selected range
>that the head represents. At this point it's possible that another
>thread modifies a different range which is before the one we have
>selected so graphically it will be something like:
>
>
>---[HEAD2]->[HEAD1]--
>0                    N
>
>Where HEAD1 is the head returned from first invocation of
>btrfs_obtain_ref_head. Once  btrfs_obtain_ref_head is called the 2nd
>time it will not find HEAD2 so will just reset run_delayed_start to 0
>and return. So it will be up to another run of the delayed refs to
>actually find head2. Essentially you made btrfs_obtain_ref_head less

Not exactly. In fact, find_ref_head hides such a logic. When
return_bigger is set, if there is no larger entry to return, the first
entry will be returned. Please see the comment I add in the PATCH 6.

Hence, the 2nd invocation of btrfs_obtain_ref_head still will return
HEAD2. There is no functional change here.

However, your question makes me consider whether such hidden logic
should be extracted from find_ref_head to btrfs_select_ref_head.

>greedy. Have you characterized what kind of performance impact this have?

I noticed that there is a macro called SCRAMBLE_DELAYED_REFS in the
extent-tree.c. I am a bit curious whether it has been forgotten by
everyone, I have not found any test results about its performance impact.

-- 
Thanks,
Lu

>
>
>
>
>>  return NULL;
>>  }
>>  
>> @@ -376,11 +367,9 @@ btrfs_select_ref_head(struct btrfs_delayed_ref_root 
>> *delayed_refs)
>>  
>>  node = rb_next(&head->href_node);
>>  if (!node) {
>> -if (loop)
>> +if (delayed_refs->run_delayed_start == 0)
>>  return NULL;
>>  delayed_refs->run_delayed_start = 0;
>> -start = 0;
>> -loop = true;
>>  goto again;
>>  }
>>  head = rb_entry(node, struct btrfs_delayed_ref_head,
>> 
>
>




[PATCH] btrfs: qgroup: move the qgroup->members check out from (!qgroup)'s else branch

2018-10-10 Thread Lu Fengqi
There is no reason to put this check in the (!qgroup) else branch, because
if qgroup is NULL, we go to out directly. So move the check out to reduce
the indentation.

No functional change.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/qgroup.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 27f517315388..af65ab1640b0 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1416,13 +1416,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle 
*trans, u64 qgroupid)
if (!qgroup) {
ret = -ENOENT;
goto out;
-   } else {
-   /* check if there are no children of this qgroup */
-   if (!list_empty(>members)) {
-   ret = -EBUSY;
-   goto out;
-   }
}
+
+   /* check if there are no children of this qgroup */
+   if (!list_empty(>members)) {
+   ret = -EBUSY;
+   goto out;
+   }
+
ret = del_qgroup_item(trans, qgroupid);
if (ret && ret != -ENOENT)
goto out;
-- 
2.19.1





[PATCH 0/6] Some trivial cleanup about delayed-refs

2018-10-10 Thread Lu Fengqi
There is no functional change. Just improve readability.

PATCH 1-4 parameter cleanup patches
PATCH 5 cleanup about btrfs_select_ref_head
PATCH 6 switch int to bool; add some comment

Lu Fengqi (6):
  btrfs: delayed-ref: pass delayed_refs directly to
btrfs_select_ref_head()
  btrfs: delayed-ref: pass delayed_refs directly to
btrfs_delayed_ref_lock()
  btrfs: remove fs_info from btrfs_check_space_for_delayed_refs
  btrfs: remove fs_info from btrfs_should_throttle_delayed_refs
  btrfs: simplify btrfs_select_ref_head and cleanup some local variables
  btrfs: switch return_bigger to bool in find_ref_head

 fs/btrfs/ctree.h   |  6 ++
 fs/btrfs/delayed-ref.c | 35 ++-
 fs/btrfs/delayed-ref.h |  4 ++--
 fs/btrfs/extent-tree.c | 15 +++
 fs/btrfs/inode.c   |  7 +++
 fs/btrfs/transaction.c |  4 ++--
 6 files changed, 26 insertions(+), 45 deletions(-)

-- 
2.19.1





[PATCH 2/6] btrfs: delayed-ref: pass delayed_refs directly to btrfs_delayed_ref_lock()

2018-10-10 Thread Lu Fengqi
Since trans is only used for referring to delayed_refs, there is no need
to pass it instead of delayed_refs to btrfs_delayed_ref_lock().

No functional change.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/delayed-ref.c | 5 +
 fs/btrfs/delayed-ref.h | 2 +-
 fs/btrfs/extent-tree.c | 2 +-
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 13ae86252c4c..885581852bea 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -204,12 +204,9 @@ static struct btrfs_delayed_ref_head* find_ref_head(
return NULL;
 }
 
-int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
   struct btrfs_delayed_ref_head *head)
 {
-   struct btrfs_delayed_ref_root *delayed_refs;
-
-   delayed_refs = &trans->transaction->delayed_refs;
lockdep_assert_held(_refs->lock);
if (mutex_trylock(>mutex))
return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index aa66ad6919ab..ef6f5cf75b3e 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -255,7 +255,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle 
*trans,
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
u64 bytenr);
-int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
   struct btrfs_delayed_ref_head *head);
 static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head 
*head)
 {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 22acc1545147..77156bd2a9a7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2521,7 +2521,7 @@ static struct btrfs_delayed_ref_head 
*btrfs_obtain_ref_head(
 * Grab the lock that says we are going to process all the refs for
 * this head
 */
-   ret = btrfs_delayed_ref_lock(trans, head);
+   ret = btrfs_delayed_ref_lock(delayed_refs, head);
	spin_unlock(&delayed_refs->lock);
 
/*
-- 
2.19.1





[PATCH 1/6] btrfs: delayed-ref: pass delayed_refs directly to btrfs_select_ref_head()

2018-10-10 Thread Lu Fengqi
Since trans is only used for referring to delayed_refs, there is no need
to pass it instead of delayed_refs to btrfs_select_ref_head().

No functional change.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/delayed-ref.c | 5 +
 fs/btrfs/delayed-ref.h | 2 +-
 fs/btrfs/extent-tree.c | 2 +-
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 7f567c944fec..13ae86252c4c 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -354,15 +354,12 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info 
*fs_info, u64 seq)
 }
 
 struct btrfs_delayed_ref_head *
-btrfs_select_ref_head(struct btrfs_trans_handle *trans)
+btrfs_select_ref_head(struct btrfs_delayed_ref_root *delayed_refs)
 {
-   struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *head;
u64 start;
bool loop = false;
 
-   delayed_refs = &trans->transaction->delayed_refs;
-
 again:
start = delayed_refs->run_delayed_start;
head = find_ref_head(delayed_refs, start, 1);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index c3e3486a126c..aa66ad6919ab 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -264,7 +264,7 @@ static inline void btrfs_delayed_ref_unlock(struct 
btrfs_delayed_ref_head *head)
 
 
 struct btrfs_delayed_ref_head *
-btrfs_select_ref_head(struct btrfs_trans_handle *trans);
+btrfs_select_ref_head(struct btrfs_delayed_ref_root *delayed_refs);
 
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 22b9269ae84c..22acc1545147 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2511,7 +2511,7 @@ static struct btrfs_delayed_ref_head 
*btrfs_obtain_ref_head(
int ret;
 
	spin_lock(&delayed_refs->lock);
-   head = btrfs_select_ref_head(trans);
+   head = btrfs_select_ref_head(delayed_refs);
if (!head) {
		spin_unlock(&delayed_refs->lock);
return head;
-- 
2.19.1





[PATCH 5/6] btrfs: simplify btrfs_select_ref_head and cleanup some local variables

2018-10-10 Thread Lu Fengqi
If the return value of find_ref_head() is NULL, the only possibility is
that delayed_refs' head ref rbtree is empty. Hence, the second
find_ref_head() is pointless.

Besides, the local variables loop and start are unnecessary; just remove
them.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/delayed-ref.c | 17 +++--
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 885581852bea..2726d2fb4bbe 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -354,20 +354,11 @@ struct btrfs_delayed_ref_head *
 btrfs_select_ref_head(struct btrfs_delayed_ref_root *delayed_refs)
 {
struct btrfs_delayed_ref_head *head;
-   u64 start;
-   bool loop = false;
 
 again:
-   start = delayed_refs->run_delayed_start;
-   head = find_ref_head(delayed_refs, start, 1);
-   if (!head && !loop) {
+   head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, 1);
+   if (!head) {
delayed_refs->run_delayed_start = 0;
-   start = 0;
-   loop = true;
-   head = find_ref_head(delayed_refs, start, 1);
-   if (!head)
-   return NULL;
-   } else if (!head && loop) {
return NULL;
}
 
@@ -376,11 +367,9 @@ btrfs_select_ref_head(struct btrfs_delayed_ref_root 
*delayed_refs)
 
		node = rb_next(&head->href_node);
if (!node) {
-   if (loop)
+   if (delayed_refs->run_delayed_start == 0)
return NULL;
delayed_refs->run_delayed_start = 0;
-   start = 0;
-   loop = true;
goto again;
}
head = rb_entry(node, struct btrfs_delayed_ref_head,
-- 
2.19.1





[PATCH 4/6] btrfs: remove fs_info from btrfs_should_throttle_delayed_refs

2018-10-10 Thread Lu Fengqi
The avg_delayed_ref_runtime can be referenced from the transaction handle.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h   | 3 +--
 fs/btrfs/extent-tree.c | 5 ++---
 fs/btrfs/inode.c   | 5 ++---
 fs/btrfs/transaction.c | 2 +-
 4 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4002c9fd924b..68ca41dbbef3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2598,8 +2598,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct 
btrfs_fs_info *fs_info,
return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
 }
 
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
-  struct btrfs_fs_info *fs_info);
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans);
 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 const u64 start);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 831dc2ac1942..241de034ba09 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2826,8 +2826,7 @@ int btrfs_check_space_for_delayed_refs(struct 
btrfs_trans_handle *trans)
return ret;
 }
 
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
-  struct btrfs_fs_info *fs_info)
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
 {
u64 num_entries =
	atomic_read(&trans->transaction->delayed_refs.num_entries);
@@ -2835,7 +2834,7 @@ int btrfs_should_throttle_delayed_refs(struct 
btrfs_trans_handle *trans,
u64 val;
 
smp_mb();
-   avg_runtime = fs_info->avg_delayed_ref_runtime;
+   avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
val = num_entries * avg_runtime;
if (val >= NSEC_PER_SEC)
return 1;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6a5557e8909d..f22f77172c5f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4713,7 +4713,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle 
*trans,
btrfs_abort_transaction(trans, ret);
break;
}
-   if (btrfs_should_throttle_delayed_refs(trans, fs_info))
+   if (btrfs_should_throttle_delayed_refs(trans))
btrfs_async_run_delayed_refs(fs_info,
trans->delayed_ref_updates * 2,
trans->transid, 0);
@@ -4722,8 +4722,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle 
*trans,
 extent_num_bytes)) {
should_end = true;
}
-   if (btrfs_should_throttle_delayed_refs(trans,
-  fs_info))
+   if (btrfs_should_throttle_delayed_refs(trans))
should_throttle = true;
}
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c5015458c5c8..5686290a50e1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -835,7 +835,7 @@ static int __btrfs_end_transaction(struct 
btrfs_trans_handle *trans,
trans->delayed_ref_updates = 0;
if (!trans->sync) {
must_run_delayed_refs =
-   btrfs_should_throttle_delayed_refs(trans, info);
+   btrfs_should_throttle_delayed_refs(trans);
cur = max_t(unsigned long, cur, 32);
 
/*
-- 
2.19.1





[PATCH 3/6] btrfs: remove fs_info from btrfs_check_space_for_delayed_refs

2018-10-10 Thread Lu Fengqi
It can be referenced from the transaction handle.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h   | 3 +--
 fs/btrfs/extent-tree.c | 6 +++---
 fs/btrfs/inode.c   | 2 +-
 fs/btrfs/transaction.c | 2 +-
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 15c659f23411..4002c9fd924b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2600,8 +2600,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct 
btrfs_fs_info *fs_info,
 
 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
   struct btrfs_fs_info *fs_info);
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
-  struct btrfs_fs_info *fs_info);
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans);
 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 const u64 start);
 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 77156bd2a9a7..831dc2ac1942 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2789,9 +2789,9 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info 
*fs_info, u64 csum_bytes)
return num_csums;
 }
 
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
-  struct btrfs_fs_info *fs_info)
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans)
 {
+   struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_rsv *global_rsv;
u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
@@ -2842,7 +2842,7 @@ int btrfs_should_throttle_delayed_refs(struct 
btrfs_trans_handle *trans,
if (val >= NSEC_PER_SEC / 2)
return 2;
 
-   return btrfs_check_space_for_delayed_refs(trans, fs_info);
+   return btrfs_check_space_for_delayed_refs(trans);
 }
 
 struct async_delayed_refs {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6c476dc81b8e..6a5557e8909d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5319,7 +5319,7 @@ static struct btrfs_trans_handle 
*evict_refill_and_join(struct btrfs_root *root,
 * Try to steal from the global reserve if there is space for
 * it.
 */
-   if (!btrfs_check_space_for_delayed_refs(trans, fs_info) &&
+   if (!btrfs_check_space_for_delayed_refs(trans) &&
!btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false))
return trans;
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e7f618b17b07..c5015458c5c8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -760,7 +760,7 @@ static int should_end_transaction(struct btrfs_trans_handle 
*trans)
 {
struct btrfs_fs_info *fs_info = trans->fs_info;
 
-   if (btrfs_check_space_for_delayed_refs(trans, fs_info))
+   if (btrfs_check_space_for_delayed_refs(trans))
return 1;
 
	return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
-- 
2.19.1





[PATCH 6/6] btrfs: switch return_bigger to bool in find_ref_head

2018-10-10 Thread Lu Fengqi
bool is more suitable than int here; also add a comment about
return_bigger.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/delayed-ref.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 2726d2fb4bbe..61a19376239e 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -168,11 +168,12 @@ static struct btrfs_delayed_ref_node* tree_insert(struct 
rb_root_cached *root,
  * find an head entry based on bytenr. This returns the delayed ref
  * head if it was able to find one, or NULL if nothing was in that spot.
  * If return_bigger is given, the next bigger entry is returned if no exact
- * match is found.
+ * match is found. But if no bigger one is found then the first node of the
+ * ref head tree will be returned.
  */
 static struct btrfs_delayed_ref_head* find_ref_head(
struct btrfs_delayed_ref_root *dr, u64 bytenr,
-   int return_bigger)
+   bool return_bigger)
 {
struct rb_root *root = >href_root.rb_root;
struct rb_node *n;
@@ -356,7 +357,8 @@ btrfs_select_ref_head(struct btrfs_delayed_ref_root 
*delayed_refs)
struct btrfs_delayed_ref_head *head;
 
 again:
-   head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, 1);
+   head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
+true);
if (!head) {
delayed_refs->run_delayed_start = 0;
return NULL;
@@ -894,7 +896,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info 
*fs_info,
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 
bytenr)
 {
-   return find_ref_head(delayed_refs, bytenr, 0);
+   return find_ref_head(delayed_refs, bytenr, false);
 }
 
 void __cold btrfs_delayed_ref_exit(void)
-- 
2.19.1





Re: [PATCH 0/3] Misc refactoring of check_file_extent

2018-09-13 Thread Lu Fengqi
On Thu, Sep 13, 2018 at 03:05:04PM +0300, Nikolay Borisov wrote:
>While looking at check_file_extent I thought that the code could be a bit
>cleaner than it actually is. The first patch factors out
>the code dealing with inline extents into a separate function aptly named 
>check_file_extent_inline. This allows to remove some inline-specific variable 
>from check_file_extent. Patch 2 just moves the final check in the new function 
>into the already existing branch handling the !compressed case. Finally 
>the check which detects unknown extent types is moved first in 
>check_file_extent, 
>followed by the code to handle inline extents and finally the existing code to 
>handle regular/prealloc extents is left intact. 
>
>This patchset brings no functional changes. 

For the series,

Reviewed-by: Lu Fengqi 

-- 
Thanks,
Lu

>
>Nikolay Borisov (3):
>  btrfs-progs: check: lowmem: Factor out inline extent checking code in
>its own function
>  btrfs-progs: check: lowmem: Refactor extent len test in
>check_file_extent_inline
>  btrfs-progs: check: lowmem: Refactor extent type checks in
>check_file_extent
>
> check/mode-lowmem.c | 151 ++--
> 1 file changed, 89 insertions(+), 62 deletions(-)
>
>-- 
>2.17.1
>
>
>




Re: [PATCH] btrfs-progs: lowmem: fix false alert about the existence of gaps in the check_file_extent

2018-09-13 Thread Lu Fengqi
On Thu, Sep 13, 2018 at 12:12:27PM +0300, Nikolay Borisov wrote:
>
>
>On 13.09.2018 11:20, Lu Fengqi wrote:
>> In the check_inode_item function, the extent_end variable is used to store
>> the end of the last file extent that has been checked. When it is passed to
>> check_file_extent, if the offset of the next file extent is not equal to
>> it, there is a gap between the two file extents.
>
>The 'end' parameter of check_file_extent tracks the ending offset of the
>last checked extent. This is used to detect gaps between adjacent extents.
>
>> 
>> When a gap exists, it is wrong to just add the extent_num_bytes of this
>> file extent to the stale extent_end variable as before. Due to the wrong
>> extent_end value, lowmem check will then falsely report gaps between the
>> subsequent file extents of this inode.
>
>Currently such gaps are wrongly detected since for regular extents only
>the size of the extent is added to the 'end' parameter. This results in
>wrongly considering all extents of a file as having gaps between them
>when only 2 of them really have a gap as seen in the example below.

Thank you for refining the commit message for me.

>
>> 
>> Solution:
>> The extent_end variable should be set to the sum of the offset and the
>> extent_num_bytes of the file extent.
>> 
>> Example:
>> Suppose that lowmem check the following file extent of inode 257.
>> 
>> item 6 key (257 EXTENT_DATA 0) itemoff 15813 itemsize 53
>> generation 6 type 1 (regular)
>> extent data disk byte 13631488 nr 4096
>> extent data offset 0 nr 4096 ram 4096
>> extent compression 0 (none)
>> item 7 key (257 EXTENT_DATA 8192) itemoff 15760 itemsize 53
>> generation 6 type 1 (regular)
>> extent data disk byte 13631488 nr 4096
>> extent data offset 0 nr 4096 ram 4096
>> extent compression 0 (none)
>> item 8 key (257 EXTENT_DATA 12288) itemoff 15707 itemsize 53
>> generation 6 type 1 (regular)
>> extent data disk byte 13631488 nr 4096
>> extent data offset 0 nr 4096 ram 4096
>> extent compression 0 (none)
>> 
>> For inode 257, check_inode_item set extent_end to 0, then call
>> check_file_extent to check item {6,7,8}.
>> item 6)
>>  offset(0) == extent_end(0)
>>  extent_end = extent_end(0) + extent_num_bytes(4096)
>> item 7)
>>  offset(8192) != extent_end(4096)
>>  extent_end = extent_end(4096) + extent_num_bytes(4096)
>>  ^^^
>>  The old extent_end should replace by offset(8192).
>> item 8)
>>  offset(12288) != extent_end(8192)
>>  ^^^
>>  But there is no gap between item {7,8}.
>
>The example makes sense. But can the same thing happen with the inline
>extents, ie should the same adjustments be made for the code in if
>(extent_type == BTRFS_FILE_EXTENT_INLINE) ?
>

IIRC, generally there is at most one inline extent per file. Even if there
are other regular extents, the inline extent must be the first one. So it
seems that there is no need to change the code in if (extent_type
== BTRFS_FILE_EXTENT_INLINE).

-- 
Thanks,
Lu

>> 
>> Fixes: d88da10ddd42 ("btrfs-progs: check: introduce function to check file 
>> extent")
>> Signed-off-by: Lu Fengqi 
>> ---
>>  check/mode-lowmem.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>> 
>> diff --git a/check/mode-lowmem.c b/check/mode-lowmem.c
>> index 1bce44f5658a..370318f0e631 100644
>> --- a/check/mode-lowmem.c
>> +++ b/check/mode-lowmem.c
>> @@ -1974,7 +1974,7 @@ static int check_file_extent(struct btrfs_root *root, 
>> struct btrfs_path *path,
>>  }
>>  }
>>  
>> -*end += extent_num_bytes;
>> +*end = fkey.offset + extent_num_bytes;
>>  if (!is_hole)
>>  *size += extent_num_bytes;
>>  
>> 
>
>




Re: [PATCH] btrfs: Handle error of get_old_root

2018-09-13 Thread Lu Fengqi
On Thu, Sep 13, 2018 at 11:35:10AM +0300, Nikolay Borisov wrote:
>In btrfs_search_old_slot get_old_root is always used with the
>assumption it cannot fail. However, this is not true: in rare
>circumstances it can fail and return NULL. This will lead to a NULL
>pointer dereference when the header is read. Fix this by checking the
>return value and properly handling NULL by setting ret to -EIO and
>returning gracefully.
>
>CID: 1087503
>Signed-off-by: Nikolay Borisov 

Reviewed-by: Lu Fengqi 

-- 
Thanks,
Lu

>---
> fs/btrfs/ctree.c | 4 
> 1 file changed, 4 insertions(+)
>
>diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
>index 1124d236291d..a5399fd49c17 100644
>--- a/fs/btrfs/ctree.c
>+++ b/fs/btrfs/ctree.c
>@@ -2961,6 +2961,10 @@ int btrfs_search_old_slot(struct btrfs_root *root, 
>const struct btrfs_key *key,
> 
> again:
>   b = get_old_root(root, time_seq);
>+  if (!b) {
>+  ret = -EIO;
>+  goto done;
>+  }
>   level = btrfs_header_level(b);
>   p->locks[level] = BTRFS_READ_LOCK;
> 
>-- 
>2.7.4
>
>
>




Re: [PATCH] btrfs: Remove logically dead code from btrfs_orphan_cleanup

2018-09-13 Thread Lu Fengqi
On Thu, Sep 13, 2018 at 11:35:00AM +0300, Nikolay Borisov wrote:
>In btrfs_orphan_cleanup the final 'if (ret) goto out' cannot ever be
>executed. This is due to the last assignment to 'ret' depending on
>the return value of btrfs_iget. If an error other than -ENOENT is
>returned then the loop is prematurely terminated by 'goto out'.
>On the other hand, if the error value is ENOENT then a subsequent
>if branch is executed that always re-assigns 'ret' and in case it's
>an error just terminates the loop. No functional changes.
>
>CID: 1437392
>Signed-off-by: Nikolay Borisov 

Reviewed-by: Lu Fengqi 

>---
> fs/btrfs/inode.c | 2 --
> 1 file changed, 2 deletions(-)
>
>diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>index 3f03fec06a3a..64df0378a22f 100644
>--- a/fs/btrfs/inode.c
>+++ b/fs/btrfs/inode.c
>@@ -3471,8 +3471,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
> 
>   /* this will do delete_inode and everything for us */
>   iput(inode);
>-  if (ret)
>-  goto out;
>   }
>   /* release the path since we're done with it */
>   btrfs_release_path(path);
>-- 
>2.7.4
>
>
>

-- 
Thanks,
Lu




Re: [PATCH] btrfs-progs: calibrate extent_end when found a gap

2018-09-13 Thread Lu Fengqi
On Thu, Sep 13, 2018 at 04:30:28PM +0800, Lu Fengqi wrote:
>On Tue, Sep 11, 2018 at 04:41:21PM +0200, David Sterba wrote:
>>On Tue, Sep 04, 2018 at 08:42:01PM +0800, Lu Fengqi wrote:
>>> The extent_end will be used to check whether there is gap between this
>>> extent and next extent. If it is not calibrated, check_file_extent will
>>
>>Do you mean 'synchronized' or 'matching'.
>
>I apologize for this incomprehensible commit message, and I have updated
>the commit message.
>
>[PATCH] btrfs-progs: lowmem: fix false alert about the existence of gaps in 
>the check_file_extent
>
>>
>>> mistake that there are gaps between the remaining extents.
>>
>>If this is a bugfix, do you have a testcase? Thanks.
>>
>
>The testcase requires some check repair fixes (both original and lowmem)
>that my colleagues are working on. After they land, I will send the
>testcase.
>
>Attached is the image which can trigger the false alert.

Sorry, I missed the attachment.

-- 
Thanks,
Lu

>
>Without the patch mentioned before, lowmem check will raise a false alert
>expecting the hole extent [257 EXTENT_DATA 8192].
>
>ERROR: root 5 EXTENT_DATA[257 12288] gap exists, expected: EXTENT_DATA[257 
>8192]
>
>-- 
>Thanks,
>Lu
>
>




file_extent_with_gap.img
Description: Binary data


Re: [PATCH] btrfs-progs: calibrate extent_end when found a gap

2018-09-13 Thread Lu Fengqi
On Tue, Sep 11, 2018 at 04:41:21PM +0200, David Sterba wrote:
>On Tue, Sep 04, 2018 at 08:42:01PM +0800, Lu Fengqi wrote:
>> The extent_end will be used to check whether there is gap between this
>> extent and next extent. If it is not calibrated, check_file_extent will
>
>Do you mean 'synchronized' or 'matching'.

I apologize for this incomprehensible commit message, and I have updated
the commit message.

[PATCH] btrfs-progs: lowmem: fix false alert about the existence of gaps in the 
check_file_extent

>
>> mistake that there are gaps between the remaining extents.
>
>If this is a bugfix, do you have a testcase? Thanks.
>

The testcase requires some check repair fixes (both original and lowmem)
that my colleagues are working on. After they land, I will send the
testcase.

Attached is the image which can trigger the false alert.

Without the patch mentioned before, lowmem check will raise a false alert
expecting the hole extent [257 EXTENT_DATA 8192].

ERROR: root 5 EXTENT_DATA[257 12288] gap exists, expected: EXTENT_DATA[257 8192]

-- 
Thanks,
Lu




[PATCH] btrfs-progs: lowmem: fix false alert about the existence of gaps in the check_file_extent

2018-09-13 Thread Lu Fengqi
In the check_inode_item function, the extent_end variable is used to store
the end of the last file extent that has been checked. When it is passed to
check_file_extent, if the offset of the next file extent is not equal to
it, there is a gap between the two file extents.

When a gap exists, it is wrong to just add the extent_num_bytes of this
file extent to the stale extent_end variable as before. Due to the wrong
extent_end value, lowmem check will then falsely report gaps between the
subsequent file extents of this inode.

Solution:
The extent_end variable should be set to the sum of the offset and the
extent_num_bytes of the file extent.

Example:
Suppose that lowmem check the following file extent of inode 257.

item 6 key (257 EXTENT_DATA 0) itemoff 15813 itemsize 53
generation 6 type 1 (regular)
extent data disk byte 13631488 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression 0 (none)
item 7 key (257 EXTENT_DATA 8192) itemoff 15760 itemsize 53
generation 6 type 1 (regular)
extent data disk byte 13631488 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression 0 (none)
item 8 key (257 EXTENT_DATA 12288) itemoff 15707 itemsize 53
generation 6 type 1 (regular)
extent data disk byte 13631488 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression 0 (none)

For inode 257, check_inode_item set extent_end to 0, then call
check_file_extent to check item {6,7,8}.
item 6)
offset(0) == extent_end(0)
extent_end = extent_end(0) + extent_num_bytes(4096)
item 7)
offset(8192) != extent_end(4096)
extent_end = extent_end(4096) + extent_num_bytes(4096)
^^^
The old extent_end should replace by offset(8192).
item 8)
offset(12288) != extent_end(8192)
^^^
But there is no gap between item {7,8}.

Fixes: d88da10ddd42 ("btrfs-progs: check: introduce function to check file 
extent")
Signed-off-by: Lu Fengqi 
---
 check/mode-lowmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/check/mode-lowmem.c b/check/mode-lowmem.c
index 1bce44f5658a..370318f0e631 100644
--- a/check/mode-lowmem.c
+++ b/check/mode-lowmem.c
@@ -1974,7 +1974,7 @@ static int check_file_extent(struct btrfs_root *root, 
struct btrfs_path *path,
}
}
 
-   *end += extent_num_bytes;
+   *end = fkey.offset + extent_num_bytes;
if (!is_hole)
*size += extent_num_bytes;
 
-- 
2.18.0





Re: [RFC PATCH v2 1/4] btrfs: factor out btrfs_link_subvol from create_subvol

2018-09-11 Thread Lu Fengqi
On Tue, Sep 11, 2018 at 07:57:03PM +0800, Qu Wenruo wrote:
>
>
On 2018/9/11 at 7:29 PM, Lu Fengqi wrote:
>> The function btrfs_link_subvol is responsible for linking the subvolume to
>> the specified directory, which is the opposite of what
>> btrfs_unlink_subvol does.
>> 
>> No functional change.
>> 
>> Signed-off-by: Lu Fengqi 
>
>The patch itself is OK.
>
>Just small nitpicks inlined below.
>
>> ---
>>  fs/btrfs/ioctl.c | 64 +++-
>>  1 file changed, 41 insertions(+), 23 deletions(-)
>> 
>> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
>> index 4905d13dee0a..1b03d07acde2 100644
>> --- a/fs/btrfs/ioctl.c
>> +++ b/fs/btrfs/ioctl.c
>> @@ -542,6 +542,45 @@ int btrfs_is_empty_uuid(u8 *uuid)
>>  return 1;
>>  }
>>  
>> +static int btrfs_link_subvol(struct btrfs_trans_handle *trans,
>> + struct inode *dir, u64 objectid, const char *name,
>> + int namelen)
>> +{
>> +struct btrfs_root *root = BTRFS_I(dir)->root;
>> +struct btrfs_key key;
>> +u64 index = 0;
>> +int ret;
>> +
>> +/*
>> + * insert the directory item
>> + */
>> +ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
>> +if (ret) {
>> +btrfs_abort_transaction(trans, ret);
>> +return ret;
>> +}
>> +
>> +key.objectid = objectid;
>> +key.type = BTRFS_ROOT_ITEM_KEY;
>> +key.offset = -1;
>> +ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
>> +BTRFS_FT_DIR, index);
>> +if (ret) {
>> +btrfs_abort_transaction(trans, ret);
>> +return ret;
>> +}
>> +
>> +btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
>> +ret = btrfs_update_inode(trans, root, dir);
>> +BUG_ON(ret);
>
>What about clean up this BUG_ON()?
>
>> +
>> +ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
>> + btrfs_ino(BTRFS_I(dir)), index, name, namelen);
>> +BUG_ON(ret);
>
>And this one?

Sorry for the confusion. This is exactly the cleanup done in patch 2,
because I wanted patch 1 to only move the code.

Thanks,
Lu

>
>Thanks,
>Qu
>
>> +
>> +return ret;
>> +}
>> +
>>  static noinline int create_subvol(struct inode *dir,
>>struct dentry *dentry,
>>const char *name, int namelen,
>> @@ -563,7 +602,6 @@ static noinline int create_subvol(struct inode *dir,
>>  int err;
>>  u64 objectid;
>>  u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
>> -u64 index = 0;
>>  uuid_le new_uuid;
>>  
>>  root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
>> @@ -677,29 +715,9 @@ static noinline int create_subvol(struct inode *dir,
>>  new_root->highest_objectid = new_dirid;
>>  mutex_unlock(_root->objectid_mutex);
>>  
>> -/*
>> - * insert the directory item
>> - */
>> -ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
>> -if (ret) {
>> -btrfs_abort_transaction(trans, ret);
>> -goto fail;
>> -}
>> -
>> -ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
>> -BTRFS_FT_DIR, index);
>> -if (ret) {
>> -btrfs_abort_transaction(trans, ret);
>> +ret = btrfs_link_subvol(trans, dir, objectid, name, namelen);
>> +if (ret)
>>  goto fail;
>> -}
>> -
>> -btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
>> -ret = btrfs_update_inode(trans, root, dir);
>> -BUG_ON(ret);
>> -
>> -ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
>> - btrfs_ino(BTRFS_I(dir)), index, name, namelen);
>> -BUG_ON(ret);
>>  
>>  ret = btrfs_uuid_tree_add(trans, root_item->uuid,
>>BTRFS_UUID_KEY_SUBVOL, objectid);
>> 
>




Re: [PATCH v3 00/10] undelete subvolume offline version

2018-09-11 Thread Lu Fengqi
On Mon, May 07, 2018 at 11:10:23AM +0800, Lu Fengqi wrote:
>This patchset will add undelete-subvol subcommand for btrfs rescue.
>

Hi David

Although there is some disagreement about the online implementation of
subvolume undeletion, the offline version is considered more acceptable.
Would you like to spend some time sharing your opinion?

-- 
Thanks,
Lu

>Patchset can be fetched from github:
>https://github.com/littleroad/btrfs-progs.git undelete
>
>v2->v3: fixed some issues pointed out by Qu.
>v1->v2: add -s option to allow user specify the subvolume which will be
>recovered.
>
>The first patch are not modified.
>For the rest, please see the changelog in the patches.
>
>Lu Fengqi (10):
>  btrfs-progs: copy btrfs_del_orphan_item from kernel
>  btrfs-progs: extract btrfs_link_subvol from btrfs_mksubvol
>  btrfs-progs: use btrfs_find_free_dir_index to find free inode index
>  btrfs-progs: undelete-subvol: introduce is_subvol_intact
>  btrfs-progs: undelete-subvol: introduce recover_dead_root
>  btrfs-progs: undelete-subvol: introduce link_subvol_to_lostfound
>  btrfs-progs: undelete-subvol: introduce btrfs_undelete_subvols
>  btrfs-progs: undelete-subvol: add undelete-subvol subcommand
>  btrfs-progs: tests: add testcase for undelete-subvol
>  btrfs-progs: undelete-subvol: update completion and documentation
>
> Documentation/btrfs-rescue.asciidoc   |  12 +
> Makefile  |   3 +-
> btrfs-completion  |   2 +-
> cmds-rescue.c |  69 ++
> convert/main.c|  59 -
> ctree.h   |   8 +-
> inode.c   | 119 +
> .../031-undelete-subvol/intact_subvolume.img  | Bin 0 -> 4096 bytes
> .../subvolume_in_drop_progress.raw.xz | Bin 0 -> 23452 bytes
> tests/misc-tests/031-undelete-subvol/test.sh  |  38 +++
> undelete-subvol.c | 227 ++
> undelete-subvol.h |  11 +
> 12 files changed, 501 insertions(+), 47 deletions(-)
> create mode 100644 tests/misc-tests/031-undelete-subvol/intact_subvolume.img
> create mode 100644 
> tests/misc-tests/031-undelete-subvol/subvolume_in_drop_progress.raw.xz
> create mode 100755 tests/misc-tests/031-undelete-subvol/test.sh
> create mode 100644 undelete-subvol.c
> create mode 100644 undelete-subvol.h
>
>-- 
>2.17.0
>
>
>
>--
>To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>the body of a message to majord...@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html




[RFC PATCH v2 2/2] btrfs-progs: subvolume: undelete: add btrfs subvolume undelete subcommand

2018-09-11 Thread Lu Fengqi
Add the undelete subcommand, which depends on the
BTRFS_IOC_SUBVOL_UNDELETE ioctl.
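
An example invocation (the subvolume id and mount point are made up for
illustration; the default name is "sub_<id>", as in the code below):

    # recover deleted subvolume 258 into /mnt under the default name
    btrfs subvolume undelete 258 /mnt
    # or give it an explicit name
    btrfs subvolume undelete -n recovered 258 /mnt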

Signed-off-by: Lu Fengqi 
---
 btrfs-completion |  2 +-
 cmds-subvolume.c | 70 
 2 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/btrfs-completion b/btrfs-completion
index ae683f4ecf61..2b43fbd63023 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -30,7 +30,7 @@ _btrfs()
local cmd=${words[1]}
 
commands='subvolume filesystem balance device scrub check rescue 
restore inspect-internal property send receive quota qgroup replace help 
version'
-   commands_subvolume='create delete list snapshot find-new get-default 
set-default show sync'
+   commands_subvolume='create delete list snapshot find-new get-default 
set-default show sync undelete'
commands_filesystem='defragment sync resize show df du label usage'
commands_balance='start pause cancel resume status'
commands_device='scan add delete remove ready stats usage'
diff --git a/cmds-subvolume.c b/cmds-subvolume.c
index e7a884af1f5d..228d0b9e9b34 100644
--- a/cmds-subvolume.c
+++ b/cmds-subvolume.c
@@ -1219,6 +1219,74 @@ out:
return !!ret;
 }
 
+static const char * const cmd_subvol_undelete_usage[] = {
+   "btrfs subvolume undelete [-n ]  ",
+   "Undelete the subvolume of the given  to .",
+   "",
+   "-n   recover the subvolume with .",
+   NULL
+};
+
+static int cmd_subvol_undelete(int argc, char **argv)
+{
+   struct btrfs_ioctl_subvol_undelete_args args;
+   bool need_assign_name = true;
+   DIR *dirstream = NULL;
+   char *dest;
+   int fd = -1;
+   int ret;
+
+   memset(&args, 0, sizeof(args));
+
+   while (1) {
+   int c = getopt(argc, argv, "n:");
+
+   if (c < 0)
+   break;
+
+   switch (c) {
+   case 'n':
+   strncpy_null(args.name, optarg);
+   need_assign_name = false;
+   break;
+   default:
+   usage(cmd_subvol_undelete_usage);
+   }
+   }
+   if (!need_assign_name) {
+   if (!test_issubvolname(args.name)) {
+   error("invalid subvolume name: %s", args.name);
+   return -EINVAL;
+   } else if (strlen(args.name) > BTRFS_VOL_NAME_MAX) {
+   error("subvolume name too long: %s", args.name);
+   return -EINVAL;
+   }
+   }
+
+   if (check_argc_exact(argc - optind, 2))
+   usage(cmd_subvol_undelete_usage);
+
+   args.subvol_id = arg_strtou64(argv[optind]);
+   if (need_assign_name)
+   snprintf(args.name, BTRFS_VOL_NAME_MAX, "sub_%llu",
+   args.subvol_id);
+
+   dest = argv[optind + 1];
+   fd = btrfs_open_dir(dest, &dirstream, 1);
+   if (fd < 0) {
+   error("can't access '%s'", dest);
+   return -1;
+   }
+
+   ret = ioctl(fd, BTRFS_IOC_SUBVOL_UNDELETE, &args);
+   if (ret)
+   perror("BTRFS_IOC_SUBVOL_UNDELETE");
+
+   close_file_or_dir(fd, dirstream);
+
+   return ret;
+}
+
 static const char subvolume_cmd_group_info[] =
 "manage subvolumes: create, delete, list, etc";
 
@@ -1237,6 +1305,8 @@ const struct cmd_group subvolume_cmd_group = {
NULL, 0 },
{ "show", cmd_subvol_show, cmd_subvol_show_usage, NULL, 0 },
{ "sync", cmd_subvol_sync, cmd_subvol_sync_usage, NULL, 0 },
+   { "undelete", cmd_subvol_undelete, cmd_subvol_undelete_usage,
+   NULL, 0 },
NULL_CMD_STRUCT
}
 };
-- 
2.18.0





[RFC PATCH v2 1/2] btrfs-progs: ioctl: add BTRFS_IOC_SUBVOL_UNDELETE to ioctl.h

2018-09-11 Thread Lu Fengqi
Copied from uapi/linux/btrfs.h.

Signed-off-by: Lu Fengqi 
---
 ioctl.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/ioctl.h b/ioctl.h
index 709e996f401c..75978a4e8265 100644
--- a/ioctl.h
+++ b/ioctl.h
@@ -670,6 +670,11 @@ struct btrfs_ioctl_send_args_64 {
 } __attribute__((packed));
 BUILD_ASSERT(sizeof(struct btrfs_ioctl_send_args_64) == 72);
 
+struct btrfs_ioctl_subvol_undelete_args {
+   __u64 subvol_id;
+   char name[BTRFS_PATH_NAME_MAX + 1];
+};
+
 #define BTRFS_IOC_SEND_64_COMPAT_DEFINED 1
 
 /* Error codes as returned by the kernel */
@@ -828,6 +833,8 @@ static inline char *btrfs_err_str(enum btrfs_err_code 
err_code)
   struct btrfs_ioctl_feature_flags[3])
#define BTRFS_IOC_RM_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 58, \
   struct btrfs_ioctl_vol_args_v2)
+#define BTRFS_IOC_SUBVOL_UNDELETE _IOWR(BTRFS_IOCTL_MAGIC, 63, \
+   struct btrfs_ioctl_subvol_undelete_args)
 #ifdef __cplusplus
 }
 #endif
-- 
2.18.0





[RFC PATCH v2 4/4] btrfs: undelete: Add BTRFS_IOCTL_SUBVOL_UNDELETE ioctl

2018-09-11 Thread Lu Fengqi
This ioctl provides the user with the ability to recover the subvolume
with the given id into the given directory.

Note: it takes fs_info->cleaner_mutex to keep the cleaner kthread from
deleting the subvolume that we want to recover.
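
A minimal userspace sketch of driving this ioctl (the id and path are
made up, and error handling is omitted; the args struct is the one added
by this patch):

    #include <fcntl.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>

    int main(void)
    {
            struct btrfs_ioctl_subvol_undelete_args args = { .subvol_id = 258 };
            int fd;

            strcpy(args.name, "recovered");
            /* fd must refer to a directory on the target btrfs */
            fd = open("/mnt", O_RDONLY | O_DIRECTORY);
            return ioctl(fd, BTRFS_IOC_SUBVOL_UNDELETE, &args);
    }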

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ioctl.c   | 64 ++
 include/uapi/linux/btrfs.h |  7 +
 2 files changed, 71 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f088dea53c16..3ddf6e1c117b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1993,6 +1993,68 @@ static int btrfs_undelete_subvolume(struct btrfs_root 
*root,
return ret;
 }
 
+static int btrfs_ioctl_undelete(struct file *file, void __user *argp)
+{
+   struct btrfs_ioctl_subvol_undelete_args *args;
+   struct inode *inode = file_inode(file);
+   struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+   struct btrfs_root *root;
+   int ret = 0;
+
+   if (!S_ISDIR(inode->i_mode))
+   return -ENOTDIR;
+
+   args = memdup_user(argp, sizeof(*args));
+   if (IS_ERR(args))
+   return PTR_ERR(args);
+
+   args->name[BTRFS_PATH_NAME_MAX] = '\0';
+
+   if (!capable(CAP_SYS_ADMIN)) {
+   ret = -EPERM;
+   goto free;
+   }
+
+   ret = mnt_want_write_file(file);
+   if (ret)
+   goto free;
+
+   ret = -ENOENT;
+   spin_lock(&fs_info->trans_lock);
+   list_for_each_entry(root, &fs_info->dead_roots, root_list) {
+   if (root->root_key.objectid == args->subvol_id) {
+   list_del_init(&root->root_list);
+   ret = 0;
+   break;
+   }
+   }
+   spin_unlock(&fs_info->trans_lock);
+   if (ret)
+   goto drop_write;
+
+   /*
+* Lock cleaner_mutex to prevent the cleaner kthread from deleting
+* the subvolume we want to recover, so the rescue below cannot race
+* with subvolume cleanup.
+*/
+   mutex_lock(&fs_info->cleaner_mutex);
+
+   ret = btrfs_undelete_subvolume(root, file->f_path.dentry, args->name,
+  strlen(args->name));
+   if (ret) {
+   btrfs_add_dead_root(root);
+   goto unlock;
+   }
+
+unlock:
+   mutex_unlock(&fs_info->cleaner_mutex);
+drop_write:
+   mnt_drop_write_file(file);
+free:
+   kfree(args);
+   return ret;
+}
+
 static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
void __user *arg)
 {
@@ -6118,6 +6180,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_subvol_rootref(file, argp);
case BTRFS_IOC_INO_LOOKUP_USER:
return btrfs_ioctl_ino_lookup_user(file, argp);
+   case BTRFS_IOC_SUBVOL_UNDELETE:
+   return btrfs_ioctl_undelete(file, argp);
}
 
return -ENOTTY;
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 5ca1d21fc4a7..e6d3c8e24bb8 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -816,6 +816,11 @@ struct btrfs_ioctl_get_subvol_rootref_args {
__u8 align[7];
 };
 
+struct btrfs_ioctl_subvol_undelete_args {
+   __u64 subvol_id;
+   char name[BTRFS_PATH_NAME_MAX + 1];
+};
+
 /* Error codes as returned by the kernel */
 enum btrfs_err_code {
BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1,
@@ -940,5 +945,7 @@ enum btrfs_err_code {
struct btrfs_ioctl_get_subvol_rootref_args)
 #define BTRFS_IOC_INO_LOOKUP_USER _IOWR(BTRFS_IOCTL_MAGIC, 62, \
struct btrfs_ioctl_ino_lookup_user_args)
+#define BTRFS_IOC_SUBVOL_UNDELETE _IOWR(BTRFS_IOCTL_MAGIC, 63, \
+   struct btrfs_ioctl_subvol_undelete_args)
 
 #endif /* _UAPI_LINUX_BTRFS_H */
-- 
2.18.0





[RFC PATCH v2 3/4] btrfs: undelete: introduce btrfs_undelete_subvolume

2018-09-11 Thread Lu Fengqi
The function will do the following things, which are almost the opposite
of what btrfs_delete_subvolume() does:

1. link the subvolume to the parent specified;
2. clear root flag and set root_refs to 1;
3. add the subvol to the uuid_tree;
4. delete the orphan_item.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ioctl.c | 113 +++
 1 file changed, 113 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f6173d4e7ced..f088dea53c16 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1880,6 +1880,119 @@ static noinline int btrfs_ioctl_snap_create_v2(struct 
file *file,
return ret;
 }
 
+static int btrfs_undelete_subvolume(struct btrfs_root *root,
+   struct dentry *parent, const char *name,
+   int namelen)
+{
+   struct inode *dir = d_inode(parent);
+   struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+   struct btrfs_root_item *root_item = &root->root_item;
+   struct btrfs_trans_handle *trans;
+   struct btrfs_block_rsv block_rsv;
+   struct dentry *dentry;
+   struct inode *inode;
+   u64 root_flags;
+   int ret;
+
+   btrfs_debug(fs_info, "Undelete subvolume %llu",
+   root->root_key.objectid);
+
+   /* only care about the intact subvolume */
+   if (btrfs_disk_key_objectid(&root_item->drop_progress) != 0)
+   return 0;
+
+   ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
+   if (ret == -EINTR)
+   return ret;
+
+   dentry = lookup_one_len(name, parent, namelen);
+   if (IS_ERR(dentry)) {
+   ret = PTR_ERR(dentry);
+   goto out_unlock;
+   }
+
+   down_write(&fs_info->subvol_sem);
+
+   ret = btrfs_may_create(dir, dentry);
+   if (ret)
+   goto out_up_write;
+
+   ret = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, dir->i_ino,
+name, namelen);
+   if (ret)
+   goto out_up_write;
+
+   btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+   /*
+* 1 - parent dir inode
+* 2 - dir entries
+* 2 - root ref/backref
+* 1 - UUID item
+*/
+   ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 6, false);
+   if (ret)
+   goto out_up_write;
+
+   trans = btrfs_start_transaction(BTRFS_I(dir)->root, 0);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+   goto out_up_write;
+   }
+
+   trans->block_rsv = &block_rsv;
+   trans->bytes_reserved = block_rsv.size;
+
+   ret = btrfs_link_subvol(trans, dir, root->root_key.objectid, name,
+   namelen);
+   if (ret)
+   goto fail;
+
+   /* clear BTRFS_ROOT_SUBVOL_DEAD root flag and set root_refs to 1 */
+   root_flags = btrfs_root_flags(root_item);
+   btrfs_set_root_flags(root_item,
+root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
+   btrfs_set_root_refs(root_item, 1);
+   ret = btrfs_update_root(trans, fs_info->tree_root,
+   &root->root_key, &root->root_item);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   goto fail;
+   }
+
+   ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
+ root->root_key.objectid);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   goto fail;
+   }
+
+   ret = btrfs_del_orphan_item(trans, fs_info->tree_root,
+   root->root_key.objectid);
+   if (ret && ret != -ENOENT) {
+   btrfs_abort_transaction(trans, ret);
+   goto fail;
+   }
+fail:
+   trans->block_rsv = NULL;
+   trans->bytes_reserved = 0;
+   btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+   ret = btrfs_commit_transaction(trans);
+   if (!ret) {
+   inode = btrfs_lookup_dentry(dir, dentry);
+   if (IS_ERR(inode))
+   return PTR_ERR(inode);
+   d_instantiate(dentry, inode);
+   fsnotify_mkdir(dir, dentry);
+   }
+out_up_write:
+   up_write(&fs_info->subvol_sem);
+   dput(dentry);
+out_unlock:
+   inode_unlock(dir);
+   return ret;
+}
+
 static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
void __user *arg)
 {
-- 
2.18.0





[RFC PATCH v2 0/4] undelete subvolume online version

2018-09-11 Thread Lu Fengqi
This patchset will add the BTRFS_IOC_SUBVOL_UNDELETE ioctl for online
btrfs subvolume undelete.

Using the online_undelete version of btrfs-progs, the user can recover
the subvolume given by <subvol_id> to the directory given by <path>. The
optional parameter [-n <name>] can be used to set the name of the
recovered subvolume.

# btrfs subvolume undelete [-n <name>] <subvol_id> <path>

btrfs online undelete version:
https://github.com/littleroad/linux.git undelete

btrfs-progs online undelete version:
https://github.com/littleroad/btrfs-progs.git online_undelete

Issue: #82

Lu Fengqi (4):
  btrfs: factor out btrfs_link_subvol from create_subvol
  btrfs: don't BUG_ON() in btrfs_link_subvol()
  btrfs: undelete: introduce btrfs_undelete_subvolume
  btrfs: undelete: Add BTRFS_IOCTL_SUBVOL_UNDELETE ioctl

 fs/btrfs/ioctl.c   | 247 +
 include/uapi/linux/btrfs.h |   7 ++
 2 files changed, 231 insertions(+), 23 deletions(-)

-- 
2.18.0





[RFC PATCH v2 1/4] btrfs: factor out btrfs_link_subvol from create_subvol

2018-09-11 Thread Lu Fengqi
The function btrfs_link_subvol is responsible for linking the subvolume
to the specified directory; it is the opposite of what
btrfs_unlink_subvol does.

No functional change.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ioctl.c | 64 +++-
 1 file changed, 41 insertions(+), 23 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4905d13dee0a..1b03d07acde2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -542,6 +542,45 @@ int btrfs_is_empty_uuid(u8 *uuid)
return 1;
 }
 
+static int btrfs_link_subvol(struct btrfs_trans_handle *trans,
+struct inode *dir, u64 objectid, const char *name,
+int namelen)
+{
+   struct btrfs_root *root = BTRFS_I(dir)->root;
+   struct btrfs_key key;
+   u64 index = 0;
+   int ret;
+
+   /*
+* insert the directory item
+*/
+   ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   return ret;
+   }
+
+   key.objectid = objectid;
+   key.type = BTRFS_ROOT_ITEM_KEY;
+   key.offset = -1;
+   ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
+   BTRFS_FT_DIR, index);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   return ret;
+   }
+
+   btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
+   ret = btrfs_update_inode(trans, root, dir);
+   BUG_ON(ret);
+
+   ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
+btrfs_ino(BTRFS_I(dir)), index, name, namelen);
+   BUG_ON(ret);
+
+   return ret;
+}
+
 static noinline int create_subvol(struct inode *dir,
  struct dentry *dentry,
  const char *name, int namelen,
@@ -563,7 +602,6 @@ static noinline int create_subvol(struct inode *dir,
int err;
u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
-   u64 index = 0;
uuid_le new_uuid;
 
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
@@ -677,29 +715,9 @@ static noinline int create_subvol(struct inode *dir,
new_root->highest_objectid = new_dirid;
mutex_unlock(_root->objectid_mutex);
 
-   /*
-* insert the directory item
-*/
-   ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
-   if (ret) {
-   btrfs_abort_transaction(trans, ret);
-   goto fail;
-   }
-
-   ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
-   BTRFS_FT_DIR, index);
-   if (ret) {
-   btrfs_abort_transaction(trans, ret);
+   ret = btrfs_link_subvol(trans, dir, objectid, name, namelen);
+   if (ret)
goto fail;
-   }
-
-   btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
-   ret = btrfs_update_inode(trans, root, dir);
-   BUG_ON(ret);
-
-   ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
-btrfs_ino(BTRFS_I(dir)), index, name, namelen);
-   BUG_ON(ret);
 
ret = btrfs_uuid_tree_add(trans, root_item->uuid,
  BTRFS_UUID_KEY_SUBVOL, objectid);
-- 
2.18.0





[RFC PATCH v2 2/4] btrfs: don't BUG_ON() in btrfs_link_subvol()

2018-09-11 Thread Lu Fengqi
Both btrfs_update_inode() and btrfs_add_root_ref() may fail because of
ENOMEM, which is no reason to panic. Replace the BUG_ON()s with
btrfs_abort_transaction().

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ioctl.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1b03d07acde2..f6173d4e7ced 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -572,11 +572,17 @@ static int btrfs_link_subvol(struct btrfs_trans_handle 
*trans,
 
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
ret = btrfs_update_inode(trans, root, dir);
-   BUG_ON(ret);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   return ret;
+   }
 
ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
 btrfs_ino(BTRFS_I(dir)), index, name, namelen);
-   BUG_ON(ret);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   return ret;
+   }
 
return ret;
 }
-- 
2.18.0





[PATCH] btrfs-progs: calibrate extent_end when found a gap

2018-09-04 Thread Lu Fengqi
The extent_end is used to check whether there is a gap between this
extent and the next one. If it is not recalibrated after a gap is found,
check_file_extent will mistakenly report gaps between all of the
remaining extents.
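
To make the failure mode concrete, here is a small self-contained sketch
of the gap check; the extent layout and the names are illustrative, not
taken from check/mode-lowmem.c.

#include <stdio.h>

int main(void)
{
        /* three 4K file extents with a hole between 4K and 16K */
        unsigned long long offset[] = { 0, 16384, 20480 };
        unsigned long long len[]    = { 4096, 4096, 4096 };
        unsigned long long end = 0;
        int i;

        for (i = 0; i < 3; i++) {
                if (offset[i] != end) {
                        printf("gap before extent at %llu\n", offset[i]);
                        end = offset[i];  /* the calibration this patch adds */
                }
                end += len[i];
        }
        /*
         * Without the "end = offset[i]" reset, end would be 8192 when the
         * third extent (at 20480) is checked, so a second, bogus gap would
         * be reported even though the extents are contiguous from 16384 on.
         */
        return 0;
}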

Signed-off-by: Lu Fengqi 
---
 check/mode-lowmem.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/check/mode-lowmem.c b/check/mode-lowmem.c
index 1bce44f5658a..0f14a4968e84 100644
--- a/check/mode-lowmem.c
+++ b/check/mode-lowmem.c
@@ -1972,6 +1972,7 @@ static int check_file_extent(struct btrfs_root *root, 
struct btrfs_path *path,
root->objectid, fkey.objectid, fkey.offset,
fkey.objectid, *end);
}
+   *end = fkey.offset;
}
 
*end += extent_num_bytes;
-- 
2.18.0





[PATCH v10.5 2/5] btrfs-progs: dedupe: Add enable command for dedupe command group

2018-09-04 Thread Lu Fengqi
From: Qu Wenruo 

Add enable subcommand for dedupe command group.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/btrfs-dedupe-inband.asciidoc | 114 +-
 btrfs-completion   |   6 +-
 cmds-dedupe-ib.c   | 238 +
 ioctl.h|   2 +
 4 files changed, 358 insertions(+), 2 deletions(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc 
b/Documentation/btrfs-dedupe-inband.asciidoc
index 83113f5487e2..d895aafbcf45 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -22,7 +22,119 @@ use with caution.
 
 SUBCOMMAND
 ----------
-Nothing yet
+*enable* [options] <path>::
+Enable in-band de-duplication for a filesystem.
++
+`Options`
++
+-f|--force
+Force the 'enable' command to be executed.
+Will skip the memory limit check and allow 'enable' to be executed even if
+in-band de-duplication is already enabled.
++
+NOTE: If dedupe is re-enabled with the '-f' option, any unspecified parameter
+will be reset to its default value.
+
+-s|--storage-backend <backend>
+Specify the de-duplication hash storage backend.
+Only the 'inmemory' backend is supported so far.
+If not specified, the default value is 'inmemory'.
++
+Refer to the *BACKENDS* section for more information.
+
+-b|--blocksize <size>
+Specify the dedupe block size.
+Supported values are powers of 2 from '16K' to '8M'.
+The default value is '128K'.
++
+Refer to the *DEDUPE BLOCK SIZE* section for more information.
+
+-a|--hash-algorithm <hash>
+Specify the hash algorithm.
+Only 'sha256' is supported so far.
+
+-l|--limit-hash <limit>
+Specify the maximum number of hashes stored in memory.
+Only works for the 'inmemory' backend.
+Conflicts with the '-m' option.
++
+Only positive values are valid.
+The default value is '32K'.
+
+-m|--limit-memory <limit>
+Specify the maximum memory used for hashes.
+Only works for the 'inmemory' backend.
+Conflicts with the '-l' option.
++
+Only values larger than or equal to '1024' are valid.
+There is no default value.
++
+NOTE: The memory limit will be rounded down to the kernel internal hash size,
+so the memory limit shown in 'btrfs dedupe-inband status' may be different
+from the given <limit>.
++
+WARNING: Too large a value for '-l' or '-m' can easily trigger OOM.
+Please use with caution, taking system memory into account.
+
+NOTE: In-band de-duplication is not compatible with compression yet.
+And compression has higher priority than in-band de-duplication, meaning if
+compression and de-duplication are enabled at the same time, only compression
+will work.
+
+BACKENDS
+
+Btrfs in-band de-duplication will support different storage backends, with
+different use cases and features.
+
+In-memory backend::
+This backend provides backward compatibility and more fine-tuning options.
+But the hash pool is non-persistent and may exhaust kernel memory if not set
+up properly.
++
+This backend can be used on old btrfs (without the '-O dedupe' mkfs option).
+When used on old btrfs, this backend needs to be enabled manually after mount.
++
+Designed for fast hash search speed, the in-memory backend keeps all dedupe
+hashes in memory. (Overall performance is still much the same as the 'ondisk'
+backend if all 'ondisk' hashes can be cached in memory.)
++
+Only a limited number of hashes is kept in memory to avoid exhausting memory.
+Hashes over the limit will be dropped in least-recently-used order.
+So this backend has a consistent overhead for a given limit but can\'t ensure
+all duplicated blocks will be de-duplicated.
++
+After umount and mount, the in-memory backend needs to refill its hash pool.
+
+On-disk backend::
+This backend provides a persistent hash pool, with smarter memory management
+for the hash pool.
+But it\'s not backward-compatible, meaning it must be used with the
+'-O dedupe' mkfs option and older kernels can\'t mount it read-write.
++
+Designed for de-duplication rate, the hash pool is stored as a btrfs B+ tree
+on disk.
+This may cause extra disk IO for hash searches under high memory
+pressure.
++
+After umount and mount, the on-disk backend still has its hashes on disk, so
+there is no need to refill its dedupe hash pool.
+
+Currently, only the 'inmemory' backend is supported in btrfs-progs.
+
+DEDUPE BLOCK SIZE
+
+In-band de-duplication is done at dedupe block size granularity.
+Any data smaller than the dedupe block size won\'t go through in-band
+de-duplication.
+
+The dedupe block size affects the dedupe rate and fragmentation heavily.
+
+A smaller block size will cause more fragments, but a higher dedupe rate.
+
+A larger block size will cause fewer fragments, but a lower dedupe rate.
+
+The in-band de-duplication rate is highly related to the workload pattern.
+So it\'s highly recommended to align the dedupe block size to the workload
+block size to make full use of de-duplication.
 
 EXIT STATUS
 -----------
diff --git a/btrfs-completion b/btrfs-completion
index ae683f4ecf61..cfdf70966e47 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -29,7 +29,7 @@ _btrfs()
 
local cmd=${words[1]}
 
-   commands

[PATCH v10.5 5/5] btrfs-progs: dedupe: introduce reconfigure subcommand

2018-09-04 Thread Lu Fengqi
From: Qu Wenruo 

Introduce the reconfigure subcommand to cooperate with the new kernel
ioctl changes.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/btrfs-dedupe-inband.asciidoc |  7 +++
 btrfs-completion   |  2 +-
 cmds-dedupe-ib.c   | 73 +-
 3 files changed, 66 insertions(+), 16 deletions(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc 
b/Documentation/btrfs-dedupe-inband.asciidoc
index 6096389cb0b4..78c806f772d6 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -86,6 +86,13 @@ And compression has higher priority than in-band 
de-duplication, meaning if
 compression and de-duplication are enabled at the same time, only compression
 will work.
 
+*reconfigure* [options] <path>::
+Re-configure in-band de-duplication parameters of a filesystem.
++
+In-band de-duplication must be enabled before re-configuration.
++
+[Options] are the same as for 'btrfs dedupe-inband enable'.
+
 *status* <path>::
 Show current in-band de-duplication status of a filesystem.
 
diff --git a/btrfs-completion b/btrfs-completion
index 62a7bdd4d0d5..6ff48e4c2f6a 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -41,7 +41,7 @@ _btrfs()
commands_quota='enable disable rescan'
commands_qgroup='assign remove create destroy show limit'
commands_replace='start status cancel'
-   commands_dedupe_inband='enable disable status'
+   commands_dedupe_inband='enable disable status reconfigure'
 
if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then
COMPREPLY=( $( compgen -W '--help' -- "$cur" ) )
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
index e778457e25a8..e52f939c9ced 100644
--- a/cmds-dedupe-ib.c
+++ b/cmds-dedupe-ib.c
@@ -56,7 +56,6 @@ static const char * const cmd_dedupe_ib_enable_usage[] = {
NULL
 };
 
-
 #define report_fatal_parameter(dargs, old, member, type, err_val, fmt) \
 ({ \
if (dargs->member != old->member && \
@@ -88,6 +87,12 @@ static void report_parameter_error(struct 
btrfs_ioctl_dedupe_args *dargs,
}
report_option_parameter(dargs, old, flags, u8, -1, x);
}
+
+   if (dargs->status == 0 && old->cmd == BTRFS_DEDUPE_CTL_RECONF) {
+   error("must enable dedupe before reconfiguration");
+   return;
+   }
+
if (report_fatal_parameter(dargs, old, cmd, u16, -1, u) ||
report_fatal_parameter(dargs, old, blocksize, u64, -1, llu) ||
report_fatal_parameter(dargs, old, backend, u16, -1, u) ||
@@ -100,14 +105,17 @@ static void report_parameter_error(struct 
btrfs_ioctl_dedupe_args *dargs,
old->limit_nr, old->limit_mem);
 }
 
-static int cmd_dedupe_ib_enable(int argc, char **argv)
+static int enable_reconfig_dedupe(int argc, char **argv, int reconf)
 {
int ret;
int fd = -1;
char *path;
u64 blocksize = BTRFS_DEDUPE_BLOCKSIZE_DEFAULT;
+   int blocksize_set = 0;
u16 hash_algo = BTRFS_DEDUPE_HASH_SHA256;
+   int hash_algo_set = 0;
u16 backend = BTRFS_DEDUPE_BACKEND_INMEMORY;
+   int backend_set = 0;
u64 limit_nr = 0;
u64 limit_mem = 0;
u64 sys_mem = 0;
@@ -134,15 +142,17 @@ static int cmd_dedupe_ib_enable(int argc, char **argv)
break;
switch (c) {
case 's':
-   if (!strcasecmp("inmemory", optarg))
+   if (!strcasecmp("inmemory", optarg)) {
backend = BTRFS_DEDUPE_BACKEND_INMEMORY;
-   else {
+   backend_set = 1;
+   } else {
error("unsupported dedupe backend: %s", optarg);
exit(1);
}
break;
case 'b':
blocksize = parse_size(optarg);
+   blocksize_set = 1;
break;
case 'a':
if (strcmp("sha256", optarg)) {
@@ -224,26 +234,40 @@ static int cmd_dedupe_ib_enable(int argc, char **argv)
return 1;
}
+   memset(&dargs, -1, sizeof(dargs));
-   dargs.cmd = BTRFS_DEDUPE_CTL_ENABLE;
-   dargs.blocksize = blocksize;
-   dargs.hash_algo = hash_algo;
-   dargs.limit_nr = limit_nr;
-   dargs.limit_mem = limit_mem;
-   dargs.backend = backend;
-   if (force)
-   dargs.flags |= BTRFS_DEDUPE_FLAG_FORCE;
-   else
-   dargs.flags = 0;
+   if (reconf) {
+   dargs.cmd = BTRFS_

[PATCH v10.5 3/5] btrfs-progs: dedupe: Add disable support for inband dedupelication

2018-09-04 Thread Lu Fengqi
From: Qu Wenruo 

Add disable subcommand for dedupe command group.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/btrfs-dedupe-inband.asciidoc |  5 +++
 btrfs-completion   |  2 +-
 cmds-dedupe-ib.c   | 41 ++
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc 
b/Documentation/btrfs-dedupe-inband.asciidoc
index d895aafbcf45..3452f690e3e5 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -22,6 +22,11 @@ use with caution.
 
 SUBCOMMAND
 ----------
+*disable* <path>::
+Disable in-band de-duplication for a filesystem.
++
+This will trash all stored dedupe hashes.
++
 *enable* [options] ::
 Enable in-band de-duplication for a filesystem.
 +
diff --git a/btrfs-completion b/btrfs-completion
index cfdf70966e47..a74a23f42022 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -41,7 +41,7 @@ _btrfs()
commands_quota='enable disable rescan'
commands_qgroup='assign remove create destroy show limit'
commands_replace='start status cancel'
-   commands_dedupe_inband='enable'
+   commands_dedupe_inband='enable disable'
 
if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then
COMPREPLY=( $( compgen -W '--help' -- "$cur" ) )
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
index 4d499677d9ae..91b6fe234043 100644
--- a/cmds-dedupe-ib.c
+++ b/cmds-dedupe-ib.c
@@ -259,10 +259,51 @@ out:
return ret;
 }
 
+static const char * const cmd_dedupe_ib_disable_usage[] = {
+   "btrfs dedupe-inband disable ",
+   "Disable in-band(write time) de-duplication of a btrfs.",
+   NULL
+};
+
+static int cmd_dedupe_ib_disable(int argc, char **argv)
+{
+   struct btrfs_ioctl_dedupe_args dargs;
+   DIR *dirstream;
+   char *path;
+   int fd;
+   int ret;
+
+   if (check_argc_exact(argc, 2))
+   usage(cmd_dedupe_ib_disable_usage);
+
+   path = argv[1];
+   fd = open_file_or_dir(path, &dirstream);
+   if (fd < 0) {
+   error("failed to open file or directory: %s", path);
+   return 1;
+   }
+   memset(&dargs, 0, sizeof(dargs));
+   dargs.cmd = BTRFS_DEDUPE_CTL_DISABLE;
+
+   ret = ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs);
+   if (ret < 0) {
+   error("failed to disable inband deduplication: %m");
+   ret = 1;
+   goto out;
+   }
+   ret = 0;
+
+out:
+   close_file_or_dir(fd, dirstream);
+   return ret;
+}
+
 const struct cmd_group dedupe_ib_cmd_group = {
dedupe_ib_cmd_group_usage, dedupe_ib_cmd_group_info, {
{ "enable", cmd_dedupe_ib_enable, cmd_dedupe_ib_enable_usage,
  NULL, 0},
+   { "disable", cmd_dedupe_ib_disable, cmd_dedupe_ib_disable_usage,
+ NULL, 0},
NULL_CMD_STRUCT
}
 };
-- 
2.18.0





[PATCH v10.5 4/5] btrfs-progs: dedupe: Add status subcommand

2018-09-04 Thread Lu Fengqi
From: Qu Wenruo 

Add status subcommand for dedupe command group.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/btrfs-dedupe-inband.asciidoc |  3 +
 btrfs-completion   |  2 +-
 cmds-dedupe-ib.c   | 80 ++
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc 
b/Documentation/btrfs-dedupe-inband.asciidoc
index 3452f690e3e5..6096389cb0b4 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -86,6 +86,9 @@ And compression has higher priority than in-band 
de-duplication, meaning if
 compression and de-duplication are enabled at the same time, only compression
 will work.
 
+*status* <path>::
+Show current in-band de-duplication status of a filesystem.
+
 BACKENDS
 
 Btrfs in-band de-duplication will support different storage backends, with
diff --git a/btrfs-completion b/btrfs-completion
index a74a23f42022..62a7bdd4d0d5 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -41,7 +41,7 @@ _btrfs()
commands_quota='enable disable rescan'
commands_qgroup='assign remove create destroy show limit'
commands_replace='start status cancel'
-   commands_dedupe_inband='enable disable'
+   commands_dedupe_inband='enable disable status'
 
if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then
COMPREPLY=( $( compgen -W '--help' -- "$cur" ) )
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
index 91b6fe234043..e778457e25a8 100644
--- a/cmds-dedupe-ib.c
+++ b/cmds-dedupe-ib.c
@@ -298,12 +298,92 @@ out:
return 0;
 }
 
+static const char * const cmd_dedupe_ib_status_usage[] = {
+   "btrfs dedupe-inband status ",
+   "Show current in-band(write time) de-duplication status of a btrfs.",
+   NULL
+};
+
+static int cmd_dedupe_ib_status(int argc, char **argv)
+{
+   struct btrfs_ioctl_dedupe_args dargs;
+   DIR *dirstream;
+   char *path;
+   int fd;
+   int ret;
+   int print_limit = 1;
+
+   if (check_argc_exact(argc, 2))
+   usage(cmd_dedupe_ib_status_usage);
+
+   path = argv[1];
+   fd = open_file_or_dir(path, &dirstream);
+   if (fd < 0) {
+   error("failed to open file or directory: %s", path);
+   ret = 1;
+   goto out;
+   }
+   memset(&dargs, 0, sizeof(dargs));
+   dargs.cmd = BTRFS_DEDUPE_CTL_STATUS;
+
+   ret = ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs);
+   if (ret < 0) {
+   error("failed to get inband deduplication status: %m");
+   ret = 1;
+   goto out;
+   }
+   ret = 0;
+   if (dargs.status == 0) {
+   printf("Status: \t\t\tDisabled\n");
+   goto out;
+   }
+   printf("Status:\t\t\tEnabled\n");
+
+   if (dargs.hash_algo == BTRFS_DEDUPE_HASH_SHA256)
+   printf("Hash algorithm:\t\tSHA-256\n");
+   else
+   printf("Hash algorithm:\t\tUnrecognized(%x)\n",
+   dargs.hash_algo);
+
+   if (dargs.backend == BTRFS_DEDUPE_BACKEND_INMEMORY) {
+   printf("Backend:\t\tIn-memory\n");
+   print_limit = 1;
+   } else  {
+   printf("Backend:\t\tUnrecognized(%x)\n",
+   dargs.backend);
+   }
+
+   printf("Dedup Blocksize:\t%llu\n", dargs.blocksize);
+
+   if (print_limit) {
+   u64 cur_mem;
+
+   /* Limit nr may be 0 */
+   if (dargs.limit_nr)
+   cur_mem = dargs.current_nr * (dargs.limit_mem /
+   dargs.limit_nr);
+   else
+   cur_mem = 0;
+
+   printf("Number of hash: \t[%llu/%llu]\n", dargs.current_nr,
+   dargs.limit_nr);
+   printf("Memory usage: \t\t[%s/%s]\n",
+   pretty_size(cur_mem),
+   pretty_size(dargs.limit_mem));
+   }
+out:
+   close_file_or_dir(fd, dirstream);
+   return ret;
+}
+
 const struct cmd_group dedupe_ib_cmd_group = {
dedupe_ib_cmd_group_usage, dedupe_ib_cmd_group_info, {
{ "enable", cmd_dedupe_ib_enable, cmd_dedupe_ib_enable_usage,
  NULL, 0},
{ "disable", cmd_dedupe_ib_disable, cmd_dedupe_ib_disable_usage,
  NULL, 0},
+   { "status", cmd_dedupe_ib_status, cmd_dedupe_ib_status_usage,
+ NULL, 0},
NULL_CMD_STRUCT
}
 };
-- 
2.18.0





[PATCH v10.5 0/5] In-band de-duplication for btrfs-progs

2018-09-04 Thread Lu Fengqi
Patchset can be fetched from github:
https://github.com/littleroad/btrfs-progs.git dedupe_latest

Inband dedupe (in-memory backend only) ioctl support for btrfs-progs.

v7 changes:
   Update ctree.h to follow kernel structure change
   Update print-tree to follow kernel structure change
V8 changes:
   Move dedup props and on-disk backend support out of the patchset
   Change command group name to "dedupe-inband", to avoid confusion with
   possible out-of-band dedupe. Suggested by Mark.
   Rebase to latest devel branch.
V9 changes:
    Follow the kernel's ioctl changes to support the FORCE flag, the new
    reconf ioctl, and more precise error reporting.
v10 changes:
   Rebase to v4.10.
   Add BUILD_ASSERT for btrfs_ioctl_dedupe_args
v10.1 changes:
   Rebase to v4.14.
v10.2 changes:
   Rebase to v4.16.1.
v10.3 changes:
   Rebase to v4.17.
v10.4 changes:
   Deal with offline reviews from Misono Tomohiro.
   1. s/btrfs-dedupe/btrfs-dedupe-inband
   2. Replace strerror(errno) with %m
    3. Use SZ_* instead of intermediate numbers
   4. update btrfs-completion for reconfigure subcommand
v10.5 changes:
   Rebase to v4.17.1.

Qu Wenruo (5):
  btrfs-progs: Basic framework for dedupe-inband command group
  btrfs-progs: dedupe: Add enable command for dedupe command group
  btrfs-progs: dedupe: Add disable support for inband deduplication
  btrfs-progs: dedupe: Add status subcommand
  btrfs-progs: dedupe: introduce reconfigure subcommand

 Documentation/Makefile.in  |   1 +
 Documentation/btrfs-dedupe-inband.asciidoc | 167 
 Documentation/btrfs.asciidoc   |   4 +
 Makefile   |   3 +-
 btrfs-completion   |   6 +-
 btrfs.c|   2 +
 cmds-dedupe-ib.c   | 437 +
 commands.h |   2 +
 dedupe-ib.h|  28 ++
 ioctl.h|  38 ++
 10 files changed, 686 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/btrfs-dedupe-inband.asciidoc
 create mode 100644 cmds-dedupe-ib.c
 create mode 100644 dedupe-ib.h

-- 
2.18.0





[PATCH v10.5 1/5] btrfs-progs: Basic framework for dedupe-inband command group

2018-09-04 Thread Lu Fengqi
From: Qu Wenruo 

Add a basic ioctl header and command group framework for later use,
along with a basic man page.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/Makefile.in  |  1 +
 Documentation/btrfs-dedupe-inband.asciidoc | 40 ++
 Documentation/btrfs.asciidoc   |  4 +++
 Makefile   |  3 +-
 btrfs.c|  2 ++
 cmds-dedupe-ib.c   | 35 +++
 commands.h |  2 ++
 dedupe-ib.h| 28 +++
 ioctl.h| 36 +++
 9 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/btrfs-dedupe-inband.asciidoc
 create mode 100644 cmds-dedupe-ib.c
 create mode 100644 dedupe-ib.h

diff --git a/Documentation/Makefile.in b/Documentation/Makefile.in
index 184647c41940..402155fae001 100644
--- a/Documentation/Makefile.in
+++ b/Documentation/Makefile.in
@@ -28,6 +28,7 @@ MAN8_TXT += btrfs-qgroup.asciidoc
 MAN8_TXT += btrfs-replace.asciidoc
 MAN8_TXT += btrfs-restore.asciidoc
 MAN8_TXT += btrfs-property.asciidoc
+MAN8_TXT += btrfs-dedupe-inband.asciidoc
 
 # Category 5 manual page
 MAN5_TXT += btrfs-man5.asciidoc
diff --git a/Documentation/btrfs-dedupe-inband.asciidoc 
b/Documentation/btrfs-dedupe-inband.asciidoc
new file mode 100644
index ..83113f5487e2
--- /dev/null
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -0,0 +1,40 @@
+btrfs-dedupe-inband(8)
+======================
+
+NAME
+----
+btrfs-dedupe-inband - manage in-band (write time) de-duplication of a btrfs
+filesystem
+
+SYNOPSIS
+--------
+*btrfs dedupe-inband* <subcommand> <args>
+
+DESCRIPTION
+-----------
+*btrfs dedupe-inband* is used to enable/disable or show the current in-band
+de-duplication status of a btrfs filesystem.
+
+Kernel support for in-band de-duplication starts from 4.19.
+
+WARNING: In-band de-duplication is still an experimental feature of btrfs,
+use with caution.
+
+SUBCOMMAND
+----------
+Nothing yet
+
+EXIT STATUS
+-----------
+*btrfs dedupe-inband* returns a zero exit status if it succeeds. Non-zero is
+returned in case of failure.
+
+AVAILABILITY
+------------
+*btrfs* is part of btrfs-progs.
+Please refer to the btrfs wiki http://btrfs.wiki.kernel.org for
+further details.
+
+SEE ALSO
+--------
+`mkfs.btrfs`(8),
diff --git a/Documentation/btrfs.asciidoc b/Documentation/btrfs.asciidoc
index 7316ac094413..1cf5bddec335 100644
--- a/Documentation/btrfs.asciidoc
+++ b/Documentation/btrfs.asciidoc
@@ -50,6 +50,10 @@ COMMANDS
Do off-line check on a btrfs filesystem. +
See `btrfs-check`(8) for details.
 
+*dedupe-inband*::
+   Control btrfs in-band (write time) de-duplication. +
+   See `btrfs-dedupe-inband`(8) for details.
+
 *device*::
Manage devices managed by btrfs, including add/delete/scan and so
on. +
diff --git a/Makefile b/Makefile
index fcfc815a2a5b..4052cecfae4d 100644
--- a/Makefile
+++ b/Makefile
@@ -123,7 +123,8 @@ cmds_objects = cmds-subvolume.o cmds-filesystem.o 
cmds-device.o cmds-scrub.o \
   cmds-restore.o cmds-rescue.o chunk-recover.o super-recover.o \
   cmds-property.o cmds-fi-usage.o cmds-inspect-dump-tree.o \
   cmds-inspect-dump-super.o cmds-inspect-tree-stats.o cmds-fi-du.o 
\
-  mkfs/common.o check/mode-common.o check/mode-lowmem.o
+  mkfs/common.o check/mode-common.o check/mode-lowmem.o \
+  cmds-dedupe-ib.o
 libbtrfs_objects = send-stream.o send-utils.o kernel-lib/rbtree.o btrfs-list.o 
\
   kernel-lib/crc32c.o messages.o \
   uuid-tree.o utils-lib.o rbtree-utils.o
diff --git a/btrfs.c b/btrfs.c
index 2d39f2ced3e8..2168f5a8bc7f 100644
--- a/btrfs.c
+++ b/btrfs.c
@@ -255,6 +255,8 @@ static const struct cmd_group btrfs_cmd_group = {
{ "quota", cmd_quota, NULL, _cmd_group, 0 },
{ "qgroup", cmd_qgroup, NULL, _cmd_group, 0 },
{ "replace", cmd_replace, NULL, _cmd_group, 0 },
+   { "dedupe-inband", cmd_dedupe_ib, NULL, _ib_cmd_group,
+   0 },
{ "help", cmd_help, cmd_help_usage, NULL, 0 },
{ "version", cmd_version, cmd_version_usage, NULL, 0 },
NULL_CMD_STRUCT
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
new file mode 100644
index ..73c923a797da
--- /dev/null
+++ b/cmds-dedupe-ib.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2017 Fujitsu.  All rights reserved.
+ */
+
+#include 
+#include 
+#include 
+
+#include "ctree.h"
+#include "ioctl.h"
+
+#include "commands.h"
+#include "utils.h"
+#include "kerncompat.h"
+#include "dedupe-ib.h"
+
+static const char * const dedupe_ib_cmd_gro

[PATCH v15 09/13] btrfs: introduce type based delalloc metadata reserve

2018-09-04 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce a type-based metadata reserve parameter for the delalloc space
reservation/freeing functions.

The problem we are going to solve is that btrfs uses different max
extent sizes for different mount options.

For de-duplication, the max extent size can be set by the dedupe ioctl,
while for normal writes it's 128M.
Furthermore, the split/merge extent hooks depend heavily on that max
extent size.

This situation contributes to quite a lot of false ENOSPC reports.

So this patch introduces the facility to help solve these false ENOSPC
problems related to different max extent sizes.

Currently, only the normal 128M extent size is supported. More types
will follow soon.
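
As a back-of-the-envelope illustration of why the max extent size
matters for reservations, here is a hedged userspace rendering of the
count_max_extents() arithmetic; the 64K dedupe extent size below is a
made-up example value, not something this patch fixes.

#include <stdio.h>

/* mirrors the kernel helper: how many max_extent_size extents cover size */
static unsigned long long count_max_extents(unsigned long long size,
                                            unsigned long long max_extent_size)
{
        return (size + max_extent_size - 1) / max_extent_size;
}

int main(void)
{
        unsigned long long size = 4ULL * 1024 * 1024;   /* a 4M delalloc range */

        /* normal COW: BTRFS_MAX_EXTENT_SIZE is 128M, so one reservation */
        printf("normal: %llu extent(s)\n",
               count_max_extents(size, 128ULL * 1024 * 1024));
        /* a hypothetical 64K dedupe extent size needs 64 reservations */
        printf("dedupe: %llu extent(s)\n",
               count_max_extents(size, 64ULL * 1024));
        return 0;
}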

Signed-off-by: Wang Xiaoguang 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h |  43 ++---
 fs/btrfs/extent-tree.c   |  48 ---
 fs/btrfs/file.c  |  30 +
 fs/btrfs/free-space-cache.c  |   6 +-
 fs/btrfs/inode-map.c |   9 ++-
 fs/btrfs/inode.c | 115 +--
 fs/btrfs/ioctl.c |  23 +++
 fs/btrfs/ordered-data.c  |   6 +-
 fs/btrfs/ordered-data.h  |   3 +-
 fs/btrfs/relocation.c|  22 ---
 fs/btrfs/tests/inode-tests.c |  15 +++--
 11 files changed, 223 insertions(+), 97 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 741ef21a6185..4f0b6a12ecb1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -98,11 +98,24 @@ static const int btrfs_csum_sizes[] = { 4 };
 /*
  * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
  */
-static inline u32 count_max_extents(u64 size)
+static inline u32 count_max_extents(u64 size, u64 max_extent_size)
 {
-   return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
+   return div_u64(size + max_extent_size - 1, max_extent_size);
 }
 
+/*
+ * Type of metadata space reservation
+ * This affects how btrfs reserves metadata space for buffered writes.
+ *
+ * This is needed because of the different max extent sizes for normal
+ * COW and the upcoming in-band dedupe
+ */
+enum btrfs_metadata_reserve_type {
+   BTRFS_RESERVE_NORMAL,
+};
+
+u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type);
+
 struct btrfs_mapping_tree {
struct extent_map_tree map_tree;
 };
@@ -2742,8 +2755,9 @@ int btrfs_check_data_free_space(struct inode *inode,
 void btrfs_free_reserved_data_space(struct inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
 void btrfs_delalloc_release_space(struct inode *inode,
- struct extent_changeset *reserved,
- u64 start, u64 len, bool qgroup_free);
+   struct extent_changeset *reserved,
+   u64 start, u64 len, bool qgroup_free,
+   enum btrfs_metadata_reserve_type reserve_type);
 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@ -2753,13 +2767,17 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root 
*root,
 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
  struct btrfs_block_rsv *rsv);
 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
-   bool qgroup_free);
+   bool qgroup_free,
+   enum btrfs_metadata_reserve_type reserve_type);
 
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+   enum btrfs_metadata_reserve_type reserve_type);
 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
-bool qgroup_free);
+   bool qgroup_free,
+   enum btrfs_metadata_reserve_type reserve_type);
 int btrfs_delalloc_reserve_space(struct inode *inode,
-   struct extent_changeset **reserved, u64 start, u64 len);
+   struct extent_changeset **reserved, u64 start, u64 len,
+   enum btrfs_metadata_reserve_type reserve_type);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
  unsigned short type);
@@ -3165,7 +3183,11 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root);
 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
  unsigned int extra_bits,
- struct extent_state **cached_state, int dedupe

[PATCH v15 03/13] btrfs: dedupe: Introduce function to add hash into in-memory tree

2018-09-04 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce static function inmem_add() to add hash into in-memory tree.
And now we can implement the btrfs_dedupe_add() interface.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 150 ++
 1 file changed, 150 insertions(+)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 06523162753d..784bb3a8a5ab 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -19,6 +19,14 @@ struct inmem_hash {
u8 hash[];
 };
 
+static inline struct inmem_hash *inmem_alloc_hash(u16 algo)
+{
+   if (WARN_ON(algo >= ARRAY_SIZE(btrfs_hash_sizes)))
+   return NULL;
+   return kzalloc(sizeof(struct inmem_hash) + btrfs_hash_sizes[algo],
+   GFP_NOFS);
+}
+
 static struct btrfs_dedupe_info *
 init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs)
 {
@@ -167,3 +175,145 @@ int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
/* Place holder for bisect, will be implemented in later patches */
return 0;
 }
+
+static int inmem_insert_hash(struct rb_root *root,
+struct inmem_hash *hash, int hash_len)
+{
+   struct rb_node **p = &root->rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, hash_node);
+   if (memcmp(hash->hash, entry->hash, hash_len) < 0)
+   p = &(*p)->rb_left;
+   else if (memcmp(hash->hash, entry->hash, hash_len) > 0)
+   p = &(*p)->rb_right;
+   else
+   return 1;
+   }
+   rb_link_node(&hash->hash_node, parent, p);
+   rb_insert_color(&hash->hash_node, root);
+   return 0;
+}
+
+static int inmem_insert_bytenr(struct rb_root *root,
+  struct inmem_hash *hash)
+{
+   struct rb_node **p = &root->rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, bytenr_node);
+   if (hash->bytenr < entry->bytenr)
+   p = &(*p)->rb_left;
+   else if (hash->bytenr > entry->bytenr)
+   p = &(*p)->rb_right;
+   else
+   return 1;
+   }
+   rb_link_node(&hash->bytenr_node, parent, p);
+   rb_insert_color(&hash->bytenr_node, root);
+   return 0;
+}
+
+static void __inmem_del(struct btrfs_dedupe_info *dedupe_info,
+   struct inmem_hash *hash)
+{
+   list_del(&hash->lru_list);
+   rb_erase(&hash->hash_node, &dedupe_info->hash_root);
+   rb_erase(&hash->bytenr_node, &dedupe_info->bytenr_root);
+
+   if (!WARN_ON(dedupe_info->current_nr == 0))
+   dedupe_info->current_nr--;
+
+   kfree(hash);
+}
+
+/*
+ * Insert a hash into the in-memory dedupe tree
+ * Will drop least-recently-used hashes that exceed the limit.
+ *
+ * If the hash matches an existing one, we won't insert it, to
+ * save memory
+ */
+static int inmem_add(struct btrfs_dedupe_info *dedupe_info,
+struct btrfs_dedupe_hash *hash)
+{
+   int ret = 0;
+   u16 algo = dedupe_info->hash_algo;
+   struct inmem_hash *ihash;
+
+   ihash = inmem_alloc_hash(algo);
+
+   if (!ihash)
+   return -ENOMEM;
+
+   /* Copy the data out */
+   ihash->bytenr = hash->bytenr;
+   ihash->num_bytes = hash->num_bytes;
+   memcpy(ihash->hash, hash->hash, btrfs_hash_sizes[algo]);
+
+   mutex_lock(&dedupe_info->lock);
+
+   ret = inmem_insert_bytenr(&dedupe_info->bytenr_root, ihash);
+   if (ret > 0) {
+   kfree(ihash);
+   ret = 0;
+   goto out;
+   }
+
+   ret = inmem_insert_hash(&dedupe_info->hash_root, ihash,
+   btrfs_hash_sizes[algo]);
+   if (ret > 0) {
+   /*
+* We only keep one hash in tree to save memory, so if
+* hash conflicts, free the one to insert.
+*/
+   rb_erase(&ihash->bytenr_node, &dedupe_info->bytenr_root);
+   kfree(ihash);
+   ret = 0;
+   goto out;
+   }
+
+   list_add(&ihash->lru_list, &dedupe_info->lru_list);
+   dedupe_info->current_nr++;
+
+   /* Drop the least recently used hashes while we exceed the limit */
+   while (dedupe_info->current_nr > dedupe_info->limit_nr) {
+   struct inmem_hash *last;
+
+   last = list_entry(dedupe_info->lru_list.prev,
+ struct inmem_hash, lru_list);
+   __inmem_del(dedupe_info, last);
+   }
+out:
+   mutex_unlock(&dedupe_info->lock);
+

[PATCH v15 00/13] Btrfs In-band De-duplication

2018-09-04 Thread Lu Fengqi
This patchset can be fetched from github:
https://github.com/littleroad/linux.git dedupe_latest

Now the new base is v4.19-rc2, and the patch about compression which
conflicted with the compression heuristic has been dropped.

Normal test cases from the auto group expose no regression, and the
ib-dedupe group passes without problems.

xfstests ib-dedupe group can be fetched from github:
https://github.com/littleroad/xfstests-dev.git btrfs_dedupe_latest

Changelog:
v2:
  Totally reworked to handle multiple backends
v3:
  Fix a stupid but deadly on-disk backend bug
  Add handle for multiple hash on same bytenr corner case to fix abort
  trans error
  Increase dedup rate by enhancing delayed ref handler for both backend.
  Move dedup_add() to run_delayed_ref() time, to fix abort trans error.
  Increase dedup block size up limit to 8M.
v4:
  Add dedup prop for disabling dedup for given files/dirs.
  Merge inmem_search() and ondisk_search() into generic_search() to save
  some code
  Fix another delayed_ref related bug.
  Use the same mutex for both inmem and ondisk backend.
  Move dedup_add() back to btrfs_finish_ordered_io() to increase dedup
  rate.
v5:
  Reuse compress routine for much simpler dedup function.
  Slightly improved performance due to above modification.
  Fix race between dedup enable/disable
  Fix for false ENOSPC report
v6:
  Further enable/disable race window fix.
  Minor format change according to checkpatch.
v7:
  Fix one concurrency bug with balance.
  Slightly modify return value from -EINVAL to -EOPNOTSUPP for
  btrfs_dedup_ioctl() to allow progs to distinguish unsupported commands
  and wrong parameter.
  Rebased to integration-4.6.
v8:
  Rename 'dedup' to 'dedupe'.
  Add support to allow dedupe and compression work at the same time.
  Fix several balance related bugs. Special thanks to Satoru Takeuchi,
  who exposed most of them.
  Small dedupe hit case performance improvement.
v9:
  Re-order the patchset to completely separate pure in-memory and any
  on-disk format change.
  Fold bug fixes into its original patch.
v10:
  Adding back missing bug fix patch.
  Reduce on-disk item size.
  Hide dedupe ioctl under CONFIG_BTRFS_DEBUG.
v11:
  Remove other backend and props support to focus on the framework and
  in-memory backend. Suggested by David.
  Better disable and buffered write race protection.
  Comprehensive fix to dedupe metadata ENOSPC problem.
v12:
  Stateful 'enable' ioctl and new 'reconf' ioctl
  New FORCE flag for enable ioctl to allow stateless ioctl
  Precise error report and extendable ioctl structure.
v12.1
  Rebase to David's for-next-20160704 branch
  Add co-ordinate patch for subpage and dedupe patchset.
v12.2
  Rebase to David's for-next-20160715 branch
  Add co-ordinate patch for other patchset.
v13
  Rebase to David's for-next-20160906 branch
  Fix a reserved space leak bug, which only frees quota reserved space
  but not space_info->byte_may_use.
v13.1
  Rebase to Chris' for-linux-4.9 branch
v14
  Use generic ENOSPC fix for both compression and dedupe.
v14.1
  Further split ENOSPC fix.
v14.2
  Rebase to v4.11-rc2.
  Co-operate with count_max_extent() to calculate num_extents.
  No longer rely on qgroup fixes.
v14.3
  Rebase to v4.12-rc1.
v14.4
  Rebase to kdave/for-4.13-part1.
v14.5
  Rebase to v4.15-rc3.
v14.6
  Rebase to v4.17-rc5.
v14.7
  Replace SHASH_DESC_ON_STACK with kmalloc to remove VLA.
  Fixed the following errors by switching to div_u64.
  ├── arm-allmodconfig
  │   └── ERROR:__aeabi_uldivmod-fs-btrfs-btrfs.ko-undefined
  └── i386-allmodconfig
  └── ERROR:__udivdi3-fs-btrfs-btrfs.ko-undefined
v14.8
  Rebase to v4.18-rc4.
v15
  Rebase to v4.19-rc2.
  Drop "btrfs: Introduce COMPRESS reserve type to fix false enospc for 
compression".
  Remove the ifdef around btrfs inband dedupe ioctl.

Qu Wenruo (4):
  btrfs: delayed-ref: Add support for increasing data ref under spinlock
  btrfs: dedupe: Inband in-memory only de-duplication implement
  btrfs: relocation: Enhance error handling to avoid BUG_ON
  btrfs: dedupe: Introduce new reconfigure ioctl

Wang Xiaoguang (9):
  btrfs: dedupe: Introduce dedupe framework and its header
  btrfs: dedupe: Introduce function to initialize dedupe info
  btrfs: dedupe: Introduce function to add hash into in-memory tree
  btrfs: dedupe: Introduce function to remove hash from in-memory tree
  btrfs: dedupe: Introduce function to search for an existing hash
  btrfs: dedupe: Implement btrfs_dedupe_calc_hash interface
  btrfs: ordered-extent: Add support for dedupe
  btrfs: introduce type based delalloc metadata reserve
  btrfs: dedupe: Add ioctl for inband deduplication

 fs/btrfs/Makefile|   2 +-
 fs/btrfs/ctree.h |  52 ++-
 fs/btrfs/dedupe.c| 828 +++
 fs/btrfs/dedupe.h| 175 +++-
 fs/btrfs/delayed-ref.c   |  53 ++-
 fs/btrfs/delayed-ref.h   |  15 +
 fs/btrfs/disk-io.c   |   4 +
 fs/btrfs/extent-tree.c   |  67 ++-
 fs/btrfs/extent_io.c 

[PATCH v15 13/13] btrfs: dedupe: Introduce new reconfigure ioctl

2018-09-04 Thread Lu Fengqi
From: Qu Wenruo 

Introduce a new reconfigure ioctl and a new FORCE flag for the in-band
dedupe ioctls.

Now the dedupe enable and reconfigure ioctls are stateful.


+---------------+------------+-------------+
| Current state |   Ioctl    | Next state  |
+---------------+------------+-------------+
| Disabled      |  enable    | Enabled     |
| Enabled       |  enable    | Not allowed |
| Enabled       |  reconf    | Enabled     |
| Enabled       |  disable   | Disabled    |
| Disabled      |  disable   | Disabled    |
| Disabled      |  reconf    | Not allowed |
+---------------+------------+-------------+
(While disable is always stateless)

For those who prefer stateless ioctls (myself, for example), a new FORCE
flag is introduced.

In FORCE mode, enable/disable is completely stateless.

+---------------+------------+-------------+
| Current state |   Ioctl    | Next state  |
+---------------+------------+-------------+
| Disabled      |  enable    | Enabled     |
| Enabled       |  enable    | Enabled     |
| Enabled       |  disable   | Disabled    |
| Disabled      |  disable   | Disabled    |
+---------------+------------+-------------+

Also, the re-configure ioctl will only modify the specified fields,
whereas with enable, unspecified fields will be reset to their default
values.

For example:
 # btrfs dedupe enable --block-size 64k /mnt
 # btrfs dedupe reconfigure --limit-hash 1m /mnt
Will lead to:
 dedupe blocksize: 64K
 dedupe hash limit nr: 1m

While for enable:
 # btrfs dedupe enable --force --block-size 64k /mnt
 # btrfs dedupe enable --force --limit-hash 1m /mnt
Will reset blocksize to default value:
 dedupe blocksize: 128K << reset
 dedupe hash limit nr: 1m
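
The state tables above reduce to a small predicate. Here is a hedged
sketch with illustrative names, not the kernel's actual code; the main()
merely exercises the two "Not allowed" rows.

#include <errno.h>
#include <stdio.h>

enum dedupe_state { DEDUPE_DISABLED, DEDUPE_ENABLED };
enum dedupe_cmd { CMD_ENABLE, CMD_DISABLE, CMD_RECONF };

static int dedupe_cmd_allowed(enum dedupe_state cur, enum dedupe_cmd cmd,
                              int force)
{
        if (cmd == CMD_DISABLE || force)
                return 0;       /* disable (and FORCE mode) is stateless */
        if (cmd == CMD_ENABLE && cur == DEDUPE_ENABLED)
                return -EINVAL; /* enable while already enabled */
        if (cmd == CMD_RECONF && cur == DEDUPE_DISABLED)
                return -EINVAL; /* reconf while disabled */
        return 0;
}

int main(void)
{
        printf("%d %d\n",
               dedupe_cmd_allowed(DEDUPE_ENABLED, CMD_ENABLE, 0),
               dedupe_cmd_allowed(DEDUPE_DISABLED, CMD_RECONF, 0));
        return 0;
}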

Suggested-by: David Sterba 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c  | 132 ++---
 fs/btrfs/dedupe.h  |  13 
 fs/btrfs/ioctl.c   |  13 
 include/uapi/linux/btrfs.h |  11 +++-
 4 files changed, 143 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index a147e148bbb8..2be3e53acc6a 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -29,6 +29,40 @@ static inline struct inmem_hash *inmem_alloc_hash(u16 algo)
GFP_NOFS);
 }
 
+/*
+ * Copy from the current dedupe info to fill dargs.
+ * For the reconf case, only fill members which are uninitialized.
+ */
+static void get_dedupe_status(struct btrfs_dedupe_info *dedupe_info,
+ struct btrfs_ioctl_dedupe_args *dargs)
+{
+   int reconf = (dargs->cmd == BTRFS_DEDUPE_CTL_RECONF);
+
+   dargs->status = 1;
+
+   if (!reconf || (reconf && dargs->blocksize == (u64)-1))
+   dargs->blocksize = dedupe_info->blocksize;
+   if (!reconf || (reconf && dargs->backend == (u16)-1))
+   dargs->backend = dedupe_info->backend;
+   if (!reconf || (reconf && dargs->hash_algo == (u16)-1))
+   dargs->hash_algo = dedupe_info->hash_algo;
+
+   /*
+* For the re-configure case, if not modifying the limits,
+* their limits will be set to 0, unlike other fields
+*/
+   if (!reconf || !(dargs->limit_nr || dargs->limit_mem)) {
+   dargs->limit_nr = dedupe_info->limit_nr;
+   dargs->limit_mem = dedupe_info->limit_nr *
+   (sizeof(struct inmem_hash) +
+btrfs_hash_sizes[dedupe_info->hash_algo]);
+   }
+
+   /* current_nr doesn't make sense for the reconfig case */
+   if (!reconf)
+   dargs->current_nr = dedupe_info->current_nr;
+}
+
 void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
 struct btrfs_ioctl_dedupe_args *dargs)
 {
@@ -45,15 +79,7 @@ void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
return;
}
mutex_lock(&dedupe_info->lock);
-   dargs->status = 1;
-   dargs->blocksize = dedupe_info->blocksize;
-   dargs->backend = dedupe_info->backend;
-   dargs->hash_algo = dedupe_info->hash_algo;
-   dargs->limit_nr = dedupe_info->limit_nr;
-   dargs->limit_mem = dedupe_info->limit_nr *
-   (sizeof(struct inmem_hash) +
-btrfs_hash_sizes[dedupe_info->hash_algo]);
-   dargs->current_nr = dedupe_info->current_nr;
+   get_dedupe_status(dedupe_info, dargs);
mutex_unlock(&dedupe_info->lock);
memset(dargs->__unused, -1, sizeof(dargs->__unused));
 }
@@ -98,17 +124,50 @@ init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs)
 static int check_dedupe_parameter(struct btrfs_fs_info *fs_info,
  struct btrfs_ioctl_dedupe_args *dargs)
 {
-   u64 blocksize = dargs->blocksize;
-   u64 limit_nr = dargs->limit_nr;
-   u64 limit_mem = dargs->limit_mem;
-   u16 hash_algo = dargs->hash_algo;
-   u8 backend 

[PATCH v15 12/13] btrfs: relocation: Enhance error handling to avoid BUG_ON

2018-09-04 Thread Lu Fengqi
From: Qu Wenruo 

Since the introduction of the btrfs dedupe tree, it's possible for
balance to race with dedupe disabling.

When this happens, dedupe_enabled will make btrfs_get_fs_root() return
PTR_ERR(-ENOENT).
But due to a bug in the error handling branch, when this happens
backref_cache->nr_nodes is increased while the node is neither added to
the backref_cache nor is nr_nodes decreased, causing a BUG_ON() in
backref_cache_cleanup():

[ 2611.668810] [ cut here ]
[ 2611.669946] kernel BUG at
/home/sat/ktest/linux/fs/btrfs/relocation.c:243!
[ 2611.670572] invalid opcode:  [#1] SMP
[ 2611.686797] Call Trace:
[ 2611.687034]  []
btrfs_relocate_block_group+0x1b3/0x290 [btrfs]
[ 2611.687706]  []
btrfs_relocate_chunk.isra.40+0x47/0xd0 [btrfs]
[ 2611.688385]  [] btrfs_balance+0xb22/0x11e0 [btrfs]
[ 2611.688966]  [] btrfs_ioctl_balance+0x391/0x3a0
[btrfs]
[ 2611.689587]  [] btrfs_ioctl+0x1650/0x2290 [btrfs]
[ 2611.690145]  [] ? lru_cache_add+0x3a/0x80
[ 2611.690647]  [] ?
lru_cache_add_active_or_unevictable+0x4c/0xc0
[ 2611.691310]  [] ? handle_mm_fault+0xcd4/0x17f0
[ 2611.691842]  [] ? cp_new_stat+0x153/0x180
[ 2611.692342]  [] ? __vma_link_rb+0xfd/0x110
[ 2611.692842]  [] ? vma_link+0xb9/0xc0
[ 2611.693303]  [] do_vfs_ioctl+0xa1/0x5a0
[ 2611.693781]  [] ? __do_page_fault+0x1b4/0x400
[ 2611.694310]  [] SyS_ioctl+0x41/0x70
[ 2611.694758]  [] entry_SYSCALL_64_fastpath+0x12/0x71
[ 2611.695331] Code: ff 48 8b 45 bf 49 83 af a8 05 00 00 01 49 89 87 a0
05 00 00 e9 2e fd ff ff b8 f4 ff ff ff e9 e4 fb ff ff 0f 0b 0f 0b 0f 0b
0f 0b <0f> 0b 0f 0b 41 89 c6 e9 b8 fb ff ff e8 9e a6 e8 e0 4c 89 e7 44
[ 2611.697870] RIP  []
relocate_block_group+0x741/0x7a0 [btrfs]
[ 2611.698818]  RSP 

This patch calls remove_backref_node() in the error handling branch,
catches the returned -ENOENT in relocate_tree_blocks(), and continues
balancing.

Reported-by: Satoru Takeuchi 
Signed-off-by: Qu Wenruo 
---
 fs/btrfs/relocation.c | 22 +-
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59a9c22ebf51..5f4b138fcb35 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -845,6 +845,13 @@ struct backref_node *build_backref_tree(struct 
reloc_control *rc,
root = read_fs_root(rc->extent_root->fs_info, key.offset);
if (IS_ERR(root)) {
err = PTR_ERR(root);
+   /*
+* Don't forget to clean up the current node, as it may not
+* have been added to backref_cache while nr_nodes was
+* increased.
+* That would cause a BUG_ON() in backref_cache_cleanup().
+*/
+   remove_backref_node(&rc->backref_cache, cur);
goto out;
}
 
@@ -3018,14 +3025,21 @@ int relocate_tree_blocks(struct btrfs_trans_handle 
*trans,
}
 
rb_node = rb_first(blocks);
-   while (rb_node) {
+   for (rb_node = rb_first(blocks); rb_node; rb_node = rb_next(rb_node)) {
block = rb_entry(rb_node, struct tree_block, rb_node);
 
node = build_backref_tree(rc, &block->key,
  block->level, block->bytenr);
if (IS_ERR(node)) {
+   /*
+* The root (the dedupe tree for now) of the tree block
+* is going to be freed and can't be reached.
+* Just skip it and continue balancing.
+*/
+   if (PTR_ERR(node) == -ENOENT)
+   continue;
err = PTR_ERR(node);
-   goto out;
+   break;
}
 
ret = relocate_tree_block(trans, rc, node, &block->key,
@@ -3033,11 +3047,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle 
*trans,
if (ret < 0) {
if (ret != -EAGAIN || rb_node == rb_first(blocks))
err = ret;
-   goto out;
+   break;
}
-   rb_node = rb_next(rb_node);
}
-out:
err = finish_pending_nodes(trans, rc, path, err);
 
 out_free_path:
-- 
2.18.0





[PATCH v15 07/13] btrfs: dedupe: Implement btrfs_dedupe_calc_hash interface

2018-09-04 Thread Lu Fengqi
From: Wang Xiaoguang 

Unlike the choice of dedupe backend (in-memory or on-disk), only the
SHA256 hash algorithm is supported so far, so implement the
btrfs_dedupe_calc_hash() interface using SHA256.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 50 +++
 1 file changed, 50 insertions(+)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 9c6152b7f0eb..9b0a90dd8e42 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -644,3 +644,53 @@ int btrfs_dedupe_search(struct btrfs_fs_info *fs_info,
}
return ret;
 }
+
+int btrfs_dedupe_calc_hash(struct btrfs_fs_info *fs_info,
+  struct inode *inode, u64 start,
+  struct btrfs_dedupe_hash *hash)
+{
+   int i;
+   int ret;
+   struct page *p;
+   struct shash_desc *shash;
+   struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+   struct crypto_shash *tfm = dedupe_info->dedupe_driver;
+   u64 dedupe_bs;
+   u64 sectorsize = fs_info->sectorsize;
+
+   if (!fs_info->dedupe_enabled || !hash)
+   return 0;
+
+   if (WARN_ON(dedupe_info == NULL))
+   return -EINVAL;
+
+   WARN_ON(!IS_ALIGNED(start, sectorsize));
+
+   shash = kmalloc(sizeof(*shash) + crypto_shash_descsize(tfm), GFP_NOFS);
+   if (!shash)
+   return -ENOMEM;
+
+   dedupe_bs = dedupe_info->blocksize;
+
+   shash->tfm = tfm;
+   shash->flags = 0;
+   ret = crypto_shash_init(shash);
+   if (ret)
+   goto out;
+   for (i = 0; sectorsize * i < dedupe_bs; i++) {
+   char *d;
+
+   p = find_get_page(inode->i_mapping,
+ (start >> PAGE_SHIFT) + i);
+   if (WARN_ON(!p)) {
+   ret = -ENOENT;
+   goto out;
+   }
+   d = kmap(p);
+   ret = crypto_shash_update(shash, d, sectorsize);
+   kunmap(p);
+   put_page(p);
+   if (ret)
+   goto out;
+   }
+   ret = crypto_shash_final(shash, hash->hash);
+out:
+   /* free the shash descriptor on every exit path */
+   kfree(shash);
+   return ret;
+}
-- 
2.18.0





[PATCH v15 11/13] btrfs: dedupe: Add ioctl for inband deduplication

2018-09-04 Thread Lu Fengqi
From: Wang Xiaoguang 

Add ioctl interface for inband deduplication, which includes:
1) enable
2) disable
3) status

Also add a pseudo RO compat flag to indicate that btrfs now supports
inband dedupe. No on-disk format change is involved; it is only a
pseudo RO compat flag.

All these ioctl interfaces are stateless, which means the caller doesn't
need to care about the previous dedupe state before calling them, only
about the final desired state.

For example, if the user wants to enable dedupe with a specified block
size and limit, they just fill the ioctl structure and call the enable
ioctl. There is no need to check whether dedupe is already running.

These ioctls also handle things like reconfiguration or disabling quite well.

Also, for invalid parameters, the enable ioctl will set the field of the
first invalid parameter it encounters to (-1) to inform the caller,
while for limit_nr/limit_mem the value will be (0).
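
A minimal userspace sketch of the intended call pattern follows. It is a
sketch only: it assumes the single BTRFS_IOC_DEDUPE_CTL ioctl with a cmd
field and the BTRFS_DEDUPE_CTL_* command constants this patchset adds to
include/uapi/linux/btrfs.h, so treat those names and values as
assumptions rather than a settled interface:

	/*
	 * Hedged example: enable in-memory dedupe with a 128K block size,
	 * then query the status.  BTRFS_IOC_DEDUPE_CTL and the
	 * BTRFS_DEDUPE_CTL_* constants are assumed from this patchset.
	 */
	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/btrfs.h>

	int enable_and_query_dedupe(const char *mnt)
	{
		struct btrfs_ioctl_dedupe_args dargs;
		int fd = open(mnt, O_RDONLY);

		if (fd < 0)
			return -1;

		memset(&dargs, 0, sizeof(dargs));
		dargs.cmd = BTRFS_DEDUPE_CTL_ENABLE;	/* assumed constant */
		dargs.backend = BTRFS_DEDUPE_BACKEND_INMEMORY;
		dargs.blocksize = 128 * 1024;
		dargs.hash_algo = 0;	/* SHA256, the only algorithm yet */
		dargs.limit_nr = 0;	/* 0 means use the default limit */
		if (ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs) < 0) {
			/* on failure the first invalid field is -1 (or 0) */
			perror("dedupe enable");
			close(fd);
			return -1;
		}

		dargs.cmd = BTRFS_DEDUPE_CTL_STATUS;	/* assumed constant */
		if (ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs) == 0)
			printf("status %u blocksize %llu current_nr %llu\n",
			       (unsigned int)dargs.status,
			       (unsigned long long)dargs.blocksize,
			       (unsigned long long)dargs.current_nr);
		close(fd);
		return 0;
	}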

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c  | 50 +
 fs/btrfs/dedupe.h  | 17 +++---
 fs/btrfs/disk-io.c |  3 ++
 fs/btrfs/ioctl.c   | 65 ++
 fs/btrfs/sysfs.c   |  2 ++
 include/uapi/linux/btrfs.h | 12 ++-
 6 files changed, 143 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 9b0a90dd8e42..a147e148bbb8 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -29,6 +29,35 @@ static inline struct inmem_hash *inmem_alloc_hash(u16 algo)
GFP_NOFS);
 }
 
+void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
+struct btrfs_ioctl_dedupe_args *dargs)
+{
+   struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+
+   if (!fs_info->dedupe_enabled || !dedupe_info) {
+   dargs->status = 0;
+   dargs->blocksize = 0;
+   dargs->backend = 0;
+   dargs->hash_algo = 0;
+   dargs->limit_nr = 0;
+   dargs->current_nr = 0;
+   memset(dargs->__unused, -1, sizeof(dargs->__unused));
+   return;
+   }
+   mutex_lock(&dedupe_info->lock);
+   dargs->status = 1;
+   dargs->blocksize = dedupe_info->blocksize;
+   dargs->backend = dedupe_info->backend;
+   dargs->hash_algo = dedupe_info->hash_algo;
+   dargs->limit_nr = dedupe_info->limit_nr;
+   dargs->limit_mem = dedupe_info->limit_nr *
+   (sizeof(struct inmem_hash) +
+btrfs_hash_sizes[dedupe_info->hash_algo]);
+   dargs->current_nr = dedupe_info->current_nr;
+   mutex_unlock(&dedupe_info->lock);
+   memset(dargs->__unused, -1, sizeof(dargs->__unused));
+}
+
 static struct btrfs_dedupe_info *
 init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs)
 {
@@ -402,6 +431,27 @@ static void unblock_all_writers(struct btrfs_fs_info *fs_info)
percpu_up_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
 }
 
+int btrfs_dedupe_cleanup(struct btrfs_fs_info *fs_info)
+{
+   struct btrfs_dedupe_info *dedupe_info;
+
+   fs_info->dedupe_enabled = 0;
+   /* same as disable */
+   smp_wmb();
+   dedupe_info = fs_info->dedupe_info;
+   fs_info->dedupe_info = NULL;
+
+   if (!dedupe_info)
+   return 0;
+
+   if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY)
+   inmem_destroy(dedupe_info);
+
+   crypto_free_shash(dedupe_info->dedupe_driver);
+   kfree(dedupe_info);
+   return 0;
+}
+
 int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
 {
struct btrfs_dedupe_info *dedupe_info;
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index 8157b17c4d11..fdd00355d6b5 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -90,6 +90,15 @@ static inline struct btrfs_dedupe_hash *btrfs_dedupe_alloc_hash(u16 algo)
 int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dedupe_args *dargs);
 
+
+/*
+ * Get inband dedupe info
+ * Since it needs to access different backends' hash size, which
+ * is not exported, we need such a simple function.
+ */
+void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
+struct btrfs_ioctl_dedupe_args *dargs);
+
 /*
  * Disable dedupe and invalidate all its dedupe data.
  * Called at dedupe disable time.
@@ -101,12 +110,10 @@ int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
 int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info);
 
 /*
- * Get current dedupe status.
- * Return 0 for success
- * No possible error yet
+ * Clean up the current btrfs_dedupe_info
+ * Called at umount time
  */
-void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
-struct btrfs_ioctl_dedupe_args *dargs);
+int btrfs_dedupe_cleanup(struct btrfs_fs_info *fs_info);
 
 /*
  * Calculate hash for dedupe.
diff 

[PATCH v15 04/13] btrfs: dedupe: Introduce function to remove hash from in-memory tree

2018-09-04 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce the static function inmem_del() to remove a hash from the
in-memory dedupe tree, and implement the btrfs_dedupe_del() and
btrfs_dedupe_disable() interfaces.

Also for btrfs_dedupe_disable(), add new functions to wait for existing
writers and block incoming writers, to eliminate all possible races.

Cc: Mark Fasheh 
Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 131 +++---
 1 file changed, 125 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 784bb3a8a5ab..951fefd19fde 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -170,12 +170,6 @@ int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
return ret;
 }
 
-int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
-{
-   /* Place holder for bisect, will be implemented in later patches */
-   return 0;
-}
-
 static int inmem_insert_hash(struct rb_root *root,
 struct inmem_hash *hash, int hash_len)
 {
@@ -317,3 +311,128 @@ int btrfs_dedupe_add(struct btrfs_fs_info *fs_info,
return inmem_add(dedupe_info, hash);
return -EINVAL;
 }
+
+static struct inmem_hash *
+inmem_search_bytenr(struct btrfs_dedupe_info *dedupe_info, u64 bytenr)
+{
+   struct rb_node **p = &dedupe_info->bytenr_root.rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, bytenr_node);
+
+   if (bytenr < entry->bytenr)
+   p = &(*p)->rb_left;
+   else if (bytenr > entry->bytenr)
+   p = &(*p)->rb_right;
+   else
+   return entry;
+   }
+
+   return NULL;
+}
+
+/* Delete a hash from in-memory dedupe tree */
+static int inmem_del(struct btrfs_dedupe_info *dedupe_info, u64 bytenr)
+{
+   struct inmem_hash *hash;
+
+   mutex_lock(&dedupe_info->lock);
+   hash = inmem_search_bytenr(dedupe_info, bytenr);
+   if (!hash) {
+   mutex_unlock(&dedupe_info->lock);
+   return 0;
+   }
+
+   __inmem_del(dedupe_info, hash);
+   mutex_unlock(&dedupe_info->lock);
+   return 0;
+}
+
+/* Remove a dedupe hash from dedupe tree */
+int btrfs_dedupe_del(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+   struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+
+   if (!fs_info->dedupe_enabled)
+   return 0;
+
+   if (WARN_ON(dedupe_info == NULL))
+   return -EINVAL;
+
+   if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY)
+   return inmem_del(dedupe_info, bytenr);
+   return -EINVAL;
+}
+
+static void inmem_destroy(struct btrfs_dedupe_info *dedupe_info)
+{
+   struct inmem_hash *entry, *tmp;
+
+   mutex_lock(&dedupe_info->lock);
+   list_for_each_entry_safe(entry, tmp, &dedupe_info->lru_list, lru_list)
+   __inmem_del(dedupe_info, entry);
+   mutex_unlock(&dedupe_info->lock);
+}
+
+/*
+ * Helper function to wait and block all incoming writers
+ *
+ * Use rw_sem introduced for freeze to wait/block writers.
+ * So during the blocked time no new write will happen, and we can
+ * do something quite safe. This is especially helpful for dedupe
+ * disable, as it affects buffered writes.
+ */
+static void block_all_writers(struct btrfs_fs_info *fs_info)
+{
+   struct super_block *sb = fs_info->sb;
+
+   percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
+   down_write(&sb->s_umount);
+}
+
+static void unblock_all_writers(struct btrfs_fs_info *fs_info)
+{
+   struct super_block *sb = fs_info->sb;
+
+   up_write(&sb->s_umount);
+   percpu_up_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
+}
+
+int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
+{
+   struct btrfs_dedupe_info *dedupe_info;
+   int ret;
+
+   dedupe_info = fs_info->dedupe_info;
+
+   if (!dedupe_info)
+   return 0;
+
+   /* Don't allow disable status change in RO mount */
+   if (sb_rdonly(fs_info->sb))
+   return -EROFS;
+
+   /*
+* Wait for all unfinished writers and block further writers.
+* Then sync the whole fs so all current write will go through
+* dedupe, and all later write won't go through dedupe.
+*/
+   block_all_writers(fs_info);
+   ret = sync_filesystem(fs_info->sb);
+   fs_info->dedupe_enabled = 0;
+   fs_info->dedupe_info = NULL;
+   unblock_all_writers(fs_info);
+   if (ret < 0)
+   return ret;
+
+   /* now we are OK to clean up everything */
+   if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY)
+   inmem_destroy(dedupe_info);
+
+   crypto_free_shash(dedupe_info->dedupe_driver);
+   kfree(dedupe_info);
+   return 0;
+}
-- 
2.18.0





[PATCH v15 02/13] btrfs: dedupe: Introduce function to initialize dedupe info

2018-09-04 Thread Lu Fengqi
From: Wang Xiaoguang 

Add generic function to initialize dedupe info.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/Makefile  |   2 +-
 fs/btrfs/dedupe.c  | 172 +
 fs/btrfs/dedupe.h  |  12 +++
 include/uapi/linux/btrfs.h |   3 +
 4 files changed, 188 insertions(+), 1 deletion(-)
 create mode 100644 fs/btrfs/dedupe.c

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ca693dd554e9..78fdc87dba39 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -10,7 +10,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o 
root-tree.o dir-item.o \
   export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
-  uuid-tree.o props.o free-space-tree.o tree-checker.o
+  uuid-tree.o props.o free-space-tree.o tree-checker.o dedupe.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
new file mode 100644
index ..06523162753d
--- /dev/null
+++ b/fs/btrfs/dedupe.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2016 Fujitsu.  All rights reserved.
+ */
+
+#include "ctree.h"
+#include "dedupe.h"
+#include "btrfs_inode.h"
+#include "delayed-ref.h"
+
+struct inmem_hash {
+   struct rb_node hash_node;
+   struct rb_node bytenr_node;
+   struct list_head lru_list;
+
+   u64 bytenr;
+   u32 num_bytes;
+
+   u8 hash[];
+};
+
+static struct btrfs_dedupe_info *
+init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs)
+{
+   struct btrfs_dedupe_info *dedupe_info;
+
+   dedupe_info = kzalloc(sizeof(*dedupe_info), GFP_NOFS);
+   if (!dedupe_info)
+   return ERR_PTR(-ENOMEM);
+
+   dedupe_info->hash_algo = dargs->hash_algo;
+   dedupe_info->backend = dargs->backend;
+   dedupe_info->blocksize = dargs->blocksize;
+   dedupe_info->limit_nr = dargs->limit_nr;
+
+   /* only support SHA256 yet */
+   dedupe_info->dedupe_driver = crypto_alloc_shash("sha256", 0, 0);
+   if (IS_ERR(dedupe_info->dedupe_driver)) {
+   struct btrfs_dedupe_info *err;
+
+   err = ERR_CAST(dedupe_info->dedupe_driver);
+   kfree(dedupe_info);
+   return err;
+   }
+
+   dedupe_info->hash_root = RB_ROOT;
+   dedupe_info->bytenr_root = RB_ROOT;
+   dedupe_info->current_nr = 0;
+   INIT_LIST_HEAD(&dedupe_info->lru_list);
+   mutex_init(&dedupe_info->lock);
+
+   return dedupe_info;
+}
+
+/*
+ * Helper to check if parameters are valid.
+ * The first invalid field will be set to (-1) to inform the user which
+ * parameter is invalid. The exception is dargs->limit_nr and
+ * dargs->limit_mem: there, 0 will be returned instead, since the user can
+ * specify any limit value except 0.
+ */
+static int check_dedupe_parameter(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dedupe_args *dargs)
+{
+   u64 blocksize = dargs->blocksize;
+   u64 limit_nr = dargs->limit_nr;
+   u64 limit_mem = dargs->limit_mem;
+   u16 hash_algo = dargs->hash_algo;
+   u8 backend = dargs->backend;
+
+   /*
+* Set all reserved fields to -1, allow user to detect
+* unsupported optional parameters.
+*/
+   memset(dargs->__unused, -1, sizeof(dargs->__unused));
+   if (blocksize > BTRFS_DEDUPE_BLOCKSIZE_MAX ||
+   blocksize < BTRFS_DEDUPE_BLOCKSIZE_MIN ||
+   blocksize < fs_info->sectorsize ||
+   !is_power_of_2(blocksize) ||
+   blocksize < PAGE_SIZE) {
+   dargs->blocksize = (u64)-1;
+   return -EINVAL;
+   }
+   if (hash_algo >= ARRAY_SIZE(btrfs_hash_sizes)) {
+   dargs->hash_algo = (u16)-1;
+   return -EINVAL;
+   }
+   if (backend >= BTRFS_DEDUPE_BACKEND_COUNT) {
+   dargs->backend = (u8)-1;
+   return -EINVAL;
+   }
+
+   /* Backend specific check */
+   if (backend == BTRFS_DEDUPE_BACKEND_INMEMORY) {
+   /* only one limit is accepted for enable */
+   if (dargs->limit_nr && dargs->limit_mem) {
+   dargs->limit_nr = 0;
+   dargs->limit_mem = 0;
+   return -EINVAL;
+   }
+
+   if (!limit_nr && !limit_mem)
+   dargs->limit_nr = BTRFS_DEDUPE_LIMIT_NR_DEFAULT;
+   else {
+   u64 tmp = (u64)-1;
+
+   if (limit_mem) {
+   tmp = div_u64(limit_mem,
+  

[PATCH v15 10/13] btrfs: dedupe: Inband in-memory only de-duplication implement

2018-09-04 Thread Lu Fengqi
From: Qu Wenruo 

Core implementation of inband de-duplication.
It reuses the async_cow_start() facility to calculate the dedupe hash,
and uses that hash to do inband de-duplication at the extent level.

The workflow is as below:
1) Run delalloc range for an inode
2) Calculate hash for the delalloc range at the unit of dedupe_bs
3) For the hash match (duplicated) case, just increase the source
   extent ref and insert the file extent.
   For the hash mismatch case, go through the normal cow_file_range()
   fallback, and add the hash into the dedupe tree.
   Compression for the hash miss case is not supported yet.

The current implementation stores all dedupe hashes in an in-memory
rb-tree, with LRU behavior to control the limit.
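
In pseudo-C, the per-block flow described above looks roughly like the
following (a sketch only; insert_dedupe_file_extent() and cow_one_block()
are illustrative stand-ins for the real run_delalloc_dedupe() /
cow_file_range() plumbing in fs/btrfs/inode.c):

	static int sketch_dedupe_one_block(struct inode *inode, u64 start,
					   struct btrfs_dedupe_hash *hash)
	{
		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
		int ret;

		/* 2) hash one dedupe_bs sized block */
		ret = btrfs_dedupe_calc_hash(fs_info, inode, start, hash);
		if (ret < 0)
			return ret;

		/* 3) search; on a hit the extent ref is already increased */
		ret = btrfs_dedupe_search(fs_info, inode, start, hash);
		if (ret < 0)
			return ret;

		if (btrfs_dedupe_hash_hit(hash))
			/* duplicated: point the file extent at hash->bytenr */
			return insert_dedupe_file_extent(inode, start, hash);

		/* miss: normal COW fallback, then remember the new hash */
		ret = cow_one_block(inode, start, hash);
		if (ret < 0)
			return ret;
		return btrfs_dedupe_add(fs_info, hash);
	}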

Signed-off-by: Wang Xiaoguang 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h   |   4 +-
 fs/btrfs/dedupe.h  |  15 ++
 fs/btrfs/extent-tree.c |  31 +++-
 fs/btrfs/extent_io.c   |   7 +-
 fs/btrfs/extent_io.h   |   1 +
 fs/btrfs/file.c|   4 +
 fs/btrfs/inode.c   | 316 +++--
 fs/btrfs/ioctl.c   |   1 +
 fs/btrfs/relocation.c  |  18 +++
 9 files changed, 341 insertions(+), 56 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4f0b6a12ecb1..627d617e3265 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -112,9 +112,11 @@ static inline u32 count_max_extents(u64 size, u64 
max_extent_size)
  */
 enum btrfs_metadata_reserve_type {
BTRFS_RESERVE_NORMAL,
+   BTRFS_RESERVE_DEDUPE,
 };
 
-u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type);
+u64 btrfs_max_extent_size(struct btrfs_inode *inode,
+ enum btrfs_metadata_reserve_type reserve_type);
 
 struct btrfs_mapping_tree {
struct extent_map_tree map_tree;
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index 87f5b7ce7766..8157b17c4d11 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -7,6 +7,7 @@
 #define BTRFS_DEDUPE_H
 
 #include 
+#include "btrfs_inode.h"
 
 /* 32 bytes for SHA256 */
 static const int btrfs_hash_sizes[] = { 32 };
@@ -47,6 +48,20 @@ struct btrfs_dedupe_info {
u64 current_nr;
 };
 
+static inline u64 btrfs_dedupe_blocksize(struct btrfs_inode *inode)
+{
+   struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+   return fs_info->dedupe_info->blocksize;
+}
+
+static inline int inode_need_dedupe(struct inode *inode)
+{
+   struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+
+   return fs_info->dedupe_enabled;
+}
+
 static inline int btrfs_dedupe_hash_hit(struct btrfs_dedupe_hash *hash)
 {
return (hash && hash->bytenr);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f90233ffcb27..131d48487c84 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -28,6 +28,7 @@
 #include "sysfs.h"
 #include "qgroup.h"
 #include "ref-verify.h"
+#include "dedupe.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
@@ -2489,6 +2490,17 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
btrfs_pin_extent(fs_info, head->bytenr,
 head->num_bytes, 1);
if (head->is_data) {
+   /*
+* If insert_reserved is given, it means a new
+* extent is reserved, then deleted in one
+* transaction, and the inc/dec refs get merged
+* to 0.
+*
+* In this case, we need to remove its dedupe
+* hash.
+*/
+   ret = btrfs_dedupe_del(fs_info, head->bytenr);
+   if (ret < 0)
+   return ret;
ret = btrfs_del_csums(trans, fs_info, head->bytenr,
  head->num_bytes);
}
@@ -5882,13 +5894,15 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
spin_unlock(&block_rsv->lock);
 }
 
-u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type)
+u64 btrfs_max_extent_size(struct btrfs_inode *inode,
+ enum btrfs_metadata_reserve_type reserve_type)
 {
if (reserve_type == BTRFS_RESERVE_NORMAL)
return BTRFS_MAX_EXTENT_SIZE;
-
-   ASSERT(0);
-   return BTRFS_MAX_EXTENT_SIZE;
+   else if (reserve_type == BTRFS_RESERVE_DEDUPE)
+   return btrfs_dedupe_blocksize(inode);
+   else
+   return BTRFS_MAX_EXTENT_SIZE;
 }
 
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
@@ -5899,7 +5913,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
bool delalloc_lock = true;
-   u64 max_extent_size = btrfs_max_extent_size(r

[PATCH v15 05/13] btrfs: delayed-ref: Add support for increasing data ref under spinlock

2018-09-04 Thread Lu Fengqi
From: Qu Wenruo 

For in-band dedupe, btrfs needs to increase the data ref with the
delayed_refs lock held, so add a new function
btrfs_add_delayed_data_ref_locked() to increase the extent ref with
delayed_refs already locked. Export init_delayed_ref_head and
init_delayed_ref_common for inband dedupe.

Signed-off-by: Qu Wenruo 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/delayed-ref.c | 53 +-
 fs/btrfs/delayed-ref.h | 15 
 2 files changed, 52 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 62ff545ba1f7..faca30b334ee 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -526,7 +526,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
spin_unlock(>lock);
 }
 
-static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
+void btrfs_init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
  struct btrfs_qgroup_extent_record *qrecord,
  u64 bytenr, u64 num_bytes, u64 ref_root,
  u64 reserved, int action, bool is_data,
@@ -654,7 +654,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
 }
 
 /*
- * init_delayed_ref_common - Initialize the structure which represents a
+ * btrfs_init_delayed_ref_common - Initialize the structure which represents a
  *  modification to a an extent.
  *
  * @fs_info:Internal to the mounted filesystem mount structure.
@@ -678,7 +678,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
  * when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
  * BTRFS_EXTENT_DATA_REF_KEY when recording data extent
  */
-static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
+void btrfs_init_delayed_ref_common(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes, u64 ref_root,
int action, u8 ref_type)
@@ -751,14 +751,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
else
ref_type = BTRFS_TREE_BLOCK_REF_KEY;
 
-   init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
-   ref_root, action, ref_type);
+   btrfs_init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+ ref_root, action, ref_type);
ref->root = ref_root;
ref->parent = parent;
ref->level = level;
 
-   init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- ref_root, 0, action, false, is_system);
+   btrfs_init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
+   ref_root, 0, action, false, is_system);
head_ref->extent_op = extent_op;
 
delayed_refs = &trans->transaction->delayed_refs;
@@ -787,6 +787,29 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
return 0;
 }
 
+/*
+ * Do the real delayed data ref insert.
+ * The caller must hold delayed_refs->lock and have allocated memory
+ * for dref, head_ref and record.
+ */
+int btrfs_add_delayed_data_ref_locked(struct btrfs_trans_handle *trans,
+   struct btrfs_delayed_ref_head *head_ref,
+   struct btrfs_qgroup_extent_record *qrecord,
+   struct btrfs_delayed_data_ref *ref, int action,
+   int *qrecord_inserted_ret, int *old_ref_mod,
+   int *new_ref_mod)
+{
+   struct btrfs_delayed_ref_root *delayed_refs;
+
+   head_ref = add_delayed_ref_head(trans, head_ref, qrecord,
+   action, qrecord_inserted_ret,
+   old_ref_mod, new_ref_mod);
+
+   delayed_refs = &trans->transaction->delayed_refs;
+
+   return insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
+}
+
 /*
  * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
  */
@@ -813,7 +836,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
ref_type = BTRFS_SHARED_DATA_REF_KEY;
else
ref_type = BTRFS_EXTENT_DATA_REF_KEY;
-   init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+   btrfs_init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
ref_root, action, ref_type);
ref->root = ref_root;
ref->parent = parent;
@@ -838,8 +861,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
}
}
 
-   init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
- reserved, action, true, false);
+   btrfs_init_delayed_ref_head(head_ref, record, bytenr, num_byte

[PATCH v15 08/13] btrfs: ordered-extent: Add support for dedupe

2018-09-04 Thread Lu Fengqi
From: Wang Xiaoguang 

Add ordered-extent support for dedupe.

Note that the current ordered-extent support only covers non-compressed
source extents; support for compressed source extents will be added
later. A sketch of the hash-hit call pattern follows below.
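
For a hash hit, the caller passes the found hash down so the ordered
extent records the existing extent. A minimal sketch (the file_pos and
ram_size parameters are illustrative, BTRFS_ORDERED_REGULAR is the plain
ordered-extent type, and this is not the exact caller added later in the
series):

	/*
	 * Sketch: queue an ordered extent for a deduped write.  On a
	 * hash hit, hash->bytenr/num_bytes point at the existing extent,
	 * so the disk extent comes from the hash rather than from a new
	 * allocation.
	 */
	static int sketch_queue_dedupe_ordered(struct inode *inode,
					       u64 file_pos, u64 ram_size,
					       struct btrfs_dedupe_hash *hash)
	{
		if (!btrfs_dedupe_hash_hit(hash))
			return -EINVAL;	/* miss: use the normal COW path */

		/* reuse the on-disk extent found by btrfs_dedupe_search() */
		return btrfs_add_ordered_extent_dedupe(inode, file_pos,
						       hash->bytenr, ram_size,
						       hash->num_bytes,
						       BTRFS_ORDERED_REGULAR,
						       hash);
	}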

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
---
 fs/btrfs/ordered-data.c | 46 +
 fs/btrfs/ordered-data.h | 13 
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 0c4ef208b8b9..4b112258a79b 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -12,6 +12,7 @@
 #include "extent_io.h"
 #include "disk-io.h"
 #include "compression.h"
+#include "dedupe.h"
 
 static struct kmem_cache *btrfs_ordered_extent_cache;
 
@@ -170,7 +171,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
  */
 static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
  u64 start, u64 len, u64 disk_len,
- int type, int dio, int compress_type)
+ int type, int dio, int compress_type,
+ struct btrfs_dedupe_hash *hash)
 {
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -191,6 +193,33 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->inode = igrab(inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
+   entry->hash = NULL;
+   /*
+* A hash hit means we have already incremented the extent's
+* delayed ref.
+* We must handle this even if another process is trying to
+* turn off dedupe, otherwise we will leak a reference.
+*/
+   if (hash && (hash->bytenr || root->fs_info->dedupe_enabled)) {
+   struct btrfs_dedupe_info *dedupe_info;
+
+   dedupe_info = root->fs_info->dedupe_info;
+   if (WARN_ON(dedupe_info == NULL)) {
+   kmem_cache_free(btrfs_ordered_extent_cache,
+   entry);
+   return -EINVAL;
+   }
+   entry->hash = btrfs_dedupe_alloc_hash(dedupe_info->hash_algo);
+   if (!entry->hash) {
+   kmem_cache_free(btrfs_ordered_extent_cache, entry);
+   return -ENOMEM;
+   }
+   entry->hash->bytenr = hash->bytenr;
+   entry->hash->num_bytes = hash->num_bytes;
+   memcpy(entry->hash->hash, hash->hash,
+  btrfs_hash_sizes[dedupe_info->hash_algo]);
+   }
+
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, >flags);
 
@@ -245,15 +274,23 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 0,
- BTRFS_COMPRESS_NONE);
+ BTRFS_COMPRESS_NONE, NULL);
 }
 
+int btrfs_add_ordered_extent_dedupe(struct inode *inode, u64 file_offset,
+  u64 start, u64 len, u64 disk_len, int type,
+  struct btrfs_dedupe_hash *hash)
+{
+   return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 0,
+ BTRFS_COMPRESS_NONE, hash);
+}
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
 u64 start, u64 len, u64 disk_len, int type)
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 1,
- BTRFS_COMPRESS_NONE);
+ BTRFS_COMPRESS_NONE, NULL);
 }
 
 int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
@@ -262,7 +299,7 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 0,
- compress_type);
+ compress_type, NULL);
 }
 
 /*
@@ -444,6 +481,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
list_del(&sum->list);
kfree(sum);
}
+   kfree(entry->hash);
kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
 }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 02d813aaa261..08c7ee986bb9 100644
--- 

[PATCH v15 01/13] btrfs: dedupe: Introduce dedupe framework and its header

2018-09-04 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce the header for the btrfs in-band (write time) de-duplication
framework and the definitions it needs.

The new de-duplication framework is going to support 2 different dedupe
backends and 1 dedupe hash algorithm.
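
Since btrfs_dedupe_hash ends in a flexible array sized by the algorithm,
allocating one is done against btrfs_hash_sizes[]; a minimal sketch of
the alloc helper that later patches add as btrfs_dedupe_alloc_hash():

	/*
	 * Sketch: allocate a variable-length dedupe hash.  sizeof(*hash)
	 * covers bytenr/num_bytes; the hash array is appended after it.
	 */
	static inline struct btrfs_dedupe_hash *sketch_alloc_hash(u16 algo)
	{
		return kzalloc(sizeof(struct btrfs_dedupe_hash) +
			       btrfs_hash_sizes[algo], GFP_NOFS);
	}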

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h   |   7 ++
 fs/btrfs/dedupe.h  | 128 -
 fs/btrfs/disk-io.c |   1 +
 include/uapi/linux/btrfs.h |  34 ++
 4 files changed, 168 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 53af9f5253f4..741ef21a6185 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1125,6 +1125,13 @@ struct btrfs_fs_info {
spinlock_t ref_verify_lock;
struct rb_root block_tree;
 #endif
+
+   /*
+* Inband de-duplication related structures
+*/
+   unsigned long dedupe_enabled:1;
+   struct btrfs_dedupe_info *dedupe_info;
+   struct mutex dedupe_ioctl_lock;
 };
 
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index 90281a7a35a8..222ce7b4d827 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -6,7 +6,131 @@
 #ifndef BTRFS_DEDUPE_H
 #define BTRFS_DEDUPE_H
 
-/* later in-band dedupe will expand this struct */
-struct btrfs_dedupe_hash;
+#include 
 
+/* 32 bytes for SHA256 */
+static const int btrfs_hash_sizes[] = { 32 };
+
+/*
+ * For caller outside of dedupe.c
+ *
+ * Different dedupe backends should have their own hash structure
+ */
+struct btrfs_dedupe_hash {
+   u64 bytenr;
+   u32 num_bytes;
+
+   /* last field is a variable length array of dedupe hash */
+   u8 hash[];
+};
+
+struct btrfs_dedupe_info {
+   /* dedupe blocksize */
+   u64 blocksize;
+   u16 backend;
+   u16 hash_algo;
+
+   struct crypto_shash *dedupe_driver;
+
+   /*
+* Use a mutex to protect both backends.
+* Even for the in-memory backend, the rb-tree can be quite large,
+* so a mutex is better for such a use case.
+*/
+   struct mutex lock;
+
+   /* following members are only used in in-memory backend */
+   struct rb_root hash_root;
+   struct rb_root bytenr_root;
+   struct list_head lru_list;
+   u64 limit_nr;
+   u64 current_nr;
+};
+
+static inline int btrfs_dedupe_hash_hit(struct btrfs_dedupe_hash *hash)
+{
+   return (hash && hash->bytenr);
+}
+
+/*
+ * Initialize inband dedupe info
+ * Called at dedupe enable time.
+ *
+ * Return 0 for success
+ * Return <0 for any error
+ * (from unsupported param to tree creation error for some backends)
+ */
+int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
+   struct btrfs_ioctl_dedupe_args *dargs);
+
+/*
+ * Disable dedupe and invalidate all its dedupe data.
+ * Called at dedupe disable time.
+ *
+ * Return 0 for success
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ */
+int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info);
+
+/*
+ * Get current dedupe status.
+ * Return 0 for success
+ * No possible error yet
+ */
+void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
+struct btrfs_ioctl_dedupe_args *dargs);
+
+/*
+ * Calculate hash for dedupe.
+ * Caller must ensure [start, start + dedupe_bs) has valid data.
+ *
+ * Return 0 for success
+ * Return <0 for any error
+ * (error from hash codes)
+ */
+int btrfs_dedupe_calc_hash(struct btrfs_fs_info *fs_info,
+  struct inode *inode, u64 start,
+  struct btrfs_dedupe_hash *hash);
+
+/*
+ * Search for duplicated extents by calculated hash
+ * Caller must call btrfs_dedupe_calc_hash() first to get the hash.
+ *
+ * @inode: the inode we are writing to
+ * @file_pos: offset inside the inode
+ * As we will increase extent ref immediately after a hash match,
+ * we need @file_pos and @inode in this case.
+ *
+ * Return > 0 for a hash match, and the extent ref will be
+ * *INCREASED*, and hash->bytenr/num_bytes will record the existing
+ * extent data.
+ * Return 0 for a hash miss. Nothing is done
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ */
+int btrfs_dedupe_search(struct btrfs_fs_info *fs_info,
+   struct inode *inode, u64 file_pos,
+   struct btrfs_dedupe_hash *hash);
+
+/*
+ * Add a dedupe hash into dedupe info
+ * Return 0 for success
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ */
+int btrfs_dedupe_add(struct btrfs_fs_info *fs_info,
+struct btrfs_dedupe_hash *hash);
+
+/*
+ * Remove a dedupe hash from dedupe info
+ * Return 0 for success
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ *
+ * NOTE: if hash deletion error is not handled well, it will lead
+ * to corrupted fs, as later dedupe writes can point to non-exist 

[PATCH v15 06/13] btrfs: dedupe: Introduce function to search for an existing hash

2018-09-04 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce the static function inmem_search() to handle the job for the
in-memory hash tree.

The trick is, we must ensure the delayed ref head is not being run at
the time we search for the hash.

With inmem_search(), we can implement the btrfs_dedupe_search()
interface.
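
The core of that trick is a retry loop: after a hash match, look up the
delayed ref head for the candidate bytenr, and if it is currently being
processed, back off and redo the search. A simplified sketch (the helper
below is illustrative only; the real control flow lives inside
inmem_search() itself):

	/*
	 * Sketch: check whether the delayed ref head of @bytenr is idle.
	 * If it cannot be locked, it is being run right now and the
	 * caller must retry the hash search from scratch.
	 */
	static bool sketch_ref_head_idle(struct btrfs_trans_handle *trans,
					 u64 bytenr)
	{
		struct btrfs_delayed_ref_root *delayed_refs =
			&trans->transaction->delayed_refs;
		struct btrfs_delayed_ref_head *head;
		bool idle = true;

		spin_lock(&delayed_refs->lock);
		head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
		if (head && !mutex_trylock(&head->mutex))
			idle = false;
		else if (head)
			mutex_unlock(&head->mutex);
		spin_unlock(&delayed_refs->lock);
		return idle;
	}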

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 210 +-
 1 file changed, 209 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 951fefd19fde..9c6152b7f0eb 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -7,6 +7,8 @@
 #include "dedupe.h"
 #include "btrfs_inode.h"
 #include "delayed-ref.h"
+#include "qgroup.h"
+#include "transaction.h"
 
 struct inmem_hash {
struct rb_node hash_node;
@@ -242,7 +244,6 @@ static int inmem_add(struct btrfs_dedupe_info *dedupe_info,
struct inmem_hash *ihash;
 
ihash = inmem_alloc_hash(algo);
-
if (!ihash)
return -ENOMEM;
 
@@ -436,3 +437,210 @@ int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
kfree(dedupe_info);
return 0;
 }
+
+/*
+ * Caller must ensure the corresponding ref head is not being run.
+ */
+static struct inmem_hash *
+inmem_search_hash(struct btrfs_dedupe_info *dedupe_info, u8 *hash)
+{
+   struct rb_node **p = &dedupe_info->hash_root.rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+   u16 hash_algo = dedupe_info->hash_algo;
+   int hash_len = btrfs_hash_sizes[hash_algo];
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, hash_node);
+
+   if (memcmp(hash, entry->hash, hash_len) < 0) {
+   p = &(*p)->rb_left;
+   } else if (memcmp(hash, entry->hash, hash_len) > 0) {
+   p = &(*p)->rb_right;
+   } else {
+   /* Found, need to re-add it to LRU list head */
+   list_del(&entry->lru_list);
+   list_add(&entry->lru_list, &dedupe_info->lru_list);
+   return entry;
+   }
+   }
+   return NULL;
+}
+
+static int inmem_search(struct btrfs_dedupe_info *dedupe_info,
+   struct inode *inode, u64 file_pos,
+   struct btrfs_dedupe_hash *hash)
+{
+   int ret;
+   struct btrfs_root *root = BTRFS_I(inode)->root;
+   struct btrfs_trans_handle *trans;
+   struct btrfs_delayed_ref_root *delayed_refs;
+   struct btrfs_delayed_ref_head *head;
+   struct btrfs_delayed_ref_head *insert_head;
+   struct btrfs_delayed_data_ref *insert_dref;
+   struct btrfs_qgroup_extent_record *insert_qrecord = NULL;
+   struct inmem_hash *found_hash;
+   int free_insert = 1;
+   int qrecord_inserted = 0;
+   u64 ref_root = root->root_key.objectid;
+   u64 bytenr;
+   u32 num_bytes;
+
+   insert_head = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
+   if (!insert_head)
+   return -ENOMEM;
+   insert_head->extent_op = NULL;
+
+   insert_dref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
+   if (!insert_dref) {
+   kmem_cache_free(btrfs_delayed_ref_head_cachep, insert_head);
+   return -ENOMEM;
+   }
+   if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) &&
+   is_fstree(ref_root)) {
+   insert_qrecord = kmalloc(sizeof(*insert_qrecord), GFP_NOFS);
+   if (!insert_qrecord) {
+   kmem_cache_free(btrfs_delayed_ref_head_cachep,
+   insert_head);
+   kmem_cache_free(btrfs_delayed_data_ref_cachep,
+   insert_dref);
+   return -ENOMEM;
+   }
+   }
+
+   trans = btrfs_join_transaction(root);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   goto free_mem;
+   }
+
+again:
+   mutex_lock(&dedupe_info->lock);
+   found_hash = inmem_search_hash(dedupe_info, hash->hash);
+   /* If we don't find a duplicated extent, just return. */
+   if (!found_hash) {
+   ret = 0;
+   goto out;
+   }
+   bytenr = found_hash->bytenr;
+   num_bytes = found_hash->num_bytes;
+
+   btrfs_init_delayed_ref_head(insert_head, insert_qrecord, bytenr,
+   num_bytes, ref_root, 0, BTRFS_ADD_DELAYED_REF, true,
+   false);
+
+   btrfs_init_delayed_ref_common(trans->fs_info, &insert_dref->node,
+   bytenr, num_bytes, ref_root, BTRFS_ADD_DELAYED_REF,
+   BTRFS_EXTENT_DATA_REF_KEY);
+   insert_dref->r

[PATCH v3] btrfs: fix qgroup_free wrong num_bytes in btrfs_subvolume_reserve_metadata()

2018-08-08 Thread Lu Fengqi
After btrfs_qgroup_reserve_meta_prealloc(), num_bytes will be assigned
again by btrfs_calc_trans_metadata_size(). Once block_rsv fails, we can't
properly free the num_bytes of the previous qgroup_reserve. Use a separate
variable to store the num_bytes of the qgroup_reserve.

Delete the comment for the qgroup_reserved parameter that no longer
exists and add a comment about use_global_rsv.

Fixes: c4c129db5da8 ("btrfs: drop unused parameter qgroup_reserved")
Signed-off-by: Lu Fengqi 
---
Changelog:
v2->v3: update the subject and commit message to reflect this is a fixes
v1->v2: break the line that exceed 80 char

 fs/btrfs/extent-tree.c | 17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index de6f75f5547b..2d9074295d7f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5800,7 +5800,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
  * root: the root of the parent directory
  * rsv: block reservation
  * items: the number of items that we need do reservation
- * qgroup_reserved: used to return the reserved size in qgroup
+ * use_global_rsv: allow fallback to the global block reservation
  *
  * This function is used to reserve the space for snapshot/subvolume
  * creation and deletion. Those operations are different with the
@@ -5810,10 +5810,10 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
  * the space reservation mechanism in start_transaction().
  */
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
-struct btrfs_block_rsv *rsv,
-int items,
+struct btrfs_block_rsv *rsv, int items,
 bool use_global_rsv)
 {
+   u64 qgroup_num_bytes = 0;
u64 num_bytes;
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -5821,12 +5821,11 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
/* One for parent inode, two for dir entries */
-   num_bytes = 3 * fs_info->nodesize;
-   ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+   qgroup_num_bytes = 3 * fs_info->nodesize;
+   ret = btrfs_qgroup_reserve_meta_prealloc(root,
+   qgroup_num_bytes, true);
if (ret)
return ret;
-   } else {
-   num_bytes = 0;
}
 
num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
@@ -5838,8 +5837,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (ret == -ENOSPC && use_global_rsv)
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
 
-   if (ret && num_bytes)
-   btrfs_qgroup_free_meta_prealloc(root, num_bytes);
+   if (ret && qgroup_num_bytes)
+   btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
 
return ret;
 }
-- 
2.18.0





Re: [PATCH 2/5] btrfs: use a separate variable to store the num_bytes of the qgroup_reserve

2018-08-08 Thread Lu Fengqi
David Sterba  于2018年8月8日周三 下午9:57写道:
>
> On Wed, Aug 08, 2018 at 11:04:37AM +0800, Lu Fengqi wrote:
> > On Tue, Aug 07, 2018 at 06:19:12PM +0200, David Sterba wrote:
> > >On Sat, Aug 04, 2018 at 09:10:54PM +0800, Lu Fengqi wrote:
> > >> After btrfs_qgroup_reserve_meta_prealloc(), num_bytes will be assigned
> > >> again by btrfs_calc_trans_metadata_size(). Therefore, once block_rsv
> > >> fails, we cannot properly free the num_bytes of the previous
> > >> qgroup_reserve.
> > >
> > >This does not look like a trivial cleanup at all. There was an unused
> > >parameter, removed in c4c129db5da8f070147f175 ("btrfs: drop unused
> > >parameter qgroup_reserved"), that introduced the bug.  This was in this
> > >rc1 so it's a regression and I'll consider pushing it to the 4.18 final.
> >
> > I apologize for the inconvenience. I should add the Fixes tag, and really
> > shouldn't mix it into the trivial cleanup patch set.
>
> As the bug does not qualify as urgent regression, I'm not going to
> forward it to 4.18. Please update the subject and changelog so it
> reflects that's an actual fix. I'll add it to the 4.19 queue then.
> Thanks.

No problem. I will send it tomorrow.

-
Thanks,
Lu



Re: [RFC PATCH 0/4] undelete subvolume online version

2018-08-08 Thread Lu Fengqi
On Wed, Aug 08, 2018 at 02:11:24PM +0800, Qu Wenruo wrote:
>
>
>On 2018年08月08日 00:39, David Sterba wrote:
>> On Sun, Aug 05, 2018 at 06:39:57PM +0800, Lu Fengqi wrote:
>>> This patchset will add the BTRFS_IOC_SUBVOL_UNDELETE ioctl for online
>>> btrfs subvolume undelete.
>>>
>>> And btrfs subvolume undelete subcommand was added to btrfs-progs.
>>>
>>> So user can use the following command to recover all the subolume that
>>> is left on the device. The recovered subvolume will be link to  dir
>>> named to .
>> 
>> Hm, I don't agree with the proposed interface - to recover all deleted
>> subvolumes. IMO this should recover just one subvolume of a given id a
>> to given directory.
>> 
>> The ioctl structure has to be reworked, I've skimmed the code and saw
>> some suspicious things but will have a look after the interface is
>> settled.
>
>My concern is, is such purpose really needed?
>
>Yes, it's possible user made some mistake and want to get back the data.
>But putting an ioctl for 'undelete', then user may consider btrfs is so
>powerful that can undelete everything.
>In short, this undelete feature gives user too high expectation.
>
>And don't expect user really to read man pages. There are already tons

There is no way around users' overly high expectations. If we provide a
feature with a sufficiently detailed man page, but users do not read the
man page when using this feature, I can only conclude that they are not
taking responsibility for their own data. So this seems to be a problem
they need to consider.

>of reports where user execute btrfs check --repair without realizing
>--repair is pretty dangerous (and thanks to the work done to repair, it
>normally doesn't cause catastrophic result, but sometimes it indeed
>causes extra damage)

The good news is that online undelete is not as dangerous as btrfs check
--repair. In fact, I think it is safe enough.

>
>And when user tried and failed due to deleted tree blocks, they will get
>even more frustrated or even betrayed.

As mentioned previously, maybe we should do what we think is right, such
as giving users more ability to protect/recover their data, rather than
trying to take care of every careless user?

>
>I prefer to put such undelete as an off-line rescue tool, instead of
>making it online with an ioctl interface.

I also think that offline undelete is more useful. After all, unmounting
immediately to prevent further data loss is always the most effective
response to a mistake. However, online undelete gives that ability to
users who can't easily unmount the filesystem, and it has no side
effects on existing features. IMHO, there is no reason to reject this.

-- 
Thanks,
Lu




Re: [PATCH 2/5] btrfs: use a separate variable to store the num_bytes of the qgroup_reserve

2018-08-07 Thread Lu Fengqi
On Tue, Aug 07, 2018 at 06:19:12PM +0200, David Sterba wrote:
>On Sat, Aug 04, 2018 at 09:10:54PM +0800, Lu Fengqi wrote:
>> After btrfs_qgroup_reserve_meta_prealloc(), num_bytes will be assigned
>> again by btrfs_calc_trans_metadata_size(). Therefore, once block_rsv
>> fails, we cannot properly free the num_bytes of the previous
>> qgroup_reserve.
>
>This does not look like a trivial cleanup at all. There was an unused
>parameter, removed in c4c129db5da8f070147f175 ("btrfs: drop unused
>parameter qgroup_reserved"), that introduced the bug.  This was in this
>rc1 so it's a regression and I'll consider pushing it to the 4.18 final.
>
>

I apologize for the inconvenience. I should add the Fixes tag, and really
shouldn't mix it into the trivial cleanup patch set.

-- 
Thanks,
Lu




Re: [RFC PATCH 0/4] undelete subvolume online version

2018-08-07 Thread Lu Fengqi
On Tue, Aug 07, 2018 at 06:39:50PM +0200, David Sterba wrote:
>On Sun, Aug 05, 2018 at 06:39:57PM +0800, Lu Fengqi wrote:
>> This patchset will add the BTRFS_IOC_SUBVOL_UNDELETE ioctl for online
>> btrfs subvolume undelete.
>> 
>> And btrfs subvolume undelete subcommand was added to btrfs-progs.
>> 
>> So user can use the following command to recover all the subolume that
>> is left on the device. The recovered subvolume will be link to  dir
>> named to .
>
>Hm, I don't agree with the proposed interface - to recover all deleted
>subvolumes. IMO this should recover just one subvolume of a given id a
>to given directory.

Thank you for taking the time to respond. I may have overthought the
interface before. In my imagination, the cleaner kthread is like a
monster that may devour user data at any time, so the user must perform
an online undelete as soon as possible and has no time to determine
which subvol_id should be passed. However, I have to admit that I don't
know much about users' actual usage scenarios, so I will accept the
interface you proposed. Of course, I really like it, because it greatly
simplifies the ioctl structure.

>
>The ioctl structure has to be reworked, I've skimmed the code and saw
>some suspicious things but will have a look after the interface is
>settled.

When I rework the ioctl structure, I will carefully recheck the
suspicious places in the code.

-- 
Thanks,
Lu




Re: [PATCH v3] btrfs: qgroup: Remove qgroup items along with subvolume deletion

2018-08-07 Thread Lu Fengqi
On Mon, Aug 06, 2018 at 01:53:28PM +0900, Misono Tomohiro wrote:
>When qgroup is on, subvolume deletion does not remove qgroup items
>of the subvolume (qgroup info, limit, relation) from quota tree and
>they need to get removed manually by "btrfs qgroup destroy".
>
>Since level 0 qgroup cannot be used/inherited by any other subvolume,
>let's remove them automatically when subvolume is deleted
>(to be precise, when the subvolume root is dropped).
>
>Reviewed-by: Lu Fengqi 
>Reviewed-by: Qu Wenruo 
>Signed-off-by: Misono Tomohiro 
>---
>v2 -> v3:
>  Use root->root_key.objectid instead of root->objectid
>  Add Reviewed-by tag
>
>v1 -> v2:
>  Move call of btrfs_remove_qgroup() from btrfs_delete_subvolume()
>  to btrfs_snapshot_destroy() so that it will be called after the
>  subvolume root is really dropped
>
> fs/btrfs/extent-tree.c | 16 
> 1 file changed, 12 insertions(+), 4 deletions(-)
>
>diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>index 9e7b237b9547..48edf839ed2c 100644
>--- a/fs/btrfs/extent-tree.c
>+++ b/fs/btrfs/extent-tree.c
>@@ -8871,12 +8871,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
>   struct btrfs_root_item *root_item = &root->root_item;
>   struct walk_control *wc;
>   struct btrfs_key key;
>+  u64 objectid = root->root_key.objectid;
>   int err = 0;
>   int ret;
>   int level;
>   bool root_dropped = false;
> 
>-  btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
>+  btrfs_debug(fs_info, "Drop subvolume %llu", objectid);
> 
>   path = btrfs_alloc_path();
>   if (!path) {
>@@ -9030,7 +9031,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
>   goto out_end_trans;
>   }
> 
>-  if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
>+  if (objectid != BTRFS_TREE_RELOC_OBJECTID) {
>-  ret = btrfs_find_root(tree_root, &root->root_key, path,
> NULL, NULL);
>   if (ret < 0) {
>@@ -9043,8 +9044,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
>*
>* The most common failure here is just -ENOENT.
>*/
>-  btrfs_del_orphan_item(trans, tree_root,
>-root->root_key.objectid);
>+  btrfs_del_orphan_item(trans, tree_root, objectid);
>   }
>   }
> 
>@@ -9056,6 +9056,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
>   btrfs_put_fs_root(root);
>   }
>   root_dropped = true;
>+
>+   /* Remove level-0 qgroup items since no other subvolume can use them */
>+  ret = btrfs_remove_qgroup(trans, objectid);
>+  if (ret && ret != -EINVAL && ret != -ENOENT) {

I'm sorry for missing the snapshot case. If it is a snapshot, then when
we remove the relevant qgroup, we will not be able to perform
quick_update_accounting(), and it will return 1. So we shouldn't abort
the transaction when the return value is 1.

btrfs_remove_qgroup
-> __del_qgroup_relation
   -> quick_update_accounting << if qgroup->excl != qgroup->rfer; return 1
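
Something like the following would avoid aborting on the harmless
ret == 1 case (a sketch against the hunk above; only the condition
changes):

	ret = btrfs_remove_qgroup(trans, objectid);
	/* ret == 1 only means the accounting was not updated inline */
	if (ret < 0 && ret != -EINVAL && ret != -ENOENT) {
		btrfs_abort_transaction(trans, ret);
		err = ret;
	}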

-- 
Thanks,
Lu

>+  btrfs_abort_transaction(trans, ret);
>+  err = ret;
>+  }
>+
> out_end_trans:
>   btrfs_end_transaction_throttle(trans);
> out_free:
>-- 
>2.14.4
>
>
>




Re: [PATCH v2 00/12] mkfs: Quota support through -R|--runtime quota

2018-08-07 Thread Lu Fengqi
On Mon, Jul 30, 2018 at 01:03:00PM +0800, Qu Wenruo wrote:
>Ping the 3rd time?
>
>Or should I just rebase the patchset?

Hi Qu

Could you rebase this patchset? I want to run the existing test cases
with quota enabled, and mkfs_qgroup would ease my workload.

-- 
Thanks,
Lu

>
>Thanks,
>Qu
>
>On 2018年03月08日 09:17, Qu Wenruo wrote:
>> Ping again.
>> 
>> Since David is planning to merge qgroup patchset, this feature would
>> greatly improve test coverage.
>> 
>> Thanks,
>> Qu
>> 
>> On 2018年01月11日 14:04, Qu Wenruo wrote:
>>> Ping?
>>>
>>> Or do I need to rebase the patchset?
>>>
>>> Thanks,
>>> Qu
>>>
>>> On 2017年11月07日 16:42, Qu Wenruo wrote:
 Can be fetched from github:
 https://github.com/adam900710/btrfs-progs/tree/mkfs_qgroup

 This patchset adds quota support, which means the result fs will have
 quota enabled by default, and its accounting is already consistent, no
 manually rescan or quota enable is needed.

 The overall design of such support is:
 1) Create needed tree
Both btrfs_root and real root item and tree root leaf.
For this, a new infrastructure, btrfs_create_tree(), is added for
this.

 2) Fill quota root with basic skeleton
Only 3 items are really needed
a) global quota status item
b) quota info for specified qgroup
c) quota limit for specified qgroup

Currently only 0/5 qgroup is passed.
If we're going to support extra subvolume at mkfs time, just pass the
subvolume id into insert_qgroup_items().

The content doesn't matter at all.

 3) Repair qgroups using infrastructure from qgroup-verify
In fact, qgroup repair is just offline rescan.
Although the original qgroup-verify infrastructure is mostly noisy,
modify it a little to make it silent to function as offline quota
rescan.

 And such support is mainly designed for developers and QA guys.

 As to enable quota, before we must normally mount the fs, enable quota
 (and rescan if needed).
 This ioctl based procedure is not common, and fstests doesn't provide
 such support.

 There are several attempts to make fstests to support it, but due to
 different reasons, all these attempts failed.

 To make it easier to test all existing test cases with btrfs quota
 enabled, the current best method is to support quota at mkfs time, and
 here comes the patchset.


 BTW with -R|--runtime-features, we have several possible target to add.
 Not limited to such ioctl based operation, but also mount option based
 ones.
 Like space-cache-tree (space_cache=v2).


 Qu Wenruo (12):
   btrfs-progs: qgroup-verify: Also repair qgroup status version
   btrfs-progs: qgroup-verify: Use fs_info->readonly to check if we
 should repair qgroups
   btrfs-progs: qgroup-verify: Move qgroup classification out of
 report_qgroups
   btrfs-progs: qgroup-verify: Allow repair_qgroups function to do silent
 repair
   btrfs-progs: ctree: Introduce function to create an empty tree
   btrfs-progs: mkfs: Introduce function to insert qgroup info and limit
 items

   ^^^ Above patches are not modified at all ^^^
   vvv Modification starts below vvv

   btrfs-progs: mkfs: Introduce function to setup quota root and rescan
   btrfs-progs: fsfeatures: Introduce a new set of features,
 runtime_features
   btrfs-progs: mkfs: Introduce --runtime-features option
   btrfs-progs: mkfs: Introduce quota runtime feature
   btrfs-progs: test/mkfs: Add test case for -R quota option
   btrfs-progs: test/mkfs: Add test case for --rootdir and -R quota

  Documentation/mkfs.btrfs.asciidoc  |  23 +++
  cmds-check.c   |   2 +-
  convert/main.c |   4 +-
  ctree.c| 109 ++
  ctree.h|   3 +
  fsfeatures.c   | 131 ++---
  fsfeatures.h   |  10 +-
  mkfs/main.c| 194 
 ++---
  qgroup-verify.c|  51 +--
  qgroup-verify.h|   2 +-
  tests/mkfs-tests/001-basic-profiles/test.sh|  10 ++
  tests/mkfs-tests/010-rootdir-and-quota/test.sh |  51 +++
  12 files changed, 529 insertions(+), 61 deletions(-)
  create mode 100755 tests/mkfs-tests/010-rootdir-and-quota/test.sh

>>>
>> 
>




[RFC PATCH 4/4] btrfs: undelete: Add the btrfs_ioctl_undelete

2018-08-05 Thread Lu Fengqi
The function will traverse the roots on fs_info->dead_roots and try to
call btrfs_undelete_subvolume() to recover them.

Note: It will lock fs_info->cleaner_mutex to keep the cleaner kthread
from deleting the subvolumes which we want to recover.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ioctl.c   | 83 ++
 include/uapi/linux/btrfs.h |  9 +
 2 files changed, 92 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7a11c4f8e450..83b9839799d0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1980,6 +1980,87 @@ static int btrfs_undelete_subvolume(const struct path *parent,
return ret;
 }
 
+static int btrfs_ioctl_undelete(struct file *file, void __user *argp)
+{
+   struct btrfs_ioctl_undelete_args __user *uarg;
+   struct btrfs_ioctl_undelete_args *args;
+   struct inode *inode = file_inode(file);
+   struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+   struct btrfs_root *root, *tmp;
+   char *name;
+   u64 count = 0;
+   u64 objectid;
+   int err = 0, ret;
+
+   /* copy in the fixed part of the arguments */
+   uarg = (struct btrfs_ioctl_undelete_args __user *)argp;
+   args = memdup_user(uarg, sizeof(*args));
+   if (IS_ERR(args))
+   return PTR_ERR(args);
+   args->name[BTRFS_PATH_NAME_MAX] = '\0';
+
+   name = kzalloc(BTRFS_PATH_NAME_MAX + 1, GFP_KERNEL);
+   if (!name) {
+   err = -ENOMEM;
+   goto free_args;
+   }
+
+   if (!capable(CAP_SYS_ADMIN)) {
+   err = -EPERM;
+   goto free;
+   }
+
+   err = mnt_want_write_file(file);
+   if (err)
+   goto free;
+
+   /* Lock cleaner_mutex to prevent the cleaner kthread from deleting
+    * the subvolumes we want to recover, so that we can perform each
+    * rescue in a relaxed manner.
+    */
+   mutex_lock(&fs_info->cleaner_mutex);
+
+   list_for_each_entry_safe(root, tmp, &fs_info->dead_roots, root_list) {
+   objectid = root->root_key.objectid;
+   snprintf(name, BTRFS_PATH_NAME_MAX, "%s%llu", args->name,
+   objectid);
+   ret = btrfs_undelete_subvolume(>f_path, root, name,
+  strlen(name));
+   if (ret)
+   continue;
+
+   /*
+* It is safe to remove this root from the dead_roots list
+* since we have recovered it successfully.
+*/
+   spin_lock(&fs_info->trans_lock);
+   list_del_init(&root->root_list);
+   spin_unlock(&fs_info->trans_lock);
+
+   if ((count + 1) * sizeof(objectid) > args->buf_size)
+   continue;
+
+   /* copy the subvolume id to user space */
+   ret = copy_to_user(&uarg->buf[count], &objectid,
+  sizeof(objectid));
+   if (ret)
+   err = -EFAULT;
+   count++;
+   }
+
+   mutex_unlock(&fs_info->cleaner_mutex);
+   mnt_drop_write_file(file);
+
+   /* copy the count to user space */
+   if (copy_to_user(&uarg->count, &count, sizeof(count)))
+   err = -EFAULT;
+free:
+   kfree(name);
+free_args:
+   kfree(args);
+   return err;
+}
+
 static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
void __user *arg)
 {
@@ -6089,6 +6170,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_subvol_rootref(file, argp);
case BTRFS_IOC_INO_LOOKUP_USER:
return btrfs_ioctl_ino_lookup_user(file, argp);
+   case BTRFS_IOC_SUBVOL_UNDELETE:
+   return btrfs_ioctl_undelete(file, argp);
}
 
return -ENOTTY;
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 5ca1d21fc4a7..25d030687b27 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -816,6 +816,13 @@ struct btrfs_ioctl_get_subvol_rootref_args {
__u8 align[7];
 };
 
+struct btrfs_ioctl_undelete_args {
+   char name[BTRFS_PATH_NAME_MAX + 1]; /* in - subvolume name prefix */
+   __u64 buf_size; /* in - size of buffer */
+   __u64 count;/* out - store number of recoverd subvolumes */
+   __u64 buf[0];   /* out - store ids of recoverd subolumes */
+};
+
 /* Error codes as returned by the kernel */
 enum btrfs_err_code {
BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1,
@@ -940,5 +947,7 @@ enum btrfs_err_code {
struct btrfs_ioctl_get_subvol_rootref_args)
 #define BTRFS_IOC_INO_LOOKUP_USER _IOWR(BTRFS_IOCTL_MAGIC, 62, \
struct btrfs_ioctl_ino_lookup_user_args)
+

[RFC PATCH 0/4] undelete subvolume online version

2018-08-05 Thread Lu Fengqi
This patchset will add the BTRFS_IOC_SUBVOL_UNDELETE ioctl for online
btrfs subvolume undelete.

And btrfs subvolume undelete subcommand was added to btrfs-progs.

So the user can use the following command to recover all the subvolumes
that are left on the device. Each recovered subvolume will be linked
into <dest> as a directory named <prefix><subvol_id>.

# btrfs subvolume undelete [-p ] 
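
For instance, assuming the final CLI keeps this shape with a name
prefix and a target mount point, an invocation might look like:

# btrfs subvolume undelete -p recovered_ /mnt

where the recovered_ prefix is invented for illustration and each dead
subvolume on the filesystem at /mnt would come back as recovered_<id>.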

btrfs online undelete version:
https://github.com/littleroad/linux.git undelete

btrfs-progs online undelete version:
https://github.com/littleroad/btrfs-progs.git online_undelete

Issue: #82

Lu Fengqi (4):
  btrfs: factor out btrfs_link_subvol from create_subvol
  btrfs: don't BUG_ON() in btrfs_link_subvol()
  btrfs: undelete: introduce btrfs_undelete_subvolume
  btrfs: undelete: Add the btrfs_ioctl_undelete

 fs/btrfs/ioctl.c   | 270 +
 include/uapi/linux/btrfs.h |   9 ++
 2 files changed, 255 insertions(+), 24 deletions(-)

-- 
2.18.0





[RFC PATCH 1/4] btrfs: factor out btrfs_link_subvol from create_subvol

2018-08-05 Thread Lu Fengqi
The function btrfs_link_subvol() is responsible for linking a subvolume
into the specified directory, which is the opposite of what
btrfs_unlink_subvol() does.

No functional change.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ioctl.c | 65 ++--
 1 file changed, 41 insertions(+), 24 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d3a5d2a41e5f..d37c26f69112 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -542,6 +542,45 @@ int btrfs_is_empty_uuid(u8 *uuid)
return 1;
 }
 
+static int btrfs_link_subvol(struct btrfs_trans_handle *trans,
+struct inode *dir, u64 objectid, const char *name,
+int namelen)
+{
+   struct btrfs_root *root = BTRFS_I(dir)->root;
+   struct btrfs_key key;
+   u64 index = 0;
+   int ret;
+
+   /*
+* insert the directory item
+*/
+   ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   return ret;
+   }
+
+   key.objectid = objectid;
+   key.type = BTRFS_ROOT_ITEM_KEY;
+   key.offset = -1;
+   ret = btrfs_insert_dir_item(trans, root, name, namelen, BTRFS_I(dir),
+   &key, BTRFS_FT_DIR, index);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   return ret;
+   }
+
+   btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
+   ret = btrfs_update_inode(trans, root, dir);
+   BUG_ON(ret);
+
+   ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
+btrfs_ino(BTRFS_I(dir)), index, name, namelen);
+   BUG_ON(ret);
+
+   return ret;
+}
+
 static noinline int create_subvol(struct inode *dir,
  struct dentry *dentry,
  const char *name, int namelen,
@@ -563,7 +602,6 @@ static noinline int create_subvol(struct inode *dir,
int err;
u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
-   u64 index = 0;
uuid_le new_uuid;
 
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
@@ -677,30 +715,9 @@ static noinline int create_subvol(struct inode *dir,
new_root->highest_objectid = new_dirid;
mutex_unlock(_root->objectid_mutex);
 
-   /*
-* insert the directory item
-*/
-   ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
-   if (ret) {
-   btrfs_abort_transaction(trans, ret);
-   goto fail;
-   }
-
-   ret = btrfs_insert_dir_item(trans, root,
-   name, namelen, BTRFS_I(dir), &key,
-   BTRFS_FT_DIR, index);
-   if (ret) {
-   btrfs_abort_transaction(trans, ret);
+   ret = btrfs_link_subvol(trans, dir, objectid, name, namelen);
+   if (ret)
goto fail;
-   }
-
-   btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
-   ret = btrfs_update_inode(trans, root, dir);
-   BUG_ON(ret);
-
-   ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
-btrfs_ino(BTRFS_I(dir)), index, name, namelen);
-   BUG_ON(ret);
 
ret = btrfs_uuid_tree_add(trans, root_item->uuid,
  BTRFS_UUID_KEY_SUBVOL, objectid);
-- 
2.18.0





[RFC PATCH 3/4] btrfs: undelete: introduce btrfs_undelete_subvolume

2018-08-05 Thread Lu Fengqi
The function will do the following things, which are almost the
opposite of what btrfs_delete_subvolume() does (a compact contrast
sketch follows the list):

1. link the subvolume to the parent specified;
2. clear root flag and set root_refs to 1;
3. add the subvol to the uuid_tree;
4. delete the orphan_item.
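
For contrast, here is a compact sketch of the two state transitions.
The delete side is paraphrased from btrfs_delete_subvolume() for
reference and is not code from this patch; the undelete side condenses
the function added below:

	/* delete, roughly: hide the root and queue it for cleaning */
	btrfs_set_root_flags(root_item, flags | BTRFS_ROOT_SUBVOL_DEAD);
	btrfs_set_root_refs(root_item, 0);
	/* ... plus: unlink the dir entry, insert the orphan item ... */

	/* undelete, this patch: revive the root */
	btrfs_set_root_flags(root_item, flags & ~BTRFS_ROOT_SUBVOL_DEAD);
	btrfs_set_root_refs(root_item, 1);
	/* ... plus: link the dir entry, add the uuid item,
	   delete the orphan item ... */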

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ioctl.c | 116 +++
 1 file changed, 116 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e0b5a8fb15e7..7a11c4f8e450 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1864,6 +1864,122 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
return ret;
 }
 
+static int btrfs_undelete_subvolume(const struct path *parent,
+   struct btrfs_root *root,
+   const char *name, int namelen)
+{
+   struct inode *dir = d_inode(parent->dentry);
+   struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+   struct btrfs_root_item *root_item = >root_item;
+   struct btrfs_trans_handle *trans;
+   struct btrfs_block_rsv block_rsv;
+   struct dentry *dentry;
+   struct inode *inode;
+   u64 root_flags;
+   int ret;
+
+   btrfs_debug(fs_info, "Undelete subvolume %llu",
+   root->root_key.objectid);
+
+   /* only recover intact subvolumes, i.e. not yet partially dropped */
+   if (btrfs_disk_key_objectid(&root_item->drop_progress) != 0)
+   return 0;
+
+   /* root_refs of destination parent root must not be 0 */
+   if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
+   return -ENOENT;
+
+   ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
+   if (ret == -EINTR)
+   return ret;
+
+   dentry = lookup_one_len(name, parent->dentry, namelen);
+   if (IS_ERR(dentry)) {
+   ret = PTR_ERR(dentry);
+   goto out_unlock;
+   }
+
+   down_write(&fs_info->subvol_sem);
+
+   ret = btrfs_may_create(dir, dentry);
+   if (ret)
+   goto out_up_write;
+
+   ret = btrfs_check_dir_item_collision(root, dir->i_ino, name, namelen);
+   if (ret)
+   goto out_up_write;
+
+   btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+   /*
+* 1 - parent dir inode
+* 2 - dir entries
+* 2 - root ref/backref
+* 1 - UUID item
+*/
+   ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 6, false);
+   if (ret)
+   goto out_up_write;
+
+   trans = btrfs_start_transaction(BTRFS_I(dir)->root, 0);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+   goto out_up_write;
+   }
+
+   trans->block_rsv = &block_rsv;
+   trans->bytes_reserved = block_rsv.size;
+
+   ret = btrfs_link_subvol(trans, dir, root->root_key.objectid, name,
+   namelen);
+   if (ret)
+   goto fail;
+
+   /* clear BTRFS_ROOT_SUBVOL_DEAD root flag and set root_refs to 1 */
+   root_flags = btrfs_root_flags(root_item);
+   btrfs_set_root_flags(root_item,
+root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
+   btrfs_set_root_refs(root_item, 1);
+   ret = btrfs_update_root(trans, fs_info->tree_root,
+   &root->root_key, &root->root_item);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   goto fail;
+   }
+
+   ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
+ root->root_key.objectid);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   goto fail;
+   }
+
+   ret = btrfs_del_orphan_item(trans, fs_info->tree_root,
+   root->root_key.objectid);
+   if (ret && ret != -ENOENT) {
+   btrfs_abort_transaction(trans, ret);
+   goto fail;
+   }
+fail:
+   trans->block_rsv = NULL;
+   trans->bytes_reserved = 0;
+   btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+   ret = btrfs_commit_transaction(trans);
+   if (!ret) {
+   inode = btrfs_lookup_dentry(dir, dentry);
+   if (IS_ERR(inode)) {
+   ret = PTR_ERR(inode);
+   goto out_up_write;
+   }
+   d_instantiate(dentry, inode);
+   fsnotify_mkdir(dir, dentry);
+   }
+out_up_write:
+   up_write(&fs_info->subvol_sem);
+   dput(dentry);
+out_unlock:
+   inode_unlock(dir);
+   return ret;
+}
+
 static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
void __user *arg)
 {
-- 
2.18.0





[RFC PATCH 2/4] btrfs: don't BUG_ON() in btrfs_link_subvol()

2018-08-05 Thread Lu Fengqi
Both btrfs_update_inode() and btrfs_add_root_ref() may fail, for
example because of ENOMEM, but that is no reason to panic. Replace the
BUG_ON() calls with btrfs_abort_transaction().

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ioctl.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d37c26f69112..e0b5a8fb15e7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -572,11 +572,17 @@ static int btrfs_link_subvol(struct btrfs_trans_handle *trans,
 
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
ret = btrfs_update_inode(trans, root, dir);
-   BUG_ON(ret);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   return ret;
+   }
 
ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
 btrfs_ino(BTRFS_I(dir)), index, name, namelen);
-   BUG_ON(ret);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   return ret;
+   }
 
return ret;
 }
-- 
2.18.0




