Re: [PATCH] Remove unused dedupe argument btrfs_set_extent_delalloc()

2017-10-10 Thread Qu Wenruo



On 2017年10月11日 10:29, Goldwyn Rodrigues wrote:

From: Goldwyn Rodrigues 


Signed-off-by: Goldwyn Rodrigues 
---
  fs/btrfs/ctree.h |  2 +-
  fs/btrfs/file.c  |  2 +-
  fs/btrfs/inode.c |  9 -
  fs/btrfs/relocation.c|  2 +-
  fs/btrfs/tests/inode-tests.c | 12 ++--
  5 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8fc690384c58..ac7e2b02a4df 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3174,7 +3174,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, 
int delay_iput);
  int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
   int nr);
  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
- struct extent_state **cached_state, int dedupe);
+ struct extent_state **cached_state);


Just as the variable name indicated, it's preparation for in-band dedupe.
Same for @delalloc_end.

If Fujitsu is not pushing inband dedupe anymore, it would be possible to 
remove it, but AFAIK it's not the case.


Thanks,
Qu


  int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 struct btrfs_root *new_root,
 struct btrfs_root *parent_root,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a3d006d14683..46fa02e109f3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -504,7 +504,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page 
**pages,
  
  	end_of_last_block = start_pos + num_bytes - 1;

err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
-   cached, 0);
+   cached);
if (err)
return err;
  
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index d94e3f68b9b1..9a3953fc3b45 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2036,7 +2036,7 @@ static noinline int add_pending_csums(struct 
btrfs_trans_handle *trans,
  }
  
  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,

- struct extent_state **cached_state, int dedupe)
+ struct extent_state **cached_state)
  {
WARN_ON((end & (PAGE_SIZE - 1)) == 0);
return set_extent_delalloc(_I(inode)->io_tree, start, end,
@@ -2101,8 +2101,7 @@ static void btrfs_writepage_fixup_worker(struct 
btrfs_work *work)
goto out;
 }
  
-	btrfs_set_extent_delalloc(inode, page_start, page_end, _state,

- 0);
+   btrfs_set_extent_delalloc(inode, page_start, page_end, _state);
ClearPageChecked(page);
set_page_dirty(page);
  out:
@@ -4854,7 +4853,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t 
from, loff_t len,
  0, 0, _state, GFP_NOFS);
  
  	ret = btrfs_set_extent_delalloc(inode, block_start, block_end,

-   _state, 0);
+   _state);
if (ret) {
unlock_extent_cached(io_tree, block_start, block_end,
 _state, GFP_NOFS);
@@ -9253,7 +9252,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
  0, 0, _state, GFP_NOFS);
  
  	ret = btrfs_set_extent_delalloc(inode, page_start, end,

-   _state, 0);
+   _state);
if (ret) {
unlock_extent_cached(io_tree, page_start, page_end,
 _state, GFP_NOFS);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 9841faef08ea..ff19edb84d0e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3266,7 +3266,7 @@ static int relocate_file_extent_cluster(struct inode 
*inode,
nr++;
}
  
-		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL, 0);

+   btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
set_page_dirty(page);
  
  		unlock_extent(_I(inode)->io_tree,

diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 8c91d03cc82d..1a7d8b65d500 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -970,7 +970,7 @@ static int test_extent_accounting(u32 sectorsize, u32 
nodesize)
/* [BTRFS_MAX_EXTENT_SIZE] */
BTRFS_I(inode)->outstanding_extents++;
ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1,
-   NULL, 0);
+   NULL);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
goto out;
@@ -986,7 +986,7 @@ static int test_extent_accounting(u32 sectorsize, u32 
nodesize)

[PATCH v3 1/4] btrfs: add_missing_dev() should return the actual error

2017-10-10 Thread Anand Jain
add_missing_dev() can return device pointer so that IS_ERR/
PTR_ERR can be used to check for the actual error occurred
in the function.

Signed-off-by: Anand Jain 
Reviewed-by: Liu Bo 
---
v2: add btrfs_err in read_one_dev too
v3: fix wrong commit id used for git send and
add missing change log

 fs/btrfs/volumes.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0e8f16c305df..1fb98c2ab9c0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6249,7 +6249,7 @@ static struct btrfs_device *add_missing_dev(struct 
btrfs_fs_devices *fs_devices,
 
device = btrfs_alloc_device(NULL, , dev_uuid);
if (IS_ERR(device))
-   return NULL;
+   return device;
 
list_add(>dev_list, _devices->devices);
device->fs_devices = fs_devices;
@@ -6454,9 +6454,12 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, 
struct btrfs_key *key,
map->stripes[i].dev =
add_missing_dev(fs_info->fs_devices, devid,
uuid);
-   if (!map->stripes[i].dev) {
+   if (IS_ERR(map->stripes[i].dev)) {
free_extent_map(em);
-   return -EIO;
+   btrfs_err(fs_info,
+   "failed to init missing dev %llu %ld",
+   devid, PTR_ERR(map->stripes[i].dev));
+   return PTR_ERR(map->stripes[i].dev);
}
btrfs_report_missing_device(fs_info, devid, uuid);
}
@@ -6582,8 +6585,12 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
}
 
device = add_missing_dev(fs_devices, devid, dev_uuid);
-   if (!device)
-   return -ENOMEM;
+   if (IS_ERR(device)) {
+   btrfs_err(fs_info,
+   "failed to add missing dev %llu %ld",
+   devid, PTR_ERR(device));
+   return PTR_ERR(device);
+   }
btrfs_report_missing_device(fs_info, devid, dev_uuid);
} else {
if (!device->bdev) {
-- 
2.13.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Remove unused dedupe argument btrfs_set_extent_delalloc()

2017-10-10 Thread Tsutomu Itoh

On 2017/10/11 11:29, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues 
> 

'int dedupe' was added to prepare for support of subpage sector size and 
in-band dedupe.

  commit ba8b04c1d4ad ("btrfs: extend btrfs_set_extent_delalloc and its friends 
to support in-band dedupe and subpage size patchset")

So please do not delete it.

Thanks,
Tsutomu

> 
> Signed-off-by: Goldwyn Rodrigues 
> ---
>  fs/btrfs/ctree.h |  2 +-
>  fs/btrfs/file.c  |  2 +-
>  fs/btrfs/inode.c |  9 -
>  fs/btrfs/relocation.c|  2 +-
>  fs/btrfs/tests/inode-tests.c | 12 ++--
>  5 files changed, 13 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 8fc690384c58..ac7e2b02a4df 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -3174,7 +3174,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root 
> *root, int delay_iput);
>  int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
>  int nr);
>  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
> -   struct extent_state **cached_state, int dedupe);
> +   struct extent_state **cached_state);
>  int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
>struct btrfs_root *new_root,
>struct btrfs_root *parent_root,
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index a3d006d14683..46fa02e109f3 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -504,7 +504,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page 
> **pages,
>  
>   end_of_last_block = start_pos + num_bytes - 1;
>   err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
> - cached, 0);
> + cached);
>   if (err)
>   return err;
>  
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index d94e3f68b9b1..9a3953fc3b45 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -2036,7 +2036,7 @@ static noinline int add_pending_csums(struct 
> btrfs_trans_handle *trans,
>  }
>  
>  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
> -   struct extent_state **cached_state, int dedupe)
> +   struct extent_state **cached_state)
>  {
>   WARN_ON((end & (PAGE_SIZE - 1)) == 0);
>   return set_extent_delalloc(_I(inode)->io_tree, start, end,
> @@ -2101,8 +2101,7 @@ static void btrfs_writepage_fixup_worker(struct 
> btrfs_work *work)
>   goto out;
>}
>  
> - btrfs_set_extent_delalloc(inode, page_start, page_end, _state,
> -   0);
> + btrfs_set_extent_delalloc(inode, page_start, page_end, _state);
>   ClearPageChecked(page);
>   set_page_dirty(page);
>  out:
> @@ -4854,7 +4853,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t 
> from, loff_t len,
> 0, 0, _state, GFP_NOFS);
>  
>   ret = btrfs_set_extent_delalloc(inode, block_start, block_end,
> - _state, 0);
> + _state);
>   if (ret) {
>   unlock_extent_cached(io_tree, block_start, block_end,
>_state, GFP_NOFS);
> @@ -9253,7 +9252,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
> 0, 0, _state, GFP_NOFS);
>  
>   ret = btrfs_set_extent_delalloc(inode, page_start, end,
> - _state, 0);
> + _state);
>   if (ret) {
>   unlock_extent_cached(io_tree, page_start, page_end,
>_state, GFP_NOFS);
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index 9841faef08ea..ff19edb84d0e 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -3266,7 +3266,7 @@ static int relocate_file_extent_cluster(struct inode 
> *inode,
>   nr++;
>   }
>  
> - btrfs_set_extent_delalloc(inode, page_start, page_end, NULL, 0);
> + btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
>   set_page_dirty(page);
>  
>   unlock_extent(_I(inode)->io_tree,
> diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
> index 8c91d03cc82d..1a7d8b65d500 100644
> --- a/fs/btrfs/tests/inode-tests.c
> +++ b/fs/btrfs/tests/inode-tests.c
> @@ -970,7 +970,7 @@ static int test_extent_accounting(u32 sectorsize, u32 
> nodesize)
>   /* [BTRFS_MAX_EXTENT_SIZE] */
>   BTRFS_I(inode)->outstanding_extents++;
>   ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1,
> - NULL, 0);
> + NULL);
>   if (ret) {
>   

[PATCH v2.1 1/4] btrfs: add_missing_dev() should return the actual error

2017-10-10 Thread Anand Jain
add_missing_dev() can return device pointer so that IS_ERR/
PTR_ERR can be used to check for the actual error occurred
in the function.

Signed-off-by: Anand Jain 
Reviewed-by: Liu Bo 

---
 fs/btrfs/volumes.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0e8f16c305df..1fb98c2ab9c0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6249,7 +6249,7 @@ static struct btrfs_device *add_missing_dev(struct 
btrfs_fs_devices *fs_devices,
 
device = btrfs_alloc_device(NULL, , dev_uuid);
if (IS_ERR(device))
-   return NULL;
+   return device;
 
list_add(>dev_list, _devices->devices);
device->fs_devices = fs_devices;
@@ -6454,9 +6454,12 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, 
struct btrfs_key *key,
map->stripes[i].dev =
add_missing_dev(fs_info->fs_devices, devid,
uuid);
-   if (!map->stripes[i].dev) {
+   if (IS_ERR(map->stripes[i].dev)) {
free_extent_map(em);
-   return -EIO;
+   btrfs_err(fs_info,
+   "failed to init missing dev %llu %ld",
+   devid, PTR_ERR(map->stripes[i].dev));
+   return PTR_ERR(map->stripes[i].dev);
}
btrfs_report_missing_device(fs_info, devid, uuid);
}
@@ -6582,8 +6585,12 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
}
 
device = add_missing_dev(fs_devices, devid, dev_uuid);
-   if (!device)
-   return -ENOMEM;
+   if (IS_ERR(device)) {
+   btrfs_err(fs_info,
+   "failed to add missing dev %llu %ld",
+   devid, PTR_ERR(device));
+   return PTR_ERR(device);
+   }
btrfs_report_missing_device(fs_info, devid, dev_uuid);
} else {
if (!device->bdev) {
-- 
2.13.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: USB upgrade fun

2017-10-10 Thread Kai Hendry
On Tue, 10 Oct 2017, at 10:06 AM, Satoru Takeuchi wrote:
> Probably `btrfs device remove missing /mnt/raid1` works.

That command worked. Took a really long time, but it worked. However
when I unmounted /mnt/raid1 and tried mounting it again, it fails! :(

https://s.natalian.org/2017-10-11/btrfs.txt

mount: /mnt/raid1: wrong fs type, bad option, bad superblock on
/dev/sdb1, missing codepage or helper program, or other error.

"open_ctree failed" is back... sigh...

Any tips?

I'm going to check the disks one by one and I'll reboot the server a
little later.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Remove unused dedupe argument btrfs_set_extent_delalloc()

2017-10-10 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 


Signed-off-by: Goldwyn Rodrigues 
---
 fs/btrfs/ctree.h |  2 +-
 fs/btrfs/file.c  |  2 +-
 fs/btrfs/inode.c |  9 -
 fs/btrfs/relocation.c|  2 +-
 fs/btrfs/tests/inode-tests.c | 12 ++--
 5 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8fc690384c58..ac7e2b02a4df 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3174,7 +3174,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, 
int delay_iput);
 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
   int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
- struct extent_state **cached_state, int dedupe);
+ struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 struct btrfs_root *new_root,
 struct btrfs_root *parent_root,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a3d006d14683..46fa02e109f3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -504,7 +504,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page 
**pages,
 
end_of_last_block = start_pos + num_bytes - 1;
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
-   cached, 0);
+   cached);
if (err)
return err;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d94e3f68b9b1..9a3953fc3b45 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2036,7 +2036,7 @@ static noinline int add_pending_csums(struct 
btrfs_trans_handle *trans,
 }
 
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
- struct extent_state **cached_state, int dedupe)
+ struct extent_state **cached_state)
 {
WARN_ON((end & (PAGE_SIZE - 1)) == 0);
return set_extent_delalloc(_I(inode)->io_tree, start, end,
@@ -2101,8 +2101,7 @@ static void btrfs_writepage_fixup_worker(struct 
btrfs_work *work)
goto out;
 }
 
-   btrfs_set_extent_delalloc(inode, page_start, page_end, _state,
- 0);
+   btrfs_set_extent_delalloc(inode, page_start, page_end, _state);
ClearPageChecked(page);
set_page_dirty(page);
 out:
@@ -4854,7 +4853,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t 
from, loff_t len,
  0, 0, _state, GFP_NOFS);
 
ret = btrfs_set_extent_delalloc(inode, block_start, block_end,
-   _state, 0);
+   _state);
if (ret) {
unlock_extent_cached(io_tree, block_start, block_end,
 _state, GFP_NOFS);
@@ -9253,7 +9252,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
  0, 0, _state, GFP_NOFS);
 
ret = btrfs_set_extent_delalloc(inode, page_start, end,
-   _state, 0);
+   _state);
if (ret) {
unlock_extent_cached(io_tree, page_start, page_end,
 _state, GFP_NOFS);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 9841faef08ea..ff19edb84d0e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3266,7 +3266,7 @@ static int relocate_file_extent_cluster(struct inode 
*inode,
nr++;
}
 
-   btrfs_set_extent_delalloc(inode, page_start, page_end, NULL, 0);
+   btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
set_page_dirty(page);
 
unlock_extent(_I(inode)->io_tree,
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 8c91d03cc82d..1a7d8b65d500 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -970,7 +970,7 @@ static int test_extent_accounting(u32 sectorsize, u32 
nodesize)
/* [BTRFS_MAX_EXTENT_SIZE] */
BTRFS_I(inode)->outstanding_extents++;
ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1,
-   NULL, 0);
+   NULL);
if (ret) {
test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
goto out;
@@ -986,7 +986,7 @@ static int test_extent_accounting(u32 sectorsize, u32 
nodesize)
BTRFS_I(inode)->outstanding_extents++;
ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
-   NULL, 0);
+

[PATCH] btrfs: cleanup extent locking sequence

2017-10-10 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

Code cleanup for better understanding:
needs_unlock to be called extent_locked to show state as opposed to
action.
Changed the variable to int, to reduce code in the critical path(code usually
executed).

Signed-off-by: Goldwyn Rodrigues 
---
 fs/btrfs/file.c | 14 ++
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f6c6754cf52d..a3d006d14683 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1590,7 +1590,7 @@ static noinline ssize_t __btrfs_buffered_write(struct 
file *file,
int ret = 0;
bool only_release_metadata = false;
bool force_page_uptodate = false;
-   bool need_unlock;
+   int extents_locked;
 
nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
PAGE_SIZE / (sizeof(struct page *)));
@@ -1670,7 +1670,6 @@ static noinline ssize_t __btrfs_buffered_write(struct 
file *file,
}
 
release_bytes = reserve_bytes;
-   need_unlock = false;
 again:
/*
 * This is going to setup the pages array with the number of
@@ -1683,16 +1682,15 @@ static noinline ssize_t __btrfs_buffered_write(struct 
file *file,
if (ret)
break;
 
-   ret = lock_and_cleanup_extent_if_need(BTRFS_I(inode), pages,
+   extents_locked = lock_and_cleanup_extent_if_need(
+   BTRFS_I(inode), pages,
num_pages, pos, write_bytes, ,
, _state);
-   if (ret < 0) {
+   if (extents_locked < 0) {
if (ret == -EAGAIN)
goto again;
+   ret = extents_locked;
break;
-   } else if (ret > 0) {
-   need_unlock = true;
-   ret = 0;
}
 
copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
@@ -1754,7 +1752,7 @@ static noinline ssize_t __btrfs_buffered_write(struct 
file *file,
if (copied > 0)
ret = btrfs_dirty_pages(inode, pages, dirty_pages,
pos, copied, NULL);
-   if (need_unlock)
+   if (extents_locked)
unlock_extent_cached(_I(inode)->io_tree,
 lockstart, lockend, _state,
 GFP_NOFS);
-- 
2.14.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/3] btrfs-progs: device: add remove missing-all

2017-10-10 Thread Misono, Tomohiro
Add 'btrfs remove missing-all' to remove all the missing devices
at once for improving usability.

Example:
 sudo mkfs.btrfs -f -d raid1 /dev/sdb1 /dev/sdb2 /dev/sdb3 /dev/sdb4
 sudo wipefs -a /dev/sdb1 /dev/sdb3
 sudo mount -o degraded /dev/sdb2 /mnt
 sudo btrfs filesystem show /mnt
 sudo btrfs device remove missing-all /mnt
 sudo btrfs filesystem show /mnt

Signed-off-by: Tomohiro Misono 
---
 Documentation/btrfs-device.asciidoc |  1 +
 cmds-device.c   | 24 +++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/Documentation/btrfs-device.asciidoc 
b/Documentation/btrfs-device.asciidoc
index dd60415..f08d64d 100644
--- a/Documentation/btrfs-device.asciidoc
+++ b/Documentation/btrfs-device.asciidoc
@@ -79,6 +79,7 @@ lowest device id.
 If device is mounted as degraded mode (-o degraded), special term "missing"
 can be used for . In that case, the first device that is described by
 the filesystem metadata, but not preseted at the mount time will be removed.
+Also, "missing-all" can be used to remove all the missing devices.
 
 *delete* | [|...] ::
 Alias of remove kept for backward compatibility
diff --git a/cmds-device.c b/cmds-device.c
index d28ed0f..507ad04 100644
--- a/cmds-device.c
+++ b/cmds-device.c
@@ -164,6 +164,8 @@ static int _cmd_device_remove(int argc, char **argv,
struct  btrfs_ioctl_vol_args arg;
struct btrfs_ioctl_vol_args_v2 argv2 = {0};
int is_devid = 0;
+   int is_missing_all = 0;
+   int num_missing = 0;
int res;
 
if (string_is_numerical(argv[i])) {
@@ -173,12 +175,16 @@ static int _cmd_device_remove(int argc, char **argv,
} else if (is_block_device(argv[i]) == 1 ||
strcmp(argv[i], "missing") == 0) {
strncpy_null(argv2.name, argv[i]);
+   } else if (strcmp(argv[i], "missing-all") == 0) {
+   is_missing_all = 1;
+   strncpy_null(argv2.name, "missing");
} else {
error("not a block device: %s", argv[i]);
ret++;
continue;
}
 
+again:
/*
 * Positive values are from BTRFS_ERROR_DEV_*,
 * otherwise it's a generic error, one of errnos
@@ -202,6 +208,21 @@ static int _cmd_device_remove(int argc, char **argv,
res = ioctl(fdmnt, BTRFS_IOC_RM_DEV, );
}
 
+   if (is_missing_all) {
+   if (!res) {
+   num_missing++;
+   goto again;
+   }
+
+   if (num_missing > 0 &&
+   (res == BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET ||
+res == BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET ||
+res == BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET ||
+res == BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET ||
+res == BTRFS_ERROR_DEV_MISSING_NOT_FOUND))
+   continue;
+   }
+
if (res) {
const char *msg;
 
@@ -228,7 +249,8 @@ static int _cmd_device_remove(int argc, char **argv,
"", \
"If 'missing' is specified for , the first device that is", \
"described by the filesystem metadata, but not presented at the", \
-   "mount time will be removed."
+   "mount time will be removed. Use 'missing-all' to remove all the", \
+   "missing devices."
 
 static const char * const cmd_device_remove_usage[] = {
"btrfs device remove | [|...] ",
-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/3] btrfs-progs: doc: add description of missing and example of device remove

2017-10-10 Thread Misono, Tomohiro
This patch updates help/document of "btrfs device remove" in two points:

1. Add explanation of 'missing' for 'device remove'. This is only
written in wikipage currently.
(https://btrfs.wiki.kernel.org/index.php/Using_Btrfs_with_Multiple_Devices)

2. Add example of device removal in the man document. This is because
that explanation of "remove" says "See the example section below", but
there is no example of removal currently.

Signed-off-by: Tomohiro Misono 
Reviewed-by: Satoru Takeuchi 
---
 Documentation/btrfs-device.asciidoc | 19 ++-
 cmds-device.c   |  8 
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/Documentation/btrfs-device.asciidoc 
b/Documentation/btrfs-device.asciidoc
index 88822ec..dd60415 100644
--- a/Documentation/btrfs-device.asciidoc
+++ b/Documentation/btrfs-device.asciidoc
@@ -68,13 +68,17 @@ Remove device(s) from a filesystem identified by 
 Device removal must satisfy the profile constraints, otherwise the command
 fails. The filesystem must be converted to profile(s) that would allow the
 removal. This can typically happen when going down from 2 devices to 1 and
-using the RAID1 profile. See the example section below.
+using the RAID1 profile. See the *TYPICAL USECASES* section below.
 +
 The operation can take long as it needs to move all data from the device.
 +
 It is possible to delete the device that was used to mount the filesystem. The
 device entry in mount table will be replaced by another device name with the
 lowest device id.
++
+If device is mounted as degraded mode (-o degraded), special term "missing"
+can be used for . In that case, the first device that is described by
+the filesystem metadata, but not preseted at the mount time will be removed.
 
 *delete* | [|...] ::
 Alias of remove kept for backward compatibility
@@ -206,6 +210,19 @@ data or the block groups occupy the whole first device.
 The device size of '/dev/sdb' as seen by the filesystem remains unchanged, but
 the logical space from 50-100GiB will be unused.
 
+ REMOVE DEVICE 
+
+Device removal must satisfy the profile constraints, otherwise the command
+fails. For example:
+
+ $ btrfs device remove /dev/sda /mnt
+ ERROR: error removing device '/dev/sda': unable to go below two devices on 
raid1
+
+In order to remove a device, you need to convert the profile in this case:
+
+ $ btrfs balance start -mconvert=dup -dconvert=single /mnt
+ $ btrfs device remove /dev/sda /mnt
+
 DEVICE STATS
 
 
diff --git a/cmds-device.c b/cmds-device.c
index 3b6b985..d28ed0f 100644
--- a/cmds-device.c
+++ b/cmds-device.c
@@ -224,9 +224,16 @@ static int _cmd_device_remove(int argc, char **argv,
return !!ret;
 }
 
+#define COMMON_USAGE_REMOVE_DELETE \
+   "", \
+   "If 'missing' is specified for , the first device that is", \
+   "described by the filesystem metadata, but not presented at the", \
+   "mount time will be removed."
+
 static const char * const cmd_device_remove_usage[] = {
"btrfs device remove | [|...] ",
"Remove a device from a filesystem",
+   COMMON_USAGE_REMOVE_DELETE,
NULL
 };
 
@@ -238,6 +245,7 @@ static int cmd_device_remove(int argc, char **argv)
 static const char * const cmd_device_delete_usage[] = {
"btrfs device delete | [|...] ",
"Remove a device from a filesystem (alias of \"btrfs device remove\")",
+   COMMON_USAGE_REMOVE_DELETE,
NULL
 };
 
-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/3] btrfs-progs: device: add description of alias to help message

2017-10-10 Thread Misono, Tomohiro
State the 'delete' is the alias of 'remove' as the man page says.

Signed-off-by: Tomohiro Misono 
Reviewed-by: Satoru Takeuchi 
---
 cmds-device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmds-device.c b/cmds-device.c
index 4337eb2..3b6b985 100644
--- a/cmds-device.c
+++ b/cmds-device.c
@@ -237,7 +237,7 @@ static int cmd_device_remove(int argc, char **argv)
 
 static const char * const cmd_device_delete_usage[] = {
"btrfs device delete | [|...] ",
-   "Remove a device from a filesystem",
+   "Remove a device from a filesystem (alias of \"btrfs device remove\")",
NULL
 };
 
-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/3] btrfs-progs: device: update btrfs device remove missing

2017-10-10 Thread Misono, Tomohiro
This series updates "btrfs device remove missing".

Currently, the document lacks the description of "remove missing" which is
written only in wikipage. First and second patch updates the documents.
Third patch introduces new keyword "missing-all" to remove the all missing
devices at once for improving usability.

changelog:
v1->v2:
 split the patch and add missing-all
 
Tomohiro Misono (3):
btrfs-progs: device: add description of alias to help message
btrfs-progs: doc: add description of missing and example of device 
remove
btrfs-progs: device: add remove missing-all

 Documentation/btrfs-device.asciidoc | 20 +++-
 cmds-device.c   | 32 +++-
 2 files changed, 50 insertions(+), 2 deletions(-)

-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: doc: update help/document of btrfs device remove

2017-10-10 Thread Misono, Tomohiro
On 2017/10/11 6:22, Satoru Takeuchi wrote:
> At Tue, 3 Oct 2017 17:12:39 +0900,
> Misono, Tomohiro wrote:
>>
>> This patch updates help/document of "btrfs device remove" in two points:
>>
>> 1. Add explanation of 'missing' for 'device remove'. This is only
>> written in wikipage currently.
>> (https://btrfs.wiki.kernel.org/index.php/Using_Btrfs_with_Multiple_Devices)
>>
>> 2. Add example of device removal in the man document. This is because
>> that explanation of "remove" says "See the example section below", but
>> there is no example of removal currently.
>>
>> Signed-off-by: Tomohiro Misono 
>> ---
>>  Documentation/btrfs-device.asciidoc | 19 +++
>>  cmds-device.c   | 10 +-
>>  2 files changed, 28 insertions(+), 1 deletion(-)
>>
>> diff --git a/Documentation/btrfs-device.asciidoc 
>> b/Documentation/btrfs-device.asciidoc
>> index 88822ec..dc523a9 100644
>> --- a/Documentation/btrfs-device.asciidoc
>> +++ b/Documentation/btrfs-device.asciidoc
>> @@ -75,6 +75,10 @@ The operation can take long as it needs to move all data 
>> from the device.
>>  It is possible to delete the device that was used to mount the filesystem. 
>> The
>>  device entry in mount table will be replaced by another device name with the
>>  lowest device id.
>> ++
>> +If device is mounted as degraded mode (-o degraded), special term "missing"
>> +can be used for . In that case, the first device that is described 
>> by
>> +the filesystem metadata, but not presented at the mount time will be 
>> removed.
>>  
>>  *delete* | [|...] ::
>>  Alias of remove kept for backward compatibility
>> @@ -206,6 +210,21 @@ data or the block groups occupy the whole first device.
>>  The device size of '/dev/sdb' as seen by the filesystem remains unchanged, 
>> but
>>  the logical space from 50-100GiB will be unused.
>>  
>> + REMOVE DEVICE 
> 
> It's a part of "TYPICAL USECASES" section. So it's also necessary to modify
> the following sentence
> 
> ===
> See the example section below.
> ===
> 
> to as follow.
> 
> ===
> See the *TYPICAL USECASES* section below.
> ===
> 
> Or just removing the above mentioned sentence is also OK since there is
> "See the section *TYPICAL USECASES* for some examples." in "DEVICE MANAGEMENT"
> section.
> 
>> +
>> +Device removal must satisfy the profile constraints, otherwise the command
>> +fails. For example:
>> +
>> + $ btrfs device remove /dev/sda /mnt
>> + $ ERROR: error removing device '/dev/sda': unable to go below two devices 
>> on raid1
> 
> s/^$  ERROR/ERROR/
> 
>> +
>> +
>> +In order to remove a device, you need to convert profile in this case:
>> +
>> + $ btrfs balance start -mconvert=dup /mnt
>> + $ btrfs balance start -dconvert=single /mnt
> 
> It's simpler to convert both the RAID configuration of data and metadata
> by the following one command.
> 
> $ btrfs balance -mconvert=dup -dconvert=single /mnt
> 
>> + $ btrfs device remove /dev/sda /mnt
>> +
>>  DEVICE STATS
>>  
>>  
>> diff --git a/cmds-device.c b/cmds-device.c
>> index 4337eb2..6cb53ff 100644
>> --- a/cmds-device.c
>> +++ b/cmds-device.c
>> @@ -224,9 +224,16 @@ static int _cmd_device_remove(int argc, char **argv,
>>  return !!ret;
>>  }
>>  
>> +#define COMMON_USAGE_REMOVE_DELETE \
>> +"", \
>> +"If 'missing' is specified for , the first device that is", \
>> +"described by the filesystem metadata, but not presented at the", \
>> +"mount time will be removed."
>> +
>>  static const char * const cmd_device_remove_usage[] = {
>>  "btrfs device remove | [|...] ",
>>  "Remove a device from a filesystem",
>> +COMMON_USAGE_REMOVE_DELETE,
>>  NULL
>>  };
>>  
>> @@ -237,7 +244,8 @@ static int cmd_device_remove(int argc, char **argv)
>>  
>>  static const char * const cmd_device_delete_usage[] = {
>>  "btrfs device delete | [|...] ",
>> -"Remove a device from a filesystem",
>> +"Remove a device from a filesystem (alias of \"btrfs device remove\")",
>> +COMMON_USAGE_REMOVE_DELETE,
>>  NULL
>>  };
> 
> This snippet is not related to the description of this patch.
> Dividing this patch is better.
> 
> Thanks,
> Satoru

Thanks for the review and I will follow the advice.
I think there should be 'missing-all' too to remove all the missing devices,
and will send the patches again.

Regards,
Tomohiro

> 
>>  
>> -- 
>> 2.9.5
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  

[PATCH v2] btrfs: Use bd_dev to generate index when dev_state_hashtable add items.

2017-10-10 Thread Gu Jinxiang
From: Gu JinXiang 

Fix bug of
 ().

Description of this bug:
Use MOUNT_OPTIONS="-o check_int" when run xfstest, device can not be
mount successfully. So xfstest can not run.

Signed-off-by: Gu JinXiang 
---
 fs/btrfs/check-integrity.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 9d3854839038..86d79bc4cfb3 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -613,7 +613,7 @@ static void btrfsic_dev_state_hashtable_add(
struct btrfsic_dev_state_hashtable *h)
 {
const unsigned int hashval =
-   (((unsigned int)((uintptr_t)ds->bdev)) &
+   (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) &
 (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
 
list_add(>collision_resolving_node, h->table + hashval);
-- 
2.13.5



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs-progs: do not add stale device into fs_devices

2017-10-10 Thread Liu Bo
If one of btrfs's devices was pulled out and we've replaced it with a
new one, then they have the same uuid.

If that device gets reconnected, 'btrfs filesystem show' will show the
stale one instead of the new one, but on kernel side btrfs has a fix
to not include the stale one, this could confuse users as people may
monitor btrfs by running that cli.

This does the similar thing to what kernel side has done.

Signed-off-by: Liu Bo 
---
 volumes.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/volumes.c b/volumes.c
index 2f3943d..c7b7a41 100644
--- a/volumes.c
+++ b/volumes.c
@@ -138,7 +138,20 @@ static int device_list_add(const char *path,
list_add(>dev_list, _devices->devices);
device->fs_devices = fs_devices;
} else if (!device->name || strcmp(device->name, path)) {
-   char *name = strdup(path);
+   char *name;
+
+   /*
+* The existing device has newer generation, so this
+* one could be a stale one, don't add it.
+*/
+   if (found_transid < device->generation) {
+   warning("adding device %s gen %llu but found a existing 
device %s gen %llu\n",
+   path, found_transid, device->name,
+   device->generation, found_transid);
+   return -EEXIST;
+   }
+
+   name = strdup(path);
 if (!name)
 return -ENOMEM;
 kfree(device->name);
-- 
2.9.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 1/4] btrfs: add_missing_dev() should return the actual error

2017-10-10 Thread Liu Bo
On Mon, Oct 09, 2017 at 11:07:43AM +0800, Anand Jain wrote:
> add_missing_dev() can return device pointer so that IS_ERR/
> PTR_ERR can be used to check for the actual error occurred
> in the function.
> 
> Signed-off-by: Anand Jain 
> ---
> v2: This patch is a split from
> [PATCH 1/2] btrfs: fix read_one_chunk() return error code
> 
>  fs/btrfs/volumes.c | 13 -
>  1 file changed, 8 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 0e8f16c305df..2f500a32089e 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -6249,7 +6249,7 @@ static struct btrfs_device *add_missing_dev(struct 
> btrfs_fs_devices *fs_devices,
>  
>   device = btrfs_alloc_device(NULL, , dev_uuid);
>   if (IS_ERR(device))
> - return NULL;
> + return device;
>  
>   list_add(>dev_list, _devices->devices);
>   device->fs_devices = fs_devices;
> @@ -6454,9 +6454,12 @@ static int read_one_chunk(struct btrfs_fs_info 
> *fs_info, struct btrfs_key *key,
>   map->stripes[i].dev =
>   add_missing_dev(fs_info->fs_devices, devid,
>   uuid);
> - if (!map->stripes[i].dev) {
> + if (IS_ERR(map->stripes[i].dev)) {
>   free_extent_map(em);
> - return -EIO;
> + btrfs_err(fs_info,
> + "failed to init missing dev %llu %ld",
> + devid, PTR_ERR(map->stripes[i].dev));
> + return PTR_ERR(map->stripes[i].dev);
>   }
>   btrfs_report_missing_device(fs_info, devid, uuid);
>   }
> @@ -6582,8 +6585,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
>   }
>  
>   device = add_missing_dev(fs_devices, devid, dev_uuid);
> - if (!device)
> - return -ENOMEM;
> + if (IS_ERR(device))
> + return PTR_ERR(device);

Could you please also add a btrfs_err() like above?

With that,

Reviewed-by: Liu Bo 

-liubo

>   btrfs_report_missing_device(fs_info, devid, dev_uuid);
>   } else {
>   if (!device->bdev) {
> -- 
> 2.7.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/4] btrfs-progs: fsck: Introduce --fix-dev-size option

2017-10-10 Thread Qu Wenruo



On 2017年10月10日 21:16, David Sterba wrote:

On Tue, Oct 10, 2017 at 07:51:11AM +, Qu Wenruo wrote:

Introduce --fix-dev-size option to fix device related problems.


Please don't add it to 'check', this is not the right place for the
targeted fixes. -> 'btrfs rescue'


I'm OK moving the super total_bytes fix to 'btrfs rescue'.

But what about the alignment/mismatch detection part?
Is it still OK to detect them in 'btrfs check'?

And further more, the unaligned device total_bytes problem is not a big 
problem that fits into 'rescue' territory.


I'm not really sure about the difference between rescue and check.

Thanks,
Qu




This repairing  is also included in --repair, but considering the safety
and potential affected users, it's better to provide a option to fix and
only fix device size related problem to avoid full --repair.

Reported-by: Asif Youssuff 
Reported-by: Rich Rauenzahn 
Signed-off-by: Qu Wenruo 
---
  Documentation/btrfs-check.asciidoc | 23 +++
  cmds-check.c   | 28 +++-
  2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/Documentation/btrfs-check.asciidoc 
b/Documentation/btrfs-check.asciidoc
index fbf48847ac25..e45c7a457bac 100644
--- a/Documentation/btrfs-check.asciidoc
+++ b/Documentation/btrfs-check.asciidoc
@@ -93,6 +93,29 @@ the entire free space cache. This option with 'v2' provides 
an alternative
  method of clearing the free space cache that doesn't require mounting the
  filesystem.
  
+--fix-dev-size::

+From v4.14-rc kernels, a more restrict device size checker is introduced, while
+old kernel doesn't strictly align its device size, so it may cause noisy kernel
+warning for newer kernel, like:
++
+
+WARNING: CPU: 3 PID: 439 at fs/btrfs/ctree.h:1559 
btrfs_update_device+0x1c5/0x1d0 [btrfs]
+
++
+And for some case where super block total device size may mismatch with all
+devices, and the filesystem will be unable to be mounted, with kernel message
+like:
++
+
+BTRFS error (device sdb): super_total_bytes 92017859088384 mismatch with 
fs_devices total_rw_bytes 92017859094528
+
++
+This option will fix both problems by aligning all size of devices, and
+re-calculating superblock total bytes.
++
+Although such repairing is included in *--repair* option, considering the
+safety of *--repair*, this option is provided to suppress all other dangerous
+repairing and only fix device sizes related problems.
  
  DANGEROUS OPTIONS

  -
diff --git a/cmds-check.c b/cmds-check.c
index 007781fa5d1b..fdb6d832eee1 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -11746,6 +11746,8 @@ out:
return err;
  }
  
+static int reset_devs_size(struct btrfs_fs_info *fs_info);

+
  static int do_check_chunks_and_extents(struct btrfs_fs_info *fs_info)
  {
int ret;
@@ -11756,6 +11758,12 @@ static int do_check_chunks_and_extents(struct 
btrfs_fs_info *fs_info)
ret = check_chunks_and_extents_v2(fs_info);
else
ret = check_chunks_and_extents(fs_info);
+   /* Also repair device sizes if needed */
+   if (repair && !ret) {
+   ret = reset_devs_size(fs_info);
+   if (ret > 0)
+   ret = 0;
+   }
  
  	return ret;

  }
@@ -13088,6 +13096,8 @@ const char * const cmd_check_usage[] = {
"-b|--backup use the first valid backup root copy",
"--force skip mount checks, repair is not possible",
"--repairtry to repair the filesystem",
+   "--fix-dev-size  repair device size related problem",
+   "will not trigger other repair",
"--readonly  run in read-only mode (default)",
"--init-csum-treecreate a new CRC tree",
"--init-extent-tree  create a new extent tree",
@@ -13128,6 +13138,7 @@ int cmd_check(int argc, char **argv)
int qgroups_repaired = 0;
unsigned ctree_flags = OPEN_CTREE_EXCLUSIVE;
int force = 0;
+   bool fix_dev_size = false;
  
  	while(1) {

int c;
@@ -13135,7 +13146,7 @@ int cmd_check(int argc, char **argv)
GETOPT_VAL_INIT_EXTENT, GETOPT_VAL_CHECK_CSUM,
GETOPT_VAL_READONLY, GETOPT_VAL_CHUNK_TREE,
GETOPT_VAL_MODE, GETOPT_VAL_CLEAR_SPACE_CACHE,
-   GETOPT_VAL_FORCE };
+   GETOPT_VAL_FORCE, GETOPT_VAL_FIX_DEV_SIZE };
static const struct option long_options[] = {
{ "super", required_argument, NULL, 's' },
{ "repair", no_argument, NULL, GETOPT_VAL_REPAIR },
@@ -13158,6 +13169,8 @@ int cmd_check(int argc, char **argv)
{ "clear-space-cache", required_argument, NULL,

[PATCH] Btrfs: remove rcu_barrier in btrfs_close_devices

2017-10-10 Thread Liu Bo
It was introduced because btrfs used to do blkdev_put in a deferred
work, now that btrfs has put blkdev in place, this rcu_barrier can be
removed.

Signed-off-by: Liu Bo 
---
 fs/btrfs/volumes.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0e8f16c..d983cea 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -958,12 +958,6 @@ int btrfs_close_devices(struct btrfs_fs_devices 
*fs_devices)
__btrfs_close_devices(fs_devices);
free_fs_devices(fs_devices);
}
-   /*
-* Wait for rcu kworkers under __btrfs_close_devices
-* to finish all blkdev_puts so device is really
-* free when umount is done.
-*/
-   rcu_barrier();
return ret;
 }
 
-- 
2.9.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: free btrfs_device in place

2017-10-10 Thread Liu Bo
It's pointless to defer it to a kthread helper as we're not under any
special context.

Signed-off-by: Liu Bo 
---
 fs/btrfs/volumes.c | 14 ++
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d983cea..4a72c45 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -836,26 +836,16 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices 
*fs_devices, int step)
mutex_unlock(_mutex);
 }
 
-static void __free_device(struct work_struct *work)
+static void free_device(struct rcu_head *head)
 {
struct btrfs_device *device;
 
-   device = container_of(work, struct btrfs_device, rcu_work);
+   device = container_of(head, struct btrfs_device, rcu);
rcu_string_free(device->name);
bio_put(device->flush_bio);
kfree(device);
 }
 
-static void free_device(struct rcu_head *head)
-{
-   struct btrfs_device *device;
-
-   device = container_of(head, struct btrfs_device, rcu);
-
-   INIT_WORK(>rcu_work, __free_device);
-   schedule_work(>rcu_work);
-}
-
 static void btrfs_close_bdev(struct btrfs_device *device)
 {
if (device->bdev && device->writeable) {
-- 
2.9.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: doc: update help/document of btrfs device remove

2017-10-10 Thread Satoru Takeuchi
At Tue, 3 Oct 2017 17:12:39 +0900,
Misono, Tomohiro wrote:
> 
> This patch updates help/document of "btrfs device remove" in two points:
> 
> 1. Add explanation of 'missing' for 'device remove'. This is only
> written in wikipage currently.
> (https://btrfs.wiki.kernel.org/index.php/Using_Btrfs_with_Multiple_Devices)
> 
> 2. Add example of device removal in the man document. This is because
> that explanation of "remove" says "See the example section below", but
> there is no example of removal currently.
> 
> Signed-off-by: Tomohiro Misono 
> ---
>  Documentation/btrfs-device.asciidoc | 19 +++
>  cmds-device.c   | 10 +-
>  2 files changed, 28 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/btrfs-device.asciidoc 
> b/Documentation/btrfs-device.asciidoc
> index 88822ec..dc523a9 100644
> --- a/Documentation/btrfs-device.asciidoc
> +++ b/Documentation/btrfs-device.asciidoc
> @@ -75,6 +75,10 @@ The operation can take long as it needs to move all data 
> from the device.
>  It is possible to delete the device that was used to mount the filesystem. 
> The
>  device entry in mount table will be replaced by another device name with the
>  lowest device id.
> ++
> +If device is mounted as degraded mode (-o degraded), special term "missing"
> +can be used for . In that case, the first device that is described by
> +the filesystem metadata, but not presented at the mount time will be removed.
>  
>  *delete* | [|...] ::
>  Alias of remove kept for backward compatibility
> @@ -206,6 +210,21 @@ data or the block groups occupy the whole first device.
>  The device size of '/dev/sdb' as seen by the filesystem remains unchanged, 
> but
>  the logical space from 50-100GiB will be unused.
>  
> + REMOVE DEVICE 

It's a part of "TYPICAL USECASES" section. So it's also necessary to modify
the following sentence

===
See the example section below.
===

to as follow.

===
See the *TYPICAL USECASES* section below.
===

Or just removing the above mentioned sentence is also OK since there is
"See the section *TYPICAL USECASES* for some examples." in "DEVICE MANAGEMENT"
section.

> +
> +Device removal must satisfy the profile constraints, otherwise the command
> +fails. For example:
> +
> + $ btrfs device remove /dev/sda /mnt
> + $ ERROR: error removing device '/dev/sda': unable to go below two devices 
> on raid1

s/^$  ERROR/ERROR/

> +
> +
> +In order to remove a device, you need to convert profile in this case:
> +
> + $ btrfs balance start -mconvert=dup /mnt
> + $ btrfs balance start -dconvert=single /mnt

It's simpler to convert both the RAID configuration of data and metadata
by the following one command.

$ btrfs balance -mconvert=dup -dconvert=single /mnt

> + $ btrfs device remove /dev/sda /mnt
> +
>  DEVICE STATS
>  
>  
> diff --git a/cmds-device.c b/cmds-device.c
> index 4337eb2..6cb53ff 100644
> --- a/cmds-device.c
> +++ b/cmds-device.c
> @@ -224,9 +224,16 @@ static int _cmd_device_remove(int argc, char **argv,
>   return !!ret;
>  }
>  
> +#define COMMON_USAGE_REMOVE_DELETE \
> + "", \
> + "If 'missing' is specified for , the first device that is", \
> + "described by the filesystem metadata, but not presented at the", \
> + "mount time will be removed."
> +
>  static const char * const cmd_device_remove_usage[] = {
>   "btrfs device remove | [|...] ",
>   "Remove a device from a filesystem",
> + COMMON_USAGE_REMOVE_DELETE,
>   NULL
>  };
>  
> @@ -237,7 +244,8 @@ static int cmd_device_remove(int argc, char **argv)
>  
>  static const char * const cmd_device_delete_usage[] = {
>   "btrfs device delete | [|...] ",
> - "Remove a device from a filesystem",
> + "Remove a device from a filesystem (alias of \"btrfs device remove\")",
> + COMMON_USAGE_REMOVE_DELETE,
>   NULL
>  };

This snippet is not related to the description of this patch.
Dividing this patch is better.

Thanks,
Satoru

>  
> -- 
> 2.9.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] Btrfs: avoid losing data raid profile when deleting a device

2017-10-10 Thread Liu Bo
We've avoided data losing raid profile when doing balance, but it
turns out that deleting a device could also result in the same
problem.

This fixes the problem by creating an empty data chunk before
relocating the data chunk.

Metadata/System chunk are supposed to have non-zero bytes all the time
so their raid profile is persistent.

Reported-by: James Alandt 
Signed-off-by: Liu Bo 
---

v2: - return the correct error.
- move helper ahead of __btrfs_balance().

 fs/btrfs/volumes.c | 84 ++
 1 file changed, 65 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4a72c45..a74396d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3018,6 +3018,48 @@ static int btrfs_relocate_sys_chunks(struct 
btrfs_fs_info *fs_info)
return ret;
 }
 
+/*
+ * return 1 : allocate a data chunk successfully,
+ * return <0: errors during allocating a data chunk,
+ * return 0 : no need to allocate a data chunk.
+ */
+static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
+ u64 chunk_offset)
+{
+   struct btrfs_block_group_cache *cache;
+   u64 bytes_used;
+   u64 chunk_type;
+
+   cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+   ASSERT(cache);
+   chunk_type = cache->flags;
+   btrfs_put_block_group(cache);
+
+   if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
+   spin_lock(_info->data_sinfo->lock);
+   bytes_used = fs_info->data_sinfo->bytes_used;
+   spin_unlock(_info->data_sinfo->lock);
+
+   if (!bytes_used) {
+   struct btrfs_trans_handle *trans;
+   int ret;
+
+   trans = btrfs_join_transaction(fs_info->tree_root);
+   if (IS_ERR(trans))
+   return PTR_ERR(trans);
+
+   ret = btrfs_force_chunk_alloc(trans, fs_info,
+ BTRFS_BLOCK_GROUP_DATA);
+   btrfs_end_transaction(trans);
+   if (ret < 0)
+   return ret;
+
+   return 1;
+   }
+   }
+   return 0;
+}
+
 static int insert_balance_item(struct btrfs_fs_info *fs_info,
   struct btrfs_balance_control *bctl)
 {
@@ -3476,7 +3518,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
u32 count_meta = 0;
u32 count_sys = 0;
int chunk_reserved = 0;
-   u64 bytes_used = 0;
 
/* step one make some room on all the devices */
devices = _info->fs_devices->devices;
@@ -3635,28 +3676,21 @@ static int __btrfs_balance(struct btrfs_fs_info 
*fs_info)
goto loop;
}
 
-   ASSERT(fs_info->data_sinfo);
-   spin_lock(_info->data_sinfo->lock);
-   bytes_used = fs_info->data_sinfo->bytes_used;
-   spin_unlock(_info->data_sinfo->lock);
-
-   if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
-   !chunk_reserved && !bytes_used) {
-   trans = btrfs_start_transaction(chunk_root, 0);
-   if (IS_ERR(trans)) {
-   mutex_unlock(_info->delete_unused_bgs_mutex);
-   ret = PTR_ERR(trans);
-   goto error;
-   }
-
-   ret = btrfs_force_chunk_alloc(trans, fs_info,
- BTRFS_BLOCK_GROUP_DATA);
-   btrfs_end_transaction(trans);
+   if (!chunk_reserved) {
+   /*
+* We may be relocating the only data chunk we have,
+* which could potentially end up with losing data's
+* raid profile, so lets allocate an empty one in
+* advance.
+*/
+   ret = btrfs_may_alloc_data_chunk(fs_info,
+found_key.offset);
if (ret < 0) {
mutex_unlock(_info->delete_unused_bgs_mutex);
goto error;
+   } else if (ret == 1) {
+   chunk_reserved = 1;
}
-   chunk_reserved = 1;
}
 
ret = btrfs_relocate_chunk(fs_info, found_key.offset);
@@ -4419,6 +4453,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 
new_size)
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
btrfs_release_path(path);
 
+   /*
+* We may be relocating the only data chunk we have,
+  

Re: [PATCH v2 5/5] btrfs: ensure that metadata and flush are issued from the root cgroup

2017-10-10 Thread Liu Bo
On Tue, Oct 10, 2017 at 09:43:26AM -0700, Tejun Heo wrote:
> From 3bbed8c7747739cda48f592f165e8839da076a3a Mon Sep 17 00:00:00 2001
> 
> Issuing metdata or otherwise shared IOs from !root cgroup can lead to
> priority inversion.  This patch ensures that those IOs are always
> issued from the root cgroup.
> 
> This patch updates btrfs_update_iflags() to not set S_CGROUPWB on
> btree_inodes.  This isn't strictly necessary as those inodes don't
> call the function during init; however, this serves as documentation
> and prevents possible future mistakes.  If this isn't desirable,
> please feel free to drop the section.
> 
> v2: Fixed missing @bh in submit_bh_blkcg_css() call.
>

Looks good.

Reviewed-by: Liu Bo 

-liubo

> Signed-off-by: Tejun Heo 
> Cc: Chris Mason 
> Cc: Josef Bacik 
> ---
>  fs/btrfs/check-integrity.c | 2 +-
>  fs/btrfs/disk-io.c | 4 
>  fs/btrfs/ioctl.c   | 4 +++-
>  3 files changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
> index 7d5a9b5..d66774e 100644
> --- a/fs/btrfs/check-integrity.c
> +++ b/fs/btrfs/check-integrity.c
> @@ -2741,7 +2741,7 @@ int btrfsic_submit_bh(int op, int op_flags, struct 
> buffer_head *bh)
>   struct btrfsic_dev_state *dev_state;
>  
>   if (!btrfsic_is_initialized)
> - return submit_bh(op, op_flags, bh);
> + return submit_bh_blkcg_css(op, op_flags, bh, blkcg_root_css);
>  
>   mutex_lock(_mutex);
>   /* since btrfsic_submit_bh() might also be called before
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index dfdab84..fe8bbe1 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -1025,6 +1025,8 @@ static blk_status_t btree_submit_bio_hook(void 
> *private_data, struct bio *bio,
>   int async = check_async_write(bio_flags);
>   blk_status_t ret;
>  
> + bio_associate_blkcg(bio, blkcg_root_css);
> +
>   if (bio_op(bio) != REQ_OP_WRITE) {
>   /*
>* called for a read, do the setup so that checksum validation
> @@ -3512,6 +3514,8 @@ static void write_dev_flush(struct btrfs_device *device)
>   return;
>  
>   bio_reset(bio);
> + bio_associate_blkcg(bio, blkcg_root_css);
> +
>   bio->bi_end_io = btrfs_end_empty_barrier;
>   bio_set_dev(bio, device->bdev);
>   bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 117cc63..8a7db6c 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -150,7 +150,9 @@ void btrfs_update_iflags(struct inode *inode)
>   new_fl |= S_NOATIME;
>   if (ip->flags & BTRFS_INODE_DIRSYNC)
>   new_fl |= S_DIRSYNC;
> - new_fl |= S_CGROUPWB;
> + /* btree_inodes are always in the root cgroup */
> + if (btrfs_ino(ip) != BTRFS_BTREE_INODE_OBJECTID)
> + new_fl |= S_CGROUPWB;
>  
>   set_mask_bits(>i_flags,
> S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
> -- 
> 2.9.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: avoid losing data raid profile when deleting a device

2017-10-10 Thread Liu Bo
On Tue, Oct 10, 2017 at 09:57:46AM +0300, Nikolay Borisov wrote:
> 
> 
> On  9.10.2017 21:01, Liu Bo wrote:
> > We've avoided data losing raid profile when doing balance, but it
> > turns out that deleting a device could also result in the same
> > problem.
> > 
> > This fixes the problem by creating an empty data chunk before
> > relocating the data chunk.
> > 
> > Metadata/System chunk are supposed to have non-zero bytes all the time
> > so their raid profile is persistent.
> 
> This patch introduces new warning:
> 
> fs/btrfs/volumes.c:3523:29: note: ‘trans’ was declared here
>   struct btrfs_trans_handle *trans;
>

Not sure how I missed this, thanks for pointing it out.

> 
> > 
> > Reported-by: James Alandt 
> > Signed-off-by: Liu Bo 
> > ---
> >  fs/btrfs/volumes.c | 87 
> > ++
> >  1 file changed, 68 insertions(+), 19 deletions(-)
> > 
> > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> > index 4a72c45..3f48bcd 100644
> > --- a/fs/btrfs/volumes.c
> > +++ b/fs/btrfs/volumes.c
> > @@ -144,6 +144,8 @@ static int __btrfs_map_block(struct btrfs_fs_info 
> > *fs_info,
> >  u64 logical, u64 *length,
> >  struct btrfs_bio **bbio_ret,
> >  int mirror_num, int need_raid_map);
> > +static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
> > + u64 chunk_offset);
> 
> Also there is no need to have this forward declaration, the function can
> just as well be put right before __btrfs_balance. Let's try and keep
> changes minimal.
>

OK.

> >  
> >  DEFINE_MUTEX(uuid_mutex);
> >  static LIST_HEAD(fs_uuids);
> > @@ -3476,7 +3478,6 @@ static int __btrfs_balance(struct btrfs_fs_info 
> > *fs_info)
> > u32 count_meta = 0;
> > u32 count_sys = 0;
> > int chunk_reserved = 0;
> > -   u64 bytes_used = 0;
> >  
> > /* step one make some room on all the devices */
> > devices = _info->fs_devices->devices;
> > @@ -3635,28 +3636,22 @@ static int __btrfs_balance(struct btrfs_fs_info 
> > *fs_info)
> > goto loop;
> > }
> >  
> > -   ASSERT(fs_info->data_sinfo);
> > -   spin_lock(_info->data_sinfo->lock);
> > -   bytes_used = fs_info->data_sinfo->bytes_used;
> > -   spin_unlock(_info->data_sinfo->lock);
> > -
> > -   if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
> > -   !chunk_reserved && !bytes_used) {
> > -   trans = btrfs_start_transaction(chunk_root, 0);
> > -   if (IS_ERR(trans)) {
> > -   mutex_unlock(_info->delete_unused_bgs_mutex);
> > -   ret = PTR_ERR(trans);
> > -   goto error;
> > -   }
> > -
> > -   ret = btrfs_force_chunk_alloc(trans, fs_info,
> > - BTRFS_BLOCK_GROUP_DATA);
> > -   btrfs_end_transaction(trans);
> > +   if (!chunk_reserved) {
> > +   /*
> > +* We may be relocating the only data chunk we have,
> > +* which could potentially end up with losing data's
> > +* raid profile, so lets allocate an empty one in
> > +* advance.
> > +*/
> > +   ret = btrfs_may_alloc_data_chunk(fs_info,
> > +found_key.offset);
> > if (ret < 0) {
> > mutex_unlock(_info->delete_unused_bgs_mutex);
> > +   ret = PTR_ERR(trans);

I'll remove this ret = PTR_ERR(trans);

-liubo

> > goto error;
> > +   } else if (ret == 1) {
> > +   chunk_reserved = 1;
> > }
> > -   chunk_reserved = 1;
> > }
> >  
> > ret = btrfs_relocate_chunk(fs_info, found_key.offset);
> > @@ -4327,6 +4322,48 @@ int btrfs_check_uuid_tree(struct btrfs_fs_info 
> > *fs_info)
> >  }
> >  
> >  /*
> > + * return 1 : allocate a data chunk successfully,
> > + * return <0: errors during allocating a data chunk,
> > + * return 0 : no need to allocate a data chunk.
> > + */
> > +static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
> > + u64 chunk_offset)
> > +{
> > +   struct btrfs_block_group_cache *cache;
> > +   u64 bytes_used;
> > +   u64 chunk_type;
> > +
> > +   cache = btrfs_lookup_block_group(fs_info, chunk_offset);
> > +   ASSERT(cache);
> > +   chunk_type = cache->flags;
> > +   btrfs_put_block_group(cache);
> > +
> > +   if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
> > +   spin_lock(_info->data_sinfo->lock);
> > +   bytes_used = fs_info->data_sinfo->bytes_used;
> > +   

Re: [PATCH] btrfs: fix max chunk size on dup

2017-10-10 Thread Hans van Kranenburg
Sorry for the mail spam, it's an interesting code puzzle... :)

On 10/10/2017 07:22 PM, Hans van Kranenburg wrote:
> On 10/10/2017 07:07 PM, Hans van Kranenburg wrote:
>> On 10/10/2017 01:31 PM, David Sterba wrote:
>>> Hi,
>>>
>>> On Fri, Sep 29, 2017 at 04:20:51PM +0900, Naohiro Aota wrote:
 Balancing a fresh METADATA=dup btrfs file system (with size < 50G)
 generates a 128MB sized block group. While we set max_stripe_size =
 max_chunk_size = 256MB, we get this half sized block group:

 $ btrfs ins dump-t -t CHUNK_TREE btrfs.img|grep length
 length 8388608 owner 2 stripe_len 65536 type DATA
 length 33554432 owner 2 stripe_len 65536 type SYSTEM|DUP
 length 134217728 owner 2 stripe_len 65536 type METADATA|DUP

 Before commit 86db25785a6e ("Btrfs: fix max chunk size on raid5/6"), we
 used "stripe_size * ndevs > max_chunk_size * ncopies" to check the max
 chunk size. Since stripe_size = 256MB * dev_stripes (= 2) = 512MB, ndevs
 = 1, max_chunk_size = 256MB, and ncopies = 2, we allowed 256MB
 METADATA|DUP block group.

 But now, we use "stripe_size * data_stripes > max_chunk_size". Since
 data_stripes = 1 on DUP, it disallows the block group to have > 128MB.
 What missing here is "dev_stripes". Proper logical space used by the block
 group is "stripe_size * data_stripes / dev_stripes". Tweak the equations to
 use the right value.
>>>
>>> I started looking into it and still don't fully understand it. Change
>>> deep in the allocator can easily break some blockgroup combinations, so
>>> I'm rather conservative here.
>>
>> I think that the added usage of data_stripes in 86db25785a6e is the
>> problematic change. data_stripes is something that was introduced as
>> part of RAID56 in 53b381b3a and clearly only has a meaning that's
>> properly thought of for RAID56. The RAID56 commit already adds "this
>> will have to be fixed for RAID1 and RAID10 over more drives", only the
>> author doesn't catch the DUP case, which already breaks at that point.
>>
>> At the beginning it says:
>>
>> int data_stripes; /* number of stripes that count for block group size */
>>
>> For the example:
>>
>> This is DUP:
>>
>> .sub_stripes= 1,
>> .dev_stripes= 2,
>> .devs_max   = 1,
>> .devs_min   = 1,
>> .tolerated_failures = 0,
>> .devs_increment = 1,
>> .ncopies= 2,
>>
>> In the code:
>>
>> max_stripe_size = SZ_256M
>> max_chunk_size = max_stripe_size-> SZ_256M
>>
>> Then we have find_free_dev_extent:
>> max_stripe_size * dev_stripes   -> SZ_256M * 2 -> 512M
>>
>> So we like to find 512M on a disk, to stuff 2 stripes of 256M inside for
>> the DUP. (remember: The two parts of DUP *never* end up on a different
>> disk, even if you have multiple)
>>

Another better fix would be to make a change here...

>> If we find one:
>> stripe_size = devices_info[ndevs-1].max_avail   -> 512M, yay

...because this is not yay. The 512M is max_avail, which needs to holds
*2* stripes, not 1. So stripe_size is suddenly changed to twice the
stripe size for DUP.

So, an additional division again by dev_stripes would translate the
max_avail on device ndevs-1 to the stripe size to use.

>>
>> num_stripes = ndevs * dev_stripes -> 1 * 2  -> 2
>>
>> data_stripes = num_stripes / ncopies = 2 / 2 = 1
> 
> Oh, wow, this is not true of course, because the "number of stripes that
> count for block group size" should still be 1 for DUP...
> 
>> BOOM! There's the problem. The data_stripes only thinks about data that
>> is horizontally spread over disks, and not vertically spread...
> 
> Hm... no Hans, you're wrong.
> 
>> What I would propose is changing...
>>the data_stripes =  and afterwards trying to correct it with
>> some ifs
>> ...to...
>>a switch/case thing where the explicit logic is put to get the right
>> value for each specific raid type.
>>
>> In case of DUP this simply means data_stripes = 2, because there is no
>> need for fancy calculations about spreading DUP data over X devices.
>> It's always 2 copies on 1 device.
> 
> Eh, nope. 1.
> 
> So, then I end up at the "stripe_size * data_stripes > max_chunk_size"
> again.
> 
> So, yes, Naohiro is right, and DUP is the only case in which this logic
> breaks. DUP is the only one in which it this change makes a difference,
> because it's the only one which has dev_stripes set to something else
> than 1.

If stripe_size doesn't incorrectly get set to dev_stripes * stripe_size
max_avail above, the if works.

The comments below still stand, because doing all those calculations
with stripes and number of devices for something as predictable as DUP
is only confusing. :D

> 
> \o/
> 
>> ...
>>
>> My general feeling when looking at the code, is that this single part of
>> code is responsible for too many different cases, or, more possible
>> cases than a developer can reason about at once "in his head" when
>> working on it.
>>
>> 7 raid options * 3 

Re: [PATCH 1/4] Btrfs: compress_file_range() remove dead variable num_bytes

2017-10-10 Thread David Sterba
On Tue, Oct 03, 2017 at 06:06:01PM +0300, Timofey Titovets wrote:
> Remove dead assigment of num_bytes
> 
> Also as num_bytes only used in will_compress block as
> copy of total_in just replace that with total_in and drop num_bytes entire
> 
> Signed-off-by: Timofey Titovets 
> Reviewed-by: Nikolay Borisov 

Reviewed-by: David Sterba 

Added to the queue.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] Btrfs: btrfs_dedupe_file_range() ioctl, remove 16MiB restriction

2017-10-10 Thread David Sterba
On Tue, Oct 03, 2017 at 06:06:04PM +0300, Timofey Titovets wrote:
> At now btrfs_dedupe_file_range() restricted to 16MiB range for
> limit locking time and memory requirement for dedup ioctl()
> 
> For too big input rage code silently set range to 16MiB
> 
> Let's remove that restriction by do iterating over dedup range.
> That's backward compatible and will not change anything for request
> less then 16MiB.

This would make the ioctl more pleasant to use. So far I haven't found
any problems to do the iteration. One possible speedup could be done to
avoid the repeated allocations in btrfs_extent_same if we're going to
iterate more than once.

As this would mean the 16MiB length restriction is gone, this needs to
bubble up to the documentation
(http://man7.org/linux/man-pages/man2/ioctl_fideduperange.2.html)

Have you tested the behaviour with larger ranges?
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: fix max chunk size on dup

2017-10-10 Thread Hans van Kranenburg
On 10/10/2017 07:07 PM, Hans van Kranenburg wrote:
> On 10/10/2017 01:31 PM, David Sterba wrote:
>> Hi,
>>
>> On Fri, Sep 29, 2017 at 04:20:51PM +0900, Naohiro Aota wrote:
>>> Balancing a fresh METADATA=dup btrfs file system (with size < 50G)
>>> generates a 128MB sized block group. While we set max_stripe_size =
>>> max_chunk_size = 256MB, we get this half sized block group:
>>>
>>> $ btrfs ins dump-t -t CHUNK_TREE btrfs.img|grep length
>>> length 8388608 owner 2 stripe_len 65536 type DATA
>>> length 33554432 owner 2 stripe_len 65536 type SYSTEM|DUP
>>> length 134217728 owner 2 stripe_len 65536 type METADATA|DUP
>>>
>>> Before commit 86db25785a6e ("Btrfs: fix max chunk size on raid5/6"), we
>>> used "stripe_size * ndevs > max_chunk_size * ncopies" to check the max
>>> chunk size. Since stripe_size = 256MB * dev_stripes (= 2) = 512MB, ndevs
>>> = 1, max_chunk_size = 256MB, and ncopies = 2, we allowed 256MB
>>> METADATA|DUP block group.
>>>
>>> But now, we use "stripe_size * data_stripes > max_chunk_size". Since
>>> data_stripes = 1 on DUP, it disallows the block group to have > 128MB.
>>> What missing here is "dev_stripes". Proper logical space used by the block
>>> group is "stripe_size * data_stripes / dev_stripes". Tweak the equations to
>>> use the right value.
>>
>> I started looking into it and still don't fully understand it. Change
>> deep in the allocator can easily break some blockgroup combinations, so
>> I'm rather conservative here.
> 
> I think that the added usage of data_stripes in 86db25785a6e is the
> problematic change. data_stripes is something that was introduced as
> part of RAID56 in 53b381b3a and clearly only has a meaning that's
> properly thought of for RAID56. The RAID56 commit already adds "this
> will have to be fixed for RAID1 and RAID10 over more drives", only the
> author doesn't catch the DUP case, which already breaks at that point.
> 
> At the beginning it says:
> 
> int data_stripes; /* number of stripes that count for block group size */
> 
> For the example:
> 
> This is DUP:
> 
> .sub_stripes= 1,
> .dev_stripes= 2,
> .devs_max   = 1,
> .devs_min   = 1,
> .tolerated_failures = 0,
> .devs_increment = 1,
> .ncopies= 2,
> 
> In the code:
> 
> max_stripe_size = SZ_256M
> max_chunk_size = max_stripe_size-> SZ_256M
> 
> Then we have find_free_dev_extent:
> max_stripe_size * dev_stripes   -> SZ_256M * 2 -> 512M
> 
> So we like to find 512M on a disk, to stuff 2 stripes of 256M inside for
> the DUP. (remember: The two parts of DUP *never* end up on a different
> disk, even if you have multiple)
> 
> If we find one:
> stripe_size = devices_info[ndevs-1].max_avail   -> 512M, yay
> 
> ndevs = min(ndevs, devs_max)  -> min($whatever, 1)   -> 1
> num_stripes = ndevs * dev_stripes -> 1 * 2  -> 2
> 
> data_stripes = num_stripes / ncopies = 2 / 2 = 1

Oh, wow, this is not true of course, because the "number of stripes that
count for block group size" should still be 1 for DUP...

> BOOM! There's the problem. The data_stripes only thinks about data that
> is horizontally spread over disks, and not vertically spread...

Hm... no Hans, you're wrong.

> What I would propose is changing...
>the data_stripes =  and afterwards trying to correct it with
> some ifs
> ...to...
>a switch/case thing where the explicit logic is put to get the right
> value for each specific raid type.
> 
> In case of DUP this simply means data_stripes = 2, because there is no
> need for fancy calculations about spreading DUP data over X devices.
> It's always 2 copies on 1 device.

Eh, nope. 1.

So, then I end up at the "stripe_size * data_stripes > max_chunk_size"
again.

So, yes, Naohiro is right, and DUP is the only case in which this logic
breaks. DUP is the only one in which it this change makes a difference,
because it's the only one which has dev_stripes set to something else
than 1.

\o/

> ...
> 
> My general feeling when looking at the code, is that this single part of
> code is responsible for too many different cases, or, more possible
> cases than a developer can reason about at once "in his head" when
> working on it.
> 
> 7 raid options * 3 different types (data, metadata, system) = 21
> already... Some parts of the algorithm make only sense for a subset of
> the combinations, but they're still part of the computation, which
> sometimes "by accident" results in the correct outcome. :)
> 
> If it can't be done in a way that's easier to understand when reading
> the code, it should have unit tests with a list of known input/output to
> detect unwanted changes.
> 


-- 
Hans van Kranenburg
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: fix max chunk size on dup

2017-10-10 Thread Hans van Kranenburg
On 10/10/2017 01:31 PM, David Sterba wrote:
> Hi,
> 
> On Fri, Sep 29, 2017 at 04:20:51PM +0900, Naohiro Aota wrote:
>> Balancing a fresh METADATA=dup btrfs file system (with size < 50G)
>> generates a 128MB sized block group. While we set max_stripe_size =
>> max_chunk_size = 256MB, we get this half sized block group:
>>
>> $ btrfs ins dump-t -t CHUNK_TREE btrfs.img|grep length
>> length 8388608 owner 2 stripe_len 65536 type DATA
>> length 33554432 owner 2 stripe_len 65536 type SYSTEM|DUP
>> length 134217728 owner 2 stripe_len 65536 type METADATA|DUP
>>
>> Before commit 86db25785a6e ("Btrfs: fix max chunk size on raid5/6"), we
>> used "stripe_size * ndevs > max_chunk_size * ncopies" to check the max
>> chunk size. Since stripe_size = 256MB * dev_stripes (= 2) = 512MB, ndevs
>> = 1, max_chunk_size = 256MB, and ncopies = 2, we allowed 256MB
>> METADATA|DUP block group.
>>
>> But now, we use "stripe_size * data_stripes > max_chunk_size". Since
>> data_stripes = 1 on DUP, it disallows the block group to have > 128MB.
>> What missing here is "dev_stripes". Proper logical space used by the block
>> group is "stripe_size * data_stripes / dev_stripes". Tweak the equations to
>> use the right value.
> 
> I started looking into it and still don't fully understand it. Change
> deep in the allocator can easily break some blockgroup combinations, so
> I'm rather conservative here.

I think that the added usage of data_stripes in 86db25785a6e is the
problematic change. data_stripes is something that was introduced as
part of RAID56 in 53b381b3a and clearly only has a meaning that's
properly thought of for RAID56. The RAID56 commit already adds "this
will have to be fixed for RAID1 and RAID10 over more drives", only the
author doesn't catch the DUP case, which already breaks at that point.

At the beginning it says:

int data_stripes; /* number of stripes that count for block group size */

For the example:

This is DUP:

.sub_stripes= 1,
.dev_stripes= 2,
.devs_max   = 1,
.devs_min   = 1,
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies= 2,

In the code:

max_stripe_size = SZ_256M
max_chunk_size = max_stripe_size-> SZ_256M

Then we have find_free_dev_extent:
max_stripe_size * dev_stripes   -> SZ_256M * 2 -> 512M

So we like to find 512M on a disk, to stuff 2 stripes of 256M inside for
the DUP. (remember: The two parts of DUP *never* end up on a different
disk, even if you have multiple)

If we find one:
stripe_size = devices_info[ndevs-1].max_avail   -> 512M, yay

ndevs = min(ndevs, devs_max)  -> min($whatever, 1)   -> 1
num_stripes = ndevs * dev_stripes -> 1 * 2  -> 2

data_stripes = num_stripes / ncopies = 2 / 2 = 1

BOOM! There's the problem. The data_stripes only thinks about data that
is horizontally spread over disks, and not vertically spread...

What I would propose is changing...
   the data_stripes =  and afterwards trying to correct it with
some ifs
...to...
   a switch/case thing where the explicit logic is put to get the right
value for each specific raid type.

In case of DUP this simply means data_stripes = 2, because there is no
need for fancy calculations about spreading DUP data over X devices.
It's always 2 copies on 1 device.

...

My general feeling when looking at the code, is that this single part of
code is responsible for too many different cases, or, more possible
cases than a developer can reason about at once "in his head" when
working on it.

7 raid options * 3 different types (data, metadata, system) = 21
already... Some parts of the algorithm make only sense for a subset of
the combinations, but they're still part of the computation, which
sometimes "by accident" results in the correct outcome. :)

If it can't be done in a way that's easier to understand when reading
the code, it should have unit tests with a list of known input/output to
detect unwanted changes.

-- 
Hans van Kranenburg
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 5/5] btrfs: ensure that metadata and flush are issued from the root cgroup

2017-10-10 Thread Tejun Heo
>From 3bbed8c7747739cda48f592f165e8839da076a3a Mon Sep 17 00:00:00 2001

Issuing metdata or otherwise shared IOs from !root cgroup can lead to
priority inversion.  This patch ensures that those IOs are always
issued from the root cgroup.

This patch updates btrfs_update_iflags() to not set S_CGROUPWB on
btree_inodes.  This isn't strictly necessary as those inodes don't
call the function during init; however, this serves as documentation
and prevents possible future mistakes.  If this isn't desirable,
please feel free to drop the section.

v2: Fixed missing @bh in submit_bh_blkcg_css() call.

Signed-off-by: Tejun Heo 
Cc: Chris Mason 
Cc: Josef Bacik 
---
 fs/btrfs/check-integrity.c | 2 +-
 fs/btrfs/disk-io.c | 4 
 fs/btrfs/ioctl.c   | 4 +++-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 7d5a9b5..d66774e 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2741,7 +2741,7 @@ int btrfsic_submit_bh(int op, int op_flags, struct 
buffer_head *bh)
struct btrfsic_dev_state *dev_state;
 
if (!btrfsic_is_initialized)
-   return submit_bh(op, op_flags, bh);
+   return submit_bh_blkcg_css(op, op_flags, bh, blkcg_root_css);
 
mutex_lock(_mutex);
/* since btrfsic_submit_bh() might also be called before
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dfdab84..fe8bbe1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1025,6 +1025,8 @@ static blk_status_t btree_submit_bio_hook(void 
*private_data, struct bio *bio,
int async = check_async_write(bio_flags);
blk_status_t ret;
 
+   bio_associate_blkcg(bio, blkcg_root_css);
+
if (bio_op(bio) != REQ_OP_WRITE) {
/*
 * called for a read, do the setup so that checksum validation
@@ -3512,6 +3514,8 @@ static void write_dev_flush(struct btrfs_device *device)
return;
 
bio_reset(bio);
+   bio_associate_blkcg(bio, blkcg_root_css);
+
bio->bi_end_io = btrfs_end_empty_barrier;
bio_set_dev(bio, device->bdev);
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 117cc63..8a7db6c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -150,7 +150,9 @@ void btrfs_update_iflags(struct inode *inode)
new_fl |= S_NOATIME;
if (ip->flags & BTRFS_INODE_DIRSYNC)
new_fl |= S_DIRSYNC;
-   new_fl |= S_CGROUPWB;
+   /* btree_inodes are always in the root cgroup */
+   if (btrfs_ino(ip) != BTRFS_BTREE_INODE_OBJECTID)
+   new_fl |= S_CGROUPWB;
 
set_mask_bits(>i_flags,
  S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] Btrfs: handle unaligned tail of data ranges more efficient

2017-10-10 Thread David Sterba
On Tue, Oct 03, 2017 at 06:06:03PM +0300, Timofey Titovets wrote:
> At now while switch page bits in data ranges
> we always hande +1 page, for cover case
> where end of data range is not page aligned

The 'end' is inclusive and thus not aligned in most cases, ie. it's
offset 4095 in the page, so the IS_ALIGNED is allways true and the code
is equivalent to the existing condition (index <= end_index).
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/4] Btrfs: clear_dirty only on pages only in compression range

2017-10-10 Thread David Sterba
On Tue, Oct 03, 2017 at 06:06:02PM +0300, Timofey Titovets wrote:
> We need to call extent_range_clear_dirty_for_io()
> on compression range to prevent application from changing
> page content, while pages compressing.
> 
> but "(end - start)" can be much (up to 1024 times) bigger
> then compression range (BTRFS_MAX_UNCOMPRESSED), so optimize that
> by calculating compression range for
> that loop iteration, and flip bits only on that range

I'm not sure this is safe to do. Compress_file_range gets the whole
range [start,end] from async_cow_start, and other parts of code might
rely on the status of the whole range, ie. not partially redirtied.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHSET v2] cgroup, writeback, btrfs: make sure btrfs issues metadata IOs from the root cgroup

2017-10-10 Thread Tejun Heo
Hello,

Changes from the last version are

* blkcg_root_css exported to fix build breakage on modular btrfs.

* Use ext4_should_journal_data() test instead of
  EXT4_MOUNT_JOURNAL_DATA.

* Separated out create_bh_bio() and used it to implement
  submit_bh_blkcg_css() as suggested by Jan.

btrfs has different ways to issue metadata IOs and may end up issuing
metadata or otherwise shared IOs from a non-root cgroup, which can
lead to priority inversion and ineffective IO isolation.

This patchset makes sure that btrfs issues all metadata and shared IOs
from the root cgroup by exempting btree_inodes from cgroup writeback
and explicitly associating shared IOs with the root cgroup.

This patchset containst he following three patches

 [PATCH 1/5] blkcg: export blkcg_root_css
 [PATCH 2/5] cgroup, writeback: replace SB_I_CGROUPWB with per-inode
 [PATCH 3/5] buffer_head: separate out create_bh_bio() from
 [PATCH 4/5] cgroup, buffer_head: implement submit_bh_blkcg_css()
 [PATCH 5/5] btrfs: ensure that metadata and flush are issued from the

and is also available in the following git branch

 git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
review-cgroup-btrfs-metadata-v2

diffstat follows.  Thanks.

 block/blk-cgroup.c  |1 +
 fs/block_dev.c  |3 +--
 fs/btrfs/check-integrity.c  |2 +-
 fs/btrfs/disk-io.c  |4 
 fs/btrfs/ioctl.c|6 +-
 fs/btrfs/super.c|1 -
 fs/buffer.c |   42 ++
 fs/ext2/inode.c |3 ++-
 fs/ext2/super.c |1 -
 fs/ext4/inode.c |5 -
 fs/ext4/super.c |2 --
 include/linux/backing-dev.h |2 +-
 include/linux/buffer_head.h |3 +++
 include/linux/fs.h  |3 ++-
 14 files changed, 58 insertions(+), 20 deletions(-)

--
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/5] buffer_head: separate out create_bh_bio() from submit_bh_wbc()

2017-10-10 Thread Tejun Heo
submit_bh_wbc() creates a bio matching the specific @bh and submits it
at the end.  This patch separates out the bio creation part to its own
function, create_bh_bio(), and reimplement submit_bh[_wbc]() using the
function.

As bio can now be manipulated before submitted, we can move out @wbc
handling into submit_bh_wbc() and similarly this will make adding more
submit_bh variants straight-forward.

This patch is pure refactoring and doesn't cause any functional
changes.

Signed-off-by: Tejun Heo 
Suggested-by: Jan Kara 
---
 fs/buffer.c | 30 ++
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 170df85..b4b2169 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3086,8 +3086,8 @@ void guard_bio_eod(int op, struct bio *bio)
}
 }
 
-static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
-enum rw_hint write_hint, struct writeback_control *wbc)
+struct bio *create_bh_bio(int op, int op_flags, struct buffer_head *bh,
+  enum rw_hint write_hint)
 {
struct bio *bio;
 
@@ -3109,11 +3109,6 @@ static int submit_bh_wbc(int op, int op_flags, struct 
buffer_head *bh,
 */
bio = bio_alloc(GFP_NOIO, 1);
 
-   if (wbc) {
-   wbc_init_bio(wbc, bio);
-   wbc_account_io(wbc, bh->b_page, bh->b_size);
-   }
-
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_write_hint = write_hint;
@@ -3133,13 +3128,32 @@ static int submit_bh_wbc(int op, int op_flags, struct 
buffer_head *bh,
op_flags |= REQ_PRIO;
bio_set_op_attrs(bio, op, op_flags);
 
+   return bio;
+}
+
+static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
+enum rw_hint write_hint, struct writeback_control *wbc)
+{
+   struct bio *bio;
+
+   bio = create_bh_bio(op, op_flags, bh, write_hint);
+
+   if (wbc) {
+   wbc_init_bio(wbc, bio);
+   wbc_account_io(wbc, bh->b_page, bh->b_size);
+   }
+
submit_bio(bio);
return 0;
 }
 
 int submit_bh(int op, int op_flags, struct buffer_head *bh)
 {
-   return submit_bh_wbc(op, op_flags, bh, 0, NULL);
+   struct bio *bio;
+
+   bio = create_bh_bio(op, op_flags, bh, 0);
+   submit_bio(bio);
+   return 0;
 }
 EXPORT_SYMBOL(submit_bh);
 
-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/5] cgroup, writeback: replace SB_I_CGROUPWB with per-inode S_CGROUPWB

2017-10-10 Thread Tejun Heo
Currently, filesystem can indiate cgroup writeback support per
superblock; however, depending on the filesystem, especially if inodes
are used to carry metadata, it can be useful to indicate cgroup
writeback support per inode.

This patch replaces the superblock flag SB_I_CGROUPWB with per-inode
S_CGROUPWB, so that cgroup writeback can be enabled selectively.

* block_dev sets the new flag in bdget() when initializing new inode.

* ext2 sets the new flag in ext2_set_inode_flags().

* ext4 sets the new flag in ext4_set_inode_flags() if
!ext4_should_journal_data(inode).  This makes inodes whose data is
journalled due to inode flags to bypass cgroup writeback.  This
behavior change is intended.

* btrfs sets the new flag in btrfs_update_iflags() function.  Note
  that this automatically excludes btree_inode which doesn't use
  btrfs_update_iflags() during initialization.  This behavior change
  is intended.

v2: Use ext4_should_journal_data() as suggested by Jan.

Signed-off-by: Tejun Heo 
Reviewed-by: Jan Kara 
Cc: Jens Axboe 
Cc: Chris Mason 
Cc: Josef Bacik 
Cc: linux-btrfs@vger.kernel.org
Cc: "Theodore Ts'o" 
Cc: Andreas Dilger 
Cc: linux-e...@vger.kernel.org
---
 fs/block_dev.c  | 3 +--
 fs/btrfs/ioctl.c| 4 +++-
 fs/btrfs/super.c| 1 -
 fs/ext2/inode.c | 3 ++-
 fs/ext2/super.c | 1 -
 fs/ext4/inode.c | 5 -
 fs/ext4/super.c | 2 --
 include/linux/backing-dev.h | 2 +-
 include/linux/fs.h  | 3 ++-
 9 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 93d088f..ff9c282 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -800,8 +800,6 @@ static struct dentry *bd_mount(struct file_system_type 
*fs_type,
 {
struct dentry *dent;
dent = mount_pseudo(fs_type, "bdev:", _sops, NULL, BDEVFS_MAGIC);
-   if (!IS_ERR(dent))
-   dent->d_sb->s_iflags |= SB_I_CGROUPWB;
return dent;
 }
 
@@ -893,6 +891,7 @@ struct block_device *bdget(dev_t dev)
inode->i_mode = S_IFBLK;
inode->i_rdev = dev;
inode->i_bdev = bdev;
+   inode->i_flags |= S_CGROUPWB;
inode->i_data.a_ops = _blk_aops;
mapping_set_gfp_mask(>i_data, GFP_USER);
spin_lock(_lock);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6c7a49f..117cc63 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -150,9 +150,11 @@ void btrfs_update_iflags(struct inode *inode)
new_fl |= S_NOATIME;
if (ip->flags & BTRFS_INODE_DIRSYNC)
new_fl |= S_DIRSYNC;
+   new_fl |= S_CGROUPWB;
 
set_mask_bits(>i_flags,
- S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
+ S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
+ S_CGROUPWB,
  new_fl);
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 35a128a..46a1488 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1136,7 +1136,6 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_flags |= MS_POSIXACL;
 #endif
sb->s_flags |= MS_I_VERSION;
-   sb->s_iflags |= SB_I_CGROUPWB;
 
err = super_setup_bdi(sb);
if (err) {
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 4dca6f3..3c5d822 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1372,7 +1372,7 @@ void ext2_set_inode_flags(struct inode *inode)
unsigned int flags = EXT2_I(inode)->i_flags;
 
inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
-   S_DIRSYNC | S_DAX);
+   S_DIRSYNC | S_DAX | S_CGROUPWB);
if (flags & EXT2_SYNC_FL)
inode->i_flags |= S_SYNC;
if (flags & EXT2_APPEND_FL)
@@ -1385,6 +1385,7 @@ void ext2_set_inode_flags(struct inode *inode)
inode->i_flags |= S_DIRSYNC;
if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
inode->i_flags |= S_DAX;
+   inode->i_flags |= S_CGROUPWB;
 }
 
 struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1458706..e6ba669e 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -922,7 +922,6 @@ static int ext2_fill_super(struct super_block *sb, void 
*data, int silent)
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
 MS_POSIXACL : 0);
-   sb->s_iflags |= SB_I_CGROUPWB;
 
if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
(EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 31db875..d92c356 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4591,8 +4591,11 @@ void 

[PATCH 4/5] cgroup, buffer_head: implement submit_bh_blkcg_css()

2017-10-10 Thread Tejun Heo
Implement submit_bh_blkcg_css() which will be used to override cgroup
membership on specific buffer_heads.

v2: Reimplemented using create_bh_bio() as suggested by Jan.

Signed-off-by: Tejun Heo 
Cc: Jan Kara 
Cc: Jens Axboe 
---
 fs/buffer.c | 12 
 include/linux/buffer_head.h |  3 +++
 2 files changed, 15 insertions(+)

diff --git a/fs/buffer.c b/fs/buffer.c
index b4b2169..ed0e473 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3147,6 +3147,18 @@ static int submit_bh_wbc(int op, int op_flags, struct 
buffer_head *bh,
return 0;
 }
 
+int submit_bh_blkcg_css(int op, int op_flags, struct buffer_head *bh,
+   struct cgroup_subsys_state *blkcg_css)
+{
+   struct bio *bio;
+
+   bio = create_bh_bio(op, op_flags, bh, 0);
+   bio_associate_blkcg(bio, blkcg_css);
+   submit_bio(bio);
+   return 0;
+}
+EXPORT_SYMBOL(submit_bh_blkcg_css);
+
 int submit_bh(int op, int op_flags, struct buffer_head *bh)
 {
struct bio *bio;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index c8dae55..dca5b3b 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_BLOCK
 
@@ -197,6 +198,8 @@ void ll_rw_block(int, int, int, struct buffer_head * bh[]);
 int sync_dirty_buffer(struct buffer_head *bh);
 int __sync_dirty_buffer(struct buffer_head *bh, int op_flags);
 void write_dirty_buffer(struct buffer_head *bh, int op_flags);
+int submit_bh_blkcg_css(int op, int op_flags, struct buffer_head *bh,
+   struct cgroup_subsys_state *blkcg_css);
 int submit_bh(int, int, struct buffer_head *);
 void write_boundary_block(struct block_device *bdev,
sector_t bblock, unsigned blocksize);
-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/5] btrfs: ensure that metadata and flush are issued from the root cgroup

2017-10-10 Thread Tejun Heo
Issuing metdata or otherwise shared IOs from !root cgroup can lead to
priority inversion.  This patch ensures that those IOs are always
issued from the root cgroup.

This patch updates btrfs_update_iflags() to not set S_CGROUPWB on
btree_inodes.  This isn't strictly necessary as those inodes don't
call the function during init; however, this serves as documentation
and prevents possible future mistakes.  If this isn't desirable,
please feel free to drop the section.

Signed-off-by: Tejun Heo 
Cc: Chris Mason 
Cc: Josef Bacik 
---
 fs/btrfs/check-integrity.c | 2 +-
 fs/btrfs/disk-io.c | 4 
 fs/btrfs/ioctl.c   | 4 +++-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 7d5a9b5..058dea6 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2741,7 +2741,7 @@ int btrfsic_submit_bh(int op, int op_flags, struct 
buffer_head *bh)
struct btrfsic_dev_state *dev_state;
 
if (!btrfsic_is_initialized)
-   return submit_bh(op, op_flags, bh);
+   return submit_bh_blkcg_css(op, op_flags, blkcg_root_css);
 
mutex_lock(_mutex);
/* since btrfsic_submit_bh() might also be called before
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dfdab84..fe8bbe1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1025,6 +1025,8 @@ static blk_status_t btree_submit_bio_hook(void 
*private_data, struct bio *bio,
int async = check_async_write(bio_flags);
blk_status_t ret;
 
+   bio_associate_blkcg(bio, blkcg_root_css);
+
if (bio_op(bio) != REQ_OP_WRITE) {
/*
 * called for a read, do the setup so that checksum validation
@@ -3512,6 +3514,8 @@ static void write_dev_flush(struct btrfs_device *device)
return;
 
bio_reset(bio);
+   bio_associate_blkcg(bio, blkcg_root_css);
+
bio->bi_end_io = btrfs_end_empty_barrier;
bio_set_dev(bio, device->bdev);
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 117cc63..8a7db6c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -150,7 +150,9 @@ void btrfs_update_iflags(struct inode *inode)
new_fl |= S_NOATIME;
if (ip->flags & BTRFS_INODE_DIRSYNC)
new_fl |= S_DIRSYNC;
-   new_fl |= S_CGROUPWB;
+   /* btree_inodes are always in the root cgroup */
+   if (btrfs_ino(ip) != BTRFS_BTREE_INODE_OBJECTID)
+   new_fl |= S_CGROUPWB;
 
set_mask_bits(>i_flags,
  S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/5] blkcg: export blkcg_root_css

2017-10-10 Thread Tejun Heo
Export blkcg_root_css so that filesystem modules can use it.

Signed-off-by: Tejun Heo 
---
 block/blk-cgroup.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d3f56ba..597a457 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -45,6 +45,7 @@ struct blkcg blkcg_root;
 EXPORT_SYMBOL_GPL(blkcg_root);
 
 struct cgroup_subsys_state * const blkcg_root_css = _root.css;
+EXPORT_SYMBOL_GPL(blkcg_root_css);
 
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: use appropriate replacements for __sb_{start,end}_write calls

2017-10-10 Thread David Sterba
On Tue, Oct 10, 2017 at 01:48:05PM +0300, Rakesh Pandit wrote:
> Commit a53f4f8e9c8eb ("btrfs: Don't call btrfs_start_transaction() on
> frozen fs to avoid deadlock.") started using internal calls and we
> replace them with more suitable ones.
> 
> Signed-off-by: Rakesh Pandit 
> ---
>  fs/btrfs/super.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 35a128a..99c21ae 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -1205,8 +1205,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
>* happens. The pending operations are delayed to the
>* next commit after thawing.
>*/
> - if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
> - __sb_end_write(sb, SB_FREEZE_WRITE);
> + if (sb_start_write_trylock(sb))
> + sb_end_write(sb)

  CC [M]  fs/btrfs/print-tree.o
fs/btrfs/super.c: In function ‘btrfs_sync_fs’:
fs/btrfs/super.c:1210:4: error: expected ‘;’ before ‘else’
else
^~~~

will be fixed.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/4] btrfs-progs: fsck: Introduce --fix-dev-size option

2017-10-10 Thread David Sterba
On Tue, Oct 10, 2017 at 07:51:11AM +, Qu Wenruo wrote:
> Introduce --fix-dev-size option to fix device related problems.

Please don't add it to 'check', this is not the right place for the
targeted fixes. -> 'btrfs rescue'

> This repairing  is also included in --repair, but considering the safety
> and potential affected users, it's better to provide a option to fix and
> only fix device size related problem to avoid full --repair.
> 
> Reported-by: Asif Youssuff 
> Reported-by: Rich Rauenzahn 
> Signed-off-by: Qu Wenruo 
> ---
>  Documentation/btrfs-check.asciidoc | 23 +++
>  cmds-check.c   | 28 +++-
>  2 files changed, 50 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/btrfs-check.asciidoc 
> b/Documentation/btrfs-check.asciidoc
> index fbf48847ac25..e45c7a457bac 100644
> --- a/Documentation/btrfs-check.asciidoc
> +++ b/Documentation/btrfs-check.asciidoc
> @@ -93,6 +93,29 @@ the entire free space cache. This option with 'v2' 
> provides an alternative
>  method of clearing the free space cache that doesn't require mounting the
>  filesystem.
>  
> +--fix-dev-size::
> +From v4.14-rc kernels, a more restrict device size checker is introduced, 
> while
> +old kernel doesn't strictly align its device size, so it may cause noisy 
> kernel
> +warning for newer kernel, like:
> ++
> +
> +WARNING: CPU: 3 PID: 439 at fs/btrfs/ctree.h:1559 
> btrfs_update_device+0x1c5/0x1d0 [btrfs]
> +
> ++
> +And for some case where super block total device size may mismatch with all
> +devices, and the filesystem will be unable to be mounted, with kernel message
> +like:
> ++
> +
> +BTRFS error (device sdb): super_total_bytes 92017859088384 mismatch with 
> fs_devices total_rw_bytes 92017859094528 
> +
> ++
> +This option will fix both problems by aligning all size of devices, and
> +re-calculating superblock total bytes.
> ++
> +Although such repairing is included in *--repair* option, considering the
> +safety of *--repair*, this option is provided to suppress all other dangerous
> +repairing and only fix device sizes related problems.
>  
>  DANGEROUS OPTIONS
>  -
> diff --git a/cmds-check.c b/cmds-check.c
> index 007781fa5d1b..fdb6d832eee1 100644
> --- a/cmds-check.c
> +++ b/cmds-check.c
> @@ -11746,6 +11746,8 @@ out:
>   return err;
>  }
>  
> +static int reset_devs_size(struct btrfs_fs_info *fs_info);
> +
>  static int do_check_chunks_and_extents(struct btrfs_fs_info *fs_info)
>  {
>   int ret;
> @@ -11756,6 +11758,12 @@ static int do_check_chunks_and_extents(struct 
> btrfs_fs_info *fs_info)
>   ret = check_chunks_and_extents_v2(fs_info);
>   else
>   ret = check_chunks_and_extents(fs_info);
> + /* Also repair device sizes if needed */
> + if (repair && !ret) {
> + ret = reset_devs_size(fs_info);
> + if (ret > 0)
> + ret = 0;
> + }
>  
>   return ret;
>  }
> @@ -13088,6 +13096,8 @@ const char * const cmd_check_usage[] = {
>   "-b|--backup use the first valid backup root copy",
>   "--force skip mount checks, repair is not possible",
>   "--repairtry to repair the filesystem",
> + "--fix-dev-size  repair device size related problem",
> + "will not trigger other repair",
>   "--readonly  run in read-only mode (default)",
>   "--init-csum-treecreate a new CRC tree",
>   "--init-extent-tree  create a new extent tree",
> @@ -13128,6 +13138,7 @@ int cmd_check(int argc, char **argv)
>   int qgroups_repaired = 0;
>   unsigned ctree_flags = OPEN_CTREE_EXCLUSIVE;
>   int force = 0;
> + bool fix_dev_size = false;
>  
>   while(1) {
>   int c;
> @@ -13135,7 +13146,7 @@ int cmd_check(int argc, char **argv)
>   GETOPT_VAL_INIT_EXTENT, GETOPT_VAL_CHECK_CSUM,
>   GETOPT_VAL_READONLY, GETOPT_VAL_CHUNK_TREE,
>   GETOPT_VAL_MODE, GETOPT_VAL_CLEAR_SPACE_CACHE,
> - GETOPT_VAL_FORCE };
> + GETOPT_VAL_FORCE, GETOPT_VAL_FIX_DEV_SIZE };
>   static const struct option long_options[] = {
>   { "super", required_argument, NULL, 's' },
>   { "repair", no_argument, NULL, GETOPT_VAL_REPAIR },
> @@ -13158,6 +13169,8 @@ int cmd_check(int argc, char **argv)
>   { "clear-space-cache", required_argument, NULL,
>   GETOPT_VAL_CLEAR_SPACE_CACHE},
>   { "force", no_argument, NULL, GETOPT_VAL_FORCE },
> + { "fix-dev-size", no_argument, NULL,
> + GETOPT_VAL_FIX_DEV_SIZE },
>   { NULL, 0, NULL, 0}
>   };
>  

Re: [PATCH] btrfs: use appropriate replacements for __sb_{start,end}_write calls

2017-10-10 Thread Rakesh Pandit
On Tue, Oct 10, 2017 at 02:08:11PM +0300, Nikolay Borisov wrote:
> 
> 
> On 10.10.2017 13:48, Rakesh Pandit wrote:
> > Commit a53f4f8e9c8eb ("btrfs: Don't call btrfs_start_transaction() on
> > frozen fs to avoid deadlock.") started using internal calls and we
> > replace them with more suitable ones.
> > 
> > Signed-off-by: Rakesh Pandit 
> > ---
> >  fs/btrfs/super.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> > index 35a128a..99c21ae 100644
> > --- a/fs/btrfs/super.c
> > +++ b/fs/btrfs/super.c
> > @@ -1205,8 +1205,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
> >  * happens. The pending operations are delayed to the
> >  * next commit after thawing.
> >  */
> > -   if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
> > -   __sb_end_write(sb, SB_FREEZE_WRITE);
> > +   if (sb_start_write_trylock(sb))
> > +   sb_end_write(sb)
> > else
> > return 0;
> 
> On second thought, what's to prevent the filesystem to be frozen if
> sb_start_write/sb_end_write code executes? Or even after we are in the
> middle of btrfs_start_transaction?

sb->s_writers.frozen is protected by sb->s_umount and s_umount is
held when ->sync_fs is called.

> 
> 
> > trans = btrfs_start_transaction(root, 0);
> > 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: Use bd_dev to generate index when dev_state_hashtable add items.

2017-10-10 Thread David Sterba
On Mon, Oct 09, 2017 at 01:54:14AM +, Gu, Jinxiang wrote:
> Hi,
> sorry for reply too late.
> 
> This patch fix for the bug descriped below.
> Use MOUNT_OPTIONS="-o check_int" when run xfstest, device
> Can not be mount successfully. So xfstest can not run.

So please update the changelog and resend. Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: fix max chunk size on dup

2017-10-10 Thread David Sterba
Hi,

On Fri, Sep 29, 2017 at 04:20:51PM +0900, Naohiro Aota wrote:
> Balancing a fresh METADATA=dup btrfs file system (with size < 50G)
> generates a 128MB sized block group. While we set max_stripe_size =
> max_chunk_size = 256MB, we get this half sized block group:
> 
> $ btrfs ins dump-t -t CHUNK_TREE btrfs.img|grep length
> length 8388608 owner 2 stripe_len 65536 type DATA
> length 33554432 owner 2 stripe_len 65536 type SYSTEM|DUP
> length 134217728 owner 2 stripe_len 65536 type METADATA|DUP
> 
> Before commit 86db25785a6e ("Btrfs: fix max chunk size on raid5/6"), we
> used "stripe_size * ndevs > max_chunk_size * ncopies" to check the max
> chunk size. Since stripe_size = 256MB * dev_stripes (= 2) = 512MB, ndevs
> = 1, max_chunk_size = 256MB, and ncopies = 2, we allowed 256MB
> METADATA|DUP block group.
> 
> But now, we use "stripe_size * data_stripes > max_chunk_size". Since
> data_stripes = 1 on DUP, it disallows the block group to have > 128MB.
> What missing here is "dev_stripes". Proper logical space used by the block
> group is "stripe_size * data_stripes / dev_stripes". Tweak the equations to
> use the right value.

I started looking into it and still don't fully understand it. Change
deep in the allocator can easily break some blockgroup combinations, so
I'm rather conservative here.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: use appropriate replacements for __sb_{start,end}_write calls

2017-10-10 Thread Rakesh Pandit
On Tue, Oct 10, 2017 at 02:00:20PM +0300, Nikolay Borisov wrote:
> 
> 
> On 10.10.2017 13:48, Rakesh Pandit wrote:
> > Commit a53f4f8e9c8eb ("btrfs: Don't call btrfs_start_transaction() on
> > frozen fs to avoid deadlock.") started using internal calls and we
> > replace them with more suitable ones.
> > 
> > Signed-off-by: Rakesh Pandit 
> > ---
> >  fs/btrfs/super.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> > index 35a128a..99c21ae 100644
> > --- a/fs/btrfs/super.c
> > +++ b/fs/btrfs/super.c
> > @@ -1205,8 +1205,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
> >  * happens. The pending operations are delayed to the
> >  * next commit after thawing.
> >  */
> > -   if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
> > -   __sb_end_write(sb, SB_FREEZE_WRITE);
> > +   if (sb_start_write_trylock(sb))
> > +   sb_end_write(sb)
> > else
> > return 0;
> > trans = btrfs_start_transaction(root, 0);
> 
> The non __ versions are just wrappers around the __ specific calls. So
> the code is identical.
> 
> Reviewed-by: Nikolay Borisov 
> 

Thanks, yes indeed.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: use appropriate replacements for __sb_{start,end}_write calls

2017-10-10 Thread Nikolay Borisov


On 10.10.2017 13:48, Rakesh Pandit wrote:
> Commit a53f4f8e9c8eb ("btrfs: Don't call btrfs_start_transaction() on
> frozen fs to avoid deadlock.") started using internal calls and we
> replace them with more suitable ones.
> 
> Signed-off-by: Rakesh Pandit 
> ---
>  fs/btrfs/super.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 35a128a..99c21ae 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -1205,8 +1205,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
>* happens. The pending operations are delayed to the
>* next commit after thawing.
>*/
> - if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
> - __sb_end_write(sb, SB_FREEZE_WRITE);
> + if (sb_start_write_trylock(sb))
> + sb_end_write(sb)
>   else
>   return 0;

On second thought, what's to prevent the filesystem to be frozen if
sb_start_write/sb_end_write code executes? Or even after we are in the
middle of btrfs_start_transaction?


>   trans = btrfs_start_transaction(root, 0);
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: use appropriate replacements for __sb_{start,end}_write calls

2017-10-10 Thread Nikolay Borisov


On 10.10.2017 13:48, Rakesh Pandit wrote:
> Commit a53f4f8e9c8eb ("btrfs: Don't call btrfs_start_transaction() on
> frozen fs to avoid deadlock.") started using internal calls and we
> replace them with more suitable ones.
> 
> Signed-off-by: Rakesh Pandit 
> ---
>  fs/btrfs/super.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 35a128a..99c21ae 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -1205,8 +1205,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
>* happens. The pending operations are delayed to the
>* next commit after thawing.
>*/
> - if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
> - __sb_end_write(sb, SB_FREEZE_WRITE);
> + if (sb_start_write_trylock(sb))
> + sb_end_write(sb)
>   else
>   return 0;
>   trans = btrfs_start_transaction(root, 0);

The non __ versions are just wrappers around the __ specific calls. So
the code is identical.

Reviewed-by: Nikolay Borisov 

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: use appropriate replacements for __sb_{start,end}_write calls

2017-10-10 Thread Rakesh Pandit
Commit a53f4f8e9c8eb ("btrfs: Don't call btrfs_start_transaction() on
frozen fs to avoid deadlock.") started using internal calls and we
replace them with more suitable ones.

Signed-off-by: Rakesh Pandit 
---
 fs/btrfs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 35a128a..99c21ae 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1205,8 +1205,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 * happens. The pending operations are delayed to the
 * next commit after thawing.
 */
-   if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
-   __sb_end_write(sb, SB_FREEZE_WRITE);
+   if (sb_start_write_trylock(sb))
+   sb_end_write(sb)
else
return 0;
trans = btrfs_start_transaction(root, 0);
-- 
2.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] cgroup, writeback: implement submit_bh_blkcg_css()

2017-10-10 Thread Jan Kara
On Mon 09-10-17 14:29:10, Tejun Heo wrote:
> Add wbc->blkcg_css so that the blkcg_css association can be specified
> independently and implement submit_bh_blkcg_css() using it.  This will
> be used to override cgroup membership on specific buffer_heads.
> 
> Signed-off-by: Tejun Heo 
> Cc: Jan Kara 
> Cc: Jens Axboe 

Hum, I dislike growing wbc just to pass blkcg_css through one function in
fs/buffer.c (in otherwise empty wbc). Not that space would be a big concern
for wbc but it just gets messier... Cannot we just refactor the code a
bit like:

1) Create function

struct bio *create_bio_bh(int op, int op_flags, struct buffer_head *bh,
  enum rw_hint write_hint);

which would create bio from bh.

2) Make submit_bh(), submit_bh_wbc(), submit_bh_blkcg_css() use this and
the latter two would further tweak the bio as needed before submitting.

Thoughts?

Honza


> ---
>  fs/buffer.c | 12 
>  fs/fs-writeback.c   |  1 +
>  include/linux/buffer_head.h | 11 +++
>  include/linux/writeback.h   |  6 --
>  4 files changed, 28 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 170df85..fac4f9a 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -3143,6 +3143,18 @@ int submit_bh(int op, int op_flags, struct buffer_head 
> *bh)
>  }
>  EXPORT_SYMBOL(submit_bh);
>  
> +#ifdef CONFIG_CGROUP_WRITEBACK
> +int submit_bh_blkcg_css(int op, int op_flags,  struct buffer_head *bh,
> + struct cgroup_subsys_state *blkcg_css)
> +{
> + struct writeback_control wbc = { .blkcg_css = blkcg_css };
> +
> + /* @wbc is used just to override the bio's blkcg_css */
> + return submit_bh_wbc(op, op_flags, bh, 0, );
> +}
> +EXPORT_SYMBOL(submit_bh_blkcg_css);
> +#endif
> +
>  /**
>   * ll_rw_block: low-level access to block devices (DEPRECATED)
>   * @op: whether to %READ or %WRITE
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 245c430..cd882e6 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -538,6 +538,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control 
> *wbc,
>   }
>  
>   wbc->wb = inode_to_wb(inode);
> + wbc->blkcg_css = wbc->wb->blkcg_css;
>   wbc->inode = inode;
>  
>   wbc->wb_id = wbc->wb->memcg_css->id;
> diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
> index c8dae55..abb4dd4 100644
> --- a/include/linux/buffer_head.h
> +++ b/include/linux/buffer_head.h
> @@ -13,6 +13,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #ifdef CONFIG_BLOCK
>  
> @@ -205,6 +206,16 @@ int bh_submit_read(struct buffer_head *bh);
>  loff_t page_cache_seek_hole_data(struct inode *inode, loff_t offset,
>loff_t length, int whence);
>  
> +#ifdef CONFIG_CGROUP_WRITEBACK
> +int submit_bh_blkcg(int op, int op_flags, struct buffer_head *bh,
> + struct cgroup_subsys_state *blkcg_css);
> +#else
> +static inline int submit_bh_blkcg(int op, int op_flags, struct buffer_head 
> *bh,
> +   struct cgroup_subsys_state *blkcg_css)
> +{
> + return submit_bh(op, op_flags, bh);
> +}
> +#endif
>  extern int buffer_heads_over_limit;
>  
>  /*
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index d581579..81e5946 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -91,6 +91,8 @@ struct writeback_control {
>   unsigned for_sync:1;/* sync(2) WB_SYNC_ALL writeback */
>  #ifdef CONFIG_CGROUP_WRITEBACK
>   struct bdi_writeback *wb;   /* wb this writeback is issued under */
> + struct cgroup_subsys_state *blkcg_css; /* usually wb->blkcg_css but
> +   may be overridden */
>   struct inode *inode;/* inode being written out */
>  
>   /* foreign inode detection, see wbc_detach_inode() */
> @@ -277,8 +279,8 @@ static inline void wbc_init_bio(struct writeback_control 
> *wbc, struct bio *bio)
>* behind a slow cgroup.  Ultimately, we want pageout() to kick off
>* regular writeback instead of writing things out itself.
>*/
> - if (wbc->wb)
> - bio_associate_blkcg(bio, wbc->wb->blkcg_css);
> + if (wbc->blkcg_css)
> + bio_associate_blkcg(bio, wbc->blkcg_css);
>  }
>  
>  #else/* CONFIG_CGROUP_WRITEBACK */
> -- 
> 2.9.5
> 
-- 
Jan Kara 
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ANNOUNCE] fsperf: a simple fs/block performance testing framework

2017-10-10 Thread Mel Gorman
On Tue, Oct 10, 2017 at 08:09:20AM +1100, Dave Chinner wrote:
> > I'd _like_ to expand fio for cases we come up with that aren't possible, as
> > there's already a ton of measurements that are taken, especially around
> > latencies.
> 
> To be properly useful it needs to support more than just fio to run
> tests. Indeed, it's largely useless to me if that's all it can do or
> it's a major pain to add support for different tools like fsmark.
> 
> e.g.  my typical perf regression test that you see the concurrnet
> fsmark create workload is actually a lot more. It does:
> 
>   fsmark to create 50m zero length files
>   umount,
>   run parallel xfs_repair (excellent mmap_sem/page fault punisher)
>   mount
>   run parallel find -ctime (readdir + lookup traversal)
>   unmount, mount
>   run parallel ls -R (readdir + dtype traversal)
>   unmount, mount
>   parallel rm -rf of 50m files
> 
> I have variants that use small 4k files or large files rather than
> empty files, taht use different fsync patterns to stress the
> log, use grep -R to traverse the data as well as
> the directory/inode structure instead of find, etc.
> 

FWIW, this is partially implemented in mmtests as
configs/config-global-dhp__io-xfsrepair. It covers the fsmark and
xfs_repair part and an example report is

http://beta.suse.com/private/mgorman/results/home/marvin/openSUSE-LEAP-42.2/global-dhp__io-xfsrepair-xfs/delboy/#xfsrepair

(ignore 4.12.603, it's 4.12.3-stable with some additional patches that were
pending for -stable at the time the test was executed). That config was
added after a discussion with you a few years ago and I've kept it since as
it has been useful in a number of contexts. Adding additional tests to cover
parallel find, parallel ls and parallel rm would be relatively trivial but
it's not there. This is a test that doesn't have proper graphing support
but it could be added in 10-15 minutes as xfsrepair is the primary metric
and it's simply reported as elapsed time.

fsmark is also covered albeit not necessarily with parameters everyone wants
as configs/config-global-dhp__io-metadata in mmtests.  Example report is

http://beta.suse.com/private/mgorman/results/home/marvin/openSUSE-LEAP-42.2/global-dhp__io-metadata-xfs/delboy/#fsmark-threaded

mmtests has been modified multiple times as according as methodologies
were improved and it's far from perfect but it seems to me that fsperf
is going to end up reimplementing a lot of it.

It's not perfect as there are multiple quality-of-implementation issues
as it often takes the shortest path to being able to collect data but
it improves over time. When a test is found to be flawed, it's fixed and
historical data is discarded. It doesn't store data in sqlite or anything
fancy, just the raw logs are preserved and reports generated as required. In
terms of tools required, the core is just bash scripts. Some of the tests
require a number of packages to be installed but not all of them. It uses a
tool to install packages if they are missing but the naming is all based on
opensuse. It *can* map opensuse package names to fedora and debian but the
mappings are not up-to-date as I do not personally run those distributions.

Even with the quality-of-implementation issues, it seems to me that it
covers a lot of the requirements that fsperf aims for.

-- 
Mel Gorman
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] cgroup, writeback: replace SB_I_CGROUPWB with per-inode S_CGROUPWB

2017-10-10 Thread Jan Kara
On Mon 09-10-17 14:29:09, Tejun Heo wrote:
> Currently, filesystem can indiate cgroup writeback support per
> superblock; however, depending on the filesystem, especially if inodes
> are used to carry metadata, it can be useful to indicate cgroup
> writeback support per inode.
> 
> This patch replaces the superblock flag SB_I_CGROUPWB with per-inode
> S_CGROUPWB, so that cgroup writeback can be enabled selectively.
> 
> * block_dev sets the new flag in bdget() when initializing new inode.
> 
> * ext2/4 set the new flag in ext?_set_inode_flags() function.
> 
> * btrfs sets the new flag in btrfs_update_iflags() function.  Note
>   that this automatically excludes btree_inode which doesn't use
>   btrfs_update_iflags() during initialization.  This is an intended
>   behavior change.
> 
> Signed-off-by: Tejun Heo 
> Cc: Jan Kara 
> Cc: Jens Axboe 
> Cc: Chris Mason 
> Cc: Josef Bacik 
> Cc: linux-btrfs@vger.kernel.org
> Cc: "Theodore Ts'o" 
> Cc: Andreas Dilger 
> Cc: linux-e...@vger.kernel.org

...

> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 31db875..344f12b 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4591,8 +4591,11 @@ void ext4_set_inode_flags(struct inode *inode)
>   !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) &&
>   !ext4_encrypted_inode(inode))
>   new_fl |= S_DAX;
> + if (test_opt(inode->i_sb, DATA_FLAGS) != EXT4_MOUNT_JOURNAL_DATA)
> + new_fl |= S_CGROUPWB;

Use ext4_should_journal_data(inode) here? Ext4 can be journalling data also
because of per-inode flag.

Otherwise the patch looks good. You can add:

Reviewed-by: Jan Kara 

Honza
-- 
Jan Kara 
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: prefix sysfs attribute struct names

2017-10-10 Thread Hans van Kranenburg
On 10/09/2017 06:17 PM, David Sterba wrote:
> On Sun, Oct 08, 2017 at 10:30:58PM +0200, Hans van Kranenburg wrote:
>> Currently struct names for sysfs are generated only based on the
>> attribute names. This means that attribute names cannot be reused in
>> multiple places throughout the complete btrfs sysfs hierarchy.
>>
>> E.g. allocation/data/total_bytes and allocation/data/single/total_bytes
>> result in the same struct name btrfs_attr_total_bytes. A workaround for
>> this case was made in the past by ad hoc creating an extra macro
>> wrapper, BTRFS_RAID_ATTR, that inserts some extra text in the struct
>> name.
>>
>> Instead of polluting sysfs.h with such kind of extra macro definitions,
>> and only doing so when there are collisions, use a prefix which gets
>> inserted in the struct name, so we keep everything nicely grouped
>> together by default.
>>
>> Current collections of attributes are:
>> * (the toplevel, empty prefix)
>> * allocation
>> * space_info
>> * raid
>> * features
>>
>> Signed-off-by: Hans van Kranenburg 
> 
> Reviewed-by: David Sterba 

Thanks! If anyone wonders why...

Last summer I was trying to build in some metadata cow counters per
tree, and initially put them in sysfs, then I ran into the fact that I
couldn't use the field free_space_tree as a counter because it was also
used in the features.

It was a nice exercise and the results clearly showed that my suspicions
about extent tree rumination problems were right.

However, then I discovered tracepoints and could throw away the code again.

But I still finished this part, because it makes sysfs nicer for a
future developer who wants to change something :D

>> @@ -277,7 +277,8 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
>>  
>>  down_read(>groups_sem);
>>  list_for_each_entry(block_group, >block_groups[index], list) {
>> -if (>attr == BTRFS_RAID_ATTR_PTR(total_bytes))
>> +if (>attr == \
> 
> the \ is not needed here, only in macro defintions that must be on one
> logical line

Ha, yes... At some point the backslashes were dancing around before my
eyes while getting all those macros in shape again. Some started
wandering off it seems.

> 
>> +BTRFS_ATTR_PTR(raid, total_bytes))
>>  val += block_group->key.offset;
>>  else
>>  val += btrfs_block_group_used(_group->item);
>> @@ -331,19 +332,20 @@ SPACE_INFO_ATTR(bytes_may_use);
>>  SPACE_INFO_ATTR(bytes_readonly);
>>  SPACE_INFO_ATTR(disk_used);
>>  SPACE_INFO_ATTR(disk_total);
>> -BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned);
>> +BTRFS_ATTR(space_info, total_bytes_pinned, \
> 
> same here. will be fixed at commit time.
> 
>> +   btrfs_space_info_show_total_bytes_pinned);
>>  


-- 
Hans van Kranenburg
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs-progs: qgroup: show subvol path when qgroup show

2017-10-10 Thread Lu Fengqi
Show the absolute subvol path for the associated level-0 qgroup.

Signed-off-by: Lu Fengqi 
---
 Documentation/btrfs-qgroup.asciidoc |   2 +
 cmds-qgroup.c   |   7 +-
 qgroup.c| 130 +---
 qgroup.h|   1 +
 4 files changed, 129 insertions(+), 11 deletions(-)

diff --git a/Documentation/btrfs-qgroup.asciidoc 
b/Documentation/btrfs-qgroup.asciidoc
index 3053f2e6..fb48a917 100644
--- a/Documentation/btrfs-qgroup.asciidoc
+++ b/Documentation/btrfs-qgroup.asciidoc
@@ -93,6 +93,8 @@ Show all qgroups in the btrfs filesystem identified by .
 print parent qgroup id.
 -c
 print child qgroup id.
+-P
+print subvol path.
 -r
 print limit of referenced size of qgroup.
 -e
diff --git a/cmds-qgroup.c b/cmds-qgroup.c
index 38382ea9..9dce84ae 100644
--- a/cmds-qgroup.c
+++ b/cmds-qgroup.c
@@ -277,6 +277,7 @@ static const char * const cmd_qgroup_show_usage[] = {
"Show subvolume quota groups.",
"-p print parent qgroup id",
"-c print child qgroup id",
+   "-P print subvol path",
"-r print limit of referenced size of qgroup",
"-e print limit of exclusive size of qgroup",
"-F list all qgroups which impact the given path",
@@ -322,7 +323,7 @@ static int cmd_qgroup_show(int argc, char **argv)
{ NULL, 0, NULL, 0 }
};
 
-   c = getopt_long(argc, argv, "pcreFf", long_options, NULL);
+   c = getopt_long(argc, argv, "pcPreFf", long_options, NULL);
if (c < 0)
break;
switch (c) {
@@ -334,6 +335,10 @@ static int cmd_qgroup_show(int argc, char **argv)
btrfs_qgroup_setup_print_column(
BTRFS_QGROUP_CHILD);
break;
+   case 'P':
+   btrfs_qgroup_setup_print_column(
+   BTRFS_QGROUP_PATH);
+   break;
case 'r':
btrfs_qgroup_setup_print_column(
BTRFS_QGROUP_MAX_RFER);
diff --git a/qgroup.c b/qgroup.c
index fffdbb12..d618107e 100644
--- a/qgroup.c
+++ b/qgroup.c
@@ -62,6 +62,9 @@ struct btrfs_qgroup {
struct list_head qgroups;
/*qgroups that are members of this group*/
struct list_head members;
+
+   /* level-0 qgroup's subvolume path */
+   char *path;
 };
 
 /*
@@ -133,6 +136,13 @@ static struct {
.unit_mode  = 0,
.max_len= 5,
},
+   {
+   .name   = "path",
+   .column_name= "Path",
+   .need_print = 0,
+   .unit_mode  = 0,
+   .max_len= 4,
+   },
{
.name   = NULL,
.column_name= NULL,
@@ -253,6 +263,9 @@ static void print_qgroup_column(struct btrfs_qgroup *qgroup,
len = print_child_column(qgroup);
print_qgroup_column_add_blank(BTRFS_QGROUP_CHILD, len);
break;
+   case BTRFS_QGROUP_PATH:
+   len = printf("%s", qgroup->path ? qgroup->path : "---");
+   print_qgroup_column_add_blank(BTRFS_QGROUP_PATH, len);
default:
break;
}
@@ -267,7 +280,7 @@ static void print_single_qgroup_table(struct btrfs_qgroup 
*qgroup)
continue;
print_qgroup_column(qgroup, i);
 
-   if (i != BTRFS_QGROUP_CHILD)
+   if (i != BTRFS_QGROUP_ALL - 1)
printf(" ");
}
printf("\n");
@@ -284,7 +297,7 @@ static void print_table_head(void)
if (!btrfs_qgroup_columns[i].need_print)
continue;
if ((i == BTRFS_QGROUP_QGROUPID) | (i == BTRFS_QGROUP_PARENT) |
-   (i == BTRFS_QGROUP_CHILD))
+   (i == BTRFS_QGROUP_CHILD) | (i == BTRFS_QGROUP_PATH))
printf("%-*s", max_len, btrfs_qgroup_columns[i].name);
else
printf("%*s", max_len, btrfs_qgroup_columns[i].name);
@@ -296,7 +309,7 @@ static void print_table_head(void)
if (!btrfs_qgroup_columns[i].need_print)
continue;
if ((i == BTRFS_QGROUP_QGROUPID) | (i == BTRFS_QGROUP_PARENT) |
-   (i == BTRFS_QGROUP_CHILD)) {
+   (i == BTRFS_QGROUP_CHILD) | (i == BTRFS_QGROUP_PATH)) {
len = strlen(btrfs_qgroup_columns[i].name);
while (len--)
printf("-");
@@ -627,7 +640,7 @@ static int add_qgroup(struct qgroup_lookup *qgroup_lookup, 
u64 qgroupid,
  u64 

Re: [PATCH 3/4] btrfs-progs: check: Also check unalignment/mismatch device and super size

2017-10-10 Thread Qu Wenruo



On 2017年10月10日 16:31, Nikolay Borisov wrote:



On 10.10.2017 10:51, Qu Wenruo wrote:

Along with the fix introduced, also introduce check for them.
Unlike normal check funtions, some of the check is optional, and even if
the image failed to pass optional check, kernel can still run fine.
(But may cause noisy kernel warning)

So some check, mainly for alignment, will not cause btrfs check to fail,
but only to output warning and how to fix it.

Signed-off-by: Qu Wenruo 
---
  cmds-check.c | 73 
  1 file changed, 73 insertions(+)

diff --git a/cmds-check.c b/cmds-check.c
index fdb6d832eee1..23dd3b51102b 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -9879,6 +9879,67 @@ static int check_device_used(struct device_record 
*dev_rec,
}
  }
  
+/*

+ * Extra (optional) check for dev_item size
+ *
+ * To avoid possible kernel warning for v4.14 kernel.
+ * It's not a deadly problem, just to info v4.14 kernel user.
+ * So it won't change the return value.
+ */
+static void check_dev_size_alignment(u64 devid, u64 total_bytes,
+u32 sectorsize)
+{
+   if (!IS_ALIGNED(total_bytes, sectorsize)) {
+   warning("unaligned total_bytes detected for devid %llu, have %llu 
should be aligned to %u",
+   devid, total_bytes, sectorsize);
+   warning("this is OK for older kernel (<4.14), but may cause kernel 
warning for newer kernel (>=4.14)");
+   warning("this can be fixed by 'btrfs check --fix-dev-size'");
+   }
+}
+
+/*
+ * Unlike device size alignment check above, some super total_bytes check
+ * failure can lead to unmountable fs for kernel >=v4.6.
+ *
+ * So this function will return the error for fatal super total_bytes problem.
+ */
+static int check_super_size(struct btrfs_fs_info *fs_info)
+{
+   struct btrfs_device *dev;
+   struct list_head *dev_list = _info->fs_devices->devices;
+   u64 total_bytes = 0;
+   u64 super_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+
+   list_for_each_entry(dev, dev_list, dev_list)
+   total_bytes += dev->total_bytes;
+
+   /* Important check, which can cause unmountable fs */
+   if (super_bytes < total_bytes) {
+   error("super total bytes %llu smaller than real devices size 
%llu",
+   super_bytes, total_bytes);
+   error("this fs will not be mountable for newer kernels 
(>=v4.6)");
+   error("this can be fixed by 'btrfs check --fix-dev-size'");
+   return 1;
+   }
+
+   /*
+* Optional check, just to make everything aligned and match with
+* each other.
+*
+* For btrfs-image restored fs, we don't need to check it anyway.
+*/
+   if (btrfs_super_flags(fs_info->super_copy) &
+   (BTRFS_SUPER_FLAG_METADUMP | BTRFS_SUPER_FLAG_METADUMP_V2))
+   return 0;
+   if (!IS_ALIGNED(super_bytes, fs_info->sectorsize) ||
+   !IS_ALIGNED(total_bytes, fs_info->sectorsize) ||
+   super_bytes != total_bytes) {
+   warning("minor unaligned/mismatch device size detected");
+   warning("recommended to use 'btrfs check --fix-dev-size' to fix 
it");
+   }
+   return 0;
+}


nit: Make the function return bool and perhaps rename it to
"is_super_size_aligned" or something like that ?


+
  /* check btrfs_dev_item -> btrfs_dev_extent */
  static int check_devices(struct rb_root *dev_cache,
 struct device_extent_tree *dev_extent_cache)
@@ -9896,6 +9957,11 @@ static int check_devices(struct rb_root *dev_cache,
if (err)
ret = err;
  
+		if (!IS_ALIGNED(dev_rec->total_byte, global_info->sectorsize)) {

+   }
+
+   check_dev_size_alignment(dev_rec->devid, dev_rec->total_byte,
+global_info->sectorsize);
dev_node = rb_next(dev_node);


Something funny is going on here, the body of the if is empty, perhaps
those function have to be inside?


Oh...

That's embarrassing, that's the old code where I didn't encapsulate the 
check into check_dev_size_alignment().


I'll update the patchset to address all your comment.

Thanks,
Qu




}
list_for_each_entry(dext_rec, _extent_cache->no_device_orphans,
@@ -11141,6 +11207,7 @@ static int check_dev_item(struct btrfs_fs_info *fs_info,
struct btrfs_path path;
struct btrfs_key key;
struct btrfs_dev_extent *ptr;
+   u64 total_bytes;
u64 dev_id;
u64 used;
u64 total = 0;
@@ -11149,6 +11216,7 @@ static int check_dev_item(struct btrfs_fs_info *fs_info,
dev_item = btrfs_item_ptr(eb, slot, struct btrfs_dev_item);
dev_id = btrfs_device_id(eb, dev_item);
used = btrfs_device_bytes_used(eb, dev_item);
+   total_bytes = 

Re: [PATCH 0/4] btrfs-progs repair support for unaligned/mismatched device sizes

2017-10-10 Thread Qu Wenruo



On 2017年10月10日 16:15, Nikolay Borisov wrote:



On 10.10.2017 10:51, Qu Wenruo wrote:

The patchset can be fetched from github:
https://github.com/adam900710/btrfs-progs/tree/check_unaligned_dev

There are several reports in mail list for btrfs device size related
problems.

1) Unmountable fs, due to mismatched super total_bytes
Unmountable if super total_bytes is smaller than total rw bytes of
all devices.
Root cause under investigation, but only one report here.


Don't you mean mountable? We've internally seen mounting being a problem
due to said size mismatch.


I mean the fs is unable to be mounted.
(Which is the correct single word to express "unable to be mounted"?)

The direct cause is obvious, strict super block total_bytes checker.

But I don't know how it happened.

Maybe there is a window between some kernel release make it out of sync, 
but anyway, your introduced alignment enhancement commit should handle 
them well in v4.14.


At least for worst case, such users with fs unable to mount, won't be 
left in cold wind.

And new users starting from v4.13 should not hit such problems.

Thanks,
Qu





This patchset provides the tool to fix it offline.
(At least better than unmountable forever)

2) Harmless kernel warning for btrfs_update_device()
v4.14 introduced restrict device size checker.
This somewhat break the backward compatibility and causing kernel
warning.

It can be fixed online with "btrfs filesystem resize".
(Although it is better to fixed it at mount time)

This patchset also provide a fallback method to fix it.

Qu Wenruo (4):
   btrfs-progs: Introduce functions to repair unaligned/mismatch device
 size
   btrfs-progs: fsck: Introduce --fix-dev-size option
   btrfs-progs: check: Also check unalignment/mismatch device and super
 size
   btrfs-progs: test/fsck: Add test case image for --fix-dev-size

Qu Wenruo (4):
   btrfs-progs: Introduce functions to repair unaligned/mismatch device
 size
   btrfs-progs: fsck: Introduce --fix-dev-size option
   btrfs-progs: check: Also check unalignment/mismatch device and super
 size
   btrfs-progs: test/fsck: Add test case image for --fix-dev-size

  Documentation/btrfs-check.asciidoc |  23 ++
  cmds-check.c   | 292 -
  .../dev_and_super_mismatch_unaligned.raw.xz| Bin 0 -> 21536 bytes
  3 files changed, 314 insertions(+), 1 deletion(-)
  create mode 100644 
tests/fsck-tests/027-unaligned-super-dev-sizes/dev_and_super_mismatch_unaligned.raw.xz


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] btrfs-progs: check: Also check unalignment/mismatch device and super size

2017-10-10 Thread Nikolay Borisov


On 10.10.2017 10:51, Qu Wenruo wrote:
> Along with the fix introduced, also introduce check for them.
> Unlike normal check funtions, some of the check is optional, and even if
> the image failed to pass optional check, kernel can still run fine.
> (But may cause noisy kernel warning)
> 
> So some check, mainly for alignment, will not cause btrfs check to fail,
> but only to output warning and how to fix it.
> 
> Signed-off-by: Qu Wenruo 
> ---
>  cmds-check.c | 73 
> 
>  1 file changed, 73 insertions(+)
> 
> diff --git a/cmds-check.c b/cmds-check.c
> index fdb6d832eee1..23dd3b51102b 100644
> --- a/cmds-check.c
> +++ b/cmds-check.c
> @@ -9879,6 +9879,67 @@ static int check_device_used(struct device_record 
> *dev_rec,
>   }
>  }
>  
> +/*
> + * Extra (optional) check for dev_item size
> + *
> + * To avoid possible kernel warning for v4.14 kernel.
> + * It's not a deadly problem, just to info v4.14 kernel user.
> + * So it won't change the return value.
> + */
> +static void check_dev_size_alignment(u64 devid, u64 total_bytes,
> +  u32 sectorsize)
> +{
> + if (!IS_ALIGNED(total_bytes, sectorsize)) {
> + warning("unaligned total_bytes detected for devid %llu, have 
> %llu should be aligned to %u",
> + devid, total_bytes, sectorsize);
> + warning("this is OK for older kernel (<4.14), but may cause 
> kernel warning for newer kernel (>=4.14)");
> + warning("this can be fixed by 'btrfs check --fix-dev-size'");
> + }
> +}
> +
> +/*
> + * Unlike device size alignment check above, some super total_bytes check
> + * failure can lead to unmountable fs for kernel >=v4.6.
> + *
> + * So this function will return the error for fatal super total_bytes 
> problem.
> + */
> +static int check_super_size(struct btrfs_fs_info *fs_info)
> +{
> + struct btrfs_device *dev;
> + struct list_head *dev_list = _info->fs_devices->devices;
> + u64 total_bytes = 0;
> + u64 super_bytes = btrfs_super_total_bytes(fs_info->super_copy);
> +
> + list_for_each_entry(dev, dev_list, dev_list)
> + total_bytes += dev->total_bytes;
> +
> + /* Important check, which can cause unmountable fs */
> + if (super_bytes < total_bytes) {
> + error("super total bytes %llu smaller than real devices size 
> %llu",
> + super_bytes, total_bytes);
> + error("this fs will not be mountable for newer kernels 
> (>=v4.6)");
> + error("this can be fixed by 'btrfs check --fix-dev-size'");
> + return 1;
> + }
> +
> + /*
> +  * Optional check, just to make everything aligned and match with
> +  * each other.
> +  *
> +  * For btrfs-image restored fs, we don't need to check it anyway.
> +  */
> + if (btrfs_super_flags(fs_info->super_copy) &
> + (BTRFS_SUPER_FLAG_METADUMP | BTRFS_SUPER_FLAG_METADUMP_V2))
> + return 0;
> + if (!IS_ALIGNED(super_bytes, fs_info->sectorsize) ||
> + !IS_ALIGNED(total_bytes, fs_info->sectorsize) ||
> + super_bytes != total_bytes) {
> + warning("minor unaligned/mismatch device size detected");
> + warning("recommended to use 'btrfs check --fix-dev-size' to fix 
> it");
> + }
> + return 0;
> +}

nit: Make the function return bool and perhaps rename it to
"is_super_size_aligned" or something like that ?

> +
>  /* check btrfs_dev_item -> btrfs_dev_extent */
>  static int check_devices(struct rb_root *dev_cache,
>struct device_extent_tree *dev_extent_cache)
> @@ -9896,6 +9957,11 @@ static int check_devices(struct rb_root *dev_cache,
>   if (err)
>   ret = err;
>  
> + if (!IS_ALIGNED(dev_rec->total_byte, global_info->sectorsize)) {
> + }
> +
> + check_dev_size_alignment(dev_rec->devid, dev_rec->total_byte,
> +  global_info->sectorsize);
>   dev_node = rb_next(dev_node);

Something funny is going on here, the body of the if is empty, perhaps
those function have to be inside?


>   }
>   list_for_each_entry(dext_rec, _extent_cache->no_device_orphans,
> @@ -11141,6 +11207,7 @@ static int check_dev_item(struct btrfs_fs_info 
> *fs_info,
>   struct btrfs_path path;
>   struct btrfs_key key;
>   struct btrfs_dev_extent *ptr;
> + u64 total_bytes;
>   u64 dev_id;
>   u64 used;
>   u64 total = 0;
> @@ -11149,6 +11216,7 @@ static int check_dev_item(struct btrfs_fs_info 
> *fs_info,
>   dev_item = btrfs_item_ptr(eb, slot, struct btrfs_dev_item);
>   dev_id = btrfs_device_id(eb, dev_item);
>   used = btrfs_device_bytes_used(eb, dev_item);
> + total_bytes = btrfs_device_total_bytes(eb, dev_item);
>  
>   key.objectid = dev_id;
>   key.type = BTRFS_DEV_EXTENT_KEY;
> @@ 

Re: [PATCH 1/4] btrfs-progs: Introduce functions to repair unaligned/mismatch device size

2017-10-10 Thread Nikolay Borisov


On 10.10.2017 10:51, Qu Wenruo wrote:
> This patch introduces functions to repair device size related problems,
> including:
> 1) Unaligned total_bytes of dev_item
>v4.14-rc kernel introduced total_bytes alignment checker.
>However older kernel device add/shrink doesn't align these members.
>This will cause kernel warning every time dev_item get updated.
> 
>Although it can be fixed by shrinking device on latest kernel or
>use manually aligned size on older kernel, a fallback method in
>btrfs-progs won't hurt.
> 
> 2) Mismatch super->total_bytes
>There are some reports about unmountable fs, due to mismatched
>super->total_bytes.
>And normal kernel device shrink won't help since it only modify the
>total_bytes by the size changed, not re-calculating it.
> 
>The root cause is still under investigation, but at least to fix it
>in btrfs-progs
> 
> Fix all of them by manually rounding down total_bytes of each device, and
> recalculate super->total_bytes using all existing devices.

Thanks for doing this, we needed it. One minor nit in the comments but
overall:

Reviewed-by: Nikolay Borisov 


> 
> Reported-by: Asif Youssuff 
> Reported-by: Rich Rauenzahn 
> Signed-off-by: Qu Wenruo > ---
>  cmds-check.c | 191 
> +++
>  1 file changed, 191 insertions(+)
> 
> diff --git a/cmds-check.c b/cmds-check.c
> index 0c08618ed701..007781fa5d1b 100644
> --- a/cmds-check.c
> +++ b/cmds-check.c
> @@ -12885,6 +12885,197 @@ close_out:
>   return ret;
>  }
>  
> +/*
> + * Return 0 if DEV_ITEM for @device is good
> + * Return >0 if DEV_ITEM for @device has unaligned value and fixed
> + * Return <0 if we failed to fix the unaligned value
> + */
> +static int reset_one_dev_size(struct btrfs_fs_info *fs_info,
> +struct btrfs_device *device)
> +{
> + struct btrfs_trans_handle *trans;
> + struct btrfs_key key;
> + struct btrfs_path path;
> + struct btrfs_root *chunk_root = fs_info->chunk_root;
> + struct btrfs_dev_item *di;
> + u64 old_bytes = device->total_bytes;
> + int ret;
> +
> + if (IS_ALIGNED(device->total_bytes, fs_info->sectorsize))
> + return 0;
> +
> + /* Align the in-memory total_bytes first, and use it later */
> + device->total_bytes = round_down(device->total_bytes,
> +  fs_info->sectorsize);
> +
> + key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
> + key.type = BTRFS_DEV_ITEM_KEY;
> + key.offset = device->devid;
> +
> + trans = btrfs_start_transaction(chunk_root, 1);
> + if (IS_ERR(trans)) {
> + ret = PTR_ERR(trans);
> + error("error starting transaction:  %d (%s)",
> +   ret, strerror(-ret));
> + return ret;
> + }
> +
> + btrfs_init_path();
> + ret = btrfs_search_slot(trans, chunk_root, , , 0, 1);
> + if (ret > 0) {
> + error("failed to find DEV_ITEM for devid %llu",
> + device->devid);
> + ret = -ENOENT;
> + goto err;
> + }
> + if (ret < 0) {
> + error("failed to search chunk root: %d (%s)",
> + ret, strerror(-ret));
> + goto err;
> + }
> + di = btrfs_item_ptr(path.nodes[0], path.slots[0],
> + struct btrfs_dev_item);
> + btrfs_set_device_total_bytes(path.nodes[0], di, device->total_bytes);
> + btrfs_mark_buffer_dirty(path.nodes[0]);
> + ret = btrfs_commit_transaction(trans, chunk_root);
> + if (ret < 0) {
> + error("failed to commit current transaction: %d (%s)",
> + ret, strerror(-ret));
> + btrfs_release_path();
> + return ret;
> + }
> + btrfs_release_path();
> + printf("Fixed device size for devid %llu, old size: %llu new size: 
> %llu\n",
> + device->devid, old_bytes, device->total_bytes);
> + return 1;
> +
> +err:
> + /* We haven't modified anything, it's OK to commit current trans */
> + btrfs_commit_transaction(trans, chunk_root);
> + btrfs_release_path();
> + return ret;
> +}
> +
> +/*
> + * Return 0 if super block total bytes matches with device total_bytes
> + * Return >0 if super block has mismatch total_bytes and fixed
> + * Return <0 if we failed to fix the mismatch total_bytes
> + */
> +static int reset_super_total_bytes(struct btrfs_fs_info *fs_info)
> +{
> + struct btrfs_trans_handle *trans;
> + struct btrfs_device *device;
> + struct list_head *dev_list = _info->fs_devices->devices;
> + u64 total_bytes = 0;
> + u64 old_bytes = btrfs_super_total_bytes(fs_info->super_copy);
> + int ret;
> +
> + list_for_each_entry(device, dev_list, dev_list)
> + total_bytes += device->total_bytes;
> +
> + if (total_bytes == old_bytes)
> +  

Re: [PATCH 0/4] btrfs-progs repair support for unaligned/mismatched device sizes

2017-10-10 Thread Nikolay Borisov


On 10.10.2017 10:51, Qu Wenruo wrote:
> The patchset can be fetched from github:
> https://github.com/adam900710/btrfs-progs/tree/check_unaligned_dev
> 
> There are several reports in mail list for btrfs device size related
> problems.
> 
> 1) Unmountable fs, due to mismatched super total_bytes
>Unmountable if super total_bytes is smaller than total rw bytes of
>all devices.
>Root cause under investigation, but only one report here.

Don't you mean mountable? We've internally seen mounting being a problem
due to said size mismatch.

> 
>This patchset provides the tool to fix it offline.
>(At least better than unmountable forever)
> 
> 2) Harmless kernel warning for btrfs_update_device()
>v4.14 introduced restrict device size checker.
>This somewhat break the backward compatibility and causing kernel
>warning.
> 
>It can be fixed online with "btrfs filesystem resize".
>(Although it is better to fixed it at mount time)
> 
>This patchset also provide a fallback method to fix it.
> 
> Qu Wenruo (4):
>   btrfs-progs: Introduce functions to repair unaligned/mismatch device
> size
>   btrfs-progs: fsck: Introduce --fix-dev-size option
>   btrfs-progs: check: Also check unalignment/mismatch device and super
> size
>   btrfs-progs: test/fsck: Add test case image for --fix-dev-size
> 
> Qu Wenruo (4):
>   btrfs-progs: Introduce functions to repair unaligned/mismatch device
> size
>   btrfs-progs: fsck: Introduce --fix-dev-size option
>   btrfs-progs: check: Also check unalignment/mismatch device and super
> size
>   btrfs-progs: test/fsck: Add test case image for --fix-dev-size
> 
>  Documentation/btrfs-check.asciidoc |  23 ++
>  cmds-check.c   | 292 
> -
>  .../dev_and_super_mismatch_unaligned.raw.xz| Bin 0 -> 21536 bytes
>  3 files changed, 314 insertions(+), 1 deletion(-)
>  create mode 100644 
> tests/fsck-tests/027-unaligned-super-dev-sizes/dev_and_super_mismatch_unaligned.raw.xz
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/4] btrfs-progs repair support for unaligned/mismatched device sizes

2017-10-10 Thread Qu Wenruo
The patchset can be fetched from github:
https://github.com/adam900710/btrfs-progs/tree/check_unaligned_dev

There are several reports in mail list for btrfs device size related
problems.

1) Unmountable fs, due to mismatched super total_bytes
   Unmountable if super total_bytes is smaller than total rw bytes of
   all devices.
   Root cause under investigation, but only one report here.

   This patchset provides the tool to fix it offline.
   (At least better than unmountable forever)

2) Harmless kernel warning for btrfs_update_device()
   v4.14 introduced restrict device size checker.
   This somewhat break the backward compatibility and causing kernel
   warning.

   It can be fixed online with "btrfs filesystem resize".
   (Although it is better to fixed it at mount time)

   This patchset also provide a fallback method to fix it.

Qu Wenruo (4):
  btrfs-progs: Introduce functions to repair unaligned/mismatch device
size
  btrfs-progs: fsck: Introduce --fix-dev-size option
  btrfs-progs: check: Also check unalignment/mismatch device and super
size
  btrfs-progs: test/fsck: Add test case image for --fix-dev-size

Qu Wenruo (4):
  btrfs-progs: Introduce functions to repair unaligned/mismatch device
size
  btrfs-progs: fsck: Introduce --fix-dev-size option
  btrfs-progs: check: Also check unalignment/mismatch device and super
size
  btrfs-progs: test/fsck: Add test case image for --fix-dev-size

 Documentation/btrfs-check.asciidoc |  23 ++
 cmds-check.c   | 292 -
 .../dev_and_super_mismatch_unaligned.raw.xz| Bin 0 -> 21536 bytes
 3 files changed, 314 insertions(+), 1 deletion(-)
 create mode 100644 
tests/fsck-tests/027-unaligned-super-dev-sizes/dev_and_super_mismatch_unaligned.raw.xz

-- 
2.14.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/4] btrfs-progs: fsck: Introduce --fix-dev-size option

2017-10-10 Thread Qu Wenruo
Introduce --fix-dev-size option to fix device related problems.

This repairing  is also included in --repair, but considering the safety
and potential affected users, it's better to provide a option to fix and
only fix device size related problem to avoid full --repair.

Reported-by: Asif Youssuff 
Reported-by: Rich Rauenzahn 
Signed-off-by: Qu Wenruo 
---
 Documentation/btrfs-check.asciidoc | 23 +++
 cmds-check.c   | 28 +++-
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/Documentation/btrfs-check.asciidoc 
b/Documentation/btrfs-check.asciidoc
index fbf48847ac25..e45c7a457bac 100644
--- a/Documentation/btrfs-check.asciidoc
+++ b/Documentation/btrfs-check.asciidoc
@@ -93,6 +93,29 @@ the entire free space cache. This option with 'v2' provides 
an alternative
 method of clearing the free space cache that doesn't require mounting the
 filesystem.
 
+--fix-dev-size::
+From v4.14-rc kernels, a more restrict device size checker is introduced, while
+old kernel doesn't strictly align its device size, so it may cause noisy kernel
+warning for newer kernel, like:
++
+
+WARNING: CPU: 3 PID: 439 at fs/btrfs/ctree.h:1559 
btrfs_update_device+0x1c5/0x1d0 [btrfs]
+
++
+And for some case where super block total device size may mismatch with all
+devices, and the filesystem will be unable to be mounted, with kernel message
+like:
++
+
+BTRFS error (device sdb): super_total_bytes 92017859088384 mismatch with 
fs_devices total_rw_bytes 92017859094528 
+
++
+This option will fix both problems by aligning all size of devices, and
+re-calculating superblock total bytes.
++
+Although such repairing is included in *--repair* option, considering the
+safety of *--repair*, this option is provided to suppress all other dangerous
+repairing and only fix device sizes related problems.
 
 DANGEROUS OPTIONS
 -
diff --git a/cmds-check.c b/cmds-check.c
index 007781fa5d1b..fdb6d832eee1 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -11746,6 +11746,8 @@ out:
return err;
 }
 
+static int reset_devs_size(struct btrfs_fs_info *fs_info);
+
 static int do_check_chunks_and_extents(struct btrfs_fs_info *fs_info)
 {
int ret;
@@ -11756,6 +11758,12 @@ static int do_check_chunks_and_extents(struct 
btrfs_fs_info *fs_info)
ret = check_chunks_and_extents_v2(fs_info);
else
ret = check_chunks_and_extents(fs_info);
+   /* Also repair device sizes if needed */
+   if (repair && !ret) {
+   ret = reset_devs_size(fs_info);
+   if (ret > 0)
+   ret = 0;
+   }
 
return ret;
 }
@@ -13088,6 +13096,8 @@ const char * const cmd_check_usage[] = {
"-b|--backup use the first valid backup root copy",
"--force skip mount checks, repair is not possible",
"--repairtry to repair the filesystem",
+   "--fix-dev-size  repair device size related problem",
+   "will not trigger other repair",
"--readonly  run in read-only mode (default)",
"--init-csum-treecreate a new CRC tree",
"--init-extent-tree  create a new extent tree",
@@ -13128,6 +13138,7 @@ int cmd_check(int argc, char **argv)
int qgroups_repaired = 0;
unsigned ctree_flags = OPEN_CTREE_EXCLUSIVE;
int force = 0;
+   bool fix_dev_size = false;
 
while(1) {
int c;
@@ -13135,7 +13146,7 @@ int cmd_check(int argc, char **argv)
GETOPT_VAL_INIT_EXTENT, GETOPT_VAL_CHECK_CSUM,
GETOPT_VAL_READONLY, GETOPT_VAL_CHUNK_TREE,
GETOPT_VAL_MODE, GETOPT_VAL_CLEAR_SPACE_CACHE,
-   GETOPT_VAL_FORCE };
+   GETOPT_VAL_FORCE, GETOPT_VAL_FIX_DEV_SIZE };
static const struct option long_options[] = {
{ "super", required_argument, NULL, 's' },
{ "repair", no_argument, NULL, GETOPT_VAL_REPAIR },
@@ -13158,6 +13169,8 @@ int cmd_check(int argc, char **argv)
{ "clear-space-cache", required_argument, NULL,
GETOPT_VAL_CLEAR_SPACE_CACHE},
{ "force", no_argument, NULL, GETOPT_VAL_FORCE },
+   { "fix-dev-size", no_argument, NULL,
+   GETOPT_VAL_FIX_DEV_SIZE },
{ NULL, 0, NULL, 0}
};
 
@@ -13245,6 +13258,11 @@ int cmd_check(int argc, char **argv)
case GETOPT_VAL_FORCE:
force = 1;
break;
+   case GETOPT_VAL_FIX_DEV_SIZE:
+   fix_dev_size = true;
+  

[PATCH 4/4] btrfs-progs: test/fsck: Add test case image for --fix-dev-size

2017-10-10 Thread Qu Wenruo
The image has 2 problems mixed:

1) Too small super total_bytes
   This super total_bytes is manually modified to create such problem.

2) Unaligned dev item total_bytes
   This is created by v4.12 kernel, with 128M + 2K device added, and
   original device removed.
   Then we can create such image with unaligned dev item total_bytes.

Signed-off-by: Qu Wenruo 
---
 .../dev_and_super_mismatch_unaligned.raw.xz | Bin 0 -> 21536 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 
tests/fsck-tests/027-unaligned-super-dev-sizes/dev_and_super_mismatch_unaligned.raw.xz

diff --git 
a/tests/fsck-tests/027-unaligned-super-dev-sizes/dev_and_super_mismatch_unaligned.raw.xz
 
b/tests/fsck-tests/027-unaligned-super-dev-sizes/dev_and_super_mismatch_unaligned.raw.xz
new file mode 100644
index 
..153e514a89d5f50a7e6b4f9c3d3214896a4070cd
GIT binary patch
literal 21536
zcmeI4XIN8d7RM6^O{57*lP*o^y(5T75s)4_2uSZmrAd*16zK>K7>W=;sUj#fA{`P0

[PATCH 3/4] btrfs-progs: check: Also check unalignment/mismatch device and super size

2017-10-10 Thread Qu Wenruo
Along with the fix introduced, also introduce check for them.
Unlike normal check funtions, some of the check is optional, and even if
the image failed to pass optional check, kernel can still run fine.
(But may cause noisy kernel warning)

So some check, mainly for alignment, will not cause btrfs check to fail,
but only to output warning and how to fix it.

Signed-off-by: Qu Wenruo 
---
 cmds-check.c | 73 
 1 file changed, 73 insertions(+)

diff --git a/cmds-check.c b/cmds-check.c
index fdb6d832eee1..23dd3b51102b 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -9879,6 +9879,67 @@ static int check_device_used(struct device_record 
*dev_rec,
}
 }
 
+/*
+ * Extra (optional) check for dev_item size
+ *
+ * To avoid possible kernel warning for v4.14 kernel.
+ * It's not a deadly problem, just to info v4.14 kernel user.
+ * So it won't change the return value.
+ */
+static void check_dev_size_alignment(u64 devid, u64 total_bytes,
+u32 sectorsize)
+{
+   if (!IS_ALIGNED(total_bytes, sectorsize)) {
+   warning("unaligned total_bytes detected for devid %llu, have 
%llu should be aligned to %u",
+   devid, total_bytes, sectorsize);
+   warning("this is OK for older kernel (<4.14), but may cause 
kernel warning for newer kernel (>=4.14)");
+   warning("this can be fixed by 'btrfs check --fix-dev-size'");
+   }
+}
+
+/*
+ * Unlike device size alignment check above, some super total_bytes check
+ * failure can lead to unmountable fs for kernel >=v4.6.
+ *
+ * So this function will return the error for fatal super total_bytes problem.
+ */
+static int check_super_size(struct btrfs_fs_info *fs_info)
+{
+   struct btrfs_device *dev;
+   struct list_head *dev_list = _info->fs_devices->devices;
+   u64 total_bytes = 0;
+   u64 super_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+
+   list_for_each_entry(dev, dev_list, dev_list)
+   total_bytes += dev->total_bytes;
+
+   /* Important check, which can cause unmountable fs */
+   if (super_bytes < total_bytes) {
+   error("super total bytes %llu smaller than real devices size 
%llu",
+   super_bytes, total_bytes);
+   error("this fs will not be mountable for newer kernels 
(>=v4.6)");
+   error("this can be fixed by 'btrfs check --fix-dev-size'");
+   return 1;
+   }
+
+   /*
+* Optional check, just to make everything aligned and match with
+* each other.
+*
+* For btrfs-image restored fs, we don't need to check it anyway.
+*/
+   if (btrfs_super_flags(fs_info->super_copy) &
+   (BTRFS_SUPER_FLAG_METADUMP | BTRFS_SUPER_FLAG_METADUMP_V2))
+   return 0;
+   if (!IS_ALIGNED(super_bytes, fs_info->sectorsize) ||
+   !IS_ALIGNED(total_bytes, fs_info->sectorsize) ||
+   super_bytes != total_bytes) {
+   warning("minor unaligned/mismatch device size detected");
+   warning("recommended to use 'btrfs check --fix-dev-size' to fix 
it");
+   }
+   return 0;
+}
+
 /* check btrfs_dev_item -> btrfs_dev_extent */
 static int check_devices(struct rb_root *dev_cache,
 struct device_extent_tree *dev_extent_cache)
@@ -9896,6 +9957,11 @@ static int check_devices(struct rb_root *dev_cache,
if (err)
ret = err;
 
+   if (!IS_ALIGNED(dev_rec->total_byte, global_info->sectorsize)) {
+   }
+
+   check_dev_size_alignment(dev_rec->devid, dev_rec->total_byte,
+global_info->sectorsize);
dev_node = rb_next(dev_node);
}
list_for_each_entry(dext_rec, _extent_cache->no_device_orphans,
@@ -11141,6 +11207,7 @@ static int check_dev_item(struct btrfs_fs_info *fs_info,
struct btrfs_path path;
struct btrfs_key key;
struct btrfs_dev_extent *ptr;
+   u64 total_bytes;
u64 dev_id;
u64 used;
u64 total = 0;
@@ -11149,6 +11216,7 @@ static int check_dev_item(struct btrfs_fs_info *fs_info,
dev_item = btrfs_item_ptr(eb, slot, struct btrfs_dev_item);
dev_id = btrfs_device_id(eb, dev_item);
used = btrfs_device_bytes_used(eb, dev_item);
+   total_bytes = btrfs_device_total_bytes(eb, dev_item);
 
key.objectid = dev_id;
key.type = BTRFS_DEV_EXTENT_KEY;
@@ -11193,6 +11261,8 @@ next:
BTRFS_DEV_EXTENT_KEY, dev_id);
return ACCOUNTING_MISMATCH;
}
+   check_dev_size_alignment(dev_id, total_bytes, fs_info->sectorsize);
+
return 0;
 }
 
@@ -13471,6 +13541,9 @@ int cmd_check(int argc, char **argv)
error(
"errors found in extent allocation tree or chunk 

[PATCH 1/4] btrfs-progs: Introduce functions to repair unaligned/mismatch device size

2017-10-10 Thread Qu Wenruo
This patch introduces functions to repair device size related problems,
including:
1) Unaligned total_bytes of dev_item
   v4.14-rc kernel introduced total_bytes alignment checker.
   However older kernel device add/shrink doesn't align these members.
   This will cause kernel warning every time dev_item get updated.

   Although it can be fixed by shrinking device on latest kernel or
   use manually aligned size on older kernel, a fallback method in
   btrfs-progs won't hurt.

2) Mismatch super->total_bytes
   There are some reports about unmountable fs, due to mismatched
   super->total_bytes.
   And normal kernel device shrink won't help since it only modify the
   total_bytes by the size changed, not re-calculating it.

   The root cause is still under investigation, but at least to fix it
   in btrfs-progs

Fix all of them by manually rounding down total_bytes of each device, and
recalculate super->total_bytes using all existing devices.

Reported-by: Asif Youssuff 
Reported-by: Rich Rauenzahn 
Signed-off-by: Qu Wenruo 
---
 cmds-check.c | 191 +++
 1 file changed, 191 insertions(+)

diff --git a/cmds-check.c b/cmds-check.c
index 0c08618ed701..007781fa5d1b 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -12885,6 +12885,197 @@ close_out:
return ret;
 }
 
+/*
+ * Return 0 if DEV_ITEM for @device is good
+ * Return >0 if DEV_ITEM for @device has unaligned value and fixed
+ * Return <0 if we failed to fix the unaligned value
+ */
+static int reset_one_dev_size(struct btrfs_fs_info *fs_info,
+  struct btrfs_device *device)
+{
+   struct btrfs_trans_handle *trans;
+   struct btrfs_key key;
+   struct btrfs_path path;
+   struct btrfs_root *chunk_root = fs_info->chunk_root;
+   struct btrfs_dev_item *di;
+   u64 old_bytes = device->total_bytes;
+   int ret;
+
+   if (IS_ALIGNED(device->total_bytes, fs_info->sectorsize))
+   return 0;
+
+   /* Align the in-memory total_bytes first, and use it later */
+   device->total_bytes = round_down(device->total_bytes,
+fs_info->sectorsize);
+
+   key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+   key.type = BTRFS_DEV_ITEM_KEY;
+   key.offset = device->devid;
+
+   trans = btrfs_start_transaction(chunk_root, 1);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   error("error starting transaction:  %d (%s)",
+ ret, strerror(-ret));
+   return ret;
+   }
+
+   btrfs_init_path();
+   ret = btrfs_search_slot(trans, chunk_root, , , 0, 1);
+   if (ret > 0) {
+   error("failed to find DEV_ITEM for devid %llu",
+   device->devid);
+   ret = -ENOENT;
+   goto err;
+   }
+   if (ret < 0) {
+   error("failed to search chunk root: %d (%s)",
+   ret, strerror(-ret));
+   goto err;
+   }
+   di = btrfs_item_ptr(path.nodes[0], path.slots[0],
+   struct btrfs_dev_item);
+   btrfs_set_device_total_bytes(path.nodes[0], di, device->total_bytes);
+   btrfs_mark_buffer_dirty(path.nodes[0]);
+   ret = btrfs_commit_transaction(trans, chunk_root);
+   if (ret < 0) {
+   error("failed to commit current transaction: %d (%s)",
+   ret, strerror(-ret));
+   btrfs_release_path();
+   return ret;
+   }
+   btrfs_release_path();
+   printf("Fixed device size for devid %llu, old size: %llu new size: 
%llu\n",
+   device->devid, old_bytes, device->total_bytes);
+   return 1;
+
+err:
+   /* We haven't modified anything, it's OK to commit current trans */
+   btrfs_commit_transaction(trans, chunk_root);
+   btrfs_release_path();
+   return ret;
+}
+
+/*
+ * Return 0 if super block total bytes matches with device total_bytes
+ * Return >0 if super block has mismatch total_bytes and fixed
+ * Return <0 if we failed to fix the mismatch total_bytes
+ */
+static int reset_super_total_bytes(struct btrfs_fs_info *fs_info)
+{
+   struct btrfs_trans_handle *trans;
+   struct btrfs_device *device;
+   struct list_head *dev_list = _info->fs_devices->devices;
+   u64 total_bytes = 0;
+   u64 old_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+   int ret;
+
+   list_for_each_entry(device, dev_list, dev_list)
+   total_bytes += device->total_bytes;
+
+   if (total_bytes == old_bytes)
+   return 0;
+
+   /* Just in case */
+   if (!IS_ALIGNED(total_bytes, fs_info->sectorsize)) {
+   error("final total_bytes still not aligned to %u, please report 
a bug to btrfs mail list",
+   fs_info->sectorsize);
+   return -EUCLEAN;
+  

Re: [PATCH v3] btrfs-progs: RAID5: Inject data stripe corruption and verify scrub fixes it and retains correct parity.

2017-10-10 Thread Lakshmipathi.G
Any review comments on this RAID5 test script? I understand it depends
on 'dump-tree' output, but I assume, right now, we have no other
choice other than this method.

I'll modify this patch to reflect coding practices as mentioned in
tests/README.md, if current method is okay.

Cheers.
Lakshmipathi.G

On Mon, Feb 20, 2017 at 11:57 PM, Lakshmipathi.G
 wrote:
> Test script to create file with specific data-stripe layout.Computes stripe 
> locations.
> Inject corruption into data-stripe and verify scrubbing process fixes 
> corrupted block.
> It also validates the corresponding parity stripe.
>
> Signed-off-by: Lakshmipathi.G 
> ---
>  .../test.sh| 402 
> +
>  1 file changed, 402 insertions(+)
>  create mode 100755 
> tests/misc-tests/020-raid5-datastripe-corruption-parity-check/test.sh
>
> diff --git 
> a/tests/misc-tests/020-raid5-datastripe-corruption-parity-check/test.sh 
> b/tests/misc-tests/020-raid5-datastripe-corruption-parity-check/test.sh
> new file mode 100755
> index 000..0129a75
> --- /dev/null
> +++ b/tests/misc-tests/020-raid5-datastripe-corruption-parity-check/test.sh
> @@ -0,0 +1,402 @@
> +#!/bin/bash
> +#
> +# Raid5: Inject data stripe corruption and fix them using scrub.
> +#
> +# Script will perform the following:
> +# 1) Create Raid5 using 3 loopback devices.
> +# 2) Ensure file layout is created in a predictable manner.
> +#Each data stripe(64KB) should uniquely start with 'DN',
> +#where N represents the data stripe number.(ex:D0,D1 etc)
> +# 3) Once file is created with specific layout, check whether file
> +#has single extent. At the moment, script wont handle multi-extent
> +#files.
> +# 4) If file has single extent with the help of 'dump-tree' compute data and
> +#parity stripe details like devicename, position and actual on-disk data.
> +# 5) $STRIPEINFO_COMPLETE file will have all necessary data at this stage.
> +# 6) Inject corruption into given data-stripe by zero'ing out its first 4 
> bytes.
> +# 7) After injecting corruption, running online-scrub is expected to fix
> +#the corrupted data stripe with the help of parity block and
> +#corresponding data stripe.
> +# 8) If scrub successful, verify the data stripe has original un-corrupted 
> value.
> +# 9) If scrub successful, verify parity stripe is valid, otherwise its a 
> parity bug.
> +# 10) If no issues found, cleanup files and devices. Repeat the process for
> +#different file size and data-stripe.
> +#
> +#  Note: This script corrupts only data-stripe blocks.
> +#  Known Limitations (will be addressed later):
> +# - Script expects even number of data-stripes in file.
> +# - Script expects the file to have single extent.
> +
> +source $TOP/tests/common
> +
> +check_prereq btrfs
> +check_prereq mkfs.btrfs
> +check_prereq btrfs-debugfs
> +
> +setup_root_helper
> +prepare_test_dev 1024M
> +
> +ndevs=3
> +declare -a devs
> +declare -a parity_offsets
> +stripe_entry=""
> +device_name=""
> +stripe_offset=""
> +stripe_content=""
> +
> +#Complete stripe layout
> +STRIPEINFO_COMPLETE=$(mktemp --tmpdir 
> btrfs-progs-raid5-stripe-complete.infoXX)
> +#dump-tree output file
> +DUMPTREE_OUTPUT=$(mktemp --tmpdir btrfs-progs-raid5-tree-dump.infoXX)
> +#tmp files
> +STRIPEINFO_PARTIAL=$(mktemp --tmpdir 
> btrfs-progs-raid5-stripe-partial.infoXX)
> +STRIPE_TMP=$(mktemp --tmpdir btrfs-progs-raid5-stripetmp.infoXX)
> +MULTI_EXTENT_CHECK=$(mktemp --tmpdir 
> btrfs-progs-raid5-extent-check.infoXX)
> +EXTENT_WITH_SIZE=$(mktemp --tmpdir btrfs-progs-raid5-extent-size.infoXX)
> +PARITY_LOC1=$(mktemp --tmpdir btrfs-progs-raid5-parity-loc1.infoXX)
> +PARITY_LOCATION=$(mktemp --tmpdir 
> btrfs-progs-raid5-parity-locations.infoXX)
> +
> +
> +prepare_devices()
> +{
> +   for i in `seq $ndevs`; do
> +   touch img$i
> +   chmod a+rw img$i
> +   truncate -s0 img$i
> +   truncate -s512M img$i
> +   devs[$i]=`run_check_stdout $SUDO_HELPER losetup --find --show 
> img$i`
> +   done
> +   truncate -s0 $STRIPE_TMP
> +   truncate -s0 $STRIPEINFO_PARTIAL
> +   truncate -s0 $STRIPEINFO_COMPLETE
> +}
> +
> +cleanup_devices()
> +{
> +   for dev in ${devs[@]}; do
> +   run_check $SUDO_HELPER losetup -d $dev
> +   done
> +   for i in `seq $ndevs`; do
> +   truncate -s0 img$i
> +   done
> +   run_check $SUDO_HELPER losetup --all
> +}
> +
> +test_do_mkfs()
> +{
> +   run_check $SUDO_HELPER $TOP/mkfs.btrfs -f   \
> +   $@
> +}
> +
> +test_mkfs_multi()
> +{
> +   test_do_mkfs $@ ${devs[@]}
> +}
> +
> +#$1 Filename
> +#$2 Expected no.of data stripes for the file.
> +create_layout(){
> +   fname=$1
> +   size=$(( $2 * 65536 ))
> +   n=0
> +   bs_value=1
> +   stripe=0
> +   while (( $n < $size ))

Re: WARNING: ... at fs/btrfs/ctree.h:1559 btrfs_update_device+0x1be/0x1d0 [btrfs]

2017-10-10 Thread Nikolay Borisov


On  9.10.2017 20:26, Cloud Admin wrote:
> Hi,
> I update kernel from 4.11.10 to 4.13.4 and since that I get the following 
> message in the kernel journal calling 'scrub' or 'balance'. I use Fedora 26 
> with btrfs-progs v4.9.1.
> What does this mean and (more important) what can I do? 
> Bye
>   Frank
> 
> BTRFS info (device dm-7): relocating block group 44050690342912 flags 
> system|raid1
> BTRFS info (device dm-7): found 117 extents
> [ cut here ]
> WARNING: CPU: 3 PID: 22095 at fs/btrfs/ctree.h:1559 
> btrfs_update_device+0x1be/0x1d0 [btrfs]

This is :

WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); in
btrfs_set_device_total_bytes. This likely means your device's size isn't
a multiple of sectorsize (4kb). Can you paste the output of

btrfs inspect-internal dump-super /path/to/btrfs/device


What i will advise you to do is to shrink your device to be a multiple
of 4kb.

In any case you can be assured that there is no danger of you losing data.


> Modules linked in: rpcsec_gss_krb5 veth xt_nat xt_addrtype br_netfilter 
> xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 tun nf_conntrack_sane xt_CT 
> ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink 
> ebtable_nat ebtable_broute bridge stp llc ip6table_nat nf_conntrack_ipv6 
> nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_raw ip6table_security 
> iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack 
> libcrc32c iptable_mangle iptable_raw iptable_security ebtable_filter ebtables 
> ip6table_filter ip6_tables ftsteutates btrfs xor raid6_pq tda18212 cxd2841er 
> tpm_crb intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel 
> kvm iTCO_wdt irqbypass iTCO_vendor_support intel_cstate intel_uncore mei_wdt 
> intel_rapl_perf ppdev hci_uart ddbridge dvb_core
> Okt 09 19:08:32 hypercloud.fritz.box kernel:  btbcm btqca btintel mei_me 
> i2c_i801 shpchp bluetooth joydev mei intel_pch_thermal wmi intel_lpss_acpi 
> intel_lpss pinctrl_sunrisepoint fujitsu_laptop parport_pc sparse_keymap 
> ecdh_generic parport tpm_tis pinctrl_intel tpm_tis_core rfkill tpm acpi_pad 
> nfsd auth_rpcgss nfs_acl lockd grace sunrpc dm_crypt i915 crct10dif_pclmul 
> i2c_algo_bit crc32_pclmul drm_kms_helper crc32c_intel e1000e drm 
> ghash_clmulni_intel serio_raw ptp pps_core video i2c_hid
> CPU: 3 PID: 22095 Comm: btrfs Tainted: GW   
> 4.13.4-200.fc26.x86_64 #1
> Hardware name: FUJITSU D3417-B1/D3417-B1, BIOS V5.0.0.11 R1.12.0 for 
> D3417-B1x02/09/2016
> task: 8ecb59b026c0 task.stack: b805cae9
> RIP: 0010:btrfs_update_device+0x1be/0x1d0 [btrfs]
> RSP: 0018:b805cae93ac8 EFLAGS: 00010206
> RAX: 0fff RBX: 8ed094bb11c0 RCX: 074702251e00
> RDX: 0004 RSI: 3efa RDI: 8ec9eb032c08
> RBP: b805cae93b10 R08: 3efe R09: b805cae93a80
> R10: 1000 R11: 0003 R12: 8ed0c7a3f000
> R13:  R14: 3eda R15: 8ec9eb032c08
> FS:  7f10b256a8c0() GS:8ed0ee4c() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 7f6261c0d5f0 CR3: 0005cad4c000 CR4: 003406e0
> DR0:  DR1:  DR2: 
> DR3:  DR6: fffe0ff0 DR7: 0400
> Call Trace:
>  btrfs_remove_chunk+0x365/0x870 [btrfs]
>  btrfs_relocate_chunk+0x7e/0xd0 [btrfs]
>  btrfs_balance+0xc07/0x1390 [btrfs]
>  btrfs_ioctl_balance+0x319/0x380 [btrfs]
>  btrfs_ioctl+0x9d5/0x24a0 [btrfs]
>  ? lru_cache_add+0x3a/0x80
>  ? lru_cache_add_active_or_unevictable+0x4c/0xf0
>  ? __handle_mm_fault+0x939/0x10b0
>  do_vfs_ioctl+0xa5/0x600
>  ? do_brk_flags+0x230/0x360
>  ? do_vfs_ioctl+0xa5/0x600
>  SyS_ioctl+0x79/0x90
>  entry_SYSCALL_64_fastpath+0x1a/0xa5
> RIP: 0033:0x7f10b15e65e7
> RSP: 002b:7ffc9402ebe8 EFLAGS: 0246 ORIG_RAX: 0010
> RAX: ffda RBX: 8041 RCX: 7f10b15e65e7
> RDX: 7ffc9402ec80 RSI: c4009420 RDI: 0003
> RBP: 7f10b18abae0 R08: 55b07a3b3010 R09: 0078
> R10: 7f10b18abb38 R11: 0246 R12: 
> R13: 7f10b18abb38 R14: 8060 R15: 
> Code: 4c 89 ff 45 31 c0 ba 10 00 00 00 4c 89 f6 e8 fa 20 ff ff 4c 89 ff e8 72 
> ef fc ff e9 d3 fe ff ff 41 bd f4 ff ff ff e9 d0 fe ff ff <0f> ff eb b6 e8 39 
> fd 78 c6 66 0f 1f 84 00 00 00 00 00 0f 1f 44 
> ---[ end trace d1e1c8aff99bfeb8 ]---
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: avoid losing data raid profile when deleting a device

2017-10-10 Thread Nikolay Borisov


On  9.10.2017 21:01, Liu Bo wrote:
> We've avoided data losing raid profile when doing balance, but it
> turns out that deleting a device could also result in the same
> problem.
> 
> This fixes the problem by creating an empty data chunk before
> relocating the data chunk.
> 
> Metadata/System chunk are supposed to have non-zero bytes all the time
> so their raid profile is persistent.

This patch introduces new warning:

fs/btrfs/volumes.c:3523:29: note: ‘trans’ was declared here
  struct btrfs_trans_handle *trans;


> 
> Reported-by: James Alandt 
> Signed-off-by: Liu Bo 
> ---
>  fs/btrfs/volumes.c | 87 
> ++
>  1 file changed, 68 insertions(+), 19 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 4a72c45..3f48bcd 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -144,6 +144,8 @@ static int __btrfs_map_block(struct btrfs_fs_info 
> *fs_info,
>u64 logical, u64 *length,
>struct btrfs_bio **bbio_ret,
>int mirror_num, int need_raid_map);
> +static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
> +   u64 chunk_offset);

Also there is no need to have this forward declaration, the function can
just as well be put right before __btrfs_balance. Let's try and keep
changes minimal.

>  
>  DEFINE_MUTEX(uuid_mutex);
>  static LIST_HEAD(fs_uuids);
> @@ -3476,7 +3478,6 @@ static int __btrfs_balance(struct btrfs_fs_info 
> *fs_info)
>   u32 count_meta = 0;
>   u32 count_sys = 0;
>   int chunk_reserved = 0;
> - u64 bytes_used = 0;
>  
>   /* step one make some room on all the devices */
>   devices = _info->fs_devices->devices;
> @@ -3635,28 +3636,22 @@ static int __btrfs_balance(struct btrfs_fs_info 
> *fs_info)
>   goto loop;
>   }
>  
> - ASSERT(fs_info->data_sinfo);
> - spin_lock(_info->data_sinfo->lock);
> - bytes_used = fs_info->data_sinfo->bytes_used;
> - spin_unlock(_info->data_sinfo->lock);
> -
> - if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
> - !chunk_reserved && !bytes_used) {
> - trans = btrfs_start_transaction(chunk_root, 0);
> - if (IS_ERR(trans)) {
> - mutex_unlock(_info->delete_unused_bgs_mutex);
> - ret = PTR_ERR(trans);
> - goto error;
> - }
> -
> - ret = btrfs_force_chunk_alloc(trans, fs_info,
> -   BTRFS_BLOCK_GROUP_DATA);
> - btrfs_end_transaction(trans);
> + if (!chunk_reserved) {
> + /*
> +  * We may be relocating the only data chunk we have,
> +  * which could potentially end up with losing data's
> +  * raid profile, so lets allocate an empty one in
> +  * advance.
> +  */
> + ret = btrfs_may_alloc_data_chunk(fs_info,
> +  found_key.offset);
>   if (ret < 0) {
>   mutex_unlock(_info->delete_unused_bgs_mutex);
> + ret = PTR_ERR(trans);
>   goto error;
> + } else if (ret == 1) {
> + chunk_reserved = 1;
>   }
> - chunk_reserved = 1;
>   }
>  
>   ret = btrfs_relocate_chunk(fs_info, found_key.offset);
> @@ -4327,6 +4322,48 @@ int btrfs_check_uuid_tree(struct btrfs_fs_info 
> *fs_info)
>  }
>  
>  /*
> + * return 1 : allocate a data chunk successfully,
> + * return <0: errors during allocating a data chunk,
> + * return 0 : no need to allocate a data chunk.
> + */
> +static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
> +   u64 chunk_offset)
> +{
> + struct btrfs_block_group_cache *cache;
> + u64 bytes_used;
> + u64 chunk_type;
> +
> + cache = btrfs_lookup_block_group(fs_info, chunk_offset);
> + ASSERT(cache);
> + chunk_type = cache->flags;
> + btrfs_put_block_group(cache);
> +
> + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
> + spin_lock(_info->data_sinfo->lock);
> + bytes_used = fs_info->data_sinfo->bytes_used;
> + spin_unlock(_info->data_sinfo->lock);
> +
> + if (!bytes_used) {
> + struct btrfs_trans_handle *trans;
> + int ret;
> +
> + trans = btrfs_join_transaction(fs_info->tree_root);
> + if (IS_ERR(trans))
> + return