Re: [PATCH v2 3/3] btrfs-progs: check: Cleanup all checkpatch error and warning

2018-01-31 Thread Qu Wenruo


On 2018年02月01日 15:08, Su Yue wrote:
> 
> 
> On 02/01/2018 02:45 PM, Qu Wenruo wrote:
>> Since we're moving tons of codes, it's a good idea to fix all errors and
[snip]
>>   }
>> @@ -2500,7 +2507,8 @@ static int repair_extent_data_item(struct
>> btrfs_trans_handle *trans,
>>   if (ret)
>>   goto out;
>>   eb = path.nodes[0];
>> -    ei = btrfs_item_ptr(eb, path.slots[0], struct
>> btrfs_extent_item);
>> +    ei = btrfs_item_ptr(eb, path.slots[0],
>> +    struct btrfs_extent_item);
>>     btrfs_set_extent_refs(eb, ei, 0);
>>   btrfs_set_extent_generation(eb, ei, generation);
>> @@ -2657,7 +2665,8 @@ static int check_extent_data_item(struct
>> btrfs_root *root,
>>   }
>>   if (type == BTRFS_EXTENT_DATA_REF_KEY) {
>>   ref_root = btrfs_extent_data_ref_root(leaf, dref);
>> -    ref_objectid = btrfs_extent_data_ref_objectid(leaf, dref);
>> +    ref_objectid = btrfs_extent_data_ref_objectid(leaf,
>> +  dref);
>>   ref_offset = btrfs_extent_data_ref_offset(leaf, dref);
>>     if (ref_objectid == fi_key.objectid &&
>> @@ -2820,8 +2829,8 @@ static int check_block_group_item(struct
>> btrfs_fs_info *fs_info,
>>   if (!(bg_flags & BTRFS_BLOCK_GROUP_DATA)) {
>>   error(
>>   "bad extent[%llu, %llu) type mismatch with chunk",
>> -    extent_key.objectid,
>> -    extent_key.objectid + extent_key.offset);
>> +  extent_key.objectid,
>> +  extent_key.objectid + extent_key.offset);
>>   err |= CHUNK_TYPE_MISMATCH;
>>   }
>>   } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
>> @@ -3175,7 +3184,8 @@ static int check_extent_data_backref(struct
>> btrfs_fs_info *fs_info,
>>   btrfs_header_owner(leaf) != root_id)
>>   goto next;
>>   btrfs_item_key_to_cpu(leaf, , slot);
>> -    if (key.objectid != objectid || key.type !=
>> BTRFS_EXTENT_DATA_KEY)
>> +    if (key.objectid != objectid || key.type !=
>> +    BTRFS_EXTENT_DATA_KEY)
> if (key.objectid != objectid ||
>     key.type != BTRFS_EXTENT_DATA_KEY)
> is more better.
> Other changes are nice.

I also thought about that, but that leaves too much space in previous line.

Not sure what should be the best practice here.

Thank,
Qu
> 
> Thanks,
> Su



signature.asc
Description: OpenPGP digital signature


[josef-btrfs:current-work 2/2] block/blk-wbt.h:113:8: error: redefinition of 'struct rq_qos'

2018-01-31 Thread kbuild test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/josef/btrfs-next.git 
current-work
head:   055b23681aedfe6c6a9da010a57b3d6d9167f882
commit: 055b23681aedfe6c6a9da010a57b3d6d9167f882 [2/2] current-work
config: xtensa-allyesconfig (attached as .config)
compiler: xtensa-linux-gcc (GCC) 7.2.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
git checkout 055b23681aedfe6c6a9da010a57b3d6d9167f882
# save the attached .config to linux build tree
make.cross ARCH=xtensa 

All error/warnings (new ones prefixed by >>):

   In file included from block/cfq-iosched.c:20:0:
>> block/blk-wbt.h:113:8: error: redefinition of 'struct rq_qos'
struct rq_qos {
   ^~
   block/blk-wbt.h:88:8: note: originally defined here
struct rq_qos {
   ^~
--
   In file included from block/blk-wbt.c:27:0:
>> block/blk-wbt.h:113:8: error: redefinition of 'struct rq_qos'
struct rq_qos {
   ^~
   block/blk-wbt.h:88:8: note: originally defined here
struct rq_qos {
   ^~
   block/blk-wbt.c: In function '__wbt_done':
>> block/blk-wbt.c:123:9: error: 'struct rq_wb' has no member named 'wc'
 if (rwb->wc && !wb_recent_wait(rwb))
^~
   block/blk-wbt.c: In function 'wbt_done':
>> block/blk-wbt.c:158:26: error: 'struct rq_wb' has no member named 'last_comp'
   wb_timestamp(rwb, >last_comp);
 ^~
   block/blk-wbt.c: In function 'calc_max_depth':
>> block/blk-wbt.c:181:6: error: 'rwb' undeclared (first use in this function); 
>> did you mean 'rmb'?
 if (rwb->queue_depth == 1) {
 ^~~
 rmb
   block/blk-wbt.c:181:6: note: each undeclared identifier is reported only 
once for each function it appears in
   block/blk-wbt.c: In function 'rwb_trace_step':
>> block/blk-wbt.c:308:32: error: 'struct rq_wb' has no member named 
>> 'scale_step'; did you mean 'enable_state'?
 trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
   ^~
   enable_state
>> block/blk-wbt.c:308:47: error: 'struct rq_wb' has no member named 
>> 'cur_win_nsec'
 trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
  ^~
>> block/blk-wbt.c:309:45: error: 'struct rq_wb' has no member named 'wb_max'; 
>> did you mean 'wb_normal'?
   rwb->wb_background, rwb->wb_normal, rwb->wb_max);
^~
wb_normal
   block/blk-wbt.c: In function 'scale_up':
>> block/blk-wbt.c:323:20: error: implicit declaration of function 
>> 'calc_wb_limits'; did you mean 'task_rlimit'? 
>> [-Werror=implicit-function-declaration]
 rqd->scaled_max = calc_wb_limits(rwb);
   ^~
   task_rlimit
   block/blk-wbt.c:323:35: error: 'rwb' undeclared (first use in this 
function); did you mean 'rmb'?
 rqd->scaled_max = calc_wb_limits(rwb);
  ^~~
  rmb
   block/blk-wbt.c: In function 'scale_down':
>> block/blk-wbt.c:352:2: error: implicit declaration of function 
>> 'rqd_trace_step'; did you mean 'rwb_trace_step'? 
>> [-Werror=implicit-function-declaration]
 rqd_trace_step(rqd, "step down");
 ^~
 rwb_trace_step
   block/blk-wbt.c: In function 'rwb_arm_timer':
   block/blk-wbt.c:357:11: error: 'struct rq_wb' has no member named 
'scale_step'; did you mean 'enable_state'?
 if (rwb->scale_step > 0) {
  ^~
  enable_state
   block/blk-wbt.c:364:6: error: 'struct rq_wb' has no member named 
'cur_win_nsec'
  rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
 ^~
>> block/blk-wbt.c:364:34: error: 'struct rq_wb' has no member named 'win_nsec'
  rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
 ^~
   block/blk-wbt.c:365:21: error: 'struct rq_wb' has no member named 
'scale_step'; did you mean 'enable_state'?
 int_sqrt((rwb->scale_step + 1) << 8));
^~
enable_state
   block/blk-wbt.c:371:6: error: 'struct rq_wb' has no member named 
'cur_win_nsec'
  rwb->cur_win_nsec = rwb->win_nsec;
 ^~
   block/blk-wbt.c:371:26: error: 'struct rq_wb' has no member named 'win_nsec'
  rwb->cur_win_nsec = rwb->win_nsec;
 ^~
>> block/blk-wbt.c:374:29: error: 'struct rq_wb' has no member named 'cb'
 blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
^~
   block/blk-wbt.c:374:38: error: 'struct rq_wb' has no member named 
'cur_win_nsec'
 blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
 ^~
   block/blk-wbt.c: In function 'wb_timer_fn':
   

Re: [PATCH v2 3/3] btrfs-progs: check: Cleanup all checkpatch error and warning

2018-01-31 Thread Su Yue



On 02/01/2018 02:45 PM, Qu Wenruo wrote:

Since we're moving tons of codes, it's a good idea to fix all errors and
warnings from checkpatch.

Signed-off-by: Qu Wenruo 
---
  check/lowmem.c |  65 +--
  check/main.c   | 253 +
  2 files changed, 165 insertions(+), 153 deletions(-)

diff --git a/check/lowmem.c b/check/lowmem.c
index a3bda97fdea8..3e7fe148bc56 100644
--- a/check/lowmem.c
+++ b/check/lowmem.c
@@ -322,7 +322,8 @@ static int repair_tree_block_ref(struct btrfs_trans_handle 
*trans,
goto out;
  
  		eb = path.nodes[0];

-   ei = btrfs_item_ptr(eb, path.slots[0], struct 
btrfs_extent_item);
+   ei = btrfs_item_ptr(eb, path.slots[0],
+   struct btrfs_extent_item);
  
  		btrfs_set_extent_refs(eb, ei, 0);

btrfs_set_extent_generation(eb, ei, generation);
@@ -730,7 +731,9 @@ begin:
need_research = 0;
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, ref_key, path, 0, 0);
-   /* the item was deleted, let path point to the last checked 
item */
+   /*
+* the item was deleted, let path point to the last checked item
+*/
if (ret > 0) {
if (path->slots[0] == 0)
btrfs_prev_leaf(root, path);
@@ -1486,7 +1489,8 @@ static int check_file_extent(struct btrfs_root *root, 
struct btrfs_key *fkey,
search_start = disk_bytenr;
search_len = disk_num_bytes;
}
-   ret = count_csum_range(root->fs_info, search_start, search_len, 
_found);
+   ret = count_csum_range(root->fs_info, search_start, search_len,
+  _found);
if (csum_found > 0 && nodatasum) {
err |= ODD_CSUM_ITEM;
error("root %llu EXTENT_DATA[%llu %llu] nodatasum shouldn't have 
datasum",
@@ -1497,7 +1501,8 @@ static int check_file_extent(struct btrfs_root *root, 
struct btrfs_key *fkey,
error("root %llu EXTENT_DATA[%llu %llu] csum missing, have: %llu, 
expected: %llu",
  root->objectid, fkey->objectid, fkey->offset,
  csum_found, search_len);
-   } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC && csum_found > 0) 
{
+   } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC &&
+  csum_found > 0) {
err |= ODD_CSUM_ITEM;
error("root %llu EXTENT_DATA[%llu %llu] prealloc shouldn't have 
csum, but has: %llu",
  root->objectid, fkey->objectid, fkey->offset, csum_found);
@@ -1561,7 +1566,8 @@ loop:
}
  
  special_case:

-   di = btrfs_item_ptr(path.nodes[0], path.slots[0], struct 
btrfs_dir_item);
+   di = btrfs_item_ptr(path.nodes[0], path.slots[0],
+   struct btrfs_dir_item);
cur = 0;
total = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
  
@@ -1913,7 +1919,8 @@ static int check_inode_item(struct btrfs_root *root, struct btrfs_path *path,

nodatasum = btrfs_inode_flags(node, ii) & BTRFS_INODE_NODATASUM;
  
  	while (1) {

-   btrfs_item_key_to_cpu(path->nodes[0], _key, 
path->slots[0]);
+   btrfs_item_key_to_cpu(path->nodes[0], _key,
+ path->slots[0]);
ret = btrfs_next_item(root, path);
if (ret < 0) {
/* out will fill 'err' rusing current statistics */
@@ -2360,7 +2367,7 @@ static int check_tree_block_ref(struct btrfs_root *root,
 * Check if the backref points to valid
 * referencer
 */
-   found_ref = !check_tree_block_ref( root, NULL,
+   found_ref = !check_tree_block_ref(root, NULL,
offset, level + 1, owner,
NULL);
}
@@ -2500,7 +2507,8 @@ static int repair_extent_data_item(struct 
btrfs_trans_handle *trans,
if (ret)
goto out;
eb = path.nodes[0];
-   ei = btrfs_item_ptr(eb, path.slots[0], struct 
btrfs_extent_item);
+   ei = btrfs_item_ptr(eb, path.slots[0],
+   struct btrfs_extent_item);
  
  		btrfs_set_extent_refs(eb, ei, 0);

btrfs_set_extent_generation(eb, ei, generation);
@@ -2657,7 +2665,8 @@ static int check_extent_data_item(struct btrfs_root *root,
}
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
ref_root = btrfs_extent_data_ref_root(leaf, dref);
-   ref_objectid = btrfs_extent_data_ref_objectid(leaf, 

[josef-btrfs:current-work 2/2] block/blk-wbt.c:323:20: error: implicit declaration of function 'calc_wb_limits'

2018-01-31 Thread kbuild test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/josef/btrfs-next.git 
current-work
head:   055b23681aedfe6c6a9da010a57b3d6d9167f882
commit: 055b23681aedfe6c6a9da010a57b3d6d9167f882 [2/2] current-work
config: m68k-allyesconfig (attached as .config)
compiler: m68k-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
git checkout 055b23681aedfe6c6a9da010a57b3d6d9167f882
# save the attached .config to linux build tree
make.cross ARCH=m68k 

All errors (new ones prefixed by >>):

   In file included from block/blk-wbt.c:27:0:
   block/blk-wbt.h:113:8: error: redefinition of 'struct rq_qos'
struct rq_qos {
   ^~
   block/blk-wbt.h:88:8: note: originally defined here
struct rq_qos {
   ^~
   block/blk-wbt.c: In function '__wbt_done':
   block/blk-wbt.c:123:9: error: 'struct rq_wb' has no member named 'wc'
 if (rwb->wc && !wb_recent_wait(rwb))
^~
   block/blk-wbt.c: In function 'wbt_done':
   block/blk-wbt.c:158:26: error: 'struct rq_wb' has no member named 'last_comp'
   wb_timestamp(rwb, >last_comp);
 ^~
   block/blk-wbt.c: In function 'calc_max_depth':
   block/blk-wbt.c:181:6: error: 'rwb' undeclared (first use in this function); 
did you mean 'rmb'?
 if (rwb->queue_depth == 1) {
 ^~~
 rmb
   block/blk-wbt.c:181:6: note: each undeclared identifier is reported only 
once for each function it appears in
   block/blk-wbt.c: In function 'rwb_trace_step':
   block/blk-wbt.c:308:32: error: 'struct rq_wb' has no member named 
'scale_step'; did you mean 'enable_state'?
 trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
   ^~
   enable_state
   block/blk-wbt.c:308:47: error: 'struct rq_wb' has no member named 
'cur_win_nsec'
 trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
  ^~
   block/blk-wbt.c:309:45: error: 'struct rq_wb' has no member named 'wb_max'; 
did you mean 'wb_normal'?
   rwb->wb_background, rwb->wb_normal, rwb->wb_max);
^~
wb_normal
   block/blk-wbt.c: In function 'scale_up':
>> block/blk-wbt.c:323:20: error: implicit declaration of function 
>> 'calc_wb_limits' [-Werror=implicit-function-declaration]
 rqd->scaled_max = calc_wb_limits(rwb);
   ^~
   block/blk-wbt.c:323:35: error: 'rwb' undeclared (first use in this 
function); did you mean 'rmb'?
 rqd->scaled_max = calc_wb_limits(rwb);
  ^~~
  rmb
   block/blk-wbt.c: In function 'scale_down':
   block/blk-wbt.c:352:2: error: implicit declaration of function 
'rqd_trace_step'; did you mean 'rwb_trace_step'? 
[-Werror=implicit-function-declaration]
 rqd_trace_step(rqd, "step down");
 ^~
 rwb_trace_step
   block/blk-wbt.c: In function 'rwb_arm_timer':
   block/blk-wbt.c:357:11: error: 'struct rq_wb' has no member named 
'scale_step'; did you mean 'enable_state'?
 if (rwb->scale_step > 0) {
  ^~
  enable_state
   block/blk-wbt.c:364:6: error: 'struct rq_wb' has no member named 
'cur_win_nsec'
  rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
 ^~
   block/blk-wbt.c:364:34: error: 'struct rq_wb' has no member named 'win_nsec'
  rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
 ^~
   block/blk-wbt.c:365:21: error: 'struct rq_wb' has no member named 
'scale_step'; did you mean 'enable_state'?
 int_sqrt((rwb->scale_step + 1) << 8));
^~
enable_state
   block/blk-wbt.c:371:6: error: 'struct rq_wb' has no member named 
'cur_win_nsec'
  rwb->cur_win_nsec = rwb->win_nsec;
 ^~
   block/blk-wbt.c:371:26: error: 'struct rq_wb' has no member named 'win_nsec'
  rwb->cur_win_nsec = rwb->win_nsec;
 ^~
   block/blk-wbt.c:374:29: error: 'struct rq_wb' has no member named 'cb'
 blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
^~
   block/blk-wbt.c:374:38: error: 'struct rq_wb' has no member named 
'cur_win_nsec'
 blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
 ^~
   block/blk-wbt.c: In function 'wb_timer_fn':
   block/blk-wbt.c:386:61: error: 'struct rq_wb' has no member named 
'scale_step'; did you mean 'enable_state'?
 trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
^~
enable_state
   

[PATCH v2 0/3] btrfs-progs: Split lowmem mode check to its own

2018-01-31 Thread Qu Wenruo
As usual, the main part is over 500K so the biggest patch won't reach
mail list.
Please fetch the whole branch from github:
https://github.com/adam900710/btrfs-progs/tree/split_check

This update rebased (the truth is, it's re-created other than rebase)
the branch to David's devel branch, whose head is:

commit cfe255515ae0a27fdd8e612a884b94ffb703acdf (david/devel)
Author: Qu Wenruo 
Date:   Thu Jan 18 16:38:23 2018 +0800

btrfs-progs: check: Move reset_cached_block_groups to check/common.c

Reviewed-by: Su Yue 
Signed-off-by: Qu Wenruo 


Despite the update, a new patch is introduced, to address the tons of
BORING errors and warning from checkpatch.pl.

Qu Wenruo (3):
  btrfs-progs: check: Move lowmem check code to its own
check/lowmem.[ch]
  btrfs-progs: check/lowmem: Cleanup unnecessary _v2 suffix
  btrfs-progs: check: Cleanup all checkpatch error and warning

 Makefile   | 2 +-
 check/lowmem.c |  4579 
 check/lowmem.h | 5 +
 check/main.c   | 12772 ++-
 4 files changed, 8694 insertions(+), 8664 deletions(-)
 create mode 100644 check/lowmem.c

-- 
2.16.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/3] btrfs-progs: check: Cleanup all checkpatch error and warning

2018-01-31 Thread Qu Wenruo
Since we're moving tons of codes, it's a good idea to fix all errors and
warnings from checkpatch.

Signed-off-by: Qu Wenruo 
---
 check/lowmem.c |  65 +--
 check/main.c   | 253 +
 2 files changed, 165 insertions(+), 153 deletions(-)

diff --git a/check/lowmem.c b/check/lowmem.c
index a3bda97fdea8..3e7fe148bc56 100644
--- a/check/lowmem.c
+++ b/check/lowmem.c
@@ -322,7 +322,8 @@ static int repair_tree_block_ref(struct btrfs_trans_handle 
*trans,
goto out;
 
eb = path.nodes[0];
-   ei = btrfs_item_ptr(eb, path.slots[0], struct 
btrfs_extent_item);
+   ei = btrfs_item_ptr(eb, path.slots[0],
+   struct btrfs_extent_item);
 
btrfs_set_extent_refs(eb, ei, 0);
btrfs_set_extent_generation(eb, ei, generation);
@@ -730,7 +731,9 @@ begin:
need_research = 0;
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, ref_key, path, 0, 0);
-   /* the item was deleted, let path point to the last checked 
item */
+   /*
+* the item was deleted, let path point to the last checked item
+*/
if (ret > 0) {
if (path->slots[0] == 0)
btrfs_prev_leaf(root, path);
@@ -1486,7 +1489,8 @@ static int check_file_extent(struct btrfs_root *root, 
struct btrfs_key *fkey,
search_start = disk_bytenr;
search_len = disk_num_bytes;
}
-   ret = count_csum_range(root->fs_info, search_start, search_len, 
_found);
+   ret = count_csum_range(root->fs_info, search_start, search_len,
+  _found);
if (csum_found > 0 && nodatasum) {
err |= ODD_CSUM_ITEM;
error("root %llu EXTENT_DATA[%llu %llu] nodatasum shouldn't 
have datasum",
@@ -1497,7 +1501,8 @@ static int check_file_extent(struct btrfs_root *root, 
struct btrfs_key *fkey,
error("root %llu EXTENT_DATA[%llu %llu] csum missing, have: 
%llu, expected: %llu",
  root->objectid, fkey->objectid, fkey->offset,
  csum_found, search_len);
-   } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC && csum_found > 0) 
{
+   } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC &&
+  csum_found > 0) {
err |= ODD_CSUM_ITEM;
error("root %llu EXTENT_DATA[%llu %llu] prealloc shouldn't have 
csum, but has: %llu",
  root->objectid, fkey->objectid, fkey->offset, csum_found);
@@ -1561,7 +1566,8 @@ loop:
}
 
 special_case:
-   di = btrfs_item_ptr(path.nodes[0], path.slots[0], struct 
btrfs_dir_item);
+   di = btrfs_item_ptr(path.nodes[0], path.slots[0],
+   struct btrfs_dir_item);
cur = 0;
total = btrfs_item_size_nr(path.nodes[0], path.slots[0]);
 
@@ -1913,7 +1919,8 @@ static int check_inode_item(struct btrfs_root *root, 
struct btrfs_path *path,
nodatasum = btrfs_inode_flags(node, ii) & BTRFS_INODE_NODATASUM;
 
while (1) {
-   btrfs_item_key_to_cpu(path->nodes[0], _key, 
path->slots[0]);
+   btrfs_item_key_to_cpu(path->nodes[0], _key,
+ path->slots[0]);
ret = btrfs_next_item(root, path);
if (ret < 0) {
/* out will fill 'err' rusing current statistics */
@@ -2360,7 +2367,7 @@ static int check_tree_block_ref(struct btrfs_root *root,
 * Check if the backref points to valid
 * referencer
 */
-   found_ref = !check_tree_block_ref( root, NULL,
+   found_ref = !check_tree_block_ref(root, NULL,
offset, level + 1, owner,
NULL);
}
@@ -2500,7 +2507,8 @@ static int repair_extent_data_item(struct 
btrfs_trans_handle *trans,
if (ret)
goto out;
eb = path.nodes[0];
-   ei = btrfs_item_ptr(eb, path.slots[0], struct 
btrfs_extent_item);
+   ei = btrfs_item_ptr(eb, path.slots[0],
+   struct btrfs_extent_item);
 
btrfs_set_extent_refs(eb, ei, 0);
btrfs_set_extent_generation(eb, ei, generation);
@@ -2657,7 +2665,8 @@ static int check_extent_data_item(struct btrfs_root *root,
}
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
ref_root = btrfs_extent_data_ref_root(leaf, dref);
-   ref_objectid = btrfs_extent_data_ref_objectid(leaf, 
dref);
+ 

[PATCH v2 2/3] btrfs-progs: check/lowmem: Cleanup unnecessary _v2 suffix

2018-01-31 Thread Qu Wenruo
There used to be some functions with _v2 suffix to distinguish them from
original mode similar functions.

However now moved lowmem code to their own check/lowmem.[ch], cleanup
such _v2 suffixes, and for functions really needs to be distinguished
from original mode (exported functions), change the _v2 suffix to
_lowmem.

Signed-off-by: Qu Wenruo 
---
 check/lowmem.c | 46 +++---
 check/lowmem.h |  4 ++--
 check/main.c   |  4 ++--
 3 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/check/lowmem.c b/check/lowmem.c
index 5d17961edda7..a3bda97fdea8 100644
--- a/check/lowmem.c
+++ b/check/lowmem.c
@@ -28,8 +28,8 @@
 #include "check/common.h"
 #include "check/lowmem.h"
 
-static int calc_extent_flag_v2(struct btrfs_root *root, struct extent_buffer 
*eb,
-  u64 *flags_ret)
+static int calc_extent_flag(struct btrfs_root *root, struct extent_buffer *eb,
+   u64 *flags_ret)
 {
struct btrfs_root *extent_root = root->fs_info->extent_root;
struct btrfs_root_item *ri = >root_item;
@@ -225,7 +225,7 @@ static int update_nodes_refs(struct btrfs_root *root, u64 
bytenr,
}
 
if (check_all && eb) {
-   calc_extent_flag_v2(root, eb, );
+   calc_extent_flag(root, eb, );
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
nrefs->full_backref[level] = 1;
}
@@ -2081,8 +2081,8 @@ out:
  * Returns <0  Fatal error, must exit the whole check
  * Returns 0   No errors found
  */
-static int process_one_leaf_v2(struct btrfs_root *root, struct btrfs_path 
*path,
-  struct node_refs *nrefs, int *level, int ext_ref)
+static int process_one_leaf(struct btrfs_root *root, struct btrfs_path *path,
+   struct node_refs *nrefs, int *level, int ext_ref)
 {
struct extent_buffer *cur = path->nodes[0];
struct btrfs_key key;
@@ -3895,10 +3895,10 @@ out:
  * Returns <0  Fatal error, must exit the whole check
  * Returns 0   No errors found
  */
-static int walk_down_tree_v2(struct btrfs_trans_handle *trans,
-struct btrfs_root *root, struct btrfs_path *path,
-int *level, struct node_refs *nrefs, int ext_ref,
-int check_all)
+static int walk_down_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ int *level, struct node_refs *nrefs, int ext_ref,
+ int check_all)
 {
enum btrfs_tree_block_status status;
u64 bytenr;
@@ -3968,8 +3968,8 @@ static int walk_down_tree_v2(struct btrfs_trans_handle 
*trans,
 
ret = 0;
if (!check_all)
-   ret = process_one_leaf_v2(root, path, nrefs,
- level, ext_ref);
+   ret = process_one_leaf(root, path, nrefs,
+  level, ext_ref);
else
ret = check_leaf_items(trans, root, path,
   nrefs, account_file_data);
@@ -3993,7 +3993,7 @@ static int walk_down_tree_v2(struct btrfs_trans_handle 
*trans,
if (ret < 0)
break;
/*
-* check all trees in check_chunks_and_extent_v2
+* check all trees in check_chunks_and_extent
 * check shared node once in check_fs_roots
 */
if (!check_all && !nrefs->need_check[*level - 1]) {
@@ -4046,8 +4046,8 @@ static int walk_down_tree_v2(struct btrfs_trans_handle 
*trans,
return err;
 }
 
-static int walk_up_tree_v2(struct btrfs_root *root, struct btrfs_path *path,
-  int *level)
+static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
+   int *level)
 {
int i;
struct extent_buffer *leaf;
@@ -4200,7 +4200,7 @@ out:
 }
 
 /*
- * This function calls walk_down_tree_v2 and walk_up_tree_v2 to check tree
+ * This function calls walk_down_tree and walk_up_tree to check tree
  * blocks and integrity of fs tree items.
  *
  * @root: the root of the tree to be checked.
@@ -4257,8 +4257,8 @@ static int check_btrfs_root(struct btrfs_trans_handle 
*trans,
}
 
while (1) {
-   ret = walk_down_tree_v2(trans, root, , , ,
-   ext_ref, check_all);
+   ret = walk_down_tree(trans, root, , , ,
+ext_ref, check_all);
 
err |= !!ret;
 
@@ -4268,7 +4268,7 @@ static int check_btrfs_root(struct btrfs_trans_handle 
*trans,
break;
}
 
-   ret = 

Re: [PATCH 2/2] btrfs: add read_mirror_policy parameter devid

2018-01-31 Thread Edmund Nadolski
On 1/31/18 7:36 AM, Anand Jain wrote:
> 
> 
> On 01/31/2018 09:42 PM, Nikolay Borisov wrote:
> 
> 
 So usually this should be functionality handled by the raid/san
 controller I guess, > but given that btrfs is playing the role of a
 controller here at what point are we drawing the line of not
 implementing block-level functionality into the filesystem ?
>>>
>>>   Don't worry this is not invading into the block layer. How
>>>   can you even build this functionality in the block layer ?
>>>   Block layer even won't know that disks are mirrored. RAID
>>>   does or BTRFS in our case.
>>>
>>
>> By block layer I guess I meant the storage driver of a particular raid
>> card. Because what is currently happening is re-implementing
>> functionality that will generally sit in the driver. So my question was
>> more generic and high-level - at what point do we draw the line of
>> implementing feature that are generally implemented in hardware devices
>> (be it their drivers or firmware).
> 
>  Not all HW configs use RAID capable HBAs. A server connected to a SATA
>  JBOD using a SATA HBA without MD will relay on BTRFS to provide all the
>  features and capabilities that otherwise would have provided by such a
>  presumable HW config.

That does sort of sound like means implementing some portion of the
HBA features/capabilities in the filesystem.

To me it seems this this could be workable at the fs level, provided it
deals just with policies and remains hardware-neutral.  However most
of the use cases appear to involve some hardware-dependent knowledge
or assumptions.  What happens when someone sets this on a virtual disk,
or say a (persistent) memory-backed block device?  Case #6 seems to
open up some potential for unexpected interactions (which may be hard
to reproduce, esp. in error/recovery scenarios).

Case #2 takes a devid, but I notice btrfs_device::devid says, "the
internal btrfs device id".  How does a user obtain that internal value
so it can be set as a mount option?

Thanks,
Ed


> ::
>>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>> index 39ba59832f38..478623e6e074 100644
>>> --- a/fs/btrfs/volumes.c
>>> +++ b/fs/btrfs/volumes.c
>>> @@ -5270,6 +5270,16 @@ static int find_live_mirror(struct
>>> btrfs_fs_info *fs_info,
>>>     num = map->num_stripes;
>>>       switch(fs_info->read_mirror_policy) {
>>> +    case BTRFS_READ_MIRROR_BY_DEV:
>>> +    optimal = first;
>>> +    if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
>>> + >stripes[optimal].dev->dev_state))
>>> +    break;
>>> +    if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
>>> + >stripes[++optimal].dev->dev_state))
>>> +    break;
>>> +    optimal = first;
>>
>> you set optimal 2 times, the second one seems redundant.
>
>    No actually. When both the disks containing the stripe does not
>    have the BTRFS_DEV_STATE_READ_MIRROR, then I would just want to
>    use first found stripe.

 Yes, and the fact that you've already set optimal = first right after
 BTRFS_READ_MIRROR_BY_DEV ensures that, no ? Why do you need to again
 set
 optimal right before the final break? What am I missing here?
>>>
>>>    Ah. I think you are missing ++optimal in the 2nd if.
>>
>> You are right, but I'd prefer you index the stripes array with 'optimal'
>> and 'optimal + 1' and leave just a single assignment
> 
>  Ok. Will improve that.
> 
> Thanks, Anand
> 
> 
>>>
>>> Thanks, Anand
>>>
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 00/16] btrfs-progs: Split lowmem mode check to its own

2018-01-31 Thread Qu Wenruo


On 2018年02月01日 02:26, David Sterba wrote:
> On Fri, Jan 19, 2018 at 01:37:15PM +0800, Qu Wenruo wrote:
>> The long planned cmds-check re-construction is finally here.
>>
>> As the original cmds-check.c is getting larger and larger (already over
>> 15K lines), it's always a good idea to split it into its own check/
>> directory.
>>
>> This patchset do the following work:
>> 1) Move cmds-check.c to check/main.c
>> 2) Put codes shared by both original and lowmem mode into
>>check/common.[ch]
>> 3) Put lowmem code into check/lowmem.[ch]
>>With minor renaming to get rid of unnecessary _v2 suffix.
>>
>> The modification looks scary, but no functional change at all.
>>
>> And considering how much the file structure changed, it's a good idea to
>> put PART1 as quick as possible, and there will be less pressure to
>> rebase new incoming fsck related codes.
> 
> I agree with the way you split check, my attempts were going along the
> same lines. The question whether to split first and then add fixes shall
> be resolved as I'm going to merge the split first. This would need
> refreshing the lowmem fixes but the other way around would not avoid the
> merge conflicts anyway.
> 
> The patch 15 does not apply cleanly to current devel, the conflict did
> not seem big, but I did not try to resolve it and merge the last patch.
> So, 1-14 goes to devel, please refresh 15 and 16.

No problem.

Update under way.

> 
>> The real move work happens in the 15th patch, which due to its size
>> (500KB+), it may not be able to reach mail list.
>> So please fetch the whole patchset from github:
>> https://github.com/adam900710/btrfs-progs/tree/split_check
>>
>> There will be a part 2, mostly moving original mode to its own
>> check/original.[ch], along with extra comment explaining how the two
>> different modes work.
> 
> I'm going to do a few file renames, eg. original.c -> mode-original.c
> and similar.
> It would be better if you send part 2 after that.

Of course.
It would be a while for part 2.

> I'm
> expecting more patches that would do no functional change so we can
> settle down with the new check/ structure.
> 
> As 'check' is the 1st level command, the cmds-check.c will need to be
> restored at some point, but it will be only a simple wrapper around the
> commandline handling.

OK, I think it would be part of PART 2 to restore it.

Thanks,
Qu

> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 



signature.asc
Description: OpenPGP digital signature


[PATCH v2] Btrfs: fix unexpected cow in run_delalloc_nocow

2018-01-31 Thread Liu Bo
Fstests generic/475 provides a way to fail metadata reads while
checking if checksum exists for the inode inside run_delalloc_nocow(),
and csum_exist_in_range() interprets error (-EIO) as inode having
checksum and makes its caller enters the cow path.

In case of free space inode, this ends up with a warning in
cow_file_range().

The same problem applies for btrfs_cross_ref_exist() since it may also
read metadata in between.

With this, run_delalloc_nocow() bails out when errors occur at the two
places.

cc:  v2.6.28+
Fixes: 17d217fe970d ("Btrfs: fix nodatasum handling in balancing code")
Signed-off-by: Liu Bo 
---
v2: fix error handling after btrfs_cross_ref_exist().

 fs/btrfs/inode.c | 37 -
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e1a7f3c..8f2bd04 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1257,6 +1257,8 @@ static noinline int csum_exist_in_range(struct 
btrfs_fs_info *fs_info,
list_del(>list);
kfree(sums);
}
+   if (ret < 0)
+   return ret;
return 1;
 }
 
@@ -1386,10 +1388,23 @@ static noinline int run_delalloc_nocow(struct inode 
*inode,
goto out_check;
if (btrfs_extent_readonly(fs_info, disk_bytenr))
goto out_check;
-   if (btrfs_cross_ref_exist(root, ino,
- found_key.offset -
- extent_offset, disk_bytenr))
+   ret = btrfs_cross_ref_exist(root, ino,
+   found_key.offset -
+   extent_offset, disk_bytenr);
+   if (ret) {
+   /*
+* ret could be -EIO if the above fails to read
+* metadata.
+*/
+   if (ret < 0) {
+   if (cow_start != (u64)-1)
+   cur_offset = cow_start;
+   goto error;
+   }
+
+   WARN_ON_ONCE(nolock);
goto out_check;
+   }
disk_bytenr += extent_offset;
disk_bytenr += cur_offset - found_key.offset;
num_bytes = min(end + 1, extent_end) - cur_offset;
@@ -1407,10 +1422,22 @@ static noinline int run_delalloc_nocow(struct inode 
*inode,
 * this ensure that csum for a given extent are
 * either valid or do not exist.
 */
-   if (csum_exist_in_range(fs_info, disk_bytenr,
-   num_bytes)) {
+   ret = csum_exist_in_range(fs_info, disk_bytenr,
+ num_bytes);
+   if (ret) {
if (!nolock)
btrfs_end_write_no_snapshotting(root);
+
+   /*
+* ret could be -EIO if the above fails to read
+* metadata.
+*/
+   if (ret < 0) {
+   if (cow_start != (u64)-1)
+   cur_offset = cow_start;
+   goto error;
+   }
+   WARN_ON_ONCE(nolock);
goto out_check;
}
if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
-- 
2.9.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 00/16] btrfs-progs: Split lowmem mode check to its own

2018-01-31 Thread David Sterba
On Fri, Jan 19, 2018 at 01:37:15PM +0800, Qu Wenruo wrote:
> The long planned cmds-check re-construction is finally here.
> 
> As the original cmds-check.c is getting larger and larger (already over
> 15K lines), it's always a good idea to split it into its own check/
> directory.
> 
> This patchset do the following work:
> 1) Move cmds-check.c to check/main.c
> 2) Put codes shared by both original and lowmem mode into
>check/common.[ch]
> 3) Put lowmem code into check/lowmem.[ch]
>With minor renaming to get rid of unnecessary _v2 suffix.
> 
> The modification looks scary, but no functional change at all.
> 
> And considering how much the file structure changed, it's a good idea to
> put PART1 as quick as possible, and there will be less pressure to
> rebase new incoming fsck related codes.

I agree with the way you split check, my attempts were going along the
same lines. The question whether to split first and then add fixes shall
be resolved as I'm going to merge the split first. This would need
refreshing the lowmem fixes but the other way around would not avoid the
merge conflicts anyway.

The patch 15 does not apply cleanly to current devel, the conflict did
not seem big, but I did not try to resolve it and merge the last patch.
So, 1-14 goes to devel, please refresh 15 and 16.

> The real move work happens in the 15th patch, which due to its size
> (500KB+), it may not be able to reach mail list.
> So please fetch the whole patchset from github:
> https://github.com/adam900710/btrfs-progs/tree/split_check
> 
> There will be a part 2, mostly moving original mode to its own
> check/original.[ch], along with extra comment explaining how the two
> different modes work.

I'm going to do a few file renames, eg. original.c -> mode-original.c
and similar. It would be better if you send part 2 after that. I'm
expecting more patches that would do no functional change so we can
settle down with the new check/ structure.

As 'check' is the 1st level command, the cmds-check.c will need to be
restored at some point, but it will be only a simple wrapper around the
commandline handling.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] iversion: make inode_cmp_iversion{+raw} return bool instead of s64

2018-01-31 Thread Jeff Layton
On Wed, 2018-01-31 at 08:46 -0800, Linus Torvalds wrote:
> On Wed, Jan 31, 2018 at 4:29 AM, Jeff Layton  wrote:
> > 
> > Do you mind just taking it directly? I don't have anything else queued
> > up for this cycle.
> 
> Done.
> 

Thanks...and also many thanks for spotting the original issue. I agree
that this makes it much harder for the callers to get things wrong (and
is probably much more efficient on some arches, as Ted pointed out).

> I wonder if "false for same, true for different" calling convention
> makes much sense, but it matches the old "0 for same" so obviously
> makes for a smaller diff.
> 
> If it ever ends up confusing people, maybe the sense of that function
> should be reversed, and the name changed to something like
> "same_inode_version()" or something.
> 
> But at least for now the situation seems ok to me,
> 

G. Baroncelli suggested changing the name too, so maybe we should just
go ahead and do it. Let me think on what the best approach is and I may
try to send another patch or PR before the end of the merge window.

Cheers,
-- 
Jeff Layton 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] iversion: make inode_cmp_iversion{+raw} return bool instead of s64

2018-01-31 Thread Linus Torvalds
On Wed, Jan 31, 2018 at 4:29 AM, Jeff Layton  wrote:
>
> Do you mind just taking it directly? I don't have anything else queued
> up for this cycle.

Done.

I wonder if "false for same, true for different" calling convention
makes much sense, but it matches the old "0 for same" so obviously
makes for a smaller diff.

If it ever ends up confusing people, maybe the sense of that function
should be reversed, and the name changed to something like
"same_inode_version()" or something.

But at least for now the situation seems ok to me,

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/2] Policy to balance read across mirrored devices

2018-01-31 Thread Peter Becker
ok, i understood the commitmessage as if the behavior for tests is
more of a bonus

2018-01-30 7:30 GMT+01:00 Anand Jain :
> This __also__ helps testing

then it's clear to me. would only be good that this behavior is
documented. not that anyone else, like me, tries to use this as
performance tuning. at least the feature with the devid.

Thanks Austin,
Thanks Anand

2018-01-31 17:11 GMT+01:00 Austin S. Hemmelgarn :
> On 2018-01-31 09:52, Peter Becker wrote:
>>
>> This is all clear. My question referes to "use the lower devid disk
>> containing the stripe"
>>
>> 2018-01-31 10:01 GMT+01:00 Anand Jain :
>>>
>>>   When a stripe is not present on the read optimized disk it will just
>>>   use the lower devid disk containing the stripe (instead of failing back
>>>   to the pid based random disk).
>>
>>
>> Use only one disk (the disk with the lowest devid that containing the
>> stripe) as fallback should be not a good option imho.
>> Instead of it should still be used the pid as fallback to distribute
>> the workload among all available drives.
>>
>> [stripe to use] = [preffer stripes present on read_mirror_policy
>> devids] > [fallback to pid % stripe count]
>>
>> Perhaps I'm not be able to express myself in English or did I
>> misunderstand you?
>
> Unless I'm seriously misunderstanding the commit messages, the primary
> purpose of having this as an option at all is for testing.  The fact that it
> happens to allow semantics similar to MD's write-mostly flag when dealing
> with a 2-device raid1 profile is just a bonus.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: fsck-tests: Rename tree-reloc-tree test number

2018-01-31 Thread David Sterba
On Wed, Jan 31, 2018 at 10:40:36AM +0800, Qu Wenruo wrote:
> There are 2 fsck tests with the same number 027:
> tree-reloc-tree
> bad-extent-inline-ref-type
> 
> And we also have a hole in 015, so just rename tree-reloc-tree to 015,
> to get rid of the duplicated test number and fill in the hole.
> 
> Signed-off-by: Qu Wenruo 

Applied thanks. I've changed the subject so it mentions the numbers for
future reference.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 7/7] btrfs-progs: Cleanup use of root in leaf_data_end

2018-01-31 Thread David Sterba
On Wed, Jan 31, 2018 at 11:09:19AM +0800, Gu Jinxiang wrote:
> In function leaf_data_end, root is just used to get fs_info,
> so change the parameter of this function from btrfs_root to
> btrfs_fs_info.
> And also make it consistent with kernel.
> 
> Changelog:
> v3->v2:
> Add const to parameter leaf of function btrfs_item_offset_nr
> to keep type consistent with leaf_data_end.
> 
> Signed-off-by: Gu Jinxiang 
> Reviewed-by: Qu Wenruo 

I did not notice the V3, this patch has been added to devel as the
warning is gone, thanks. The was trivial after all.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 0/7] btrfs-progs: Do some clean up to be consistent

2018-01-31 Thread David Sterba
On Fri, Jan 26, 2018 at 04:18:55PM +0800, Qu Wenruo wrote:
> 
> 
> On 2018年01月26日 15:25, Gu Jinxiang wrote:
> > Patch 1~4 and 7: clean up use of btrfs_root.
> > Patch 5: remove redundancy value assignment.
> > Patch 6: remove no longer be used function define.
> 
> Overall it looks good, just one small nitpick, which should not need
> another update.
> 
> > Changelog:
> > v2->v1:
> > Patch 2,4: To be consistent with kernel, change macro to inline function.
> > Patch 3:
> > Change macro to inline function to be consistent with kernel.
> > And change the function body to match kernel.
> > 
> > Gu Jinxiang (7):
> >   btrfs-progs: Use fs_info instead of root for BTRFS_LEAF_DATA_SIZE
> >   btrfs-progs: Use fs_info instead of root for BTRFS_NODEPTRS_PER_BLOCK
> >   btrfs-progs: Use fs_info instead of root for
> > BTRFS_MAX_INLINE_DATA_SIZE
> 
> Not only this patch modified the parameter but also sync the code with
> kernel.
> It would be better to mention it in commit message, but anyway, just a
> small nitpick, nothing to worry about.

I've updated the changelogs so it mentions the kernel code sync.

Patches 1-6 applied to devel as they're fairly independend of the other
patchsets waiting fo merge.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 7/7] btrfs-progs: Cleanup use of root in leaf_data_end

2018-01-31 Thread David Sterba
On Tue, Jan 30, 2018 at 03:36:28PM +0800, Qu Wenruo wrote:
> On 2018年01月26日 15:26, Gu Jinxiang wrote:
> > In function leaf_data_end, root is just used to get fs_info,
> > so change the parameter of this function from btrfs_root to
> > btrfs_fs_info.
> > And also make it consistent with kernel.
> > 
> > Signed-off-by: Gu Jinxiang 
> > ---
> >  ctree.c | 32 +---
> >  1 file changed, 17 insertions(+), 15 deletions(-)
> > 
> > diff --git a/ctree.c b/ctree.c
> > index 11d207e7..2417483d 100644
> > --- a/ctree.c
> > +++ b/ctree.c
> > @@ -410,12 +410,12 @@ static int btrfs_comp_keys(struct btrfs_disk_key 
> > *disk, struct btrfs_key *k2)
> >   * this returns the address of the start of the last item,
> >   * which is the stop of the leaf data stack
> >   */
> > -static inline unsigned int leaf_data_end(struct btrfs_root *root,
> > -struct extent_buffer *leaf)
> > +static inline unsigned int leaf_data_end(const struct btrfs_fs_info 
> > *fs_info,
> > +const struct extent_buffer *leaf)
> 
> Here leaf is const, which is fine.
> 
> >  {
> > u32 nr = btrfs_header_nritems(leaf);
> > if (nr == 0)
> > -   return BTRFS_LEAF_DATA_SIZE(root->fs_info);
> > +   return BTRFS_LEAF_DATA_SIZE(fs_info);
> > return btrfs_item_offset_nr(leaf, nr - 1);
> 
> But btrfs_item_offset_nr() doesn't have const prefix for leaf.
> Leaving the following warning:
> 
> passing argument 1 of ‘btrfs_item_offset_nr’ discards ‘const’ qualifier
> from pointer target type [-Wdiscarded-qualifiers]
>   return btrfs_item_offset_nr(leaf, nr - 1);
>   ^~~~

I don't want to add the warning, so I'll skip this patch for now. The
constification of arguments will likely propagate to more helpers so
it's not a trivial fix to this patch.

Gu Jinxiang, please fix the warning and resend with possibly more
preparatory cleanups if necessary. Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/2] Policy to balance read across mirrored devices

2018-01-31 Thread Austin S. Hemmelgarn

On 2018-01-31 09:52, Peter Becker wrote:

This is all clear. My question referes to "use the lower devid disk
containing the stripe"

2018-01-31 10:01 GMT+01:00 Anand Jain :

  When a stripe is not present on the read optimized disk it will just
  use the lower devid disk containing the stripe (instead of failing back
  to the pid based random disk).


Use only one disk (the disk with the lowest devid that containing the
stripe) as fallback should be not a good option imho.
Instead of it should still be used the pid as fallback to distribute
the workload among all available drives.

[stripe to use] = [preffer stripes present on read_mirror_policy
devids] > [fallback to pid % stripe count]

Perhaps I'm not be able to express myself in English or did I misunderstand you?
Unless I'm seriously misunderstanding the commit messages, the primary 
purpose of having this as an option at all is for testing.  The fact 
that it happens to allow semantics similar to MD's write-mostly flag 
when dealing with a 2-device raid1 profile is just a bonus.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: fix null pointer dereference when replacing missing device

2018-01-31 Thread David Sterba
On Wed, Jan 31, 2018 at 11:25:49AM +0800, Anand Jain wrote:
> > This is trivially reproduced by running the test btrfs/027 from fstests
> > like this:
> > 
> >$ MOUNT_OPTIONS="-o discard" ./check btrfs/027
> > 
> > Fix this by skipping devices without a backing device before attempting
> > to discard.
> > 
> > Fixes: 38b5f68e9811 ("btrfs: drop btrfs_device::can_discard to query 
> > directly")
> > Signed-off-by: Filipe Manana 
>   oh no, I missed the context of degraded when writing this patch.
>   Thanks for the fix.
> 
>   Reviewed-by: Anand Jain 

Added to next and will add it to the next 4.16 pull.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 1/2] btrfs: Refactor __get_raid_index() to btrfs_bg_flags_to_raid_index()

2018-01-31 Thread David Sterba
On Tue, Jan 30, 2018 at 06:20:45PM +0800, Qu Wenruo wrote:
> Function __get_raid_index() is used to convert block group flags into
> raid index, which can be used to get various info directly from
> btrfs_raid_array[].
> 
> Refactor this function a little:
> 
> 1) Rename to btrfs_bg_flags_to_raid_index()
>Double underline prefix is normally for internal functions, while the
>function is used by both extent-tree and volumes.
> 
>Although the name is a little longer, but it should explain its usage
>quite well.
> 
> 2) Move it to volumes.h and make it static inline
>Just several if-else branches, really no need to define it as a normal
>function.
> 
>This also makes later code re-use between kernel and btrfs-progs
>easier.
> 
> 3) Remove function get_block_group_index()
>Really no need to do such a simple thing as an exported function.

Agreed on this kind of cleanups. Patches added to next, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] btrfs: Handle btrfs_set_extent_delalloc failure in relocate_file_extent_cluster

2018-01-31 Thread Nikolay Borisov
Essentially duplicate the error handling from the above block which
handles the !PageUptodate(page) case and additionally clear
EXTENT_BOUNDARY.

Signed-off-by: Nikolay Borisov 
Reviewed-by: Josef Bacik 
---

V2: 
 * Remove unrelated whitespace fix 

 fs/btrfs/relocation.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f0c3f00e97cb..cd2298d185dd 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3268,8 +3268,22 @@ static int relocate_file_extent_cluster(struct inode 
*inode,
nr++;
}
 
-   btrfs_set_extent_delalloc(inode, page_start, page_end, 0, NULL,
- 0);
+   ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
+   NULL, 0);
+   if (ret) {
+   unlock_page(page);
+   put_page(page);
+   btrfs_delalloc_release_metadata(BTRFS_I(inode),
+PAGE_SIZE);
+   btrfs_delalloc_release_extents(BTRFS_I(inode),
+  PAGE_SIZE);
+
+   clear_extent_bits(_I(inode)->io_tree,
+ page_start, page_end,
+ EXTENT_LOCKED | EXTENT_BOUNDARY);
+   goto out;
+
+   }
set_page_dirty(page);
 
unlock_extent(_I(inode)->io_tree,
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: volumes: Cleanup stripe size calculation

2018-01-31 Thread David Sterba
On Wed, Jan 31, 2018 at 02:16:34PM +0800, Qu Wenruo wrote:
> Cleanup the following things:
> 1) open coded SZ_16M round up
> 2) use min() to replace open-coded size comparison
> 3) code style
> 
> Signed-off-by: Qu Wenruo 

Added to next, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] btrfs: remove spurious WARN_ON(ref->count < 0) in find_parent_nodes

2018-01-31 Thread David Sterba
On Wed, Jan 24, 2018 at 11:33:08AM +0800, Lu Fengqi wrote:
> On Tue, Jan 23, 2018 at 10:22:09PM -0500, Zygo Blaxell wrote:
> >Until v4.14, this warning was very infrequent:
> >
> > WARNING: CPU: 3 PID: 18172 at fs/btrfs/backref.c:1391 
> > find_parent_nodes+0xc41/0x14e0
> > Modules linked in: [...]
> > CPU: 3 PID: 18172 Comm: bees Tainted: G  D WL  4.11.9-zb64+ #1
> > Hardware name: System manufacturer System Product Name/M5A78L-M/USB3, 
> > BIOS 210112/02/2014
> > Call Trace:
> >  dump_stack+0x85/0xc2
> >  __warn+0xd1/0xf0
> >  warn_slowpath_null+0x1d/0x20
> >  find_parent_nodes+0xc41/0x14e0
> >  __btrfs_find_all_roots+0xad/0x120
> >  ? extent_same_check_offsets+0x70/0x70
> >  iterate_extent_inodes+0x168/0x300
> >  iterate_inodes_from_logical+0x87/0xb0
> >  ? iterate_inodes_from_logical+0x87/0xb0
> >  ? extent_same_check_offsets+0x70/0x70
> >  btrfs_ioctl+0x8ac/0x2820
> >  ? lock_acquire+0xc2/0x200
> >  do_vfs_ioctl+0x91/0x700
> >  ? __fget+0x112/0x200
> >  SyS_ioctl+0x79/0x90
> >  entry_SYSCALL_64_fastpath+0x23/0xc6
> >  ? trace_hardirqs_off_caller+0x1f/0x140
> >
> >Starting with v4.14 (specifically 86d5f9944252 ("btrfs: convert prelimary
> >reference tracking to use rbtrees")) the WARN_ON occurs three orders of
> >magnitude more frequently--almost once per second while running workloads
> >like bees.
> >
> >Replace the WARN_ON() with a comment rationale for its removal.
> >The rationale is paraphrased from an explanation by Edmund Nadolski
> > on the linux-btrfs mailing list.
> >
> >Fixes: 8da6d5815c59 ("Btrfs: added btrfs_find_all_roots()")
> >Signed-off-by: Zygo Blaxell 
> 
> Reviewed-by: Lu Fengqi 

Added to next, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Btrfs progs pre-release 4.15-rc1

2018-01-31 Thread David Sterba
Hi,

a pre-release has been tagged.

The only significant change is removing the custom chunk allocator from mkfs
for the --rootdir option.

ETA for 4.15 is in +2 days (2018-02-02).

Changes:

* mkfs --rootdir reworked, does not minimize the final image but can be still
  done using a new option --shrink
* fix allocation of system chunk, don't allocate from the reserved area
* other
  * new and updated tests
  * cleanups, refactoring
  * doc updates

Tarballs: https://www.kernel.org/pub/linux/kernel/people/kdave/btrfs-progs/
Git: git://git.kernel.org/pub/scm/linux/kernel/git/kdave/btrfs-progs.git

Shortlog:

David Sterba (14):
  btrfs-progs: docs: update manual for mkfs --shrink
  btrfs-progs: tests: fix typo in error message
  btrfs-progs: tests: bump zstd version in CI to 1.3.3
  btrfs-progs: build: simplify version tracking
  btrfs-progs: tests: 029-super-recovery: cleanup the test
  btrfs-progs: tests: add more coverage to 
mkfs-tests/013-reserved-1M-for-single
  btrfs-progs: tests: truncate test image to 0 first
  btrfs-progs: don't clobber errno in close_file_or_dir
  btrfs-progs: build: update help text for zstd
  btrfs-progs: docs: clean all generated files
  btrfs-progs: tests: disable some mkfs/010 testcases inside travis
  btrfs-progs: tests: enhance common umount helper to take optional paths
  btrfs-progs: tests: fixup mount tests of 
fsck/028-unaligned-super-dev-sizes
  btrfs-progs: update CHANGES for v4.15

Gu Jinxiang (2):
  btrfs-progs: Remove unused parameter trans
  btrfs-progs: build: Remove unused variable TESTS

Hans van Kranenburg (2):
  btrfs-progs: Fix progs_extra build dependencies
  btrfs-progs: Fix build of btrfs-calc-size

Lu Fengqi (2):
  btrfs-progs: qgroup: move btrfs_show_qgroups's error handler to 
__qgroup_search
  btrfs-progs: qgroup: cleanup __qgroup_search

Nikolay Borisov (8):
  btrfs-progs: Print error on invalid extent item format during check
  btrfs-progs: tests: Explictly state test.sh must be executable
  btrfs-progs: Factor out common print_device_info
  btrfs-progs: Remove recover_get_good_super
  btrfs-progs: Replace usage of list_for_each with list_for_each_entry
  btrfs-progs: Document logic of btrfs_read_dev_super
  btrfs-progs: super-recover: fix the broken sb detection
  btrfs-progs: tests: Add test for super block recovery

Qu Wenruo (22):
  btrfs-progs: mkfs: move image creation of rootdir to its own files
  btrfs-progs: mkfs: move source dir size calculation to its own files
  btrfs-progs: test/mkfs: Test if the minimal device size is valid
  btrfs-progs: mkfs/rootdir: Introduce function to get end position of last 
device extent
  btrfs-progs: mkfs: Update allocation info before verbose output
  btrfs-progs: mkfs: Cleanup temporary chunks before filling rootdir
  btrfs-progs: mkfs: Don't use custom chunk allocator for rootdir
  btrfs-progs: mkfs/rootdir: Use over-reserve method to make size estimate 
easier
  btrfs-progs: mkfs/rootdir: Shrink fs for rootdir option
  btrfs-progs: mkfs: Separate shrink from rootdir
  btrfs-progs: mkfs: fix regression preventing --rootdir to create file
  btrfs-progs: tests/mkfs: Introduce test case to check if mkfs rootdir can 
create a new file
  btrfs-progs: mkfs: Use the whole file or block device to mkfs for rootdir
  btrfs-progs: tests/mkfs: verify that mkfs.btrfs rootdir+shrink behaves 
correctly
  btrfs-progs: mkfs: Prevent temporary system chunk to use space in 
reserved 1M range
  btrfs-progs: tests:mkfs/010: Output minimal device size
  btrfs-progs: tests: mkfs: don't overwrite first 1M for single
  btrfs-progs: Use bool parameter to determine if we're allocating data 
extent
  btrfs-progs: volumes: Make find_free_dev_extent_start static
  btrfs-progs: volumes: Remove unnecessary trans parameter
  btrfs-progs: volumes: Remove unnecessary parameters when allocating 
device extent
  btrfs-progs: Remove unnecessary parameter for btrfs_add_block_group

Rosen Penev (1):
  btrfs-progs: treewide: Replace strerror(errno) with %m.

Su Yue (1):
  btrfs-progs: check: report more specific info about invalid location

William Giokas (1):
  btrfs-progs: docs: fix typo in btrfs-filesystem manual page

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RESEND PATCH] btrfs: Handle btrfs_set_extent_delalloc failure in relocate_file_extent_cluster

2018-01-31 Thread David Sterba
On Wed, Jan 31, 2018 at 12:53:38PM +0200, Nikolay Borisov wrote:
> > This is an unrelated change. Please don't mix pure white
> > space/indentation changes with functional changes.
> 
> David seems rather adamant in not accepting pure whitespace/indention
> changes on their own so I don't see a way to actually improve the code
> base in that regard unless i slip them up when modifying nearby code.

If it's an unrelated change, than it's wrong to slip it in just because
it's near. It's still an unrelated change. Other than that, and I think
I mentioned that in your previous attempts to add whitespace changes,
it just pollutes the commit history. Looking for a commit that
potentially broke some code and finding a whitespace change just makes
anybody grumpy. Maintaner should know and not let such changes in. I've
been on both sides, and based on this experience I will not let in such
changes.

The review process in the mailinglist is there to catch that and point
out, though pointing out just whitespace is kind of not welcome, unless
the real review is also done. If there are minor things to fix, I do
that at commit time, which means I edit majority of all patches or
changelogs. In some cases I will let the patch author know so I don't
have to fix that over and over again. (But it never lasts.)

> There are a couple of space with trailing whitespace which I constantly
> select out from my commits.
> 
> Given that you have now also expressed objection to such cleanups, how
> should they eventually be fixed?

They will be fixed once the code on those lines gets changed. Which may
not anytime soon.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/2] Policy to balance read across mirrored devices

2018-01-31 Thread Peter Becker
This is all clear. My question referes to "use the lower devid disk
containing the stripe"

2018-01-31 10:01 GMT+01:00 Anand Jain :
>  When a stripe is not present on the read optimized disk it will just
>  use the lower devid disk containing the stripe (instead of failing back
>  to the pid based random disk).

Use only one disk (the disk with the lowest devid that containing the
stripe) as fallback should be not a good option imho.
Instead of it should still be used the pid as fallback to distribute
the workload among all available drives.

[stripe to use] = [preffer stripes present on read_mirror_policy
devids] > [fallback to pid % stripe count]

Perhaps I'm not be able to express myself in English or did I misunderstand you?

2018-01-31 15:26 GMT+01:00 Anand Jain :
>
>
> On 01/31/2018 06:47 PM, Peter Becker wrote:
>>
>> 2018-01-31 10:01 GMT+01:00 Anand Jain :
>>>
>>>   When a stripe is not present on the read optimized disk it will just
>>>   use the lower devid disk containing the stripe (instead of failing back
>>>   to the pid based random disk).
>>
>>
>> Is this a good behavior? beause this would eliminate every performance
>> benefit of the pid base random disk pick if the requested stripe is
>> not present on the read optimized disk.
>> Wouldn't it be better to specify a fallback and use the pid base
>> random pick as default for the fallback.
>>
>> For example:
>>
>> RAID 1 over 4 disk's
>>
>> devid | rpm | size
>> 
>> 1 | 7200 rpm | 3 TB
>> 2 | 7200 rpm | 3 TB
>> 3 | 5400 rpm | 4 TB
>> 4 | 5400 rpm | 4 TB
>>
>> mount -o read_mirror_policy=1,read_mirror_policy=2
>>
>> Cases:
>> 1. if the requested stripe is on devid 3 and 4 the algorithm should
>> choise on of both randomly to incresse performance instead of read
>> everytime from 3 and never from 4
>> 2. if the requested stripe is on devid 1 and 3, all is fine ( in case
>> of the queue deep of 1 isn't mutch larger then the queue deep of 3 )
>> 3. if the requested stripe is on devid 1 and 2, the algorithm should
>> choise on of both randomly to incresse performance instead of read
>> everytime from 1 and never from 2
>
>>
>>
>> And all randomly picks of a device should be replaced by a heuristic
>> algorithm wo respect the queue deep and sequential reads in the
>> future.
>
>
>  This scenario is very well handled by the pid/heuristic based
>  read load balancer, pid based read load balancer is by default still,
>  Tim has written IO load based read balancer which can be set using
>  this mount option when all integrated together, and it needs
>  experiments to see if it can be by default replacing the pid method.
>  Further as of now we don't do allocation grouping, so if you have two
>  ssd and two hd in a RAID1 its not guaranteed that allocation will
>  always span across a SSD and a HD, so there is bit of randomness
>  in the allocation itself.
>
> Thanks, Anand
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] btrfs: add read_mirror_policy parameter devid

2018-01-31 Thread Anand Jain



On 01/31/2018 09:42 PM, Nikolay Borisov wrote:



So usually this should be functionality handled by the raid/san
controller I guess, > but given that btrfs is playing the role of a
controller here at what point are we drawing the line of not
implementing block-level functionality into the filesystem ?


  Don't worry this is not invading into the block layer. How
  can you even build this functionality in the block layer ?
  Block layer even won't know that disks are mirrored. RAID
  does or BTRFS in our case.



By block layer I guess I meant the storage driver of a particular raid
card. Because what is currently happening is re-implementing
functionality that will generally sit in the driver. So my question was
more generic and high-level - at what point do we draw the line of
implementing feature that are generally implemented in hardware devices
(be it their drivers or firmware).


 Not all HW configs use RAID capable HBAs. A server connected to a SATA
 JBOD using a SATA HBA without MD will relay on BTRFS to provide all the
 features and capabilities that otherwise would have provided by such a
 presumable HW config.

::


::

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 39ba59832f38..478623e6e074 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5270,6 +5270,16 @@ static int find_live_mirror(struct
btrfs_fs_info *fs_info,
    num = map->num_stripes;
      switch(fs_info->read_mirror_policy) {
+    case BTRFS_READ_MIRROR_BY_DEV:
+    optimal = first;
+    if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
+ >stripes[optimal].dev->dev_state))
+    break;
+    if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
+ >stripes[++optimal].dev->dev_state))
+    break;
+    optimal = first;


you set optimal 2 times, the second one seems redundant.


   No actually. When both the disks containing the stripe does not
   have the BTRFS_DEV_STATE_READ_MIRROR, then I would just want to
   use first found stripe.


Yes, and the fact that you've already set optimal = first right after
BTRFS_READ_MIRROR_BY_DEV ensures that, no ? Why do you need to again set
optimal right before the final break? What am I missing here?


   Ah. I think you are missing ++optimal in the 2nd if.


You are right, but I'd prefer you index the stripes array with 'optimal'
and 'optimal + 1' and leave just a single assignment


 Ok. Will improve that.

Thanks, Anand




Thanks, Anand


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/2] Policy to balance read across mirrored devices

2018-01-31 Thread Anand Jain



On 01/31/2018 06:47 PM, Peter Becker wrote:

2018-01-31 10:01 GMT+01:00 Anand Jain :

  When a stripe is not present on the read optimized disk it will just
  use the lower devid disk containing the stripe (instead of failing back
  to the pid based random disk).


Is this a good behavior? beause this would eliminate every performance
benefit of the pid base random disk pick if the requested stripe is
not present on the read optimized disk.
Wouldn't it be better to specify a fallback and use the pid base
random pick as default for the fallback.

For example:

RAID 1 over 4 disk's

devid | rpm | size

1 | 7200 rpm | 3 TB
2 | 7200 rpm | 3 TB
3 | 5400 rpm | 4 TB
4 | 5400 rpm | 4 TB

mount -o read_mirror_policy=1,read_mirror_policy=2

Cases:
1. if the requested stripe is on devid 3 and 4 the algorithm should
choise on of both randomly to incresse performance instead of read
everytime from 3 and never from 4
2. if the requested stripe is on devid 1 and 3, all is fine ( in case
of the queue deep of 1 isn't mutch larger then the queue deep of 3 )
3. if the requested stripe is on devid 1 and 2, the algorithm should
choise on of both randomly to incresse performance instead of read
everytime from 1 and never from 2

>

And all randomly picks of a device should be replaced by a heuristic
algorithm wo respect the queue deep and sequential reads in the
future.


 This scenario is very well handled by the pid/heuristic based
 read load balancer, pid based read load balancer is by default still,
 Tim has written IO load based read balancer which can be set using
 this mount option when all integrated together, and it needs
 experiments to see if it can be by default replacing the pid method.
 Further as of now we don't do allocation grouping, so if you have two
 ssd and two hd in a RAID1 its not guaranteed that allocation will
 always span across a SSD and a HD, so there is bit of randomness
 in the allocation itself.

Thanks, Anand
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] btrfs: add read_mirror_policy parameter devid

2018-01-31 Thread Nikolay Borisov


On 31.01.2018 15:38, Anand Jain wrote:
> 
> 
> On 01/31/2018 05:54 PM, Nikolay Borisov wrote:
>>
>>
>> On 31.01.2018 11:28, Anand Jain wrote:
>>>
>>>
>>> On 01/31/2018 04:38 PM, Nikolay Borisov wrote:


 On 30.01.2018 08:30, Anand Jain wrote:
> Adds the mount option:
>     mount -o read_mirror_policy=
>
> To set the devid of the device which should be used for read. That
> means all the normal reads will go to that particular device only.
>
> This also helps testing and gives a better control for the test
> scripts including mount context reads.

 Some code comments below. OTOH, does such policy really make sense,
 what
 happens if the selected device fails, will the other mirror be retried?
>>>
>>>   Everything as usual, read_mirror_policy=devid just lets the user to
>>>   specify his read optimized disk, so that we don't depend on the pid
>>>   to pick a stripe mirrored disk, and instead we would pick as suggested
>>>   by the user, and if that disk fails then we go back to the other
>>> mirror
>>>   which may not be the read optimized disk as we have no other choice.
>>>
 If the answer to the previous question is positive then why do we
 really
 care which device is going to be tried first?
>>>
>>>   It matters.
>>>     - If you are reading from both disks alternatively, then it
>>>   duplicates the LUN cache on the storage.
>>>     - Some disks are read-optimized and using that for reading and going
>>>   back to the other disk only when this disk fails provides a better
>>>   overall read performance.
>>
>> So usually this should be functionality handled by the raid/san
>> controller I guess, > but given that btrfs is playing the role of a
>> controller here at what point are we drawing the line of not
>> implementing block-level functionality into the filesystem ?
> 
>  Don't worry this is not invading into the block layer. How
>  can you even build this functionality in the block layer ?
>  Block layer even won't know that disks are mirrored. RAID
>  does or BTRFS in our case.
> 

By block layer I guess I meant the storage driver of a particular raid
card. Because what is currently happening is re-implementing
functionality that will generally sit in the driver. So my question was
more generic and high-level - at what point do we draw the line of
implementing feature that are generally implemented in hardware devices
(be it their drivers or firmware).

>>> ::
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 39ba59832f38..478623e6e074 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -5270,6 +5270,16 @@ static int find_live_mirror(struct
> btrfs_fs_info *fs_info,
>    num = map->num_stripes;
>      switch(fs_info->read_mirror_policy) {
> +    case BTRFS_READ_MIRROR_BY_DEV:
> +    optimal = first;
> +    if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
> + >stripes[optimal].dev->dev_state))
> +    break;
> +    if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
> + >stripes[++optimal].dev->dev_state))
> +    break;
> +    optimal = first;

 you set optimal 2 times, the second one seems redundant.
>>>
>>>   No actually. When both the disks containing the stripe does not
>>>   have the BTRFS_DEV_STATE_READ_MIRROR, then I would just want to
>>>   use first found stripe.
>>
>> Yes, and the fact that you've already set optimal = first right after
>> BTRFS_READ_MIRROR_BY_DEV ensures that, no ? Why do you need to again set
>> optimal right before the final break? What am I missing here?
> 
>   Ah. I think you are missing ++optimal in the 2nd if.

You are right, but I'd prefer you index the stripes array with 'optimal'
and 'optimal + 1' and leave just a single assignment

> 
> Thanks, Anand
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] btrfs: add read_mirror_policy parameter devid

2018-01-31 Thread Anand Jain



On 01/31/2018 05:54 PM, Nikolay Borisov wrote:



On 31.01.2018 11:28, Anand Jain wrote:



On 01/31/2018 04:38 PM, Nikolay Borisov wrote:



On 30.01.2018 08:30, Anand Jain wrote:

Adds the mount option:
    mount -o read_mirror_policy=

To set the devid of the device which should be used for read. That
means all the normal reads will go to that particular device only.

This also helps testing and gives a better control for the test
scripts including mount context reads.


Some code comments below. OTOH, does such policy really make sense, what
happens if the selected device fails, will the other mirror be retried?


  Everything as usual, read_mirror_policy=devid just lets the user to
  specify his read optimized disk, so that we don't depend on the pid
  to pick a stripe mirrored disk, and instead we would pick as suggested
  by the user, and if that disk fails then we go back to the other mirror
  which may not be the read optimized disk as we have no other choice.


If the answer to the previous question is positive then why do we really
care which device is going to be tried first?


  It matters.
    - If you are reading from both disks alternatively, then it
  duplicates the LUN cache on the storage.
    - Some disks are read-optimized and using that for reading and going
  back to the other disk only when this disk fails provides a better
  overall read performance.


So usually this should be functionality handled by the raid/san
controller I guess, > but given that btrfs is playing the role of a
controller here at what point are we drawing the line of not
implementing block-level functionality into the filesystem ?


 Don't worry this is not invading into the block layer. How
 can you even build this functionality in the block layer ?
 Block layer even won't know that disks are mirrored. RAID
 does or BTRFS in our case.


::

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 39ba59832f38..478623e6e074 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5270,6 +5270,16 @@ static int find_live_mirror(struct
btrfs_fs_info *fs_info,
   num = map->num_stripes;
     switch(fs_info->read_mirror_policy) {
+    case BTRFS_READ_MIRROR_BY_DEV:
+    optimal = first;
+    if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
+ >stripes[optimal].dev->dev_state))
+    break;
+    if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
+ >stripes[++optimal].dev->dev_state))
+    break;
+    optimal = first;


you set optimal 2 times, the second one seems redundant.


  No actually. When both the disks containing the stripe does not
  have the BTRFS_DEV_STATE_READ_MIRROR, then I would just want to
  use first found stripe.


Yes, and the fact that you've already set optimal = first right after
BTRFS_READ_MIRROR_BY_DEV ensures that, no ? Why do you need to again set
optimal right before the final break? What am I missing here?


  Ah. I think you are missing ++optimal in the 2nd if.

Thanks, Anand
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] iversion: make inode_cmp_iversion{+raw} return bool instead of s64

2018-01-31 Thread Jeff Layton
On Tue, 2018-01-30 at 12:53 -0800, Linus Torvalds wrote:
> Ack. Should I expect this in a future pull request, or take it directly?
> 
> There's no hurry about this, since none of the existing users of that
> function actually do anything but test the return value against zero,
> and nobody saves it into anything but a "bool" (which has magical
> casting properties and does not lose upper bits).
> 
>   Linus

Do you mind just taking it directly? I don't have anything else queued
up for this cycle.

Thanks,
-- 
Jeff Layton 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] btrfs: add read_mirror_policy parameter devid

2018-01-31 Thread Anand Jain



On 01/31/2018 04:38 PM, Nikolay Borisov wrote:



On 30.01.2018 08:30, Anand Jain wrote:

Adds the mount option:
   mount -o read_mirror_policy=

To set the devid of the device which should be used for read. That
means all the normal reads will go to that particular device only.

This also helps testing and gives a better control for the test
scripts including mount context reads.


Some code comments below. OTOH, does such policy really make sense, what
happens if the selected device fails, will the other mirror be retried?


 Everything as usual, read_mirror_policy=devid just lets the user to
 specify his read optimized disk, so that we don't depend on the pid
 to pick a stripe mirrored disk, and instead we would pick as suggested
 by the user, and if that disk fails then we go back to the other mirror
 which may not be the read optimized disk as we have no other choice.


If the answer to the previous question is positive then why do we really
care which device is going to be tried first?


 It matters.
   - If you are reading from both disks alternatively, then it
 duplicates the LUN cache on the storage.
   - Some disks are read-optimized and using that for reading and going
 back to the other disk only when this disk fails provides a better
 overall read performance.

::

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 39ba59832f38..478623e6e074 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5270,6 +5270,16 @@ static int find_live_mirror(struct btrfs_fs_info 
*fs_info,
num = map->num_stripes;
  
  	switch(fs_info->read_mirror_policy) {

+   case BTRFS_READ_MIRROR_BY_DEV:
+   optimal = first;
+   if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
+>stripes[optimal].dev->dev_state))
+   break;
+   if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
+>stripes[++optimal].dev->dev_state))
+   break;
+   optimal = first;


you set optimal 2 times, the second one seems redundant.


 No actually. When both the disks containing the stripe does not
 have the BTRFS_DEV_STATE_READ_MIRROR, then I would just want to
 use first found stripe.


Alongside this
patch it makes sense to also send a patch to btrfs(5) man page
describing the mount option + description of each implemented allocation
policy.


 Yep. Will do.


Another thing which I don't see here is how you are handling the case
when you have more than 2 devices in the RAID1 case. As it stands
currently you assume there are two devices and first test device 0 and
then device 1 and completely ignore any other devices.


 Not really. That part is already handled by the extent mapping.
 As the number of stripe for raid1 is two, the extent mapping will
 manage put related two devices of this stripe.

Thanks, Anand
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] btrfs: add mount option read_mirror_policy

2018-01-31 Thread Anand Jain



On 01/31/2018 04:06 PM, Nikolay Borisov wrote:


diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1a462ab85c49..4759e988b0df 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1100,6 +1100,8 @@ struct btrfs_fs_info {
spinlock_t ref_verify_lock;
struct rb_root block_tree;
  #endif
+   /* Policy to balance read across mirrored devices */
+   int read_mirror_policy;


make that member enum btrfs_read_mirror_type


 yep. Will do.

::


diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a61715677b67..39ba59832f38 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5269,7 +5269,13 @@ static int find_live_mirror(struct btrfs_fs_info 
*fs_info,
else
num = map->num_stripes;
  
-	optimal = first + current->pid % num;

+   switch(fs_info->read_mirror_policy) {
+   case BTRFS_READ_MIRROR_DEFAULT:
+   case BTRFS_READ_MIRROR_BY_PID:
+   default:
+   optimal = first + current->pid % num;
+   break;
+   }


Why not factor out this code in a separate function with descriptive
name and some documentation. It seems you have plans how to extend this
mechanism further so let's try and make it maintainable from the get-go.


  This is in fact restoring the original design, will add comments.
  In the long term we may have up couple of more choices (like LBA),
  will move it to a function.

Thanks, Anand
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/2] Policy to balance read across mirrored devices

2018-01-31 Thread Anand Jain


On 01/31/2018 03:51 PM, Peter Becker wrote:


A little question about mount -o read_mirror_policy=.

How would this work with RAID1 over 3 or 4 HDD's?
In particular, if the desired block is not available on device .


 When a stripe is not present on the read optimized disk it will just
 use the lower devid disk containing the stripe (instead of failing back
 to the pid based random disk).


Could i repeat this option like the device-option to specify a
order/priority like this:

mount -o read_mirror_policy= 1,read_mirror_policy= 3


 Yes.

Thanks, Anand
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: Move qgroup rescan on quota enable to btrfs_quota_enable

2018-01-31 Thread Nikolay Borisov
Currently btrfs_run_qgroups is doing a bit too much. Not only is it
responsible for synchronizing in-memory state of qgroups to disk but
it also contains code to trigger the initial qgroup rescan when
quota is enabled initially. This condition is detected by checking that
BTRFS_FS_QUOTA_ENABLED is not set and BTRFS_FS_QUOTA_ENABLING is set.
Nothing really requires fro the code to be structured (and scattered)
the way it is so let's streamline things. First move the quota rescan
code into btrfs_quota_enable, where its invocation is closer to the
user. This also makes the FS_QUOTA_ENABLING flag redundant so let's
remove it as well.i

This has been tested with a full xfstest run with qgroups enabled on
the scratch device of every xfstest and no regressions were observed.

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/ctree.h  |  1 -
 fs/btrfs/qgroup.c | 35 ++-
 2 files changed, 10 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1a462ab85c49..83b645920aba 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -707,7 +707,6 @@ struct btrfs_delayed_root;
 #define BTRFS_FS_LOG_RECOVERING4
 #define BTRFS_FS_OPEN  5
 #define BTRFS_FS_QUOTA_ENABLED 6
-#define BTRFS_FS_QUOTA_ENABLING7
 #define BTRFS_FS_UPDATE_UUID_TREE_GEN  9
 #define BTRFS_FS_CREATING_FREE_SPACE_TREE  10
 #define BTRFS_FS_BTREE_ERR 11
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index f822152693a4..5b446d1bb22d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -826,10 +826,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
int slot;
 
mutex_lock(_info->qgroup_ioctl_lock);
-   if (fs_info->quota_root) {
-   set_bit(BTRFS_FS_QUOTA_ENABLING, _info->flags);
+   if (fs_info->quota_root)
goto out;
-   }
 
fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
if (!fs_info->qgroup_ulist) {
@@ -923,8 +921,15 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
}
spin_lock(_info->qgroup_lock);
fs_info->quota_root = quota_root;
-   set_bit(BTRFS_FS_QUOTA_ENABLING, _info->flags);
+   set_bit(BTRFS_FS_QUOTA_ENABLED, _info->flags);
spin_unlock(_info->qgroup_lock);
+   ret = qgroup_rescan_init(fs_info, 0, 1);
+   if (!ret) {
+   qgroup_rescan_zero_tracking(fs_info);
+   btrfs_queue_work(fs_info->qgroup_rescan_workers,
+_info->qgroup_rescan_work);
+   }
+
 out_free_path:
btrfs_free_path(path);
 out_free_root:
@@ -2077,17 +2082,9 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
 {
struct btrfs_root *quota_root = fs_info->quota_root;
int ret = 0;
-   int start_rescan_worker = 0;
 
if (!quota_root)
-   goto out;
-
-   if (!test_bit(BTRFS_FS_QUOTA_ENABLED, _info->flags) &&
-   test_bit(BTRFS_FS_QUOTA_ENABLING, _info->flags))
-   start_rescan_worker = 1;
-
-   if (test_and_clear_bit(BTRFS_FS_QUOTA_ENABLING, _info->flags))
-   set_bit(BTRFS_FS_QUOTA_ENABLED, _info->flags);
+   return ret;
 
spin_lock(_info->qgroup_lock);
while (!list_empty(_info->dirty_qgroups)) {
@@ -2116,18 +2113,6 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
if (ret)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
 
-   if (!ret && start_rescan_worker) {
-   ret = qgroup_rescan_init(fs_info, 0, 1);
-   if (!ret) {
-   qgroup_rescan_zero_tracking(fs_info);
-   btrfs_queue_work(fs_info->qgroup_rescan_workers,
-_info->qgroup_rescan_work);
-   }
-   ret = 0;
-   }
-
-out:
-
return ret;
 }
 
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: volumes: Remove the meaningless condition of minimal nr_devs when allocating a chunk

2018-01-31 Thread Qu Wenruo


On 2018年01月31日 15:35, Nikolay Borisov wrote:
> 
> 
> On 31.01.2018 07:56, Qu Wenruo wrote:
>> When checking the minimal nr_devs, there is one dead and meaningless
>> condition:
>>
>> if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
>> 
>>
>> This condition is meaningless, @devs_increment has nothing to do with
>> @sub_stripes.
>>
>> In fact, in btrfs_raid_array[], profile with sub_stripes larger than 1
>> (RAID10) already has the @devs_increment set to 2.
>> So no need to multiple it by @sub_stripes.
>>
>> And above condition is also dead.
>> For RAID10, @devs_increment * @sub_stripes equals 4, which is also the
>> @devs_min of RAID10.
>> For other profiles, @sub_stripes is always 1, and since @ndevs is
>> rounded down to @devs_increment, the condition will always be true.
>>
>> Remove the meaningless condition to make later reader wander less.
>>
>> Signed-off-by: Qu Wenruo 
> 
> Reviewed-by: Nikolay Borisov 
> 
> Quick question : What exactly is a substripe?

IMHO it is the number of copies inside the RAID0 virtual stripe.

For current RAID10, it's 2 devices as a bundle, using RAID1, then RAID0
these bundles.

> Stripe is essentially how
> many contiguous portions of disk are necessary to satisfy the profile,
> right? So for raid1 we write 1 copy of the data per device (hence
> dev_stripes = 1). For DUP we have 2 copies of the data on the same disk
> hence dev_stripes 2. How does sub_stripes fit in the grand scheme of things?
Here we have extra number to describe the behavior. Mostly
btrfs_raid_arrary.

For sub_stripes, it should be acts like:

Logical address space:   0  1G
 |   RAID10 chunk   |
 | RAID0 of virtual stripe 1~3  |
 /   | \
/|  \
| Virtual stripe 1|| Virtual stripe 2 || Vritual bundle 3 |
|RAID1 of physical stripe 1~2 |
 /\
/  \
|Physical stripe 1| |Physical stripe 2|

And sub_stripes describes how many physical stripes are inside the
virtual stripe.

Thanks,
Qu

>> ---
>>  fs/btrfs/volumes.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 215e85e22c8e..cb0a8d27661b 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -4729,7 +4729,7 @@ static int __btrfs_alloc_chunk(struct 
>> btrfs_trans_handle *trans,
>>  /* round down to number of usable stripes */
>>  ndevs = round_down(ndevs, devs_increment);
>>  
>> -if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
>> +if (ndevs < devs_min) {
>>  ret = -ENOSPC;
>>  goto error;
>>  }
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 2/2] btrfs: add read_mirror_policy parameter devid

2018-01-31 Thread Nikolay Borisov


On 30.01.2018 08:30, Anand Jain wrote:
> Adds the mount option:
>   mount -o read_mirror_policy=
> 
> To set the devid of the device which should be used for read. That
> means all the normal reads will go to that particular device only.
> 
> This also helps testing and gives a better control for the test
> scripts including mount context reads.

Some code comments below. OTOH, does such policy really make sense, what
happens if the selected device fails, will the other mirror be retried?
If the answer to the previous question is positive then why do we really
care which device is going to be tried first?

> 
> Signed-off-by: Anand Jain 
> ---
>  fs/btrfs/super.c   | 21 +
>  fs/btrfs/volumes.c | 10 ++
>  fs/btrfs/volumes.h |  2 ++
>  3 files changed, 33 insertions(+)
> 
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index dfe6b3c67df3..d3aad87e 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -847,6 +847,27 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char 
> *options,
>   BTRFS_READ_MIRROR_BY_PID;
>   break;
>   }
> +
> + intarg = 0;
> + if (match_int([0], ) == 0) {
> + struct btrfs_device *device;
> +
> + device = btrfs_find_device(info, intarg,
> +NULL, NULL);
> + if (!device) {
> + btrfs_err(info,
> +   "read_mirror_policy: invalid devid 
> %d",
> +   intarg);
> + ret = -EINVAL;
> + goto out;
> + }
> + info->read_mirror_policy =
> + BTRFS_READ_MIRROR_BY_DEV;
> + set_bit(BTRFS_DEV_STATE_READ_MIRROR,
> + >dev_state);
> + break;
> + }
> +
>   ret = -EINVAL;
>   goto out;
>   case Opt_err:
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 39ba59832f38..478623e6e074 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -5270,6 +5270,16 @@ static int find_live_mirror(struct btrfs_fs_info 
> *fs_info,
>   num = map->num_stripes;
>  
>   switch(fs_info->read_mirror_policy) {
> + case BTRFS_READ_MIRROR_BY_DEV:
> + optimal = first;
> + if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
> +  >stripes[optimal].dev->dev_state))
> + break;
> + if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
> +  >stripes[++optimal].dev->dev_state))
> + break;
> + optimal = first;

you set optimal 2 times, the second one seems redundant. Alongside this
patch it makes sense to also send a patch to btrfs(5) man page
describing the mount option + description of each implemented allocation
policy.

Another thing which I don't see here is how you are handling the case
when you have more than 2 devices in the RAID1 case. As it stands
currently you assume there are two devices and first test device 0 and
then device 1 and completely ignore any other devices.

> + break;
>   case BTRFS_READ_MIRROR_DEFAULT:
>   case BTRFS_READ_MIRROR_BY_PID:
>   default:
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 78f35d299a61..7281f55dea05 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -50,6 +50,7 @@ struct btrfs_pending_bios {
>  enum btrfs_read_mirror_type {
>   BTRFS_READ_MIRROR_DEFAULT,
>   BTRFS_READ_MIRROR_BY_PID,
> + BTRFS_READ_MIRROR_BY_DEV,
>  };
>  
>  #define BTRFS_DEV_STATE_WRITEABLE(0)
> @@ -57,6 +58,7 @@ enum btrfs_read_mirror_type {
>  #define BTRFS_DEV_STATE_MISSING  (2)
>  #define BTRFS_DEV_STATE_REPLACE_TGT  (3)
>  #define BTRFS_DEV_STATE_FLUSH_SENT   (4)
> +#define BTRFS_DEV_STATE_READ_MIRROR  (5)
>  
>  struct btrfs_device {
>   struct list_head dev_list;
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] btrfs: volumes: Cleanup stripe size calculation

2018-01-31 Thread Gu, Jinxiang


> -Original Message-
> From: linux-btrfs-ow...@vger.kernel.org 
> [mailto:linux-btrfs-ow...@vger.kernel.org]
> On Behalf Of Qu Wenruo
> Sent: Wednesday, January 31, 2018 2:17 PM
> To: linux-btrfs@vger.kernel.org; dste...@suse.cz
> Subject: [PATCH] btrfs: volumes: Cleanup stripe size calculation
> 
> Cleanup the following things:
> 1) open coded SZ_16M round up
> 2) use min() to replace open-coded size comparison
> 3) code style
> 
> Signed-off-by: Qu Wenruo 
> ---
>  fs/btrfs/volumes.c | 11 +--
>  1 file changed, 5 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index
> cb0a8d27661b..90ad716d2b50 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -4761,18 +4761,17 @@ static int __btrfs_alloc_chunk(struct 
> btrfs_trans_handle
> *trans,
>* and compare that answer with the max chunk size
>*/
>   if (stripe_size * data_stripes > max_chunk_size) {
> - u64 mask = (1ULL << 24) - 1;
> -
>   stripe_size = div_u64(max_chunk_size, data_stripes);
> 
>   /* bump the answer up to a 16MB boundary */
> - stripe_size = (stripe_size + mask) & ~mask;
> + stripe_size = round_up(stripe_size, SZ_16M);
> 
> - /* but don't go higher than the limits we found
> + /*
> +  * but don't go higher than the limits we found
>* while searching for free extents
>*/
> - if (stripe_size > devices_info[ndevs-1].max_avail)
> - stripe_size = devices_info[ndevs-1].max_avail;
> + stripe_size = min(devices_info[ndevs - 1].max_avail,
> +   stripe_size);
>   }
> 
>   stripe_size = div_u64(stripe_size, dev_stripes);
> --

Reviewed-by: Gu Jinxiang 



> 2.16.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the 
> body of a
> message to majord...@vger.kernel.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html
> 





Re: [PATCH 1/2] btrfs: add mount option read_mirror_policy

2018-01-31 Thread Nikolay Borisov


On 30.01.2018 08:30, Anand Jain wrote:
> In case of RAID1 and RAID10 devices are mirror-ed, a read IO can
> pick any device for reading. This choice of picking a device for
> reading should be configurable. In short not one policy would
> satisfy all types of workload and configs.
> 
> So before we add more policies, this patch-set makes existing
> $pid policy configurable from the mount option.
> 
> For example..
>   mount -o read_mirror_policy=pid (which is also default)
> 
> Signed-off-by: Anand Jain 
> ---
>  fs/btrfs/ctree.h   |  2 ++
>  fs/btrfs/super.c   | 10 ++
>  fs/btrfs/volumes.c |  8 +++-
>  fs/btrfs/volumes.h |  5 +
>  4 files changed, 24 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 1a462ab85c49..4759e988b0df 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1100,6 +1100,8 @@ struct btrfs_fs_info {
>   spinlock_t ref_verify_lock;
>   struct rb_root block_tree;
>  #endif
> + /* Policy to balance read across mirrored devices */
> + int read_mirror_policy;

make that member enum btrfs_read_mirror_type

>  };
>  
>  static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 367ecbf477b9..dfe6b3c67df3 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -329,6 +329,7 @@ enum {
>  #ifdef CONFIG_BTRFS_FS_REF_VERIFY
>   Opt_ref_verify,
>  #endif
> + Opt_read_mirror_policy,
>   Opt_err,
>  };
>  
> @@ -393,6 +394,7 @@ static const match_table_t tokens = {
>  #ifdef CONFIG_BTRFS_FS_REF_VERIFY
>   {Opt_ref_verify, "ref_verify"},
>  #endif
> + {Opt_read_mirror_policy, "read_mirror_policy=%s"},
>   {Opt_err, NULL},
>  };
>  
> @@ -839,6 +841,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char 
> *options,
>   btrfs_set_opt(info->mount_opt, REF_VERIFY);
>   break;
>  #endif
> + case Opt_read_mirror_policy:
> + if (strcmp(args[0].from, "pid") == 0) {
> + info->read_mirror_policy =
> + BTRFS_READ_MIRROR_BY_PID;
> + break;
> + }
> + ret = -EINVAL;
> + goto out;
>   case Opt_err:
>   btrfs_info(info, "unrecognized mount option '%s'", p);
>   ret = -EINVAL;
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index a61715677b67..39ba59832f38 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -5269,7 +5269,13 @@ static int find_live_mirror(struct btrfs_fs_info 
> *fs_info,
>   else
>   num = map->num_stripes;
>  
> - optimal = first + current->pid % num;
> + switch(fs_info->read_mirror_policy) {
> + case BTRFS_READ_MIRROR_DEFAULT:
> + case BTRFS_READ_MIRROR_BY_PID:
> + default:
> + optimal = first + current->pid % num;
> + break;
> + }

Why not factor out this code in a separate function with descriptive
name and some documentation. It seems you have plans how to extend this
mechanism further so let's try and make it maintainable from the get-go.

>  
>   if (dev_replace_is_ongoing &&
>   fs_info->dev_replace.cont_reading_from_srcdev_mode ==
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 28c28eeadff3..78f35d299a61 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -47,6 +47,11 @@ struct btrfs_pending_bios {
>  #define btrfs_device_data_ordered_init(device) do { } while (0)
>  #endif
>  
> +enum btrfs_read_mirror_type {
> + BTRFS_READ_MIRROR_DEFAULT,
> + BTRFS_READ_MIRROR_BY_PID,
> +};
> +
>  #define BTRFS_DEV_STATE_WRITEABLE(0)
>  #define BTRFS_DEV_STATE_IN_FS_METADATA   (1)
>  #define BTRFS_DEV_STATE_MISSING  (2)
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html