[RFC][PATCH v3 10/10] btrfs: use common file type conversion

2018-10-26 Thread Phillip Potter
Deduplicate the btrfs file type conversion implementation - file systems
that use the same file types as defined by POSIX do not need to define
their own versions and can use the common helper functions decared in
fs_types.h and implemented in fs_types.c

Signed-off-by: Amir Goldstein 
Signed-off-by: Phillip Potter 
---
 fs/btrfs/btrfs_inode.h  |  2 --
 fs/btrfs/delayed-inode.c|  2 +-
 fs/btrfs/inode.c| 32 +++-
 include/uapi/linux/btrfs_tree.h |  2 ++
 4 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..c7c6db6b4a35 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -196,8 +196,6 @@ struct btrfs_inode {
struct inode vfs_inode;
 };
 
-extern unsigned char btrfs_filetype_table[];
-
 static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
 {
return container_of(inode, struct btrfs_inode, vfs_inode);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index f51b509f2d9b..c1da34e3a775 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1691,7 +1691,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context 
*ctx,
name = (char *)(di + 1);
name_len = btrfs_stack_dir_name_len(di);
 
-   d_type = btrfs_filetype_table[di->type];
+   d_type = fs_ftype_to_dtype(di->type);
btrfs_disk_key_to_cpu(, >location);
 
over = !dir_emit(ctx, name, name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3ea5339603cf..089638719842 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -73,17 +73,6 @@ struct kmem_cache *btrfs_trans_handle_cachep;
 struct kmem_cache *btrfs_path_cachep;
 struct kmem_cache *btrfs_free_space_cachep;
 
-#define S_SHIFT 12
-static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
-   [S_IFREG >> S_SHIFT]= BTRFS_FT_REG_FILE,
-   [S_IFDIR >> S_SHIFT]= BTRFS_FT_DIR,
-   [S_IFCHR >> S_SHIFT]= BTRFS_FT_CHRDEV,
-   [S_IFBLK >> S_SHIFT]= BTRFS_FT_BLKDEV,
-   [S_IFIFO >> S_SHIFT]= BTRFS_FT_FIFO,
-   [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
-   [S_IFLNK >> S_SHIFT]= BTRFS_FT_SYMLINK,
-};
-
 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
 static int btrfs_truncate(struct inode *inode, bool skip_writeback);
 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent 
*ordered_extent);
@@ -5803,10 +5792,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, 
struct dentry *dentry,
return d_splice_alias(inode, dentry);
 }
 
-unsigned char btrfs_filetype_table[] = {
-   DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
-
 /*
  * All this infrastructure exists because dir_emit can fault, and we are 
holding
  * the tree lock when doing readdir.  For now just allocate a buffer and copy
@@ -5879,6 +5864,19 @@ static int btrfs_real_readdir(struct file *file, struct 
dir_context *ctx)
bool put = false;
struct btrfs_key location;
 
+   /*
+* compile-time asserts that generic FT_x types still match
+* BTRFS_FT_x types
+*/
+   BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
+   BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
+   BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
+   BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
+   BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
+   BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
+   BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
+   BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);
+
if (!dir_emit_dots(file, ctx))
return 0;
 
@@ -5945,7 +5943,7 @@ static int btrfs_real_readdir(struct file *file, struct 
dir_context *ctx)
name_ptr = (char *)(entry + 1);
read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
   name_len);
-   put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)],
+   put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
>type);
btrfs_dir_item_key_to_cpu(leaf, di, );
put_unaligned(location.objectid, >ino);
@@ -6350,7 +6348,7 @@ static struct inode *btrfs_new_inode(struct 
btrfs_trans_handle *trans,
 
 static inline u8 btrfs_inode_type(struct inode *inode)
 {
-   return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
+   return fs_umode_to_ftype(inode->i_mode);
 }
 
 /*
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index aff1356c2bb8..a4f5fb56a45b 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -307,6 +307,8 @@
  *
  * Used by:
  * struct btrfs_dir_item.type
+ *
+ * Values 0..7 must match common file type values in fs_types.h.
  */
 #define BTRFS_FT_UNKNOWN   0
 #define BTRFS_FT_REG_FILE  1
-- 
2.17.2



Re: Kernel crash related to LZO compression

2018-10-26 Thread Qu Wenruo


On 2018/10/26 下午10:57, Dmitry Katsubo wrote:
> On 2018-10-25 20:49, Chris Murphy wrote:
>> I would say the first step no matter what if you're using an older
>> kernel, is to boot a current Fedora or Arch live or install media,
>> mount the Btrfs and try to read the problem files and see if the
>> problem still happens. I can't even being to estimate the tens of
>> thousands of line changes since kernel 4.9.
> 
> Good point Chris. Indeed booting a fresh kernel is never a problem.
> Actually I forgot to mention that I've seen the same problem with
> kernel 4.12.13 (attached).
> 
>> What profile are you using for this Btrfs? Is this a raid56? What do
>> you get for 'btrfs fi us ' ?
> 
> It is RAID1 volume for both metadata and data, but unfortunately I
> haven't recorded the actual output before the failure. The configuration
> was like this:
> 
> # btrfs filesystem show /var/log
> Label: none  uuid: 5b45ac8e-fd8c-4759-854a-94e45069959d
> Total devices 2 FS bytes used 11.13GiB
> devid    3 size 50.00GiB used 14.03GiB path /dev/sda3
> devid    4 size 50.00GiB used 14.03GiB path /dev/sdc1
> 
> On 2018-10-25 20:49, Chris Murphy wrote:
>> It should be safe even with that kernel. I'm not sure this is
>> compression related. There is a corruption bug related to inline
>> extents and corruption that had been fairly elusive but I think it's
>> fixed now. I haven't run into it though.
> 
> On 2018-10-26 02:09, Qu Wenruo wrote:
>>> Are there any updates / fixes done in that area? Is lzo option safe
>>> to use?
>>
>> Yes, we have commits to harden lzo decompress code in v4.18:
>>
>> de885e3ee281a88f52283c7e8994e762e3a5f6bd btrfs: lzo: Harden inline lzo
>> compressed extent decompression
>> 314bfa473b6b6d3efe68011899bd718b349f29d7 btrfs: lzo: Add header length
>> check to avoid potential out-of-bounds acc
>>
>> And for the root cause, it's compressed data without csum, then scrub
>> could make it corrupted.
>>
>> It's also fixed in v4.18:
>>
>> 665d4953cde6d9e75c62a07ec8f4f8fd7d396ade btrfs: scrub: Don't use inode
>> page cache in scrub_handle_errored_block()
>> ac0b4145d662a3b9e34085dea460fb06ede9b69b btrfs: scrub: Don't use inode
>> pages for device replace
> 
> Thanks, Qu, for this information. Actually one time I've seen the binary
> crap (not zeros) in text log files (/var/log/*.log) and I was surprised
> that btrfs returned me data which is corrupted instead of signalling I/O
> error. Could it be because of "compressed data without csum" problem?

Yes, pretty much the case, especially for your RAID1 setup.

The root fix should has been backported to stable kernel after 4.0, but
the lzo decompression harden part isn't sent to stable kernel, so you
may still hit such problem.

Thanks,
Qu

> 
> Thanks!
> 



signature.asc
Description: OpenPGP digital signature


[josef-btrfs:kill-mmap-sem-v5 9/10] htmldocs: mm/filemap.c:2882: warning: Function parameter or member 'mkwrite' not described in 'filemap_page_mkwrite_nommapsem'

2018-10-26 Thread kbuild test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/josef/btrfs-next.git 
kill-mmap-sem-v5
head:   c7a3e899803d98ae543468f0026500c651528a6c
commit: c52634339df10df75f28787f431b937de3c3c591 [9/10] mm: introduce 
filemap_page_mkwrite_nommapsem
reproduce: make htmldocs

All warnings (new ones prefixed by >>):

   WARNING: convert(1) not found, for SVG to PDF conversion install ImageMagick 
(https://www.imagemagick.org)
   include/linux/srcu.h:175: warning: Function parameter or member 'p' not 
described in 'srcu_dereference_notrace'
   include/linux/srcu.h:175: warning: Function parameter or member 'sp' not 
described in 'srcu_dereference_notrace'
   include/linux/gfp.h:1: warning: no structured comments found
>> mm/filemap.c:2882: warning: Function parameter or member 'mkwrite' not 
>> described in 'filemap_page_mkwrite_nommapsem'
   mm/filemap.c:2882: warning: Excess function parameter 'page_mkwrite_cb' 
description in 'filemap_page_mkwrite_nommapsem'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4381: warning: Function parameter or member 
'wext.ibss' not described in 'wireless_dev'
   include/net/cfg80211.h:4381: warning: Function parameter or member 
'wext.connect' not described in 'wireless_dev'
   include/net/cfg80211.h:4381: warning: Function parameter or member 
'wext.keys' not described in 'wireless_dev'
   include/net/cfg80211.h:4381: warning: Function parameter or member 'wext.ie' 
not described in 'wireless_dev'
   include/net/cfg80211.h:4381: warning: Function parameter or member 
'wext.ie_len' not described in 'wireless_dev'
   include/net/cfg80211.h:4381: warning: Function parameter or member 
'wext.bssid' not described in 'wireless_dev'
   include/net/cfg80211.h:4381: warning: Function parameter or member 
'wext.ssid' not described in 'wireless_dev'
   include/net/cfg80211.h:4381: warning: Function parameter or member 
'wext.default_key' not described in 'wireless_dev'
   include/net/cfg80211.h:4381: warning: Function parameter or member 
'wext.default_mgmt_key' not described in 'wireless_dev'
   include/net/cfg80211.h:4381: warning: Function parameter or member 
'wext.prev_bssid_valid' not described in 'wireless_dev'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' 
description in 'reg_query_regdb_wmm'
   include/net/cfg80211.h:4869: warning: Excess function parameter 

[PATCH] Btrfs: remove no longer used logged range variables when logging extents

2018-10-26 Thread fdmanana
From: Filipe Manana 

The logged_start and logged_end variables, at btrfs_log_changed_extents(),
were added in commit 8c6c592831a0 ("btrfs: log csums for all modified
extents"). However since the recent simplification for fsync, which makes
us wait for all ordered extents to complete before logging extents, we
no longer need those variables. Commit a2120a473a80 ("btrfs: clean up the
left over logged_list usage") forgot to remove them.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/tree-log.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1673dccc76c2..c86c5dd100b2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4383,7 +4383,6 @@ static int btrfs_log_changed_extents(struct 
btrfs_trans_handle *trans,
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = >extent_tree;
-   u64 logged_start, logged_end;
u64 test_gen;
int ret = 0;
int num = 0;
@@ -4393,8 +4392,6 @@ static int btrfs_log_changed_extents(struct 
btrfs_trans_handle *trans,
down_write(>dio_sem);
write_lock(>lock);
test_gen = root->fs_info->last_trans_committed;
-   logged_start = start;
-   logged_end = end;
 
list_for_each_entry_safe(em, n, >modified_extents, list) {
list_del_init(>list);
@@ -4418,11 +4415,6 @@ static int btrfs_log_changed_extents(struct 
btrfs_trans_handle *trans,
em->start >= i_size_read(>vfs_inode))
continue;
 
-   if (em->start < logged_start)
-   logged_start = em->start;
-   if ((em->start + em->len - 1) > logged_end)
-   logged_end = em->start + em->len - 1;
-
/* Need a ref to keep it from getting evicted from cache */
refcount_inc(>refs);
set_bit(EXTENT_FLAG_LOGGING, >flags);
-- 
2.11.0



Re: [PATCH] Btrfs: remove no longer used stuff for tracking pending ordered extents

2018-10-26 Thread Liu Bo
On Fri, Oct 26, 2018 at 9:16 AM  wrote:
>
> From: Filipe Manana 
>
> Tracking pending ordered extents per transaction was introduced in commit
> 50d9aa99bd35 ("Btrfs: make sure logged extents complete in the current
> transaction V3") and later updated in commit 161c3549b45a ("Btrfs: change
> how we wait for pending ordered extents").
>
> However now that on fsync we always wait for ordered extents to complete
> before logging, done in commit 5636cf7d6dc8 ("btrfs: remove the logged
> extents infrastructure"), we no longer need the stuff to track for pending
> ordered extents, which was not completely removed in the mentioned commit.
> So remove the remaining of the pending ordered extents infrastructure.
>

Reviewed-by: Liu Bo 

thanks,
liubo

> Signed-off-by: Filipe Manana 
> ---
>  fs/btrfs/ordered-data.c | 30 --
>  fs/btrfs/ordered-data.h |  2 --
>  fs/btrfs/transaction.c  | 11 ---
>  fs/btrfs/transaction.h  |  2 --
>  4 files changed, 45 deletions(-)
>
> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
> index 0c4ef208b8b9..6fde2b2741ef 100644
> --- a/fs/btrfs/ordered-data.c
> +++ b/fs/btrfs/ordered-data.c
> @@ -460,7 +460,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
> struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
> struct btrfs_root *root = btrfs_inode->root;
> struct rb_node *node;
> -   bool dec_pending_ordered = false;
>
> /* This is paired with btrfs_add_ordered_extent. */
> spin_lock(_inode->lock);
> @@ -477,37 +476,8 @@ void btrfs_remove_ordered_extent(struct inode *inode,
> if (tree->last == node)
> tree->last = NULL;
> set_bit(BTRFS_ORDERED_COMPLETE, >flags);
> -   if (test_and_clear_bit(BTRFS_ORDERED_PENDING, >flags))
> -   dec_pending_ordered = true;
> spin_unlock_irq(>lock);
>
> -   /*
> -* The current running transaction is waiting on us, we need to let it
> -* know that we're complete and wake it up.
> -*/
> -   if (dec_pending_ordered) {
> -   struct btrfs_transaction *trans;
> -
> -   /*
> -* The checks for trans are just a formality, it should be 
> set,
> -* but if it isn't we don't want to deref/assert under the 
> spin
> -* lock, so be nice and check if trans is set, but ASSERT() so
> -* if it isn't set a developer will notice.
> -*/
> -   spin_lock(_info->trans_lock);
> -   trans = fs_info->running_transaction;
> -   if (trans)
> -   refcount_inc(>use_count);
> -   spin_unlock(_info->trans_lock);
> -
> -   ASSERT(trans);
> -   if (trans) {
> -   if (atomic_dec_and_test(>pending_ordered))
> -   wake_up(>pending_wait);
> -   btrfs_put_transaction(trans);
> -   }
> -   }
> -
> spin_lock(>ordered_extent_lock);
> list_del_init(>root_extent_list);
> root->nr_ordered_extents--;
> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
> index 02d813aaa261..b10e6765d88f 100644
> --- a/fs/btrfs/ordered-data.h
> +++ b/fs/btrfs/ordered-data.h
> @@ -56,8 +56,6 @@ struct btrfs_ordered_sum {
>* the isize. */
>  #define BTRFS_ORDERED_TRUNCATED 8 /* Set when we have to truncate an extent 
> */
>
> -#define BTRFS_ORDERED_PENDING 9 /* We are waiting for this ordered extent to
> - * complete in the current transaction. */
>  #define BTRFS_ORDERED_REGULAR 10 /* Regular IO for COW */
>
>  struct btrfs_ordered_extent {
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index 3b84f5015029..2fe6c2b1d94b 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -232,14 +232,12 @@ static noinline int join_transaction(struct 
> btrfs_fs_info *fs_info,
> extwriter_counter_init(cur_trans, type);
> init_waitqueue_head(_trans->writer_wait);
> init_waitqueue_head(_trans->commit_wait);
> -   init_waitqueue_head(_trans->pending_wait);
> cur_trans->state = TRANS_STATE_RUNNING;
> /*
>  * One for this trans handle, one so it will live on until we
>  * commit the transaction.
>  */
> refcount_set(_trans->use_count, 2);
> -   atomic_set(_trans->pending_ordered, 0);
> cur_trans->flags = 0;
> cur_trans->start_time = ktime_get_seconds();
>
> @@ -1908,13 +1906,6 @@ static inline void btrfs_wait_delalloc_flush(struct 
> btrfs_fs_info *fs_info)
> btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
>  }
>
> -static inline void
> -btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
> -{
> -   wait_event(cur_trans->pending_wait,
> -  atomic_read(_trans->pending_ordered) == 0);
> -}

[PATCH v3] fstests: btrfs verify hardening agaist duplicate fsid

2018-10-26 Thread Anand Jain
We have a known bug in btrfs, that we let the device path be changed
after the device has been mounted. So using this loop hole the new
copied device would appears as if its mounted immediately after its
been copied. So this test case reproduces this issue.

For example:

Initially.. /dev/mmcblk0p4 is mounted as /

lsblk
NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
mmcblk0 179:00 29.2G  0 disk
|-mmcblk0p4 179:404G  0 part /
|-mmcblk0p2 179:20  500M  0 part /boot
|-mmcblk0p3 179:30  256M  0 part [SWAP]
`-mmcblk0p1 179:10  256M  0 part /boot/efi

btrfs fi show
Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
Total devices 1 FS bytes used 1.40GiB
devid1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4

Copy mmcblk0 to sda
dd if=/dev/mmcblk0 of=/dev/sda

And immediately after the copy completes the change in the device
superblock is notified which the automount scans using
btrfs device scan and the new device sda becomes the mounted root
device.

lsblk
NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
sda   8:01 14.9G  0 disk
|-sda48:414G  0 part /
|-sda28:21  500M  0 part
|-sda38:31  256M  0 part
`-sda18:11  256M  0 part
mmcblk0 179:00 29.2G  0 disk
|-mmcblk0p4 179:404G  0 part
|-mmcblk0p2 179:20  500M  0 part /boot
|-mmcblk0p3 179:30  256M  0 part [SWAP]
`-mmcblk0p1 179:10  256M  0 part /boot/efi
btrfs fi show /
Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
Total devices 1 FS bytes used 1.40GiB
devid1 size 4.00GiB used 3.00GiB path /dev/sda4

The bug is quite nasty that you can't either unmount /dev/sda4 or
/dev/mmcblk0p4. And the problem does not get solved until you take
the sda out of the system on to another system to change its fsid using
the 'btrfstune -u' command.

Signed-off-by: Anand Jain 
---
v2->v3:
  Check the return code and use _fail to verify and accordingly fix golden
output.
  Rename dev_foo(bar) to device_1(2)
  Don't log dd retun to $seqres.full
 
v1->v2: 
  dont play around with dev patch use it as it is.
  do not use SCRATCH_MNT instead create it at the TEST_DIR and its related
   changes.
  golden out changes

 tests/btrfs/173 | 82 +
 tests/btrfs/173.out |  2 ++
 tests/btrfs/group   |  1 +
 3 files changed, 85 insertions(+)
 create mode 100755 tests/btrfs/173
 create mode 100644 tests/btrfs/173.out

diff --git a/tests/btrfs/173 b/tests/btrfs/173
new file mode 100755
index ..342ae92b4781
--- /dev/null
+++ b/tests/btrfs/173
@@ -0,0 +1,82 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2018 Oracle. All Rights Reserved.
+#
+# FS QA Test 173
+#
+# Fuzzy test for FS image duplication.
+#  Could be fixed by
+#[patch] btrfs: harden agaist duplicate fsid
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+mnt=$TEST_DIR/$seq.mnt
+_cleanup()
+{
+   rm -rf $mnt > /dev/null 2>&1
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch_dev_pool 2
+_scratch_dev_pool_get 2
+
+device_1=$(echo $SCRATCH_DEV_POOL | awk '{print $1}')
+device_2=$(echo $SCRATCH_DEV_POOL | awk '{print $2}')
+
+echo device_1=$device_1 device_2=$device_2 >> $seqres.full
+
+rm -rf $mnt > /dev/null 2>&1
+mkdir $mnt
+_mkfs_dev $device_1
+_mount $device_1 $mnt
+
+[[ $(findmnt $mnt | grep -v TARGET | awk '{print $2}') != $device_1 ]] && \
+   _fail "mounted device changed"
+
+for sb_bytenr in 65536 67108864
+do
+   echo -n "dd status=none if=$dev_foo of=$dev_bar bs=1 "\
+   "seek=$sb_bytenr skip=$sb_bytenr count=4096" >> $seqres.full
+   dd status=none if=$device_1 of=$device_2 bs=1 seek=$sb_bytenr \
+   skip=$sb_bytenr count=4096 > /dev/null 2>&1
+   echo ..:$? >> $seqres.full
+done
+
+#Original device is mounted, scan of its clone should fail
+$BTRFS_UTIL_PROG device scan $device_2 >> $seqres.full 2>&1
+[[ $? != 1 ]] && _fail "cloned device scan should fail"
+
+[[ $(findmnt $mnt | grep -v TARGET | awk '{print $2}') != $device_1 ]] && \
+   _fail "mounted device changed"
+
+#Original device scan should be successful
+$BTRFS_UTIL_PROG device scan $device_1 >> $seqres.full 2>&1
+[[ $? != 0 ]] && \
+   _fail "if it fails here, then it means subvolume mount at boot may fail 
"\
+ "in some configs."
+
+umount $mnt > /dev/null 2>&1
+_scratch_dev_pool_put
+
+# success, all done
+echo "Silence is golden"
+status=0
+exit
diff --git a/tests/btrfs/173.out b/tests/btrfs/173.out

[PATCH] Btrfs: remove no longer used stuff for tracking pending ordered extents

2018-10-26 Thread fdmanana
From: Filipe Manana 

Tracking pending ordered extents per transaction was introduced in commit
50d9aa99bd35 ("Btrfs: make sure logged extents complete in the current
transaction V3") and later updated in commit 161c3549b45a ("Btrfs: change
how we wait for pending ordered extents").

However now that on fsync we always wait for ordered extents to complete
before logging, done in commit 5636cf7d6dc8 ("btrfs: remove the logged
extents infrastructure"), we no longer need the stuff to track for pending
ordered extents, which was not completely removed in the mentioned commit.
So remove the remaining of the pending ordered extents infrastructure.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/ordered-data.c | 30 --
 fs/btrfs/ordered-data.h |  2 --
 fs/btrfs/transaction.c  | 11 ---
 fs/btrfs/transaction.h  |  2 --
 4 files changed, 45 deletions(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 0c4ef208b8b9..6fde2b2741ef 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -460,7 +460,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
struct btrfs_root *root = btrfs_inode->root;
struct rb_node *node;
-   bool dec_pending_ordered = false;
 
/* This is paired with btrfs_add_ordered_extent. */
spin_lock(_inode->lock);
@@ -477,37 +476,8 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, >flags);
-   if (test_and_clear_bit(BTRFS_ORDERED_PENDING, >flags))
-   dec_pending_ordered = true;
spin_unlock_irq(>lock);
 
-   /*
-* The current running transaction is waiting on us, we need to let it
-* know that we're complete and wake it up.
-*/
-   if (dec_pending_ordered) {
-   struct btrfs_transaction *trans;
-
-   /*
-* The checks for trans are just a formality, it should be set,
-* but if it isn't we don't want to deref/assert under the spin
-* lock, so be nice and check if trans is set, but ASSERT() so
-* if it isn't set a developer will notice.
-*/
-   spin_lock(_info->trans_lock);
-   trans = fs_info->running_transaction;
-   if (trans)
-   refcount_inc(>use_count);
-   spin_unlock(_info->trans_lock);
-
-   ASSERT(trans);
-   if (trans) {
-   if (atomic_dec_and_test(>pending_ordered))
-   wake_up(>pending_wait);
-   btrfs_put_transaction(trans);
-   }
-   }
-
spin_lock(>ordered_extent_lock);
list_del_init(>root_extent_list);
root->nr_ordered_extents--;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 02d813aaa261..b10e6765d88f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -56,8 +56,6 @@ struct btrfs_ordered_sum {
   * the isize. */
 #define BTRFS_ORDERED_TRUNCATED 8 /* Set when we have to truncate an extent */
 
-#define BTRFS_ORDERED_PENDING 9 /* We are waiting for this ordered extent to
- * complete in the current transaction. */
 #define BTRFS_ORDERED_REGULAR 10 /* Regular IO for COW */
 
 struct btrfs_ordered_extent {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3b84f5015029..2fe6c2b1d94b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -232,14 +232,12 @@ static noinline int join_transaction(struct btrfs_fs_info 
*fs_info,
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(_trans->writer_wait);
init_waitqueue_head(_trans->commit_wait);
-   init_waitqueue_head(_trans->pending_wait);
cur_trans->state = TRANS_STATE_RUNNING;
/*
 * One for this trans handle, one so it will live on until we
 * commit the transaction.
 */
refcount_set(_trans->use_count, 2);
-   atomic_set(_trans->pending_ordered, 0);
cur_trans->flags = 0;
cur_trans->start_time = ktime_get_seconds();
 
@@ -1908,13 +1906,6 @@ static inline void btrfs_wait_delalloc_flush(struct 
btrfs_fs_info *fs_info)
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 }
 
-static inline void
-btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
-{
-   wait_event(cur_trans->pending_wait,
-  atomic_read(_trans->pending_ordered) == 0);
-}
-
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 {
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2049,8 +2040,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle 
*trans)
 
btrfs_wait_delalloc_flush(fs_info);
 
-   

Re: [PATCH v2 rev log added] fstests: btrfs verify hardening agaist duplicate fsid

2018-10-26 Thread Anand Jain




On 10/26/2018 11:52 PM, Nikolay Borisov wrote:



On 26.10.18 г. 18:34 ч., Anand Jain wrote:



On 10/26/2018 11:02 PM, Nikolay Borisov wrote:



On 8.10.18 г. 21:28 ч., Anand Jain wrote:

We have a known bug in btrfs, that we let the device path be changed
after the device has been mounted. So using this loop hole the new
copied device would appears as if its mounted immediately after its
been copied. So this test case reproduces this issue.

For example:

Initially.. /dev/mmcblk0p4 is mounted as /

lsblk
NAME    MAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
mmcblk0 179:0    0 29.2G  0 disk
|-mmcblk0p4 179:4    0    4G  0 part /
|-mmcblk0p2 179:2    0  500M  0 part /boot
|-mmcblk0p3 179:3    0  256M  0 part [SWAP]
`-mmcblk0p1 179:1    0  256M  0 part /boot/efi

btrfs fi show
Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
  Total devices 1 FS bytes used 1.40GiB
  devid    1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4

Copy mmcblk0 to sda
dd if=/dev/mmcblk0 of=/dev/sda

And immediately after the copy completes the change in the device
superblock is notified which the automount scans using
btrfs device scan and the new device sda becomes the mounted root
device.

lsblk
NAME    MAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
sda   8:0    1 14.9G  0 disk
|-sda4    8:4    1    4G  0 part /
|-sda2    8:2    1  500M  0 part
|-sda3    8:3    1  256M  0 part
`-sda1    8:1    1  256M  0 part
mmcblk0 179:0    0 29.2G  0 disk
|-mmcblk0p4 179:4    0    4G  0 part
|-mmcblk0p2 179:2    0  500M  0 part /boot
|-mmcblk0p3 179:3    0  256M  0 part [SWAP]
`-mmcblk0p1 179:1    0  256M  0 part /boot/efi
btrfs fi show /
Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
  Total devices 1 FS bytes used 1.40GiB
  devid    1 size 4.00GiB used 3.00GiB path /dev/sda4

The bug is quite nasty that you can't either unmount /dev/sda4 or
/dev/mmcblk0p4. And the problem does not get solved until you take
the sda out of the system on to another system to change its fsid using
the 'btrfstune -u' command.


Is there a pending fix for this?


Yes.
https://patchwork.kernel.org/patch/10641041/

Test case header mentioned it.





Signed-off-by: Anand Jain 
---
v1->v2:
    dont play around with dev patch use it as it is.
    do not use SCRATCH_MNT instead create it at the TEST_DIR and its
related
     changes.
    golden out changes
       tests/btrfs/173 | 88
+
   tests/btrfs/173.out |  6 
   tests/btrfs/group   |  1 +
   3 files changed, 95 insertions(+)
   create mode 100755 tests/btrfs/173
   create mode 100644 tests/btrfs/173.out

diff --git a/tests/btrfs/173 b/tests/btrfs/173
new file mode 100755
index ..b466ae921e19
--- /dev/null
+++ b/tests/btrfs/173
@@ -0,0 +1,88 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2018 Oracle. All Rights Reserved.
+#
+# FS QA Test 173
+#
+# Fuzzy test for FS image duplication.
+#  Could be fixed by
+#    [patch] btrfs: harden agaist duplicate fsid
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1    # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+mnt=$TEST_DIR/$seq.mnt
+_cleanup()
+{
+    rm -rf $mnt > /dev/null 2>&1
+    cd /
+    rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+
+# Modify as appropriate.
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch_dev_pool 2
+_scratch_dev_pool_get 2
+
+dev_foo=$(echo $SCRATCH_DEV_POOL | awk '{print $1}')
+dev_bar=$(echo $SCRATCH_DEV_POOL | awk '{print $2}')


Naming devices _foo and _bar just shows you could not care less about
the test.


   Hmm. Will make it device_1 and device_2.


Either use sane names - primary_dev/secondary dev or device_1
and device_2.



+
+echo dev_foo=$dev_foo >> $seqres.full
+echo dev_bar=$dev_bar >> $seqres.full
+echo | tee -a $seqres.full
+
+rm -rf $mnt > /dev/null 2>&1


So what is $mnt? I can see it's used by very few tests and it's not
obvious? Generally you should either define it as a local variable or
use one of the SCRATCH_MNT/TEST_MNT global variables. Also I checked
with Eric re. the use of $mnt in tests and his conclusion is :

" looks like a bug"


  No its not. As few lines above, I have assigned it as..
     mnt=$TEST_DIR/$seq.mnt


I missed that, I will recommend moving the assignment near where you set
the devices


 cleanup() is using $mnt. cleanup() may get called for any trap before 
the actual test.


Thanks, Anand






+mkdir $mnt
+_mkfs_dev $dev_foo
+_mount $dev_foo $mnt
+
+check_btrfs_mount()
+{
+    local x=$(findmnt $mnt | grep -v TARGET | awk '{print $2}')
+    [[ $x == $dev_foo ]] && echo DEV_FOO
+    [[ $x == $dev_bar ]] && echo DEV_BAR
+}


Same thing here re. DEV_(FOO|BAR).


+
+echo MNT $(check_btrfs_mount)
+
+for 

Re: [PATCH v2 rev log added] fstests: btrfs verify hardening agaist duplicate fsid

2018-10-26 Thread Nikolay Borisov



On 26.10.18 г. 18:34 ч., Anand Jain wrote:
> 
> 
> On 10/26/2018 11:02 PM, Nikolay Borisov wrote:
>>
>>
>> On 8.10.18 г. 21:28 ч., Anand Jain wrote:
>>> We have a known bug in btrfs, that we let the device path be changed
>>> after the device has been mounted. So using this loop hole the new
>>> copied device would appears as if its mounted immediately after its
>>> been copied. So this test case reproduces this issue.
>>>
>>> For example:
>>>
>>> Initially.. /dev/mmcblk0p4 is mounted as /
>>>
>>> lsblk
>>> NAME    MAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
>>> mmcblk0 179:0    0 29.2G  0 disk
>>> |-mmcblk0p4 179:4    0    4G  0 part /
>>> |-mmcblk0p2 179:2    0  500M  0 part /boot
>>> |-mmcblk0p3 179:3    0  256M  0 part [SWAP]
>>> `-mmcblk0p1 179:1    0  256M  0 part /boot/efi
>>>
>>> btrfs fi show
>>> Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
>>>  Total devices 1 FS bytes used 1.40GiB
>>>  devid    1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4
>>>
>>> Copy mmcblk0 to sda
>>> dd if=/dev/mmcblk0 of=/dev/sda
>>>
>>> And immediately after the copy completes the change in the device
>>> superblock is notified which the automount scans using
>>> btrfs device scan and the new device sda becomes the mounted root
>>> device.
>>>
>>> lsblk
>>> NAME    MAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
>>> sda   8:0    1 14.9G  0 disk
>>> |-sda4    8:4    1    4G  0 part /
>>> |-sda2    8:2    1  500M  0 part
>>> |-sda3    8:3    1  256M  0 part
>>> `-sda1    8:1    1  256M  0 part
>>> mmcblk0 179:0    0 29.2G  0 disk
>>> |-mmcblk0p4 179:4    0    4G  0 part
>>> |-mmcblk0p2 179:2    0  500M  0 part /boot
>>> |-mmcblk0p3 179:3    0  256M  0 part [SWAP]
>>> `-mmcblk0p1 179:1    0  256M  0 part /boot/efi
>>> btrfs fi show /
>>> Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
>>>  Total devices 1 FS bytes used 1.40GiB
>>>  devid    1 size 4.00GiB used 3.00GiB path /dev/sda4
>>>
>>> The bug is quite nasty that you can't either unmount /dev/sda4 or
>>> /dev/mmcblk0p4. And the problem does not get solved until you take
>>> the sda out of the system on to another system to change its fsid using
>>> the 'btrfstune -u' command.
>>
>> Is there a pending fix for this?
> 
> Yes.
> https://patchwork.kernel.org/patch/10641041/
> 
> Test case header mentioned it.
> 
>>
>>>
>>> Signed-off-by: Anand Jain 
>>> ---
>>> v1->v2:
>>>    dont play around with dev patch use it as it is.
>>>    do not use SCRATCH_MNT instead create it at the TEST_DIR and its
>>> related
>>>     changes.
>>>    golden out changes
>>>       tests/btrfs/173 | 88
>>> +
>>>   tests/btrfs/173.out |  6 
>>>   tests/btrfs/group   |  1 +
>>>   3 files changed, 95 insertions(+)
>>>   create mode 100755 tests/btrfs/173
>>>   create mode 100644 tests/btrfs/173.out
>>>
>>> diff --git a/tests/btrfs/173 b/tests/btrfs/173
>>> new file mode 100755
>>> index ..b466ae921e19
>>> --- /dev/null
>>> +++ b/tests/btrfs/173
>>> @@ -0,0 +1,88 @@
>>> +#! /bin/bash
>>> +# SPDX-License-Identifier: GPL-2.0
>>> +# Copyright (c) 2018 Oracle. All Rights Reserved.
>>> +#
>>> +# FS QA Test 173
>>> +#
>>> +# Fuzzy test for FS image duplication.
>>> +#  Could be fixed by
>>> +#    [patch] btrfs: harden agaist duplicate fsid
>>> +#
>>> +seq=`basename $0`
>>> +seqres=$RESULT_DIR/$seq
>>> +echo "QA output created by $seq"
>>> +
>>> +here=`pwd`
>>> +tmp=/tmp/$$
>>> +status=1    # failure is the default!
>>> +trap "_cleanup; exit \$status" 0 1 2 3 15
>>> +
>>> +mnt=$TEST_DIR/$seq.mnt
>>> +_cleanup()
>>> +{
>>> +    rm -rf $mnt > /dev/null 2>&1
>>> +    cd /
>>> +    rm -f $tmp.*
>>> +}
>>> +
>>> +# get standard environment, filters and checks
>>> +. ./common/rc
>>> +. ./common/filter
>>> +
>>> +# remove previous $seqres.full before test
>>> +rm -f $seqres.full
>>> +
>>> +# real QA test starts here
>>> +
>>> +# Modify as appropriate.
>>> +_supported_fs btrfs
>>> +_supported_os Linux
>>> +_require_scratch_dev_pool 2
>>> +_scratch_dev_pool_get 2
>>> +
>>> +dev_foo=$(echo $SCRATCH_DEV_POOL | awk '{print $1}')
>>> +dev_bar=$(echo $SCRATCH_DEV_POOL | awk '{print $2}')
>>
>> Naming devices _foo and _bar just shows you could not care less about
>> the test.
> 
>   Hmm. Will make it device_1 and device_2.
> 
>> Either use sane names - primary_dev/secondary dev or device_1
>> and device_2.
> 
>>> +
>>> +echo dev_foo=$dev_foo >> $seqres.full
>>> +echo dev_bar=$dev_bar >> $seqres.full
>>> +echo | tee -a $seqres.full
>>> +
>>> +rm -rf $mnt > /dev/null 2>&1
>>
>> So what is $mnt? I can see it's used by very few tests and it's not
>> obvious? Generally you should either define it as a local variable or
>> use one of the SCRATCH_MNT/TEST_MNT global variables. Also I checked
>> with Eric re. the use of $mnt in tests and his conclusion is :
>>
>> " looks like a bug"
> 
>  No its not. As few lines above, I have assigned it as..
>     mnt=$TEST_DIR/$seq.mnt

I missed that, I will 

Re: [PATCH v2 rev log added] fstests: btrfs verify hardening agaist duplicate fsid

2018-10-26 Thread Anand Jain




On 10/26/2018 11:02 PM, Nikolay Borisov wrote:



On 8.10.18 г. 21:28 ч., Anand Jain wrote:

We have a known bug in btrfs, that we let the device path be changed
after the device has been mounted. So using this loop hole the new
copied device would appears as if its mounted immediately after its
been copied. So this test case reproduces this issue.

For example:

Initially.. /dev/mmcblk0p4 is mounted as /

lsblk
NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
mmcblk0 179:00 29.2G  0 disk
|-mmcblk0p4 179:404G  0 part /
|-mmcblk0p2 179:20  500M  0 part /boot
|-mmcblk0p3 179:30  256M  0 part [SWAP]
`-mmcblk0p1 179:10  256M  0 part /boot/efi

btrfs fi show
Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
 Total devices 1 FS bytes used 1.40GiB
 devid1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4

Copy mmcblk0 to sda
dd if=/dev/mmcblk0 of=/dev/sda

And immediately after the copy completes the change in the device
superblock is notified which the automount scans using
btrfs device scan and the new device sda becomes the mounted root
device.

lsblk
NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
sda   8:01 14.9G  0 disk
|-sda48:414G  0 part /
|-sda28:21  500M  0 part
|-sda38:31  256M  0 part
`-sda18:11  256M  0 part
mmcblk0 179:00 29.2G  0 disk
|-mmcblk0p4 179:404G  0 part
|-mmcblk0p2 179:20  500M  0 part /boot
|-mmcblk0p3 179:30  256M  0 part [SWAP]
`-mmcblk0p1 179:10  256M  0 part /boot/efi
btrfs fi show /
Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
 Total devices 1 FS bytes used 1.40GiB
 devid1 size 4.00GiB used 3.00GiB path /dev/sda4

The bug is quite nasty that you can't either unmount /dev/sda4 or
/dev/mmcblk0p4. And the problem does not get solved until you take
the sda out of the system on to another system to change its fsid using
the 'btrfstune -u' command.


Is there a pending fix for this?


Yes.
https://patchwork.kernel.org/patch/10641041/

Test case header mentioned it.





Signed-off-by: Anand Jain 
---
v1->v2:
   dont play around with dev patch use it as it is.
   do not use SCRATCH_MNT instead create it at the TEST_DIR and its related
changes.
   golden out changes

  tests/btrfs/173 | 88 +

  tests/btrfs/173.out |  6 
  tests/btrfs/group   |  1 +
  3 files changed, 95 insertions(+)
  create mode 100755 tests/btrfs/173
  create mode 100644 tests/btrfs/173.out

diff --git a/tests/btrfs/173 b/tests/btrfs/173
new file mode 100755
index ..b466ae921e19
--- /dev/null
+++ b/tests/btrfs/173
@@ -0,0 +1,88 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2018 Oracle. All Rights Reserved.
+#
+# FS QA Test 173
+#
+# Fuzzy test for FS image duplication.
+#  Could be fixed by
+#[patch] btrfs: harden agaist duplicate fsid
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+mnt=$TEST_DIR/$seq.mnt
+_cleanup()
+{
+   rm -rf $mnt > /dev/null 2>&1
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+
+# Modify as appropriate.
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch_dev_pool 2
+_scratch_dev_pool_get 2
+
+dev_foo=$(echo $SCRATCH_DEV_POOL | awk '{print $1}')
+dev_bar=$(echo $SCRATCH_DEV_POOL | awk '{print $2}')


Naming devices _foo and _bar just shows you could not care less about
the test.


  Hmm. Will make it device_1 and device_2.


Either use sane names - primary_dev/secondary dev or device_1
and device_2.



+
+echo dev_foo=$dev_foo >> $seqres.full
+echo dev_bar=$dev_bar >> $seqres.full
+echo | tee -a $seqres.full
+
+rm -rf $mnt > /dev/null 2>&1


So what is $mnt? I can see it's used by very few tests and it's not
obvious? Generally you should either define it as a local variable or
use one of the SCRATCH_MNT/TEST_MNT global variables. Also I checked
with Eric re. the use of $mnt in tests and his conclusion is :

" looks like a bug"


 No its not. As few lines above, I have assigned it as..
mnt=$TEST_DIR/$seq.mnt



+mkdir $mnt
+_mkfs_dev $dev_foo
+_mount $dev_foo $mnt
+
+check_btrfs_mount()
+{
+   local x=$(findmnt $mnt | grep -v TARGET | awk '{print $2}')
+   [[ $x == $dev_foo ]] && echo DEV_FOO
+   [[ $x == $dev_bar ]] && echo DEV_BAR
+}


Same thing here re. DEV_(FOO|BAR).


+
+echo MNT $(check_btrfs_mount)
+
+for sb_bytenr in 65536 67108864
+do
+   echo -n "dd status=none if=$dev_foo of=$dev_bar bs=1 "\
+   "seek=$sb_bytenr skip=$sb_bytenr count=4096" >> $seqres.full
+   dd status=none if=$dev_foo of=$dev_bar bs=1 seek=$sb_bytenr \
+   

Re: [PATCH v2 rev log added] fstests: btrfs verify hardening agaist duplicate fsid

2018-10-26 Thread Nikolay Borisov



On 8.10.18 г. 21:28 ч., Anand Jain wrote:
> We have a known bug in btrfs, that we let the device path be changed
> after the device has been mounted. So using this loop hole the new
> copied device would appears as if its mounted immediately after its
> been copied. So this test case reproduces this issue.
> 
> For example:
> 
> Initially.. /dev/mmcblk0p4 is mounted as /
> 
> lsblk
> NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
> mmcblk0 179:00 29.2G  0 disk
> |-mmcblk0p4 179:404G  0 part /
> |-mmcblk0p2 179:20  500M  0 part /boot
> |-mmcblk0p3 179:30  256M  0 part [SWAP]
> `-mmcblk0p1 179:10  256M  0 part /boot/efi
> 
> btrfs fi show
> Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
> Total devices 1 FS bytes used 1.40GiB
> devid1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4
> 
> Copy mmcblk0 to sda
> dd if=/dev/mmcblk0 of=/dev/sda
> 
> And immediately after the copy completes the change in the device
> superblock is notified which the automount scans using
> btrfs device scan and the new device sda becomes the mounted root
> device.
> 
> lsblk
> NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
> sda   8:01 14.9G  0 disk
> |-sda48:414G  0 part /
> |-sda28:21  500M  0 part
> |-sda38:31  256M  0 part
> `-sda18:11  256M  0 part
> mmcblk0 179:00 29.2G  0 disk
> |-mmcblk0p4 179:404G  0 part
> |-mmcblk0p2 179:20  500M  0 part /boot
> |-mmcblk0p3 179:30  256M  0 part [SWAP]
> `-mmcblk0p1 179:10  256M  0 part /boot/efi
> btrfs fi show /
> Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
> Total devices 1 FS bytes used 1.40GiB
> devid1 size 4.00GiB used 3.00GiB path /dev/sda4
> 
> The bug is quite nasty that you can't either unmount /dev/sda4 or
> /dev/mmcblk0p4. And the problem does not get solved until you take
> the sda out of the system on to another system to change its fsid using
> the 'btrfstune -u' command.

Is there a pending fix for this?

> 
> Signed-off-by: Anand Jain 
> ---
> v1->v2: 
>   dont play around with dev patch use it as it is.
>   do not use SCRATCH_MNT instead create it at the TEST_DIR and its related
>changes.
>   golden out changes
>
>  tests/btrfs/173 | 88 
> +
>  tests/btrfs/173.out |  6 
>  tests/btrfs/group   |  1 +
>  3 files changed, 95 insertions(+)
>  create mode 100755 tests/btrfs/173
>  create mode 100644 tests/btrfs/173.out
> 
> diff --git a/tests/btrfs/173 b/tests/btrfs/173
> new file mode 100755
> index ..b466ae921e19
> --- /dev/null
> +++ b/tests/btrfs/173
> @@ -0,0 +1,88 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2018 Oracle. All Rights Reserved.
> +#
> +# FS QA Test 173
> +#
> +# Fuzzy test for FS image duplication.
> +#  Could be fixed by
> +#[patch] btrfs: harden agaist duplicate fsid
> +#
> +seq=`basename $0`
> +seqres=$RESULT_DIR/$seq
> +echo "QA output created by $seq"
> +
> +here=`pwd`
> +tmp=/tmp/$$
> +status=1 # failure is the default!
> +trap "_cleanup; exit \$status" 0 1 2 3 15
> +
> +mnt=$TEST_DIR/$seq.mnt
> +_cleanup()
> +{
> + rm -rf $mnt > /dev/null 2>&1
> + cd /
> + rm -f $tmp.*
> +}
> +
> +# get standard environment, filters and checks
> +. ./common/rc
> +. ./common/filter
> +
> +# remove previous $seqres.full before test
> +rm -f $seqres.full
> +
> +# real QA test starts here
> +
> +# Modify as appropriate.
> +_supported_fs btrfs
> +_supported_os Linux
> +_require_scratch_dev_pool 2
> +_scratch_dev_pool_get 2
> +
> +dev_foo=$(echo $SCRATCH_DEV_POOL | awk '{print $1}')
> +dev_bar=$(echo $SCRATCH_DEV_POOL | awk '{print $2}')

Naming devices _foo and _bar just shows you could not care less about
the test. Either use sane names - primary_dev/secondary dev or device_1
and device_2.

> +
> +echo dev_foo=$dev_foo >> $seqres.full
> +echo dev_bar=$dev_bar >> $seqres.full
> +echo | tee -a $seqres.full
> +
> +rm -rf $mnt > /dev/null 2>&1

So what is $mnt? I can see it's used by very few tests and it's not
obvious? Generally you should either define it as a local variable or
use one of the SCRATCH_MNT/TEST_MNT global variables. Also I checked
with Eric re. the use of $mnt in tests and his conclusion is :

" looks like a bug"

> +mkdir $mnt
> +_mkfs_dev $dev_foo
> +_mount $dev_foo $mnt
> +
> +check_btrfs_mount()
> +{
> + local x=$(findmnt $mnt | grep -v TARGET | awk '{print $2}')
> + [[ $x == $dev_foo ]] && echo DEV_FOO
> + [[ $x == $dev_bar ]] && echo DEV_BAR
> +}

Same thing here re. DEV_(FOO|BAR).

> +
> +echo MNT $(check_btrfs_mount)
> +
> +for sb_bytenr in 65536 67108864
> +do
> + echo -n "dd status=none if=$dev_foo of=$dev_bar bs=1 "\
> + "seek=$sb_bytenr skip=$sb_bytenr count=4096" >> $seqres.full
> + dd status=none if=$dev_foo of=$dev_bar bs=1 seek=$sb_bytenr \
> + skip=$sb_bytenr count=4096 >> $seqres.full 

Re: Kernel crash related to LZO compression

2018-10-26 Thread Dmitry Katsubo

On 2018-10-25 20:49, Chris Murphy wrote:

I would say the first step no matter what if you're using an older
kernel, is to boot a current Fedora or Arch live or install media,
mount the Btrfs and try to read the problem files and see if the
problem still happens. I can't even being to estimate the tens of
thousands of line changes since kernel 4.9.


Good point Chris. Indeed booting a fresh kernel is never a problem.
Actually I forgot to mention that I've seen the same problem with
kernel 4.12.13 (attached).


What profile are you using for this Btrfs? Is this a raid56? What do
you get for 'btrfs fi us ' ?


It is RAID1 volume for both metadata and data, but unfortunately I
haven't recorded the actual output before the failure. The configuration
was like this:

# btrfs filesystem show /var/log
Label: none  uuid: 5b45ac8e-fd8c-4759-854a-94e45069959d
Total devices 2 FS bytes used 11.13GiB
devid3 size 50.00GiB used 14.03GiB path /dev/sda3
devid4 size 50.00GiB used 14.03GiB path /dev/sdc1

On 2018-10-25 20:49, Chris Murphy wrote:

It should be safe even with that kernel. I'm not sure this is
compression related. There is a corruption bug related to inline
extents and corruption that had been fairly elusive but I think it's
fixed now. I haven't run into it though.


On 2018-10-26 02:09, Qu Wenruo wrote:
Are there any updates / fixes done in that area? Is lzo option safe to 
use?


Yes, we have commits to harden lzo decompress code in v4.18:

de885e3ee281a88f52283c7e8994e762e3a5f6bd btrfs: lzo: Harden inline lzo
compressed extent decompression
314bfa473b6b6d3efe68011899bd718b349f29d7 btrfs: lzo: Add header length
check to avoid potential out-of-bounds acc

And for the root cause, it's compressed data without csum, then scrub
could make it corrupted.

It's also fixed in v4.18:

665d4953cde6d9e75c62a07ec8f4f8fd7d396ade btrfs: scrub: Don't use inode
page cache in scrub_handle_errored_block()
ac0b4145d662a3b9e34085dea460fb06ede9b69b btrfs: scrub: Don't use inode
pages for device replace


Thanks, Qu, for this information. Actually one time I've seen the binary
crap (not zeros) in text log files (/var/log/*.log) and I was surprised
that btrfs returned me data which is corrupted instead of signalling I/O
error. Could it be because of "compressed data without csum" problem?

Thanks!

--
With best regards,
Dmitry
[Sun Dec  3 19:39:55 2017] BUG: unable to handle kernel paging request at 
f80a3000
[Sun Dec  3 19:39:55 2017] IP: memcpy+0x11/0x20
[Sun Dec  3 19:39:55 2017] *pde = 370bb067 
[Sun Dec  3 19:39:55 2017] *pte =  
[Sun Dec  3 19:39:55 2017] Oops: 0002 [#1] SMP
[Sun Dec  3 19:39:55 2017] Modules linked in: bridge stp llc arc4 iTCO_wdt 
iTCO_vendor_support ppdev ath5k evdev ath mac80211 cfg80211 i915 coretemp 
pcspkr rfkill snd_hda_codec_realtek serio_raw snd_hda_codec_generic video 
snd_hda_intel drm_kms_helper snd_hda_codec lpc_ich drm snd_hda_core snd_hwdep 
i2c_algo_bit snd_pcm_oss snd_mixer_oss fb_sys_fops sg snd_pcm syscopyarea 
snd_timer sysfillrect rng_core snd sysimgblt soundcore parport_pc parport 
shpchp button acpi_cpufreq binfmt_misc w83627hf hwmon_vid ip_tables x_tables 
autofs4 ses enclosure scsi_transport_sas xfs libcrc32c hid_generic usbhid hid 
btrfs crc32c_generic xor raid6_pq uas usb_storage sr_mod cdrom sd_mod 
ata_generic ata_piix i2c_i801 libata scsi_mod firewire_ohci firewire_core 
crc_itu_t ehci_pci e1000e ptp pps_core uhci_hcd ehci_hcd usbcore usb_common
[Sun Dec  3 19:39:55 2017] CPU: 1 PID: 100 Comm: kworker/u4:2 Tainted: G
W   4.12.0-2-686 #1 Debian 4.12.13-1
[Sun Dec  3 19:39:55 2017] Hardware name: AOpen i945GMx-IF/i945GMx-IF, BIOS 
i945GMx-IF R1.01 Mar.02.2007 AOpen Inc. 03/02/2007
[Sun Dec  3 19:39:55 2017] Workqueue: btrfs-endio btrfs_endio_helper [btrfs]
[Sun Dec  3 19:39:55 2017] task: f7337280 task.stack: f695c000
[Sun Dec  3 19:39:55 2017] EIP: memcpy+0x11/0x20
[Sun Dec  3 19:39:55 2017] EFLAGS: 00010206 CPU: 1
[Sun Dec  3 19:39:55 2017] EAX: f80a2ff8 EBX: 1000 ECX: 03fe EDX: 
ff998000
[Sun Dec  3 19:39:55 2017] ESI: ff998008 EDI: f80a3000 EBP:  ESP: 
f695de88
[Sun Dec  3 19:39:55 2017]  DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
[Sun Dec  3 19:39:55 2017] CR0: 80050033 CR2: f9c00140 CR3: 36bc7000 CR4: 
06d0
[Sun Dec  3 19:39:55 2017] Call Trace:
[Sun Dec  3 19:39:55 2017]  ? lzo_decompress_bio+0x19f/0x2b0 [btrfs]
[Sun Dec  3 19:39:55 2017]  ? end_compressed_bio_read+0x28d/0x360 [btrfs]
[Sun Dec  3 19:39:55 2017]  ? btrfs_scrubparity_helper+0xb6/0x2c0 [btrfs]
[Sun Dec  3 19:39:55 2017]  ? process_one_work+0x135/0x2f0
[Sun Dec  3 19:39:55 2017]  ? worker_thread+0x39/0x3a0
[Sun Dec  3 19:39:55 2017]  ? kthread+0xd7/0x110
[Sun Dec  3 19:39:55 2017]  ? process_one_work+0x2f0/0x2f0
[Sun Dec  3 19:39:55 2017]  ? kthread_create_on_node+0x30/0x30
[Sun Dec  3 19:39:55 2017]  ? ret_from_fork+0x19/0x24
[Sun Dec  3 19:39:55 2017] Code: 43 58 2b 43 50 88 43 4e 5b eb ed 90 90 90 90 
90 90 90 90 90 90 90 90 90 90 90 

Re: [PATCH] btrfs-progs: add cli to forget one or all scanned devices

2018-10-26 Thread Nikolay Borisov



On 26.10.18 г. 17:27 ч., Anand Jain wrote:
> This patch adds cli
>   btrfs device forget [dev]
> to remove the given device structure in the kernel if the device
> is unmounted. If no argument is given it shall remove all stale
> (device which are not mounted) from the kernel.
> 
> Signed-off-by: Anand Jain 

Reviewed-by: Nikolay Borisov 

> ---
>  cmds-device.c | 72 
> ---
>  ioctl.h   |  2 ++
>  2 files changed, 61 insertions(+), 13 deletions(-)
> 
> diff --git a/cmds-device.c b/cmds-device.c
> index 2a05f70a76a9..280d6f555377 100644
> --- a/cmds-device.c
> +++ b/cmds-device.c
> @@ -254,10 +254,32 @@ static int cmd_device_delete(int argc, char **argv)
>   return _cmd_device_remove(argc, argv, cmd_device_delete_usage);
>  }
>  
> +static int btrfs_forget_devices(char *path)
> +{
> + struct btrfs_ioctl_vol_args args;
> + int ret;
> + int fd;
> +
> + fd = open("/dev/btrfs-control", O_RDWR);
> + if (fd < 0)
> + return -errno;
> +
> + memset(, 0, sizeof(args));
> + if (path)
> + strncpy_null(args.name, path);
> + ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, );
> + if (ret)
> + ret = -errno;
> + close(fd);
> + return ret;
> +}
> +
>  static const char * const cmd_device_scan_usage[] = {
> - "btrfs device scan [(-d|--all-devices)| [...]]",
> - "Scan devices for a btrfs filesystem",
> + "btrfs device scan [(-d|--all-devices)|(-u|--forget)| "\
> + "[...]]",
> + "Scan or forget (deregister) devices for a btrfs filesystem",
>   " -d|--all-devices (deprecated)",
> + " -u|--forget [ ..]",
>   NULL
>  };
>  
> @@ -267,37 +289,53 @@ static int cmd_device_scan(int argc, char **argv)
>   int devstart;
>   int all = 0;
>   int ret = 0;
> + int forget = 0;
>  
>   optind = 0;
>   while (1) {
>   int c;
>   static const struct option long_options[] = {
>   { "all-devices", no_argument, NULL, 'd'},
> + { "forget", no_argument, NULL, 'u'},
>   { NULL, 0, NULL, 0}
>   };
>  
> - c = getopt_long(argc, argv, "d", long_options, NULL);
> + c = getopt_long(argc, argv, "du", long_options, NULL);
>   if (c < 0)
>   break;
>   switch (c) {
>   case 'd':
>   all = 1;
>   break;
> + case 'u':
> + forget = 1;
> + break;
>   default:
>   usage(cmd_device_scan_usage);
>   }
>   }
>   devstart = optind;
>  
> + if (all && forget)
> + usage(cmd_device_scan_usage);
> +
>   if (all && check_argc_max(argc - optind, 1))
>   usage(cmd_device_scan_usage);
>  
>   if (all || argc - optind == 0) {
> - printf("Scanning for Btrfs filesystems\n");
> - ret = btrfs_scan_devices();
> - error_on(ret, "error %d while scanning", ret);
> - ret = btrfs_register_all_devices();
> - error_on(ret, "there are %d errors while registering devices", 
> ret);
> + if (forget) {
> + ret = btrfs_forget_devices(NULL);
> + error_on(ret, "'%s', forget failed",
> +  strerror(-ret));
> + } else {
> + printf("Scanning for Btrfs filesystems\n");
> + ret = btrfs_scan_devices();
> + error_on(ret, "error %d while scanning", ret);
> + ret = btrfs_register_all_devices();
> + error_on(ret,
> + "there are %d errors while registering devices",
> + ret);
> + }
>   goto out;
>   }
>  
> @@ -315,11 +353,19 @@ static int cmd_device_scan(int argc, char **argv)
>   ret = 1;
>   goto out;
>   }
> - printf("Scanning for Btrfs filesystems in '%s'\n", path);
> - if (btrfs_register_one_device(path) != 0) {
> - ret = 1;
> - free(path);
> - goto out;
> + if (forget) {
> + ret = btrfs_forget_devices(path);
> + if (ret)
> + error("Can't forget '%s': %s",
> + path, strerror(-ret));
> + } else {
> + printf("Scanning for Btrfs filesystems in '%s'\n",
> + path);
> + if (btrfs_register_one_device(path) != 0) {
> + ret = 1;
> + free(path);
> +   

Re: [PATCH] btrfs: introduce feature to forget a btrfs device

2018-10-26 Thread Nikolay Borisov



On 26.10.18 г. 17:27 ч., Anand Jain wrote:
> Support for a new command 'btrfs dev forget [dev]' is proposed here
> to undo the effects of 'btrfs dev scan [dev]'. For this purpose
> this patch proposes to use ioctl #5 as it was empty.
>   IOW(BTRFS_IOCTL_MAGIC, 5, ..)
> This patch adds new ioctl BTRFS_IOC_FORGET_DEV which can be sent from
> the /dev/btrfs-control to forget one or all devices, (devices which are
> not mounted) from the btrfs kernel.
> 
> The argument it takes is struct btrfs_ioctl_vol_args, and ::name can be
> set to specify the device path. And all unmounted devices can be removed
> from the kernel if no device path is provided.
> 
> Again, the devices are removed only if the relevant fsid aren't mounted.
> 
> This new cli can provide..
>  . Release of unwanted btrfs_fs_devices and btrfs_devices memory if the
>device is not going to be mounted.
>  . Ability to mount the device in degraded mode when one of the other
>device is corrupted like in split brain raid1.
>  . Running test cases which requires btrfs.ko-reload if the rootfs
>is btrfs.
> 
> Signed-off-by: Anand Jain 

Reviewed-by: Nikolay Borisov 

> ---
>  fs/btrfs/super.c   | 3 +++
>  fs/btrfs/volumes.c | 9 +
>  fs/btrfs/volumes.h | 1 +
>  include/uapi/linux/btrfs.h | 2 ++
>  4 files changed, 15 insertions(+)
> 
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 345c64d810d4..f99db6899004 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -2246,6 +2246,9 @@ static long btrfs_control_ioctl(struct file *file, 
> unsigned int cmd,
>   ret = PTR_ERR_OR_ZERO(device);
>   mutex_unlock(_mutex);
>   break;
> + case BTRFS_IOC_FORGET_DEV:
> + ret = btrfs_forget_devices(vol->name);
> + break;
>   case BTRFS_IOC_DEVICES_READY:
>   mutex_lock(_mutex);
>   device = btrfs_scan_one_device(vol->name, FMODE_READ,
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index f435d397019e..e1365a122657 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1208,6 +1208,15 @@ static int btrfs_read_disk_super(struct block_device 
> *bdev, u64 bytenr,
>   return 0;
>  }
>  
> +int btrfs_forget_devices(const char *path)
> +{
> + mutex_lock(_mutex);
> + btrfs_free_stale_devices(strlen(path) ? path:NULL, NULL);
> + mutex_unlock(_mutex);
> +
> + return 0;
> +}
> +
>  /*
>   * Look for a btrfs signature on a device. This may be called out of the 
> mount path
>   * and we are not allowed to call set_blocksize during the scan. The 
> superblock
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index aefce895e994..180297d04938 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -406,6 +406,7 @@ int btrfs_open_devices(struct btrfs_fs_devices 
> *fs_devices,
>  fmode_t flags, void *holder);
>  struct btrfs_device *btrfs_scan_one_device(const char *path,
>  fmode_t flags, void *holder);
> +int btrfs_forget_devices(const char *path);
>  int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
>  void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
>  void btrfs_assign_next_active_device(struct btrfs_device *device,
> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
> index 5ca1d21fc4a7..b1be7f828cb4 100644
> --- a/include/uapi/linux/btrfs.h
> +++ b/include/uapi/linux/btrfs.h
> @@ -836,6 +836,8 @@ enum btrfs_err_code {
>  struct btrfs_ioctl_vol_args)
>  #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
>  struct btrfs_ioctl_vol_args)
> +#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \
> +struct btrfs_ioctl_vol_args)
>  /* trans start and trans end are dangerous, and only for
>   * use by applications that know how to avoid the
>   * resulting deadlocks
> 


[PATCH] btrfs-progs: add cli to forget one or all scanned devices

2018-10-26 Thread Anand Jain
This patch adds cli
  btrfs device forget [dev]
to remove the given device structure in the kernel if the device
is unmounted. If no argument is given it shall remove all stale
(device which are not mounted) from the kernel.

Signed-off-by: Anand Jain 
---
 cmds-device.c | 72 ---
 ioctl.h   |  2 ++
 2 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/cmds-device.c b/cmds-device.c
index 2a05f70a76a9..280d6f555377 100644
--- a/cmds-device.c
+++ b/cmds-device.c
@@ -254,10 +254,32 @@ static int cmd_device_delete(int argc, char **argv)
return _cmd_device_remove(argc, argv, cmd_device_delete_usage);
 }
 
+static int btrfs_forget_devices(char *path)
+{
+   struct btrfs_ioctl_vol_args args;
+   int ret;
+   int fd;
+
+   fd = open("/dev/btrfs-control", O_RDWR);
+   if (fd < 0)
+   return -errno;
+
+   memset(, 0, sizeof(args));
+   if (path)
+   strncpy_null(args.name, path);
+   ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, );
+   if (ret)
+   ret = -errno;
+   close(fd);
+   return ret;
+}
+
 static const char * const cmd_device_scan_usage[] = {
-   "btrfs device scan [(-d|--all-devices)| [...]]",
-   "Scan devices for a btrfs filesystem",
+   "btrfs device scan [(-d|--all-devices)|(-u|--forget)| "\
+   "[...]]",
+   "Scan or forget (deregister) devices for a btrfs filesystem",
" -d|--all-devices (deprecated)",
+   " -u|--forget [ ..]",
NULL
 };
 
@@ -267,37 +289,53 @@ static int cmd_device_scan(int argc, char **argv)
int devstart;
int all = 0;
int ret = 0;
+   int forget = 0;
 
optind = 0;
while (1) {
int c;
static const struct option long_options[] = {
{ "all-devices", no_argument, NULL, 'd'},
+   { "forget", no_argument, NULL, 'u'},
{ NULL, 0, NULL, 0}
};
 
-   c = getopt_long(argc, argv, "d", long_options, NULL);
+   c = getopt_long(argc, argv, "du", long_options, NULL);
if (c < 0)
break;
switch (c) {
case 'd':
all = 1;
break;
+   case 'u':
+   forget = 1;
+   break;
default:
usage(cmd_device_scan_usage);
}
}
devstart = optind;
 
+   if (all && forget)
+   usage(cmd_device_scan_usage);
+
if (all && check_argc_max(argc - optind, 1))
usage(cmd_device_scan_usage);
 
if (all || argc - optind == 0) {
-   printf("Scanning for Btrfs filesystems\n");
-   ret = btrfs_scan_devices();
-   error_on(ret, "error %d while scanning", ret);
-   ret = btrfs_register_all_devices();
-   error_on(ret, "there are %d errors while registering devices", 
ret);
+   if (forget) {
+   ret = btrfs_forget_devices(NULL);
+   error_on(ret, "'%s', forget failed",
+strerror(-ret));
+   } else {
+   printf("Scanning for Btrfs filesystems\n");
+   ret = btrfs_scan_devices();
+   error_on(ret, "error %d while scanning", ret);
+   ret = btrfs_register_all_devices();
+   error_on(ret,
+   "there are %d errors while registering devices",
+   ret);
+   }
goto out;
}
 
@@ -315,11 +353,19 @@ static int cmd_device_scan(int argc, char **argv)
ret = 1;
goto out;
}
-   printf("Scanning for Btrfs filesystems in '%s'\n", path);
-   if (btrfs_register_one_device(path) != 0) {
-   ret = 1;
-   free(path);
-   goto out;
+   if (forget) {
+   ret = btrfs_forget_devices(path);
+   if (ret)
+   error("Can't forget '%s': %s",
+   path, strerror(-ret));
+   } else {
+   printf("Scanning for Btrfs filesystems in '%s'\n",
+   path);
+   if (btrfs_register_one_device(path) != 0) {
+   ret = 1;
+   free(path);
+   goto out;
+   }
}
free(path);
}
diff --git a/ioctl.h b/ioctl.h
index 

[PATCH v11] Add cli and ioctl to forget scanned device(s)

2018-10-26 Thread Anand Jain
v11:
 btrfs-progs: Bring the code into the else part of if(forget).
  Use strerror to print the erorr instead of ret.

v10:
 Make btrfs-progs changes more readable.
 With an effort to keep the known bug [1] as it is..
  [1]
   The cli 'btrfs device scan --all /dev/sdb' which should have scanned
only one device, ends up scanning all the devices and I am not trying
to fix this bug in this patch because..
  . -d|--all is marked as deprecated, I hope -d option would go away
  . For now some script might be using this bug as a feature, and fixing
this bug might lead to mount failure.

v9:
 Make forget as a btrfs device scan option.
 Use forget in the fstests, now you can run fstests with btrfs as rootfs
  which helps to exercise the uuid_mutex lock.

v8:
 Change log update in the kernel patch.

v7:
 Use struct btrfs_ioctl_vol_args (instead of struct
  btrfs_ioctl_vol_args_v2) as its inline with other ioctl
  btrfs-control
 The CLI usage/features remains same. However internally the ioctl flag
  is not required to delete all the unmounted devices. Instead leave
  btrfs_ioctl_vol_args::name NULL.

v6:
 Use the changed fn name btrfs_free_stale_devices().

 Change in title:
 Old v5:
 Cover-letter:
  [PATCH v5] Add cli and ioctl to ignore a scanned device
 Kernel:
  [PATCH v5] btrfs: introduce feature to ignore a btrfs device
 Progs:
  [PATCH v5] btrfs-progs: add 'btrfs device ignore' cli

v5:
  Adds feature to delete all stale devices
  Reuses btrfs_free_stale_devices() fn and so depends on the
patch-set [1] in the ML.
  Uses struct btrfs_ioctl_vol_args_v2 instead of
struct btrfs_ioctl_vol_args as arg
  Does the device path matching instead of btrfs_device matching
(we won't delete the mounted device as btrfs_free_stale_devices()
checks for it)
v4:
  No change. But as the ML thread may be confusing, so resend.
v3:
  No change. Send to correct ML.
v2:
  Accepts review from Nikolay, details are in the specific patch.
  Patch 1/2 is renamed from
[PATCH 1/2] btrfs: refactor btrfs_free_stale_device() to get device list 
delete
  to
[PATCH 1/2] btrfs: add function to device list delete

Adds cli and ioctl to forget a scanned device or forget all stale
devices in the kernel.


Anand Jain (1):
  btrfs: introduce feature to forget a btrfs device

 fs/btrfs/super.c   | 3 +++
 fs/btrfs/volumes.c | 9 +
 fs/btrfs/volumes.h | 1 +
 include/uapi/linux/btrfs.h | 2 ++
 4 files changed, 15 insertions(+)

Anand Jain (1):
  btrfs-progs: add cli to forget one or all scanned devices

 cmds-device.c | 63 ++-
 ioctl.h   |  2 ++
 2 files changed, 56 insertions(+), 9 deletions(-)

[1]
Anand Jain (1):
  fstests: btrfs use forget if not reload

 common/btrfs| 20 
 tests/btrfs/124 |  6 +++---
 tests/btrfs/125 |  6 +++---
 tests/btrfs/154 |  6 +++---
 tests/btrfs/164 |  4 ++--
 5 files changed, 31 insertions(+), 11 deletions(-)

-- 
1.8.3.1



[PATCH] btrfs: introduce feature to forget a btrfs device

2018-10-26 Thread Anand Jain
Support for a new command 'btrfs dev forget [dev]' is proposed here
to undo the effects of 'btrfs dev scan [dev]'. For this purpose
this patch proposes to use ioctl #5 as it was empty.
IOW(BTRFS_IOCTL_MAGIC, 5, ..)
This patch adds new ioctl BTRFS_IOC_FORGET_DEV which can be sent from
the /dev/btrfs-control to forget one or all devices, (devices which are
not mounted) from the btrfs kernel.

The argument it takes is struct btrfs_ioctl_vol_args, and ::name can be
set to specify the device path. And all unmounted devices can be removed
from the kernel if no device path is provided.

Again, the devices are removed only if the relevant fsid aren't mounted.

This new cli can provide..
 . Release of unwanted btrfs_fs_devices and btrfs_devices memory if the
   device is not going to be mounted.
 . Ability to mount the device in degraded mode when one of the other
   device is corrupted like in split brain raid1.
 . Running test cases which requires btrfs.ko-reload if the rootfs
   is btrfs.

Signed-off-by: Anand Jain 
---
 fs/btrfs/super.c   | 3 +++
 fs/btrfs/volumes.c | 9 +
 fs/btrfs/volumes.h | 1 +
 include/uapi/linux/btrfs.h | 2 ++
 4 files changed, 15 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 345c64d810d4..f99db6899004 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2246,6 +2246,9 @@ static long btrfs_control_ioctl(struct file *file, 
unsigned int cmd,
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(_mutex);
break;
+   case BTRFS_IOC_FORGET_DEV:
+   ret = btrfs_forget_devices(vol->name);
+   break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(_mutex);
device = btrfs_scan_one_device(vol->name, FMODE_READ,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f435d397019e..e1365a122657 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1208,6 +1208,15 @@ static int btrfs_read_disk_super(struct block_device 
*bdev, u64 bytenr,
return 0;
 }
 
+int btrfs_forget_devices(const char *path)
+{
+   mutex_lock(_mutex);
+   btrfs_free_stale_devices(strlen(path) ? path:NULL, NULL);
+   mutex_unlock(_mutex);
+
+   return 0;
+}
+
 /*
  * Look for a btrfs signature on a device. This may be called out of the mount 
path
  * and we are not allowed to call set_blocksize during the scan. The superblock
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index aefce895e994..180297d04938 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -406,6 +406,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
   fmode_t flags, void *holder);
 struct btrfs_device *btrfs_scan_one_device(const char *path,
   fmode_t flags, void *holder);
+int btrfs_forget_devices(const char *path);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
 void btrfs_assign_next_active_device(struct btrfs_device *device,
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 5ca1d21fc4a7..b1be7f828cb4 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -836,6 +836,8 @@ enum btrfs_err_code {
   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \
+  struct btrfs_ioctl_vol_args)
 /* trans start and trans end are dangerous, and only for
  * use by applications that know how to avoid the
  * resulting deadlocks
-- 
1.8.3.1



Re: [PATCH] btrfs-progs: add cli to forget one or all scanned devices

2018-10-26 Thread Anand Jain




On 10/26/2018 08:21 PM, Nikolay Borisov wrote:



On 24.10.2018 07:31, Anand Jain wrote:

This patch adds cli
   btrfs device forget [dev]
to remove the given device structure in the kernel if the device
is unmounted. If no argument is given it shall remove all stale
(device which are not mounted) from the kernel.

Signed-off-by: Anand Jain 
---
  cmds-device.c | 59 +++
  ioctl.h   |  2 ++
  2 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/cmds-device.c b/cmds-device.c
index 2a05f70a76a9..c2a2b7f304b8 100644
--- a/cmds-device.c
+++ b/cmds-device.c
@@ -254,10 +254,32 @@ static int cmd_device_delete(int argc, char **argv)
return _cmd_device_remove(argc, argv, cmd_device_delete_usage);
  }
  
+static int btrfs_forget_devices(char *path)

+{
+   struct btrfs_ioctl_vol_args args;
+   int ret;
+   int fd;
+
+   fd = open("/dev/btrfs-control", O_RDWR);
+   if (fd < 0)
+   return -errno;
+
+   memset(, 0, sizeof(args));
+   if (path)
+   strncpy_null(args.name, path);
+   ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, );
+   if (ret)
+   ret = -errno;
+   close(fd);
+   return ret;
+}
+
  static const char * const cmd_device_scan_usage[] = {
-   "btrfs device scan [(-d|--all-devices)| [...]]",
-   "Scan devices for a btrfs filesystem",
+   "btrfs device scan [(-d|--all-devices)|(-u|--forget)| "\
+   "[...]]",
+   "Scan or forget (deregister) devices for a btrfs filesystem",
" -d|--all-devices (deprecated)",
+   " -u|--forget [ ..]",
NULL
  };
  
@@ -267,32 +289,45 @@ static int cmd_device_scan(int argc, char **argv)

int devstart;
int all = 0;
int ret = 0;
+   int forget = 0;
  
  	optind = 0;

while (1) {
int c;
static const struct option long_options[] = {
{ "all-devices", no_argument, NULL, 'd'},
+   { "forget", no_argument, NULL, 'u'},
{ NULL, 0, NULL, 0}
};
  
-		c = getopt_long(argc, argv, "d", long_options, NULL);

+   c = getopt_long(argc, argv, "du", long_options, NULL);
if (c < 0)
break;
switch (c) {
case 'd':
all = 1;
break;
+   case 'u':
+   forget = 1;
+   break;
default:
usage(cmd_device_scan_usage);
}
}
devstart = optind;
  
+	if (all && forget)

+   usage(cmd_device_scan_usage);
+
if (all && check_argc_max(argc - optind, 1))
usage(cmd_device_scan_usage);
  
  	if (all || argc - optind == 0) {

+   if (forget) {
+   ret = btrfs_forget_devices(NULL);
+   error_on(ret, "error %d while running forget", ret);
+   goto out;
+   }


nit: I would prefer to also have an if () {} else {} construct here,
similar to what you do below.


 Will fix.

 Also, I think I can use sterror() instead of ret, in the error_on().

 Fixed both of these in v11.

Thanks, Anand


printf("Scanning for Btrfs filesystems\n");
ret = btrfs_scan_devices();
error_on(ret, "error %d while scanning", ret);
@@ -315,11 +350,19 @@ static int cmd_device_scan(int argc, char **argv)
ret = 1;
goto out;
}
-   printf("Scanning for Btrfs filesystems in '%s'\n", path);
-   if (btrfs_register_one_device(path) != 0) {
-   ret = 1;
-   free(path);
-   goto out;
+   if (forget) {
+   ret = btrfs_forget_devices(path);
+   if (ret)
+   error("Can't forget '%s': %s",
+   path, strerror(-ret));
+   } else {
+   printf("Scanning for Btrfs filesystems in '%s'\n",
+   path);
+   if (btrfs_register_one_device(path) != 0) {
+   ret = 1;
+   free(path);
+   goto out;
+   }
}
free(path);
}
diff --git a/ioctl.h b/ioctl.h
index 709e996f401c..e27d80e09392 100644
--- a/ioctl.h
+++ b/ioctl.h
@@ -721,6 +721,8 @@ static inline char *btrfs_err_str(enum btrfs_err_code 
err_code)
   struct btrfs_ioctl_vol_args)
  #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
 

Re: [PATCH] btrfs: Fix error handling in btrfs_cleanup_ordered_extents

2018-10-26 Thread kbuild test robot
Hi Nikolay,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on v4.19]
[also build test ERROR on next-20181019]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Nikolay-Borisov/btrfs-Fix-error-handling-in-btrfs_cleanup_ordered_extents/20181026-194005
config: x86_64-randconfig-x014-201842 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64 

All error/warnings (new ones prefixed by >>):

   fs/btrfs/inode.c: In function 'btrfs_cleanup_ordered_extents':
>> fs/btrfs/inode.c:10618:0: error: unterminated argument list invoking macro 
>> "if"
};

>> fs/btrfs/inode.c:140:2: error: expected '(' at end of input
 if (page_start >= offset && page_end <= (offset + bytes - 1) {
 ^~
>> fs/btrfs/inode.c:140:2: warning: this 'if' clause does not guard... 
>> [-Wmisleading-indentation]
   fs/btrfs/inode.c:10618:0: note: ...this statement, but the latter is 
misleadingly indented as if it were guarded by the 'if'
};

>> fs/btrfs/inode.c:140:2: error: expected declaration or statement at end of 
>> input
 if (page_start >= offset && page_end <= (offset + bytes - 1) {
 ^~
   fs/btrfs/inode.c:122:6: warning: unused variable 'page_end' 
[-Wunused-variable]
 u64 page_end = page_start + PAGE_SIZE - 1;
 ^~~~
   fs/btrfs/inode.c: At top level:
   fs/btrfs/inode.c:87:12: warning: 'btrfs_setsize' declared 'static' but never 
defined [-Wunused-function]
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
   ^
   fs/btrfs/inode.c:88:12: warning: 'btrfs_truncate' declared 'static' but 
never defined [-Wunused-function]
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
   ^~
   fs/btrfs/inode.c:89:12: warning: 'btrfs_finish_ordered_io' declared 'static' 
but never defined [-Wunused-function]
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent 
*ordered_extent);
   ^~~
   fs/btrfs/inode.c:90:21: warning: 'cow_file_range' declared 'static' but 
never defined [-Wunused-function]
static noinline int cow_file_range(struct inode *inode,
^~
   fs/btrfs/inode.c:95:27: warning: 'create_io_em' declared 'static' but never 
defined [-Wunused-function]
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 
len,
  ^~~~
   fs/btrfs/inode.c:101:13: warning: '__endio_write_update_ordered' declared 
'static' but never defined [-Wunused-function]
static void __endio_write_update_ordered(struct inode *inode,
^~~~
   fs/btrfs/inode.c:71:27: warning: 'btrfs_inode_cachep' defined but not used 
[-Wunused-variable]
static struct kmem_cache *btrfs_inode_cachep;
  ^~

vim +/if +10618 fs/btrfs/inode.c

76dda93c Yan, Zheng  2009-09-21  10615  
82d339d9 Alexey Dobriyan 2009-10-09  10616  const struct dentry_operations 
btrfs_dentry_operations = {
76dda93c Yan, Zheng  2009-09-21  10617  .d_delete   = 
btrfs_dentry_delete,
76dda93c Yan, Zheng  2009-09-21 @10618  };

:: The code at line 10618 was first introduced by commit
:: 76dda93c6ae2c1dc3e6cde34569d6aca26b0c918 Btrfs: add snapshot/subvolume 
destroy ioctl

:: TO: Yan, Zheng 
:: CC: Chris Mason 

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH 5/5] btrfs: Replace BUG_ON with ASSERT in find_lock_delalloc_range

2018-10-26 Thread Qu Wenruo



On 2018/10/26 下午7:43, Nikolay Borisov wrote:
> lock_delalloc_pages should only return 2 values - 0 in case of success
> and -EAGAIN if the range of pages to be locked should be shrunk due to
> some of gone. Manual inspections confirms that this is
> indeed the case since __process_pages_contig is where lock_delalloc_pages
> gets its return value. The latter always returns 0  or -EAGAIN so the
> invariant holds. No functional changes.
> 
> Signed-off-by: Nikolay Borisov 

Reviewed-by: Qu Wenruo 

Thanks,
Qu

> ---
>  fs/btrfs/extent_io.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 1a9a521aefe5..94bc53472031 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1606,6 +1606,7 @@ static noinline_for_stack u64 
> find_lock_delalloc_range(struct inode *inode,
>   /* step two, lock all the pages after the page that has start */
>   ret = lock_delalloc_pages(inode, locked_page,
> delalloc_start, delalloc_end);
> + ASSERT(!ret || ret == -EAGAIN);
>   if (ret == -EAGAIN) {
>   /* some of the pages are gone, lets avoid looping by
>* shortening the size of the delalloc range we're searching
> @@ -1621,7 +1622,6 @@ static noinline_for_stack u64 
> find_lock_delalloc_range(struct inode *inode,
>   goto out_failed;
>   }
>   }
> - BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
>  
>   /* step three, lock the state bits for the whole range */
>   lock_extent_bits(tree, delalloc_start, delalloc_end, _state);
> 


Re: [PATCH 4/5] btrfs: Sink find_lock_delalloc_range's 'max_bytes' argument

2018-10-26 Thread Qu Wenruo



On 2018/10/26 下午7:43, Nikolay Borisov wrote:
> All callers of this function pass BTRFS_MAX_EXTENT_SIZE (128M) so let's
> reduce the argument count and make that a local variable. No functional
> changes.
> 
> Signed-off-by: Nikolay Borisov 

Reviewed-by: Qu Wenruo 

Thanks,
Qu

> ---
>  fs/btrfs/extent_io.c | 11 +--
>  fs/btrfs/extent_io.h |  2 +-
>  fs/btrfs/tests/extent-io-tests.c | 10 +-
>  3 files changed, 11 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 6877a74c7469..1a9a521aefe5 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1566,8 +1566,9 @@ static noinline int lock_delalloc_pages(struct inode 
> *inode,
>  static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
>   struct extent_io_tree *tree,
>   struct page *locked_page, u64 *start,
> - u64 *end, u64 max_bytes)
> + u64 *end)
>  {
> + u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
>   u64 delalloc_start;
>   u64 delalloc_end;
>   u64 found;
> @@ -1647,10 +1648,9 @@ static noinline_for_stack u64 
> find_lock_delalloc_range(struct inode *inode,
>  u64 btrfs_find_lock_delalloc_range(struct inode *inode,
>   struct extent_io_tree *tree,
>   struct page *locked_page, u64 *start,
> - u64 *end, u64 max_bytes)
> + u64 *end)
>  {
> - return find_lock_delalloc_range(inode, tree, locked_page, start, end,
> - max_bytes);
> + return find_lock_delalloc_range(inode, tree, locked_page, start, end);
>  }
>  #endif
>  
> @@ -3233,8 +3233,7 @@ static noinline_for_stack int writepage_delalloc(struct 
> inode *inode,
>   nr_delalloc = find_lock_delalloc_range(inode, tree,
>  page,
>  _start,
> -_end,
> -BTRFS_MAX_EXTENT_SIZE);
> +_end);
>   if (nr_delalloc == 0) {
>   delalloc_start = delalloc_end + 1;
>   continue;
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 369daa5d4f73..7ec4f93caf78 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -549,7 +549,7 @@ int free_io_failure(struct extent_io_tree *failure_tree,
>  u64 btrfs_find_lock_delalloc_range(struct inode *inode,
> struct extent_io_tree *tree,
> struct page *locked_page, u64 *start,
> -   u64 *end, u64 max_bytes);
> +   u64 *end);
>  #endif
>  struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
>  u64 start);
> diff --git a/fs/btrfs/tests/extent-io-tests.c 
> b/fs/btrfs/tests/extent-io-tests.c
> index 9e0f4a01be14..8ea8c2aa6696 100644
> --- a/fs/btrfs/tests/extent-io-tests.c
> +++ b/fs/btrfs/tests/extent-io-tests.c
> @@ -107,7 +107,7 @@ static int test_find_delalloc(u32 sectorsize)
>   start = 0;
>   end = 0;
>   found = btrfs_find_lock_delalloc_range(inode, , locked_page, ,
> -  , max_bytes);
> +  );
>   if (!found) {
>   test_err("should have found at least one delalloc");
>   goto out_bits;
> @@ -138,7 +138,7 @@ static int test_find_delalloc(u32 sectorsize)
>   start = test_start;
>   end = 0;
>   found = btrfs_find_lock_delalloc_range(inode, , locked_page, ,
> -  , max_bytes);
> +  );
>   if (!found) {
>   test_err("couldn't find delalloc in our range");
>   goto out_bits;
> @@ -172,7 +172,7 @@ static int test_find_delalloc(u32 sectorsize)
>   start = test_start;
>   end = 0;
>   found = btrfs_find_lock_delalloc_range(inode, , locked_page, ,
> -  , max_bytes);
> +  );
>   if (found) {
>   test_err("found range when we shouldn't have");
>   goto out_bits;
> @@ -193,7 +193,7 @@ static int test_find_delalloc(u32 sectorsize)
>   start = test_start;
>   end = 0;
>   found = btrfs_find_lock_delalloc_range(inode, , locked_page, ,
> -  , max_bytes);
> +  );
>   if (!found) {
>   test_err("didn't find our range");
>   goto out_bits;
> @@ -234,7 +234,7 @@ static int test_find_delalloc(u32 sectorsize)
>* tests expected 

Re: [PATCH 3/5] btrfs: Remove superfluous check form btrfs_remove_chunk

2018-10-26 Thread Qu Wenruo



On 2018/10/26 下午7:43, Nikolay Borisov wrote:
> It's unnecessary to check map->stripes[i].dev for NULL given its value
> is already set and dereferenced above the the check. No functional changes.
> 
> Signed-off-by: Nikolay Borisov 

Reviewed-by: Qu Wenruo 

Thanks,
Qu

> ---
>  fs/btrfs/volumes.c | 12 +---
>  1 file changed, 5 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index dc53d94a62aa..f0db43d08456 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -2797,13 +2797,11 @@ int btrfs_remove_chunk(struct btrfs_trans_handle 
> *trans, u64 chunk_offset)
>   mutex_unlock(_info->chunk_mutex);
>   }
>  
> - if (map->stripes[i].dev) {
> - ret = btrfs_update_device(trans, map->stripes[i].dev);
> - if (ret) {
> - mutex_unlock(_devices->device_list_mutex);
> - btrfs_abort_transaction(trans, ret);
> - goto out;
> - }
> + ret = btrfs_update_device(trans, device);
> + if (ret) {
> + mutex_unlock(_devices->device_list_mutex);
> + btrfs_abort_transaction(trans, ret);
> + goto out;
>   }
>   }
>   mutex_unlock(_devices->device_list_mutex);
> 


Re: [PATCH 1/5] btrfs: Ensure at least 1g is free for balance

2018-10-26 Thread Hans van Kranenburg
On 10/26/18 2:16 PM, Nikolay Borisov wrote:
> 
> (Adding Chris to CC since he is the original author of the code)
> 
> On 26.10.2018 15:09, Hans van Kranenburg wrote:
>> On 10/26/18 1:43 PM, Nikolay Borisov wrote:
>>> The first part of balance operation is to shrink every constituting
>>> device to ensure there is free space for chunk allocation.
>>
>> A very useful thing to be able to do if there's no unallocated raw disk
>> space left, is use balance to rewrite some blockgroup into the free
>> space of other ones.
>>
>> This does not need any raw space for a chunk allocation. Requiring so
>> makes it unneccesarily more complicated for users to fix the situation
>> they got in.
> 
> The current logic of the code doesn't preclude this use case since if we
> can't shrink we just proceed to relocation. So even if 1g is replaced
> with 5eb and shrink_device always fails balancing will still proceed.
> 
> However, all of this really makes me wonder do we even need this "first
> stage" code in balancing (Chris, any thoughts?).

Having something that tries to free up 1G on each device can be very
useful when converting to another profile. I guess that's why it is there.

And shrink device + grow device is a nice way to try packing some data
into other existing block groups. Only... if shrink+grow is done for
each device after each other, and there's one with 1G unallocated left,
and it can't easily move it into existing free space, then it may just
be moving data around to the next disk all the time I guess, ending up
with the same situation as before. :D

>>> However, the code
>>> has been buggy ever since its introduction since calculating the space to 
>>> shrink
>>> the device by was bounded by 1 mb. Most likely the original intention was to
>>> have an upper bound of 1g and not 1m, since the largest chunk size is 1g. 
>>> This 
>>> means the first stage in __btrfs_balance so far has been a null op since it
>>> effectively freed just a single megabyte.
>>>
>>> Fix this by setting an upper bound of size_to_free of 1g. 
>>>
>>> Signed-off-by: Nikolay Borisov 
>>> ---
>>>  fs/btrfs/volumes.c | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>> index f435d397019e..8b0fd7bf3447 100644
>>> --- a/fs/btrfs/volumes.c
>>> +++ b/fs/btrfs/volumes.c
>>> @@ -3467,7 +3467,7 @@ static int __btrfs_balance(struct btrfs_fs_info 
>>> *fs_info)
>>> list_for_each_entry(device, devices, dev_list) {
>>> old_size = btrfs_device_get_total_bytes(device);
>>> size_to_free = div_factor(old_size, 1);
>>> -   size_to_free = min_t(u64, size_to_free, SZ_1M);
>>> +   size_to_free = min_t(u64, size_to_free, SZ_1G);
>>> if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state) ||
>>> btrfs_device_get_total_bytes(device) -
>>> btrfs_device_get_bytes_used(device) > size_to_free ||
>>>
>>
>>


-- 
Hans van Kranenburg


Re: [PATCH 2/5] btrfs: Refactor btrfs_can_relocate

2018-10-26 Thread Qu Wenruo



On 2018/10/26 下午7:43, Nikolay Borisov wrote:
> btrfs_can_relocate returns 0 when it concludes the given chunk can be
> relocated and -1 otherwise. Since this function is used as a predicated
> and it return a binary value it makes no sense to have it's return
> value as an int so change it to bool. Furthermore, remove a stale
> leftover comment from e6ec716f0ddb ("Btrfs: make raid attr array more
> readable").
> 
> Finally make the logic in the list_for_each_entry loop a bit more obvious. Up
> until now the fact that we are returning success (0) was a bit masked by the
> fact that the 0 is re-used from the return value of find_free_dev_extent.
> Instead, now just increment dev_nr if we find a suitable extent and explicitly
> set can_reloc to true if enough devices with unallocated space are present.
> No functional changes.
> 
> Signed-off-by: Nikolay Borisov 

Reviewed-by: Qu Wenruo 

Thanks,
Qu

> ---
>  fs/btrfs/ctree.h   |  2 +-
>  fs/btrfs/extent-tree.c | 39 ++-
>  fs/btrfs/volumes.c |  3 +--
>  3 files changed, 16 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 68ca41dbbef3..06edc4f9ceb2 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -2680,7 +2680,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle 
> *trans,
>  int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
>  int btrfs_free_block_groups(struct btrfs_fs_info *info);
>  int btrfs_read_block_groups(struct btrfs_fs_info *info);
> -int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr);
> +bool btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr);
>  int btrfs_make_block_group(struct btrfs_trans_handle *trans,
>  u64 bytes_used, u64 type, u64 chunk_offset,
>  u64 size);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index a1febf155747..816bca482358 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -9423,10 +9423,10 @@ void btrfs_dec_block_group_ro(struct 
> btrfs_block_group_cache *cache)
>  /*
>   * checks to see if its even possible to relocate this block group.
>   *
> - * @return - -1 if it's not a good idea to relocate this block group, 0 if 
> its
> - * ok to go ahead and try.
> + * @return - false if not enough space can be found for relocation, true
> + * otherwise
>   */
> -int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
> +bool btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
>  {
>   struct btrfs_root *root = fs_info->extent_root;
>   struct btrfs_block_group_cache *block_group;
> @@ -9441,7 +9441,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, 
> u64 bytenr)
>   int debug;
>   int index;
>   int full = 0;
> - int ret = 0;
> + bool can_reloc = true;
>  
>   debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
>  
> @@ -9453,7 +9453,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, 
> u64 bytenr)
>   btrfs_warn(fs_info,
>  "can't find block group for bytenr %llu",
>  bytenr);
> - return -1;
> + return false;
>   }
>  
>   min_free = btrfs_block_group_used(_group->item);
> @@ -9489,16 +9489,8 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, 
> u64 bytenr)
>* group is going to be restriped, run checks against the target
>* profile instead of the current one.
>*/
> - ret = -1;
> + can_reloc = false;
>  
> - /*
> -  * index:
> -  *  0: raid10
> -  *  1: raid1
> -  *  2: dup
> -  *  3: raid0
> -  *  4: single
> -  */
>   target = get_restripe_target(fs_info, block_group->flags);
>   if (target) {
>   index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
> @@ -9534,10 +9526,8 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, 
> u64 bytenr)
>  
>   /* We need to do this so that we can look at pending chunks */
>   trans = btrfs_join_transaction(root);
> - if (IS_ERR(trans)) {
> - ret = PTR_ERR(trans);
> + if (IS_ERR(trans))
>   goto out;
> - }
>  
>   mutex_lock(_info->chunk_mutex);
>   list_for_each_entry(device, _devices->alloc_list, dev_alloc_list) {
> @@ -9549,18 +9539,17 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, 
> u64 bytenr)
>*/
>   if (device->total_bytes > device->bytes_used + min_free &&
>   !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, >dev_state)) 
> {
> - ret = find_free_dev_extent(trans, device, min_free,
> -_offset, NULL);
> - if (!ret)
> + if (!find_free_dev_extent(trans, device, min_free,
> +_offset, NULL))
>   

Re: [PATCH 1/5] btrfs: Ensure at least 1g is free for balance

2018-10-26 Thread Qu Wenruo



On 2018/10/26 下午8:08, Nikolay Borisov wrote:
> 
> 
> On 26.10.2018 15:04, Qu Wenruo wrote:
>>
>>
>> On 2018/10/26 下午7:43, Nikolay Borisov wrote:
>>> The first part of balance operation is to shrink every constituting
>>> device to ensure there is free space for chunk allocation. However, the code
>>> has been buggy ever since its introduction since calculating the space to 
>>> shrink
>>> the device by was bounded by 1 mb. Most likely the original intention was to
>>> have an upper bound of 1g and not 1m, since the largest chunk size is 1g.
>>
>> Minor nitpick, largest chunk size -> largest chunk stripe size.
>>
>> As for data chunk, it's possible to get a 10G chunk, but still only 1G
>> stripe up limit.
>>
>>> This 
>>> means the first stage in __btrfs_balance so far has been a null op since it
>>> effectively freed just a single megabyte.
>>>
>>> Fix this by setting an upper bound of size_to_free of 1g. 
>>
>> One question come to me naturally, what if we failed to shrink the device?
>>
>> In fact if btrfs_shrink_device() returns ENOSPC we just skip to
>> relocation part, so it doesn't look like to cause regression.
>>
>> If this can be mentioned in the commit message, it would save reviewer
>> minutes to read the code.
> 
> Will incorporate it in v2.
> 
>>
>>
>>
>> BTW, I think for that (ret == ENOSPC) after btrfs_shrink_device(), we
>> should continue other than break, to get more chance to secure
>> unallocated space.
> 
> I agree but this should be done in a separate patch, this one deals with
> the silly upper bound of 1m.

No problem, just a hint for a new patch :)

Thanks,
Qu

> 
>>
>> Thanks,
>> Qu
>>
>>>
>>> Signed-off-by: Nikolay Borisov 
>>> ---
>>>  fs/btrfs/volumes.c | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>> index f435d397019e..8b0fd7bf3447 100644
>>> --- a/fs/btrfs/volumes.c
>>> +++ b/fs/btrfs/volumes.c
>>> @@ -3467,7 +3467,7 @@ static int __btrfs_balance(struct btrfs_fs_info 
>>> *fs_info)
>>> list_for_each_entry(device, devices, dev_list) {
>>> old_size = btrfs_device_get_total_bytes(device);
>>> size_to_free = div_factor(old_size, 1);
>>> -   size_to_free = min_t(u64, size_to_free, SZ_1M);
>>> +   size_to_free = min_t(u64, size_to_free, SZ_1G);
>>> if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state) ||
>>> btrfs_device_get_total_bytes(device) -
>>> btrfs_device_get_bytes_used(device) > size_to_free ||
>>>
>>


Re: [PATCH] btrfs: Fix error handling in btrfs_cleanup_ordered_extents

2018-10-26 Thread kbuild test robot
Hi Nikolay,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on v4.19]
[also build test ERROR on next-20181019]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Nikolay-Borisov/btrfs-Fix-error-handling-in-btrfs_cleanup_ordered_extents/20181026-194005
config: i386-randconfig-x007-201842 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All errors (new ones prefixed by >>):

   fs//btrfs/inode.c: In function 'btrfs_cleanup_ordered_extents':
>> fs//btrfs/inode.c:140:63: error: expected ')' before '{' token
 if (page_start >= offset && page_end <= (offset + bytes - 1) {
  ^
>> fs//btrfs/inode.c:146:1: error: expected expression before '}' token
}
^

vim +140 fs//btrfs/inode.c

86  
87  static int btrfs_setsize(struct inode *inode, struct iattr *attr);
88  static int btrfs_truncate(struct inode *inode, bool skip_writeback);
89  static int btrfs_finish_ordered_io(struct btrfs_ordered_extent 
*ordered_extent);
90  static noinline int cow_file_range(struct inode *inode,
91 struct page *locked_page,
92 u64 start, u64 end, u64 delalloc_end,
93 int *page_started, unsigned long 
*nr_written,
94 int unlock, struct btrfs_dedupe_hash 
*hash);
95  static struct extent_map *create_io_em(struct inode *inode, u64 start, 
u64 len,
96 u64 orig_start, u64 block_start,
97 u64 block_len, u64 
orig_block_len,
98 u64 ram_bytes, int compress_type,
99 int type);
   100  
   101  static void __endio_write_update_ordered(struct inode *inode,
   102   const u64 offset, const u64 
bytes,
   103   const bool uptodate);
   104  
   105  /*
   106   * Cleanup all submitted ordered extents in specified range to handle 
errors
   107   * from the fill_dellaloc() callback.
   108   *
   109   * NOTE: caller must ensure that when an error happens, it can not call
   110   * extent_clear_unlock_delalloc() to clear both the bits 
EXTENT_DO_ACCOUNTING
   111   * and EXTENT_DELALLOC simultaneously, because that causes the reserved 
metadata
   112   * to be released, which we want to happen only when finishing the 
ordered
   113   * extent (btrfs_finish_ordered_io()).
   114   */
   115  static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
   116   struct page 
*locked_page,
   117   u64 offset, u64 bytes)
   118  {
   119  unsigned long index = offset >> PAGE_SHIFT;
   120  unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
   121  u64 page_start = page_offset(locked_page);
   122  u64 page_end = page_start + PAGE_SIZE - 1;
   123  
   124  struct page *page;
   125  
   126  while (index <= end_index) {
   127  page = find_get_page(inode->i_mapping, index);
   128  index++;
   129  if (!page)
   130  continue;
   131  ClearPagePrivate2(page);
   132  put_page(page);
   133  }
   134  
   135  /*
   136   * In case this page belongs to the delalloc range being 
instantiated
   137   * then skip it, since the first page of a range is going to be
   138   * properly cleaned up by the caller of run_delalloc_range
   139   */
 > 140  if (page_start >= offset && page_end <= (offset + bytes - 1) {
   141  offset += PAGE_SIZE;
   142  bytes -= PAGE_SIZE;
   143  }
   144  
   145  return __endio_write_update_ordered(inode, offset, bytes, 
false);
 > 146  }
   147  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH] btrfs-progs: add cli to forget one or all scanned devices

2018-10-26 Thread Nikolay Borisov



On 24.10.2018 07:31, Anand Jain wrote:
> This patch adds cli
>   btrfs device forget [dev]
> to remove the given device structure in the kernel if the device
> is unmounted. If no argument is given it shall remove all stale
> (device which are not mounted) from the kernel.
> 
> Signed-off-by: Anand Jain 
> ---
>  cmds-device.c | 59 
> +++
>  ioctl.h   |  2 ++
>  2 files changed, 53 insertions(+), 8 deletions(-)
> 
> diff --git a/cmds-device.c b/cmds-device.c
> index 2a05f70a76a9..c2a2b7f304b8 100644
> --- a/cmds-device.c
> +++ b/cmds-device.c
> @@ -254,10 +254,32 @@ static int cmd_device_delete(int argc, char **argv)
>   return _cmd_device_remove(argc, argv, cmd_device_delete_usage);
>  }
>  
> +static int btrfs_forget_devices(char *path)
> +{
> + struct btrfs_ioctl_vol_args args;
> + int ret;
> + int fd;
> +
> + fd = open("/dev/btrfs-control", O_RDWR);
> + if (fd < 0)
> + return -errno;
> +
> + memset(, 0, sizeof(args));
> + if (path)
> + strncpy_null(args.name, path);
> + ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, );
> + if (ret)
> + ret = -errno;
> + close(fd);
> + return ret;
> +}
> +
>  static const char * const cmd_device_scan_usage[] = {
> - "btrfs device scan [(-d|--all-devices)| [...]]",
> - "Scan devices for a btrfs filesystem",
> + "btrfs device scan [(-d|--all-devices)|(-u|--forget)| "\
> + "[...]]",
> + "Scan or forget (deregister) devices for a btrfs filesystem",
>   " -d|--all-devices (deprecated)",
> + " -u|--forget [ ..]",
>   NULL
>  };
>  
> @@ -267,32 +289,45 @@ static int cmd_device_scan(int argc, char **argv)
>   int devstart;
>   int all = 0;
>   int ret = 0;
> + int forget = 0;
>  
>   optind = 0;
>   while (1) {
>   int c;
>   static const struct option long_options[] = {
>   { "all-devices", no_argument, NULL, 'd'},
> + { "forget", no_argument, NULL, 'u'},
>   { NULL, 0, NULL, 0}
>   };
>  
> - c = getopt_long(argc, argv, "d", long_options, NULL);
> + c = getopt_long(argc, argv, "du", long_options, NULL);
>   if (c < 0)
>   break;
>   switch (c) {
>   case 'd':
>   all = 1;
>   break;
> + case 'u':
> + forget = 1;
> + break;
>   default:
>   usage(cmd_device_scan_usage);
>   }
>   }
>   devstart = optind;
>  
> + if (all && forget)
> + usage(cmd_device_scan_usage);
> +
>   if (all && check_argc_max(argc - optind, 1))
>   usage(cmd_device_scan_usage);
>  
>   if (all || argc - optind == 0) {
> + if (forget) {
> + ret = btrfs_forget_devices(NULL);
> + error_on(ret, "error %d while running forget", ret);
> + goto out;
> + }

nit: I would prefer to also have an if () {} else {} construct here,
similar to what you do below.

>   printf("Scanning for Btrfs filesystems\n");
>   ret = btrfs_scan_devices();
>   error_on(ret, "error %d while scanning", ret);
> @@ -315,11 +350,19 @@ static int cmd_device_scan(int argc, char **argv)
>   ret = 1;
>   goto out;
>   }
> - printf("Scanning for Btrfs filesystems in '%s'\n", path);
> - if (btrfs_register_one_device(path) != 0) {
> - ret = 1;
> - free(path);
> - goto out;
> + if (forget) {
> + ret = btrfs_forget_devices(path);
> + if (ret)
> + error("Can't forget '%s': %s",
> + path, strerror(-ret));
> + } else {
> + printf("Scanning for Btrfs filesystems in '%s'\n",
> + path);
> + if (btrfs_register_one_device(path) != 0) {
> + ret = 1;
> + free(path);
> + goto out;
> + }
>   }
>   free(path);
>   }
> diff --git a/ioctl.h b/ioctl.h
> index 709e996f401c..e27d80e09392 100644
> --- a/ioctl.h
> +++ b/ioctl.h
> @@ -721,6 +721,8 @@ static inline char *btrfs_err_str(enum btrfs_err_code 
> err_code)
>  struct btrfs_ioctl_vol_args)
>  #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
>  struct btrfs_ioctl_vol_args)
> +#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 

Re: [PATCH 1/5] btrfs: Ensure at least 1g is free for balance

2018-10-26 Thread Nikolay Borisov


(Adding Chris to CC since he is the original author of the code)

On 26.10.2018 15:09, Hans van Kranenburg wrote:
> On 10/26/18 1:43 PM, Nikolay Borisov wrote:
>> The first part of balance operation is to shrink every constituting
>> device to ensure there is free space for chunk allocation.
> 
> A very useful thing to be able to do if there's no unallocated raw disk
> space left, is use balance to rewrite some blockgroup into the free
> space of other ones.
> 
> This does not need any raw space for a chunk allocation. Requiring so
> makes it unneccesarily more complicated for users to fix the situation
> they got in.

The current logic of the code doesn't preclude this use case since if we
can't shrink we just proceed to relocation. So even if 1g is replaced
with 5eb and shrink_device always fails balancing will still proceed.

However, all of this really makes me wonder do we even need this "first
stage" code in balancing (Chris, any thoughts?).

> 
>> However, the code
>> has been buggy ever since its introduction since calculating the space to 
>> shrink
>> the device by was bounded by 1 mb. Most likely the original intention was to
>> have an upper bound of 1g and not 1m, since the largest chunk size is 1g. 
>> This 
>> means the first stage in __btrfs_balance so far has been a null op since it
>> effectively freed just a single megabyte.
>>
>> Fix this by setting an upper bound of size_to_free of 1g. 
>>
>> Signed-off-by: Nikolay Borisov 
>> ---
>>  fs/btrfs/volumes.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index f435d397019e..8b0fd7bf3447 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -3467,7 +3467,7 @@ static int __btrfs_balance(struct btrfs_fs_info 
>> *fs_info)
>>  list_for_each_entry(device, devices, dev_list) {
>>  old_size = btrfs_device_get_total_bytes(device);
>>  size_to_free = div_factor(old_size, 1);
>> -size_to_free = min_t(u64, size_to_free, SZ_1M);
>> +size_to_free = min_t(u64, size_to_free, SZ_1G);
>>  if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state) ||
>>  btrfs_device_get_total_bytes(device) -
>>  btrfs_device_get_bytes_used(device) > size_to_free ||
>>
> 
> 


Re: [PATCH 1/5] btrfs: Ensure at least 1g is free for balance

2018-10-26 Thread Hans van Kranenburg
On 10/26/18 1:43 PM, Nikolay Borisov wrote:
> The first part of balance operation is to shrink every constituting
> device to ensure there is free space for chunk allocation.

A very useful thing to be able to do if there's no unallocated raw disk
space left, is use balance to rewrite some blockgroup into the free
space of other ones.

This does not need any raw space for a chunk allocation. Requiring so
makes it unneccesarily more complicated for users to fix the situation
they got in.

> However, the code
> has been buggy ever since its introduction since calculating the space to 
> shrink
> the device by was bounded by 1 mb. Most likely the original intention was to
> have an upper bound of 1g and not 1m, since the largest chunk size is 1g. 
> This 
> means the first stage in __btrfs_balance so far has been a null op since it
> effectively freed just a single megabyte.
> 
> Fix this by setting an upper bound of size_to_free of 1g. 
> 
> Signed-off-by: Nikolay Borisov 
> ---
>  fs/btrfs/volumes.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index f435d397019e..8b0fd7bf3447 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -3467,7 +3467,7 @@ static int __btrfs_balance(struct btrfs_fs_info 
> *fs_info)
>   list_for_each_entry(device, devices, dev_list) {
>   old_size = btrfs_device_get_total_bytes(device);
>   size_to_free = div_factor(old_size, 1);
> - size_to_free = min_t(u64, size_to_free, SZ_1M);
> + size_to_free = min_t(u64, size_to_free, SZ_1G);
>   if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state) ||
>   btrfs_device_get_total_bytes(device) -
>   btrfs_device_get_bytes_used(device) > size_to_free ||
> 


-- 
Hans van Kranenburg


Re: [PATCH 1/5] btrfs: Ensure at least 1g is free for balance

2018-10-26 Thread Nikolay Borisov



On 26.10.2018 15:04, Qu Wenruo wrote:
> 
> 
> On 2018/10/26 下午7:43, Nikolay Borisov wrote:
>> The first part of balance operation is to shrink every constituting
>> device to ensure there is free space for chunk allocation. However, the code
>> has been buggy ever since its introduction since calculating the space to 
>> shrink
>> the device by was bounded by 1 mb. Most likely the original intention was to
>> have an upper bound of 1g and not 1m, since the largest chunk size is 1g.
> 
> Minor nitpick, largest chunk size -> largest chunk stripe size.
> 
> As for data chunk, it's possible to get a 10G chunk, but still only 1G
> stripe up limit.
> 
>> This 
>> means the first stage in __btrfs_balance so far has been a null op since it
>> effectively freed just a single megabyte.
>>
>> Fix this by setting an upper bound of size_to_free of 1g. 
> 
> One question come to me naturally, what if we failed to shrink the device?
> 
> In fact if btrfs_shrink_device() returns ENOSPC we just skip to
> relocation part, so it doesn't look like to cause regression.
> 
> If this can be mentioned in the commit message, it would save reviewer
> minutes to read the code.

Will incorporate it in v2.

> 
> 
> 
> BTW, I think for that (ret == ENOSPC) after btrfs_shrink_device(), we
> should continue other than break, to get more chance to secure
> unallocated space.

I agree but this should be done in a separate patch, this one deals with
the silly upper bound of 1m.

> 
> Thanks,
> Qu
> 
>>
>> Signed-off-by: Nikolay Borisov 
>> ---
>>  fs/btrfs/volumes.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index f435d397019e..8b0fd7bf3447 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -3467,7 +3467,7 @@ static int __btrfs_balance(struct btrfs_fs_info 
>> *fs_info)
>>  list_for_each_entry(device, devices, dev_list) {
>>  old_size = btrfs_device_get_total_bytes(device);
>>  size_to_free = div_factor(old_size, 1);
>> -size_to_free = min_t(u64, size_to_free, SZ_1M);
>> +size_to_free = min_t(u64, size_to_free, SZ_1G);
>>  if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state) ||
>>  btrfs_device_get_total_bytes(device) -
>>  btrfs_device_get_bytes_used(device) > size_to_free ||
>>
> 


Re: [PATCH 1/5] btrfs: Ensure at least 1g is free for balance

2018-10-26 Thread Qu Wenruo



On 2018/10/26 下午7:43, Nikolay Borisov wrote:
> The first part of balance operation is to shrink every constituting
> device to ensure there is free space for chunk allocation. However, the code
> has been buggy ever since its introduction since calculating the space to 
> shrink
> the device by was bounded by 1 mb. Most likely the original intention was to
> have an upper bound of 1g and not 1m, since the largest chunk size is 1g.

Minor nitpick, largest chunk size -> largest chunk stripe size.

As for data chunk, it's possible to get a 10G chunk, but still only 1G
stripe up limit.

> This 
> means the first stage in __btrfs_balance so far has been a null op since it
> effectively freed just a single megabyte.
> 
> Fix this by setting an upper bound of size_to_free of 1g. 

One question come to me naturally, what if we failed to shrink the device?

In fact if btrfs_shrink_device() returns ENOSPC we just skip to
relocation part, so it doesn't look like to cause regression.

If this can be mentioned in the commit message, it would save reviewer
minutes to read the code.



BTW, I think for that (ret == ENOSPC) after btrfs_shrink_device(), we
should continue other than break, to get more chance to secure
unallocated space.

Thanks,
Qu

> 
> Signed-off-by: Nikolay Borisov 
> ---
>  fs/btrfs/volumes.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index f435d397019e..8b0fd7bf3447 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -3467,7 +3467,7 @@ static int __btrfs_balance(struct btrfs_fs_info 
> *fs_info)
>   list_for_each_entry(device, devices, dev_list) {
>   old_size = btrfs_device_get_total_bytes(device);
>   size_to_free = div_factor(old_size, 1);
> - size_to_free = min_t(u64, size_to_free, SZ_1M);
> + size_to_free = min_t(u64, size_to_free, SZ_1G);
>   if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state) ||
>   btrfs_device_get_total_bytes(device) -
>   btrfs_device_get_bytes_used(device) > size_to_free ||
> 


Re: [PATCH v2] btrfs: Fix error handling in btrfs_cleanup_ordered_extents

2018-10-26 Thread Nikolay Borisov



On 26.10.2018 14:53, Qu Wenruo wrote:
> 
> 
> On 2018/10/26 下午7:41, Nikolay Borisov wrote:
>> Running btrfs/124 in a loop hung up on me sporadically with the
>> following call trace:
>>  btrfs   D0  5760   5324 0x
>>  Call Trace:
>>   ? __schedule+0x243/0x800
>>   schedule+0x33/0x90
>>   btrfs_start_ordered_extent+0x10c/0x1b0 [btrfs]
>>   ? wait_woken+0xa0/0xa0
>>   btrfs_wait_ordered_range+0xbb/0x100 [btrfs]
>>   btrfs_relocate_block_group+0x1ff/0x230 [btrfs]
>>   btrfs_relocate_chunk+0x49/0x100 [btrfs]
>>   btrfs_balance+0xbeb/0x1740 [btrfs]
>>   btrfs_ioctl_balance+0x2ee/0x380 [btrfs]
>>   btrfs_ioctl+0x1691/0x3110 [btrfs]
>>   ? lockdep_hardirqs_on+0xed/0x180
>>   ? __handle_mm_fault+0x8e7/0xfb0
>>   ? _raw_spin_unlock+0x24/0x30
>>   ? __handle_mm_fault+0x8e7/0xfb0
>>   ? do_vfs_ioctl+0xa5/0x6e0
>>   ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
>>   do_vfs_ioctl+0xa5/0x6e0
>>   ? entry_SYSCALL_64_after_hwframe+0x3e/0xbe
>>   ksys_ioctl+0x3a/0x70
>>   __x64_sys_ioctl+0x16/0x20
>>   do_syscall_64+0x60/0x1b0
>>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
>>
>> Turns out during page writeback it's possible that the code in
>> writepage_delalloc can instantiate a delalloc range which doesn't
>> belong to the page currently being written back.
> 
> Just a nitpick, would you please split long paragraphs with newlines?
> 
> It makes reviewers' eyes less painful.
> 
>> This happens since
>> find_lock_delalloc_range returns up to BTRFS_MAX_EXTENT_SIZE delalloc
>> range when asked and doens't really consider the range of the passed
>> page.
> 
> 
>> When such a foregin range is found the code proceeds to
>> run_delalloc_range and calls the appropriate function to fill the
>> delalloc and create ordered extents. If, however, a failure occurs
>> while this operation is in effect then the clean up code in
>> btrfs_cleanup_ordered_extents will be called. This function has the
>> wrong assumption that caller of run_delalloc_range always properly
>> cleans the first page of the range hence when it calls
>> __endio_write_update_ordered it explicitly ommits the first page of
>> the delalloc range.
> 
> Yes, that's the old assumption, at least well explained by some ascii
> art. (even it's wrong)
> 
>> This is wrong because this function could be
>> cleaning a delalloc range that doesn't belong to the current page. This
>> in turn means that the page cleanup code in __extent_writepage will
>> not really free the initial page for the range, leaving a hanging
>> ordered extent with bytes_left set to 4k. This bug has been present
>> ever since the original introduction of the cleanup code in 524272607e88.
> 
> The cause sounds valid, however would you please explain more about how
> such cleaning on unrelated delalloc range happens?

So in my case the following happened - 2 block groups were created as
delalloc ranges in the - 0-1m and 1m-128m. Their respective pages were
dirtied, so when page 0 - 0-4k when into writepage_delalloc,
find_lock_delalloc_range would return the range 0-1m. So the call to
fill_delalloc instantiates OE 0-1m and writeback continues as expected.

Now, when the 2nd page from range 0-1m is again set for writeback and
find_lock_delalloc_range is called with delalloc_start ==  4096 it will
actually return the range 1m-128m. Then fill_delalloc is called with
locked_page belonging to the range which was already instantiated and
the start/end arguments pointing to 1m-128m if an error occurred in
run_delalloc_range in this case then  btrfs_cleanup_ordered_extents will
be called which will clear Private2 bit for pages belonging to 1m-128m
range and the OE will be cleared of all but the first page (since the
code wrongly assumed locked_page always belongs to the range currently
being instantiated).

> 
> Even the fix looks solid, it's still better to explain the cause a
> little more, as the more we understand the cause, the better solution we
> may get.
> 
>>
>> Fix this by correctly checking whether the current page belongs to the
>> range being instantiated and if so correctly adjust the range parameters
>> passed for cleaning up. If it doesn't, then just clean the whole OE
>> range directly.
> 
> And the solution also looks good to me, and much more robust, without
> any (possibly wrong) assumption.
> 
> Thanks,
> Qu
> 
>>
>> Signed-off-by: Nikolay Borisov 
>> Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid 
>> ordered extent hang")
>> ---
>>
>> V2: 
>>  * Fix compilation failure due to missing parentheses
>>  * Fixed the "Fixes" tag. 
>>
>>  fs/btrfs/inode.c | 29 -
>>  1 file changed, 20 insertions(+), 9 deletions(-)
>>
>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> index e1f00d8c24ce..5906564ae2e9 100644
>> --- a/fs/btrfs/inode.c
>> +++ b/fs/btrfs/inode.c
>> @@ -109,17 +109,17 @@ static void 

Re: [PATCH v2] btrfs: Fix error handling in btrfs_cleanup_ordered_extents

2018-10-26 Thread Qu Wenruo



On 2018/10/26 下午7:41, Nikolay Borisov wrote:
> Running btrfs/124 in a loop hung up on me sporadically with the
> following call trace:
>   btrfs   D0  5760   5324 0x
>   Call Trace:
>? __schedule+0x243/0x800
>schedule+0x33/0x90
>btrfs_start_ordered_extent+0x10c/0x1b0 [btrfs]
>? wait_woken+0xa0/0xa0
>btrfs_wait_ordered_range+0xbb/0x100 [btrfs]
>btrfs_relocate_block_group+0x1ff/0x230 [btrfs]
>btrfs_relocate_chunk+0x49/0x100 [btrfs]
>btrfs_balance+0xbeb/0x1740 [btrfs]
>btrfs_ioctl_balance+0x2ee/0x380 [btrfs]
>btrfs_ioctl+0x1691/0x3110 [btrfs]
>? lockdep_hardirqs_on+0xed/0x180
>? __handle_mm_fault+0x8e7/0xfb0
>? _raw_spin_unlock+0x24/0x30
>? __handle_mm_fault+0x8e7/0xfb0
>? do_vfs_ioctl+0xa5/0x6e0
>? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
>do_vfs_ioctl+0xa5/0x6e0
>? entry_SYSCALL_64_after_hwframe+0x3e/0xbe
>ksys_ioctl+0x3a/0x70
>__x64_sys_ioctl+0x16/0x20
>do_syscall_64+0x60/0x1b0
>entry_SYSCALL_64_after_hwframe+0x49/0xbe
> 
> Turns out during page writeback it's possible that the code in
> writepage_delalloc can instantiate a delalloc range which doesn't
> belong to the page currently being written back.

Just a nitpick, would you please split long paragraphs with newlines?

It makes reviewers' eyes less painful.

> This happens since
> find_lock_delalloc_range returns up to BTRFS_MAX_EXTENT_SIZE delalloc
> range when asked and doens't really consider the range of the passed
> page.


> When such a foregin range is found the code proceeds to
> run_delalloc_range and calls the appropriate function to fill the
> delalloc and create ordered extents. If, however, a failure occurs
> while this operation is in effect then the clean up code in
> btrfs_cleanup_ordered_extents will be called. This function has the
> wrong assumption that caller of run_delalloc_range always properly
> cleans the first page of the range hence when it calls
> __endio_write_update_ordered it explicitly ommits the first page of
> the delalloc range.

Yes, that's the old assumption, at least well explained by some ascii
art. (even it's wrong)

> This is wrong because this function could be
> cleaning a delalloc range that doesn't belong to the current page. This
> in turn means that the page cleanup code in __extent_writepage will
> not really free the initial page for the range, leaving a hanging
> ordered extent with bytes_left set to 4k. This bug has been present
> ever since the original introduction of the cleanup code in 524272607e88.

The cause sounds valid, however would you please explain more about how
such cleaning on unrelated delalloc range happens?

Even the fix looks solid, it's still better to explain the cause a
little more, as the more we understand the cause, the better solution we
may get.

> 
> Fix this by correctly checking whether the current page belongs to the
> range being instantiated and if so correctly adjust the range parameters
> passed for cleaning up. If it doesn't, then just clean the whole OE
> range directly.

And the solution also looks good to me, and much more robust, without
any (possibly wrong) assumption.

Thanks,
Qu

> 
> Signed-off-by: Nikolay Borisov 
> Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered 
> extent hang")
> ---
> 
> V2: 
>  * Fix compilation failure due to missing parentheses
>  * Fixed the "Fixes" tag. 
> 
>  fs/btrfs/inode.c | 29 -
>  1 file changed, 20 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index e1f00d8c24ce..5906564ae2e9 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -109,17 +109,17 @@ static void __endio_write_update_ordered(struct inode 
> *inode,
>   * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
>   * and EXTENT_DELALLOC simultaneously, because that causes the reserved 
> metadata
>   * to be released, which we want to happen only when finishing the ordered
> - * extent (btrfs_finish_ordered_io()). Also note that the caller of the
> - * fill_delalloc() callback already does proper cleanup for the first page of
> - * the range, that is, it invokes the callback writepage_end_io_hook() for 
> the
> - * range of the first page.
> + * extent (btrfs_finish_ordered_io()).
>   */
>  static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
> -  const u64 offset,
> -  const u64 bytes)
> +  struct page *locked_page,
> +  u64 offset, u64 bytes)
>  {
>   unsigned long index = offset >> PAGE_SHIFT;
>   unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
> + u64 page_start = page_offset(locked_page);
> + u64 

[PATCH 3/5] btrfs: Remove superfluous check form btrfs_remove_chunk

2018-10-26 Thread Nikolay Borisov
It's unnecessary to check map->stripes[i].dev for NULL given its value
is already set and dereferenced above the the check. No functional changes.

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/volumes.c | 12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dc53d94a62aa..f0db43d08456 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2797,13 +2797,11 @@ int btrfs_remove_chunk(struct btrfs_trans_handle 
*trans, u64 chunk_offset)
mutex_unlock(_info->chunk_mutex);
}
 
-   if (map->stripes[i].dev) {
-   ret = btrfs_update_device(trans, map->stripes[i].dev);
-   if (ret) {
-   mutex_unlock(_devices->device_list_mutex);
-   btrfs_abort_transaction(trans, ret);
-   goto out;
-   }
+   ret = btrfs_update_device(trans, device);
+   if (ret) {
+   mutex_unlock(_devices->device_list_mutex);
+   btrfs_abort_transaction(trans, ret);
+   goto out;
}
}
mutex_unlock(_devices->device_list_mutex);
-- 
2.7.4



[PATCH 5/5] btrfs: Replace BUG_ON with ASSERT in find_lock_delalloc_range

2018-10-26 Thread Nikolay Borisov
lock_delalloc_pages should only return 2 values - 0 in case of success
and -EAGAIN if the range of pages to be locked should be shrunk due to
some of gone. Manual inspections confirms that this is
indeed the case since __process_pages_contig is where lock_delalloc_pages
gets its return value. The latter always returns 0  or -EAGAIN so the
invariant holds. No functional changes.

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1a9a521aefe5..94bc53472031 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1606,6 +1606,7 @@ static noinline_for_stack u64 
find_lock_delalloc_range(struct inode *inode,
/* step two, lock all the pages after the page that has start */
ret = lock_delalloc_pages(inode, locked_page,
  delalloc_start, delalloc_end);
+   ASSERT(!ret || ret == -EAGAIN);
if (ret == -EAGAIN) {
/* some of the pages are gone, lets avoid looping by
 * shortening the size of the delalloc range we're searching
@@ -1621,7 +1622,6 @@ static noinline_for_stack u64 
find_lock_delalloc_range(struct inode *inode,
goto out_failed;
}
}
-   BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
 
/* step three, lock the state bits for the whole range */
lock_extent_bits(tree, delalloc_start, delalloc_end, _state);
-- 
2.7.4



[PATCH 1/5] btrfs: Ensure at least 1g is free for balance

2018-10-26 Thread Nikolay Borisov
The first part of balance operation is to shrink every constituting
device to ensure there is free space for chunk allocation. However, the code
has been buggy ever since its introduction since calculating the space to shrink
the device by was bounded by 1 mb. Most likely the original intention was to
have an upper bound of 1g and not 1m, since the largest chunk size is 1g. This 
means the first stage in __btrfs_balance so far has been a null op since it
effectively freed just a single megabyte.

Fix this by setting an upper bound of size_to_free of 1g. 

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f435d397019e..8b0fd7bf3447 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3467,7 +3467,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
list_for_each_entry(device, devices, dev_list) {
old_size = btrfs_device_get_total_bytes(device);
size_to_free = div_factor(old_size, 1);
-   size_to_free = min_t(u64, size_to_free, SZ_1M);
+   size_to_free = min_t(u64, size_to_free, SZ_1G);
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state) ||
btrfs_device_get_total_bytes(device) -
btrfs_device_get_bytes_used(device) > size_to_free ||
-- 
2.7.4



[PATCH 0/5] Misc cleanups in balance code

2018-10-26 Thread Nikolay Borisov
While investigating the balance hang I came across various inconsistencies in 
the source. This series aims to fix those. 

The first patch is (I believe) a fix to a longstanding bug that could cause 
balance to fail due to ENOSPC. The code no properly ensures that there is 
at least 1g of unallocated space on every device being balance.

Patch 2 makes btrfs_can_relocate a bit more obvious and removes leftovers from 
previous cleanup

Patches 3/4/5 remove some redundant code from various functions. 

This series has survived multiple xfstest runs. 

Nikolay Borisov (5):
  btrfs: Ensure at least 1g is free for balance
  btrfs: Refactor btrfs_can_relocate
  btrfs: Remove superfluous check form btrfs_remove_chunk
  btrfs: Sink find_lock_delalloc_range's 'max_bytes' argument
  btrfs: Replace BUG_ON with ASSERT in find_lock_delalloc_range

 fs/btrfs/ctree.h |  2 +-
 fs/btrfs/extent-tree.c   | 39 ++-
 fs/btrfs/extent_io.c | 13 ++---
 fs/btrfs/extent_io.h |  2 +-
 fs/btrfs/tests/extent-io-tests.c | 10 +-
 fs/btrfs/volumes.c   | 17 +++--
 6 files changed, 34 insertions(+), 49 deletions(-)

-- 
2.7.4



[PATCH 2/5] btrfs: Refactor btrfs_can_relocate

2018-10-26 Thread Nikolay Borisov
btrfs_can_relocate returns 0 when it concludes the given chunk can be
relocated and -1 otherwise. Since this function is used as a predicated
and it return a binary value it makes no sense to have it's return
value as an int so change it to bool. Furthermore, remove a stale
leftover comment from e6ec716f0ddb ("Btrfs: make raid attr array more
readable").

Finally make the logic in the list_for_each_entry loop a bit more obvious. Up
until now the fact that we are returning success (0) was a bit masked by the
fact that the 0 is re-used from the return value of find_free_dev_extent.
Instead, now just increment dev_nr if we find a suitable extent and explicitly
set can_reloc to true if enough devices with unallocated space are present.
No functional changes.

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/ctree.h   |  2 +-
 fs/btrfs/extent-tree.c | 39 ++-
 fs/btrfs/volumes.c |  3 +--
 3 files changed, 16 insertions(+), 28 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 68ca41dbbef3..06edc4f9ceb2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2680,7 +2680,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle 
*trans,
 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_fs_info *info);
-int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr);
+bool btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr);
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
   u64 bytes_used, u64 type, u64 chunk_offset,
   u64 size);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a1febf155747..816bca482358 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9423,10 +9423,10 @@ void btrfs_dec_block_group_ro(struct 
btrfs_block_group_cache *cache)
 /*
  * checks to see if its even possible to relocate this block group.
  *
- * @return - -1 if it's not a good idea to relocate this block group, 0 if its
- * ok to go ahead and try.
+ * @return - false if not enough space can be found for relocation, true
+ * otherwise
  */
-int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
+bool btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
 {
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_block_group_cache *block_group;
@@ -9441,7 +9441,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 
bytenr)
int debug;
int index;
int full = 0;
-   int ret = 0;
+   bool can_reloc = true;
 
debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
 
@@ -9453,7 +9453,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 
bytenr)
btrfs_warn(fs_info,
   "can't find block group for bytenr %llu",
   bytenr);
-   return -1;
+   return false;
}
 
min_free = btrfs_block_group_used(_group->item);
@@ -9489,16 +9489,8 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, 
u64 bytenr)
 * group is going to be restriped, run checks against the target
 * profile instead of the current one.
 */
-   ret = -1;
+   can_reloc = false;
 
-   /*
-* index:
-*  0: raid10
-*  1: raid1
-*  2: dup
-*  3: raid0
-*  4: single
-*/
target = get_restripe_target(fs_info, block_group->flags);
if (target) {
index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
@@ -9534,10 +9526,8 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, 
u64 bytenr)
 
/* We need to do this so that we can look at pending chunks */
trans = btrfs_join_transaction(root);
-   if (IS_ERR(trans)) {
-   ret = PTR_ERR(trans);
+   if (IS_ERR(trans))
goto out;
-   }
 
mutex_lock(_info->chunk_mutex);
list_for_each_entry(device, _devices->alloc_list, dev_alloc_list) {
@@ -9549,18 +9539,17 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, 
u64 bytenr)
 */
if (device->total_bytes > device->bytes_used + min_free &&
!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, >dev_state)) 
{
-   ret = find_free_dev_extent(trans, device, min_free,
-  _offset, NULL);
-   if (!ret)
+   if (!find_free_dev_extent(trans, device, min_free,
+  _offset, NULL))
dev_nr++;
 
-   if (dev_nr >= dev_min)
+   if (dev_nr >= dev_min) {
+   can_reloc = true;
break;
-
-   

[PATCH 4/5] btrfs: Sink find_lock_delalloc_range's 'max_bytes' argument

2018-10-26 Thread Nikolay Borisov
All callers of this function pass BTRFS_MAX_EXTENT_SIZE (128M) so let's
reduce the argument count and make that a local variable. No functional
changes.

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/extent_io.c | 11 +--
 fs/btrfs/extent_io.h |  2 +-
 fs/btrfs/tests/extent-io-tests.c | 10 +-
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6877a74c7469..1a9a521aefe5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1566,8 +1566,9 @@ static noinline int lock_delalloc_pages(struct inode 
*inode,
 static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
-   u64 *end, u64 max_bytes)
+   u64 *end)
 {
+   u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
u64 found;
@@ -1647,10 +1648,9 @@ static noinline_for_stack u64 
find_lock_delalloc_range(struct inode *inode,
 u64 btrfs_find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
-   u64 *end, u64 max_bytes)
+   u64 *end)
 {
-   return find_lock_delalloc_range(inode, tree, locked_page, start, end,
-   max_bytes);
+   return find_lock_delalloc_range(inode, tree, locked_page, start, end);
 }
 #endif
 
@@ -3233,8 +3233,7 @@ static noinline_for_stack int writepage_delalloc(struct 
inode *inode,
nr_delalloc = find_lock_delalloc_range(inode, tree,
   page,
   _start,
-  _end,
-  BTRFS_MAX_EXTENT_SIZE);
+  _end);
if (nr_delalloc == 0) {
delalloc_start = delalloc_end + 1;
continue;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 369daa5d4f73..7ec4f93caf78 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -549,7 +549,7 @@ int free_io_failure(struct extent_io_tree *failure_tree,
 u64 btrfs_find_lock_delalloc_range(struct inode *inode,
  struct extent_io_tree *tree,
  struct page *locked_page, u64 *start,
- u64 *end, u64 max_bytes);
+ u64 *end);
 #endif
 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
   u64 start);
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 9e0f4a01be14..8ea8c2aa6696 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -107,7 +107,7 @@ static int test_find_delalloc(u32 sectorsize)
start = 0;
end = 0;
found = btrfs_find_lock_delalloc_range(inode, , locked_page, ,
-, max_bytes);
+);
if (!found) {
test_err("should have found at least one delalloc");
goto out_bits;
@@ -138,7 +138,7 @@ static int test_find_delalloc(u32 sectorsize)
start = test_start;
end = 0;
found = btrfs_find_lock_delalloc_range(inode, , locked_page, ,
-, max_bytes);
+);
if (!found) {
test_err("couldn't find delalloc in our range");
goto out_bits;
@@ -172,7 +172,7 @@ static int test_find_delalloc(u32 sectorsize)
start = test_start;
end = 0;
found = btrfs_find_lock_delalloc_range(inode, , locked_page, ,
-, max_bytes);
+);
if (found) {
test_err("found range when we shouldn't have");
goto out_bits;
@@ -193,7 +193,7 @@ static int test_find_delalloc(u32 sectorsize)
start = test_start;
end = 0;
found = btrfs_find_lock_delalloc_range(inode, , locked_page, ,
-, max_bytes);
+);
if (!found) {
test_err("didn't find our range");
goto out_bits;
@@ -234,7 +234,7 @@ static int test_find_delalloc(u32 sectorsize)
 * tests expected behavior.
 */
found = btrfs_find_lock_delalloc_range(inode, , locked_page, ,
-, max_bytes);
+  

[PATCH v2] btrfs: Fix error handling in btrfs_cleanup_ordered_extents

2018-10-26 Thread Nikolay Borisov
Running btrfs/124 in a loop hung up on me sporadically with the
following call trace:
btrfs   D0  5760   5324 0x
Call Trace:
 ? __schedule+0x243/0x800
 schedule+0x33/0x90
 btrfs_start_ordered_extent+0x10c/0x1b0 [btrfs]
 ? wait_woken+0xa0/0xa0
 btrfs_wait_ordered_range+0xbb/0x100 [btrfs]
 btrfs_relocate_block_group+0x1ff/0x230 [btrfs]
 btrfs_relocate_chunk+0x49/0x100 [btrfs]
 btrfs_balance+0xbeb/0x1740 [btrfs]
 btrfs_ioctl_balance+0x2ee/0x380 [btrfs]
 btrfs_ioctl+0x1691/0x3110 [btrfs]
 ? lockdep_hardirqs_on+0xed/0x180
 ? __handle_mm_fault+0x8e7/0xfb0
 ? _raw_spin_unlock+0x24/0x30
 ? __handle_mm_fault+0x8e7/0xfb0
 ? do_vfs_ioctl+0xa5/0x6e0
 ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
 do_vfs_ioctl+0xa5/0x6e0
 ? entry_SYSCALL_64_after_hwframe+0x3e/0xbe
 ksys_ioctl+0x3a/0x70
 __x64_sys_ioctl+0x16/0x20
 do_syscall_64+0x60/0x1b0
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Turns out during page writeback it's possible that the code in
writepage_delalloc can instantiate a delalloc range which doesn't
belong to the page currently being written back. This happens since
find_lock_delalloc_range returns up to BTRFS_MAX_EXTENT_SIZE delalloc
range when asked and doens't really consider the range of the passed
page. When such a foregin range is found the code proceeds to
run_delalloc_range and calls the appropriate function to fill the
delalloc and create ordered extents. If, however, a failure occurs
while this operation is in effect then the clean up code in
btrfs_cleanup_ordered_extents will be called. This function has the
wrong assumption that caller of run_delalloc_range always properly
cleans the first page of the range hence when it calls
__endio_write_update_ordered it explicitly ommits the first page of
the delalloc range. This is wrong because this function could be
cleaning a delalloc range that doesn't belong to the current page. This
in turn means that the page cleanup code in __extent_writepage will
not really free the initial page for the range, leaving a hanging
ordered extent with bytes_left set to 4k. This bug has been present
ever since the original introduction of the cleanup code in 524272607e88.

Fix this by correctly checking whether the current page belongs to the
range being instantiated and if so correctly adjust the range parameters
passed for cleaning up. If it doesn't, then just clean the whole OE
range directly.

Signed-off-by: Nikolay Borisov 
Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered 
extent hang")
---

V2: 
 * Fix compilation failure due to missing parentheses
 * Fixed the "Fixes" tag. 

 fs/btrfs/inode.c | 29 -
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e1f00d8c24ce..5906564ae2e9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -109,17 +109,17 @@ static void __endio_write_update_ordered(struct inode 
*inode,
  * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
  * and EXTENT_DELALLOC simultaneously, because that causes the reserved 
metadata
  * to be released, which we want to happen only when finishing the ordered
- * extent (btrfs_finish_ordered_io()). Also note that the caller of the
- * fill_delalloc() callback already does proper cleanup for the first page of
- * the range, that is, it invokes the callback writepage_end_io_hook() for the
- * range of the first page.
+ * extent (btrfs_finish_ordered_io()).
  */
 static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
-const u64 offset,
-const u64 bytes)
+struct page *locked_page,
+u64 offset, u64 bytes)
 {
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
+   u64 page_start = page_offset(locked_page);
+   u64 page_end = page_start + PAGE_SIZE - 1;
+
struct page *page;
 
while (index <= end_index) {
@@ -130,8 +130,18 @@ static inline void btrfs_cleanup_ordered_extents(struct 
inode *inode,
ClearPagePrivate2(page);
put_page(page);
}
-   return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
-   bytes - PAGE_SIZE, false);
+
+   /*
+* In case this page belongs to the delalloc range being instantiated
+* then skip it, since the first page of a range is going to be
+* properly cleaned up by the caller of run_delalloc_range
+*/
+   if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+   offset += PAGE_SIZE;
+  

Re: [PATCH] btrfs: Fix error handling in btrfs_cleanup_ordered_extents

2018-10-26 Thread Holger Hoffstätte

On 10/26/18 13:13, Nikolay Borisov wrote:


+   if (page_start >= offset && page_end <= (offset + bytes - 1) {


fs/btrfs/inode.c: In function 'btrfs_cleanup_ordered_extents':
fs/btrfs/inode.c:140:62: error: expected ')' before '{' token
  if (page_start >= offset && page_end <= (offset + bytes - 1) {
 ~^~
  )
You're welcome :)

Holger



[PATCH] btrfs: Fix error handling in btrfs_cleanup_ordered_extents

2018-10-26 Thread Nikolay Borisov
Running btrfs/124 in a loop hung up on me sporadically with the
following call trace:

btrfs   D0  5760   5324 0x
Call Trace:
 ? __schedule+0x243/0x800
 schedule+0x33/0x90
 btrfs_start_ordered_extent+0x10c/0x1b0 [btrfs]
 ? wait_woken+0xa0/0xa0
 btrfs_wait_ordered_range+0xbb/0x100 [btrfs]
 btrfs_relocate_block_group+0x1ff/0x230 [btrfs]
 btrfs_relocate_chunk+0x49/0x100 [btrfs]
 btrfs_balance+0xbeb/0x1740 [btrfs]
 btrfs_ioctl_balance+0x2ee/0x380 [btrfs]
 btrfs_ioctl+0x1691/0x3110 [btrfs]
 ? lockdep_hardirqs_on+0xed/0x180
 ? __handle_mm_fault+0x8e7/0xfb0
 ? _raw_spin_unlock+0x24/0x30
 ? __handle_mm_fault+0x8e7/0xfb0
 ? do_vfs_ioctl+0xa5/0x6e0
 ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
 do_vfs_ioctl+0xa5/0x6e0
 ? entry_SYSCALL_64_after_hwframe+0x3e/0xbe
 ksys_ioctl+0x3a/0x70
 __x64_sys_ioctl+0x16/0x20
 do_syscall_64+0x60/0x1b0
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Turns out during page writeback it's possible that the code in
writepage_delalloc can instantiate a delalloc range which doesn't
belong to the page currently being written back. This happens since
find_lock_delalloc_range returns up to BTRFS_MAX_EXTENT_SIZE delalloc range
when asked and doens't really consider the range of the passed page. When such
a foregin range is found the code proceeds to run_delalloc_range and calls the
appropriate function to fill the delalloc and create ordered extents. If,
however, a failure occurs while this operation is in effect then the clean up
code in btrfs_cleanup_ordered_extents will be called. This function has the
wrong assumption that caller of run_delalloc_range always properly cleans the
first page of the range hence when it calls __endio_write_update_ordered it
explicitly ommits the first page of the delalloc range. This is wrong because
this function could be cleaning a delalloc range that doesn't belong to the
current page. This in turn means that the page cleanup code in 
__extent_writepage will not really free the initial page for the range, leaving
a hanging ordered extent with bytes_left set to 4k. This bug has been present
ever since the original introduction of the cleanup code in 524272607e88.

Fix this by correctly checking whether the current page belongs to the
range being instantiated and if so correctly adjust the range parameters
passed for cleaning up. If it doesn't, then just clean the whole OE
range directly. 

524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent 
hang")
Signed-off-by: Nikolay Borisov 
---
With this patch I can no longer get btrfs/124 to hang nor do I see any 
regressions in xfstest. 

Furthermore, this needs to go in stable since v4.12

 fs/btrfs/inode.c | 29 -
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index de4e3a9563f7..f6a96da2f2ea 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -109,17 +109,17 @@ static void __endio_write_update_ordered(struct inode 
*inode,
  * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
  * and EXTENT_DELALLOC simultaneously, because that causes the reserved 
metadata
  * to be released, which we want to happen only when finishing the ordered
- * extent (btrfs_finish_ordered_io()). Also note that the caller of the
- * fill_delalloc() callback already does proper cleanup for the first page of
- * the range, that is, it invokes the callback writepage_end_io_hook() for the
- * range of the first page.
+ * extent (btrfs_finish_ordered_io()).
  */
 static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
-const u64 offset,
-const u64 bytes)
+struct page *locked_page,
+u64 offset, u64 bytes)
 {
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
+   u64 page_start = page_offset(locked_page);
+   u64 page_end = page_start + PAGE_SIZE - 1;
+
struct page *page;
 
while (index <= end_index) {
@@ -130,8 +130,18 @@ static inline void btrfs_cleanup_ordered_extents(struct 
inode *inode,
ClearPagePrivate2(page);
put_page(page);
}
-   return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
-   bytes - PAGE_SIZE, false);
+
+   /*
+* In case this page belongs to the delalloc range being instantiated
+* then skip it, since the first page of a range is going to be
+* properly cleaned up by the caller of run_delalloc_range
+*/
+   if (page_start >= offset && page_end <= (offset