[RFC][PATCH v3 10/10] btrfs: use common file type conversion
Deduplicate the btrfs file type conversion implementation - file systems that use the same file types as defined by POSIX do not need to define their own versions and can use the common helper functions decared in fs_types.h and implemented in fs_types.c Signed-off-by: Amir Goldstein Signed-off-by: Phillip Potter --- fs/btrfs/btrfs_inode.h | 2 -- fs/btrfs/delayed-inode.c| 2 +- fs/btrfs/inode.c| 32 +++- include/uapi/linux/btrfs_tree.h | 2 ++ 4 files changed, 18 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 1343ac57b438..c7c6db6b4a35 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -196,8 +196,6 @@ struct btrfs_inode { struct inode vfs_inode; }; -extern unsigned char btrfs_filetype_table[]; - static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f51b509f2d9b..c1da34e3a775 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1691,7 +1691,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, name = (char *)(di + 1); name_len = btrfs_stack_dir_name_len(di); - d_type = btrfs_filetype_table[di->type]; + d_type = fs_ftype_to_dtype(di->type); btrfs_disk_key_to_cpu(, >location); over = !dir_emit(ctx, name, name_len, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3ea5339603cf..089638719842 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -73,17 +73,6 @@ struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; -#define S_SHIFT 12 -static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT]= BTRFS_FT_REG_FILE, - [S_IFDIR >> S_SHIFT]= BTRFS_FT_DIR, - [S_IFCHR >> S_SHIFT]= BTRFS_FT_CHRDEV, - [S_IFBLK >> S_SHIFT]= BTRFS_FT_BLKDEV, - [S_IFIFO >> S_SHIFT]= BTRFS_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, - [S_IFLNK >> S_SHIFT]= BTRFS_FT_SYMLINK, -}; - static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); @@ -5803,10 +5792,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, return d_splice_alias(inode, dentry); } -unsigned char btrfs_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; - /* * All this infrastructure exists because dir_emit can fault, and we are holding * the tree lock when doing readdir. For now just allocate a buffer and copy @@ -5879,6 +5864,19 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) bool put = false; struct btrfs_key location; + /* +* compile-time asserts that generic FT_x types still match +* BTRFS_FT_x types +*/ + BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN); + BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE); + BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR); + BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV); + BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV); + BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO); + BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK); + BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK); + if (!dir_emit_dots(file, ctx)) return 0; @@ -5945,7 +5943,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) name_ptr = (char *)(entry + 1); read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), name_len); - put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)], + put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)), >type); btrfs_dir_item_key_to_cpu(leaf, di, ); put_unaligned(location.objectid, >ino); @@ -6350,7 +6348,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, static inline u8 btrfs_inode_type(struct inode *inode) { - return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; + return fs_umode_to_ftype(inode->i_mode); } /* diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index aff1356c2bb8..a4f5fb56a45b 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -307,6 +307,8 @@ * * Used by: * struct btrfs_dir_item.type + * + * Values 0..7 must match common file type values in fs_types.h. */ #define BTRFS_FT_UNKNOWN 0 #define BTRFS_FT_REG_FILE 1 -- 2.17.2
Re: Kernel crash related to LZO compression
On 2018/10/26 下午10:57, Dmitry Katsubo wrote: > On 2018-10-25 20:49, Chris Murphy wrote: >> I would say the first step no matter what if you're using an older >> kernel, is to boot a current Fedora or Arch live or install media, >> mount the Btrfs and try to read the problem files and see if the >> problem still happens. I can't even being to estimate the tens of >> thousands of line changes since kernel 4.9. > > Good point Chris. Indeed booting a fresh kernel is never a problem. > Actually I forgot to mention that I've seen the same problem with > kernel 4.12.13 (attached). > >> What profile are you using for this Btrfs? Is this a raid56? What do >> you get for 'btrfs fi us ' ? > > It is RAID1 volume for both metadata and data, but unfortunately I > haven't recorded the actual output before the failure. The configuration > was like this: > > # btrfs filesystem show /var/log > Label: none uuid: 5b45ac8e-fd8c-4759-854a-94e45069959d > Total devices 2 FS bytes used 11.13GiB > devid 3 size 50.00GiB used 14.03GiB path /dev/sda3 > devid 4 size 50.00GiB used 14.03GiB path /dev/sdc1 > > On 2018-10-25 20:49, Chris Murphy wrote: >> It should be safe even with that kernel. I'm not sure this is >> compression related. There is a corruption bug related to inline >> extents and corruption that had been fairly elusive but I think it's >> fixed now. I haven't run into it though. > > On 2018-10-26 02:09, Qu Wenruo wrote: >>> Are there any updates / fixes done in that area? Is lzo option safe >>> to use? >> >> Yes, we have commits to harden lzo decompress code in v4.18: >> >> de885e3ee281a88f52283c7e8994e762e3a5f6bd btrfs: lzo: Harden inline lzo >> compressed extent decompression >> 314bfa473b6b6d3efe68011899bd718b349f29d7 btrfs: lzo: Add header length >> check to avoid potential out-of-bounds acc >> >> And for the root cause, it's compressed data without csum, then scrub >> could make it corrupted. >> >> It's also fixed in v4.18: >> >> 665d4953cde6d9e75c62a07ec8f4f8fd7d396ade btrfs: scrub: Don't use inode >> page cache in scrub_handle_errored_block() >> ac0b4145d662a3b9e34085dea460fb06ede9b69b btrfs: scrub: Don't use inode >> pages for device replace > > Thanks, Qu, for this information. Actually one time I've seen the binary > crap (not zeros) in text log files (/var/log/*.log) and I was surprised > that btrfs returned me data which is corrupted instead of signalling I/O > error. Could it be because of "compressed data without csum" problem? Yes, pretty much the case, especially for your RAID1 setup. The root fix should has been backported to stable kernel after 4.0, but the lzo decompression harden part isn't sent to stable kernel, so you may still hit such problem. Thanks, Qu > > Thanks! > signature.asc Description: OpenPGP digital signature
[josef-btrfs:kill-mmap-sem-v5 9/10] htmldocs: mm/filemap.c:2882: warning: Function parameter or member 'mkwrite' not described in 'filemap_page_mkwrite_nommapsem'
tree: https://git.kernel.org/pub/scm/linux/kernel/git/josef/btrfs-next.git kill-mmap-sem-v5 head: c7a3e899803d98ae543468f0026500c651528a6c commit: c52634339df10df75f28787f431b937de3c3c591 [9/10] mm: introduce filemap_page_mkwrite_nommapsem reproduce: make htmldocs All warnings (new ones prefixed by >>): WARNING: convert(1) not found, for SVG to PDF conversion install ImageMagick (https://www.imagemagick.org) include/linux/srcu.h:175: warning: Function parameter or member 'p' not described in 'srcu_dereference_notrace' include/linux/srcu.h:175: warning: Function parameter or member 'sp' not described in 'srcu_dereference_notrace' include/linux/gfp.h:1: warning: no structured comments found >> mm/filemap.c:2882: warning: Function parameter or member 'mkwrite' not >> described in 'filemap_page_mkwrite_nommapsem' mm/filemap.c:2882: warning: Excess function parameter 'page_mkwrite_cb' description in 'filemap_page_mkwrite_nommapsem' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4381: warning: Function parameter or member 'wext.ibss' not described in 'wireless_dev' include/net/cfg80211.h:4381: warning: Function parameter or member 'wext.connect' not described in 'wireless_dev' include/net/cfg80211.h:4381: warning: Function parameter or member 'wext.keys' not described in 'wireless_dev' include/net/cfg80211.h:4381: warning: Function parameter or member 'wext.ie' not described in 'wireless_dev' include/net/cfg80211.h:4381: warning: Function parameter or member 'wext.ie_len' not described in 'wireless_dev' include/net/cfg80211.h:4381: warning: Function parameter or member 'wext.bssid' not described in 'wireless_dev' include/net/cfg80211.h:4381: warning: Function parameter or member 'wext.ssid' not described in 'wireless_dev' include/net/cfg80211.h:4381: warning: Function parameter or member 'wext.default_key' not described in 'wireless_dev' include/net/cfg80211.h:4381: warning: Function parameter or member 'wext.default_mgmt_key' not described in 'wireless_dev' include/net/cfg80211.h:4381: warning: Function parameter or member 'wext.prev_bssid_valid' not described in 'wireless_dev' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' include/net/cfg80211.h:4869: warning: Excess function parameter
[PATCH] Btrfs: remove no longer used logged range variables when logging extents
From: Filipe Manana The logged_start and logged_end variables, at btrfs_log_changed_extents(), were added in commit 8c6c592831a0 ("btrfs: log csums for all modified extents"). However since the recent simplification for fsync, which makes us wait for all ordered extents to complete before logging extents, we no longer need those variables. Commit a2120a473a80 ("btrfs: clean up the left over logged_list usage") forgot to remove them. Signed-off-by: Filipe Manana --- fs/btrfs/tree-log.c | 8 1 file changed, 8 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1673dccc76c2..c86c5dd100b2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4383,7 +4383,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = >extent_tree; - u64 logged_start, logged_end; u64 test_gen; int ret = 0; int num = 0; @@ -4393,8 +4392,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, down_write(>dio_sem); write_lock(>lock); test_gen = root->fs_info->last_trans_committed; - logged_start = start; - logged_end = end; list_for_each_entry_safe(em, n, >modified_extents, list) { list_del_init(>list); @@ -4418,11 +4415,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em->start >= i_size_read(>vfs_inode)) continue; - if (em->start < logged_start) - logged_start = em->start; - if ((em->start + em->len - 1) > logged_end) - logged_end = em->start + em->len - 1; - /* Need a ref to keep it from getting evicted from cache */ refcount_inc(>refs); set_bit(EXTENT_FLAG_LOGGING, >flags); -- 2.11.0
Re: [PATCH] Btrfs: remove no longer used stuff for tracking pending ordered extents
On Fri, Oct 26, 2018 at 9:16 AM wrote: > > From: Filipe Manana > > Tracking pending ordered extents per transaction was introduced in commit > 50d9aa99bd35 ("Btrfs: make sure logged extents complete in the current > transaction V3") and later updated in commit 161c3549b45a ("Btrfs: change > how we wait for pending ordered extents"). > > However now that on fsync we always wait for ordered extents to complete > before logging, done in commit 5636cf7d6dc8 ("btrfs: remove the logged > extents infrastructure"), we no longer need the stuff to track for pending > ordered extents, which was not completely removed in the mentioned commit. > So remove the remaining of the pending ordered extents infrastructure. > Reviewed-by: Liu Bo thanks, liubo > Signed-off-by: Filipe Manana > --- > fs/btrfs/ordered-data.c | 30 -- > fs/btrfs/ordered-data.h | 2 -- > fs/btrfs/transaction.c | 11 --- > fs/btrfs/transaction.h | 2 -- > 4 files changed, 45 deletions(-) > > diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c > index 0c4ef208b8b9..6fde2b2741ef 100644 > --- a/fs/btrfs/ordered-data.c > +++ b/fs/btrfs/ordered-data.c > @@ -460,7 +460,6 @@ void btrfs_remove_ordered_extent(struct inode *inode, > struct btrfs_inode *btrfs_inode = BTRFS_I(inode); > struct btrfs_root *root = btrfs_inode->root; > struct rb_node *node; > - bool dec_pending_ordered = false; > > /* This is paired with btrfs_add_ordered_extent. */ > spin_lock(_inode->lock); > @@ -477,37 +476,8 @@ void btrfs_remove_ordered_extent(struct inode *inode, > if (tree->last == node) > tree->last = NULL; > set_bit(BTRFS_ORDERED_COMPLETE, >flags); > - if (test_and_clear_bit(BTRFS_ORDERED_PENDING, >flags)) > - dec_pending_ordered = true; > spin_unlock_irq(>lock); > > - /* > -* The current running transaction is waiting on us, we need to let it > -* know that we're complete and wake it up. > -*/ > - if (dec_pending_ordered) { > - struct btrfs_transaction *trans; > - > - /* > -* The checks for trans are just a formality, it should be > set, > -* but if it isn't we don't want to deref/assert under the > spin > -* lock, so be nice and check if trans is set, but ASSERT() so > -* if it isn't set a developer will notice. > -*/ > - spin_lock(_info->trans_lock); > - trans = fs_info->running_transaction; > - if (trans) > - refcount_inc(>use_count); > - spin_unlock(_info->trans_lock); > - > - ASSERT(trans); > - if (trans) { > - if (atomic_dec_and_test(>pending_ordered)) > - wake_up(>pending_wait); > - btrfs_put_transaction(trans); > - } > - } > - > spin_lock(>ordered_extent_lock); > list_del_init(>root_extent_list); > root->nr_ordered_extents--; > diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h > index 02d813aaa261..b10e6765d88f 100644 > --- a/fs/btrfs/ordered-data.h > +++ b/fs/btrfs/ordered-data.h > @@ -56,8 +56,6 @@ struct btrfs_ordered_sum { >* the isize. */ > #define BTRFS_ORDERED_TRUNCATED 8 /* Set when we have to truncate an extent > */ > > -#define BTRFS_ORDERED_PENDING 9 /* We are waiting for this ordered extent to > - * complete in the current transaction. */ > #define BTRFS_ORDERED_REGULAR 10 /* Regular IO for COW */ > > struct btrfs_ordered_extent { > diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c > index 3b84f5015029..2fe6c2b1d94b 100644 > --- a/fs/btrfs/transaction.c > +++ b/fs/btrfs/transaction.c > @@ -232,14 +232,12 @@ static noinline int join_transaction(struct > btrfs_fs_info *fs_info, > extwriter_counter_init(cur_trans, type); > init_waitqueue_head(_trans->writer_wait); > init_waitqueue_head(_trans->commit_wait); > - init_waitqueue_head(_trans->pending_wait); > cur_trans->state = TRANS_STATE_RUNNING; > /* > * One for this trans handle, one so it will live on until we > * commit the transaction. > */ > refcount_set(_trans->use_count, 2); > - atomic_set(_trans->pending_ordered, 0); > cur_trans->flags = 0; > cur_trans->start_time = ktime_get_seconds(); > > @@ -1908,13 +1906,6 @@ static inline void btrfs_wait_delalloc_flush(struct > btrfs_fs_info *fs_info) > btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); > } > > -static inline void > -btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans) > -{ > - wait_event(cur_trans->pending_wait, > - atomic_read(_trans->pending_ordered) == 0); > -}
[PATCH v3] fstests: btrfs verify hardening agaist duplicate fsid
We have a known bug in btrfs, that we let the device path be changed after the device has been mounted. So using this loop hole the new copied device would appears as if its mounted immediately after its been copied. So this test case reproduces this issue. For example: Initially.. /dev/mmcblk0p4 is mounted as / lsblk NAMEMAJ:MIN RM SIZE RO TYPE MOUNTPOINT mmcblk0 179:00 29.2G 0 disk |-mmcblk0p4 179:404G 0 part / |-mmcblk0p2 179:20 500M 0 part /boot |-mmcblk0p3 179:30 256M 0 part [SWAP] `-mmcblk0p1 179:10 256M 0 part /boot/efi btrfs fi show Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4 Copy mmcblk0 to sda dd if=/dev/mmcblk0 of=/dev/sda And immediately after the copy completes the change in the device superblock is notified which the automount scans using btrfs device scan and the new device sda becomes the mounted root device. lsblk NAMEMAJ:MIN RM SIZE RO TYPE MOUNTPOINT sda 8:01 14.9G 0 disk |-sda48:414G 0 part / |-sda28:21 500M 0 part |-sda38:31 256M 0 part `-sda18:11 256M 0 part mmcblk0 179:00 29.2G 0 disk |-mmcblk0p4 179:404G 0 part |-mmcblk0p2 179:20 500M 0 part /boot |-mmcblk0p3 179:30 256M 0 part [SWAP] `-mmcblk0p1 179:10 256M 0 part /boot/efi btrfs fi show / Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid1 size 4.00GiB used 3.00GiB path /dev/sda4 The bug is quite nasty that you can't either unmount /dev/sda4 or /dev/mmcblk0p4. And the problem does not get solved until you take the sda out of the system on to another system to change its fsid using the 'btrfstune -u' command. Signed-off-by: Anand Jain --- v2->v3: Check the return code and use _fail to verify and accordingly fix golden output. Rename dev_foo(bar) to device_1(2) Don't log dd retun to $seqres.full v1->v2: dont play around with dev patch use it as it is. do not use SCRATCH_MNT instead create it at the TEST_DIR and its related changes. golden out changes tests/btrfs/173 | 82 + tests/btrfs/173.out | 2 ++ tests/btrfs/group | 1 + 3 files changed, 85 insertions(+) create mode 100755 tests/btrfs/173 create mode 100644 tests/btrfs/173.out diff --git a/tests/btrfs/173 b/tests/btrfs/173 new file mode 100755 index ..342ae92b4781 --- /dev/null +++ b/tests/btrfs/173 @@ -0,0 +1,82 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018 Oracle. All Rights Reserved. +# +# FS QA Test 173 +# +# Fuzzy test for FS image duplication. +# Could be fixed by +#[patch] btrfs: harden agaist duplicate fsid +# +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo "QA output created by $seq" + +here=`pwd` +tmp=/tmp/$$ +status=1 # failure is the default! +trap "_cleanup; exit \$status" 0 1 2 3 15 + +mnt=$TEST_DIR/$seq.mnt +_cleanup() +{ + rm -rf $mnt > /dev/null 2>&1 + cd / + rm -f $tmp.* +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# remove previous $seqres.full before test +rm -f $seqres.full + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch_dev_pool 2 +_scratch_dev_pool_get 2 + +device_1=$(echo $SCRATCH_DEV_POOL | awk '{print $1}') +device_2=$(echo $SCRATCH_DEV_POOL | awk '{print $2}') + +echo device_1=$device_1 device_2=$device_2 >> $seqres.full + +rm -rf $mnt > /dev/null 2>&1 +mkdir $mnt +_mkfs_dev $device_1 +_mount $device_1 $mnt + +[[ $(findmnt $mnt | grep -v TARGET | awk '{print $2}') != $device_1 ]] && \ + _fail "mounted device changed" + +for sb_bytenr in 65536 67108864 +do + echo -n "dd status=none if=$dev_foo of=$dev_bar bs=1 "\ + "seek=$sb_bytenr skip=$sb_bytenr count=4096" >> $seqres.full + dd status=none if=$device_1 of=$device_2 bs=1 seek=$sb_bytenr \ + skip=$sb_bytenr count=4096 > /dev/null 2>&1 + echo ..:$? >> $seqres.full +done + +#Original device is mounted, scan of its clone should fail +$BTRFS_UTIL_PROG device scan $device_2 >> $seqres.full 2>&1 +[[ $? != 1 ]] && _fail "cloned device scan should fail" + +[[ $(findmnt $mnt | grep -v TARGET | awk '{print $2}') != $device_1 ]] && \ + _fail "mounted device changed" + +#Original device scan should be successful +$BTRFS_UTIL_PROG device scan $device_1 >> $seqres.full 2>&1 +[[ $? != 0 ]] && \ + _fail "if it fails here, then it means subvolume mount at boot may fail "\ + "in some configs." + +umount $mnt > /dev/null 2>&1 +_scratch_dev_pool_put + +# success, all done +echo "Silence is golden" +status=0 +exit diff --git a/tests/btrfs/173.out b/tests/btrfs/173.out
[PATCH] Btrfs: remove no longer used stuff for tracking pending ordered extents
From: Filipe Manana Tracking pending ordered extents per transaction was introduced in commit 50d9aa99bd35 ("Btrfs: make sure logged extents complete in the current transaction V3") and later updated in commit 161c3549b45a ("Btrfs: change how we wait for pending ordered extents"). However now that on fsync we always wait for ordered extents to complete before logging, done in commit 5636cf7d6dc8 ("btrfs: remove the logged extents infrastructure"), we no longer need the stuff to track for pending ordered extents, which was not completely removed in the mentioned commit. So remove the remaining of the pending ordered extents infrastructure. Signed-off-by: Filipe Manana --- fs/btrfs/ordered-data.c | 30 -- fs/btrfs/ordered-data.h | 2 -- fs/btrfs/transaction.c | 11 --- fs/btrfs/transaction.h | 2 -- 4 files changed, 45 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0c4ef208b8b9..6fde2b2741ef 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -460,7 +460,6 @@ void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; struct rb_node *node; - bool dec_pending_ordered = false; /* This is paired with btrfs_add_ordered_extent. */ spin_lock(_inode->lock); @@ -477,37 +476,8 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, >flags); - if (test_and_clear_bit(BTRFS_ORDERED_PENDING, >flags)) - dec_pending_ordered = true; spin_unlock_irq(>lock); - /* -* The current running transaction is waiting on us, we need to let it -* know that we're complete and wake it up. -*/ - if (dec_pending_ordered) { - struct btrfs_transaction *trans; - - /* -* The checks for trans are just a formality, it should be set, -* but if it isn't we don't want to deref/assert under the spin -* lock, so be nice and check if trans is set, but ASSERT() so -* if it isn't set a developer will notice. -*/ - spin_lock(_info->trans_lock); - trans = fs_info->running_transaction; - if (trans) - refcount_inc(>use_count); - spin_unlock(_info->trans_lock); - - ASSERT(trans); - if (trans) { - if (atomic_dec_and_test(>pending_ordered)) - wake_up(>pending_wait); - btrfs_put_transaction(trans); - } - } - spin_lock(>ordered_extent_lock); list_del_init(>root_extent_list); root->nr_ordered_extents--; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 02d813aaa261..b10e6765d88f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -56,8 +56,6 @@ struct btrfs_ordered_sum { * the isize. */ #define BTRFS_ORDERED_TRUNCATED 8 /* Set when we have to truncate an extent */ -#define BTRFS_ORDERED_PENDING 9 /* We are waiting for this ordered extent to - * complete in the current transaction. */ #define BTRFS_ORDERED_REGULAR 10 /* Regular IO for COW */ struct btrfs_ordered_extent { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3b84f5015029..2fe6c2b1d94b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -232,14 +232,12 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, extwriter_counter_init(cur_trans, type); init_waitqueue_head(_trans->writer_wait); init_waitqueue_head(_trans->commit_wait); - init_waitqueue_head(_trans->pending_wait); cur_trans->state = TRANS_STATE_RUNNING; /* * One for this trans handle, one so it will live on until we * commit the transaction. */ refcount_set(_trans->use_count, 2); - atomic_set(_trans->pending_ordered, 0); cur_trans->flags = 0; cur_trans->start_time = ktime_get_seconds(); @@ -1908,13 +1906,6 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } -static inline void -btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans) -{ - wait_event(cur_trans->pending_wait, - atomic_read(_trans->pending_ordered) == 0); -} - int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2049,8 +2040,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_wait_delalloc_flush(fs_info); -
Re: [PATCH v2 rev log added] fstests: btrfs verify hardening agaist duplicate fsid
On 10/26/2018 11:52 PM, Nikolay Borisov wrote: On 26.10.18 г. 18:34 ч., Anand Jain wrote: On 10/26/2018 11:02 PM, Nikolay Borisov wrote: On 8.10.18 г. 21:28 ч., Anand Jain wrote: We have a known bug in btrfs, that we let the device path be changed after the device has been mounted. So using this loop hole the new copied device would appears as if its mounted immediately after its been copied. So this test case reproduces this issue. For example: Initially.. /dev/mmcblk0p4 is mounted as / lsblk NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT mmcblk0 179:0 0 29.2G 0 disk |-mmcblk0p4 179:4 0 4G 0 part / |-mmcblk0p2 179:2 0 500M 0 part /boot |-mmcblk0p3 179:3 0 256M 0 part [SWAP] `-mmcblk0p1 179:1 0 256M 0 part /boot/efi btrfs fi show Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid 1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4 Copy mmcblk0 to sda dd if=/dev/mmcblk0 of=/dev/sda And immediately after the copy completes the change in the device superblock is notified which the automount scans using btrfs device scan and the new device sda becomes the mounted root device. lsblk NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sda 8:0 1 14.9G 0 disk |-sda4 8:4 1 4G 0 part / |-sda2 8:2 1 500M 0 part |-sda3 8:3 1 256M 0 part `-sda1 8:1 1 256M 0 part mmcblk0 179:0 0 29.2G 0 disk |-mmcblk0p4 179:4 0 4G 0 part |-mmcblk0p2 179:2 0 500M 0 part /boot |-mmcblk0p3 179:3 0 256M 0 part [SWAP] `-mmcblk0p1 179:1 0 256M 0 part /boot/efi btrfs fi show / Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid 1 size 4.00GiB used 3.00GiB path /dev/sda4 The bug is quite nasty that you can't either unmount /dev/sda4 or /dev/mmcblk0p4. And the problem does not get solved until you take the sda out of the system on to another system to change its fsid using the 'btrfstune -u' command. Is there a pending fix for this? Yes. https://patchwork.kernel.org/patch/10641041/ Test case header mentioned it. Signed-off-by: Anand Jain --- v1->v2: dont play around with dev patch use it as it is. do not use SCRATCH_MNT instead create it at the TEST_DIR and its related changes. golden out changes tests/btrfs/173 | 88 + tests/btrfs/173.out | 6 tests/btrfs/group | 1 + 3 files changed, 95 insertions(+) create mode 100755 tests/btrfs/173 create mode 100644 tests/btrfs/173.out diff --git a/tests/btrfs/173 b/tests/btrfs/173 new file mode 100755 index ..b466ae921e19 --- /dev/null +++ b/tests/btrfs/173 @@ -0,0 +1,88 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018 Oracle. All Rights Reserved. +# +# FS QA Test 173 +# +# Fuzzy test for FS image duplication. +# Could be fixed by +# [patch] btrfs: harden agaist duplicate fsid +# +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo "QA output created by $seq" + +here=`pwd` +tmp=/tmp/$$ +status=1 # failure is the default! +trap "_cleanup; exit \$status" 0 1 2 3 15 + +mnt=$TEST_DIR/$seq.mnt +_cleanup() +{ + rm -rf $mnt > /dev/null 2>&1 + cd / + rm -f $tmp.* +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# remove previous $seqres.full before test +rm -f $seqres.full + +# real QA test starts here + +# Modify as appropriate. +_supported_fs btrfs +_supported_os Linux +_require_scratch_dev_pool 2 +_scratch_dev_pool_get 2 + +dev_foo=$(echo $SCRATCH_DEV_POOL | awk '{print $1}') +dev_bar=$(echo $SCRATCH_DEV_POOL | awk '{print $2}') Naming devices _foo and _bar just shows you could not care less about the test. Hmm. Will make it device_1 and device_2. Either use sane names - primary_dev/secondary dev or device_1 and device_2. + +echo dev_foo=$dev_foo >> $seqres.full +echo dev_bar=$dev_bar >> $seqres.full +echo | tee -a $seqres.full + +rm -rf $mnt > /dev/null 2>&1 So what is $mnt? I can see it's used by very few tests and it's not obvious? Generally you should either define it as a local variable or use one of the SCRATCH_MNT/TEST_MNT global variables. Also I checked with Eric re. the use of $mnt in tests and his conclusion is : " looks like a bug" No its not. As few lines above, I have assigned it as.. mnt=$TEST_DIR/$seq.mnt I missed that, I will recommend moving the assignment near where you set the devices cleanup() is using $mnt. cleanup() may get called for any trap before the actual test. Thanks, Anand +mkdir $mnt +_mkfs_dev $dev_foo +_mount $dev_foo $mnt + +check_btrfs_mount() +{ + local x=$(findmnt $mnt | grep -v TARGET | awk '{print $2}') + [[ $x == $dev_foo ]] && echo DEV_FOO + [[ $x == $dev_bar ]] && echo DEV_BAR +} Same thing here re. DEV_(FOO|BAR). + +echo MNT $(check_btrfs_mount) + +for
Re: [PATCH v2 rev log added] fstests: btrfs verify hardening agaist duplicate fsid
On 26.10.18 г. 18:34 ч., Anand Jain wrote: > > > On 10/26/2018 11:02 PM, Nikolay Borisov wrote: >> >> >> On 8.10.18 г. 21:28 ч., Anand Jain wrote: >>> We have a known bug in btrfs, that we let the device path be changed >>> after the device has been mounted. So using this loop hole the new >>> copied device would appears as if its mounted immediately after its >>> been copied. So this test case reproduces this issue. >>> >>> For example: >>> >>> Initially.. /dev/mmcblk0p4 is mounted as / >>> >>> lsblk >>> NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT >>> mmcblk0 179:0 0 29.2G 0 disk >>> |-mmcblk0p4 179:4 0 4G 0 part / >>> |-mmcblk0p2 179:2 0 500M 0 part /boot >>> |-mmcblk0p3 179:3 0 256M 0 part [SWAP] >>> `-mmcblk0p1 179:1 0 256M 0 part /boot/efi >>> >>> btrfs fi show >>> Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba >>> Total devices 1 FS bytes used 1.40GiB >>> devid 1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4 >>> >>> Copy mmcblk0 to sda >>> dd if=/dev/mmcblk0 of=/dev/sda >>> >>> And immediately after the copy completes the change in the device >>> superblock is notified which the automount scans using >>> btrfs device scan and the new device sda becomes the mounted root >>> device. >>> >>> lsblk >>> NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT >>> sda 8:0 1 14.9G 0 disk >>> |-sda4 8:4 1 4G 0 part / >>> |-sda2 8:2 1 500M 0 part >>> |-sda3 8:3 1 256M 0 part >>> `-sda1 8:1 1 256M 0 part >>> mmcblk0 179:0 0 29.2G 0 disk >>> |-mmcblk0p4 179:4 0 4G 0 part >>> |-mmcblk0p2 179:2 0 500M 0 part /boot >>> |-mmcblk0p3 179:3 0 256M 0 part [SWAP] >>> `-mmcblk0p1 179:1 0 256M 0 part /boot/efi >>> btrfs fi show / >>> Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba >>> Total devices 1 FS bytes used 1.40GiB >>> devid 1 size 4.00GiB used 3.00GiB path /dev/sda4 >>> >>> The bug is quite nasty that you can't either unmount /dev/sda4 or >>> /dev/mmcblk0p4. And the problem does not get solved until you take >>> the sda out of the system on to another system to change its fsid using >>> the 'btrfstune -u' command. >> >> Is there a pending fix for this? > > Yes. > https://patchwork.kernel.org/patch/10641041/ > > Test case header mentioned it. > >> >>> >>> Signed-off-by: Anand Jain >>> --- >>> v1->v2: >>> dont play around with dev patch use it as it is. >>> do not use SCRATCH_MNT instead create it at the TEST_DIR and its >>> related >>> changes. >>> golden out changes >>> tests/btrfs/173 | 88 >>> + >>> tests/btrfs/173.out | 6 >>> tests/btrfs/group | 1 + >>> 3 files changed, 95 insertions(+) >>> create mode 100755 tests/btrfs/173 >>> create mode 100644 tests/btrfs/173.out >>> >>> diff --git a/tests/btrfs/173 b/tests/btrfs/173 >>> new file mode 100755 >>> index ..b466ae921e19 >>> --- /dev/null >>> +++ b/tests/btrfs/173 >>> @@ -0,0 +1,88 @@ >>> +#! /bin/bash >>> +# SPDX-License-Identifier: GPL-2.0 >>> +# Copyright (c) 2018 Oracle. All Rights Reserved. >>> +# >>> +# FS QA Test 173 >>> +# >>> +# Fuzzy test for FS image duplication. >>> +# Could be fixed by >>> +# [patch] btrfs: harden agaist duplicate fsid >>> +# >>> +seq=`basename $0` >>> +seqres=$RESULT_DIR/$seq >>> +echo "QA output created by $seq" >>> + >>> +here=`pwd` >>> +tmp=/tmp/$$ >>> +status=1 # failure is the default! >>> +trap "_cleanup; exit \$status" 0 1 2 3 15 >>> + >>> +mnt=$TEST_DIR/$seq.mnt >>> +_cleanup() >>> +{ >>> + rm -rf $mnt > /dev/null 2>&1 >>> + cd / >>> + rm -f $tmp.* >>> +} >>> + >>> +# get standard environment, filters and checks >>> +. ./common/rc >>> +. ./common/filter >>> + >>> +# remove previous $seqres.full before test >>> +rm -f $seqres.full >>> + >>> +# real QA test starts here >>> + >>> +# Modify as appropriate. >>> +_supported_fs btrfs >>> +_supported_os Linux >>> +_require_scratch_dev_pool 2 >>> +_scratch_dev_pool_get 2 >>> + >>> +dev_foo=$(echo $SCRATCH_DEV_POOL | awk '{print $1}') >>> +dev_bar=$(echo $SCRATCH_DEV_POOL | awk '{print $2}') >> >> Naming devices _foo and _bar just shows you could not care less about >> the test. > > Hmm. Will make it device_1 and device_2. > >> Either use sane names - primary_dev/secondary dev or device_1 >> and device_2. > >>> + >>> +echo dev_foo=$dev_foo >> $seqres.full >>> +echo dev_bar=$dev_bar >> $seqres.full >>> +echo | tee -a $seqres.full >>> + >>> +rm -rf $mnt > /dev/null 2>&1 >> >> So what is $mnt? I can see it's used by very few tests and it's not >> obvious? Generally you should either define it as a local variable or >> use one of the SCRATCH_MNT/TEST_MNT global variables. Also I checked >> with Eric re. the use of $mnt in tests and his conclusion is : >> >> " looks like a bug" > > No its not. As few lines above, I have assigned it as.. > mnt=$TEST_DIR/$seq.mnt I missed that, I will
Re: [PATCH v2 rev log added] fstests: btrfs verify hardening agaist duplicate fsid
On 10/26/2018 11:02 PM, Nikolay Borisov wrote: On 8.10.18 г. 21:28 ч., Anand Jain wrote: We have a known bug in btrfs, that we let the device path be changed after the device has been mounted. So using this loop hole the new copied device would appears as if its mounted immediately after its been copied. So this test case reproduces this issue. For example: Initially.. /dev/mmcblk0p4 is mounted as / lsblk NAMEMAJ:MIN RM SIZE RO TYPE MOUNTPOINT mmcblk0 179:00 29.2G 0 disk |-mmcblk0p4 179:404G 0 part / |-mmcblk0p2 179:20 500M 0 part /boot |-mmcblk0p3 179:30 256M 0 part [SWAP] `-mmcblk0p1 179:10 256M 0 part /boot/efi btrfs fi show Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4 Copy mmcblk0 to sda dd if=/dev/mmcblk0 of=/dev/sda And immediately after the copy completes the change in the device superblock is notified which the automount scans using btrfs device scan and the new device sda becomes the mounted root device. lsblk NAMEMAJ:MIN RM SIZE RO TYPE MOUNTPOINT sda 8:01 14.9G 0 disk |-sda48:414G 0 part / |-sda28:21 500M 0 part |-sda38:31 256M 0 part `-sda18:11 256M 0 part mmcblk0 179:00 29.2G 0 disk |-mmcblk0p4 179:404G 0 part |-mmcblk0p2 179:20 500M 0 part /boot |-mmcblk0p3 179:30 256M 0 part [SWAP] `-mmcblk0p1 179:10 256M 0 part /boot/efi btrfs fi show / Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid1 size 4.00GiB used 3.00GiB path /dev/sda4 The bug is quite nasty that you can't either unmount /dev/sda4 or /dev/mmcblk0p4. And the problem does not get solved until you take the sda out of the system on to another system to change its fsid using the 'btrfstune -u' command. Is there a pending fix for this? Yes. https://patchwork.kernel.org/patch/10641041/ Test case header mentioned it. Signed-off-by: Anand Jain --- v1->v2: dont play around with dev patch use it as it is. do not use SCRATCH_MNT instead create it at the TEST_DIR and its related changes. golden out changes tests/btrfs/173 | 88 + tests/btrfs/173.out | 6 tests/btrfs/group | 1 + 3 files changed, 95 insertions(+) create mode 100755 tests/btrfs/173 create mode 100644 tests/btrfs/173.out diff --git a/tests/btrfs/173 b/tests/btrfs/173 new file mode 100755 index ..b466ae921e19 --- /dev/null +++ b/tests/btrfs/173 @@ -0,0 +1,88 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018 Oracle. All Rights Reserved. +# +# FS QA Test 173 +# +# Fuzzy test for FS image duplication. +# Could be fixed by +#[patch] btrfs: harden agaist duplicate fsid +# +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo "QA output created by $seq" + +here=`pwd` +tmp=/tmp/$$ +status=1 # failure is the default! +trap "_cleanup; exit \$status" 0 1 2 3 15 + +mnt=$TEST_DIR/$seq.mnt +_cleanup() +{ + rm -rf $mnt > /dev/null 2>&1 + cd / + rm -f $tmp.* +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# remove previous $seqres.full before test +rm -f $seqres.full + +# real QA test starts here + +# Modify as appropriate. +_supported_fs btrfs +_supported_os Linux +_require_scratch_dev_pool 2 +_scratch_dev_pool_get 2 + +dev_foo=$(echo $SCRATCH_DEV_POOL | awk '{print $1}') +dev_bar=$(echo $SCRATCH_DEV_POOL | awk '{print $2}') Naming devices _foo and _bar just shows you could not care less about the test. Hmm. Will make it device_1 and device_2. Either use sane names - primary_dev/secondary dev or device_1 and device_2. + +echo dev_foo=$dev_foo >> $seqres.full +echo dev_bar=$dev_bar >> $seqres.full +echo | tee -a $seqres.full + +rm -rf $mnt > /dev/null 2>&1 So what is $mnt? I can see it's used by very few tests and it's not obvious? Generally you should either define it as a local variable or use one of the SCRATCH_MNT/TEST_MNT global variables. Also I checked with Eric re. the use of $mnt in tests and his conclusion is : " looks like a bug" No its not. As few lines above, I have assigned it as.. mnt=$TEST_DIR/$seq.mnt +mkdir $mnt +_mkfs_dev $dev_foo +_mount $dev_foo $mnt + +check_btrfs_mount() +{ + local x=$(findmnt $mnt | grep -v TARGET | awk '{print $2}') + [[ $x == $dev_foo ]] && echo DEV_FOO + [[ $x == $dev_bar ]] && echo DEV_BAR +} Same thing here re. DEV_(FOO|BAR). + +echo MNT $(check_btrfs_mount) + +for sb_bytenr in 65536 67108864 +do + echo -n "dd status=none if=$dev_foo of=$dev_bar bs=1 "\ + "seek=$sb_bytenr skip=$sb_bytenr count=4096" >> $seqres.full + dd status=none if=$dev_foo of=$dev_bar bs=1 seek=$sb_bytenr \ +
Re: [PATCH v2 rev log added] fstests: btrfs verify hardening agaist duplicate fsid
On 8.10.18 г. 21:28 ч., Anand Jain wrote: > We have a known bug in btrfs, that we let the device path be changed > after the device has been mounted. So using this loop hole the new > copied device would appears as if its mounted immediately after its > been copied. So this test case reproduces this issue. > > For example: > > Initially.. /dev/mmcblk0p4 is mounted as / > > lsblk > NAMEMAJ:MIN RM SIZE RO TYPE MOUNTPOINT > mmcblk0 179:00 29.2G 0 disk > |-mmcblk0p4 179:404G 0 part / > |-mmcblk0p2 179:20 500M 0 part /boot > |-mmcblk0p3 179:30 256M 0 part [SWAP] > `-mmcblk0p1 179:10 256M 0 part /boot/efi > > btrfs fi show > Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba > Total devices 1 FS bytes used 1.40GiB > devid1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4 > > Copy mmcblk0 to sda > dd if=/dev/mmcblk0 of=/dev/sda > > And immediately after the copy completes the change in the device > superblock is notified which the automount scans using > btrfs device scan and the new device sda becomes the mounted root > device. > > lsblk > NAMEMAJ:MIN RM SIZE RO TYPE MOUNTPOINT > sda 8:01 14.9G 0 disk > |-sda48:414G 0 part / > |-sda28:21 500M 0 part > |-sda38:31 256M 0 part > `-sda18:11 256M 0 part > mmcblk0 179:00 29.2G 0 disk > |-mmcblk0p4 179:404G 0 part > |-mmcblk0p2 179:20 500M 0 part /boot > |-mmcblk0p3 179:30 256M 0 part [SWAP] > `-mmcblk0p1 179:10 256M 0 part /boot/efi > btrfs fi show / > Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba > Total devices 1 FS bytes used 1.40GiB > devid1 size 4.00GiB used 3.00GiB path /dev/sda4 > > The bug is quite nasty that you can't either unmount /dev/sda4 or > /dev/mmcblk0p4. And the problem does not get solved until you take > the sda out of the system on to another system to change its fsid using > the 'btrfstune -u' command. Is there a pending fix for this? > > Signed-off-by: Anand Jain > --- > v1->v2: > dont play around with dev patch use it as it is. > do not use SCRATCH_MNT instead create it at the TEST_DIR and its related >changes. > golden out changes > > tests/btrfs/173 | 88 > + > tests/btrfs/173.out | 6 > tests/btrfs/group | 1 + > 3 files changed, 95 insertions(+) > create mode 100755 tests/btrfs/173 > create mode 100644 tests/btrfs/173.out > > diff --git a/tests/btrfs/173 b/tests/btrfs/173 > new file mode 100755 > index ..b466ae921e19 > --- /dev/null > +++ b/tests/btrfs/173 > @@ -0,0 +1,88 @@ > +#! /bin/bash > +# SPDX-License-Identifier: GPL-2.0 > +# Copyright (c) 2018 Oracle. All Rights Reserved. > +# > +# FS QA Test 173 > +# > +# Fuzzy test for FS image duplication. > +# Could be fixed by > +#[patch] btrfs: harden agaist duplicate fsid > +# > +seq=`basename $0` > +seqres=$RESULT_DIR/$seq > +echo "QA output created by $seq" > + > +here=`pwd` > +tmp=/tmp/$$ > +status=1 # failure is the default! > +trap "_cleanup; exit \$status" 0 1 2 3 15 > + > +mnt=$TEST_DIR/$seq.mnt > +_cleanup() > +{ > + rm -rf $mnt > /dev/null 2>&1 > + cd / > + rm -f $tmp.* > +} > + > +# get standard environment, filters and checks > +. ./common/rc > +. ./common/filter > + > +# remove previous $seqres.full before test > +rm -f $seqres.full > + > +# real QA test starts here > + > +# Modify as appropriate. > +_supported_fs btrfs > +_supported_os Linux > +_require_scratch_dev_pool 2 > +_scratch_dev_pool_get 2 > + > +dev_foo=$(echo $SCRATCH_DEV_POOL | awk '{print $1}') > +dev_bar=$(echo $SCRATCH_DEV_POOL | awk '{print $2}') Naming devices _foo and _bar just shows you could not care less about the test. Either use sane names - primary_dev/secondary dev or device_1 and device_2. > + > +echo dev_foo=$dev_foo >> $seqres.full > +echo dev_bar=$dev_bar >> $seqres.full > +echo | tee -a $seqres.full > + > +rm -rf $mnt > /dev/null 2>&1 So what is $mnt? I can see it's used by very few tests and it's not obvious? Generally you should either define it as a local variable or use one of the SCRATCH_MNT/TEST_MNT global variables. Also I checked with Eric re. the use of $mnt in tests and his conclusion is : " looks like a bug" > +mkdir $mnt > +_mkfs_dev $dev_foo > +_mount $dev_foo $mnt > + > +check_btrfs_mount() > +{ > + local x=$(findmnt $mnt | grep -v TARGET | awk '{print $2}') > + [[ $x == $dev_foo ]] && echo DEV_FOO > + [[ $x == $dev_bar ]] && echo DEV_BAR > +} Same thing here re. DEV_(FOO|BAR). > + > +echo MNT $(check_btrfs_mount) > + > +for sb_bytenr in 65536 67108864 > +do > + echo -n "dd status=none if=$dev_foo of=$dev_bar bs=1 "\ > + "seek=$sb_bytenr skip=$sb_bytenr count=4096" >> $seqres.full > + dd status=none if=$dev_foo of=$dev_bar bs=1 seek=$sb_bytenr \ > + skip=$sb_bytenr count=4096 >> $seqres.full
Re: Kernel crash related to LZO compression
On 2018-10-25 20:49, Chris Murphy wrote: I would say the first step no matter what if you're using an older kernel, is to boot a current Fedora or Arch live or install media, mount the Btrfs and try to read the problem files and see if the problem still happens. I can't even being to estimate the tens of thousands of line changes since kernel 4.9. Good point Chris. Indeed booting a fresh kernel is never a problem. Actually I forgot to mention that I've seen the same problem with kernel 4.12.13 (attached). What profile are you using for this Btrfs? Is this a raid56? What do you get for 'btrfs fi us ' ? It is RAID1 volume for both metadata and data, but unfortunately I haven't recorded the actual output before the failure. The configuration was like this: # btrfs filesystem show /var/log Label: none uuid: 5b45ac8e-fd8c-4759-854a-94e45069959d Total devices 2 FS bytes used 11.13GiB devid3 size 50.00GiB used 14.03GiB path /dev/sda3 devid4 size 50.00GiB used 14.03GiB path /dev/sdc1 On 2018-10-25 20:49, Chris Murphy wrote: It should be safe even with that kernel. I'm not sure this is compression related. There is a corruption bug related to inline extents and corruption that had been fairly elusive but I think it's fixed now. I haven't run into it though. On 2018-10-26 02:09, Qu Wenruo wrote: Are there any updates / fixes done in that area? Is lzo option safe to use? Yes, we have commits to harden lzo decompress code in v4.18: de885e3ee281a88f52283c7e8994e762e3a5f6bd btrfs: lzo: Harden inline lzo compressed extent decompression 314bfa473b6b6d3efe68011899bd718b349f29d7 btrfs: lzo: Add header length check to avoid potential out-of-bounds acc And for the root cause, it's compressed data without csum, then scrub could make it corrupted. It's also fixed in v4.18: 665d4953cde6d9e75c62a07ec8f4f8fd7d396ade btrfs: scrub: Don't use inode page cache in scrub_handle_errored_block() ac0b4145d662a3b9e34085dea460fb06ede9b69b btrfs: scrub: Don't use inode pages for device replace Thanks, Qu, for this information. Actually one time I've seen the binary crap (not zeros) in text log files (/var/log/*.log) and I was surprised that btrfs returned me data which is corrupted instead of signalling I/O error. Could it be because of "compressed data without csum" problem? Thanks! -- With best regards, Dmitry [Sun Dec 3 19:39:55 2017] BUG: unable to handle kernel paging request at f80a3000 [Sun Dec 3 19:39:55 2017] IP: memcpy+0x11/0x20 [Sun Dec 3 19:39:55 2017] *pde = 370bb067 [Sun Dec 3 19:39:55 2017] *pte = [Sun Dec 3 19:39:55 2017] Oops: 0002 [#1] SMP [Sun Dec 3 19:39:55 2017] Modules linked in: bridge stp llc arc4 iTCO_wdt iTCO_vendor_support ppdev ath5k evdev ath mac80211 cfg80211 i915 coretemp pcspkr rfkill snd_hda_codec_realtek serio_raw snd_hda_codec_generic video snd_hda_intel drm_kms_helper snd_hda_codec lpc_ich drm snd_hda_core snd_hwdep i2c_algo_bit snd_pcm_oss snd_mixer_oss fb_sys_fops sg snd_pcm syscopyarea snd_timer sysfillrect rng_core snd sysimgblt soundcore parport_pc parport shpchp button acpi_cpufreq binfmt_misc w83627hf hwmon_vid ip_tables x_tables autofs4 ses enclosure scsi_transport_sas xfs libcrc32c hid_generic usbhid hid btrfs crc32c_generic xor raid6_pq uas usb_storage sr_mod cdrom sd_mod ata_generic ata_piix i2c_i801 libata scsi_mod firewire_ohci firewire_core crc_itu_t ehci_pci e1000e ptp pps_core uhci_hcd ehci_hcd usbcore usb_common [Sun Dec 3 19:39:55 2017] CPU: 1 PID: 100 Comm: kworker/u4:2 Tainted: G W 4.12.0-2-686 #1 Debian 4.12.13-1 [Sun Dec 3 19:39:55 2017] Hardware name: AOpen i945GMx-IF/i945GMx-IF, BIOS i945GMx-IF R1.01 Mar.02.2007 AOpen Inc. 03/02/2007 [Sun Dec 3 19:39:55 2017] Workqueue: btrfs-endio btrfs_endio_helper [btrfs] [Sun Dec 3 19:39:55 2017] task: f7337280 task.stack: f695c000 [Sun Dec 3 19:39:55 2017] EIP: memcpy+0x11/0x20 [Sun Dec 3 19:39:55 2017] EFLAGS: 00010206 CPU: 1 [Sun Dec 3 19:39:55 2017] EAX: f80a2ff8 EBX: 1000 ECX: 03fe EDX: ff998000 [Sun Dec 3 19:39:55 2017] ESI: ff998008 EDI: f80a3000 EBP: ESP: f695de88 [Sun Dec 3 19:39:55 2017] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 [Sun Dec 3 19:39:55 2017] CR0: 80050033 CR2: f9c00140 CR3: 36bc7000 CR4: 06d0 [Sun Dec 3 19:39:55 2017] Call Trace: [Sun Dec 3 19:39:55 2017] ? lzo_decompress_bio+0x19f/0x2b0 [btrfs] [Sun Dec 3 19:39:55 2017] ? end_compressed_bio_read+0x28d/0x360 [btrfs] [Sun Dec 3 19:39:55 2017] ? btrfs_scrubparity_helper+0xb6/0x2c0 [btrfs] [Sun Dec 3 19:39:55 2017] ? process_one_work+0x135/0x2f0 [Sun Dec 3 19:39:55 2017] ? worker_thread+0x39/0x3a0 [Sun Dec 3 19:39:55 2017] ? kthread+0xd7/0x110 [Sun Dec 3 19:39:55 2017] ? process_one_work+0x2f0/0x2f0 [Sun Dec 3 19:39:55 2017] ? kthread_create_on_node+0x30/0x30 [Sun Dec 3 19:39:55 2017] ? ret_from_fork+0x19/0x24 [Sun Dec 3 19:39:55 2017] Code: 43 58 2b 43 50 88 43 4e 5b eb ed 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90
Re: [PATCH] btrfs-progs: add cli to forget one or all scanned devices
On 26.10.18 г. 17:27 ч., Anand Jain wrote: > This patch adds cli > btrfs device forget [dev] > to remove the given device structure in the kernel if the device > is unmounted. If no argument is given it shall remove all stale > (device which are not mounted) from the kernel. > > Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov > --- > cmds-device.c | 72 > --- > ioctl.h | 2 ++ > 2 files changed, 61 insertions(+), 13 deletions(-) > > diff --git a/cmds-device.c b/cmds-device.c > index 2a05f70a76a9..280d6f555377 100644 > --- a/cmds-device.c > +++ b/cmds-device.c > @@ -254,10 +254,32 @@ static int cmd_device_delete(int argc, char **argv) > return _cmd_device_remove(argc, argv, cmd_device_delete_usage); > } > > +static int btrfs_forget_devices(char *path) > +{ > + struct btrfs_ioctl_vol_args args; > + int ret; > + int fd; > + > + fd = open("/dev/btrfs-control", O_RDWR); > + if (fd < 0) > + return -errno; > + > + memset(, 0, sizeof(args)); > + if (path) > + strncpy_null(args.name, path); > + ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, ); > + if (ret) > + ret = -errno; > + close(fd); > + return ret; > +} > + > static const char * const cmd_device_scan_usage[] = { > - "btrfs device scan [(-d|--all-devices)| [...]]", > - "Scan devices for a btrfs filesystem", > + "btrfs device scan [(-d|--all-devices)|(-u|--forget)| "\ > + "[...]]", > + "Scan or forget (deregister) devices for a btrfs filesystem", > " -d|--all-devices (deprecated)", > + " -u|--forget [ ..]", > NULL > }; > > @@ -267,37 +289,53 @@ static int cmd_device_scan(int argc, char **argv) > int devstart; > int all = 0; > int ret = 0; > + int forget = 0; > > optind = 0; > while (1) { > int c; > static const struct option long_options[] = { > { "all-devices", no_argument, NULL, 'd'}, > + { "forget", no_argument, NULL, 'u'}, > { NULL, 0, NULL, 0} > }; > > - c = getopt_long(argc, argv, "d", long_options, NULL); > + c = getopt_long(argc, argv, "du", long_options, NULL); > if (c < 0) > break; > switch (c) { > case 'd': > all = 1; > break; > + case 'u': > + forget = 1; > + break; > default: > usage(cmd_device_scan_usage); > } > } > devstart = optind; > > + if (all && forget) > + usage(cmd_device_scan_usage); > + > if (all && check_argc_max(argc - optind, 1)) > usage(cmd_device_scan_usage); > > if (all || argc - optind == 0) { > - printf("Scanning for Btrfs filesystems\n"); > - ret = btrfs_scan_devices(); > - error_on(ret, "error %d while scanning", ret); > - ret = btrfs_register_all_devices(); > - error_on(ret, "there are %d errors while registering devices", > ret); > + if (forget) { > + ret = btrfs_forget_devices(NULL); > + error_on(ret, "'%s', forget failed", > + strerror(-ret)); > + } else { > + printf("Scanning for Btrfs filesystems\n"); > + ret = btrfs_scan_devices(); > + error_on(ret, "error %d while scanning", ret); > + ret = btrfs_register_all_devices(); > + error_on(ret, > + "there are %d errors while registering devices", > + ret); > + } > goto out; > } > > @@ -315,11 +353,19 @@ static int cmd_device_scan(int argc, char **argv) > ret = 1; > goto out; > } > - printf("Scanning for Btrfs filesystems in '%s'\n", path); > - if (btrfs_register_one_device(path) != 0) { > - ret = 1; > - free(path); > - goto out; > + if (forget) { > + ret = btrfs_forget_devices(path); > + if (ret) > + error("Can't forget '%s': %s", > + path, strerror(-ret)); > + } else { > + printf("Scanning for Btrfs filesystems in '%s'\n", > + path); > + if (btrfs_register_one_device(path) != 0) { > + ret = 1; > + free(path); > +
Re: [PATCH] btrfs: introduce feature to forget a btrfs device
On 26.10.18 г. 17:27 ч., Anand Jain wrote: > Support for a new command 'btrfs dev forget [dev]' is proposed here > to undo the effects of 'btrfs dev scan [dev]'. For this purpose > this patch proposes to use ioctl #5 as it was empty. > IOW(BTRFS_IOCTL_MAGIC, 5, ..) > This patch adds new ioctl BTRFS_IOC_FORGET_DEV which can be sent from > the /dev/btrfs-control to forget one or all devices, (devices which are > not mounted) from the btrfs kernel. > > The argument it takes is struct btrfs_ioctl_vol_args, and ::name can be > set to specify the device path. And all unmounted devices can be removed > from the kernel if no device path is provided. > > Again, the devices are removed only if the relevant fsid aren't mounted. > > This new cli can provide.. > . Release of unwanted btrfs_fs_devices and btrfs_devices memory if the >device is not going to be mounted. > . Ability to mount the device in degraded mode when one of the other >device is corrupted like in split brain raid1. > . Running test cases which requires btrfs.ko-reload if the rootfs >is btrfs. > > Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov > --- > fs/btrfs/super.c | 3 +++ > fs/btrfs/volumes.c | 9 + > fs/btrfs/volumes.h | 1 + > include/uapi/linux/btrfs.h | 2 ++ > 4 files changed, 15 insertions(+) > > diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c > index 345c64d810d4..f99db6899004 100644 > --- a/fs/btrfs/super.c > +++ b/fs/btrfs/super.c > @@ -2246,6 +2246,9 @@ static long btrfs_control_ioctl(struct file *file, > unsigned int cmd, > ret = PTR_ERR_OR_ZERO(device); > mutex_unlock(_mutex); > break; > + case BTRFS_IOC_FORGET_DEV: > + ret = btrfs_forget_devices(vol->name); > + break; > case BTRFS_IOC_DEVICES_READY: > mutex_lock(_mutex); > device = btrfs_scan_one_device(vol->name, FMODE_READ, > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index f435d397019e..e1365a122657 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -1208,6 +1208,15 @@ static int btrfs_read_disk_super(struct block_device > *bdev, u64 bytenr, > return 0; > } > > +int btrfs_forget_devices(const char *path) > +{ > + mutex_lock(_mutex); > + btrfs_free_stale_devices(strlen(path) ? path:NULL, NULL); > + mutex_unlock(_mutex); > + > + return 0; > +} > + > /* > * Look for a btrfs signature on a device. This may be called out of the > mount path > * and we are not allowed to call set_blocksize during the scan. The > superblock > diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h > index aefce895e994..180297d04938 100644 > --- a/fs/btrfs/volumes.h > +++ b/fs/btrfs/volumes.h > @@ -406,6 +406,7 @@ int btrfs_open_devices(struct btrfs_fs_devices > *fs_devices, > fmode_t flags, void *holder); > struct btrfs_device *btrfs_scan_one_device(const char *path, > fmode_t flags, void *holder); > +int btrfs_forget_devices(const char *path); > int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); > void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); > void btrfs_assign_next_active_device(struct btrfs_device *device, > diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h > index 5ca1d21fc4a7..b1be7f828cb4 100644 > --- a/include/uapi/linux/btrfs.h > +++ b/include/uapi/linux/btrfs.h > @@ -836,6 +836,8 @@ enum btrfs_err_code { > struct btrfs_ioctl_vol_args) > #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ > struct btrfs_ioctl_vol_args) > +#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \ > +struct btrfs_ioctl_vol_args) > /* trans start and trans end are dangerous, and only for > * use by applications that know how to avoid the > * resulting deadlocks >
[PATCH] btrfs-progs: add cli to forget one or all scanned devices
This patch adds cli btrfs device forget [dev] to remove the given device structure in the kernel if the device is unmounted. If no argument is given it shall remove all stale (device which are not mounted) from the kernel. Signed-off-by: Anand Jain --- cmds-device.c | 72 --- ioctl.h | 2 ++ 2 files changed, 61 insertions(+), 13 deletions(-) diff --git a/cmds-device.c b/cmds-device.c index 2a05f70a76a9..280d6f555377 100644 --- a/cmds-device.c +++ b/cmds-device.c @@ -254,10 +254,32 @@ static int cmd_device_delete(int argc, char **argv) return _cmd_device_remove(argc, argv, cmd_device_delete_usage); } +static int btrfs_forget_devices(char *path) +{ + struct btrfs_ioctl_vol_args args; + int ret; + int fd; + + fd = open("/dev/btrfs-control", O_RDWR); + if (fd < 0) + return -errno; + + memset(, 0, sizeof(args)); + if (path) + strncpy_null(args.name, path); + ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, ); + if (ret) + ret = -errno; + close(fd); + return ret; +} + static const char * const cmd_device_scan_usage[] = { - "btrfs device scan [(-d|--all-devices)| [...]]", - "Scan devices for a btrfs filesystem", + "btrfs device scan [(-d|--all-devices)|(-u|--forget)| "\ + "[...]]", + "Scan or forget (deregister) devices for a btrfs filesystem", " -d|--all-devices (deprecated)", + " -u|--forget [ ..]", NULL }; @@ -267,37 +289,53 @@ static int cmd_device_scan(int argc, char **argv) int devstart; int all = 0; int ret = 0; + int forget = 0; optind = 0; while (1) { int c; static const struct option long_options[] = { { "all-devices", no_argument, NULL, 'd'}, + { "forget", no_argument, NULL, 'u'}, { NULL, 0, NULL, 0} }; - c = getopt_long(argc, argv, "d", long_options, NULL); + c = getopt_long(argc, argv, "du", long_options, NULL); if (c < 0) break; switch (c) { case 'd': all = 1; break; + case 'u': + forget = 1; + break; default: usage(cmd_device_scan_usage); } } devstart = optind; + if (all && forget) + usage(cmd_device_scan_usage); + if (all && check_argc_max(argc - optind, 1)) usage(cmd_device_scan_usage); if (all || argc - optind == 0) { - printf("Scanning for Btrfs filesystems\n"); - ret = btrfs_scan_devices(); - error_on(ret, "error %d while scanning", ret); - ret = btrfs_register_all_devices(); - error_on(ret, "there are %d errors while registering devices", ret); + if (forget) { + ret = btrfs_forget_devices(NULL); + error_on(ret, "'%s', forget failed", +strerror(-ret)); + } else { + printf("Scanning for Btrfs filesystems\n"); + ret = btrfs_scan_devices(); + error_on(ret, "error %d while scanning", ret); + ret = btrfs_register_all_devices(); + error_on(ret, + "there are %d errors while registering devices", + ret); + } goto out; } @@ -315,11 +353,19 @@ static int cmd_device_scan(int argc, char **argv) ret = 1; goto out; } - printf("Scanning for Btrfs filesystems in '%s'\n", path); - if (btrfs_register_one_device(path) != 0) { - ret = 1; - free(path); - goto out; + if (forget) { + ret = btrfs_forget_devices(path); + if (ret) + error("Can't forget '%s': %s", + path, strerror(-ret)); + } else { + printf("Scanning for Btrfs filesystems in '%s'\n", + path); + if (btrfs_register_one_device(path) != 0) { + ret = 1; + free(path); + goto out; + } } free(path); } diff --git a/ioctl.h b/ioctl.h index
[PATCH v11] Add cli and ioctl to forget scanned device(s)
v11: btrfs-progs: Bring the code into the else part of if(forget). Use strerror to print the erorr instead of ret. v10: Make btrfs-progs changes more readable. With an effort to keep the known bug [1] as it is.. [1] The cli 'btrfs device scan --all /dev/sdb' which should have scanned only one device, ends up scanning all the devices and I am not trying to fix this bug in this patch because.. . -d|--all is marked as deprecated, I hope -d option would go away . For now some script might be using this bug as a feature, and fixing this bug might lead to mount failure. v9: Make forget as a btrfs device scan option. Use forget in the fstests, now you can run fstests with btrfs as rootfs which helps to exercise the uuid_mutex lock. v8: Change log update in the kernel patch. v7: Use struct btrfs_ioctl_vol_args (instead of struct btrfs_ioctl_vol_args_v2) as its inline with other ioctl btrfs-control The CLI usage/features remains same. However internally the ioctl flag is not required to delete all the unmounted devices. Instead leave btrfs_ioctl_vol_args::name NULL. v6: Use the changed fn name btrfs_free_stale_devices(). Change in title: Old v5: Cover-letter: [PATCH v5] Add cli and ioctl to ignore a scanned device Kernel: [PATCH v5] btrfs: introduce feature to ignore a btrfs device Progs: [PATCH v5] btrfs-progs: add 'btrfs device ignore' cli v5: Adds feature to delete all stale devices Reuses btrfs_free_stale_devices() fn and so depends on the patch-set [1] in the ML. Uses struct btrfs_ioctl_vol_args_v2 instead of struct btrfs_ioctl_vol_args as arg Does the device path matching instead of btrfs_device matching (we won't delete the mounted device as btrfs_free_stale_devices() checks for it) v4: No change. But as the ML thread may be confusing, so resend. v3: No change. Send to correct ML. v2: Accepts review from Nikolay, details are in the specific patch. Patch 1/2 is renamed from [PATCH 1/2] btrfs: refactor btrfs_free_stale_device() to get device list delete to [PATCH 1/2] btrfs: add function to device list delete Adds cli and ioctl to forget a scanned device or forget all stale devices in the kernel. Anand Jain (1): btrfs: introduce feature to forget a btrfs device fs/btrfs/super.c | 3 +++ fs/btrfs/volumes.c | 9 + fs/btrfs/volumes.h | 1 + include/uapi/linux/btrfs.h | 2 ++ 4 files changed, 15 insertions(+) Anand Jain (1): btrfs-progs: add cli to forget one or all scanned devices cmds-device.c | 63 ++- ioctl.h | 2 ++ 2 files changed, 56 insertions(+), 9 deletions(-) [1] Anand Jain (1): fstests: btrfs use forget if not reload common/btrfs| 20 tests/btrfs/124 | 6 +++--- tests/btrfs/125 | 6 +++--- tests/btrfs/154 | 6 +++--- tests/btrfs/164 | 4 ++-- 5 files changed, 31 insertions(+), 11 deletions(-) -- 1.8.3.1
[PATCH] btrfs: introduce feature to forget a btrfs device
Support for a new command 'btrfs dev forget [dev]' is proposed here to undo the effects of 'btrfs dev scan [dev]'. For this purpose this patch proposes to use ioctl #5 as it was empty. IOW(BTRFS_IOCTL_MAGIC, 5, ..) This patch adds new ioctl BTRFS_IOC_FORGET_DEV which can be sent from the /dev/btrfs-control to forget one or all devices, (devices which are not mounted) from the btrfs kernel. The argument it takes is struct btrfs_ioctl_vol_args, and ::name can be set to specify the device path. And all unmounted devices can be removed from the kernel if no device path is provided. Again, the devices are removed only if the relevant fsid aren't mounted. This new cli can provide.. . Release of unwanted btrfs_fs_devices and btrfs_devices memory if the device is not going to be mounted. . Ability to mount the device in degraded mode when one of the other device is corrupted like in split brain raid1. . Running test cases which requires btrfs.ko-reload if the rootfs is btrfs. Signed-off-by: Anand Jain --- fs/btrfs/super.c | 3 +++ fs/btrfs/volumes.c | 9 + fs/btrfs/volumes.h | 1 + include/uapi/linux/btrfs.h | 2 ++ 4 files changed, 15 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 345c64d810d4..f99db6899004 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2246,6 +2246,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, ret = PTR_ERR_OR_ZERO(device); mutex_unlock(_mutex); break; + case BTRFS_IOC_FORGET_DEV: + ret = btrfs_forget_devices(vol->name); + break; case BTRFS_IOC_DEVICES_READY: mutex_lock(_mutex); device = btrfs_scan_one_device(vol->name, FMODE_READ, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f435d397019e..e1365a122657 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1208,6 +1208,15 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, return 0; } +int btrfs_forget_devices(const char *path) +{ + mutex_lock(_mutex); + btrfs_free_stale_devices(strlen(path) ? path:NULL, NULL); + mutex_unlock(_mutex); + + return 0; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index aefce895e994..180297d04938 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -406,6 +406,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); +int btrfs_forget_devices(const char *path); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 5ca1d21fc4a7..b1be7f828cb4 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -836,6 +836,8 @@ enum btrfs_err_code { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \ + struct btrfs_ioctl_vol_args) /* trans start and trans end are dangerous, and only for * use by applications that know how to avoid the * resulting deadlocks -- 1.8.3.1
Re: [PATCH] btrfs-progs: add cli to forget one or all scanned devices
On 10/26/2018 08:21 PM, Nikolay Borisov wrote: On 24.10.2018 07:31, Anand Jain wrote: This patch adds cli btrfs device forget [dev] to remove the given device structure in the kernel if the device is unmounted. If no argument is given it shall remove all stale (device which are not mounted) from the kernel. Signed-off-by: Anand Jain --- cmds-device.c | 59 +++ ioctl.h | 2 ++ 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/cmds-device.c b/cmds-device.c index 2a05f70a76a9..c2a2b7f304b8 100644 --- a/cmds-device.c +++ b/cmds-device.c @@ -254,10 +254,32 @@ static int cmd_device_delete(int argc, char **argv) return _cmd_device_remove(argc, argv, cmd_device_delete_usage); } +static int btrfs_forget_devices(char *path) +{ + struct btrfs_ioctl_vol_args args; + int ret; + int fd; + + fd = open("/dev/btrfs-control", O_RDWR); + if (fd < 0) + return -errno; + + memset(, 0, sizeof(args)); + if (path) + strncpy_null(args.name, path); + ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, ); + if (ret) + ret = -errno; + close(fd); + return ret; +} + static const char * const cmd_device_scan_usage[] = { - "btrfs device scan [(-d|--all-devices)| [...]]", - "Scan devices for a btrfs filesystem", + "btrfs device scan [(-d|--all-devices)|(-u|--forget)| "\ + "[...]]", + "Scan or forget (deregister) devices for a btrfs filesystem", " -d|--all-devices (deprecated)", + " -u|--forget [ ..]", NULL }; @@ -267,32 +289,45 @@ static int cmd_device_scan(int argc, char **argv) int devstart; int all = 0; int ret = 0; + int forget = 0; optind = 0; while (1) { int c; static const struct option long_options[] = { { "all-devices", no_argument, NULL, 'd'}, + { "forget", no_argument, NULL, 'u'}, { NULL, 0, NULL, 0} }; - c = getopt_long(argc, argv, "d", long_options, NULL); + c = getopt_long(argc, argv, "du", long_options, NULL); if (c < 0) break; switch (c) { case 'd': all = 1; break; + case 'u': + forget = 1; + break; default: usage(cmd_device_scan_usage); } } devstart = optind; + if (all && forget) + usage(cmd_device_scan_usage); + if (all && check_argc_max(argc - optind, 1)) usage(cmd_device_scan_usage); if (all || argc - optind == 0) { + if (forget) { + ret = btrfs_forget_devices(NULL); + error_on(ret, "error %d while running forget", ret); + goto out; + } nit: I would prefer to also have an if () {} else {} construct here, similar to what you do below. Will fix. Also, I think I can use sterror() instead of ret, in the error_on(). Fixed both of these in v11. Thanks, Anand printf("Scanning for Btrfs filesystems\n"); ret = btrfs_scan_devices(); error_on(ret, "error %d while scanning", ret); @@ -315,11 +350,19 @@ static int cmd_device_scan(int argc, char **argv) ret = 1; goto out; } - printf("Scanning for Btrfs filesystems in '%s'\n", path); - if (btrfs_register_one_device(path) != 0) { - ret = 1; - free(path); - goto out; + if (forget) { + ret = btrfs_forget_devices(path); + if (ret) + error("Can't forget '%s': %s", + path, strerror(-ret)); + } else { + printf("Scanning for Btrfs filesystems in '%s'\n", + path); + if (btrfs_register_one_device(path) != 0) { + ret = 1; + free(path); + goto out; + } } free(path); } diff --git a/ioctl.h b/ioctl.h index 709e996f401c..e27d80e09392 100644 --- a/ioctl.h +++ b/ioctl.h @@ -721,6 +721,8 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code) struct btrfs_ioctl_vol_args) #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
Re: [PATCH] btrfs: Fix error handling in btrfs_cleanup_ordered_extents
Hi Nikolay, Thank you for the patch! Yet something to improve: [auto build test ERROR on v4.19] [also build test ERROR on next-20181019] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Nikolay-Borisov/btrfs-Fix-error-handling-in-btrfs_cleanup_ordered_extents/20181026-194005 config: x86_64-randconfig-x014-201842 (attached as .config) compiler: gcc-7 (Debian 7.3.0-1) 7.3.0 reproduce: # save the attached .config to linux build tree make ARCH=x86_64 All error/warnings (new ones prefixed by >>): fs/btrfs/inode.c: In function 'btrfs_cleanup_ordered_extents': >> fs/btrfs/inode.c:10618:0: error: unterminated argument list invoking macro >> "if" }; >> fs/btrfs/inode.c:140:2: error: expected '(' at end of input if (page_start >= offset && page_end <= (offset + bytes - 1) { ^~ >> fs/btrfs/inode.c:140:2: warning: this 'if' clause does not guard... >> [-Wmisleading-indentation] fs/btrfs/inode.c:10618:0: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the 'if' }; >> fs/btrfs/inode.c:140:2: error: expected declaration or statement at end of >> input if (page_start >= offset && page_end <= (offset + bytes - 1) { ^~ fs/btrfs/inode.c:122:6: warning: unused variable 'page_end' [-Wunused-variable] u64 page_end = page_start + PAGE_SIZE - 1; ^~~~ fs/btrfs/inode.c: At top level: fs/btrfs/inode.c:87:12: warning: 'btrfs_setsize' declared 'static' but never defined [-Wunused-function] static int btrfs_setsize(struct inode *inode, struct iattr *attr); ^ fs/btrfs/inode.c:88:12: warning: 'btrfs_truncate' declared 'static' but never defined [-Wunused-function] static int btrfs_truncate(struct inode *inode, bool skip_writeback); ^~ fs/btrfs/inode.c:89:12: warning: 'btrfs_finish_ordered_io' declared 'static' but never defined [-Wunused-function] static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); ^~~ fs/btrfs/inode.c:90:21: warning: 'cow_file_range' declared 'static' but never defined [-Wunused-function] static noinline int cow_file_range(struct inode *inode, ^~ fs/btrfs/inode.c:95:27: warning: 'create_io_em' declared 'static' but never defined [-Wunused-function] static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, ^~~~ fs/btrfs/inode.c:101:13: warning: '__endio_write_update_ordered' declared 'static' but never defined [-Wunused-function] static void __endio_write_update_ordered(struct inode *inode, ^~~~ fs/btrfs/inode.c:71:27: warning: 'btrfs_inode_cachep' defined but not used [-Wunused-variable] static struct kmem_cache *btrfs_inode_cachep; ^~ vim +/if +10618 fs/btrfs/inode.c 76dda93c Yan, Zheng 2009-09-21 10615 82d339d9 Alexey Dobriyan 2009-10-09 10616 const struct dentry_operations btrfs_dentry_operations = { 76dda93c Yan, Zheng 2009-09-21 10617 .d_delete = btrfs_dentry_delete, 76dda93c Yan, Zheng 2009-09-21 @10618 }; :: The code at line 10618 was first introduced by commit :: 76dda93c6ae2c1dc3e6cde34569d6aca26b0c918 Btrfs: add snapshot/subvolume destroy ioctl :: TO: Yan, Zheng :: CC: Chris Mason --- 0-DAY kernel test infrastructureOpen Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation .config.gz Description: application/gzip
Re: [PATCH 5/5] btrfs: Replace BUG_ON with ASSERT in find_lock_delalloc_range
On 2018/10/26 下午7:43, Nikolay Borisov wrote: > lock_delalloc_pages should only return 2 values - 0 in case of success > and -EAGAIN if the range of pages to be locked should be shrunk due to > some of gone. Manual inspections confirms that this is > indeed the case since __process_pages_contig is where lock_delalloc_pages > gets its return value. The latter always returns 0 or -EAGAIN so the > invariant holds. No functional changes. > > Signed-off-by: Nikolay Borisov Reviewed-by: Qu Wenruo Thanks, Qu > --- > fs/btrfs/extent_io.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c > index 1a9a521aefe5..94bc53472031 100644 > --- a/fs/btrfs/extent_io.c > +++ b/fs/btrfs/extent_io.c > @@ -1606,6 +1606,7 @@ static noinline_for_stack u64 > find_lock_delalloc_range(struct inode *inode, > /* step two, lock all the pages after the page that has start */ > ret = lock_delalloc_pages(inode, locked_page, > delalloc_start, delalloc_end); > + ASSERT(!ret || ret == -EAGAIN); > if (ret == -EAGAIN) { > /* some of the pages are gone, lets avoid looping by >* shortening the size of the delalloc range we're searching > @@ -1621,7 +1622,6 @@ static noinline_for_stack u64 > find_lock_delalloc_range(struct inode *inode, > goto out_failed; > } > } > - BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ > > /* step three, lock the state bits for the whole range */ > lock_extent_bits(tree, delalloc_start, delalloc_end, _state); >
Re: [PATCH 4/5] btrfs: Sink find_lock_delalloc_range's 'max_bytes' argument
On 2018/10/26 下午7:43, Nikolay Borisov wrote: > All callers of this function pass BTRFS_MAX_EXTENT_SIZE (128M) so let's > reduce the argument count and make that a local variable. No functional > changes. > > Signed-off-by: Nikolay Borisov Reviewed-by: Qu Wenruo Thanks, Qu > --- > fs/btrfs/extent_io.c | 11 +-- > fs/btrfs/extent_io.h | 2 +- > fs/btrfs/tests/extent-io-tests.c | 10 +- > 3 files changed, 11 insertions(+), 12 deletions(-) > > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c > index 6877a74c7469..1a9a521aefe5 100644 > --- a/fs/btrfs/extent_io.c > +++ b/fs/btrfs/extent_io.c > @@ -1566,8 +1566,9 @@ static noinline int lock_delalloc_pages(struct inode > *inode, > static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, > struct extent_io_tree *tree, > struct page *locked_page, u64 *start, > - u64 *end, u64 max_bytes) > + u64 *end) > { > + u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; > u64 delalloc_start; > u64 delalloc_end; > u64 found; > @@ -1647,10 +1648,9 @@ static noinline_for_stack u64 > find_lock_delalloc_range(struct inode *inode, > u64 btrfs_find_lock_delalloc_range(struct inode *inode, > struct extent_io_tree *tree, > struct page *locked_page, u64 *start, > - u64 *end, u64 max_bytes) > + u64 *end) > { > - return find_lock_delalloc_range(inode, tree, locked_page, start, end, > - max_bytes); > + return find_lock_delalloc_range(inode, tree, locked_page, start, end); > } > #endif > > @@ -3233,8 +3233,7 @@ static noinline_for_stack int writepage_delalloc(struct > inode *inode, > nr_delalloc = find_lock_delalloc_range(inode, tree, > page, > _start, > -_end, > -BTRFS_MAX_EXTENT_SIZE); > +_end); > if (nr_delalloc == 0) { > delalloc_start = delalloc_end + 1; > continue; > diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h > index 369daa5d4f73..7ec4f93caf78 100644 > --- a/fs/btrfs/extent_io.h > +++ b/fs/btrfs/extent_io.h > @@ -549,7 +549,7 @@ int free_io_failure(struct extent_io_tree *failure_tree, > u64 btrfs_find_lock_delalloc_range(struct inode *inode, > struct extent_io_tree *tree, > struct page *locked_page, u64 *start, > - u64 *end, u64 max_bytes); > + u64 *end); > #endif > struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, > u64 start); > diff --git a/fs/btrfs/tests/extent-io-tests.c > b/fs/btrfs/tests/extent-io-tests.c > index 9e0f4a01be14..8ea8c2aa6696 100644 > --- a/fs/btrfs/tests/extent-io-tests.c > +++ b/fs/btrfs/tests/extent-io-tests.c > @@ -107,7 +107,7 @@ static int test_find_delalloc(u32 sectorsize) > start = 0; > end = 0; > found = btrfs_find_lock_delalloc_range(inode, , locked_page, , > - , max_bytes); > + ); > if (!found) { > test_err("should have found at least one delalloc"); > goto out_bits; > @@ -138,7 +138,7 @@ static int test_find_delalloc(u32 sectorsize) > start = test_start; > end = 0; > found = btrfs_find_lock_delalloc_range(inode, , locked_page, , > - , max_bytes); > + ); > if (!found) { > test_err("couldn't find delalloc in our range"); > goto out_bits; > @@ -172,7 +172,7 @@ static int test_find_delalloc(u32 sectorsize) > start = test_start; > end = 0; > found = btrfs_find_lock_delalloc_range(inode, , locked_page, , > - , max_bytes); > + ); > if (found) { > test_err("found range when we shouldn't have"); > goto out_bits; > @@ -193,7 +193,7 @@ static int test_find_delalloc(u32 sectorsize) > start = test_start; > end = 0; > found = btrfs_find_lock_delalloc_range(inode, , locked_page, , > - , max_bytes); > + ); > if (!found) { > test_err("didn't find our range"); > goto out_bits; > @@ -234,7 +234,7 @@ static int test_find_delalloc(u32 sectorsize) >* tests expected
Re: [PATCH 3/5] btrfs: Remove superfluous check form btrfs_remove_chunk
On 2018/10/26 下午7:43, Nikolay Borisov wrote: > It's unnecessary to check map->stripes[i].dev for NULL given its value > is already set and dereferenced above the the check. No functional changes. > > Signed-off-by: Nikolay Borisov Reviewed-by: Qu Wenruo Thanks, Qu > --- > fs/btrfs/volumes.c | 12 +--- > 1 file changed, 5 insertions(+), 7 deletions(-) > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index dc53d94a62aa..f0db43d08456 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -2797,13 +2797,11 @@ int btrfs_remove_chunk(struct btrfs_trans_handle > *trans, u64 chunk_offset) > mutex_unlock(_info->chunk_mutex); > } > > - if (map->stripes[i].dev) { > - ret = btrfs_update_device(trans, map->stripes[i].dev); > - if (ret) { > - mutex_unlock(_devices->device_list_mutex); > - btrfs_abort_transaction(trans, ret); > - goto out; > - } > + ret = btrfs_update_device(trans, device); > + if (ret) { > + mutex_unlock(_devices->device_list_mutex); > + btrfs_abort_transaction(trans, ret); > + goto out; > } > } > mutex_unlock(_devices->device_list_mutex); >
Re: [PATCH 1/5] btrfs: Ensure at least 1g is free for balance
On 10/26/18 2:16 PM, Nikolay Borisov wrote: > > (Adding Chris to CC since he is the original author of the code) > > On 26.10.2018 15:09, Hans van Kranenburg wrote: >> On 10/26/18 1:43 PM, Nikolay Borisov wrote: >>> The first part of balance operation is to shrink every constituting >>> device to ensure there is free space for chunk allocation. >> >> A very useful thing to be able to do if there's no unallocated raw disk >> space left, is use balance to rewrite some blockgroup into the free >> space of other ones. >> >> This does not need any raw space for a chunk allocation. Requiring so >> makes it unneccesarily more complicated for users to fix the situation >> they got in. > > The current logic of the code doesn't preclude this use case since if we > can't shrink we just proceed to relocation. So even if 1g is replaced > with 5eb and shrink_device always fails balancing will still proceed. > > However, all of this really makes me wonder do we even need this "first > stage" code in balancing (Chris, any thoughts?). Having something that tries to free up 1G on each device can be very useful when converting to another profile. I guess that's why it is there. And shrink device + grow device is a nice way to try packing some data into other existing block groups. Only... if shrink+grow is done for each device after each other, and there's one with 1G unallocated left, and it can't easily move it into existing free space, then it may just be moving data around to the next disk all the time I guess, ending up with the same situation as before. :D >>> However, the code >>> has been buggy ever since its introduction since calculating the space to >>> shrink >>> the device by was bounded by 1 mb. Most likely the original intention was to >>> have an upper bound of 1g and not 1m, since the largest chunk size is 1g. >>> This >>> means the first stage in __btrfs_balance so far has been a null op since it >>> effectively freed just a single megabyte. >>> >>> Fix this by setting an upper bound of size_to_free of 1g. >>> >>> Signed-off-by: Nikolay Borisov >>> --- >>> fs/btrfs/volumes.c | 2 +- >>> 1 file changed, 1 insertion(+), 1 deletion(-) >>> >>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c >>> index f435d397019e..8b0fd7bf3447 100644 >>> --- a/fs/btrfs/volumes.c >>> +++ b/fs/btrfs/volumes.c >>> @@ -3467,7 +3467,7 @@ static int __btrfs_balance(struct btrfs_fs_info >>> *fs_info) >>> list_for_each_entry(device, devices, dev_list) { >>> old_size = btrfs_device_get_total_bytes(device); >>> size_to_free = div_factor(old_size, 1); >>> - size_to_free = min_t(u64, size_to_free, SZ_1M); >>> + size_to_free = min_t(u64, size_to_free, SZ_1G); >>> if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state) || >>> btrfs_device_get_total_bytes(device) - >>> btrfs_device_get_bytes_used(device) > size_to_free || >>> >> >> -- Hans van Kranenburg
Re: [PATCH 2/5] btrfs: Refactor btrfs_can_relocate
On 2018/10/26 下午7:43, Nikolay Borisov wrote: > btrfs_can_relocate returns 0 when it concludes the given chunk can be > relocated and -1 otherwise. Since this function is used as a predicated > and it return a binary value it makes no sense to have it's return > value as an int so change it to bool. Furthermore, remove a stale > leftover comment from e6ec716f0ddb ("Btrfs: make raid attr array more > readable"). > > Finally make the logic in the list_for_each_entry loop a bit more obvious. Up > until now the fact that we are returning success (0) was a bit masked by the > fact that the 0 is re-used from the return value of find_free_dev_extent. > Instead, now just increment dev_nr if we find a suitable extent and explicitly > set can_reloc to true if enough devices with unallocated space are present. > No functional changes. > > Signed-off-by: Nikolay Borisov Reviewed-by: Qu Wenruo Thanks, Qu > --- > fs/btrfs/ctree.h | 2 +- > fs/btrfs/extent-tree.c | 39 ++- > fs/btrfs/volumes.c | 3 +-- > 3 files changed, 16 insertions(+), 28 deletions(-) > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > index 68ca41dbbef3..06edc4f9ceb2 100644 > --- a/fs/btrfs/ctree.h > +++ b/fs/btrfs/ctree.h > @@ -2680,7 +2680,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle > *trans, > int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); > int btrfs_free_block_groups(struct btrfs_fs_info *info); > int btrfs_read_block_groups(struct btrfs_fs_info *info); > -int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); > +bool btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); > int btrfs_make_block_group(struct btrfs_trans_handle *trans, > u64 bytes_used, u64 type, u64 chunk_offset, > u64 size); > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c > index a1febf155747..816bca482358 100644 > --- a/fs/btrfs/extent-tree.c > +++ b/fs/btrfs/extent-tree.c > @@ -9423,10 +9423,10 @@ void btrfs_dec_block_group_ro(struct > btrfs_block_group_cache *cache) > /* > * checks to see if its even possible to relocate this block group. > * > - * @return - -1 if it's not a good idea to relocate this block group, 0 if > its > - * ok to go ahead and try. > + * @return - false if not enough space can be found for relocation, true > + * otherwise > */ > -int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) > +bool btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) > { > struct btrfs_root *root = fs_info->extent_root; > struct btrfs_block_group_cache *block_group; > @@ -9441,7 +9441,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, > u64 bytenr) > int debug; > int index; > int full = 0; > - int ret = 0; > + bool can_reloc = true; > > debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG); > > @@ -9453,7 +9453,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, > u64 bytenr) > btrfs_warn(fs_info, > "can't find block group for bytenr %llu", > bytenr); > - return -1; > + return false; > } > > min_free = btrfs_block_group_used(_group->item); > @@ -9489,16 +9489,8 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, > u64 bytenr) >* group is going to be restriped, run checks against the target >* profile instead of the current one. >*/ > - ret = -1; > + can_reloc = false; > > - /* > - * index: > - * 0: raid10 > - * 1: raid1 > - * 2: dup > - * 3: raid0 > - * 4: single > - */ > target = get_restripe_target(fs_info, block_group->flags); > if (target) { > index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target)); > @@ -9534,10 +9526,8 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, > u64 bytenr) > > /* We need to do this so that we can look at pending chunks */ > trans = btrfs_join_transaction(root); > - if (IS_ERR(trans)) { > - ret = PTR_ERR(trans); > + if (IS_ERR(trans)) > goto out; > - } > > mutex_lock(_info->chunk_mutex); > list_for_each_entry(device, _devices->alloc_list, dev_alloc_list) { > @@ -9549,18 +9539,17 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, > u64 bytenr) >*/ > if (device->total_bytes > device->bytes_used + min_free && > !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, >dev_state)) > { > - ret = find_free_dev_extent(trans, device, min_free, > -_offset, NULL); > - if (!ret) > + if (!find_free_dev_extent(trans, device, min_free, > +_offset, NULL)) >
Re: [PATCH 1/5] btrfs: Ensure at least 1g is free for balance
On 2018/10/26 下午8:08, Nikolay Borisov wrote: > > > On 26.10.2018 15:04, Qu Wenruo wrote: >> >> >> On 2018/10/26 下午7:43, Nikolay Borisov wrote: >>> The first part of balance operation is to shrink every constituting >>> device to ensure there is free space for chunk allocation. However, the code >>> has been buggy ever since its introduction since calculating the space to >>> shrink >>> the device by was bounded by 1 mb. Most likely the original intention was to >>> have an upper bound of 1g and not 1m, since the largest chunk size is 1g. >> >> Minor nitpick, largest chunk size -> largest chunk stripe size. >> >> As for data chunk, it's possible to get a 10G chunk, but still only 1G >> stripe up limit. >> >>> This >>> means the first stage in __btrfs_balance so far has been a null op since it >>> effectively freed just a single megabyte. >>> >>> Fix this by setting an upper bound of size_to_free of 1g. >> >> One question come to me naturally, what if we failed to shrink the device? >> >> In fact if btrfs_shrink_device() returns ENOSPC we just skip to >> relocation part, so it doesn't look like to cause regression. >> >> If this can be mentioned in the commit message, it would save reviewer >> minutes to read the code. > > Will incorporate it in v2. > >> >> >> >> BTW, I think for that (ret == ENOSPC) after btrfs_shrink_device(), we >> should continue other than break, to get more chance to secure >> unallocated space. > > I agree but this should be done in a separate patch, this one deals with > the silly upper bound of 1m. No problem, just a hint for a new patch :) Thanks, Qu > >> >> Thanks, >> Qu >> >>> >>> Signed-off-by: Nikolay Borisov >>> --- >>> fs/btrfs/volumes.c | 2 +- >>> 1 file changed, 1 insertion(+), 1 deletion(-) >>> >>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c >>> index f435d397019e..8b0fd7bf3447 100644 >>> --- a/fs/btrfs/volumes.c >>> +++ b/fs/btrfs/volumes.c >>> @@ -3467,7 +3467,7 @@ static int __btrfs_balance(struct btrfs_fs_info >>> *fs_info) >>> list_for_each_entry(device, devices, dev_list) { >>> old_size = btrfs_device_get_total_bytes(device); >>> size_to_free = div_factor(old_size, 1); >>> - size_to_free = min_t(u64, size_to_free, SZ_1M); >>> + size_to_free = min_t(u64, size_to_free, SZ_1G); >>> if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state) || >>> btrfs_device_get_total_bytes(device) - >>> btrfs_device_get_bytes_used(device) > size_to_free || >>> >>
Re: [PATCH] btrfs: Fix error handling in btrfs_cleanup_ordered_extents
Hi Nikolay, Thank you for the patch! Yet something to improve: [auto build test ERROR on v4.19] [also build test ERROR on next-20181019] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Nikolay-Borisov/btrfs-Fix-error-handling-in-btrfs_cleanup_ordered_extents/20181026-194005 config: i386-randconfig-x007-201842 (attached as .config) compiler: gcc-7 (Debian 7.3.0-1) 7.3.0 reproduce: # save the attached .config to linux build tree make ARCH=i386 All errors (new ones prefixed by >>): fs//btrfs/inode.c: In function 'btrfs_cleanup_ordered_extents': >> fs//btrfs/inode.c:140:63: error: expected ')' before '{' token if (page_start >= offset && page_end <= (offset + bytes - 1) { ^ >> fs//btrfs/inode.c:146:1: error: expected expression before '}' token } ^ vim +140 fs//btrfs/inode.c 86 87 static int btrfs_setsize(struct inode *inode, struct iattr *attr); 88 static int btrfs_truncate(struct inode *inode, bool skip_writeback); 89 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); 90 static noinline int cow_file_range(struct inode *inode, 91 struct page *locked_page, 92 u64 start, u64 end, u64 delalloc_end, 93 int *page_started, unsigned long *nr_written, 94 int unlock, struct btrfs_dedupe_hash *hash); 95 static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, 96 u64 orig_start, u64 block_start, 97 u64 block_len, u64 orig_block_len, 98 u64 ram_bytes, int compress_type, 99 int type); 100 101 static void __endio_write_update_ordered(struct inode *inode, 102 const u64 offset, const u64 bytes, 103 const bool uptodate); 104 105 /* 106 * Cleanup all submitted ordered extents in specified range to handle errors 107 * from the fill_dellaloc() callback. 108 * 109 * NOTE: caller must ensure that when an error happens, it can not call 110 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING 111 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata 112 * to be released, which we want to happen only when finishing the ordered 113 * extent (btrfs_finish_ordered_io()). 114 */ 115 static inline void btrfs_cleanup_ordered_extents(struct inode *inode, 116 struct page *locked_page, 117 u64 offset, u64 bytes) 118 { 119 unsigned long index = offset >> PAGE_SHIFT; 120 unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; 121 u64 page_start = page_offset(locked_page); 122 u64 page_end = page_start + PAGE_SIZE - 1; 123 124 struct page *page; 125 126 while (index <= end_index) { 127 page = find_get_page(inode->i_mapping, index); 128 index++; 129 if (!page) 130 continue; 131 ClearPagePrivate2(page); 132 put_page(page); 133 } 134 135 /* 136 * In case this page belongs to the delalloc range being instantiated 137 * then skip it, since the first page of a range is going to be 138 * properly cleaned up by the caller of run_delalloc_range 139 */ > 140 if (page_start >= offset && page_end <= (offset + bytes - 1) { 141 offset += PAGE_SIZE; 142 bytes -= PAGE_SIZE; 143 } 144 145 return __endio_write_update_ordered(inode, offset, bytes, false); > 146 } 147 --- 0-DAY kernel test infrastructureOpen Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation .config.gz Description: application/gzip
Re: [PATCH] btrfs-progs: add cli to forget one or all scanned devices
On 24.10.2018 07:31, Anand Jain wrote: > This patch adds cli > btrfs device forget [dev] > to remove the given device structure in the kernel if the device > is unmounted. If no argument is given it shall remove all stale > (device which are not mounted) from the kernel. > > Signed-off-by: Anand Jain > --- > cmds-device.c | 59 > +++ > ioctl.h | 2 ++ > 2 files changed, 53 insertions(+), 8 deletions(-) > > diff --git a/cmds-device.c b/cmds-device.c > index 2a05f70a76a9..c2a2b7f304b8 100644 > --- a/cmds-device.c > +++ b/cmds-device.c > @@ -254,10 +254,32 @@ static int cmd_device_delete(int argc, char **argv) > return _cmd_device_remove(argc, argv, cmd_device_delete_usage); > } > > +static int btrfs_forget_devices(char *path) > +{ > + struct btrfs_ioctl_vol_args args; > + int ret; > + int fd; > + > + fd = open("/dev/btrfs-control", O_RDWR); > + if (fd < 0) > + return -errno; > + > + memset(, 0, sizeof(args)); > + if (path) > + strncpy_null(args.name, path); > + ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, ); > + if (ret) > + ret = -errno; > + close(fd); > + return ret; > +} > + > static const char * const cmd_device_scan_usage[] = { > - "btrfs device scan [(-d|--all-devices)| [...]]", > - "Scan devices for a btrfs filesystem", > + "btrfs device scan [(-d|--all-devices)|(-u|--forget)| "\ > + "[...]]", > + "Scan or forget (deregister) devices for a btrfs filesystem", > " -d|--all-devices (deprecated)", > + " -u|--forget [ ..]", > NULL > }; > > @@ -267,32 +289,45 @@ static int cmd_device_scan(int argc, char **argv) > int devstart; > int all = 0; > int ret = 0; > + int forget = 0; > > optind = 0; > while (1) { > int c; > static const struct option long_options[] = { > { "all-devices", no_argument, NULL, 'd'}, > + { "forget", no_argument, NULL, 'u'}, > { NULL, 0, NULL, 0} > }; > > - c = getopt_long(argc, argv, "d", long_options, NULL); > + c = getopt_long(argc, argv, "du", long_options, NULL); > if (c < 0) > break; > switch (c) { > case 'd': > all = 1; > break; > + case 'u': > + forget = 1; > + break; > default: > usage(cmd_device_scan_usage); > } > } > devstart = optind; > > + if (all && forget) > + usage(cmd_device_scan_usage); > + > if (all && check_argc_max(argc - optind, 1)) > usage(cmd_device_scan_usage); > > if (all || argc - optind == 0) { > + if (forget) { > + ret = btrfs_forget_devices(NULL); > + error_on(ret, "error %d while running forget", ret); > + goto out; > + } nit: I would prefer to also have an if () {} else {} construct here, similar to what you do below. > printf("Scanning for Btrfs filesystems\n"); > ret = btrfs_scan_devices(); > error_on(ret, "error %d while scanning", ret); > @@ -315,11 +350,19 @@ static int cmd_device_scan(int argc, char **argv) > ret = 1; > goto out; > } > - printf("Scanning for Btrfs filesystems in '%s'\n", path); > - if (btrfs_register_one_device(path) != 0) { > - ret = 1; > - free(path); > - goto out; > + if (forget) { > + ret = btrfs_forget_devices(path); > + if (ret) > + error("Can't forget '%s': %s", > + path, strerror(-ret)); > + } else { > + printf("Scanning for Btrfs filesystems in '%s'\n", > + path); > + if (btrfs_register_one_device(path) != 0) { > + ret = 1; > + free(path); > + goto out; > + } > } > free(path); > } > diff --git a/ioctl.h b/ioctl.h > index 709e996f401c..e27d80e09392 100644 > --- a/ioctl.h > +++ b/ioctl.h > @@ -721,6 +721,8 @@ static inline char *btrfs_err_str(enum btrfs_err_code > err_code) > struct btrfs_ioctl_vol_args) > #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ > struct btrfs_ioctl_vol_args) > +#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC,
Re: [PATCH 1/5] btrfs: Ensure at least 1g is free for balance
(Adding Chris to CC since he is the original author of the code) On 26.10.2018 15:09, Hans van Kranenburg wrote: > On 10/26/18 1:43 PM, Nikolay Borisov wrote: >> The first part of balance operation is to shrink every constituting >> device to ensure there is free space for chunk allocation. > > A very useful thing to be able to do if there's no unallocated raw disk > space left, is use balance to rewrite some blockgroup into the free > space of other ones. > > This does not need any raw space for a chunk allocation. Requiring so > makes it unneccesarily more complicated for users to fix the situation > they got in. The current logic of the code doesn't preclude this use case since if we can't shrink we just proceed to relocation. So even if 1g is replaced with 5eb and shrink_device always fails balancing will still proceed. However, all of this really makes me wonder do we even need this "first stage" code in balancing (Chris, any thoughts?). > >> However, the code >> has been buggy ever since its introduction since calculating the space to >> shrink >> the device by was bounded by 1 mb. Most likely the original intention was to >> have an upper bound of 1g and not 1m, since the largest chunk size is 1g. >> This >> means the first stage in __btrfs_balance so far has been a null op since it >> effectively freed just a single megabyte. >> >> Fix this by setting an upper bound of size_to_free of 1g. >> >> Signed-off-by: Nikolay Borisov >> --- >> fs/btrfs/volumes.c | 2 +- >> 1 file changed, 1 insertion(+), 1 deletion(-) >> >> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c >> index f435d397019e..8b0fd7bf3447 100644 >> --- a/fs/btrfs/volumes.c >> +++ b/fs/btrfs/volumes.c >> @@ -3467,7 +3467,7 @@ static int __btrfs_balance(struct btrfs_fs_info >> *fs_info) >> list_for_each_entry(device, devices, dev_list) { >> old_size = btrfs_device_get_total_bytes(device); >> size_to_free = div_factor(old_size, 1); >> -size_to_free = min_t(u64, size_to_free, SZ_1M); >> +size_to_free = min_t(u64, size_to_free, SZ_1G); >> if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state) || >> btrfs_device_get_total_bytes(device) - >> btrfs_device_get_bytes_used(device) > size_to_free || >> > >
Re: [PATCH 1/5] btrfs: Ensure at least 1g is free for balance
On 10/26/18 1:43 PM, Nikolay Borisov wrote: > The first part of balance operation is to shrink every constituting > device to ensure there is free space for chunk allocation. A very useful thing to be able to do if there's no unallocated raw disk space left, is use balance to rewrite some blockgroup into the free space of other ones. This does not need any raw space for a chunk allocation. Requiring so makes it unneccesarily more complicated for users to fix the situation they got in. > However, the code > has been buggy ever since its introduction since calculating the space to > shrink > the device by was bounded by 1 mb. Most likely the original intention was to > have an upper bound of 1g and not 1m, since the largest chunk size is 1g. > This > means the first stage in __btrfs_balance so far has been a null op since it > effectively freed just a single megabyte. > > Fix this by setting an upper bound of size_to_free of 1g. > > Signed-off-by: Nikolay Borisov > --- > fs/btrfs/volumes.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index f435d397019e..8b0fd7bf3447 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -3467,7 +3467,7 @@ static int __btrfs_balance(struct btrfs_fs_info > *fs_info) > list_for_each_entry(device, devices, dev_list) { > old_size = btrfs_device_get_total_bytes(device); > size_to_free = div_factor(old_size, 1); > - size_to_free = min_t(u64, size_to_free, SZ_1M); > + size_to_free = min_t(u64, size_to_free, SZ_1G); > if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state) || > btrfs_device_get_total_bytes(device) - > btrfs_device_get_bytes_used(device) > size_to_free || > -- Hans van Kranenburg
Re: [PATCH 1/5] btrfs: Ensure at least 1g is free for balance
On 26.10.2018 15:04, Qu Wenruo wrote: > > > On 2018/10/26 下午7:43, Nikolay Borisov wrote: >> The first part of balance operation is to shrink every constituting >> device to ensure there is free space for chunk allocation. However, the code >> has been buggy ever since its introduction since calculating the space to >> shrink >> the device by was bounded by 1 mb. Most likely the original intention was to >> have an upper bound of 1g and not 1m, since the largest chunk size is 1g. > > Minor nitpick, largest chunk size -> largest chunk stripe size. > > As for data chunk, it's possible to get a 10G chunk, but still only 1G > stripe up limit. > >> This >> means the first stage in __btrfs_balance so far has been a null op since it >> effectively freed just a single megabyte. >> >> Fix this by setting an upper bound of size_to_free of 1g. > > One question come to me naturally, what if we failed to shrink the device? > > In fact if btrfs_shrink_device() returns ENOSPC we just skip to > relocation part, so it doesn't look like to cause regression. > > If this can be mentioned in the commit message, it would save reviewer > minutes to read the code. Will incorporate it in v2. > > > > BTW, I think for that (ret == ENOSPC) after btrfs_shrink_device(), we > should continue other than break, to get more chance to secure > unallocated space. I agree but this should be done in a separate patch, this one deals with the silly upper bound of 1m. > > Thanks, > Qu > >> >> Signed-off-by: Nikolay Borisov >> --- >> fs/btrfs/volumes.c | 2 +- >> 1 file changed, 1 insertion(+), 1 deletion(-) >> >> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c >> index f435d397019e..8b0fd7bf3447 100644 >> --- a/fs/btrfs/volumes.c >> +++ b/fs/btrfs/volumes.c >> @@ -3467,7 +3467,7 @@ static int __btrfs_balance(struct btrfs_fs_info >> *fs_info) >> list_for_each_entry(device, devices, dev_list) { >> old_size = btrfs_device_get_total_bytes(device); >> size_to_free = div_factor(old_size, 1); >> -size_to_free = min_t(u64, size_to_free, SZ_1M); >> +size_to_free = min_t(u64, size_to_free, SZ_1G); >> if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state) || >> btrfs_device_get_total_bytes(device) - >> btrfs_device_get_bytes_used(device) > size_to_free || >> >
Re: [PATCH 1/5] btrfs: Ensure at least 1g is free for balance
On 2018/10/26 下午7:43, Nikolay Borisov wrote: > The first part of balance operation is to shrink every constituting > device to ensure there is free space for chunk allocation. However, the code > has been buggy ever since its introduction since calculating the space to > shrink > the device by was bounded by 1 mb. Most likely the original intention was to > have an upper bound of 1g and not 1m, since the largest chunk size is 1g. Minor nitpick, largest chunk size -> largest chunk stripe size. As for data chunk, it's possible to get a 10G chunk, but still only 1G stripe up limit. > This > means the first stage in __btrfs_balance so far has been a null op since it > effectively freed just a single megabyte. > > Fix this by setting an upper bound of size_to_free of 1g. One question come to me naturally, what if we failed to shrink the device? In fact if btrfs_shrink_device() returns ENOSPC we just skip to relocation part, so it doesn't look like to cause regression. If this can be mentioned in the commit message, it would save reviewer minutes to read the code. BTW, I think for that (ret == ENOSPC) after btrfs_shrink_device(), we should continue other than break, to get more chance to secure unallocated space. Thanks, Qu > > Signed-off-by: Nikolay Borisov > --- > fs/btrfs/volumes.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index f435d397019e..8b0fd7bf3447 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -3467,7 +3467,7 @@ static int __btrfs_balance(struct btrfs_fs_info > *fs_info) > list_for_each_entry(device, devices, dev_list) { > old_size = btrfs_device_get_total_bytes(device); > size_to_free = div_factor(old_size, 1); > - size_to_free = min_t(u64, size_to_free, SZ_1M); > + size_to_free = min_t(u64, size_to_free, SZ_1G); > if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state) || > btrfs_device_get_total_bytes(device) - > btrfs_device_get_bytes_used(device) > size_to_free || >
Re: [PATCH v2] btrfs: Fix error handling in btrfs_cleanup_ordered_extents
On 26.10.2018 14:53, Qu Wenruo wrote: > > > On 2018/10/26 下午7:41, Nikolay Borisov wrote: >> Running btrfs/124 in a loop hung up on me sporadically with the >> following call trace: >> btrfs D0 5760 5324 0x >> Call Trace: >> ? __schedule+0x243/0x800 >> schedule+0x33/0x90 >> btrfs_start_ordered_extent+0x10c/0x1b0 [btrfs] >> ? wait_woken+0xa0/0xa0 >> btrfs_wait_ordered_range+0xbb/0x100 [btrfs] >> btrfs_relocate_block_group+0x1ff/0x230 [btrfs] >> btrfs_relocate_chunk+0x49/0x100 [btrfs] >> btrfs_balance+0xbeb/0x1740 [btrfs] >> btrfs_ioctl_balance+0x2ee/0x380 [btrfs] >> btrfs_ioctl+0x1691/0x3110 [btrfs] >> ? lockdep_hardirqs_on+0xed/0x180 >> ? __handle_mm_fault+0x8e7/0xfb0 >> ? _raw_spin_unlock+0x24/0x30 >> ? __handle_mm_fault+0x8e7/0xfb0 >> ? do_vfs_ioctl+0xa5/0x6e0 >> ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] >> do_vfs_ioctl+0xa5/0x6e0 >> ? entry_SYSCALL_64_after_hwframe+0x3e/0xbe >> ksys_ioctl+0x3a/0x70 >> __x64_sys_ioctl+0x16/0x20 >> do_syscall_64+0x60/0x1b0 >> entry_SYSCALL_64_after_hwframe+0x49/0xbe >> >> Turns out during page writeback it's possible that the code in >> writepage_delalloc can instantiate a delalloc range which doesn't >> belong to the page currently being written back. > > Just a nitpick, would you please split long paragraphs with newlines? > > It makes reviewers' eyes less painful. > >> This happens since >> find_lock_delalloc_range returns up to BTRFS_MAX_EXTENT_SIZE delalloc >> range when asked and doens't really consider the range of the passed >> page. > > >> When such a foregin range is found the code proceeds to >> run_delalloc_range and calls the appropriate function to fill the >> delalloc and create ordered extents. If, however, a failure occurs >> while this operation is in effect then the clean up code in >> btrfs_cleanup_ordered_extents will be called. This function has the >> wrong assumption that caller of run_delalloc_range always properly >> cleans the first page of the range hence when it calls >> __endio_write_update_ordered it explicitly ommits the first page of >> the delalloc range. > > Yes, that's the old assumption, at least well explained by some ascii > art. (even it's wrong) > >> This is wrong because this function could be >> cleaning a delalloc range that doesn't belong to the current page. This >> in turn means that the page cleanup code in __extent_writepage will >> not really free the initial page for the range, leaving a hanging >> ordered extent with bytes_left set to 4k. This bug has been present >> ever since the original introduction of the cleanup code in 524272607e88. > > The cause sounds valid, however would you please explain more about how > such cleaning on unrelated delalloc range happens? So in my case the following happened - 2 block groups were created as delalloc ranges in the - 0-1m and 1m-128m. Their respective pages were dirtied, so when page 0 - 0-4k when into writepage_delalloc, find_lock_delalloc_range would return the range 0-1m. So the call to fill_delalloc instantiates OE 0-1m and writeback continues as expected. Now, when the 2nd page from range 0-1m is again set for writeback and find_lock_delalloc_range is called with delalloc_start == 4096 it will actually return the range 1m-128m. Then fill_delalloc is called with locked_page belonging to the range which was already instantiated and the start/end arguments pointing to 1m-128m if an error occurred in run_delalloc_range in this case then btrfs_cleanup_ordered_extents will be called which will clear Private2 bit for pages belonging to 1m-128m range and the OE will be cleared of all but the first page (since the code wrongly assumed locked_page always belongs to the range currently being instantiated). > > Even the fix looks solid, it's still better to explain the cause a > little more, as the more we understand the cause, the better solution we > may get. > >> >> Fix this by correctly checking whether the current page belongs to the >> range being instantiated and if so correctly adjust the range parameters >> passed for cleaning up. If it doesn't, then just clean the whole OE >> range directly. > > And the solution also looks good to me, and much more robust, without > any (possibly wrong) assumption. > > Thanks, > Qu > >> >> Signed-off-by: Nikolay Borisov >> Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid >> ordered extent hang") >> --- >> >> V2: >> * Fix compilation failure due to missing parentheses >> * Fixed the "Fixes" tag. >> >> fs/btrfs/inode.c | 29 - >> 1 file changed, 20 insertions(+), 9 deletions(-) >> >> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c >> index e1f00d8c24ce..5906564ae2e9 100644 >> --- a/fs/btrfs/inode.c >> +++ b/fs/btrfs/inode.c >> @@ -109,17 +109,17 @@ static void
Re: [PATCH v2] btrfs: Fix error handling in btrfs_cleanup_ordered_extents
On 2018/10/26 下午7:41, Nikolay Borisov wrote: > Running btrfs/124 in a loop hung up on me sporadically with the > following call trace: > btrfs D0 5760 5324 0x > Call Trace: >? __schedule+0x243/0x800 >schedule+0x33/0x90 >btrfs_start_ordered_extent+0x10c/0x1b0 [btrfs] >? wait_woken+0xa0/0xa0 >btrfs_wait_ordered_range+0xbb/0x100 [btrfs] >btrfs_relocate_block_group+0x1ff/0x230 [btrfs] >btrfs_relocate_chunk+0x49/0x100 [btrfs] >btrfs_balance+0xbeb/0x1740 [btrfs] >btrfs_ioctl_balance+0x2ee/0x380 [btrfs] >btrfs_ioctl+0x1691/0x3110 [btrfs] >? lockdep_hardirqs_on+0xed/0x180 >? __handle_mm_fault+0x8e7/0xfb0 >? _raw_spin_unlock+0x24/0x30 >? __handle_mm_fault+0x8e7/0xfb0 >? do_vfs_ioctl+0xa5/0x6e0 >? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] >do_vfs_ioctl+0xa5/0x6e0 >? entry_SYSCALL_64_after_hwframe+0x3e/0xbe >ksys_ioctl+0x3a/0x70 >__x64_sys_ioctl+0x16/0x20 >do_syscall_64+0x60/0x1b0 >entry_SYSCALL_64_after_hwframe+0x49/0xbe > > Turns out during page writeback it's possible that the code in > writepage_delalloc can instantiate a delalloc range which doesn't > belong to the page currently being written back. Just a nitpick, would you please split long paragraphs with newlines? It makes reviewers' eyes less painful. > This happens since > find_lock_delalloc_range returns up to BTRFS_MAX_EXTENT_SIZE delalloc > range when asked and doens't really consider the range of the passed > page. > When such a foregin range is found the code proceeds to > run_delalloc_range and calls the appropriate function to fill the > delalloc and create ordered extents. If, however, a failure occurs > while this operation is in effect then the clean up code in > btrfs_cleanup_ordered_extents will be called. This function has the > wrong assumption that caller of run_delalloc_range always properly > cleans the first page of the range hence when it calls > __endio_write_update_ordered it explicitly ommits the first page of > the delalloc range. Yes, that's the old assumption, at least well explained by some ascii art. (even it's wrong) > This is wrong because this function could be > cleaning a delalloc range that doesn't belong to the current page. This > in turn means that the page cleanup code in __extent_writepage will > not really free the initial page for the range, leaving a hanging > ordered extent with bytes_left set to 4k. This bug has been present > ever since the original introduction of the cleanup code in 524272607e88. The cause sounds valid, however would you please explain more about how such cleaning on unrelated delalloc range happens? Even the fix looks solid, it's still better to explain the cause a little more, as the more we understand the cause, the better solution we may get. > > Fix this by correctly checking whether the current page belongs to the > range being instantiated and if so correctly adjust the range parameters > passed for cleaning up. If it doesn't, then just clean the whole OE > range directly. And the solution also looks good to me, and much more robust, without any (possibly wrong) assumption. Thanks, Qu > > Signed-off-by: Nikolay Borisov > Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered > extent hang") > --- > > V2: > * Fix compilation failure due to missing parentheses > * Fixed the "Fixes" tag. > > fs/btrfs/inode.c | 29 - > 1 file changed, 20 insertions(+), 9 deletions(-) > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index e1f00d8c24ce..5906564ae2e9 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -109,17 +109,17 @@ static void __endio_write_update_ordered(struct inode > *inode, > * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING > * and EXTENT_DELALLOC simultaneously, because that causes the reserved > metadata > * to be released, which we want to happen only when finishing the ordered > - * extent (btrfs_finish_ordered_io()). Also note that the caller of the > - * fill_delalloc() callback already does proper cleanup for the first page of > - * the range, that is, it invokes the callback writepage_end_io_hook() for > the > - * range of the first page. > + * extent (btrfs_finish_ordered_io()). > */ > static inline void btrfs_cleanup_ordered_extents(struct inode *inode, > - const u64 offset, > - const u64 bytes) > + struct page *locked_page, > + u64 offset, u64 bytes) > { > unsigned long index = offset >> PAGE_SHIFT; > unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; > + u64 page_start = page_offset(locked_page); > + u64
[PATCH 3/5] btrfs: Remove superfluous check form btrfs_remove_chunk
It's unnecessary to check map->stripes[i].dev for NULL given its value is already set and dereferenced above the the check. No functional changes. Signed-off-by: Nikolay Borisov --- fs/btrfs/volumes.c | 12 +--- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dc53d94a62aa..f0db43d08456 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2797,13 +2797,11 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) mutex_unlock(_info->chunk_mutex); } - if (map->stripes[i].dev) { - ret = btrfs_update_device(trans, map->stripes[i].dev); - if (ret) { - mutex_unlock(_devices->device_list_mutex); - btrfs_abort_transaction(trans, ret); - goto out; - } + ret = btrfs_update_device(trans, device); + if (ret) { + mutex_unlock(_devices->device_list_mutex); + btrfs_abort_transaction(trans, ret); + goto out; } } mutex_unlock(_devices->device_list_mutex); -- 2.7.4
[PATCH 5/5] btrfs: Replace BUG_ON with ASSERT in find_lock_delalloc_range
lock_delalloc_pages should only return 2 values - 0 in case of success and -EAGAIN if the range of pages to be locked should be shrunk due to some of gone. Manual inspections confirms that this is indeed the case since __process_pages_contig is where lock_delalloc_pages gets its return value. The latter always returns 0 or -EAGAIN so the invariant holds. No functional changes. Signed-off-by: Nikolay Borisov --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1a9a521aefe5..94bc53472031 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1606,6 +1606,7 @@ static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, /* step two, lock all the pages after the page that has start */ ret = lock_delalloc_pages(inode, locked_page, delalloc_start, delalloc_end); + ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { /* some of the pages are gone, lets avoid looping by * shortening the size of the delalloc range we're searching @@ -1621,7 +1622,6 @@ static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, goto out_failed; } } - BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ lock_extent_bits(tree, delalloc_start, delalloc_end, _state); -- 2.7.4
[PATCH 1/5] btrfs: Ensure at least 1g is free for balance
The first part of balance operation is to shrink every constituting device to ensure there is free space for chunk allocation. However, the code has been buggy ever since its introduction since calculating the space to shrink the device by was bounded by 1 mb. Most likely the original intention was to have an upper bound of 1g and not 1m, since the largest chunk size is 1g. This means the first stage in __btrfs_balance so far has been a null op since it effectively freed just a single megabyte. Fix this by setting an upper bound of size_to_free of 1g. Signed-off-by: Nikolay Borisov --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f435d397019e..8b0fd7bf3447 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3467,7 +3467,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) list_for_each_entry(device, devices, dev_list) { old_size = btrfs_device_get_total_bytes(device); size_to_free = div_factor(old_size, 1); - size_to_free = min_t(u64, size_to_free, SZ_1M); + size_to_free = min_t(u64, size_to_free, SZ_1G); if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state) || btrfs_device_get_total_bytes(device) - btrfs_device_get_bytes_used(device) > size_to_free || -- 2.7.4
[PATCH 0/5] Misc cleanups in balance code
While investigating the balance hang I came across various inconsistencies in the source. This series aims to fix those. The first patch is (I believe) a fix to a longstanding bug that could cause balance to fail due to ENOSPC. The code no properly ensures that there is at least 1g of unallocated space on every device being balance. Patch 2 makes btrfs_can_relocate a bit more obvious and removes leftovers from previous cleanup Patches 3/4/5 remove some redundant code from various functions. This series has survived multiple xfstest runs. Nikolay Borisov (5): btrfs: Ensure at least 1g is free for balance btrfs: Refactor btrfs_can_relocate btrfs: Remove superfluous check form btrfs_remove_chunk btrfs: Sink find_lock_delalloc_range's 'max_bytes' argument btrfs: Replace BUG_ON with ASSERT in find_lock_delalloc_range fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 39 ++- fs/btrfs/extent_io.c | 13 ++--- fs/btrfs/extent_io.h | 2 +- fs/btrfs/tests/extent-io-tests.c | 10 +- fs/btrfs/volumes.c | 17 +++-- 6 files changed, 34 insertions(+), 49 deletions(-) -- 2.7.4
[PATCH 2/5] btrfs: Refactor btrfs_can_relocate
btrfs_can_relocate returns 0 when it concludes the given chunk can be relocated and -1 otherwise. Since this function is used as a predicated and it return a binary value it makes no sense to have it's return value as an int so change it to bool. Furthermore, remove a stale leftover comment from e6ec716f0ddb ("Btrfs: make raid attr array more readable"). Finally make the logic in the list_for_each_entry loop a bit more obvious. Up until now the fact that we are returning success (0) was a bit masked by the fact that the 0 is re-used from the return value of find_free_dev_extent. Instead, now just increment dev_nr if we find a suitable extent and explicitly set can_reloc to true if enough devices with unallocated space are present. No functional changes. Signed-off-by: Nikolay Borisov --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 39 ++- fs/btrfs/volumes.c | 3 +-- 3 files changed, 16 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 68ca41dbbef3..06edc4f9ceb2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2680,7 +2680,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_read_block_groups(struct btrfs_fs_info *info); -int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); +bool btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, u64 type, u64 chunk_offset, u64 size); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a1febf155747..816bca482358 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9423,10 +9423,10 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) /* * checks to see if its even possible to relocate this block group. * - * @return - -1 if it's not a good idea to relocate this block group, 0 if its - * ok to go ahead and try. + * @return - false if not enough space can be found for relocation, true + * otherwise */ -int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) +bool btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_root *root = fs_info->extent_root; struct btrfs_block_group_cache *block_group; @@ -9441,7 +9441,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) int debug; int index; int full = 0; - int ret = 0; + bool can_reloc = true; debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG); @@ -9453,7 +9453,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) btrfs_warn(fs_info, "can't find block group for bytenr %llu", bytenr); - return -1; + return false; } min_free = btrfs_block_group_used(_group->item); @@ -9489,16 +9489,8 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) * group is going to be restriped, run checks against the target * profile instead of the current one. */ - ret = -1; + can_reloc = false; - /* -* index: -* 0: raid10 -* 1: raid1 -* 2: dup -* 3: raid0 -* 4: single -*/ target = get_restripe_target(fs_info, block_group->flags); if (target) { index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target)); @@ -9534,10 +9526,8 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) /* We need to do this so that we can look at pending chunks */ trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); + if (IS_ERR(trans)) goto out; - } mutex_lock(_info->chunk_mutex); list_for_each_entry(device, _devices->alloc_list, dev_alloc_list) { @@ -9549,18 +9539,17 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) */ if (device->total_bytes > device->bytes_used + min_free && !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, >dev_state)) { - ret = find_free_dev_extent(trans, device, min_free, - _offset, NULL); - if (!ret) + if (!find_free_dev_extent(trans, device, min_free, + _offset, NULL)) dev_nr++; - if (dev_nr >= dev_min) + if (dev_nr >= dev_min) { + can_reloc = true; break; - -
[PATCH 4/5] btrfs: Sink find_lock_delalloc_range's 'max_bytes' argument
All callers of this function pass BTRFS_MAX_EXTENT_SIZE (128M) so let's reduce the argument count and make that a local variable. No functional changes. Signed-off-by: Nikolay Borisov --- fs/btrfs/extent_io.c | 11 +-- fs/btrfs/extent_io.h | 2 +- fs/btrfs/tests/extent-io-tests.c | 10 +- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6877a74c7469..1a9a521aefe5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1566,8 +1566,9 @@ static noinline int lock_delalloc_pages(struct inode *inode, static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes) + u64 *end) { + u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; u64 found; @@ -1647,10 +1648,9 @@ static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, u64 btrfs_find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes) + u64 *end) { - return find_lock_delalloc_range(inode, tree, locked_page, start, end, - max_bytes); + return find_lock_delalloc_range(inode, tree, locked_page, start, end); } #endif @@ -3233,8 +3233,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, nr_delalloc = find_lock_delalloc_range(inode, tree, page, _start, - _end, - BTRFS_MAX_EXTENT_SIZE); + _end); if (nr_delalloc == 0) { delalloc_start = delalloc_end + 1; continue; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 369daa5d4f73..7ec4f93caf78 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -549,7 +549,7 @@ int free_io_failure(struct extent_io_tree *failure_tree, u64 btrfs_find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes); + u64 *end); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 9e0f4a01be14..8ea8c2aa6696 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -107,7 +107,7 @@ static int test_find_delalloc(u32 sectorsize) start = 0; end = 0; found = btrfs_find_lock_delalloc_range(inode, , locked_page, , -, max_bytes); +); if (!found) { test_err("should have found at least one delalloc"); goto out_bits; @@ -138,7 +138,7 @@ static int test_find_delalloc(u32 sectorsize) start = test_start; end = 0; found = btrfs_find_lock_delalloc_range(inode, , locked_page, , -, max_bytes); +); if (!found) { test_err("couldn't find delalloc in our range"); goto out_bits; @@ -172,7 +172,7 @@ static int test_find_delalloc(u32 sectorsize) start = test_start; end = 0; found = btrfs_find_lock_delalloc_range(inode, , locked_page, , -, max_bytes); +); if (found) { test_err("found range when we shouldn't have"); goto out_bits; @@ -193,7 +193,7 @@ static int test_find_delalloc(u32 sectorsize) start = test_start; end = 0; found = btrfs_find_lock_delalloc_range(inode, , locked_page, , -, max_bytes); +); if (!found) { test_err("didn't find our range"); goto out_bits; @@ -234,7 +234,7 @@ static int test_find_delalloc(u32 sectorsize) * tests expected behavior. */ found = btrfs_find_lock_delalloc_range(inode, , locked_page, , -, max_bytes); +
[PATCH v2] btrfs: Fix error handling in btrfs_cleanup_ordered_extents
Running btrfs/124 in a loop hung up on me sporadically with the following call trace: btrfs D0 5760 5324 0x Call Trace: ? __schedule+0x243/0x800 schedule+0x33/0x90 btrfs_start_ordered_extent+0x10c/0x1b0 [btrfs] ? wait_woken+0xa0/0xa0 btrfs_wait_ordered_range+0xbb/0x100 [btrfs] btrfs_relocate_block_group+0x1ff/0x230 [btrfs] btrfs_relocate_chunk+0x49/0x100 [btrfs] btrfs_balance+0xbeb/0x1740 [btrfs] btrfs_ioctl_balance+0x2ee/0x380 [btrfs] btrfs_ioctl+0x1691/0x3110 [btrfs] ? lockdep_hardirqs_on+0xed/0x180 ? __handle_mm_fault+0x8e7/0xfb0 ? _raw_spin_unlock+0x24/0x30 ? __handle_mm_fault+0x8e7/0xfb0 ? do_vfs_ioctl+0xa5/0x6e0 ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] do_vfs_ioctl+0xa5/0x6e0 ? entry_SYSCALL_64_after_hwframe+0x3e/0xbe ksys_ioctl+0x3a/0x70 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x60/0x1b0 entry_SYSCALL_64_after_hwframe+0x49/0xbe Turns out during page writeback it's possible that the code in writepage_delalloc can instantiate a delalloc range which doesn't belong to the page currently being written back. This happens since find_lock_delalloc_range returns up to BTRFS_MAX_EXTENT_SIZE delalloc range when asked and doens't really consider the range of the passed page. When such a foregin range is found the code proceeds to run_delalloc_range and calls the appropriate function to fill the delalloc and create ordered extents. If, however, a failure occurs while this operation is in effect then the clean up code in btrfs_cleanup_ordered_extents will be called. This function has the wrong assumption that caller of run_delalloc_range always properly cleans the first page of the range hence when it calls __endio_write_update_ordered it explicitly ommits the first page of the delalloc range. This is wrong because this function could be cleaning a delalloc range that doesn't belong to the current page. This in turn means that the page cleanup code in __extent_writepage will not really free the initial page for the range, leaving a hanging ordered extent with bytes_left set to 4k. This bug has been present ever since the original introduction of the cleanup code in 524272607e88. Fix this by correctly checking whether the current page belongs to the range being instantiated and if so correctly adjust the range parameters passed for cleaning up. If it doesn't, then just clean the whole OE range directly. Signed-off-by: Nikolay Borisov Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang") --- V2: * Fix compilation failure due to missing parentheses * Fixed the "Fixes" tag. fs/btrfs/inode.c | 29 - 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e1f00d8c24ce..5906564ae2e9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -109,17 +109,17 @@ static void __endio_write_update_ordered(struct inode *inode, * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata * to be released, which we want to happen only when finishing the ordered - * extent (btrfs_finish_ordered_io()). Also note that the caller of the - * fill_delalloc() callback already does proper cleanup for the first page of - * the range, that is, it invokes the callback writepage_end_io_hook() for the - * range of the first page. + * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, -const u64 offset, -const u64 bytes) +struct page *locked_page, +u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; + u64 page_start = page_offset(locked_page); + u64 page_end = page_start + PAGE_SIZE - 1; + struct page *page; while (index <= end_index) { @@ -130,8 +130,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, ClearPagePrivate2(page); put_page(page); } - return __endio_write_update_ordered(inode, offset + PAGE_SIZE, - bytes - PAGE_SIZE, false); + + /* +* In case this page belongs to the delalloc range being instantiated +* then skip it, since the first page of a range is going to be +* properly cleaned up by the caller of run_delalloc_range +*/ + if (page_start >= offset && page_end <= (offset + bytes - 1)) { + offset += PAGE_SIZE; +
Re: [PATCH] btrfs: Fix error handling in btrfs_cleanup_ordered_extents
On 10/26/18 13:13, Nikolay Borisov wrote: + if (page_start >= offset && page_end <= (offset + bytes - 1) { fs/btrfs/inode.c: In function 'btrfs_cleanup_ordered_extents': fs/btrfs/inode.c:140:62: error: expected ')' before '{' token if (page_start >= offset && page_end <= (offset + bytes - 1) { ~^~ ) You're welcome :) Holger
[PATCH] btrfs: Fix error handling in btrfs_cleanup_ordered_extents
Running btrfs/124 in a loop hung up on me sporadically with the following call trace: btrfs D0 5760 5324 0x Call Trace: ? __schedule+0x243/0x800 schedule+0x33/0x90 btrfs_start_ordered_extent+0x10c/0x1b0 [btrfs] ? wait_woken+0xa0/0xa0 btrfs_wait_ordered_range+0xbb/0x100 [btrfs] btrfs_relocate_block_group+0x1ff/0x230 [btrfs] btrfs_relocate_chunk+0x49/0x100 [btrfs] btrfs_balance+0xbeb/0x1740 [btrfs] btrfs_ioctl_balance+0x2ee/0x380 [btrfs] btrfs_ioctl+0x1691/0x3110 [btrfs] ? lockdep_hardirqs_on+0xed/0x180 ? __handle_mm_fault+0x8e7/0xfb0 ? _raw_spin_unlock+0x24/0x30 ? __handle_mm_fault+0x8e7/0xfb0 ? do_vfs_ioctl+0xa5/0x6e0 ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] do_vfs_ioctl+0xa5/0x6e0 ? entry_SYSCALL_64_after_hwframe+0x3e/0xbe ksys_ioctl+0x3a/0x70 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x60/0x1b0 entry_SYSCALL_64_after_hwframe+0x49/0xbe Turns out during page writeback it's possible that the code in writepage_delalloc can instantiate a delalloc range which doesn't belong to the page currently being written back. This happens since find_lock_delalloc_range returns up to BTRFS_MAX_EXTENT_SIZE delalloc range when asked and doens't really consider the range of the passed page. When such a foregin range is found the code proceeds to run_delalloc_range and calls the appropriate function to fill the delalloc and create ordered extents. If, however, a failure occurs while this operation is in effect then the clean up code in btrfs_cleanup_ordered_extents will be called. This function has the wrong assumption that caller of run_delalloc_range always properly cleans the first page of the range hence when it calls __endio_write_update_ordered it explicitly ommits the first page of the delalloc range. This is wrong because this function could be cleaning a delalloc range that doesn't belong to the current page. This in turn means that the page cleanup code in __extent_writepage will not really free the initial page for the range, leaving a hanging ordered extent with bytes_left set to 4k. This bug has been present ever since the original introduction of the cleanup code in 524272607e88. Fix this by correctly checking whether the current page belongs to the range being instantiated and if so correctly adjust the range parameters passed for cleaning up. If it doesn't, then just clean the whole OE range directly. 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang") Signed-off-by: Nikolay Borisov --- With this patch I can no longer get btrfs/124 to hang nor do I see any regressions in xfstest. Furthermore, this needs to go in stable since v4.12 fs/btrfs/inode.c | 29 - 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index de4e3a9563f7..f6a96da2f2ea 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -109,17 +109,17 @@ static void __endio_write_update_ordered(struct inode *inode, * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata * to be released, which we want to happen only when finishing the ordered - * extent (btrfs_finish_ordered_io()). Also note that the caller of the - * fill_delalloc() callback already does proper cleanup for the first page of - * the range, that is, it invokes the callback writepage_end_io_hook() for the - * range of the first page. + * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, -const u64 offset, -const u64 bytes) +struct page *locked_page, +u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; + u64 page_start = page_offset(locked_page); + u64 page_end = page_start + PAGE_SIZE - 1; + struct page *page; while (index <= end_index) { @@ -130,8 +130,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, ClearPagePrivate2(page); put_page(page); } - return __endio_write_update_ordered(inode, offset + PAGE_SIZE, - bytes - PAGE_SIZE, false); + + /* +* In case this page belongs to the delalloc range being instantiated +* then skip it, since the first page of a range is going to be +* properly cleaned up by the caller of run_delalloc_range +*/ + if (page_start >= offset && page_end <= (offset