Re: subvol copying
A user of a workstation has a home directory /home/john as a subvolume. I wrote a cron job to make read-only snapshots of it under /home/john/backup which was fortunate as they just ran a script that did something like rm -rf ~. Apart from copying dozens of gigs of data back, is there a good way of recovering it all? Whatever you suggest isn't going to work for this time (the copy is almost done) but will be useful for next time. Should I have put the backups under /backup instead so that I could just delete the corrupted subvol and make a read-write snapshot of the last good one? You can move subvolumes at any time, as if they were regular directories. For example: move the backups to an external location, move what's left of the home to another location out of the way, and make a snapshot to restore. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/17] Btrfs: fix accessing a freed tree root
inode_tree_del() will move the tree root into the dead root list, and then the tree will be destroyed by the cleaner. So if we remove the delayed node which is cached in the inode after inode_tree_del(), we may access a freed tree root. Fix it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1669c3b..7f6e78a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4723,6 +4723,7 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); no_delete: + btrfs_remove_delayed_node(inode); clear_inode(inode); return; } @@ -7978,7 +7979,6 @@ void btrfs_destroy_inode(struct inode *inode) inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); free: - btrfs_remove_delayed_node(inode); call_rcu(inode-i_rcu, btrfs_i_callback); } -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 02/17] Btrfs: fix unprotected root node of the subvolume's inode rb-tree
The root node of the rb-tree may be changed, so we should get it under the lock. Fix it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/inode.c | 7 +++ 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7f6e78a..bf5c399 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4839,14 +4839,13 @@ static void inode_tree_add(struct inode *inode) struct rb_node **p; struct rb_node *parent; u64 ino = btrfs_ino(inode); -again: - p = root-inode_tree.rb_node; - parent = NULL; if (inode_unhashed(inode)) return; - +again: + parent = NULL; spin_lock(root-inode_lock); + p = root-inode_tree.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct btrfs_inode, rb_node); -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 07/17] Btrfs: don't invoke btrfs_invalidate_inodes() in the spin lock context
btrfs_invalidate_inodes() may sleep, so we should not invoke it in the spin lock context. Fix it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 642c861..724a0da 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3685,8 +3685,11 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, ordered_operations); list_del_init(btrfs_inode-ordered_operations); + spin_unlock(root-fs_info-ordered_extent_lock); btrfs_invalidate_inodes(btrfs_inode-root); + + spin_lock(root-fs_info-ordered_extent_lock); } spin_unlock(root-fs_info-ordered_extent_lock); @@ -3808,8 +3811,11 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) list_del_init(btrfs_inode-delalloc_inodes); clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, btrfs_inode-runtime_flags); + spin_unlock(root-fs_info-delalloc_lock); btrfs_invalidate_inodes(btrfs_inode-root); + + spin_lock(root-fs_info-delalloc_lock); } spin_unlock(root-fs_info-delalloc_lock); -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 08/17] Btrfs: introduce per-subvolume delalloc inode list
When we create a snapshot, we need flush all delalloc inodes in the fs, just flushing the inodes in the source tree is OK. So we introduce per-subvolume delalloc inode list. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 22 --- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/disk-io.c | 49 --- fs/btrfs/extent-tree.c | 6 +- fs/btrfs/inode.c | 167 - fs/btrfs/relocation.c | 2 +- fs/btrfs/transaction.c | 2 +- 7 files changed, 183 insertions(+), 67 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 958ce6c..067233f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1449,13 +1449,9 @@ struct btrfs_fs_info { */ struct list_head ordered_extents; - spinlock_t delalloc_lock; - /* -* all of the inodes that have delalloc bytes. It is possible for -* this list to be empty even when there is still dirty data=ordered -* extents waiting to finish IO. -*/ - struct list_head delalloc_inodes; + spinlock_t delalloc_root_lock; + /* all fs/file tree roots that have delalloc inodes. */ + struct list_head delalloc_roots; /* * there is a pool of worker threads for checksumming during writes @@ -1740,6 +1736,16 @@ struct btrfs_root { spinlock_t root_item_lock; atomic_t refs; + + spinlock_t delalloc_lock; + /* +* all of the inodes that have delalloc bytes. It is possible for +* this list to be empty even when there is still dirty data=ordered +* extents waiting to finish IO. +*/ + struct list_head delalloc_inodes; + struct list_head delalloc_root; + u64 nr_delalloc_inodes; }; struct btrfs_ioctl_defrag_range_args { @@ -3543,6 +3549,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); +int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info, + int delay_iput); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7ba7b39..aca77ad 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -465,7 +465,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_inodes(root, 0); + ret = btrfs_start_all_delalloc_inodes(root-fs_info, 0); if (ret) { mutex_unlock(dev_replace-lock_finishing_cancel_unmount); return ret; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 724a0da..13dddba 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1191,6 +1191,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root-objectid = objectid; root-last_trans = 0; root-highest_objectid = 0; + root-nr_delalloc_inodes = 0; root-name = NULL; root-inode_tree = RB_ROOT; INIT_RADIX_TREE(root-delayed_nodes_tree, GFP_ATOMIC); @@ -1199,10 +1200,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, INIT_LIST_HEAD(root-dirty_list); INIT_LIST_HEAD(root-root_list); + INIT_LIST_HEAD(root-delalloc_inodes); + INIT_LIST_HEAD(root-delalloc_root); INIT_LIST_HEAD(root-logged_list[0]); INIT_LIST_HEAD(root-logged_list[1]); spin_lock_init(root-orphan_lock); spin_lock_init(root-inode_lock); + spin_lock_init(root-delalloc_lock); spin_lock_init(root-accounting_lock); spin_lock_init(root-log_extents_lock[0]); spin_lock_init(root-log_extents_lock[1]); @@ -2137,9 +2141,9 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(fs_info-trans_list); INIT_LIST_HEAD(fs_info-dead_roots); INIT_LIST_HEAD(fs_info-delayed_iputs); - INIT_LIST_HEAD(fs_info-delalloc_inodes); + INIT_LIST_HEAD(fs_info-delalloc_roots); INIT_LIST_HEAD(fs_info-caching_block_groups); - spin_lock_init(fs_info-delalloc_lock); + spin_lock_init(fs_info-delalloc_root_lock); spin_lock_init(fs_info-trans_lock); spin_lock_init(fs_info-fs_roots_radix_lock); spin_lock_init(fs_info-delayed_iput_lock); @@ -3801,24 +3805,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) INIT_LIST_HEAD(splice); - spin_lock(root-fs_info-delalloc_lock); - list_splice_init(root-fs_info-delalloc_inodes, splice); + spin_lock(root-delalloc_lock); +
[PATCH 05/17] Btrfs: cleanup the similar code of the fs root read
There are several functions whose code is similar, such as btrfs_find_last_root() btrfs_read_fs_root_no_radix() Besides that, some functions are invoked twice, it is unnecessary, for example, we are sure that all roots which is found in btrfs_find_orphan_roots() have their orphan items, so it is unnecessary to check the orphan item again. So cleanup it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 6 +- fs/btrfs/disk-io.c | 282 + fs/btrfs/disk-io.h | 11 +- fs/btrfs/extent-tree.c | 6 +- fs/btrfs/relocation.c | 5 +- fs/btrfs/root-tree.c | 170 ++--- fs/btrfs/tree-log.c| 3 +- fs/btrfs/volumes.c | 13 ++- fs/btrfs/volumes.h | 1 + 9 files changed, 228 insertions(+), 269 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f9a48fd..845b77f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3369,9 +3369,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root_item *item); void btrfs_read_root_item(struct extent_buffer *eb, int slot, struct btrfs_root_item *item); -int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct -btrfs_root_item *item, struct btrfs_key *key); -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); +int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key); int btrfs_find_orphan_roots(struct btrfs_root *tree_root); void btrfs_set_root_node(struct btrfs_root_item *item, struct extent_buffer *node); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8c1e4fb..42d6ba2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1234,39 +1234,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, spin_lock_init(root-root_item_lock); } -static int __must_check find_and_setup_root(struct btrfs_root *tree_root, - struct btrfs_fs_info *fs_info, - u64 objectid, - struct btrfs_root *root) -{ - int ret; - u32 blocksize; - u64 generation; - - __setup_root(tree_root-nodesize, tree_root-leafsize, -tree_root-sectorsize, tree_root-stripesize, -root, fs_info, objectid); - ret = btrfs_find_last_root(tree_root, objectid, - root-root_item, root-root_key); - if (ret 0) - return -ENOENT; - else if (ret 0) - return ret; - - generation = btrfs_root_generation(root-root_item); - blocksize = btrfs_level_size(root, btrfs_root_level(root-root_item)); - root-commit_root = NULL; - root-node = read_tree_block(root, btrfs_root_bytenr(root-root_item), -blocksize, generation); - if (!root-node || !btrfs_buffer_uptodate(root-node, generation, 0)) { - free_extent_buffer(root-node); - root-node = NULL; - return -EIO; - } - root-commit_root = btrfs_root_node(root); - return 0; -} - static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); @@ -1451,70 +1418,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, return 0; } -struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, - struct btrfs_key *location) +struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_fs_info *fs_info = tree_root-fs_info; struct btrfs_path *path; - struct extent_buffer *l; u64 generation; u32 blocksize; - int ret = 0; - int slot; + int ret; - root = btrfs_alloc_root(fs_info); - if (!root) + path = btrfs_alloc_path(); + if (!path) return ERR_PTR(-ENOMEM); - if (location-offset == (u64)-1) { - ret = find_and_setup_root(tree_root, fs_info, - location-objectid, root); - if (ret) { - kfree(root); - return ERR_PTR(ret); - } - goto out; + + root = btrfs_alloc_root(fs_info); + if (!root) { + ret = -ENOMEM; + goto alloc_fail; } __setup_root(tree_root-nodesize, tree_root-leafsize, tree_root-sectorsize, tree_root-stripesize, -
[PATCH 14/17] Btrfs: don't flush the delalloc inodes in the while loop if flushoncommit is set
It is unnecessary to flush the delalloc inodes again and again because we don't care the dirty pages which are introduced after the flush, and they will be flush in the transaction commit. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 26 ++ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fd319b2..265db57 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1518,16 +1518,8 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); int ret; - if (flush_on_commit) { - ret = btrfs_start_all_delalloc_inodes(root-fs_info, 1); - if (ret) - return ret; - btrfs_wait_all_ordered_extents(root-fs_info, 1); - } - ret = btrfs_run_delayed_items(trans, root); if (ret) return ret; @@ -1551,6 +1543,19 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, return ret; } +static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) +{ + if (btrfs_test_opt(fs_info-tree_root, FLUSHONCOMMIT)) + return btrfs_start_all_delalloc_inodes(fs_info, 1); + return 0; +} + +static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) +{ + if (btrfs_test_opt(fs_info-tree_root, FLUSHONCOMMIT)) + btrfs_wait_all_ordered_extents(fs_info, 1); +} + /* * btrfs_transaction state sequence: *in_commit = 0, blocked = 0 (initial) @@ -1654,6 +1659,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, extwriter_counter_dec(cur_trans, trans-type); + ret = btrfs_start_delalloc_flush(root-fs_info); + if (ret) + goto cleanup_transaction; + if (!btrfs_test_opt(root, SSD) (now cur_trans-start_time || now - cur_trans-start_time 1)) should_grow = 1; @@ -1683,6 +1692,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) goto cleanup_transaction; + btrfs_wait_delalloc_flush(root-fs_info); /* * Ok now we need to make sure to block out any other joins while we * commit the transaction. We could have started a join before setting -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 16/17] Btrfs: remove the time check in btrfs_commit_transaction()
We checked the commit time to avoid committing the transaction frequently, but it is unnecessary because: - It made the transaction commit spend more time, and delayed the operation of the external writers(TRANS_START/TRANS_USERSPACE). - Except the space that we have to commit transaction, such as snapshot creation, btrfs doesn't commit the transaction on its own initiative. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 29 ++--- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 75e7b15..5e75ff4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1566,10 +1566,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans = trans-transaction; struct btrfs_transaction *prev_trans = NULL; - DEFINE_WAIT(wait); int ret; - int should_grow = 0; - unsigned long now = get_seconds(); ret = btrfs_run_ordered_operations(trans, root, 0); if (ret) { @@ -1660,28 +1657,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) goto cleanup_transaction; - if (!btrfs_test_opt(root, SSD) - (now cur_trans-start_time || now - cur_trans-start_time 1)) - should_grow = 1; - - do { - WARN_ON(cur_trans != trans-transaction); - - ret = btrfs_flush_all_pending_stuffs(trans, root); - if (ret) - goto cleanup_transaction; - - prepare_to_wait(cur_trans-writer_wait, wait, - TASK_UNINTERRUPTIBLE); - - if (extwriter_counter_read(cur_trans) 0) - schedule(); - else if (should_grow) - schedule_timeout(1); + ret = btrfs_flush_all_pending_stuffs(trans, root); + if (ret) + goto cleanup_transaction; - finish_wait(cur_trans-writer_wait, wait); - } while (extwriter_counter_read(cur_trans) 0); + wait_event(cur_trans-writer_wait, + extwriter_counter_read(cur_trans) == 0); + /* some pending stuffs might be added after the previous flush. */ ret = btrfs_flush_all_pending_stuffs(trans, root); if (ret) goto cleanup_transaction; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 13/17] Btrfs: don't wait for all the writers circularly during the transaction commit
btrfs_commit_transaction has the following loop before we commit the transaction. do { // attempt to do some useful stuff and/or sleep } while (atomic_read(cur_trans-num_writers) 1 || (should_grow cur_trans-num_joined != joined)); This is used to prevent from the TRANS_START to get in the way of a committing transaction. But it does not prevent from TRANS_JOIN, that is we would do this loop for a long time if some writers JOIN the current transaction endlessly. Because we need join the current transaction to do some useful stuff, we can not block TRANS_JOIN here. So we introduce a external writer counter, which is used to count the TRANS_USERSPACE/TRANS_START writers. If the external writer counter is zero, we can break the above loop. In order to make the code more clear, we don't use enum variant to define the type of the transaction handle, use bitmask instead. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 55 ++ fs/btrfs/transaction.h | 31 2 files changed, 65 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index cf8706c..fd319b2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -51,17 +51,41 @@ static noinline void switch_commit_root(struct btrfs_root *root) } static inline int can_join_transaction(struct btrfs_transaction *trans, - int type) + unsigned int type) { return !(trans-in_commit -type != TRANS_JOIN -type != TRANS_JOIN_NOLOCK); +(type TRANS_EXTWRITERS)); +} + +static inline void extwriter_counter_inc(struct btrfs_transaction *trans, +unsigned int type) +{ + if (type TRANS_EXTWRITERS) + atomic_inc(trans-num_extwriters); +} + +static inline void extwriter_counter_dec(struct btrfs_transaction *trans, +unsigned int type) +{ + if (type TRANS_EXTWRITERS) + atomic_dec(trans-num_extwriters); +} + +static inline void extwriter_counter_init(struct btrfs_transaction *trans, + unsigned int type) +{ + atomic_set(trans-num_extwriters, ((type TRANS_EXTWRITERS) ? 1 : 0)); +} + +static inline int extwriter_counter_read(struct btrfs_transaction *trans) +{ + return atomic_read(trans-num_extwriters); } /* * either allocate a new transaction or hop into the existing one */ -static noinline int join_transaction(struct btrfs_root *root, int type) +static noinline int join_transaction(struct btrfs_root *root, unsigned int type) { struct btrfs_transaction *cur_trans; struct btrfs_fs_info *fs_info = root-fs_info; @@ -99,6 +123,7 @@ loop: } atomic_inc(cur_trans-use_count); atomic_inc(cur_trans-num_writers); + extwriter_counter_inc(cur_trans, type); cur_trans-num_joined++; spin_unlock(fs_info-trans_lock); return 0; @@ -131,6 +156,7 @@ loop: } atomic_set(cur_trans-num_writers, 1); + extwriter_counter_init(cur_trans, type); cur_trans-num_joined = 0; init_waitqueue_head(cur_trans-writer_wait); init_waitqueue_head(cur_trans-commit_wait); @@ -307,7 +333,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type) } static struct btrfs_trans_handle * -start_transaction(struct btrfs_root *root, u64 num_items, int type, +start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, enum btrfs_reserve_flush_enum flush) { struct btrfs_trans_handle *h; @@ -320,7 +346,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, return ERR_PTR(-EROFS); if (current-journal_info) { - WARN_ON(type != TRANS_JOIN type != TRANS_JOIN_NOLOCK); + WARN_ON(type TRANS_EXTWRITERS); h = current-journal_info; h-use_count++; WARN_ON(h-use_count 2); @@ -366,7 +392,7 @@ again: * If we are ATTACH, it means we just want to catch the current * transaction and commit it, so we needn't do sb_start_intwrite(). */ - if (type TRANS_JOIN_NOLOCK) + if (type __TRANS_FREEZABLE) sb_start_intwrite(root-fs_info-sb); if (may_wait_transaction(root, type)) @@ -429,7 +455,7 @@ got_it: return h; join_fail: - if (type TRANS_JOIN_NOLOCK) + if (type __TRANS_FREEZABLE) sb_end_intwrite(root-fs_info-sb); kmem_cache_free(btrfs_trans_handle_cachep, h); alloc_fail: @@ -677,12 +703,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } } - if (trans-type
[PATCH 17/17] Btrfs: make the state of the transaction more readable
We used 3 variants to track the state of the transaction, it was complex and wasted the memory space. Besides that, it was hard to understand that which types of the transaction handles should be blocked in each transaction state, so the developers often made mistakes. This patch improved the above problem. In this patch, we define 6 states for the transaction, enum btrfs_trans_state { TRANS_STATE_RUNNING = 0, TRANS_STATE_BLOCKED = 1, TRANS_STATE_COMMIT_START= 2, TRANS_STATE_COMMIT_DOING= 3, TRANS_STATE_UNBLOCKED = 4, TRANS_STATE_COMPLETED = 5, TRANS_STATE_MAX = 6, } and just use 1 variant to track those state. In order to make the blocked handle types for each state more clear, we introduce a array: unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE | __TRANS_START), [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE | __TRANS_START | __TRANS_ATTACH), [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE | __TRANS_START | __TRANS_ATTACH | __TRANS_JOIN), [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE | __TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK), [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE | __TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK), } it is very intuitionistic. Besides that, because we remove -in_commit in transaction structure, so the lock -commit_lock which was used to protect it is unnecessary, remove -commit_lock. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 - fs/btrfs/disk-io.c | 36 ++-- fs/btrfs/transaction.c | 156 ++--- fs/btrfs/transaction.h | 16 +++-- 4 files changed, 114 insertions(+), 95 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a7e71ff..bf92302 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1496,7 +1496,6 @@ struct btrfs_fs_info { int closing; int log_root_recovering; int enospc_unlink; - int trans_no_join; u64 total_pinned; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6bb3f3d..530e3c0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1747,7 +1747,7 @@ static int transaction_kthread(void *arg) } now = get_seconds(); - if (!cur-blocked + if (cur-state TRANS_STATE_BLOCKED (now cur-start_time || now - cur-start_time 30)) { spin_unlock(root-fs_info-trans_lock); delay = HZ * 5; @@ -2183,7 +2183,6 @@ int open_ctree(struct super_block *sb, fs_info-max_inline = 8192 * 1024; fs_info-metadata_ratio = 0; fs_info-defrag_inodes = RB_ROOT; - fs_info-trans_no_join = 0; fs_info-free_chunk_space = 0; fs_info-tree_mod_log = RB_ROOT; @@ -3956,19 +3955,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_block_rsv_release(root, root-fs_info-trans_block_rsv, cur_trans-dirty_pages.dirty_bytes); - /* FIXME: cleanup wait for commit */ - cur_trans-in_commit = 1; - cur_trans-blocked = 1; + cur_trans-state = TRANS_STATE_COMMIT_START; wake_up(root-fs_info-transaction_blocked_wait); btrfs_evict_pending_snapshots(cur_trans); - cur_trans-blocked = 0; + cur_trans-state = TRANS_STATE_UNBLOCKED; wake_up(root-fs_info-transaction_wait); - cur_trans-commit_done = 1; - wake_up(cur_trans-commit_wait); - btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); @@ -3977,6 +3971,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_pinned_extent(root, root-fs_info-pinned_extents); + cur_trans-state =TRANS_STATE_COMPLETED; + wake_up(cur_trans-commit_wait); + /* memset(cur_trans, 0, sizeof(*cur_trans)); kmem_cache_free(btrfs_transaction_cachep, cur_trans); @@ -4004,25 +4001,23 @@ static int
[PATCH 12/17] Btrfs: remove the code for the impossible case in cleanup_transaction()
If the transaction is removed from the transaction list, it means the transaction has been committed successfully. So it is impossible to call cleanup_transaction(), otherwise there is something wrong with the code logic. Thus, we use BUG_ON() instead of the original handle. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bc22be9..cf8706c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1450,11 +1450,12 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, spin_lock(root-fs_info-trans_lock); - if (list_empty(cur_trans-list)) { - spin_unlock(root-fs_info-trans_lock); - btrfs_end_transaction(trans, root); - return; - } + /* +* If the transaction is removed from the list, it means this +* transaction has been committed successfully, so it is impossible +* to call the cleanup function. +*/ + BUG_ON(list_empty(cur_trans-list)); list_del_init(cur_trans-list); if (cur_trans == root-fs_info-running_transaction) { -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/17] Btrfs: pause the space balance when remounting to R/O
Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a4807ce..f0857e0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1263,6 +1263,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); + btrfs_pause_balance(fs_info); ret = btrfs_commit_super(root); if (ret) -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 10/17] Btrfs: just flush the delalloc inodes in the source tree before snapshot creation
Before applying this patch, we need flush all the delalloc inodes in the fs when we want to create a snapshot, it wastes time, and make the transaction commit be blocked for a long time. It means some other user operation would also be blocked for a long time. This patch improves this problem, we just flush the delalloc inodes that in the source trees before snapshot creation, so the transaction commit will complete quickly. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c | 6 ++ fs/btrfs/transaction.c | 10 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0de4a2f..2677dcc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!root-ref_cows) return -EINVAL; + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) + return ret; + + btrfs_wait_ordered_extents(root, 0); + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) return -ENOMEM; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2b17213..bc22be9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1491,17 +1491,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); - int snap_pending = 0; int ret; - if (!flush_on_commit) { - spin_lock(root-fs_info-trans_lock); - if (!list_empty(trans-transaction-pending_snapshots)) - snap_pending = 1; - spin_unlock(root-fs_info-trans_lock); - } - - if (flush_on_commit || snap_pending) { + if (flush_on_commit) { ret = btrfs_start_all_delalloc_inodes(root-fs_info, 1); if (ret) return ret; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/17] Btrfs: introduce per-subvolume ordered extent list
The reason we introduce per-subvolume ordered extent list is the same as the per-subvolume delalloc inode list. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h| 25 --- fs/btrfs/dev-replace.c | 4 +- fs/btrfs/disk-io.c | 45 +++- fs/btrfs/extent-tree.c | 6 +-- fs/btrfs/inode.c| 4 +- fs/btrfs/ordered-data.c | 109 +--- fs/btrfs/ordered-data.h | 2 + fs/btrfs/relocation.c | 2 +- fs/btrfs/super.c| 2 +- fs/btrfs/transaction.c | 2 +- 10 files changed, 143 insertions(+), 58 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 067233f..a7e71ff 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1437,17 +1437,18 @@ struct btrfs_fs_info { atomic_t open_ioctl_trans; /* -* this is used by the balancing code to wait for all the pending -* ordered extents +* this is used to protect the following list -- ordered_roots. */ - spinlock_t ordered_extent_lock; + spinlock_t ordered_root_lock; /* -* all of the data=ordered extents pending writeback +* all fs/file tree roots in which there are data=ordered extents +* pending writeback are added into this list. +* * these can span multiple transactions and basically include * every dirty data page that isn't from nodatacow */ - struct list_head ordered_extents; + struct list_head ordered_roots; spinlock_t delalloc_root_lock; /* all fs/file tree roots that have delalloc inodes. */ @@ -1746,6 +1747,20 @@ struct btrfs_root { struct list_head delalloc_inodes; struct list_head delalloc_root; u64 nr_delalloc_inodes; + /* +* this is used by the balancing code to wait for all the pending +* ordered extents +*/ + spinlock_t ordered_extent_lock; + + /* +* all of the data=ordered extents pending writeback +* these can span multiple transactions and basically include +* every dirty data page that isn't from nodatacow +*/ + struct list_head ordered_extents; + struct list_head ordered_root; + u64 nr_ordered_extents; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index aca77ad..4254da8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -395,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args-result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace); - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root-fs_info, 0); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -470,7 +470,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(dev_replace-lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root-fs_info, 0); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 13dddba..44d5a86 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1192,6 +1192,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root-last_trans = 0; root-highest_objectid = 0; root-nr_delalloc_inodes = 0; + root-nr_ordered_extents = 0; root-name = NULL; root-inode_tree = RB_ROOT; INIT_RADIX_TREE(root-delayed_nodes_tree, GFP_ATOMIC); @@ -1202,11 +1203,14 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, INIT_LIST_HEAD(root-root_list); INIT_LIST_HEAD(root-delalloc_inodes); INIT_LIST_HEAD(root-delalloc_root); + INIT_LIST_HEAD(root-ordered_extents); + INIT_LIST_HEAD(root-ordered_root); INIT_LIST_HEAD(root-logged_list[0]); INIT_LIST_HEAD(root-logged_list[1]); spin_lock_init(root-orphan_lock); spin_lock_init(root-inode_lock); spin_lock_init(root-delalloc_lock); + spin_lock_init(root-ordered_extent_lock); spin_lock_init(root-accounting_lock); spin_lock_init(root-log_extents_lock[0]); spin_lock_init(root-log_extents_lock[1]); @@ -2190,8 +2194,8 @@ int open_ctree(struct super_block *sb, fs_info-thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); - INIT_LIST_HEAD(fs_info-ordered_extents); - spin_lock_init(fs_info-ordered_extent_lock); + INIT_LIST_HEAD(fs_info-ordered_roots); + spin_lock_init(fs_info-ordered_root_lock); fs_info-delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
[PATCH 11/17] Btrfs: cleanup unnecessary assignment when cleaning up all the residual transaction
When we umount a fs with serious errors, we will invoke btrfs_cleanup_transactions() to clean up the residual transaction. At this time, It is impossible to start a new transaction, so we needn't assign trans_no_join to 1, and also needn't clear running transaction every time we destroy a residual transaction. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 9 + 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 44d5a86..6bb3f3d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3992,7 +3992,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) spin_lock(root-fs_info-trans_lock); list_splice_init(root-fs_info-trans_list, list); - root-fs_info-trans_no_join = 1; + root-fs_info-running_transaction = NULL; spin_unlock(root-fs_info-trans_lock); while (!list_empty(list)) { @@ -4028,10 +4028,6 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_all_delalloc_inodes(root-fs_info); - spin_lock(root-fs_info-trans_lock); - root-fs_info-running_transaction = NULL; - spin_unlock(root-fs_info-trans_lock); - btrfs_destroy_marked_extents(root, t-dirty_pages, EXTENT_DIRTY); @@ -4044,9 +4040,6 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) kmem_cache_free(btrfs_transaction_cachep, t); } - spin_lock(root-fs_info-trans_lock); - root-fs_info-trans_no_join = 0; - spin_unlock(root-fs_info-trans_lock); mutex_unlock(root-fs_info-transaction_kthread_mutex); return 0; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 04/17] Btrfs: remove BUG_ON() in btrfs_read_fs_tree_no_radix()
We have checked if -node is NULL or not, so it is unnecessary to use BUG_ON() to check again. Remove it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2a9ae38..8c1e4fb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1513,7 +1513,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, } root-commit_root = btrfs_root_node(root); - BUG_ON(!root-node); /* -ENOMEM */ out: if (location-objectid != BTRFS_TREE_LOG_OBJECTID) { root-ref_cows = 1; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 15/17] Btrfs: remove unnecessary varient -num_joined in btrfs_transaction structure
We used -num_joined track if there were some writers which join the current transaction when the committer was sleeping. If some writers joined the current transaction, we has to continue the while loop to do some necessary stuff, such as flush the ordered operations. But it is unnecessary because we will do it after the while loop. Besides that, tracking -num_joined would make the committer drop into the while loop when there are lots of internal writers(TRANS_JOIN). So we remove -num_joined and don't track if there are some writers which join the current transaction when the committer is sleeping. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 8 +--- fs/btrfs/transaction.h | 2 -- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 265db57..75e7b15 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -124,7 +124,6 @@ loop: atomic_inc(cur_trans-use_count); atomic_inc(cur_trans-num_writers); extwriter_counter_inc(cur_trans, type); - cur_trans-num_joined++; spin_unlock(fs_info-trans_lock); return 0; } @@ -157,7 +156,6 @@ loop: atomic_set(cur_trans-num_writers, 1); extwriter_counter_init(cur_trans, type); - cur_trans-num_joined = 0; init_waitqueue_head(cur_trans-writer_wait); init_waitqueue_head(cur_trans-commit_wait); cur_trans-in_commit = 0; @@ -1566,7 +1564,6 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - unsigned long joined = 0; struct btrfs_transaction *cur_trans = trans-transaction; struct btrfs_transaction *prev_trans = NULL; DEFINE_WAIT(wait); @@ -1668,8 +1665,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, should_grow = 1; do { - joined = cur_trans-num_joined; - WARN_ON(cur_trans != trans-transaction); ret = btrfs_flush_all_pending_stuffs(trans, root); @@ -1685,8 +1680,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, schedule_timeout(1); finish_wait(cur_trans-writer_wait, wait); - } while (extwriter_counter_read(cur_trans) 0 || -(should_grow cur_trans-num_joined != joined)); + } while (extwriter_counter_read(cur_trans) 0); ret = btrfs_flush_all_pending_stuffs(trans, root); if (ret) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 5cc77b0..0fc45e2 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -37,8 +37,6 @@ struct btrfs_transaction { atomic_t num_writers; atomic_t use_count; - unsigned long num_joined; - spinlock_t commit_lock; int in_commit; int commit_done; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 00/17] improve the block time during the transaction commit
This patchset improve the problem that the transaction may be blocked for a long time when it is being committed if there is heavy I/O. In this patchset, - 0001-0005, 0007, 0011-0012 are random fix or code cleanup patch. - 0006, 0008-0010 introduce per-subvolume delalloc inode list and ordered extent list, which can reduce the flush time when we create snapshots. - 0013-0016 improve the block time during the transaction commit by removing the while loop at the beginning of the transaction commit. - 0017 improves the readability of the code. Miao Xie (17): Btrfs: fix accessing a freed tree root Btrfs: fix unprotected root node of the subvolume's inode rb-tree Btrfs: pause the space balance when remounting to R/O Btrfs: remove BUG_ON() in btrfs_read_fs_tree_no_radix() Btrfs: cleanup the similar code of the fs root read Btrfs: introduce grab/put functions for the root of the fs/file tree Btrfs: don't invoke btrfs_invalidate_inodes() in the spin lock context Btrfs: introduce per-subvolume delalloc inode list Btrfs: introduce per-subvolume ordered extent list Btrfs: just flush the delalloc inodes in the source tree before snapshot creation Btrfs: cleanup unnecessary assignment when cleaning up all the residual transaction Btrfs: remove the code for the impossible case in cleanup_transaction() Btrfs: don't wait for all the writers circularly during the transaction commit Btrfs: don't flush the delalloc inodes in the while loop if flushoncommit is set Btrfs: remove unnecessary varient -num_joined in btrfs_transaction structure Btrfs: remove the time check in btrfs_commit_transaction() Btrfs: make the state of the transaction more readable fs/btrfs/ctree.h| 55 +-- fs/btrfs/dev-replace.c | 6 +- fs/btrfs/disk-io.c | 425 +++- fs/btrfs/disk-io.h | 32 +++- fs/btrfs/extent-tree.c | 20 +-- fs/btrfs/inode.c| 180 ++-- fs/btrfs/ioctl.c| 6 + fs/btrfs/ordered-data.c | 109 + fs/btrfs/ordered-data.h | 2 + fs/btrfs/relocation.c | 9 +- fs/btrfs/root-tree.c| 170 ++- fs/btrfs/super.c| 3 +- fs/btrfs/transaction.c | 271 -- fs/btrfs/transaction.h | 49 -- fs/btrfs/tree-log.c | 3 +- fs/btrfs/volumes.c | 13 +- fs/btrfs/volumes.h | 1 + 17 files changed, 791 insertions(+), 563 deletions(-) -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree
The grab/put funtions will be used in the next patch, which need grab the root object and ensure it is not freed. We use reference counter instead of the srcu lock is to aovid blocking the memory reclaim task, which invokes synchronize_srcu(). Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/disk-io.h | 21 + fs/btrfs/extent-tree.c | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 845b77f..958ce6c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1739,6 +1739,7 @@ struct btrfs_root { int force_cow; spinlock_t root_item_lock; + atomic_t refs; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42d6ba2..642c861 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(root-log_writers, 0); atomic_set(root-log_batch, 0); atomic_set(root-orphan_inodes, 0); + atomic_set(root-refs, 1); root-log_transid = 0; root-last_log_commit = 0; extent_io_tree_init(root-dirty_log_pages, @@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) } else { free_extent_buffer(gang[0]-node); free_extent_buffer(gang[0]-commit_root); - kfree(gang[0]); + btrfs_put_fs_root(gang[0]); } } @@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root) kfree(root-free_ino_ctl); kfree(root-free_ino_pinned); kfree(root-name); - kfree(root); + btrfs_put_fs_root(root); } void btrfs_free_fs_root(struct btrfs_root *root) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 534d583..b71acd6e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_root *root); + +/* + * This function is used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. + * + * If you want to ensure the whole tree is safe, you should use + * fs_info-subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ + if (atomic_inc_not_zero(root-refs)) + return root; + return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ + if (atomic_dec_and_test(root-refs)) + kfree(root); +} + void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 08e42c8..08f9862 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7463,7 +7463,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } else { free_extent_buffer(root-node); free_extent_buffer(root-commit_root); - kfree(root); + btrfs_put_fs_root(root); } out_end_trans: btrfs_end_transaction_throttle(trans, tree_root); -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 0/8] Btrfs: introduce a tree for UUID to subvol ID mapping
On Tue, 14 May 2013 19:11:54 +0200, Stefan Behrens wrote: On Tue, 14 May 2013 18:55:23 +0800, Liu Bo wrote: On Tue, May 14, 2013 at 11:36:52AM +0200, Stefan Behrens wrote: Mapping UUIDs to subvolume IDs is an operation with a high effort today. Today, the algorithm even has quadratic effort (based on the number of existing subvolumes), which means, that it takes minutes to send/receive a single subvolume if 10,000 subvolumes exist. But even linear effort would be too much since it is a waste. And these data structures to allow mapping UUIDs to subvolume IDs are created every time a btrfs send/receive instance is started. So the issue to address is that Btrfs send / receive does not work as it is today when a high number of subvolumes exist. It is much more efficient to maintain a searchable persistent data structure in the filesystem, one that is updated whenever a subvolume/snapshot is created and deleted, and when the received subvolume UUID is set by the btrfs-receive tool. Therefore kernel code is added that is able to maintain data structures in the filesystem that allow to quickly search for a given UUID and to retrieve the subvol ID. Now follows the lengthy justification, why a new tree was added instead of using the existing root tree: The first approach was to not create another tree that holds UUID items. Instead, the items should just go into the top root tree. Unfortunately this confused the algorithm to assign the objectid of subvolumes and snapshots. The reason is that btrfs_find_free_objectid() calls btrfs_find_highest_objectid() for the first created subvol or snapshot after mounting a filesystem, and this function simply searches for the largest used objectid in the root tree keys to pick the next objectid to assign. Of course, the UUID keys have always been the ones with the highest offset value, and the next assigned subvol ID was wastefully huge. To use any other existing tree did not look proper. To apply a workaround such as setting the objectid to zero in the UUID item key and to implement collision handling would either add limitations (in case of a btrfs_extend_item() approach to handle the collisions) or a lot of complexity and source code (in case a key would be looked up that is free of collisions). Adding new code that introduces limitations is not good, and adding code that is complex and lengthy for no good reason is also not good. That's the justification why a completely new tree was introduced. I'd appreciate if some performance number appear here since it's a speedup. That's a good idea. The numbers are below in the table and there's also a link to a chart. I stopped the measurement with the old version after 1 subvolumes because it already took almost 13 minutes to send a single, empty subvolume. All the time is spent building a database, and this is done each time the btrfs send or receive tool is started to send or receive a single subvolume. The table shows the time it takes to send a single, empty subvolume depending on the number of subvolume that exist in the filesystem. # of subvols | without| with in filesystem | UUID tree | UUID tree --++-- 2 | 0m00.004s | 0m00.003s 1000 | 0m07.010s | 0m00.004s 2000 | 0m28.210s | 0m00.004s 3000 | 1m04.872s | 0m00.004s 4000 | 1m56.059s | 0m00.004s 5000 | 3m00.489s | 0m00.004s 6000 | 4m27.376s | 0m00.004s 7000 | 6m08.938s | 0m00.004s 8000 | 7m54.020s | 0m00.004s 9000 | 10m05.108s | 0m00.004s 1 | 12m47.406s | 0m00.004s Or as a chart: http://btrfs.giantdisaster.de/Btrfs-send-recv-perf.pdf The table goes on like this for larger number of subvolumes (and the time value is always the time to transfer just _one_ of the subvolumes): # of subvols | without| with in filesystem | UUID tree | UUID tree --++-- 2 | 0m00.004s | 0m00.003s 1000 | 0m07.010s | 0m00.004s 2000 | 0m28.210s | 0m00.004s 3000 | 1m04.872s | 0m00.004s 4000 | 1m56.059s | 0m00.004s 5000 | 3m00.489s | 0m00.004s 6000 | 4m27.376s | 0m00.004s 7000 | 6m08.938s | 0m00.004s 8000 | 7m54.020s | 0m00.004s 9000 | 10m05.108s | 0m00.004s 1 | 12m47.406s | 0m00.004s 11000 | 15m05.800s | 0m00.004s 12000 | 18m00.170s | 0m00.004s 13000 | 21m39.438s | 0m00.004s 14000 | 24m54.681s | 0m00.004s 15000 | 28m09.096s | 0m00.004s 16000 | 33m08.856s | 0m00.004s 17000 | 37m10.562s | 0m00.004s 18000 | 41m44.727s | 0m00.004s 19000 | 46m14.335s | 0m00.004s 2 | 51m55.100s | 0m00.004s 21000 | 56m54.346s | 0m00.004s 22000 | 62m53.466s | 0m00.004s 23000 | 66m57.328s | 0m00.004s 24000 | 73m59.687s |
Re: [PATCH] Btrfs: increase the max global reserve size to 1gig
On Fri, May 03, 2013 at 11:28:35PM +0800, Liu Bo wrote: On Fri, May 03, 2013 at 08:56:54AM -0400, Josef Bacik wrote: Apparently 512mb was too small, with a fs_mark command we could get so much delayed work built up that we'd never trip the lets commit the transaction logic until we'd gotten too much delayed refs built up. Increasing this to 1 gig makes us much safer and we no longer abort with Dave's fs_mark tester. Thanks, I remember that last time I made a similar commit, but users complains that they cannot boot their system on root btrfs partition due to lacking space and Chris eventually got to revert that one... That was long time ago (3.5) and enospc has been updated since, so this needs re-evaluation if it still leads to early enospc. This patch is not in next queue, i wonder if your concern is the reason of that. david -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] xfstests btrfs/284: shorten duration, fix output
On 04/26/2013 01:45 PM, Eric Sandeen wrote: test 284 had... some issues. diff --git a/tests/btrfs/284 b/tests/btrfs/284 old mode 100644 new mode 100755 index d952977..67161a3 --- a/tests/btrfs/284 +++ b/tests/btrfs/284 This patch has been committed: commit 91f87e3b89c0f7350a56d397ba7255d0e3d6b486 Author: Eric Sandeen sand...@redhat.com Date: Wed May 15 07:18:07 2013 -0500 xfstests btrfs/284: shorten duration, fix output Signed-off-by: Eric Sandeen sand...@redhat.com Reviewed-by: Josef Bacik jba...@fusionio.com Reviewed-by: Liu Bo bo.li@oracle.com Thanks --Rich -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 0/8] Btrfs: introduce a tree for UUID to subvol ID mapping
On Tue, May 14, 2013 at 07:11:54PM +0200, Stefan Behrens wrote: # of subvols | without| with in filesystem | UUID tree | UUID tree --++-- 2 | 0m00.004s | 0m00.003s 1000 | 0m07.010s | 0m00.004s 2000 | 0m28.210s | 0m00.004s 3000 | 1m04.872s | 0m00.004s 4000 | 1m56.059s | 0m00.004s 5000 | 3m00.489s | 0m00.004s 6000 | 4m27.376s | 0m00.004s 7000 | 6m08.938s | 0m00.004s 8000 | 7m54.020s | 0m00.004s 9000 | 10m05.108s | 0m00.004s 1 | 12m47.406s | 0m00.004s It looks like a copy paste error in the 3rd column. No really, this is a great improvement! david -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: I/O errors block the entire filesystem
On Sat, May 11, 2013 at 01:16:38AM -0600, Alexandre Oliva wrote: On Apr 4, 2013, Alexandre Oliva ol...@gnu.org wrote: I've been trying to figure out the btrfs I/O stack to try to understand why, sometimes (but not always), after a failure to read a (data non-replicated) block from the disk, the file being accessed becomes permanently locked, and the filesystem, unmountable. So, after some further investigation, we could determine that the problem was that end_bio_extent_readpage would unlock_extent_cached only part of the page, because it had previously computed whole_page as zero because of the nonzero bv_offset. So I started hunting for some place that would set up the bio with partial pages, and I failed. I was already suspecting some race condition or other form of corruption of the bvec before it got to end_bio_extent_readpage when I realized that the bv_offset was always a multiple of 512 bytes, and it represented the offset into the 4KiB page that the sector that failed to read was going to occupy. So I started hunting for places that modified bv_offset, and I found blk_update_request in fs/blk-core.c, where the very error message reporting the failed sector was output. The conclusion is that we cannot assume bvec is unmodified between our submitting the bio and our getting an error back. OTOH, I don't see that we ever set up bvecs that do not correspond to whole pages. Indeed, my attempts to catch such situations with a wrapper around bio_add_page got no hits whatsoever, which suggests we could just do away with the whole_page computation, and take bv_offset+bv_len == PAGE_CACHE_SIZE as the requested read size. With this patch, after a read error, I get an EIO rather than a process hang that causes further attempts to access the file to hang, generally in a non-interruptible way. Yay! btrfs: do away with non-whole_page extent I/O From: Alexandre Oliva ol...@gnu.org end_bio_extent_readpage computes whole_page based on bv_offset and bv_len, without taking into account that blk_update_request may modify them when some of the blocks to be read into a page produce a read error. This would cause the read to unlock only part of the file range associated with the page, which would in turn leave the entire page locked, which would not only keep the process blocked instead of returning -EIO to it, but also prevent any further access to the file. It turns out that btrfs always issues whole-page reads and writes. The special handling of non-whole_page appears to be a mistake or a left-over from a time when this wasn't the case. Indeed, end_bio_extent_writepage distinguished between whole_page and non-whole_page writes but behaved identically in both cases! I've replaced the whole_page computations with warnings, just to be sure that we're not issuing partial page reads or writes. The warnings should probably just go away some time. Signed-off-by: Alexandre Oliva ol...@gnu.org --- fs/btrfs/extent_io.c | 85 ++ 1 file changed, 30 insertions(+), 55 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cdee391..f44b033 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1873,28 +1873,6 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) } /* - * helper function to unlock a page if all the extents in the tree - * for that page are unlocked - */ -static void check_page_locked(struct extent_io_tree *tree, struct page *page) -{ - u64 start = page_offset(page); - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) - unlock_page(page); -} - -/* - * helper function to end page writeback if all the extents - * in the tree for that page are done with writeback - */ -static void check_page_writeback(struct extent_io_tree *tree, - struct page *page) -{ - end_page_writeback(page); -} - -/* * When IO fails, either with EIO or csum verification fails, we * try other mirrors that might have a good copy of the data. This * io_failure_record is used to record state as we go through all the @@ -2323,19 +2301,24 @@ static void end_bio_extent_writepage(struct bio *bio, int err) struct extent_io_tree *tree; u64 start; u64 end; - int whole_page; do { struct page *page = bvec-bv_page; tree = BTRFS_I(page-mapping-host)-io_tree; - start = page_offset(page) + bvec-bv_offset; - end = start + bvec-bv_len - 1; + /* We always issue full-page reads, but if some block + * in a page fails to read, blk_update_request() will + * advance bv_offset and adjust bv_len to compensate. + * Print a warning for nonzero offsets, and an error +
[PATCH] Btrfs-progs: detect when scrub is started twice
Check whether any involved device is already busy running a scrub. This would cause damaged status messages and the state aborted without the explanation that a scrub was already running. Therefore check it first, prevent it and give some feedback to the user if scrub is already running. Note that if scrub is started with a block device as the parameter, only that particular block device is checked. It is a normal mode of operation to start scrub on multiple single devices, there is no reason to prevent this. Here is an example: /mnt2 is the mountpoint of a filesystem. /dev/sdk and /dev/sdl are the block devices for that filesystem. case 1: btrfs scrub start /mnt2 btrfs scrub start /mnt2 - complain case 1: btrfs scrub start /dev/sdk btrfs scrub start /dev/sdk - complain case 3: btrfs scrub start /dev/sdk btrfs scrub start /dev/sdl - don't complain case 4: btrfs scrub start /dev/sdk btrfs scrub start /mnt2 - complain case 5: btrfs scrub start /mnt2 btrfs scrub start /dev/sdk - complain if the scrub on /dev/sdk is still running. - don't complain if the scrub on /dev/sdk is finished, the status messages will be fine. Reported-by: David Sterba dste...@suse.cz Signed-off-by: Stefan Behrens sbehr...@giantdisaster.de --- cmds-scrub.c | 42 ++ 1 file changed, 42 insertions(+) diff --git a/cmds-scrub.c b/cmds-scrub.c index c0dc584..95dfee3 100644 --- a/cmds-scrub.c +++ b/cmds-scrub.c @@ -1025,6 +1025,27 @@ int mkdir_p(char *path) return 0; } +static int is_scrub_running_on_fs(struct btrfs_ioctl_fs_info_args *fi_args, + struct btrfs_ioctl_dev_info_args *di_args, + struct scrub_file_record **past_scrubs) +{ + int i; + + if (!fi_args || !di_args || !past_scrubs) + return 0; + + for (i = 0; i fi_args-num_devices; i++) { + struct scrub_file_record *sfr = + last_dev_scrub(past_scrubs, di_args[i].devid); + + if (!sfr) + continue; + if (!(sfr-stats.finished || sfr-stats.canceled)) + return 1; + } + return 0; +} + static const char * const cmd_scrub_start_usage[]; static const char * const cmd_scrub_resume_usage[]; @@ -1162,6 +1183,27 @@ static int scrub_start(int argc, char **argv, int resume) close(fdres); } + /* +* check whether any involved device is already busy running a +* scrub. This would cause damaged status messages and the state +* aborted without the explanation that a scrub was already +* running. Therefore check it first, prevent it and give some +* feedback to the user if scrub is already running. +* Note that if scrub is started with a block device as the +* parameter, only that particular block device is checked. It +* is a normal mode of operation to start scrub on multiple +* single devices, there is no reason to prevent this. +*/ + if (is_scrub_running_on_fs(fi_args, di_args, past_scrubs)) { + ERR(!do_quiet, + ERROR: scrub is already running.\n + To cancel use 'btrfs scrub cancel %s'.\n + To see the status use 'btrfs scrub status [-d] %s'.\n, + path, path); + err = 1; + goto out; + } + t_devs = malloc(fi_args.num_devices * sizeof(*t_devs)); sp = calloc(fi_args.num_devices, sizeof(*sp)); spc.progress = calloc(fi_args.num_devices * 2, sizeof(*spc.progress)); -- 1.8.2.3 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: increase the max global reserve size to 1gig
On Fri, May 03, 2013 at 08:56:54AM -0400, Josef Bacik wrote: Apparently 512mb was too small, with a fs_mark command we could get so much delayed work built up that we'd never trip the lets commit the transaction logic until we'd gotten too much delayed refs built up. Increasing this to 1 gig makes us much safer and we no longer abort with Dave's fs_mark tester. With this patch on top of today's next I still hit the abort and will file a bug. david -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
kernel BUG at fs/btrfs/free-space-cache.c:1567!, Kernel 3.9.1
Hello, my btrfs filesystem was not mountable anymore after a loss of power: kernel BUG at fs/btrfs/free-space-cache.c:1567! invalid opcode: [#1] SMP Modules linked in: btrfs libcrc32c xor zlib_deflate raid6_pq i915(+) i2c_algo_bit drm_kms_helper drm i2c_core video uinput CPU 3 Pid: 147, comm: mount Not tainted 3.9.1-301.fc19.x86_64 #1 LENOVO 4290W4H/4290W4H RIP: 0010:[a01c5984] [a01c5984] remove_from_bitmap+0x1f4/0x210 [btrfs] RSP: 0018:88020fddd698 EFLAGS: 00010287 RAX: 001454248000 RBX: 880211399180 RCX: 001454253000 RDX: 00013000 RSI: 51c0 RDI: 2e40 RBP: 88020fddd6e8 R08: 88020fecf5c8 R09: 00145940 R10: 8802124ce240 R11: 880211399198 R12: 880210025240 R13: 88020fddd708 R14: 88020fddd700 R15: 8802113991a8 FS: 7f019f213880() GS:88021e2c() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 7f4cdd4223f0 CR3: 00020ff03000 CR4: 000407e0 DR0: DR1: DR2: DR3: DR6: 0ff0 DR7: 0400 Process mount (pid: 147, threadinfo 88020fddc000, task 8802100b4650) Stack: 88020fddd6e8 00145940 880210025264 001454253000 00013000 880210025240 880210025264 880210025248 88020ff4ec00 88020fddd738 a01c714b Call Trace: [a01c714b] btrfs_remove_free_space+0x5b/0x290 [btrfs] [810810e0] ? wake_up_bit+0x30/0x30 [a0176bff] btrfs_alloc_logged_file_extent+0x1af/0x1d0 [btrfs] [a0162e86] ? btrfs_free_path+0x26/0x30 [btrfs] [a01c1ef7] replay_one_extent+0x607/0x690 [btrfs] [a01c224b] replay_one_buffer+0x2cb/0x390 [btrfs] [a01a77b4] ? alloc_extent_buffer+0x94/0x3f0 [btrfs] [a01bd484] walk_down_log_tree+0x224/0x410 [btrfs] [a01bd95f] walk_log_tree+0xbf/0x1f0 [btrfs] [a01c43c1] btrfs_recover_log_trees+0x1f1/0x360 [btrfs] [a01c1f80] ? replay_one_extent+0x690/0x690 [btrfs] [a0186a1b] open_ctree+0x18eb/0x1f90 [btrfs] [812dac54] ? disk_name+0x54/0xb0 [a015f94e] btrfs_mount+0x5ce/0x6c0 [btrfs] [8129525d] ? selinux_sb_copy_data+0x14d/0x220 [8119d1a9] mount_fs+0x39/0x1b0 [811b635f] vfs_kern_mount+0x5f/0xf0 [a015f4fc] btrfs_mount+0x17c/0x6c0 [btrfs] [8129525d] ? selinux_sb_copy_data+0x14d/0x220 [8119d1a9] mount_fs+0x39/0x1b0 [811b635f] vfs_kern_mount+0x5f/0xf0 [811b862e] do_mount+0x23e/0xa20 [811b8276] ? copy_mount_options+0x36/0x170 [811b8e93] sys_mount+0x83/0xc0 [8164db19] system_call_fastpath+0x16/0x1b Code: 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 1f 40 00 31 c0 48 83 7b 20 00 75 e4 48 89 de 4c 89 e7 89 45 c0 e8 b1 ed ff ff 8b 45 c0 eb d1 0f 0b e8 69 a4 02 00 0f 1f 44 00 00 e8 59 a4 02 00 66 66 2e 0f RIP [a01c5984] remove_from_bitmap+0x1f4/0x210 [btrfs] RSP 88020fddd698 Mounting the fs with -o recovery,ro, and using btrfsck did not work. btrfsck segfaulted, but I can not provide the error message. btrfs-zero-log allowed me to mount the fs again. BR, Philipp Dreimann -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs-progs: fix missing recow roots when making btrfs filesystem
On Tue, May 14, 2013 at 07:50:28PM +0800, Wang Shilong wrote: When making btrfs filesystem. we firstly write root leaf to specified filed, and then we recow the root. If we don't recow, some trees are not in the correct block group. Steps to reproduce: dd if=/dev/zero of=test.img bs=1M count=100 mkfs.btrfs -f test.img btrfs-debug-tree test.img Very simple reproducer, I guess that it gets fixed with first update of the misplaced blocks. --- a/mkfs.c +++ b/mkfs.c @@ -151,37 +151,55 @@ static int recow_roots(struct btrfs_trans_handle *trans, int ret; struct extent_buffer *tmp; struct btrfs_fs_info *info = root-fs_info; + u64 generation; - ret = __btrfs_cow_block(trans, info-fs_root, info-fs_root-node, - NULL, 0, tmp, 0, 0); - BUG_ON(ret); - free_extent_buffer(tmp); + generation = btrfs_root_generation(info-fs_root-root_item); + if (generation != trans-transid) { + ret = __btrfs_cow_block(trans, info-fs_root, + info-fs_root-node, NULL, 0, tmp, 0, 0); + BUG_ON(ret); + free_extent_buffer(tmp); + } This gets repeated, please use wrapper for that. david -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kernel BUG at fs/btrfs/free-space-cache.c:1567!, Kernel 3.9.1
On Wed, May 15, 2013 at 4:19 PM, Philipp Dreimann phil...@dreimann.net wrote: Hello, my btrfs filesystem was not mountable anymore after a loss of power: kernel BUG at fs/btrfs/free-space-cache.c:1567! invalid opcode: [#1] SMP Modules linked in: btrfs libcrc32c xor zlib_deflate raid6_pq i915(+) i2c_algo_bit drm_kms_helper drm i2c_core video uinput CPU 3 Pid: 147, comm: mount Not tainted 3.9.1-301.fc19.x86_64 #1 LENOVO 4290W4H/4290W4H RIP: 0010:[a01c5984] [a01c5984] remove_from_bitmap+0x1f4/0x210 [btrfs] RSP: 0018:88020fddd698 EFLAGS: 00010287 RAX: 001454248000 RBX: 880211399180 RCX: 001454253000 RDX: 00013000 RSI: 51c0 RDI: 2e40 RBP: 88020fddd6e8 R08: 88020fecf5c8 R09: 00145940 R10: 8802124ce240 R11: 880211399198 R12: 880210025240 R13: 88020fddd708 R14: 88020fddd700 R15: 8802113991a8 FS: 7f019f213880() GS:88021e2c() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 7f4cdd4223f0 CR3: 00020ff03000 CR4: 000407e0 DR0: DR1: DR2: DR3: DR6: 0ff0 DR7: 0400 Process mount (pid: 147, threadinfo 88020fddc000, task 8802100b4650) Stack: 88020fddd6e8 00145940 880210025264 001454253000 00013000 880210025240 880210025264 880210025248 88020ff4ec00 88020fddd738 a01c714b Call Trace: [a01c714b] btrfs_remove_free_space+0x5b/0x290 [btrfs] [810810e0] ? wake_up_bit+0x30/0x30 [a0176bff] btrfs_alloc_logged_file_extent+0x1af/0x1d0 [btrfs] [a0162e86] ? btrfs_free_path+0x26/0x30 [btrfs] [a01c1ef7] replay_one_extent+0x607/0x690 [btrfs] [a01c224b] replay_one_buffer+0x2cb/0x390 [btrfs] [a01a77b4] ? alloc_extent_buffer+0x94/0x3f0 [btrfs] [a01bd484] walk_down_log_tree+0x224/0x410 [btrfs] [a01bd95f] walk_log_tree+0xbf/0x1f0 [btrfs] [a01c43c1] btrfs_recover_log_trees+0x1f1/0x360 [btrfs] [a01c1f80] ? replay_one_extent+0x690/0x690 [btrfs] [a0186a1b] open_ctree+0x18eb/0x1f90 [btrfs] [812dac54] ? disk_name+0x54/0xb0 [a015f94e] btrfs_mount+0x5ce/0x6c0 [btrfs] [8129525d] ? selinux_sb_copy_data+0x14d/0x220 [8119d1a9] mount_fs+0x39/0x1b0 [811b635f] vfs_kern_mount+0x5f/0xf0 [a015f4fc] btrfs_mount+0x17c/0x6c0 [btrfs] [8129525d] ? selinux_sb_copy_data+0x14d/0x220 [8119d1a9] mount_fs+0x39/0x1b0 [811b635f] vfs_kern_mount+0x5f/0xf0 [811b862e] do_mount+0x23e/0xa20 [811b8276] ? copy_mount_options+0x36/0x170 [811b8e93] sys_mount+0x83/0xc0 [8164db19] system_call_fastpath+0x16/0x1b Code: 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 1f 40 00 31 c0 48 83 7b 20 00 75 e4 48 89 de 4c 89 e7 89 45 c0 e8 b1 ed ff ff 8b 45 c0 eb d1 0f 0b e8 69 a4 02 00 0f 1f 44 00 00 e8 59 a4 02 00 66 66 2e 0f RIP [a01c5984] remove_from_bitmap+0x1f4/0x210 [btrfs] RSP 88020fddd698 Mounting the fs with -o recovery,ro, and using btrfsck did not work. btrfsck segfaulted, but I can not provide the error message. btrfs-zero-log allowed me to mount the fs again. BR, Philipp Dreimann -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Looks to me like you should have been able to mount with -o clear_cache. Did you try that? Does it work again now? -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kernel BUG at fs/btrfs/free-space-cache.c:1567!, Kernel 3.9.1
On 15 May 2013 17:00, Harald Glatt m...@hachre.de wrote: On Wed, May 15, 2013 at 4:19 PM, Philipp Dreimann phil...@dreimann.net wrote: Hello, my btrfs filesystem was not mountable anymore after a loss of power: kernel BUG at fs/btrfs/free-space-cache.c:1567! invalid opcode: [#1] SMP Modules linked in: btrfs libcrc32c xor zlib_deflate raid6_pq i915(+) i2c_algo_bit drm_kms_helper drm i2c_core video uinput CPU 3 Pid: 147, comm: mount Not tainted 3.9.1-301.fc19.x86_64 #1 LENOVO 4290W4H/4290W4H RIP: 0010:[a01c5984] [a01c5984] remove_from_bitmap+0x1f4/0x210 [btrfs] RSP: 0018:88020fddd698 EFLAGS: 00010287 RAX: 001454248000 RBX: 880211399180 RCX: 001454253000 RDX: 00013000 RSI: 51c0 RDI: 2e40 RBP: 88020fddd6e8 R08: 88020fecf5c8 R09: 00145940 R10: 8802124ce240 R11: 880211399198 R12: 880210025240 R13: 88020fddd708 R14: 88020fddd700 R15: 8802113991a8 FS: 7f019f213880() GS:88021e2c() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 7f4cdd4223f0 CR3: 00020ff03000 CR4: 000407e0 DR0: DR1: DR2: DR3: DR6: 0ff0 DR7: 0400 Process mount (pid: 147, threadinfo 88020fddc000, task 8802100b4650) Stack: 88020fddd6e8 00145940 880210025264 001454253000 00013000 880210025240 880210025264 880210025248 88020ff4ec00 88020fddd738 a01c714b Call Trace: [a01c714b] btrfs_remove_free_space+0x5b/0x290 [btrfs] [810810e0] ? wake_up_bit+0x30/0x30 [a0176bff] btrfs_alloc_logged_file_extent+0x1af/0x1d0 [btrfs] [a0162e86] ? btrfs_free_path+0x26/0x30 [btrfs] [a01c1ef7] replay_one_extent+0x607/0x690 [btrfs] [a01c224b] replay_one_buffer+0x2cb/0x390 [btrfs] [a01a77b4] ? alloc_extent_buffer+0x94/0x3f0 [btrfs] [a01bd484] walk_down_log_tree+0x224/0x410 [btrfs] [a01bd95f] walk_log_tree+0xbf/0x1f0 [btrfs] [a01c43c1] btrfs_recover_log_trees+0x1f1/0x360 [btrfs] [a01c1f80] ? replay_one_extent+0x690/0x690 [btrfs] [a0186a1b] open_ctree+0x18eb/0x1f90 [btrfs] [812dac54] ? disk_name+0x54/0xb0 [a015f94e] btrfs_mount+0x5ce/0x6c0 [btrfs] [8129525d] ? selinux_sb_copy_data+0x14d/0x220 [8119d1a9] mount_fs+0x39/0x1b0 [811b635f] vfs_kern_mount+0x5f/0xf0 [a015f4fc] btrfs_mount+0x17c/0x6c0 [btrfs] [8129525d] ? selinux_sb_copy_data+0x14d/0x220 [8119d1a9] mount_fs+0x39/0x1b0 [811b635f] vfs_kern_mount+0x5f/0xf0 [811b862e] do_mount+0x23e/0xa20 [811b8276] ? copy_mount_options+0x36/0x170 [811b8e93] sys_mount+0x83/0xc0 [8164db19] system_call_fastpath+0x16/0x1b Code: 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 1f 40 00 31 c0 48 83 7b 20 00 75 e4 48 89 de 4c 89 e7 89 45 c0 e8 b1 ed ff ff 8b 45 c0 eb d1 0f 0b e8 69 a4 02 00 0f 1f 44 00 00 e8 59 a4 02 00 66 66 2e 0f RIP [a01c5984] remove_from_bitmap+0x1f4/0x210 [btrfs] RSP 88020fddd698 Mounting the fs with -o recovery,ro, and using btrfsck did not work. btrfsck segfaulted, but I can not provide the error message. btrfs-zero-log allowed me to mount the fs again. BR, Philipp Dreimann -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Looks to me like you should have been able to mount with -o clear_cache. Did you try that? Does it work again now? I did not try that. The filesystem works since a btrfs-zero-log execution again! -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 4/8] Btrfs: maintain subvolume items in the UUID tree
On Tue, 14 May 2013 18:44:11 +0800, Liu Bo wrote: On Tue, May 14, 2013 at 11:36:56AM +0200, Stefan Behrens wrote: @@ -396,7 +403,7 @@ static noinline int create_subvol(struct inode *dir, * of create_snapshot(). */ ret = btrfs_subvolume_reserve_metadata(root, block_rsv, - 7, qgroup_reserved); + 8, qgroup_reserved); if (ret) return ret; This uuid_root will not use trans-block_rsv but empty_rsv since it does not set ref_cow, so you don't need to add one more to block_rsv, and same for the below cases. Hi Liu Bo, Thanks for your review comments! You are right, this won't work because the empty_block_rsv is used for the UUID tree as it is now. I need to avoid ENOSPC in the middle of the transaction. Can you please acknowledge or comment the following addition to get_block_rsv()? And the plus 1 that I have added to the transaction reservations for subvolume/snapshot creation/destruction will work correctly afterwards. --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4259,6 +4259,9 @@ static struct btrfs_block_rsv *get_block_rsv( if (root == root-fs_info-csum_root trans-adding_csums) block_rsv = trans-block_rsv; + if (root == root-fs_info-uuid_root) + block_rsv = trans-block_rsv; + if (!block_rsv) block_rsv = root-block_rsv; @@ -567,9 +578,10 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, * 1 - root item * 2 - root ref/backref * 1 - root of snapshot + * 1 - UUID item */ ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)-root, -pending_snapshot-block_rsv, 7, +pending_snapshot-block_rsv, 8, pending_snapshot-qgroup_reserved); if (ret) goto out; -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: subvol copying
On May 15, 2013, at 1:40 AM, Gabriel de Perthuis g2p.c...@gmail.com wrote: You can move subvolumes at any time, as if they were regular directories. In the example case, the subvolumes are read-only. So is it possible to make a read-only subvolume (snapshot) read-writable? And is it possible to make a read-writeable snapshot of a read-only subvolume? Chris Murphy -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: subvol copying
On Wed, May 15, 2013 at 6:43 PM, Chris Murphy li...@colorremedies.com wrote: On May 15, 2013, at 1:40 AM, Gabriel de Perthuis g2p.c...@gmail.com wrote: You can move subvolumes at any time, as if they were regular directories. In the example case, the subvolumes are read-only. So is it possible to make a read-only subvolume (snapshot) read-writable? And is it possible to make a read-writeable snapshot of a read-only subvolume? Chris Murphy -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html You make a ro snapshot rw by creating a snapshot of it that is rw. So yes to both questions, by doing the same thing in both cases. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: subvol copying
On May 15, 2013, at 10:44 AM, Harald Glatt m...@hachre.de wrote: You make a ro snapshot rw by creating a snapshot of it that is rw. So yes to both questions, by doing the same thing in both cases. In other words, a normal snapshot (without -r) of a read-only snapshot will create a rw snapshot? In any case, for the OP, making a rw snapshot of the last backup ro snapshot, and putting it in place of the home folder is the way to do the rollback. Chris Murphy-- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: subvol copying
On Wed, May 15, 2013 at 7:28 PM, Chris Murphy li...@colorremedies.com wrote: On May 15, 2013, at 10:44 AM, Harald Glatt m...@hachre.de wrote: You make a ro snapshot rw by creating a snapshot of it that is rw. So yes to both questions, by doing the same thing in both cases. In other words, a normal snapshot (without -r) of a read-only snapshot will create a rw snapshot? Yes -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs-progs: add a newline to a free space cache message
Left out a newline in the generation check printf. Signed-off-by: Josef Bacik jba...@fusionio.com --- free-space-cache.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/free-space-cache.c b/free-space-cache.c index 5fb8ece..a30438b 100644 --- a/free-space-cache.c +++ b/free-space-cache.c @@ -314,7 +314,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct btrfs_inode_item); if (btrfs_inode_generation(leaf, inode_item) != generation) { printf(free space inode generation (%llu) did not match - free space cache generation (%llu), + free space cache generation (%llu)\n, (unsigned long long)btrfs_inode_generation(leaf, inode_item), (unsigned long long)generation); -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: I/O errors block the entire filesystem
On May 14, 2013, Liu Bo bo.li@oracle.com wrote: In one of the failures that caused machine load spikes, I tried to collect info on active processes with perf top and SysRq-T, but nothing there seemed to explain the spike. Thoughts on how to figure out what's causing this? Although I've seen your solution patch in this thread, I'm still curious about this senario, could you please share the reproducer script or something? I'm afraid I don't have one. I just use the filesystem on various disks, with ceph osds and other non-ceph subvolumes and files, and occasionally I run into one of these bad blocks and the filesystem gets into these odd states. I guess that you're using '-l 64k -n 64k' for mkfs.btrfs That is correct, but IIUC this should only affect metadata, and metadata recovery from the DUP block works. It's data (single copy) that fails as described. -- Alexandre Oliva, freedom fighterhttp://FSFLA.org/~lxoliva/ You must be the change you wish to see in the world. -- Gandhi Be Free! -- http://FSFLA.org/ FSF Latin America board member Free Software Evangelist Red Hat Brazil Compiler Engineer -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: I/O errors block the entire filesystem
On May 15, 2013, Josef Bacik jba...@fusionio.com wrote: So this should only happen in the case that you are on a dm device it looks like, is that how you are running? That was my first thought, but no, I'm using partitions out of the SATA disks directly. I even checked for stray dm out of fake raid or somesuch, but the dm modules were not even loaded, and perusing /sys/block confirms the “scsi” devices are actual ATA disks. Further investigation suggested that when individual 512-byte blocks are read from a disk (that's the block size reported by the kernel), the underlying disk driver is supposed to inform the upper layer about what it could read by updating the bio_vec bits in precisely the observed way. -- Alexandre Oliva, freedom fighterhttp://FSFLA.org/~lxoliva/ You must be the change you wish to see in the world. -- Gandhi Be Free! -- http://FSFLA.org/ FSF Latin America board member Free Software Evangelist Red Hat Brazil Compiler Engineer -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
Zach Brown z...@redhat.com wrote: This adds a syscall and vfs entry point for clone_range which offloads data copying between existing files. The syscall is a thin wrapper around the vfs entry point. Its arguments are inspired by sys_splice(). Why introduce a new syscall instead of extending sys_splice? Perhaps adding a new SPLICE_F_COPYRANGE flag to avoid compatibility problems with userspace which may expect ESPIPE when given two non-pipes would be useful. If the user doesn't need a out offset, then sendfile() should also be able to transparently utilize COPY/CLONE_RANGE, too. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point
On Wed, May 15, 2013 at 07:44:05PM +, Eric Wong wrote: Why introduce a new syscall instead of extending sys_splice? Personally, I think it's ugly to have different operations use the same syscall just because their arguments match. But that preference aside, sure, if the consensus is that we'd rather use the splice() entry point then I can duck tape the pieces together to make it work. If the user doesn't need a out offset, then sendfile() should also be able to transparently utilize COPY/CLONE_RANGE, too. Perhaps, yeah. - z -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC v0 4/4] nfs, nfsd: rough sys_copy_range and COPY support
On Tue, May 14, 2013 at 02:15:26PM -0700, Zach Brown wrote: This crude patch illustrates the simplest plumbing involved in supporting sys_call_range with the NFS COPY operation that's pending in the 4.2 draft spec. The patch is based on a previous prototype that used the COPY op to implement sys_copyfileat which created a new file (based on the ocfs2 reflink ioctl). By contrast, this copies file contents between existing files. There's still a lot of implementation and testing to do, but this can get discussion going. I'm using: git://github.com/loghyr/NFSv4.2 as my reference for the draft protocol. On a quick skim, one thing this is missing before it complies is a client implementation of CB_OFFLOAD: If a client desires an intra-server file copy, then it MUST support the COPY and CB_OFFLOAD operations. The server doesn't have to implement CB_OFFLOAD, though, so we should ditch these todo's: +/* + * XXX: + * - do something with stateids :) + * - implement callback results and OFFLOAD_ABORT + * - inter-server copies? + */ ... + /* don't support async callbacks yet */ ... lest someone go try to implement them for no reason. (Stranger things have happened.) Nits, possibly to ignore for now: + copy-u.ok.cr_callback_id_length = 0; + + return status; +} + /* This routine never returns NFS_OK! If there are no other errors, it * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the * attributes matched. VERIFY is implemented by mapping NFSERR_SAME @@ -1798,6 +1829,10 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, + [OP_COPY] = { + .op_func = (nfsd4op_func)nfsd4_copy, + .op_name = OP_COPY, + }, There's some more boilerplate to fill in (see other ops). +static __be32 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) { return nfs_ok; @@ -1557,6 +1577,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_WANT_DELEGATION]= (nfsd4_dec)nfsd4_decode_notsupp, [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, + [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy, And this should be made 4.2-specific. }; struct nfsd4_minorversion_ops { @@ -3394,6 +3415,27 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 +nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_copy *copy) +{ + __be32 *p; + + if (!nfserr) { + RESERVE_SPACE(4); + WRITE32(copy-u.ok.cr_callback_id_length); + ADJUST_ARGS(); + if (copy-u.ok.cr_callback_id_length == 1) + nfsd4_encode_stateid(resp, copy-u.ok.cr_callback_id); + } else { + RESERVE_SPACE(8); + WRITE64(copy-u.cr_bytes_copied); + ADJUST_ARGS(); + } + + return nfserr; +} + +static __be32 nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) { return nfserr; @@ -3465,6 +3507,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_WANT_DELEGATION]= (nfsd4_enc)nfsd4_encode_noop, [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy, }; /* diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 84ce601..0c1b427 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -28,6 +28,8 @@ #include asm/uaccess.h #include linux/exportfs.h #include linux/writeback.h +#include linux/fs_struct.h +#include linux/kmod.h #ifdef CONFIG_NFSD_V3 #include xdr3.h @@ -621,6 +623,45 @@ int nfsd4_is_junction(struct dentry *dentry) return 0; return 1; } + +__be32 +nfsd_copy_range(struct svc_rqst *rqstp, struct svc_fh *fhp_in, u64 pos_in, + struct svc_fh *fhp_out, u64 pos_out, u64 count) +{ + struct file *filp_in = NULL; + struct file *filp_out = NULL; + int err; + + /* XXX verify pos and count within sane limits? */ + + err = nfsd_open(rqstp, fhp_in, S_IFREG, NFSD_MAY_READ, filp_in); + if (err) + goto out; + + err = nfsd_open(rqstp, fhp_out, S_IFREG, NFSD_MAY_WRITE, filp_out); + if (err) + goto out; Looking at the xdr... the COPY operation takes stateid's, which nfsd can use to look up files, so the opens shouldn't be required. --b. + + err = vfs_copy_range(filp_in, pos_in, filp_out, pos_out, count); + /* fall back if .copy_range isn't supported */ + + if (!err EX_ISSYNC(fhp_out-fh_export)) + err = vfs_fsync_range(filp_out, pos_out, pos_out + count-1,
Re: [RFC v0 4/4] nfs, nfsd: rough sys_copy_range and COPY support
On Wed, 2013-05-15 at 16:19 -0400, J. Bruce Fields wrote: On Tue, May 14, 2013 at 02:15:26PM -0700, Zach Brown wrote: This crude patch illustrates the simplest plumbing involved in supporting sys_call_range with the NFS COPY operation that's pending in the 4.2 draft spec. The patch is based on a previous prototype that used the COPY op to implement sys_copyfileat which created a new file (based on the ocfs2 reflink ioctl). By contrast, this copies file contents between existing files. There's still a lot of implementation and testing to do, but this can get discussion going. I'm using: git://github.com/loghyr/NFSv4.2 as my reference for the draft protocol. On a quick skim, one thing this is missing before it complies is a client implementation of CB_OFFLOAD: If a client desires an intra-server file copy, then it MUST support the COPY and CB_OFFLOAD operations. Note that Bryan is currently working on updating the NFS implementation to match the draft protocol. -- Trond Myklebust Linux NFS client maintainer NetApp trond.mykleb...@netapp.com www.netapp.com -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC v0 4/4] nfs, nfsd: rough sys_copy_range and COPY support
On Wed, May 15, 2013 at 08:21:54PM +, Myklebust, Trond wrote: On Wed, 2013-05-15 at 16:19 -0400, J. Bruce Fields wrote: On Tue, May 14, 2013 at 02:15:26PM -0700, Zach Brown wrote: This crude patch illustrates the simplest plumbing involved in supporting sys_call_range with the NFS COPY operation that's pending in the 4.2 draft spec. The patch is based on a previous prototype that used the COPY op to implement sys_copyfileat which created a new file (based on the ocfs2 reflink ioctl). By contrast, this copies file contents between existing files. There's still a lot of implementation and testing to do, but this can get discussion going. I'm using: git://github.com/loghyr/NFSv4.2 as my reference for the draft protocol. On a quick skim, one thing this is missing before it complies is a client implementation of CB_OFFLOAD: If a client desires an intra-server file copy, then it MUST support the COPY and CB_OFFLOAD operations. Note that Bryan is currently working on updating the NFS implementation to match the draft protocol. OK, good.--b. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] btrfs-progs: restore: use long option for the path regex
Current way of specifying the path to match is not very comfortable, but the feature itself is very useful. Let's save the short option -m for a more user friendly syntax and keep a long option --path-regex with the current syntax. CC: Peter Stuge pe...@stuge.se Signed-off-by: David Sterba dste...@suse.cz --- cmds-restore.c | 17 +++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cmds-restore.c b/cmds-restore.c index 52852bc..8a4f75a 100644 --- a/cmds-restore.c +++ b/cmds-restore.c @@ -32,6 +32,7 @@ #include lzo/lzo1x.h #include zlib.h #include regex.h +#include getopt.h #include ctree.h #include disk-io.h @@ -924,6 +925,11 @@ out: return ret; } +static struct option long_options[] = { + { path-regex, 1, NULL, 256}, + { 0, 0, 0, 0} +}; + const char * const cmd_restore_usage[] = { btrfs restore [options] device, Try to restore files from a damaged filesystem (unmounted), @@ -936,6 +942,10 @@ const char * const cmd_restore_usage[] = { -f offset filesystem location, -u block super mirror, -d find dir, + --path-regex regex, + restore only filenames matching regex,, + you have to use following syntax (possibly quoted):, + ^/(|home(|/username(|/Desktop(|/.*$, NULL }; @@ -950,6 +960,7 @@ int cmd_restore(int argc, char **argv) int len; int ret; int opt; + int option_index = 0; int super_mirror = 0; int find_dir = 0; int list_roots = 0; @@ -958,7 +969,8 @@ int cmd_restore(int argc, char **argv) regex_t match_reg, *mreg = NULL; char reg_err[256]; - while ((opt = getopt(argc, argv, sviot:u:df:r:lcm:)) != -1) { + while ((opt = getopt_long(argc, argv, sviot:u:df:r:lc, long_options, + option_index)) != -1) { switch (opt) { case 's': @@ -1016,7 +1028,8 @@ int cmd_restore(int argc, char **argv) case 'c': match_cflags |= REG_ICASE; break; - case 'm': + /* long option without single letter alternative */ + case 256: match_regstr = optarg; break; default: -- 1.8.2 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 4/8] Btrfs: maintain subvolume items in the UUID tree
On Wed, May 15, 2013 at 05:39:35PM +0200, Stefan Behrens wrote: On Tue, 14 May 2013 18:44:11 +0800, Liu Bo wrote: On Tue, May 14, 2013 at 11:36:56AM +0200, Stefan Behrens wrote: @@ -396,7 +403,7 @@ static noinline int create_subvol(struct inode *dir, * of create_snapshot(). */ ret = btrfs_subvolume_reserve_metadata(root, block_rsv, - 7, qgroup_reserved); + 8, qgroup_reserved); if (ret) return ret; This uuid_root will not use trans-block_rsv but empty_rsv since it does not set ref_cow, so you don't need to add one more to block_rsv, and same for the below cases. Hi Liu Bo, Thanks for your review comments! You are right, this won't work because the empty_block_rsv is used for the UUID tree as it is now. I need to avoid ENOSPC in the middle of the transaction. Can you please acknowledge or comment the following addition to get_block_rsv()? And the plus 1 that I have added to the transaction reservations for subvolume/snapshot creation/destruction will work correctly afterwards. Looks good. (Although I'm more looking forward to Josef's metadata enospc killer patch, that way we won't ever need to deal with all of this buggy stuff ;)) thanks, liubo --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4259,6 +4259,9 @@ static struct btrfs_block_rsv *get_block_rsv( if (root == root-fs_info-csum_root trans-adding_csums) block_rsv = trans-block_rsv; + if (root == root-fs_info-uuid_root) + block_rsv = trans-block_rsv; + if (!block_rsv) block_rsv = root-block_rsv; @@ -567,9 +578,10 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, * 1 - root item * 2 - root ref/backref * 1 - root of snapshot + * 1 - UUID item */ ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)-root, - pending_snapshot-block_rsv, 7, + pending_snapshot-block_rsv, 8, pending_snapshot-qgroup_reserved); if (ret) goto out; -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/17] Btrfs: just flush the delalloc inodes in the source tree before snapshot creation
On Wed, May 15, 2013 at 03:48:24PM +0800, Miao Xie wrote: Before applying this patch, we need flush all the delalloc inodes in the fs when we want to create a snapshot, it wastes time, and make the transaction commit be blocked for a long time. It means some other user operation would also be blocked for a long time. This patch improves this problem, we just flush the delalloc inodes that in the source trees before snapshot creation, so the transaction commit will complete quickly. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c | 6 ++ fs/btrfs/transaction.c | 10 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0de4a2f..2677dcc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!root-ref_cows) return -EINVAL; + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) + return ret; + + btrfs_wait_ordered_extents(root, 0); + Does this look too radical? Does this snapshot creation ioctl block all writes on its src root? No, we can only be sure that there is no ordered extents being added until setting trans_no_join, and then it's safe to create pending snapshots. thanks, liubo pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) return -ENOMEM; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2b17213..bc22be9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1491,17 +1491,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); - int snap_pending = 0; int ret; - if (!flush_on_commit) { - spin_lock(root-fs_info-trans_lock); - if (!list_empty(trans-transaction-pending_snapshots)) - snap_pending = 1; - spin_unlock(root-fs_info-trans_lock); - } - - if (flush_on_commit || snap_pending) { + if (flush_on_commit) { ret = btrfs_start_all_delalloc_inodes(root-fs_info, 1); if (ret) return ret; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree
On Wed, May 15, 2013 at 03:48:20PM +0800, Miao Xie wrote: The grab/put funtions will be used in the next patch, which need grab the root object and ensure it is not freed. We use reference counter instead of the srcu lock is to aovid blocking the memory reclaim task, which invokes synchronize_srcu(). I don't think this is necessary, we put 'kfree(root)' because we really need to free them at the very end time, when there should be no inodes linking on the root(we should have cleaned all inodes out from it). So when we flush delalloc inodes and wait for ordered extents to finish, the root should be valid, otherwise someone is doing wrong things. And even with this grab_fs_root to avoid freeing root, it's just the root that remains in memory, all its attributes, like root-node, commit_root, root-inode_tree, are already NULL or empty. thanks, liubo Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/disk-io.h | 21 + fs/btrfs/extent-tree.c | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 845b77f..958ce6c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1739,6 +1739,7 @@ struct btrfs_root { int force_cow; spinlock_t root_item_lock; + atomic_t refs; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42d6ba2..642c861 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(root-log_writers, 0); atomic_set(root-log_batch, 0); atomic_set(root-orphan_inodes, 0); + atomic_set(root-refs, 1); root-log_transid = 0; root-last_log_commit = 0; extent_io_tree_init(root-dirty_log_pages, @@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) } else { free_extent_buffer(gang[0]-node); free_extent_buffer(gang[0]-commit_root); - kfree(gang[0]); + btrfs_put_fs_root(gang[0]); } } @@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root) kfree(root-free_ino_ctl); kfree(root-free_ino_pinned); kfree(root-name); - kfree(root); + btrfs_put_fs_root(root); } void btrfs_free_fs_root(struct btrfs_root *root) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 534d583..b71acd6e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_root *root); + +/* + * This function is used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. + * + * If you want to ensure the whole tree is safe, you should use + * fs_info-subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ + if (atomic_inc_not_zero(root-refs)) + return root; + return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ + if (atomic_dec_and_test(root-refs)) + kfree(root); +} + void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 08e42c8..08f9862 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7463,7 +7463,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } else { free_extent_buffer(root-node); free_extent_buffer(root-commit_root); - kfree(root); + btrfs_put_fs_root(root); } out_end_trans: btrfs_end_transaction_throttle(trans, tree_root); -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/17] Btrfs: just flush the delalloc inodes in the source tree before snapshot creation
On Thu, 16 May 2013 11:20:39 +0800, Liu Bo wrote: On Wed, May 15, 2013 at 03:48:24PM +0800, Miao Xie wrote: Before applying this patch, we need flush all the delalloc inodes in the fs when we want to create a snapshot, it wastes time, and make the transaction commit be blocked for a long time. It means some other user operation would also be blocked for a long time. This patch improves this problem, we just flush the delalloc inodes that in the source trees before snapshot creation, so the transaction commit will complete quickly. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c | 6 ++ fs/btrfs/transaction.c | 10 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0de4a2f..2677dcc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!root-ref_cows) return -EINVAL; +ret = btrfs_start_delalloc_inodes(root, 0); +if (ret) +return ret; + +btrfs_wait_ordered_extents(root, 0); + Does this look too radical? Does this snapshot creation ioctl block all writes on its src root? I don't think it is radical, and I think flushing delalloc inodes during the transaction commit is stupid, especially flushing all the inodes including the roots which are not going to be snapshoted. Because it will block the operations of the users (such as mkdir, rmdir, create and so on) for a long time if there are lots of dirty pages. And The snapshot creation now doesn't block the writes of the source root at all, there is no appreciable difference between this way and the background flusher. No, we can only be sure that there is no ordered extents being added until setting trans_no_join, and then it's safe to create pending snapshots. Actually, we can not avoid that the new ordered extents are added before trans_no_join is set. But for the users, the 1st case below must be handled correctly, but the 2nd one can be ignored because we can see the write of the 2nd case as the one that happens after the snapshot creation. 1st case: Task write data into a file make a snapshot 2nd case: Task0 Task1 make a snapshot flush delalloc inodes write data into a file commit transaction create_pending_snapshot Thanks Miao thanks, liubo pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) return -ENOMEM; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2b17213..bc22be9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1491,17 +1491,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); -int snap_pending = 0; int ret; -if (!flush_on_commit) { -spin_lock(root-fs_info-trans_lock); -if (!list_empty(trans-transaction-pending_snapshots)) -snap_pending = 1; -spin_unlock(root-fs_info-trans_lock); -} - -if (flush_on_commit || snap_pending) { +if (flush_on_commit) { ret = btrfs_start_all_delalloc_inodes(root-fs_info, 1); if (ret) return ret; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree
On thu, 16 May 2013 11:36:46 +0800, Liu Bo wrote: On Wed, May 15, 2013 at 03:48:20PM +0800, Miao Xie wrote: The grab/put funtions will be used in the next patch, which need grab the root object and ensure it is not freed. We use reference counter instead of the srcu lock is to aovid blocking the memory reclaim task, which invokes synchronize_srcu(). I don't think this is necessary, we put 'kfree(root)' because we really need to free them at the very end time, when there should be no inodes linking on the root(we should have cleaned all inodes out from it). So when we flush delalloc inodes and wait for ordered extents to finish, the root should be valid, otherwise someone is doing wrong things. And even with this grab_fs_root to avoid freeing root, it's just the root that remains in memory, all its attributes, like root-node, commit_root, root-inode_tree, are already NULL or empty. Please consider the case: Task1 Task2 Cleaner get the root flush all delalloc inodes drop subvolume iput the last inode move the root into the dead list drop subvolume kfree(root) If Task1 accesses the root now, oops will happen. I introduced there two functions just to protect the access of the root object, not its attributes, so don't worry about the attributes. (Please see the first sentence of the changelog.) Thanks Miao thanks, liubo Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/disk-io.h | 21 + fs/btrfs/extent-tree.c | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 845b77f..958ce6c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1739,6 +1739,7 @@ struct btrfs_root { int force_cow; spinlock_t root_item_lock; +atomic_t refs; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42d6ba2..642c861 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(root-log_writers, 0); atomic_set(root-log_batch, 0); atomic_set(root-orphan_inodes, 0); +atomic_set(root-refs, 1); root-log_transid = 0; root-last_log_commit = 0; extent_io_tree_init(root-dirty_log_pages, @@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) } else { free_extent_buffer(gang[0]-node); free_extent_buffer(gang[0]-commit_root); -kfree(gang[0]); +btrfs_put_fs_root(gang[0]); } } @@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root) kfree(root-free_ino_ctl); kfree(root-free_ino_pinned); kfree(root-name); -kfree(root); +btrfs_put_fs_root(root); } void btrfs_free_fs_root(struct btrfs_root *root) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 534d583..b71acd6e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_root *root); + +/* + * This function is used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. + * + * If you want to ensure the whole tree is safe, you should use + * fs_info-subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ +if (atomic_inc_not_zero(root-refs)) +return root; +return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ +if (atomic_dec_and_test(root-refs)) +kfree(root); +} + void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 08e42c8..08f9862 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7463,7 +7463,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } else { free_extent_buffer(root-node); free_extent_buffer(root-commit_root); -kfree(root); +btrfs_put_fs_root(root); } out_end_trans: btrfs_end_transaction_throttle(trans, tree_root);
Re: [PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree
On Thu, May 16, 2013 at 12:31:11PM +0800, Miao Xie wrote: On thu, 16 May 2013 11:36:46 +0800, Liu Bo wrote: On Wed, May 15, 2013 at 03:48:20PM +0800, Miao Xie wrote: The grab/put funtions will be used in the next patch, which need grab the root object and ensure it is not freed. We use reference counter instead of the srcu lock is to aovid blocking the memory reclaim task, which invokes synchronize_srcu(). I don't think this is necessary, we put 'kfree(root)' because we really need to free them at the very end time, when there should be no inodes linking on the root(we should have cleaned all inodes out from it). So when we flush delalloc inodes and wait for ordered extents to finish, the root should be valid, otherwise someone is doing wrong things. And even with this grab_fs_root to avoid freeing root, it's just the root that remains in memory, all its attributes, like root-node, commit_root, root-inode_tree, are already NULL or empty. Please consider the case: Task1 Task2 Cleaner get the root flush all delalloc inodes drop subvolume iput the last inode move the root into the dead list drop subvolume kfree(root) If Task1 accesses the root now, oops will happen. Then it's task1's fault, why it is not protected by subvol_srcu section when it's possible that someone like task2 sets root's refs to 0? synchronize_srcu(subvol_srcu) before adding root into dead root list is just for this race case, why do we need another? thanks, liubo I introduced there two functions just to protect the access of the root object, not its attributes, so don't worry about the attributes. (Please see the first sentence of the changelog.) Thanks Miao thanks, liubo Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/disk-io.h | 21 + fs/btrfs/extent-tree.c | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 845b77f..958ce6c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1739,6 +1739,7 @@ struct btrfs_root { int force_cow; spinlock_t root_item_lock; + atomic_t refs; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42d6ba2..642c861 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(root-log_writers, 0); atomic_set(root-log_batch, 0); atomic_set(root-orphan_inodes, 0); + atomic_set(root-refs, 1); root-log_transid = 0; root-last_log_commit = 0; extent_io_tree_init(root-dirty_log_pages, @@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) } else { free_extent_buffer(gang[0]-node); free_extent_buffer(gang[0]-commit_root); - kfree(gang[0]); + btrfs_put_fs_root(gang[0]); } } @@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root) kfree(root-free_ino_ctl); kfree(root-free_ino_pinned); kfree(root-name); - kfree(root); + btrfs_put_fs_root(root); } void btrfs_free_fs_root(struct btrfs_root *root) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 534d583..b71acd6e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_root *root); + +/* + * This function is used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. + * + * If you want to ensure the whole tree is safe, you should use + *fs_info-subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ + if (atomic_inc_not_zero(root-refs)) + return root; + return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ + if (atomic_dec_and_test(root-refs)) + kfree(root); +} + void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index
Re: [PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree
On Thu, 16 May 2013 13:15:57 +0800, Liu Bo wrote: On Thu, May 16, 2013 at 12:31:11PM +0800, Miao Xie wrote: On thu, 16 May 2013 11:36:46 +0800, Liu Bo wrote: On Wed, May 15, 2013 at 03:48:20PM +0800, Miao Xie wrote: The grab/put funtions will be used in the next patch, which need grab the root object and ensure it is not freed. We use reference counter instead of the srcu lock is to aovid blocking the memory reclaim task, which invokes synchronize_srcu(). I don't think this is necessary, we put 'kfree(root)' because we really need to free them at the very end time, when there should be no inodes linking on the root(we should have cleaned all inodes out from it). So when we flush delalloc inodes and wait for ordered extents to finish, the root should be valid, otherwise someone is doing wrong things. And even with this grab_fs_root to avoid freeing root, it's just the root that remains in memory, all its attributes, like root-node, commit_root, root-inode_tree, are already NULL or empty. Please consider the case: Task1 Task2 Cleaner get the root flush all delalloc inodes drop subvolume iput the last inode move the root into the dead list drop subvolume kfree(root) If Task1 accesses the root now, oops will happen. Then it's task1's fault, why it is not protected by subvol_srcu section when it's possible that someone like task2 sets root's refs to 0? synchronize_srcu(subvol_srcu) before adding root into dead root list is just for this race case, why do we need another? Please read my changelog. Miao thanks, liubo I introduced there two functions just to protect the access of the root object, not its attributes, so don't worry about the attributes. (Please see the first sentence of the changelog.) Thanks Miao thanks, liubo Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/disk-io.h | 21 + fs/btrfs/extent-tree.c | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 845b77f..958ce6c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1739,6 +1739,7 @@ struct btrfs_root { int force_cow; spinlock_t root_item_lock; + atomic_t refs; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42d6ba2..642c861 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(root-log_writers, 0); atomic_set(root-log_batch, 0); atomic_set(root-orphan_inodes, 0); + atomic_set(root-refs, 1); root-log_transid = 0; root-last_log_commit = 0; extent_io_tree_init(root-dirty_log_pages, @@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) } else { free_extent_buffer(gang[0]-node); free_extent_buffer(gang[0]-commit_root); - kfree(gang[0]); + btrfs_put_fs_root(gang[0]); } } @@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root) kfree(root-free_ino_ctl); kfree(root-free_ino_pinned); kfree(root-name); - kfree(root); + btrfs_put_fs_root(root); } void btrfs_free_fs_root(struct btrfs_root *root) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 534d583..b71acd6e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_root *root); + +/* + * This function is used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. + * + * If you want to ensure the whole tree is safe, you should use + *fs_info-subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ + if (atomic_inc_not_zero(root-refs)) + return root; + return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ + if (atomic_dec_and_test(root-refs)) + kfree(root); +} + void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 08e42c8..08f9862 100644 ---