[PATCH 13/17] Btrfs: don't wait for all the writers circularly during the transaction commit
btrfs_commit_transaction has the following loop before we commit the transaction. do { // attempt to do some useful stuff and/or sleep } while (atomic_read(cur_trans-num_writers) 1 || (should_grow cur_trans-num_joined != joined)); This is used to prevent from the TRANS_START to get in the way of a committing transaction. But it does not prevent from TRANS_JOIN, that is we would do this loop for a long time if some writers JOIN the current transaction endlessly. Because we need join the current transaction to do some useful stuff, we can not block TRANS_JOIN here. So we introduce a external writer counter, which is used to count the TRANS_USERSPACE/TRANS_START writers. If the external writer counter is zero, we can break the above loop. In order to make the code more clear, we don't use enum variant to define the type of the transaction handle, use bitmask instead. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 55 ++ fs/btrfs/transaction.h | 31 2 files changed, 65 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index cf8706c..fd319b2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -51,17 +51,41 @@ static noinline void switch_commit_root(struct btrfs_root *root) } static inline int can_join_transaction(struct btrfs_transaction *trans, - int type) + unsigned int type) { return !(trans-in_commit -type != TRANS_JOIN -type != TRANS_JOIN_NOLOCK); +(type TRANS_EXTWRITERS)); +} + +static inline void extwriter_counter_inc(struct btrfs_transaction *trans, +unsigned int type) +{ + if (type TRANS_EXTWRITERS) + atomic_inc(trans-num_extwriters); +} + +static inline void extwriter_counter_dec(struct btrfs_transaction *trans, +unsigned int type) +{ + if (type TRANS_EXTWRITERS) + atomic_dec(trans-num_extwriters); +} + +static inline void extwriter_counter_init(struct btrfs_transaction *trans, + unsigned int type) +{ + atomic_set(trans-num_extwriters, ((type TRANS_EXTWRITERS) ? 
1 : 0)); +} + +static inline int extwriter_counter_read(struct btrfs_transaction *trans) +{ + return atomic_read(trans-num_extwriters); } /* * either allocate a new transaction or hop into the existing one */ -static noinline int join_transaction(struct btrfs_root *root, int type) +static noinline int join_transaction(struct btrfs_root *root, unsigned int type) { struct btrfs_transaction *cur_trans; struct btrfs_fs_info *fs_info = root-fs_info; @@ -99,6 +123,7 @@ loop: } atomic_inc(cur_trans-use_count); atomic_inc(cur_trans-num_writers); + extwriter_counter_inc(cur_trans, type); cur_trans-num_joined++; spin_unlock(fs_info-trans_lock); return 0; @@ -131,6 +156,7 @@ loop: } atomic_set(cur_trans-num_writers, 1); + extwriter_counter_init(cur_trans, type); cur_trans-num_joined = 0; init_waitqueue_head(cur_trans-writer_wait); init_waitqueue_head(cur_trans-commit_wait); @@ -307,7 +333,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type) } static struct btrfs_trans_handle * -start_transaction(struct btrfs_root *root, u64 num_items, int type, +start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, enum btrfs_reserve_flush_enum flush) { struct btrfs_trans_handle *h; @@ -320,7 +346,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, return ERR_PTR(-EROFS); if (current-journal_info) { - WARN_ON(type != TRANS_JOIN type != TRANS_JOIN_NOLOCK); + WARN_ON(type TRANS_EXTWRITERS); h = current-journal_info; h-use_count++; WARN_ON(h-use_count 2); @@ -366,7 +392,7 @@ again: * If we are ATTACH, it means we just want to catch the current * transaction and commit it, so we needn't do sb_start_intwrite(). 
*/ - if (type TRANS_JOIN_NOLOCK) + if (type __TRANS_FREEZABLE) sb_start_intwrite(root-fs_info-sb); if (may_wait_transaction(root, type)) @@ -429,7 +455,7 @@ got_it: return h; join_fail: - if (type TRANS_JOIN_NOLOCK) + if (type __TRANS_FREEZABLE) sb_end_intwrite(root-fs_info-sb); kmem_cache_free(btrfs_trans_handle_cachep, h); alloc_fail: @@ -677,12 +703,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } } - if (trans-type
[PATCH 17/17] Btrfs: make the state of the transaction more readable
We used 3 variants to track the state of the transaction, it was complex and wasted the memory space. Besides that, it was hard to understand that which types of the transaction handles should be blocked in each transaction state, so the developers often made mistakes. This patch improved the above problem. In this patch, we define 6 states for the transaction, enum btrfs_trans_state { TRANS_STATE_RUNNING = 0, TRANS_STATE_BLOCKED = 1, TRANS_STATE_COMMIT_START= 2, TRANS_STATE_COMMIT_DOING= 3, TRANS_STATE_UNBLOCKED = 4, TRANS_STATE_COMPLETED = 5, TRANS_STATE_MAX = 6, } and just use 1 variant to track those state. In order to make the blocked handle types for each state more clear, we introduce a array: unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE | __TRANS_START), [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE | __TRANS_START | __TRANS_ATTACH), [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE | __TRANS_START | __TRANS_ATTACH | __TRANS_JOIN), [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE | __TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK), [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE | __TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK), } it is very intuitionistic. Besides that, because we remove -in_commit in transaction structure, so the lock -commit_lock which was used to protect it is unnecessary, remove -commit_lock. 
Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 - fs/btrfs/disk-io.c | 36 ++-- fs/btrfs/transaction.c | 156 ++--- fs/btrfs/transaction.h | 16 +++-- 4 files changed, 114 insertions(+), 95 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a7e71ff..bf92302 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1496,7 +1496,6 @@ struct btrfs_fs_info { int closing; int log_root_recovering; int enospc_unlink; - int trans_no_join; u64 total_pinned; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6bb3f3d..530e3c0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1747,7 +1747,7 @@ static int transaction_kthread(void *arg) } now = get_seconds(); - if (!cur-blocked + if (cur-state TRANS_STATE_BLOCKED (now cur-start_time || now - cur-start_time 30)) { spin_unlock(root-fs_info-trans_lock); delay = HZ * 5; @@ -2183,7 +2183,6 @@ int open_ctree(struct super_block *sb, fs_info-max_inline = 8192 * 1024; fs_info-metadata_ratio = 0; fs_info-defrag_inodes = RB_ROOT; - fs_info-trans_no_join = 0; fs_info-free_chunk_space = 0; fs_info-tree_mod_log = RB_ROOT; @@ -3956,19 +3955,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_block_rsv_release(root, root-fs_info-trans_block_rsv, cur_trans-dirty_pages.dirty_bytes); - /* FIXME: cleanup wait for commit */ - cur_trans-in_commit = 1; - cur_trans-blocked = 1; + cur_trans-state = TRANS_STATE_COMMIT_START; wake_up(root-fs_info-transaction_blocked_wait); btrfs_evict_pending_snapshots(cur_trans); - cur_trans-blocked = 0; + cur_trans-state = TRANS_STATE_UNBLOCKED; wake_up(root-fs_info-transaction_wait); - cur_trans-commit_done = 1; - wake_up(cur_trans-commit_wait); - btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); @@ -3977,6 +3971,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_pinned_extent(root, root-fs_info-pinned_extents); + cur_trans-state =TRANS_STATE_COMPLETED; + 
wake_up(cur_trans-commit_wait); + /* memset(cur_trans, 0, sizeof(*cur_trans)); kmem_cache_free(btrfs_transaction_cachep, cur_trans); @@ -4004,25 +4001,23 @@ static int
[PATCH 12/17] Btrfs: remove the code for the impossible case in cleanup_transaction()
If the transaction is removed from the transaction list, it means the transaction has been committed successfully. So it is impossible to call cleanup_transaction(), otherwise there is something wrong with the code logic. Thus, we use BUG_ON() instead of the original handling. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bc22be9..cf8706c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1450,11 +1450,12 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->trans_lock); - if (list_empty(&cur_trans->list)) { - spin_unlock(&root->fs_info->trans_lock); - btrfs_end_transaction(trans, root); - return; - } + /* +* If the transaction is removed from the list, it means this +* transaction has been committed successfully, so it is impossible +* to call the cleanup function. +*/ + BUG_ON(list_empty(&cur_trans->list)); list_del_init(&cur_trans->list); if (cur_trans == root->fs_info->running_transaction) { -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/17] Btrfs: pause the space balance when remounting to R/O
Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a4807ce..f0857e0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1263,6 +1263,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); + btrfs_pause_balance(fs_info); ret = btrfs_commit_super(root); if (ret) -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 10/17] Btrfs: just flush the delalloc inodes in the source tree before snapshot creation
Before applying this patch, we need flush all the delalloc inodes in the fs when we want to create a snapshot, it wastes time, and make the transaction commit be blocked for a long time. It means some other user operation would also be blocked for a long time. This patch improves this problem, we just flush the delalloc inodes that in the source trees before snapshot creation, so the transaction commit will complete quickly. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c | 6 ++ fs/btrfs/transaction.c | 10 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0de4a2f..2677dcc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!root-ref_cows) return -EINVAL; + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) + return ret; + + btrfs_wait_ordered_extents(root, 0); + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) return -ENOMEM; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2b17213..bc22be9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1491,17 +1491,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); - int snap_pending = 0; int ret; - if (!flush_on_commit) { - spin_lock(root-fs_info-trans_lock); - if (!list_empty(trans-transaction-pending_snapshots)) - snap_pending = 1; - spin_unlock(root-fs_info-trans_lock); - } - - if (flush_on_commit || snap_pending) { + if (flush_on_commit) { ret = btrfs_start_all_delalloc_inodes(root-fs_info, 1); if (ret) return ret; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/17] Btrfs: introduce per-subvolume ordered extent list
The reason we introduce per-subvolume ordered extent list is the same as the per-subvolume delalloc inode list. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h| 25 --- fs/btrfs/dev-replace.c | 4 +- fs/btrfs/disk-io.c | 45 +++- fs/btrfs/extent-tree.c | 6 +-- fs/btrfs/inode.c| 4 +- fs/btrfs/ordered-data.c | 109 +--- fs/btrfs/ordered-data.h | 2 + fs/btrfs/relocation.c | 2 +- fs/btrfs/super.c| 2 +- fs/btrfs/transaction.c | 2 +- 10 files changed, 143 insertions(+), 58 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 067233f..a7e71ff 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1437,17 +1437,18 @@ struct btrfs_fs_info { atomic_t open_ioctl_trans; /* -* this is used by the balancing code to wait for all the pending -* ordered extents +* this is used to protect the following list -- ordered_roots. */ - spinlock_t ordered_extent_lock; + spinlock_t ordered_root_lock; /* -* all of the data=ordered extents pending writeback +* all fs/file tree roots in which there are data=ordered extents +* pending writeback are added into this list. +* * these can span multiple transactions and basically include * every dirty data page that isn't from nodatacow */ - struct list_head ordered_extents; + struct list_head ordered_roots; spinlock_t delalloc_root_lock; /* all fs/file tree roots that have delalloc inodes. 
*/ @@ -1746,6 +1747,20 @@ struct btrfs_root { struct list_head delalloc_inodes; struct list_head delalloc_root; u64 nr_delalloc_inodes; + /* +* this is used by the balancing code to wait for all the pending +* ordered extents +*/ + spinlock_t ordered_extent_lock; + + /* +* all of the data=ordered extents pending writeback +* these can span multiple transactions and basically include +* every dirty data page that isn't from nodatacow +*/ + struct list_head ordered_extents; + struct list_head ordered_root; + u64 nr_ordered_extents; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index aca77ad..4254da8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -395,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args-result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace); - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root-fs_info, 0); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -470,7 +470,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(dev_replace-lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root-fs_info, 0); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 13dddba..44d5a86 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1192,6 +1192,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root-last_trans = 0; root-highest_objectid = 0; root-nr_delalloc_inodes = 0; + root-nr_ordered_extents = 0; root-name = NULL; root-inode_tree = RB_ROOT; INIT_RADIX_TREE(root-delayed_nodes_tree, GFP_ATOMIC); @@ -1202,11 +1203,14 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, INIT_LIST_HEAD(root-root_list); INIT_LIST_HEAD(root-delalloc_inodes); 
INIT_LIST_HEAD(root-delalloc_root); + INIT_LIST_HEAD(root-ordered_extents); + INIT_LIST_HEAD(root-ordered_root); INIT_LIST_HEAD(root-logged_list[0]); INIT_LIST_HEAD(root-logged_list[1]); spin_lock_init(root-orphan_lock); spin_lock_init(root-inode_lock); spin_lock_init(root-delalloc_lock); + spin_lock_init(root-ordered_extent_lock); spin_lock_init(root-accounting_lock); spin_lock_init(root-log_extents_lock[0]); spin_lock_init(root-log_extents_lock[1]); @@ -2190,8 +2194,8 @@ int open_ctree(struct super_block *sb, fs_info-thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); - INIT_LIST_HEAD(fs_info-ordered_extents); - spin_lock_init(fs_info-ordered_extent_lock); + INIT_LIST_HEAD(fs_info-ordered_roots); + spin_lock_init(fs_info-ordered_root_lock); fs_info-delayed_root = kmalloc(sizeof(struct btrfs_delayed_root
[PATCH 11/17] Btrfs: cleanup unnecessary assignment when cleaning up all the residual transaction
When we umount a fs with serious errors, we will invoke btrfs_cleanup_transactions() to clean up the residual transaction. At this time, it is impossible to start a new transaction, so we needn't assign trans_no_join to 1, and also needn't clear running transaction every time we destroy a residual transaction. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 9 + 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 44d5a86..6bb3f3d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3992,7 +3992,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) spin_lock(&root->fs_info->trans_lock); list_splice_init(&root->fs_info->trans_list, &list); - root->fs_info->trans_no_join = 1; + root->fs_info->running_transaction = NULL; spin_unlock(&root->fs_info->trans_lock); while (!list_empty(&list)) { @@ -4028,10 +4028,6 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_all_delalloc_inodes(root->fs_info); - spin_lock(&root->fs_info->trans_lock); - root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->trans_lock); - btrfs_destroy_marked_extents(root, &t->dirty_pages, EXTENT_DIRTY); @@ -4044,9 +4040,6 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) kmem_cache_free(btrfs_transaction_cachep, t); } - spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 0; - spin_unlock(&root->fs_info->trans_lock); mutex_unlock(&root->fs_info->transaction_kthread_mutex); return 0; -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 04/17] Btrfs: remove BUG_ON() in btrfs_read_fs_root_no_radix()
We have checked if ->node is NULL or not, so it is unnecessary to use BUG_ON() to check again. Remove it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2a9ae38..8c1e4fb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1513,7 +1513,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, } root->commit_root = btrfs_root_node(root); - BUG_ON(!root->node); /* -ENOMEM */ out: if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { root->ref_cows = 1; -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 15/17] Btrfs: remove unnecessary variant ->num_joined in btrfs_transaction structure
We used -num_joined track if there were some writers which join the current transaction when the committer was sleeping. If some writers joined the current transaction, we has to continue the while loop to do some necessary stuff, such as flush the ordered operations. But it is unnecessary because we will do it after the while loop. Besides that, tracking -num_joined would make the committer drop into the while loop when there are lots of internal writers(TRANS_JOIN). So we remove -num_joined and don't track if there are some writers which join the current transaction when the committer is sleeping. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 8 +--- fs/btrfs/transaction.h | 2 -- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 265db57..75e7b15 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -124,7 +124,6 @@ loop: atomic_inc(cur_trans-use_count); atomic_inc(cur_trans-num_writers); extwriter_counter_inc(cur_trans, type); - cur_trans-num_joined++; spin_unlock(fs_info-trans_lock); return 0; } @@ -157,7 +156,6 @@ loop: atomic_set(cur_trans-num_writers, 1); extwriter_counter_init(cur_trans, type); - cur_trans-num_joined = 0; init_waitqueue_head(cur_trans-writer_wait); init_waitqueue_head(cur_trans-commit_wait); cur_trans-in_commit = 0; @@ -1566,7 +1564,6 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - unsigned long joined = 0; struct btrfs_transaction *cur_trans = trans-transaction; struct btrfs_transaction *prev_trans = NULL; DEFINE_WAIT(wait); @@ -1668,8 +1665,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, should_grow = 1; do { - joined = cur_trans-num_joined; - WARN_ON(cur_trans != trans-transaction); ret = btrfs_flush_all_pending_stuffs(trans, root); @@ -1685,8 +1680,7 @@ int btrfs_commit_transaction(struct 
btrfs_trans_handle *trans, schedule_timeout(1); finish_wait(cur_trans-writer_wait, wait); - } while (extwriter_counter_read(cur_trans) 0 || -(should_grow cur_trans-num_joined != joined)); + } while (extwriter_counter_read(cur_trans) 0); ret = btrfs_flush_all_pending_stuffs(trans, root); if (ret) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 5cc77b0..0fc45e2 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -37,8 +37,6 @@ struct btrfs_transaction { atomic_t num_writers; atomic_t use_count; - unsigned long num_joined; - spinlock_t commit_lock; int in_commit; int commit_done; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 00/17] improve the block time during the transaction commit
This patchset improve the problem that the transaction may be blocked for a long time when it is being committed if there is heavy I/O. In this patchset, - 0001-0005, 0007, 0011-0012 are random fix or code cleanup patch. - 0006, 0008-0010 introduce per-subvolume delalloc inode list and ordered extent list, which can reduce the flush time when we create snapshots. - 0013-0016 improve the block time during the transaction commit by removing the while loop at the beginning of the transaction commit. - 0017 improves the readability of the code. Miao Xie (17): Btrfs: fix accessing a freed tree root Btrfs: fix unprotected root node of the subvolume's inode rb-tree Btrfs: pause the space balance when remounting to R/O Btrfs: remove BUG_ON() in btrfs_read_fs_tree_no_radix() Btrfs: cleanup the similar code of the fs root read Btrfs: introduce grab/put functions for the root of the fs/file tree Btrfs: don't invoke btrfs_invalidate_inodes() in the spin lock context Btrfs: introduce per-subvolume delalloc inode list Btrfs: introduce per-subvolume ordered extent list Btrfs: just flush the delalloc inodes in the source tree before snapshot creation Btrfs: cleanup unnecessary assignment when cleaning up all the residual transaction Btrfs: remove the code for the impossible case in cleanup_transaction() Btrfs: don't wait for all the writers circularly during the transaction commit Btrfs: don't flush the delalloc inodes in the while loop if flushoncommit is set Btrfs: remove unnecessary varient -num_joined in btrfs_transaction structure Btrfs: remove the time check in btrfs_commit_transaction() Btrfs: make the state of the transaction more readable fs/btrfs/ctree.h| 55 +-- fs/btrfs/dev-replace.c | 6 +- fs/btrfs/disk-io.c | 425 +++- fs/btrfs/disk-io.h | 32 +++- fs/btrfs/extent-tree.c | 20 +-- fs/btrfs/inode.c| 180 ++-- fs/btrfs/ioctl.c| 6 + fs/btrfs/ordered-data.c | 109 + fs/btrfs/ordered-data.h | 2 + fs/btrfs/relocation.c | 9 +- fs/btrfs/root-tree.c| 170 ++- fs/btrfs/super.c| 3 +- 
fs/btrfs/transaction.c | 271 -- fs/btrfs/transaction.h | 49 -- fs/btrfs/tree-log.c | 3 +- fs/btrfs/volumes.c | 13 +- fs/btrfs/volumes.h | 1 + 17 files changed, 791 insertions(+), 563 deletions(-) -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree
The grab/put funtions will be used in the next patch, which need grab the root object and ensure it is not freed. We use reference counter instead of the srcu lock is to aovid blocking the memory reclaim task, which invokes synchronize_srcu(). Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/disk-io.h | 21 + fs/btrfs/extent-tree.c | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 845b77f..958ce6c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1739,6 +1739,7 @@ struct btrfs_root { int force_cow; spinlock_t root_item_lock; + atomic_t refs; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42d6ba2..642c861 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(root-log_writers, 0); atomic_set(root-log_batch, 0); atomic_set(root-orphan_inodes, 0); + atomic_set(root-refs, 1); root-log_transid = 0; root-last_log_commit = 0; extent_io_tree_init(root-dirty_log_pages, @@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) } else { free_extent_buffer(gang[0]-node); free_extent_buffer(gang[0]-commit_root); - kfree(gang[0]); + btrfs_put_fs_root(gang[0]); } } @@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root) kfree(root-free_ino_ctl); kfree(root-free_ino_pinned); kfree(root-name); - kfree(root); + btrfs_put_fs_root(root); } void btrfs_free_fs_root(struct btrfs_root *root) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 534d583..b71acd6e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_root *root); + +/* + * This function is 
used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. + * + * If you want to ensure the whole tree is safe, you should use + * fs_info-subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ + if (atomic_inc_not_zero(root-refs)) + return root; + return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ + if (atomic_dec_and_test(root-refs)) + kfree(root); +} + void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 08e42c8..08f9862 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7463,7 +7463,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } else { free_extent_buffer(root-node); free_extent_buffer(root-commit_root); - kfree(root); + btrfs_put_fs_root(root); } out_end_trans: btrfs_end_transaction_throttle(trans, tree_root); -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/17] Btrfs: just flush the delalloc inodes in the source tree before snapshot creation
On Thu, 16 May 2013 11:20:39 +0800, Liu Bo wrote: On Wed, May 15, 2013 at 03:48:24PM +0800, Miao Xie wrote: Before applying this patch, we need flush all the delalloc inodes in the fs when we want to create a snapshot, it wastes time, and make the transaction commit be blocked for a long time. It means some other user operation would also be blocked for a long time. This patch improves this problem, we just flush the delalloc inodes that in the source trees before snapshot creation, so the transaction commit will complete quickly. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c | 6 ++ fs/btrfs/transaction.c | 10 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0de4a2f..2677dcc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!root-ref_cows) return -EINVAL; +ret = btrfs_start_delalloc_inodes(root, 0); +if (ret) +return ret; + +btrfs_wait_ordered_extents(root, 0); + Does this look too radical? Does this snapshot creation ioctl block all writes on its src root? I don't think it is radical, and I think flushing delalloc inodes during the transaction commit is stupid, especially flushing all the inodes including the roots which are not going to be snapshoted. Because it will block the operations of the users (such as mkdir, rmdir, create and so on) for a long time if there are lots of dirty pages. And The snapshot creation now doesn't block the writes of the source root at all, there is no appreciable difference between this way and the background flusher. No, we can only be sure that there is no ordered extents being added until setting trans_no_join, and then it's safe to create pending snapshots. Actually, we can not avoid that the new ordered extents are added before trans_no_join is set. 
But for the users, the 1st case below must be handled correctly, but the 2nd one can be ignored because we can see the write of the 2nd case as the one that happens after the snapshot creation. 1st case: Task write data into a file make a snapshot 2nd case: Task0 Task1 make a snapshot flush delalloc inodes write data into a file commit transaction create_pending_snapshot Thanks Miao thanks, liubo pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) return -ENOMEM; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2b17213..bc22be9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1491,17 +1491,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); -int snap_pending = 0; int ret; -if (!flush_on_commit) { -spin_lock(root-fs_info-trans_lock); -if (!list_empty(trans-transaction-pending_snapshots)) -snap_pending = 1; -spin_unlock(root-fs_info-trans_lock); -} - -if (flush_on_commit || snap_pending) { +if (flush_on_commit) { ret = btrfs_start_all_delalloc_inodes(root-fs_info, 1); if (ret) return ret; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree
On thu, 16 May 2013 11:36:46 +0800, Liu Bo wrote: On Wed, May 15, 2013 at 03:48:20PM +0800, Miao Xie wrote: The grab/put funtions will be used in the next patch, which need grab the root object and ensure it is not freed. We use reference counter instead of the srcu lock is to aovid blocking the memory reclaim task, which invokes synchronize_srcu(). I don't think this is necessary, we put 'kfree(root)' because we really need to free them at the very end time, when there should be no inodes linking on the root(we should have cleaned all inodes out from it). So when we flush delalloc inodes and wait for ordered extents to finish, the root should be valid, otherwise someone is doing wrong things. And even with this grab_fs_root to avoid freeing root, it's just the root that remains in memory, all its attributes, like root-node, commit_root, root-inode_tree, are already NULL or empty. Please consider the case: Task1 Task2 Cleaner get the root flush all delalloc inodes drop subvolume iput the last inode move the root into the dead list drop subvolume kfree(root) If Task1 accesses the root now, oops will happen. I introduced there two functions just to protect the access of the root object, not its attributes, so don't worry about the attributes. (Please see the first sentence of the changelog.) 
Thanks Miao thanks, liubo Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/disk-io.h | 21 + fs/btrfs/extent-tree.c | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 845b77f..958ce6c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1739,6 +1739,7 @@ struct btrfs_root { int force_cow; spinlock_t root_item_lock; +atomic_t refs; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42d6ba2..642c861 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(root-log_writers, 0); atomic_set(root-log_batch, 0); atomic_set(root-orphan_inodes, 0); +atomic_set(root-refs, 1); root-log_transid = 0; root-last_log_commit = 0; extent_io_tree_init(root-dirty_log_pages, @@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) } else { free_extent_buffer(gang[0]-node); free_extent_buffer(gang[0]-commit_root); -kfree(gang[0]); +btrfs_put_fs_root(gang[0]); } } @@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root) kfree(root-free_ino_ctl); kfree(root-free_ino_pinned); kfree(root-name); -kfree(root); +btrfs_put_fs_root(root); } void btrfs_free_fs_root(struct btrfs_root *root) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 534d583..b71acd6e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_root *root); + +/* + * This function is used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. 
+ * + * If you want to ensure the whole tree is safe, you should use + * fs_info-subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ +if (atomic_inc_not_zero(root-refs)) +return root; +return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ +if (atomic_dec_and_test(root-refs)) +kfree(root); +} + void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 08e42c8..08f9862 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7463,7 +7463,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } else { free_extent_buffer(root-node); free_extent_buffer(root-commit_root); -kfree(root); +btrfs_put_fs_root(root); } out_end_trans: btrfs_end_transaction_throttle(trans, tree_root
Re: [PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree
On Thu, 16 May 2013 13:15:57 +0800, Liu Bo wrote: On Thu, May 16, 2013 at 12:31:11PM +0800, Miao Xie wrote: On thu, 16 May 2013 11:36:46 +0800, Liu Bo wrote: On Wed, May 15, 2013 at 03:48:20PM +0800, Miao Xie wrote: The grab/put funtions will be used in the next patch, which need grab the root object and ensure it is not freed. We use reference counter instead of the srcu lock is to aovid blocking the memory reclaim task, which invokes synchronize_srcu(). I don't think this is necessary, we put 'kfree(root)' because we really need to free them at the very end time, when there should be no inodes linking on the root(we should have cleaned all inodes out from it). So when we flush delalloc inodes and wait for ordered extents to finish, the root should be valid, otherwise someone is doing wrong things. And even with this grab_fs_root to avoid freeing root, it's just the root that remains in memory, all its attributes, like root-node, commit_root, root-inode_tree, are already NULL or empty. Please consider the case: Task1 Task2 Cleaner get the root flush all delalloc inodes drop subvolume iput the last inode move the root into the dead list drop subvolume kfree(root) If Task1 accesses the root now, oops will happen. Then it's task1's fault, why it is not protected by subvol_srcu section when it's possible that someone like task2 sets root's refs to 0? synchronize_srcu(subvol_srcu) before adding root into dead root list is just for this race case, why do we need another? Please read my changelog. Miao thanks, liubo I introduced there two functions just to protect the access of the root object, not its attributes, so don't worry about the attributes. (Please see the first sentence of the changelog.) 
Thanks Miao thanks, liubo Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/disk-io.h | 21 + fs/btrfs/extent-tree.c | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 845b77f..958ce6c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1739,6 +1739,7 @@ struct btrfs_root { int force_cow; spinlock_t root_item_lock; + atomic_t refs; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42d6ba2..642c861 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(root-log_writers, 0); atomic_set(root-log_batch, 0); atomic_set(root-orphan_inodes, 0); + atomic_set(root-refs, 1); root-log_transid = 0; root-last_log_commit = 0; extent_io_tree_init(root-dirty_log_pages, @@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) } else { free_extent_buffer(gang[0]-node); free_extent_buffer(gang[0]-commit_root); - kfree(gang[0]); + btrfs_put_fs_root(gang[0]); } } @@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root) kfree(root-free_ino_ctl); kfree(root-free_ino_pinned); kfree(root-name); - kfree(root); + btrfs_put_fs_root(root); } void btrfs_free_fs_root(struct btrfs_root *root) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 534d583..b71acd6e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_root *root); + +/* + * This function is used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. 
+ * + * If you want to ensure the whole tree is safe, you should use + *fs_info-subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ + if (atomic_inc_not_zero(root-refs)) + return root; + return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ + if (atomic_dec_and_test(root-refs)) + kfree(root); +} + void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 08e42c8..08f9862 100644 --- a/fs/btrfs
Re: [PATCH v3] btrfs: clean snapshots one by one
On tue, 7 May 2013 13:54:49 +0200, David Sterba wrote: On Mon, May 06, 2013 at 08:41:06PM -0400, Chris Mason wrote: diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 988b860..4de2351 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1690,15 +1690,19 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; do { + int again = 0; + if (!(root-fs_info-sb-s_flags MS_RDONLY) + down_read_trylock(root-fs_info-sb-s_umount) mutex_trylock(root-fs_info-cleaner_mutex)) { btrfs_run_delayed_iputs(root); - btrfs_clean_old_snapshots(root); + again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(root-fs_info-cleaner_mutex); btrfs_run_defrag_inodes(root-fs_info); + up_read(root-fs_info-sb-s_umount); Can we use just the cleaner mutex for this? We're deadlocking during 068 with autodefrag on because the cleaner is holding s_umount while autodefrag is trying to bump the writer count. I have now reproduced the deadlock and see where it's stuck. It did not happen with running 068 in a loop, but after interrupting the test. If unmount takes the cleaner mutex once it should wait long enough for the cleaner to stop. You mean removing s_umount from here completely? I'm not sure about other mis-interaction, eg with remount + autodefrag. Miao sent a patch for that case http://www.spinics.net/lists/linux-btrfs/msg16634.html (but it would not fix this deadlock). I have given up this patch and fix this problem by the other way. http://marc.info/?l=linux-btrfsm=136142833013628w=2 I think we need use s_umount here, all things we need do is to check R/O in cleaner_mutex. Or we may continue to delete the dead tree after the fs is remounted to be R/O. Thanks Miao I'm for keeping the clean-by-one patch for 3.10, we can fix other regressions during rc cycle. 
david -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/4] Btrfs: remove unnecessary -s_umount in cleaner_kthread()
In order to avoid the R/O remount, we acquired -s_umount lock during we deleted the dead snapshots and subvolumes. But it is unnecessary, because we have cleaner_mutex. We use cleaner_mutex to protect the process of the dead snapshots/subvolumes deletion. And when we remount the fs to be R/O, we also acquire this mutex to do cleanup after we change the status of the fs. That is this lock can serialize the above operations, the cleaner can be aware of the status of the fs, and if the cleaner is deleting the dead snapshots/subvolumes, the remount task will wait for it. So it is safe to remove -s_umount in cleaner_kthread(). Cc: David Sterba dste...@suse.cz Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 40 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a9df562..cb2bfd1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1676,24 +1676,40 @@ static void end_workqueue_fn(struct btrfs_work *work) bio_endio(bio, error); } +/* + * If we remount the fs to be R/O, the cleaner needn't do anything except + * sleeping. This function is used to check the status of the fs. + */ +static inline int need_cleaner_sleep(struct btrfs_root *root) +{ + return root-fs_info-sb-s_flags MS_RDONLY; +} + static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; + int again; do { - int again = 0; - - if (!(root-fs_info-sb-s_flags MS_RDONLY) - down_read_trylock(root-fs_info-sb-s_umount)) { - if (mutex_trylock(root-fs_info-cleaner_mutex)) { - btrfs_run_delayed_iputs(root); - again = btrfs_clean_one_deleted_snapshot(root); - mutex_unlock(root-fs_info-cleaner_mutex); - } - btrfs_run_defrag_inodes(root-fs_info); - up_read(root-fs_info-sb-s_umount); - } + again = 0; + /* Make the cleaner go to sleep early. 
*/ + if (need_cleaner_sleep(root)) + goto sleep; + + if (!mutex_trylock(root-fs_info-cleaner_mutex)) + goto sleep; + + btrfs_run_delayed_iputs(root); + again = btrfs_clean_one_deleted_snapshot(root); + mutex_unlock(root-fs_info-cleaner_mutex); + + /* +* The defragger has dealt with the R/O remount, needn't +* do anything special here. +*/ + btrfs_run_defrag_inodes(root-fs_info); +sleep: if (!try_to_freeze() !again) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/4] Btrfs: move the R/O check out of btrfs_clean_one_deleted_snapshot()
If the fs is remounted to be R/O, it is unnecessary to call btrfs_clean_one_deleted_snapshot(), so move the R/O check out of this function. And besides that, it can make the check logic in the caller more clear. Cc: David Sterba dste...@suse.cz Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 9 + fs/btrfs/transaction.c | 5 - 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 927da1a..c69ff46 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1702,6 +1702,15 @@ static int cleaner_kthread(void *arg) if (!mutex_trylock(root-fs_info-cleaner_mutex)) goto sleep; + /* +* Avoid the problem that we change the status of the fs +* during the above check and trylock. +*/ + if (need_cleaner_sleep(root)) { + mutex_unlock(root-fs_info-cleaner_mutex); + goto sleep; + } + btrfs_run_delayed_iputs(root); again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(root-fs_info-cleaner_mutex); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 89fad06..4b63111 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1885,11 +1885,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) int ret; struct btrfs_fs_info *fs_info = root-fs_info; - if (fs_info-sb-s_flags MS_RDONLY) { - pr_debug(btrfs: cleaner called for RO fs!\n); - return 0; - } - spin_lock(fs_info-trans_lock); if (list_empty(fs_info-dead_roots)) { spin_unlock(fs_info-trans_lock); -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/4] Btrfs: make the cleaner complete early when the fs is going to be umounted
Cc: David Sterba dste...@suse.cz Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 12 +++- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index cb2bfd1..927da1a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1677,12 +1677,14 @@ static void end_workqueue_fn(struct btrfs_work *work) } /* - * If we remount the fs to be R/O, the cleaner needn't do anything except - * sleeping. This function is used to check the status of the fs. + * If we remount the fs to be R/O or umount the fs, the cleaner needn't do + * anything except sleeping. This function is used to check the status of + * the fs. */ static inline int need_cleaner_sleep(struct btrfs_root *root) { - return root-fs_info-sb-s_flags MS_RDONLY; + return (root-fs_info-sb-s_flags MS_RDONLY || + btrfs_fs_closing(root-fs_info)); } static int cleaner_kthread(void *arg) @@ -1705,8 +1707,8 @@ static int cleaner_kthread(void *arg) mutex_unlock(root-fs_info-cleaner_mutex); /* -* The defragger has dealt with the R/O remount, needn't -* do anything special here. +* The defragger has dealt with the R/O remount and umount, +* needn't do anything special here. */ btrfs_run_defrag_inodes(root-fs_info); sleep: -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/4] Btrfs: make the snap/subv deletion end more early when the fs is R/O
The snapshot/subvolume deletion might spend lots of time, it would make the remount task wait for a long time. This patch improve this problem, we will break the deletion if the fs is remounted to be R/O. It will make the users happy. Cc: David Sterba dste...@suse.cz Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 12 fs/btrfs/disk-io.c | 15 ++- fs/btrfs/extent-tree.c | 2 +- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index df9824b..067233f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3318,6 +3318,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) smp_mb(); return fs_info-closing; } + +/* + * If we remount the fs to be R/O or umount the fs, the cleaner needn't do + * anything except sleeping. This function is used to check the status of + * the fs. + */ +static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root) +{ + return (root-fs_info-sb-s_flags MS_RDONLY || + btrfs_fs_closing(root-fs_info)); +} + static inline void free_fs_info(struct btrfs_fs_info *fs_info) { kfree(fs_info-balance_ctl); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c69ff46..78e2dfb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1676,17 +1676,6 @@ static void end_workqueue_fn(struct btrfs_work *work) bio_endio(bio, error); } -/* - * If we remount the fs to be R/O or umount the fs, the cleaner needn't do - * anything except sleeping. This function is used to check the status of - * the fs. - */ -static inline int need_cleaner_sleep(struct btrfs_root *root) -{ - return (root-fs_info-sb-s_flags MS_RDONLY || - btrfs_fs_closing(root-fs_info)); -} - static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; @@ -1696,7 +1685,7 @@ static int cleaner_kthread(void *arg) again = 0; /* Make the cleaner go to sleep early. 
*/ - if (need_cleaner_sleep(root)) + if (btrfs_need_cleaner_sleep(root)) goto sleep; if (!mutex_trylock(root-fs_info-cleaner_mutex)) @@ -1706,7 +1695,7 @@ static int cleaner_kthread(void *arg) * Avoid the problem that we change the status of the fs * during the above check and trylock. */ - if (need_cleaner_sleep(root)) { + if (btrfs_need_cleaner_sleep(root)) { mutex_unlock(root-fs_info-cleaner_mutex); goto sleep; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fbeb0c0..455117a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7378,7 +7378,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, wc-reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { - if (!for_reloc btrfs_fs_closing(root-fs_info)) { + if (!for_reloc btrfs_need_cleaner_sleep(root)) { pr_debug(btrfs: drop snapshot early exit\n); err = -EAGAIN; goto out_end_trans; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/4] Btrfs: remove unnecessary -s_umount in cleaner_kthread()
On tue, 14 May 2013 18:20:40 +0800, Miao Xie wrote: In order to avoid the R/O remount, we acquired -s_umount lock during we deleted the dead snapshots and subvolumes. But it is unnecessary, because we have cleaner_mutex. We use cleaner_mutex to protect the process of the dead snapshots/subvolumes deletion. And when we remount the fs to be R/O, we also acquire this mutex to do cleanup after we change the status of the fs. That is this lock can serialize the above operations, the cleaner can be aware of the status of the fs, and if the cleaner is deleting the dead snapshots/subvolumes, the remount task will wait for it. So it is safe to remove -s_umount in cleaner_kthread(). According to my test, this patch can also fix the deadlock problem which is caused by the race between autodefragger and freeze(xfstest 068). Thanks Miao Cc: David Sterba dste...@suse.cz Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 40 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a9df562..cb2bfd1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1676,24 +1676,40 @@ static void end_workqueue_fn(struct btrfs_work *work) bio_endio(bio, error); } +/* + * If we remount the fs to be R/O, the cleaner needn't do anything except + * sleeping. This function is used to check the status of the fs. 
+ */ +static inline int need_cleaner_sleep(struct btrfs_root *root) +{ + return root-fs_info-sb-s_flags MS_RDONLY; +} + static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; + int again; do { - int again = 0; - - if (!(root-fs_info-sb-s_flags MS_RDONLY) - down_read_trylock(root-fs_info-sb-s_umount)) { - if (mutex_trylock(root-fs_info-cleaner_mutex)) { - btrfs_run_delayed_iputs(root); - again = btrfs_clean_one_deleted_snapshot(root); - mutex_unlock(root-fs_info-cleaner_mutex); - } - btrfs_run_defrag_inodes(root-fs_info); - up_read(root-fs_info-sb-s_umount); - } + again = 0; + /* Make the cleaner go to sleep early. */ + if (need_cleaner_sleep(root)) + goto sleep; + + if (!mutex_trylock(root-fs_info-cleaner_mutex)) + goto sleep; + + btrfs_run_delayed_iputs(root); + again = btrfs_clean_one_deleted_snapshot(root); + mutex_unlock(root-fs_info-cleaner_mutex); + + /* + * The defragger has dealt with the R/O remount, needn't + * do anything special here. + */ + btrfs_run_defrag_inodes(root-fs_info); +sleep: if (!try_to_freeze() !again) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/5] Btrfs: don't abort the current transaction if there is no enough space for inode cache
The filesystem with inode cache was forced to be read-only when we umounted it. Steps to reproduce: # mkfs.btrfs -f ${DEV} # mount -o inode_cache ${DEV} ${MNT} # dd if=/dev/zero of=${MNT}/file1 bs=1M count=8192 # btrfs fi syn ${MNT} # dd if=${MNT}/file1 of=/dev/null bs=1M # rm -f ${MNT}/file1 # btrfs fi syn ${MNT} # umount ${MNT} It is because there was no enough space to do inode cache truncation, and then we aborted the current transaction. But no space error is not a serious problem when we write out the inode cache, and it is safe that we just skip this step if we meet this problem. So we need not abort the current transaction. Reported-by: Tsutomu Itoh t-i...@jp.fujitsu.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/inode-map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d26f67a..9818d4a 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -468,7 +468,8 @@ again: if (i_size_read(inode) 0) { ret = btrfs_truncate_free_space_cache(root, trans, path, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + if (ret != -ENOSPC) + btrfs_abort_transaction(trans, root, ret); goto out_put; } } -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/5] Btrfs: don't steal the reserved space from the global reserve if their space type is different
If the type of the space we need is different with the global reserve, we can not steal the space from the global reserve, because we can not allocate the space from the free space cache that the global reserve points to. Cc: Tsutomu Itoh t-i...@jp.fujitsu.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/extent-tree.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e34e268..c48e1bd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6684,9 +6684,11 @@ try_reserve: return block_rsv; /* * If we couldn't reserve metadata bytes try and use some from -* the global reserve. +* the global reserve if its space type is the same as the global +* reservation. */ - if (block_rsv-type != BTRFS_BLOCK_RSV_GLOBAL) { + if (block_rsv-type != BTRFS_BLOCK_RSV_GLOBAL + block_rsv-space_info == global_rsv-space_info) { ret = block_rsv_use_bytes(global_rsv, blocksize); if (!ret) return global_rsv; -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/5] Btrfs: don't use global block reservation for inode cache truncation
It is very likely that there are lots of subvolumes/snapshots in the filesystem, so if we use global block reservation to do inode cache truncation, we may hog all the free space that is reserved in global rsv. So it is better that we do the free space reservation for inode cache truncation by ourselves. Cc: Tsutomu Itoh t-i...@jp.fujitsu.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/extent-tree.c | 5 + fs/btrfs/free-space-cache.c | 39 +++ fs/btrfs/free-space-cache.h | 2 ++ fs/btrfs/inode-map.c| 5 +++-- fs/btrfs/relocation.c | 5 + 5 files changed, 34 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2305b5c..43afa77 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3109,6 +3109,11 @@ again: WARN_ON(ret); if (i_size_read(inode) 0) { + ret = btrfs_check_trunc_cache_free_space(root, + root-fs_info-global_block_rsv); + if (ret) + goto out_put; + ret = btrfs_truncate_free_space_cache(root, trans, path, inode); if (ret) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ecca6c7..a1948f4 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -197,30 +197,32 @@ int create_free_space_inode(struct btrfs_root *root, block_group-key.objectid); } -int btrfs_truncate_free_space_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct inode *inode) +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) { - struct btrfs_block_rsv *rsv; u64 needed_bytes; - loff_t oldsize; - int ret = 0; - - rsv = trans-block_rsv; - trans-block_rsv = root-fs_info-global_block_rsv; + int ret; /* 1 for slack space, 1 for updating the inode */ needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + btrfs_calc_trans_metadata_size(root, 1); - spin_lock(trans-block_rsv-lock); - if (trans-block_rsv-reserved needed_bytes) { - spin_unlock(trans-block_rsv-lock); - trans-block_rsv = rsv; - 
return -ENOSPC; - } - spin_unlock(trans-block_rsv-lock); + spin_lock(rsv-lock); + if (rsv-reserved needed_bytes) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(rsv-lock); + return 0; +} + +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode) +{ + loff_t oldsize; + int ret = 0; oldsize = i_size_read(inode); btrfs_i_size_write(inode, 0); @@ -232,9 +234,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - if (ret) { - trans-block_rsv = rsv; btrfs_abort_transaction(trans, root, ret); return ret; } @@ -242,7 +242,6 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, ret = btrfs_update_inode(trans, root, inode); if (ret) btrfs_abort_transaction(trans, root, ret); - trans-block_rsv = rsv; return ret; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 4dc17d8..8b7f19f 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -54,6 +54,8 @@ int create_free_space_inode(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, struct btrfs_path *path, diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 9818d4a..2c66ddb 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -429,11 +429,12 @@ int btrfs_save_ino_cache(struct btrfs_root *root, num_bytes = trans-bytes_reserved; /* * 1 item for inode item insertion if need -* 3 items for inode item update (in the worst case) +* 4 items for inode item update (in the worst case
[PATCH 5/5] Btrfs: update the global reserve if it is empty
Before applying this patch, we reserved the space for the global reserve by the minimum unit if we found it is empty, it was unreasonable and inefficient, because if the global reserve space was depleted, it implied that the size of the global reserve was too small. In this case, we should update the global reserve and fill it. Cc: Tsutomu Itoh t-i...@jp.fujitsu.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/extent-tree.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c48e1bd..c75fe11 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6656,12 +6656,13 @@ use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *global_rsv = root->fs_info->global_block_rsv; int ret; + bool global_updated = false; block_rsv = get_block_rsv(trans, root); if (unlikely(block_rsv->size == 0)) goto try_reserve; - +again: ret = block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; @@ -6669,6 +6670,12 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (block_rsv->failfast) return ERR_PTR(ret); + if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { + global_updated = true; + update_global_block_rsv(root->fs_info); + goto again; + } + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/5] Btrfs: optimize the error handle of use_block_rsv()
cc: Tsutomu Itoh t-i...@jp.fujitsu.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/extent-tree.c | 65 ++ 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 43afa77..e34e268 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6659,48 +6659,39 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); - if (block_rsv-size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, -BTRFS_RESERVE_NO_FLUSH); - /* -* If we couldn't reserve metadata bytes try and use some from -* the global reserve. -*/ - if (ret block_rsv != global_rsv) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - return ERR_PTR(ret); - } else if (ret) { - return ERR_PTR(ret); - } - return block_rsv; - } + if (unlikely(block_rsv-size == 0)) + goto try_reserve; ret = block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; - if (ret !block_rsv-failfast) { - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL * 10, - /*DEFAULT_RATELIMIT_BURST*/ 1); - if (__ratelimit(_rs)) - WARN(1, KERN_DEBUG - btrfs: block rsv returned %d\n, ret); - } - ret = reserve_metadata_bytes(root, block_rsv, blocksize, -BTRFS_RESERVE_NO_FLUSH); - if (!ret) { - return block_rsv; - } else if (ret block_rsv != global_rsv) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - } - } - return ERR_PTR(-ENOSPC); + if (block_rsv-failfast) + return ERR_PTR(ret); + + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(_rs)) + WARN(1, KERN_DEBUG + btrfs: block rsv returned %d\n, ret); + } +try_reserve: + ret = reserve_metadata_bytes(root, block_rsv, blocksize, +BTRFS_RESERVE_NO_FLUSH); + if (!ret) + return block_rsv; + /* +* If we couldn't reserve 
metadata bytes try and use some from +* the global reserve. +*/ + if (block_rsv-type != BTRFS_BLOCK_RSV_GLOBAL) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + return ERR_PTR(ret); } static void unuse_block_rsv(struct btrfs_fs_info *fs_info, -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V3] Btrfs: remove btrfs_sector_sum structure
Using the structure btrfs_sector_sum to keep the checksum value is unnecessary, because the extents that btrfs_sector_sum points to are continuous, we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, there is only one member in the structure, so it makes no sense to keep the structure, just remove it, and use a u32 array to store the checksum value. By this change, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time, it improved the performance by ~74% on my SSD (31MB/s - 54MB/s). test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v2 - v3: - address the problem that the csums was inserted into the wrong range, this bug was reported by Josef. Changelog v1 - v2: - modify the changelog and the title which can not explain this patch clearly - fix the 64bit division problem on 32bit machine --- fs/btrfs/file-item.c| 144 ++-- fs/btrfs/ordered-data.c | 19 +++ fs/btrfs/ordered-data.h | 25 ++--- fs/btrfs/relocation.c | 10 fs/btrfs/scrub.c| 16 ++ 5 files changed, 73 insertions(+), 141 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b193bf3..a7bfc95 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,8 +34,7 @@ #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)-sectorsize - (r)-sectorsize) + sizeof(u32) * (r)-sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -297,7 +296,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; LIST_HEAD(tmplist); unsigned long offset; @@ -368,34 +366,28 
@@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); +MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); + GFP_NOFS); if (!sums) { ret = -ENOMEM; goto fail; } - sector_sum = sums-sums; sums-bytenr = start; - sums-len = size; + sums-len = (int)size; offset = (start - key.offset) root-fs_info-sb-s_blocksize_bits; offset *= csum_size; + size = root-fs_info-sb-s_blocksize_bits; - while (size 0) { - read_extent_buffer(path-nodes[0], - sector_sum-sum, - ((unsigned long)item) + - offset, csum_size); - sector_sum-bytenr = start; - - size -= root-sectorsize; - start += root-sectorsize; - offset += csum_size; - sector_sum++; - } + read_extent_buffer(path-nodes[0], + sums-sums, + ((unsigned long)item) + offset, + csum_size * size); + + start += root-sectorsize * size; list_add_tail(sums-list, tmplist); } path-slots[0]++; @@ -417,23 +409,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; char *data; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; + int index; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; - u64
Re: [PATCH V2 2/2] Btrfs: remove btrfs_sector_sum structure
On tue, 23 Apr 2013 16:54:35 -0400, Josef Bacik wrote: On Wed, Apr 03, 2013 at 03:14:56AM -0600, Miao Xie wrote: Using the structure btrfs_sector_sum to keep the checksum value is unnecessary, because the extents that btrfs_sector_sum points to are continuous, we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, there is only one member in the structure, so it makes no sense to keep the structure, just remove it, and use a u32 array to store the checksum value. By this change, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time, it improved the performance by ~74% on my SSD (31MB/s - 54MB/s). test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync Signed-off-by: Miao Xie mi...@cn.fujitsu.com Reviewed-by: Liu Bo bo.li@oracle.com [SNIP] next_offset = (u64)-1; found_next = 0; + bytenr = sums-bytenr + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = sector_sum-bytenr; - bytenr = sector_sum-bytenr; + file_key.offset = bytenr; btrfs_set_key_type(file_key, BTRFS_EXTENT_CSUM_KEY); - item = btrfs_lookup_csum(trans, root, path, sector_sum-bytenr, 1); + item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { - leaf = path-nodes[0]; - ret = 0; - goto found; + csum_offset = 0; + goto csum; Ok I've just spent the last 3 hours tracking down an fsync() problem that turned out to be because of this patch. btrfs_lookup_csum() assumes you are just going in 4k chunks, but we could be going in larger chunks. So as long as the bytenr falls inside of this csum item it thinks its good. So what I'm seeing is this, we have item [0-8k] and we are csumming [4k-12k] and then we're adding our new csum into the old one, the sizes match but the bytenrs don't match. If you want a reproducer just run my fsync xfstest that I just posted. 
I'm dropping this patch for now and I'll wait for you to fix it. Thanks, Is the reproducer is the 311th case of xfstests? ([PATCH] xfstests 311: test fsync with dm flakey V2) If yes, I'm so sorry that we didn't reproduce the problem you said above. Could you give me your test option? Thanks Miao From d8b9c06ecb4aa5cb2aca5be96a8b65af1afb1992 Mon Sep 17 00:00:00 2001 From: Miao Xie mi...@cn.fujitsu.com Date: Sat, 16 Mar 2013 01:06:03 +0800 Subject: [PATCH] Btrfs: remove btrfs_sector_sum structure Using the structure btrfs_sector_sum to keep the checksum value is unnecessary, because the extents that btrfs_sector_sum points to are continuous, we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, there is only one member in the structure, so it makes no sense to keep the structure, just remove it, and use a u32 array to store the checksum value. By this change, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time, it improved the performance by ~74% on my SSD (31MB/s - 54MB/s). 
test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/file-item.c| 142 ++-- fs/btrfs/ordered-data.c | 19 +++ fs/btrfs/ordered-data.h | 25 ++--- fs/btrfs/relocation.c | 10 fs/btrfs/scrub.c| 16 ++ 5 files changed, 70 insertions(+), 142 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 769eb86..f5f6629 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,8 +34,7 @@ #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)-sectorsize - (r)-sectorsize) + sizeof(u32) * (r)-sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -317,7 +316,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; LIST_HEAD(tmplist); unsigned long offset; @@ -388,34 +386,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); + MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); + GFP_NOFS); if (!sums) { ret = -ENOMEM; goto fail; } - sector_sum
Re: [PATCH V2 2/2] Btrfs: remove btrfs_sector_sum structure
On Fri, 26 Apr 2013 16:58:18 +0800, Miao Xie wrote: On tue, 23 Apr 2013 16:54:35 -0400, Josef Bacik wrote: On Wed, Apr 03, 2013 at 03:14:56AM -0600, Miao Xie wrote: Using the structure btrfs_sector_sum to keep the checksum value is unnecessary, because the extents that btrfs_sector_sum points to are continuous, we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, there is only one member in the structure, so it makes no sense to keep the structure, just remove it, and use a u32 array to store the checksum value. By this change, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time, it improved the performance by ~74% on my SSD (31MB/s - 54MB/s). test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync Signed-off-by: Miao Xie mi...@cn.fujitsu.com Reviewed-by: Liu Bo bo.li@oracle.com [SNIP] next_offset = (u64)-1; found_next = 0; + bytenr = sums-bytenr + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = sector_sum-bytenr; - bytenr = sector_sum-bytenr; + file_key.offset = bytenr; btrfs_set_key_type(file_key, BTRFS_EXTENT_CSUM_KEY); - item = btrfs_lookup_csum(trans, root, path, sector_sum-bytenr, 1); + item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { - leaf = path-nodes[0]; - ret = 0; - goto found; + csum_offset = 0; + goto csum; Ok I've just spent the last 3 hours tracking down an fsync() problem that turned out to be because of this patch. btrfs_lookup_csum() assumes you are just going in 4k chunks, but we could be going in larger chunks. So as long as the bytenr falls inside of this csum item it thinks its good. So what I'm seeing is this, we have item [0-8k] and we are csumming [4k-12k] and then we're adding our new csum into the old one, the sizes match but the bytenrs don't match. 
If you want a reproducer just run my fsync xfstest that I just posted. I'm dropping this patch for now and I'll wait for you to fix it. Thanks, Is the reproducer the 311th case of xfstests? ([PATCH] xfstests 311: test fsync with dm flakey V2) If yes, I'm so sorry that we didn't reproduce the problem you said above. Could you give me your test option? Please ignore the attached patch, I sent it out by mistake. Thanks Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: allocate new chunks if the space is not enough for global rsv
When running the 208th of xfstests, the fs returned the enospc error when there was lots of free space in the disk. By bisect debug, we found it was introduced by commit 96f1bb5777. This commit makes the space check for the global reservation in can_overcommit() be inconsistent with should_alloc_chunk(). can_overcommit() requires that the free space is 2 times the size of the global reservation, or we can't do overcommit. And instead, we need reclaim some reserved space, and if we still don't have enough free space, we need allocate a new chunk. But unfortunately, should_alloc_chunk() just requires that the free space is 1 time the size of the global reservation, that is we would not try to allocate a new chunk if the free space size is in the middle of these two requires, and just return the enospc error. Fix it. Cc: Jim Schutt jasc...@sandia.gov Cc: Josef Bacik jba...@fusionio.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/extent-tree.c | 19 +++ 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0d84787..4976f93 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3557,6 +3557,11 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } +static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) +{ + return (global-size 1); +} + static int should_alloc_chunk(struct btrfs_root *root, struct btrfs_space_info *sinfo, int force) { @@ -3574,7 +3579,7 @@ static int should_alloc_chunk(struct btrfs_root *root, * global_rsv, it doesn't change except when the transaction commits. 
*/ if (sinfo-flags BTRFS_BLOCK_GROUP_METADATA) - num_allocated += global_rsv-size; + num_allocated += calc_global_rsv_need_space(global_rsv); /* * in limited mode, we want to have some free space up to @@ -3746,7 +3751,7 @@ static int can_overcommit(struct btrfs_root *root, { struct btrfs_block_rsv *global_rsv = root-fs_info-global_block_rsv; u64 profile = btrfs_get_alloc_profile(root, 0); - u64 rsv_size = 0; + u64 space_size; u64 avail; u64 used; u64 to_add; @@ -3754,18 +3759,16 @@ static int can_overcommit(struct btrfs_root *root, used = space_info-bytes_used + space_info-bytes_reserved + space_info-bytes_pinned + space_info-bytes_readonly; - spin_lock(global_rsv-lock); - rsv_size = global_rsv-size; - spin_unlock(global_rsv-lock); - /* * We only want to allow over committing if we have lots of actual space * free, but if we don't have enough space to handle the global reserve * space then we could end up having a real enospc problem when trying * to allocate a chunk or some other such important allocation. */ - rsv_size = 1; - if (used + rsv_size = space_info-total_bytes) + spin_lock(global_rsv-lock); + space_size = calc_global_rsv_need_space(global_rsv); + spin_unlock(global_rsv-lock); + if (used + space_size = space_info-total_bytes) return 0; used += space_info-bytes_may_use; -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] Btrfs: introduce noextiref mount option
On Mon, 15 Apr 2013 19:20:51 +0200, David Sterba wrote: On Fri, Apr 12, 2013 at 12:01:19PM -0500, Eric Sandeen wrote: On 4/11/13 5:35 AM, Miao Xie wrote: Now, we set incompat flag EXTEND_IREF when we actually need insert a extend inode reference, not when making a fs. But some users may hope that the fs still can be mounted on the old kernel, and don't hope we insert any extend inode references. So we introduce noextiref mount option to close this function. I'd really rather not have yet another work-around mount option. Wouldn't it be better to say: if you don't want extended irefs, turn that feature off on the filesystem itself, either at mkfs time or via btrfstune after the fact. I agree with this, and hope the inconsistency around extref is only temporary so the mount option is not required in the long term. The code reverting extref set by default in mkfs is in integration branch. The preferred solution is the -O option where we can put all the fs features in one go at mkfs time. All right, let's add an option for mkfs, and throw away this patchset. Thanks. Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] Btrfs: use a lock to protect incompat/compat flag of the super block
On thu, 18 Apr 2013 00:17:11 +0200, David Sterba wrote: On Thu, Apr 11, 2013 at 06:30:16PM +0800, Miao Xie wrote: In order to avoid this problem, we introduce a lock named super_lock into the btrfs_fs_info structure. If we want to update incompat/compat flags of the super block, we must hold it. +/* + * Used to protect the incompat_flags, compat_flags, compat_ro_flags + * when they are updated. +spinlock_t super_lock; The lock name is too general for protecting just *_flags, do you have plans to add more items from superblock under this lock? If no, I suggest to pick a different name. Yes, I want to add more items from super block under this lock. @@ -3663,8 +3674,15 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, disk_super = fs_info-super_copy; features = btrfs_super_incompat_flags(disk_super); if (!(features flag)) { -features |= flag; -btrfs_set_super_incompat_flags(disk_super, features); +spin_lock(fs_info-super_lock); +features = btrfs_super_incompat_flags(disk_super); +if (!(features flag)) { +features |= flag; +btrfs_set_super_incompat_flags(disk_super, features); +printk(KERN_INFO btrfs: setting %llu feature flag\n, + flag); flag is u64, please use (unsigned long long)flag and possibly the new btrfs_info replacement of printks. OK, I'll modify my patch. Thanks for your view. Miao +} +spin_unlock(fs_info-super_lock); } } otherwise ok. Reviewed-by: David Sterba dste...@suse.cz -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] Btrfs: introduce noextiref mount option
On Fri, 12 Apr 2013 09:02:34 +0200, Jan Schmidt wrote: +static int btrfs_close_extend_iref(struct btrfs_fs_info *fs_info, + unsigned long old_opts) The name irritated me, it's more like unset instead of close, isn't it? Maybe btrfs_set_no_extend_iref() is better, the other developers might think we will clear BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF. I think we should use the exact name of the mount option, so btrfs_set_noextiref is probably least ambiguous. Or even btrfs_set_mntflag_noextiref. Much better than mine. +{ + struct btrfs_trans_handle *trans; + int ret; + + if (btrfs_raw_test_opt(old_opts, NOEXTIREF) || + !btrfs_raw_test_opt(fs_info-mount_opt, NOEXTIREF)) + return 0; + + trans = btrfs_attach_transaction(fs_info-tree_root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + return PTR_ERR(trans); + } else { + ret = btrfs_commit_transaction(trans, fs_info-tree_root); + if (ret) + return ret; + } Huh? I don't see why we need to commit the transaction here. Can you please explain? We need avoid the case that we check incompat flag is set or not between the extended iref insertion and incompat flag set. Task1 Task2 start_transaction() insert extended iref set NOEXTIREF check incompat flag set incompat flag checking incompat flag after transaction commit can make sure our check happens after the flag is set. Understood. However, in my understanding of transaction.c, btrfs_join_transaction, btrfs_attach_transaction and btrfs_commit_transaction are special and need justification. If you only need the transaction for synchronization purposes, which seems to be the case here, btrfs_start_transaction and btrfs_end_transaction are the right choice. btrfs_end_transaction() does not wait for/force the other tasks to end their transaction, so it is not right here. 
Thanks Miao Thanks, -Jan Thanks Miao Thanks, -Jan + + if (btrfs_super_incompat_flags(fs_info-super_copy) + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) { + printk(KERN_ERR BTRFS: could not close extend iref.\n); + return -EINVAL; + } + + return 0; +} + static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) { set_bit(BTRFS_FS_STATE_REMOUNTING, fs_info-fs_state); @@ -1259,6 +1293,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } btrfs_remount_begin(fs_info, old_opts, *flags); + + ret = btrfs_close_extend_iref(fs_info, old_opts); + if (ret) + goto restore; + btrfs_resize_thread_pool(fs_info, fs_info-thread_pool_size, old_thread_pool_size); -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] Btrfs: fix unblocked autodefraggers when remount
The new mount option is set after parsing the remount arguments, so it is wrong that checking the autodefrag is close or not at btrfs_remount_prepare(). Fix it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/super.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 68a29a1..0f03569 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1202,11 +1202,14 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); } -static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info, -unsigned long old_opts, int flags) +static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) { set_bit(BTRFS_FS_STATE_REMOUNTING, fs_info-fs_state); +} +static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, + unsigned long old_opts, int flags) +{ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) (!btrfs_raw_test_opt(fs_info-mount_opt, AUTO_DEFRAG) || (flags MS_RDONLY))) { @@ -1247,7 +1250,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned int old_metadata_ratio = fs_info-metadata_ratio; int ret; - btrfs_remount_prepare(fs_info, old_opts, *flags); + btrfs_remount_prepare(fs_info); ret = btrfs_parse_options(root, data); if (ret) { @@ -1255,6 +1258,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info, fs_info-thread_pool_size, old_thread_pool_size); -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] Btrfs: use a lock to protect incompat/compat flag of the super block
The following case will make the incompat/compat flag of the super block be recovered. Task1 |Task2 flags = btrfs_super_incompat_flags(); | |flags = btrfs_super_incompat_flags(); flags |= new_flag1;| |flags |= new_flag2; btrfs_set_super_incompat_flags(flags); | |btrfs_set_super_incompat_flags(flags); the new_flag1 is recovered. In order to avoid this problem, we introduce a lock named super_lock into the btrfs_fs_info structure. If we want to update incompat/compat flags of the super block, we must hold it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 22 -- fs/btrfs/disk-io.c | 5 + fs/btrfs/volumes.c | 10 +- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0d82922..a883e47 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1360,6 +1360,17 @@ struct btrfs_fs_info { wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; + /* +* Used to protect the incompat_flags, compat_flags, compat_ro_flags +* when they are updated. +* +* Because we do not clear the flags for ever, so we needn't use +* the lock on the read side. +* +* We also needn't use the lock when we mount the fs, because +* there is no other task which will update the flag. 
+*/ + spinlock_t super_lock; struct btrfs_super_block *super_copy; struct btrfs_super_block *super_for_commit; struct block_device *__bdev; @@ -3663,8 +3674,15 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, disk_super = fs_info-super_copy; features = btrfs_super_incompat_flags(disk_super); if (!(features flag)) { - features |= flag; - btrfs_set_super_incompat_flags(disk_super, features); + spin_lock(fs_info-super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (!(features flag)) { + features |= flag; + btrfs_set_super_incompat_flags(disk_super, features); + printk(KERN_INFO btrfs: setting %llu feature flag\n, +flag); + } + spin_unlock(fs_info-super_lock); } } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6d19a0a..ab8ef37 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2060,6 +2060,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(fs_info-defrag_inodes_lock); spin_lock_init(fs_info-free_chunk_lock); spin_lock_init(fs_info-tree_mod_seq_lock); + spin_lock_init(fs_info-super_lock); rwlock_init(fs_info-tree_mod_log_lock); mutex_init(fs_info-reloc_mutex); seqlock_init(fs_info-profiles_lock); @@ -2319,6 +2320,10 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } + /* +* Needn't use the lock because there is no other task which will +* update the flag. 
+*/ btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2854c82..e710db4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3674,18 +3674,10 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) { - u64 features; - if (!(type (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) return; - features = btrfs_super_incompat_flags(info-super_copy); - if (features BTRFS_FEATURE_INCOMPAT_RAID56) - return; - - features |= BTRFS_FEATURE_INCOMPAT_RAID56; - btrfs_set_super_incompat_flags(info-super_copy, features); - printk(KERN_INFO btrfs: setting RAID5/6 feature flag\n); + btrfs_set_fs_incompat(info, RAID56); } static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/2] do not open the extend inode reference at the beginning
In most cases, we do not insert so many inode references, so it is better that we don't set incompat flag -- EXTEND_IREF -- when we make a fs. Otherwise we can not mount the fs on the old kernel though there is no extend iref in fact. And some users may not hope to inserting the extend inode reference because of the incompatible problem. In this case, we introduce a mount option named noextiref. Note, if the extend inode reference function is enabled, we will fail to mount a fs with this option because there might be some extend irefs in the fs, we should not close this function. This patchset is against: [PATCH 1/2] Btrfs: fix unblocked autodefraggers when remount [PATCH 2/2] Btrfs: use a lock to protect incompat/compat flag of the super block Miao Xie (2): Btrfs: set the INCOMPAT_EXTENDED_IREF when the extended iref is inserted Btrfs: introduce noextiref mount option fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c| 9 + fs/btrfs/inode-item.c | 20 ++-- fs/btrfs/super.c | 41 - 4 files changed, 60 insertions(+), 11 deletions(-) -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] Btrfs: set the INCOMPAT_EXTENDED_IREF when the extended iref is inserted
We needn't set the INCOMAT_EXTENDED_IREF when making a new fs, just do it after we insert a extended iref successfully. Otherwise, we can not mount the fs in which there is no extended iref in fact on the old kernel, it is not so flexible for the users. Signed-off-by: Miao Xie mi...@cn.fujitsu.com Cc: Mark Fasheh mfas...@suse.de --- fs/btrfs/inode-item.c | 18 +- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 48b8fda..f07eb45 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -443,15 +443,15 @@ out: btrfs_free_path(path); if (ret == -EMLINK) { - struct btrfs_super_block *disk_super = root-fs_info-super_copy; - /* We ran out of space in the ref array. Need to -* add an extended ref. */ - if (btrfs_super_incompat_flags(disk_super) -BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) - ret = btrfs_insert_inode_extref(trans, root, name, - name_len, - inode_objectid, - ref_objectid, index); + /* +* We ran out of space in the ref array. Need to add an +* extended ref. +*/ + ret = btrfs_insert_inode_extref(trans, root, name, name_len, + inode_objectid, ref_objectid, + index); + if (!ret) + btrfs_set_fs_incompat(root-fs_info, EXTENDED_IREF); } return ret; -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] Btrfs: introduce noextiref mount option
Now, we set incompat flag EXTEND_IREF when we actually need insert a extend inode reference, not when making a fs. But some users may hope that the fs still can be mounted on the old kernel, and don't hope we insert any extend inode references. So we introduce noextiref mount option to close this function. Signed-off-by: Miao Xie mi...@cn.fujitsu.com Cc: Mark Fasheh mfas...@suse.de --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c| 9 + fs/btrfs/inode-item.c | 2 +- fs/btrfs/super.c | 41 - 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a883e47..db88963 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1911,6 +1911,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY(1 20) #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 22) +#define BTRFS_MOUNT_NOEXTIREF (1 23) #define btrfs_clear_opt(o, opt)((o) = ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ab8ef37..ee00448 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2269,6 +2269,15 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } + if ((btrfs_super_incompat_flags(disk_super) +BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) + btrfs_test_opt(tree_root, NOEXTIREF)) { + printk(KERN_ERR BTRFS: couldn't mount because the extend iref + can not be close.\n); + err = -EINVAL; + goto fail_alloc; + } + if (btrfs_super_leafsize(disk_super) != btrfs_super_nodesize(disk_super)) { printk(KERN_ERR BTRFS: couldn't mount because metadata diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index f07eb45..7c4f880 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -442,7 +442,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); - if (ret == -EMLINK) { + if (ret == -EMLINK !btrfs_test_opt(root, NOEXTIREF)) { /* * We ran out of space in 
the ref array. Need to add an * extended ref. diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0f03569..fd375b3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -315,7 +315,7 @@ enum { Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_compress_type, Opt_compress_force, Opt_compress_force_type, - Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, + Opt_notreelog, Opt_noextiref, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_skip_balance, @@ -344,6 +344,7 @@ static match_table_t tokens = { {Opt_nossd, nossd}, {Opt_noacl, noacl}, {Opt_notreelog, notreelog}, + {Opt_noextiref, noextiref}, {Opt_flushoncommit, flushoncommit}, {Opt_ratio, metadata_ratio=%d}, {Opt_discard, discard}, @@ -535,6 +536,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO btrfs: disabling tree log\n); btrfs_set_opt(info-mount_opt, NOTREELOG); break; + case Opt_noextiref: + printk(KERN_INFO btrfs: disabling extend inode ref\n); + btrfs_set_opt(info-mount_opt, NOEXTIREF); + break; case Opt_flushoncommit: printk(KERN_INFO btrfs: turning on flush-on-commit\n); btrfs_set_opt(info-mount_opt, FLUSHONCOMMIT); @@ -1202,6 +1207,35 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); } +static int btrfs_close_extend_iref(struct btrfs_fs_info *fs_info, + unsigned long old_opts) +{ + struct btrfs_trans_handle *trans; + int ret; + + if (btrfs_raw_test_opt(old_opts, NOEXTIREF) || + !btrfs_raw_test_opt(fs_info-mount_opt, NOEXTIREF)) + return 0; + + trans = btrfs_attach_transaction(fs_info-tree_root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + return PTR_ERR(trans); + } else { + ret = btrfs_commit_transaction(trans, fs_info-tree_root); + if (ret) + return ret; 
+ } + + if (btrfs_super_incompat_flags(fs_info-super_copy
[PATCH] Btrfs-progs: don't set INCOMPAT_EXTENDED_IREF flag when making a new fs
There is no extended irefs in the new fs, and we can mount it on the old kernel without extended iref function safely. So we needn't set INCOMPAT_EXTENDED_IREF flag when making a new fs, and just set it when we actually insert a extended iref. Signed-off-by: Miao Xie mi...@cn.fujitsu.com Cc: Mark Fasheh mfas...@suse.de --- mkfs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mkfs.c b/mkfs.c index c8cb395..aca6e46 100644 --- a/mkfs.c +++ b/mkfs.c @@ -1654,8 +1654,6 @@ raid_groups: super = root-fs_info-super_copy; flags = btrfs_super_incompat_flags(super); - flags |= BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF; - if (mixed) flags |= BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS; -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs-progs: don't set INCOMPAT_EXTENDED_IREF flag when making a new fs
On Thu, 11 Apr 2013 16:28:11 +0200, Jan Schmidt wrote: On Thu, April 11, 2013 at 12:28 (+0200), Miao Xie wrote: There are no extended irefs in the new fs, and we can mount it on the old kernel without extended iref function safely. So we needn't set INCOMPAT_EXTENDED_IREF flag when making a new fs, and just set it when we actually insert an extended iref. Signed-off-by: Miao Xie mi...@cn.fujitsu.com Cc: Mark Fasheh mfas...@suse.de --- mkfs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mkfs.c b/mkfs.c index c8cb395..aca6e46 100644 --- a/mkfs.c +++ b/mkfs.c @@ -1654,8 +1654,6 @@ raid_groups: super = root-fs_info-super_copy; flags = btrfs_super_incompat_flags(super); -flags |= BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF; - if (mixed) flags |= BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS; This one should have a large *** do not apply until kernel patches from [PATCH 0/2] do not open the extend *** inode reference at the beginning have been merged. tag. Otherwise, extended irefs are disabled entirely for all new file systems in environments where they have been working so far. Yes, thanks for pointing it out. Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] Btrfs: introduce noextiref mount option
On thu, 11 Apr 2013 16:29:48 +0200, Jan Schmidt wrote: On Thu, April 11, 2013 at 12:35 (+0200), Miao Xie wrote: Now, we set incompat flag EXTEND_IREF when we actually need insert a extend inode reference, not when making a fs. But some users may hope that the fs still can be mounted on the old kernel, and don't hope we insert any extend inode references. So we introduce noextiref mount option to close this function. That's a much better approach compared to setting the flag on mkfs, I agree. Signed-off-by: Miao Xie mi...@cn.fujitsu.com Cc: Mark Fasheh mfas...@suse.de --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c| 9 + fs/btrfs/inode-item.c | 2 +- fs/btrfs/super.c | 41 - 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a883e47..db88963 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1911,6 +1911,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY (1 20) #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR(1 22) +#define BTRFS_MOUNT_NOEXTIREF (1 23) #define btrfs_clear_opt(o, opt) ((o) = ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ab8ef37..ee00448 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2269,6 +2269,15 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } +if ((btrfs_super_incompat_flags(disk_super) + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) +btrfs_test_opt(tree_root, NOEXTIREF)) { +printk(KERN_ERR BTRFS: couldn't mount because the extend iref + can not be close.\n); +err = -EINVAL; +goto fail_alloc; +} + if (btrfs_super_leafsize(disk_super) != btrfs_super_nodesize(disk_super)) { printk(KERN_ERR BTRFS: couldn't mount because metadata diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index f07eb45..7c4f880 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -442,7 +442,7 @@ int 
btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); -if (ret == -EMLINK) { +if (ret == -EMLINK !btrfs_test_opt(root, NOEXTIREF)) { /* * We ran out of space in the ref array. Need to add an * extended ref. diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0f03569..fd375b3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -315,7 +315,7 @@ enum { Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_compress_type, Opt_compress_force, Opt_compress_force_type, -Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, +Opt_notreelog, Opt_noextiref, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_skip_balance, @@ -344,6 +344,7 @@ static match_table_t tokens = { {Opt_nossd, nossd}, {Opt_noacl, noacl}, {Opt_notreelog, notreelog}, +{Opt_noextiref, noextiref}, {Opt_flushoncommit, flushoncommit}, {Opt_ratio, metadata_ratio=%d}, {Opt_discard, discard}, @@ -535,6 +536,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO btrfs: disabling tree log\n); btrfs_set_opt(info-mount_opt, NOTREELOG); break; +case Opt_noextiref: +printk(KERN_INFO btrfs: disabling extend inode ref\n); +btrfs_set_opt(info-mount_opt, NOEXTIREF); +break; case Opt_flushoncommit: printk(KERN_INFO btrfs: turning on flush-on-commit\n); btrfs_set_opt(info-mount_opt, FLUSHONCOMMIT); @@ -1202,6 +1207,35 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); } +static int btrfs_close_extend_iref(struct btrfs_fs_info *fs_info, + unsigned long old_opts) The name irritated me, it's more like unset instead of close, isn't it? Maybe btrfs_set_no_extend_iref() is better, the other developers might think we will clear BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF. 
+{ +struct btrfs_trans_handle *trans; +int ret; + +if (btrfs_raw_test_opt(old_opts, NOEXTIREF) || +!btrfs_raw_test_opt(fs_info->mount_opt, NOEXTIREF)) +return 0; + +trans = btrfs_attach_transaction(fs_info->tree_root
Re: [PATCH 2/3] Btrfs: fix the deadlock between the transaction start/attach and commit
On wed, 10 Apr 2013 21:45:43 +0300, Alex Lyakas wrote: Hi Miao, I attempted to fix the issue by not joining a transaction that has trans-in_commit set. I did something similar to what wait_current_trans() does, but I did: smp_rmb(); if (cur_trans cur_trans-in_commit) { ... wait_event(root-fs_info-transaction_wait, !cur_trans-blocked); ... But it will introduce deadlock if we need flush some dirty pages, for example: run ordered operation. I also had to change the order of setting in_commit and blocked in btrfs_commit_transaction: trans-transaction-blocked = 1; trans-transaction-in_commit = 1; smp_wmb(); to make sure that if in_commit is set, then blocked cannot be 0, because btrfs_commit_transaction haven't set it yet to 1. we need smp_wmb() between trans-transaction-blocked = 1; and trans-transaction-in_commit = 1; Or the cpu may set blocked after in_commmit. However, with this fix I observe two issues: # With large trees and heavy commits, join_transaction() is delayed sometimes by 1-3 seconds. This delays the host IO by too much. # With this fix, I think too many transactions happen. Basically with this fix, once transaction-in_commit is set, then I insist to open a new transaction and not to join the current one. It has some bad influence on host response times pattern, but I cannot exactly tell why is that. Did you have other fix in mind? Without the fix, I observe sometimes commits that take like 80 seconds, out of which like 50 seconds are spent in the do-while loop of btrfs_commit_transaction. I'm making the patch to fix this problem, my fix is: - don't flush the dirty page during the commit if we create a snapshot - introduce a new counter to count the external writers(TRANS_USERSPACE/TRANS_START) and if this counter is zero, we will break the while loop. - if flushoncommit is set, we start delalloc flush before the while loop, not in the loop, so we don't flush the dirty pages again and again. 
- introduce a new transaction handle type named TRANS_JOIN_ENDIO, which is used in the endio process. - introduce a new state for transaction commit, at this state, we block TRANS_JOIN, but don't block TRANS_JOIN_ENDIO. Thanks Miao Thanks, Alex. On Mon, Mar 25, 2013 at 11:11 AM, Alex Lyakas alex.bt...@zadarastorage.com wrote: Hi Miao, On Mon, Mar 25, 2013 at 3:51 AM, Miao Xie mi...@cn.fujitsu.com wrote: On Sun, 24 Mar 2013 13:13:22 +0200, Alex Lyakas wrote: Hi Miao, I am seeing another issue. Your fix prevents from TRANS_START to get in the way of a committing transaction. But it does not prevent from TRANS_JOIN. On the other hand, btrfs_commit_transaction has the following loop: do { // attempt to do some useful stuff and/or sleep } while (atomic_read(cur_trans-num_writers) 1 || (should_grow cur_trans-num_joined != joined)); What I see is basically that new writers join the transaction, while btrfs_commit_transaction() does this loop. I see cur_trans-num_writers decreasing, but then it increases, then decreases etc. This can go for several seconds during heavy IO load. There is nothing to prevent new TRANS_JOIN writers coming and joining a transaction over and over, thus delaying transaction commit. The IO path uses TRANS_JOIN; for example run_delalloc_nocow() does that. Do you observe such behavior? Do you believe it's problematic? I know this behavior, there is no problem with it, the latter code will prevent from TRANS_JOIN. 1672 spin_lock(root-fs_info-trans_lock); 1673 root-fs_info-trans_no_join = 1; 1674 spin_unlock(root-fs_info-trans_lock); 1675 wait_event(cur_trans-writer_wait, 1676atomic_read(cur_trans-num_writers) == 1); Yes, this code prevents anybody from joining, but before btrfs_commit_transaction() gets to this code, it may spend sometimes 10 seconds (in my tests) in the do-while loop, while new writers come and go. Basically, it is not deterministic when the do-while loop will exit, it depends on the IO pattern. 
And if we block the TRANS_JOIN at the place you point out, the deadlock will happen because we need deal with the ordered operations which will use TRANS_JOIN here. (I am dealing with the problem you said above by adding a new type of TRANS_* now) Thanks. Alex. Thanks Miao Thanks, Alex. On Mon, Feb 25, 2013 at 12:20 PM, Miao Xie mi...@cn.fujitsu.com wrote: On sun, 24 Feb 2013 21:49:55 +0200, Alex Lyakas wrote: Hi Miao, can you please explain your solution a bit more. On Wed, Feb 20, 2013 at 11:16 AM, Miao Xie mi...@cn.fujitsu.com wrote: Now btrfs_commit_transaction() does this ret = btrfs_run_ordered_operations(root, 0) which async flushes all inodes on the ordered operations list, it introduced a deadlock that transaction-start task, transaction-commit task and the flush workers waited for each other. (See the following URL to get
Re: [PATCH 1/2] Btrfs: online data deduplication
On mon, 8 Apr 2013 22:16:26 +0800, Liu Bo wrote: On Mon, Apr 08, 2013 at 08:54:50AM -0400, Josef Bacik wrote: On Sun, Apr 07, 2013 at 07:12:48AM -0600, Liu Bo wrote: (NOTE: This leads to a FORMAT CHANGE, DO NOT use it on real data.) This introduce the online data deduplication feature for btrfs. (1) WHY do we need deduplication? To improve our storage effiency. (2) WHAT is deduplication? Two key ways for practical deduplication implementations, * When the data is deduplicated (inband vs background) * The granularity of the deduplication. (block level vs file level) For btrfs, we choose * inband(synchronous) * block level We choose them because of the same reason as how zfs does. a) To get an immediate benefit. b) To remove redundant parts within a file. So we have an inband, block level data deduplication here. (3) HOW does deduplication works? This makes full use of file extent back reference, the same way as IOCTL_CLONE, which lets us easily store multiple copies of a set of data as a single copy along with an index of references to the copy. Here we have a) a new dedicated tree(DEDUP tree) and b) a new key(BTRFS_DEDUP_ITEM_KEY), which consists of (stop 64bits of hash, type, disk offset), * stop 64bits of hash It comes from sha256, which is very helpful on avoiding collision. And we take the stop 64bits as the index. * disk offset It helps to find where the data is stored. So the whole deduplication process works as, 1) write something, 2) calculate the hash of this something, 3) try to find the match of hash value by searching DEDUP keys in a dedicated tree, DEDUP tree. 4) if found, skip real IO and link to the existing copy if not, do real IO and insert a DEDUP key to the DEDUP tree. For now, we limit the deduplication unit to PAGESIZE, 4096, and we're going to increase this unit dynamically in the future. 
Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/ctree.h| 53 fs/btrfs/disk-io.c | 33 +- fs/btrfs/extent-tree.c | 22 +++- fs/btrfs/extent_io.c|8 +- fs/btrfs/extent_io.h| 11 ++ fs/btrfs/file-item.c| 186 ++ fs/btrfs/file.c |6 +- fs/btrfs/inode.c| 330 +++ fs/btrfs/ioctl.c| 34 +- fs/btrfs/ordered-data.c | 25 +++- fs/btrfs/ordered-data.h |9 ++ fs/btrfs/print-tree.c |6 +- fs/btrfs/super.c|7 +- 13 files changed, 685 insertions(+), 45 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0d82922..59339bc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -32,6 +32,7 @@ #include asm/kmap_types.h #include linux/pagemap.h #include linux/btrfs.h +#include crypto/hash.h #include extent_io.h #include extent_map.h #include async-thread.h @@ -94,6 +95,9 @@ struct btrfs_ordered_sum; /* holds quota configuration and tracking */ #define BTRFS_QUOTA_TREE_OBJECTID 8ULL +/* dedup tree(experimental) */ +#define BTRFS_DEDUP_TREE_OBJECTID 9ULL + /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL @@ -884,12 +888,31 @@ struct btrfs_file_extent_item { */ __le64 num_bytes; + /* +* the stop 64bits of sha256 hash value, this helps us find the +* corresponding item in dedup tree. +*/ + __le64 dedup_hash; + } __attribute__ ((__packed__)); Please don't do this, do like what we do with the crc tree, just lookup based on the disk bytenr when we free the extent and drop refs that way. Don't further bloat the file extent item, we want it smaller not larger. Thanks, Josef So the real trouble is that I take this hash value as the first field of btrfs_key, and binary searching without the precise first field is not easy. Otherwise we may have to add another key type which replaces hash value with disk bytenr, ie. (disk bytenr, ANOTHER_KEY_TYPE, hash value), then we'll need to search the tree twice to free this one or drop refs. Why need we store refs in btrfs_dedup_item structure? 
I think the following one is better: key.objectid = the stop 64bits of sha256 hash value key.type = whatever key.offset = bytenr /* the bytenr of block */ struct btrfs_dedup_item { __le64 bytenr, /* the start bytenr of the extent */ __le64 len, } In this way, we use the refs in btrfs_extent_item to make sure the block is not freed. And when we truncate the file, all thing we need do is delete the dedup item when we free the extent just like checksum tree. Thanks Miao Either case is tradeoff, but as this is an initial version, we can try all of these knobs and choose the better one
Re: [PATCH 1/2] Btrfs: online data deduplication
On mon, 8 Apr 2013 15:47:27 +0200, David Sterba wrote: On Sun, Apr 07, 2013 at 09:12:48PM +0800, Liu Bo wrote: (2) WHAT is deduplication? Two key ways for practical deduplication implementations, * When the data is deduplicated (inband vs background) * The granularity of the deduplication. (block level vs file level) For btrfs, we choose * inband(synchronous) * block level Block level may be too fine grained leading to excessive fragmentation and increased metadata usage given that there's a much higher chance to find duplicate (4k) blocks here and there. There's always a tradeoff, the practical values that are considered for granularity range from 8k to 64, see eg. this paper for graphs and analyses http://static.usenix.org/event/fast11/tech/full_papers/Meyer.pdf . This also depends on file data type and access patterns, fixing the dedup basic chunk size to one block does not IMHO fit most usecases. Maybe we can make btrfs(including dedup) support the bigalloc just like ext4. Thanks Miao (3) HOW does deduplication works? ... Here we have a) a new dedicated tree(DEDUP tree) and b) a new key(BTRFS_DEDUP_ITEM_KEY), which consists of (stop 64bits of hash, type, disk offset), * stop 64bits of hash It comes from sha256, which is very helpful on avoiding collision. And we take the stop 64bits as the index. Is it safe to use just 64 bits? I'd like to see better reasoning why this is ok. The limitation of btrfs_key to store only 1-2 64bit items is clear and must be handled, but it's IMO a critical design point. * disk offset It helps to find where the data is stored. Does the disk offset also help to resolving block hash collisions? So the whole deduplication process works as, 1) write something, 2) calculate the hash of this something, 3) try to find the match of hash value by searching DEDUP keys in a dedicated tree, DEDUP tree. 4) if found, skip real IO and link to the existing copy if not, do real IO and insert a DEDUP key to the DEDUP tree. ... 
how are the hash collisions handled? Using part of a secure hash cannot be considered equally strong (given that there are no other safety checks like comparing the whole blocks). Last but not least, there was another dedup proposal (author CCed) http://thread.gmane.org/gmane.comp.file-systems.btrfs/21722 david -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V3 1/2] Btrfs: improve the performance of the csums lookup
It is very likely that there are several blocks in bio, it is very inefficient if we get their csums one by one. This patch improves this problem by getting the csums in batch. According to the result of the following test, the execute time of __btrfs_lookup_bio_sums() is down by ~28%(300us - 217us). # dd if=mnt/file of=/dev/null bs=1M count=1024 Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v2 - v3: - address the problem that the logical offset of the pages in the same bio is not contiguous. Changelog v1 - v2: - fix 64bit division problem on i386 machine --- fs/btrfs/extent_io.c| 58 + fs/btrfs/extent_io.h| 4 fs/btrfs/file-item.c| 49 + fs/btrfs/ordered-data.c | 28 ++-- fs/btrfs/ordered-data.h | 3 ++- 5 files changed, 111 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cdee391..19dd3da 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1777,6 +1777,64 @@ out: return ret; } +void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[], + int count) +{ + struct rb_node *node; + struct extent_state *state; + + spin_lock(tree-lock); + /* +* this search will find all the extents that end after +* our range starts. 
+*/ + node = tree_search(tree, start); + BUG_ON(!node); + + state = rb_entry(node, struct extent_state, rb_node); + BUG_ON(state-start != start); + + while (count) { + state-private = *csums++; + count--; + state = next_state(state); + } + spin_unlock(tree-lock); +} + +static inline u64 __btrfs_get_bio_offset(struct bio *bio, int bio_index) +{ + struct bio_vec *bvec = bio-bi_io_vec + bio_index; + + return page_offset(bvec-bv_page) + bvec-bv_offset; +} + +void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, int bio_index, + u32 csums[], int count) +{ + struct rb_node *node; + struct extent_state *state = NULL; + u64 start; + + spin_lock(tree-lock); + do { + start = __btrfs_get_bio_offset(bio, bio_index); + if (state == NULL || state-start != start) { + node = tree_search(tree, start); + BUG_ON(!node); + + state = rb_entry(node, struct extent_state, rb_node); + BUG_ON(state-start != start); + } + state-private = *csums++; + count--; + bio_index++; + + state = next_state(state); + } while (count); + spin_unlock(tree-lock); +} + int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) { struct rb_node *node; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 258c921..db009d8 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -261,6 +261,10 @@ int extent_readpages(struct extent_io_tree *tree, int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent); int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[], + int count); +void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, + int bvec_index, u32 csums[], int count); int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c4628a2..7e4df79 100644 --- 
a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -177,7 +177,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { - u32 sum; + u32 sum[16]; + int len; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; u64 offset = 0; @@ -186,7 +187,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 disk_bytenr; u32 diff; u16 csum_size = btrfs_super_csum_size(root-fs_info-super_copy); - int ret; + int count; struct btrfs_path *path; struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = BTRFS_I(inode)-io_tree; @@ -214,10 +215,12 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (dio) offset = logical_offset; while (bio_index bio-bi_vcnt) { + len = min_t(int, ARRAY_SIZE(sum), bio-bi_vcnt - bio_index); if (!dio) offset
[PATCH V2 1/2] Btrfs: improve the performance of the csums lookup
It is very likely that there are several blocks in bio, it is very inefficient if we get their csums one by one. This patch improves this problem by getting the csums in batch. According to the result of the following test, the execute time of __btrfs_lookup_bio_sums() is down by ~28%(300us - 217us). # dd if=mnt/file of=/dev/null bs=1M count=1024 Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v1 - v2: - fix 64bit division problem on i386 machine --- fs/btrfs/extent_io.c| 31 +++ fs/btrfs/extent_io.h| 2 ++ fs/btrfs/file-item.c| 45 ++--- fs/btrfs/ordered-data.c | 28 +--- fs/btrfs/ordered-data.h | 3 ++- 5 files changed, 78 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cdee391..fc4d3bc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1777,6 +1777,37 @@ out: return ret; } +void cache_csums(struct extent_io_tree *tree, u64 start, u32 csums[], +int count, int sectorsize) +{ + struct rb_node *node; + struct extent_state *state, *next; + + spin_lock(tree-lock); + /* +* this search will find all the extents that end after +* our range starts. 
+*/ + node = tree_search(tree, start); + BUG_ON(!node); + + state = rb_entry(node, struct extent_state, rb_node); + BUG_ON(state-start != start); + + while (count) { + BUG_ON(state-end + 1 - state-start != sectorsize); + + state-private = *csums++; + count--; + next = next_state(state); + + BUG_ON(count (!next || next-start != state-end + 1)); + + state = next; + } + spin_unlock(tree-lock); +} + int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) { struct rb_node *node; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 258c921..59819f0 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -261,6 +261,8 @@ int extent_readpages(struct extent_io_tree *tree, int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent); int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +void cache_csums(struct extent_io_tree *tree, u64 start, u32 csums[], +int count, int sectorsize); int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c4628a2..484017a 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -177,7 +177,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { - u32 sum; + u32 sum[16]; + int len; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; u64 offset = 0; @@ -186,7 +187,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 disk_bytenr; u32 diff; u16 csum_size = btrfs_super_csum_size(root-fs_info-super_copy); - int ret; + int count; + int index; struct btrfs_path *path; struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = BTRFS_I(inode)-io_tree; @@ -214,10 +216,11 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (dio) offset = logical_offset; while 
(bio_index bio-bi_vcnt) { + len = min_t(int, ARRAY_SIZE(sum), bio-bi_vcnt - bio_index); if (!dio) offset = page_offset(bvec-bv_page) + bvec-bv_offset; - ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, sum); - if (ret == 0) + count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, sum, len); + if (count) goto found; if (!item || disk_bytenr item_start_offset || @@ -230,10 +233,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, item = btrfs_lookup_csum(NULL, root-fs_info-csum_root, path, disk_bytenr, 0); if (IS_ERR(item)) { - ret = PTR_ERR(item); - if (ret == -ENOENT || ret == -EFBIG) - ret = 0; - sum = 0; + count = 1; + sum[0] = 0; if (BTRFS_I(inode)-root-root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID
[PATCH V2 2/2] Btrfs: remove btrfs_sector_sum structure
Using the structure btrfs_sector_sum to keep the checksum value is unnecessary, because the extents that btrfs_sector_sum points to are continuous, we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, there is only one member in the structure, so it makes no sense to keep the structure, just remove it, and use a u32 array to store the checksum value. By this change, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time, it improved the performance by ~74% on my SSD (31MB/s - 54MB/s). test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync Signed-off-by: Miao Xie mi...@cn.fujitsu.com Reviewed-by: Liu Bo bo.li@oracle.com --- Changelog v1 - v2: - modify the changelog and the title which can not explain this patch clearly - fix the 64bit division problem on 32bit machine --- fs/btrfs/file-item.c| 144 ++-- fs/btrfs/ordered-data.c | 19 +++ fs/btrfs/ordered-data.h | 25 ++--- fs/btrfs/relocation.c | 10 fs/btrfs/scrub.c| 16 ++ 5 files changed, 71 insertions(+), 143 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 484017a..8d653c2 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,8 +34,7 @@ #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)-sectorsize - (r)-sectorsize) + sizeof(u32) * (r)-sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -313,7 +312,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; LIST_HEAD(tmplist); unsigned long offset; @@ -387,34 +385,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct 
btrfs_csum_item); while (start csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); +MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); + GFP_NOFS); if (!sums) { ret = -ENOMEM; goto fail; } - sector_sum = sums-sums; sums-bytenr = start; - sums-len = size; + sums-len = (int)size; offset = (start - key.offset) root-fs_info-sb-s_blocksize_bits; offset *= csum_size; + size = root-fs_info-sb-s_blocksize_bits; - while (size 0) { - read_extent_buffer(path-nodes[0], - sector_sum-sum, - ((unsigned long)item) + - offset, csum_size); - sector_sum-bytenr = start; - - size -= root-sectorsize; - start += root-sectorsize; - offset += csum_size; - sector_sum++; - } + read_extent_buffer(path-nodes[0], + sums-sums, + ((unsigned long)item) + offset, + csum_size * size); + + start += root-sectorsize * size; list_add_tail(sums-list, tmplist); } path-slots[0]++; @@ -436,23 +428,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; char *data; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; + int index; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; - u64 disk_bytenr; WARN_ON(bio-bi_vcnt = 0); sums = kzalloc
[PATCH 1/4] Btrfs: fix wrong reservation of csums
We reserve the space for csums only when we write data into a file, in the other cases, such as tree log, log replay, we don't do reservation, so we can use the reservation of the transaction handle just for the former. And for the latter, we should use the tree's own reservation. But the function - btrfs_csum_file_blocks() didn't differentiate between these two types of the cases, fix it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/file-item.c | 2 -- fs/btrfs/inode.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index ec16020..b7e529d 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -728,7 +728,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, return -ENOMEM; sector_sum = sums-sums; - trans-adding_csums = 1; again: next_offset = (u64)-1; found_next = 0; @@ -899,7 +898,6 @@ next_sector: goto again; } out: - trans-adding_csums = 0; btrfs_free_path(path); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f26..63eec5c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1743,8 +1743,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sum; list_for_each_entry(sum, list, list) { + trans-adding_csums = 1; btrfs_csum_file_blocks(trans, BTRFS_I(inode)-root-fs_info-csum_root, sum); + trans-adding_csums = 0; } return 0; } -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/4] Btrfs: improve the performance of the csums lookup
It is very likely that there are several blocks in bio, it is very inefficient if we get their csums one by one. This patch improves this problem by getting the csums in batch. According to the result of the following test, the execute time of __btrfs_lookup_bio_sums() is down by ~28%(300us - 217us). # dd if=mnt/file of=/dev/null bs=1M count=1024 Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/extent_io.c| 31 +++ fs/btrfs/extent_io.h| 2 ++ fs/btrfs/file-item.c| 45 ++--- fs/btrfs/ordered-data.c | 24 ++-- fs/btrfs/ordered-data.h | 3 ++- 5 files changed, 75 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f173c5a..3da8da5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1744,6 +1744,37 @@ out: return ret; } +void cache_csums(struct extent_io_tree *tree, u64 start, u32 csums[], +int count, int sectorsize) +{ + struct rb_node *node; + struct extent_state *state, *next; + + spin_lock(tree-lock); + /* +* this search will find all the extents that end after +* our range starts. 
+*/ + node = tree_search(tree, start); + BUG_ON(!node); + + state = rb_entry(node, struct extent_state, rb_node); + BUG_ON(state-start != start); + + while (count) { + BUG_ON(state-end + 1 - state-start != sectorsize); + + state-private = *csums++; + count--; + next = next_state(state); + + BUG_ON(count (!next || next-start != state-end + 1)); + + state = next; + } + spin_unlock(tree-lock); +} + int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) { struct rb_node *node; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6068a19..b95fb6a 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -261,6 +261,8 @@ int extent_readpages(struct extent_io_tree *tree, int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent); int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +void cache_csums(struct extent_io_tree *tree, u64 start, u32 csums[], +int count, int sectorsize); int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b7e529d..3e2f080 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -175,7 +175,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { - u32 sum; + u32 sum[16]; + int len; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; u64 offset = 0; @@ -184,7 +185,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 disk_bytenr; u32 diff; u16 csum_size = btrfs_super_csum_size(root-fs_info-super_copy); - int ret; + int count; + int index; struct btrfs_path *path; struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = BTRFS_I(inode)-io_tree; @@ -212,10 +214,11 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (dio) offset = logical_offset; while 
(bio_index bio-bi_vcnt) { + len = min_t(int, ARRAY_SIZE(sum), bio-bi_vcnt - bio_index); if (!dio) offset = page_offset(bvec-bv_page) + bvec-bv_offset; - ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, sum); - if (ret == 0) + count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, sum, len); + if (count) goto found; if (!item || disk_bytenr item_start_offset || @@ -228,10 +231,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, item = btrfs_lookup_csum(NULL, root-fs_info-csum_root, path, disk_bytenr, 0); if (IS_ERR(item)) { - ret = PTR_ERR(item); - if (ret == -ENOENT || ret == -EFBIG) - ret = 0; - sum = 0; + count = 1; + sum[0] = 0; if (BTRFS_I(inode)-root-root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { set_extent_bits(io_tree, offset, @@ -267,19 +268,25 @@ static int
[PATCH 3/4] Btrfs: remove unnecessary variant in btrfs_sector_sum structure
bytenr in btrfs_sector_sum is unnecessary, because the extents that btrfs_sector_sum points to are continuous,we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time. By this way, the performance of write is improved by ~74% on my SSD (31MB/s - 54MB/s) test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/file-item.c| 144 ++-- fs/btrfs/ordered-data.c | 21 +++ fs/btrfs/ordered-data.h | 25 ++--- fs/btrfs/relocation.c | 10 fs/btrfs/scrub.c| 16 ++ 5 files changed, 72 insertions(+), 144 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 3e2f080..9a447bc 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,8 +34,7 @@ #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)-sectorsize - (r)-sectorsize) + sizeof(u32) * (r)-sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -311,7 +310,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; LIST_HEAD(tmplist); unsigned long offset; @@ -385,34 +383,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); +MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); + GFP_NOFS); if (!sums) { ret = -ENOMEM; goto fail; } - sector_sum = sums-sums; sums-bytenr = start; - sums-len = size; + sums-len = (int)size; offset = (start - 
key.offset) root-fs_info-sb-s_blocksize_bits; offset *= csum_size; + size = root-fs_info-sb-s_blocksize_bits; - while (size 0) { - read_extent_buffer(path-nodes[0], - sector_sum-sum, - ((unsigned long)item) + - offset, csum_size); - sector_sum-bytenr = start; - - size -= root-sectorsize; - start += root-sectorsize; - offset += csum_size; - sector_sum++; - } + read_extent_buffer(path-nodes[0], + sums-sums, + ((unsigned long)item) + offset, + csum_size * size); + + start += root-sectorsize * size; list_add_tail(sums-list, tmplist); } path-slots[0]++; @@ -434,23 +426,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; char *data; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; + int index; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; - u64 disk_bytenr; WARN_ON(bio-bi_vcnt = 0); sums = kzalloc(btrfs_ordered_sum_size(root, bio-bi_size), GFP_NOFS); if (!sums) return -ENOMEM; - sector_sum = sums-sums; - disk_bytenr = (u64)bio-bi_sector 9; sums-len = bio-bi_size; INIT_LIST_HEAD(sums-list); @@ -461,7 +450,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, ordered = btrfs_lookup_ordered_extent
[PATCH 4/4] Btrfs: fix wrong return value of btrfs_lookup_csum()
If we don't find the expected csum item, but find a csum item which is adjacent to the specified extent, we should return -EFBIG; otherwise we should return -ENOENT. But btrfs_lookup_csum() returns -EFBIG even when the csum item is not adjacent to the specified extent. Fix it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/file-item.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 9a447bc..bc89e2f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -117,9 +117,11 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); csums_in_item /= csum_size; - if (csum_offset >= csums_in_item) { + if (csum_offset == csums_in_item) { ret = -EFBIG; goto fail; + } else if (csum_offset > csums_in_item) { + goto fail; } } item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V2 3/4] Btrfs: remove unnecessary variant in btrfs_sector_sum structure
bytenr in btrfs_sector_sum is unnecessary, because the extents that btrfs_sector_sum points to are continuous, we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time. By this way, the performance of write is improved by ~74% on my SSD (31MB/s - 54MB/s) test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v1 - v2: - fix messy code in the changelog. --- fs/btrfs/file-item.c| 144 ++-- fs/btrfs/ordered-data.c | 21 +++ fs/btrfs/ordered-data.h | 25 ++--- fs/btrfs/relocation.c | 10 fs/btrfs/scrub.c| 16 ++ 5 files changed, 72 insertions(+), 144 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 3e2f080..9a447bc 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,8 +34,7 @@ #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)-sectorsize - (r)-sectorsize) + sizeof(u32) * (r)-sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -311,7 +310,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; LIST_HEAD(tmplist); unsigned long offset; @@ -385,34 +383,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); +MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); + GFP_NOFS); if (!sums) { ret = -ENOMEM; goto fail; } - sector_sum = sums-sums; sums-bytenr = start; - 
sums-len = size; + sums-len = (int)size; offset = (start - key.offset) root-fs_info-sb-s_blocksize_bits; offset *= csum_size; + size = root-fs_info-sb-s_blocksize_bits; - while (size 0) { - read_extent_buffer(path-nodes[0], - sector_sum-sum, - ((unsigned long)item) + - offset, csum_size); - sector_sum-bytenr = start; - - size -= root-sectorsize; - start += root-sectorsize; - offset += csum_size; - sector_sum++; - } + read_extent_buffer(path-nodes[0], + sums-sums, + ((unsigned long)item) + offset, + csum_size * size); + + start += root-sectorsize * size; list_add_tail(sums-list, tmplist); } path-slots[0]++; @@ -434,23 +426,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; char *data; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; + int index; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; - u64 disk_bytenr; WARN_ON(bio-bi_vcnt = 0); sums = kzalloc(btrfs_ordered_sum_size(root, bio-bi_size), GFP_NOFS); if (!sums) return -ENOMEM; - sector_sum = sums-sums; - disk_bytenr = (u64)bio-bi_sector 9; sums-len = bio-bi_size; INIT_LIST_HEAD(sums-list); @@ -461,7 +450,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode
Re: [PATCH 3/4] Btrfs: remove unnecessary variant in btrfs_sector_sum structure
On Thu, 28 Mar 2013 22:41:50 +0800, Liu Bo wrote: On Thu, Mar 28, 2013 at 10:38:34PM +0800, Liu Bo wrote: On Thu, Mar 28, 2013 at 04:11:38PM +0800, Miao Xie wrote: bytenr in btrfs_sector_sum is unnecessary, because the extents that btrfs_sector_sum points to are continuous,we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time. By this way, the performance of write is improved by ~74% on my SSD (31MB/s - 54MB/s) test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync but the title is a bit confused because you've actually killed all of btrfs_sector_sum. I misused the old title, will change it later. Thanks Miao Looks good to me. Reviewed-by: Liu Bo bo.li@oracle.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/file-item.c| 144 ++-- fs/btrfs/ordered-data.c | 21 +++ fs/btrfs/ordered-data.h | 25 ++--- fs/btrfs/relocation.c | 10 fs/btrfs/scrub.c| 16 ++ 5 files changed, 72 insertions(+), 144 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 3e2f080..9a447bc 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,8 +34,7 @@ #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)-sectorsize - (r)-sectorsize) + sizeof(u32) * (r)-sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -311,7 +310,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; LIST_HEAD(tmplist); unsigned long offset; @@ -385,34 +383,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct 
btrfs_csum_item); while (start csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); +MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); + GFP_NOFS); if (!sums) { ret = -ENOMEM; goto fail; } - sector_sum = sums-sums; sums-bytenr = start; - sums-len = size; + sums-len = (int)size; offset = (start - key.offset) root-fs_info-sb-s_blocksize_bits; offset *= csum_size; + size = root-fs_info-sb-s_blocksize_bits; - while (size 0) { - read_extent_buffer(path-nodes[0], - sector_sum-sum, - ((unsigned long)item) + - offset, csum_size); - sector_sum-bytenr = start; - - size -= root-sectorsize; - start += root-sectorsize; - offset += csum_size; - sector_sum++; - } + read_extent_buffer(path-nodes[0], + sums-sums, + ((unsigned long)item) + offset, + csum_size * size); + + start += root-sectorsize * size; list_add_tail(sums-list, tmplist); } path-slots[0]++; @@ -434,23 +426,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; char *data; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; + int index; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; - u64 disk_bytenr; WARN_ON(bio-bi_vcnt = 0); sums = kzalloc(btrfs_ordered_sum_size(root, bio-bi_size), GFP_NOFS); if (!sums) return -ENOMEM; - sector_sum = sums-sums; - disk_bytenr = (u64)bio
Re: [PATCH 2/3] Btrfs: fix the deadlock between the transaction start/attach and commit
On Sun, 24 Mar 2013 13:13:22 +0200, Alex Lyakas wrote: Hi Miao, I am seeing another issue. Your fix prevents from TRANS_START to get in the way of a committing transaction. But it does not prevent from TRANS_JOIN. On the other hand, btrfs_commit_transaction has the following loop: do { // attempt to do some useful stuff and/or sleep } while (atomic_read(cur_trans-num_writers) 1 || (should_grow cur_trans-num_joined != joined)); What I see is basically that new writers join the transaction, while btrfs_commit_transaction() does this loop. I see cur_trans-num_writers decreasing, but then it increases, then decreases etc. This can go for several seconds during heavy IO load. There is nothing to prevent new TRANS_JOIN writers coming and joining a transaction over and over, thus delaying transaction commit. The IO path uses TRANS_JOIN; for example run_delalloc_nocow() does that. Do you observe such behavior? Do you believe it's problematic? I know this behavior, there is no problem with it, the latter code will prevent from TRANS_JOIN. 1672 spin_lock(root-fs_info-trans_lock); 1673 root-fs_info-trans_no_join = 1; 1674 spin_unlock(root-fs_info-trans_lock); 1675 wait_event(cur_trans-writer_wait, 1676atomic_read(cur_trans-num_writers) == 1); And if we block the TRANS_JOIN at the place you point out, the deadlock will happen because we need deal with the ordered operations which will use TRANS_JOIN here. (I am dealing with the problem you said above by adding a new type of TRANS_* now) Thanks Miao Thanks, Alex. On Mon, Feb 25, 2013 at 12:20 PM, Miao Xie mi...@cn.fujitsu.com wrote: On sun, 24 Feb 2013 21:49:55 +0200, Alex Lyakas wrote: Hi Miao, can you please explain your solution a bit more. 
On Wed, Feb 20, 2013 at 11:16 AM, Miao Xie mi...@cn.fujitsu.com wrote: Now btrfs_commit_transaction() does this ret = btrfs_run_ordered_operations(root, 0) which async flushes all inodes on the ordered operations list, it introduced a deadlock that transaction-start task, transaction-commit task and the flush workers waited for each other. (See the following URL to get the detail http://marc.info/?l=linux-btrfsm=136070705732646w=2) As we know, if -in_commit is set, it means someone is committing the current transaction, we should not try to join it if we are not JOIN or JOIN_NOLOCK, wait is the best choice for it. In this way, we can avoid the above problem. In this way, there is another benefit: there is no new transaction handle to block the transaction which is on the way of commit, once we set -in_commit. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 17 - 1 files changed, 16 insertions(+), 1 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bc2f2d1..71b7e2e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -51,6 +51,14 @@ static noinline void switch_commit_root(struct btrfs_root *root) root-commit_root = btrfs_root_node(root); } +static inline int can_join_transaction(struct btrfs_transaction *trans, + int type) +{ + return !(trans-in_commit +type != TRANS_JOIN +type != TRANS_JOIN_NOLOCK); +} + /* * either allocate a new transaction or hop into the existing one */ @@ -86,6 +94,10 @@ loop: spin_unlock(fs_info-trans_lock); return cur_trans-aborted; } + if (!can_join_transaction(cur_trans, type)) { + spin_unlock(fs_info-trans_lock); + return -EBUSY; + } atomic_inc(cur_trans-use_count); atomic_inc(cur_trans-num_writers); cur_trans-num_joined++; @@ -360,8 +372,11 @@ again: do { ret = join_transaction(root, type); - if (ret == -EBUSY) + if (ret == -EBUSY) { wait_current_trans(root); + if (unlikely(type == TRANS_ATTACH)) + ret = -ENOENT; + } So I understand that instead of incrementing 
num_writes and joining the current transaction, you do not join and wait for the current transaction to unblock. More specifically,TRANS_START、TRANS_USERSPACE and TRANS_ATTACH can not join and just wait for the current transaction to unblock if -in_commit is set. Which task in Josef's example http://marc.info/?l=linux-btrfsm=136070705732646w=2 task 1, task 2 or task 3 is the one that will not join the transaction, but instead wait? Task1 will not join the transaction, in this way, async inode flush won't run, and then task3 won't do anything. Before applying the patch: Start/Attach_Trans_Task Commit_Task
Re: [PATCH] Btrfs: improve the delayed inode throttling
On wed, 6 Mar 2013 09:53:28 -0500, Chris Mason wrote: On Tue, Mar 05, 2013 at 07:45:34PM -0700, Miao Xie wrote: We re-queue the node just when there are some delayed items in the current node. But if the node still has delayed items after we deal with it, that is to say someone is accessing the node. So it is better to release it and deal with it later. In this way, we can amass more items and deal with them in batches. Thanks, I've made this change. + } else { + btrfs_release_prepared_delayed_node(delayed_node); + if (async_work-nr == 0 || total_done async_work-nr) + goto again; If joining transaction fails, we should end the async handle. And for case -nr == 0 (it means there are too many items, we need flush all), we can set -blocked of the current transaction, in this way, the users can not insert any delayed item for a while, and will wait until the current transation is committed This one I've left out for now, the old code didn't block and I'd prefer that we test that change independently. V2 below, it also has the break Liu Bo mentioned. From: Chris Mason chris.ma...@fusionio.com Date: Mon, 4 Mar 2013 17:13:31 -0500 Subject: [PATCH] Btrfs: improve the delayed inode throttling The delayed inode code batches up changes to the btree in hopes of doing them in bulk. As the changes build up, processes kick off worker threads and wait for them to make progress. The current code kicks off an async work queue item for each delayed node, which creates a lot of churn. It also uses a fixed 1 HZ waiting period for the throttle, which allows us to build a lot of pending work and can slow down the commit. This changes us to watch a sequence counter as it is bumped during the operations. We kick off fewer work items and have each work item do more work. 
Signed-off-by: Chris Mason chris.ma...@fusionio.com --- fs/btrfs/delayed-inode.c | 152 --- fs/btrfs/delayed-inode.h | 2 + 2 files changed, 94 insertions(+), 60 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0b278b1..46f354a 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -22,8 +22,9 @@ #include disk-io.h #include transaction.h -#define BTRFS_DELAYED_WRITEBACK 400 -#define BTRFS_DELAYED_BACKGROUND 100 +#define BTRFS_DELAYED_WRITEBACK 512 +#define BTRFS_DELAYED_BACKGROUND 128 +#define BTRFS_DELAYED_BATCH 16 static struct kmem_cache *delayed_node_cache; @@ -494,6 +495,15 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, BTRFS_DELAYED_DELETION_ITEM); } +static void finish_one_item(struct btrfs_delayed_root *delayed_root) +{ + int seq = atomic_inc_return(delayed_root-items_seq); + if ((atomic_dec_return(delayed_root-items) + BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) + waitqueue_active(delayed_root-wait)) + wake_up(delayed_root-wait); +} + static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { struct rb_root *root; @@ -512,10 +522,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) rb_erase(delayed_item-rb_node, root); delayed_item-delayed_node-count--; - if (atomic_dec_return(delayed_root-items) - BTRFS_DELAYED_BACKGROUND - waitqueue_active(delayed_root-wait)) - wake_up(delayed_root-wait); + + finish_one_item(delayed_root); } static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) @@ -1056,10 +1064,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) delayed_node-count--; delayed_root = delayed_node-root-fs_info-delayed_root; - if (atomic_dec_return(delayed_root-items) - BTRFS_DELAYED_BACKGROUND - waitqueue_active(delayed_root-wait)) - wake_up(delayed_root-wait); + finish_one_item(delayed_root); } } @@ -1304,35 +1309,44 @@ void 
btrfs_remove_delayed_node(struct inode *inode) btrfs_release_delayed_node(delayed_node); } -struct btrfs_async_delayed_node { - struct btrfs_root *root; - struct btrfs_delayed_node *delayed_node; +struct btrfs_async_delayed_work { + struct btrfs_delayed_root *delayed_root; + int nr; struct btrfs_work work; }; -static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) +static void btrfs_async_run_delayed_root(struct btrfs_work *work) { - struct btrfs_async_delayed_node *async_node; + struct btrfs_async_delayed_work *async_work; + struct btrfs_delayed_root *delayed_root; struct btrfs_trans_handle *trans; struct
Re: [PATCH] Btrfs: improve the delayed inode throttling
On wed, 6 Mar 2013 09:53:28 -0500, Chris Mason wrote: [SNIP] static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, - struct btrfs_root *root, int all) + struct btrfs_root *root, int nr) { - struct btrfs_async_delayed_node *async_node; - struct btrfs_delayed_node *curr; - int count = 0; + struct btrfs_async_delayed_work *async_work; -again: - curr = btrfs_first_prepared_delayed_node(delayed_root); - if (!curr) + if (atomic_read(delayed_root-items) BTRFS_DELAYED_BACKGROUND) return 0; - async_node = kmalloc(sizeof(*async_node), GFP_NOFS); - if (!async_node) { - btrfs_release_prepared_delayed_node(curr); + async_work = kmalloc(sizeof(*async_work), GFP_NOFS); + if (!async_work) return -ENOMEM; - } - - async_node-root = root; - async_node-delayed_node = curr; - - async_node-work.func = btrfs_async_run_delayed_node_done; - async_node-work.flags = 0; - - btrfs_queue_worker(root-fs_info-delayed_workers, async_node-work); - count++; - if (all || count 4) - goto again; + async_work-delayed_root = delayed_root; + async_work-work.func = btrfs_async_run_delayed_root; + async_work-work.flags = 0; + if (nr) + async_work-nr = 0; + else + async_work-nr = nr; the code here is wrong. the argument nr is the number we want to deal with, if it is 0, we will deal with all. 
so - if (nr) - async_work-nr = 0; - else - async_work-nr = nr; + async_work-nr = nr; + btrfs_queue_worker(root-fs_info-delayed_workers, async_work-work); return 0; } @@ -1424,30 +1431,55 @@ void btrfs_assert_delayed_root_empty(struct btrfs_root *root) WARN_ON(btrfs_first_delayed_node(delayed_root)); } +static int refs_newer(struct btrfs_delayed_root *delayed_root, + int seq, int count) +{ + int val = atomic_read(delayed_root-items_seq); + + if (val seq || val = seq + count) + return 1; + return 0; +} + void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; + int seq; delayed_root = btrfs_get_delayed_root(root); if (atomic_read(delayed_root-items) BTRFS_DELAYED_BACKGROUND) return; + seq = atomic_read(delayed_root-items_seq); + if (atomic_read(delayed_root-items) = BTRFS_DELAYED_WRITEBACK) { int ret; + DEFINE_WAIT(__wait); + ret = btrfs_wq_run_delayed_node(delayed_root, root, 1); here - ret = btrfs_wq_run_delayed_node(delayed_root, root, 1); + ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); if (ret) return; - wait_event_interruptible_timeout( - delayed_root-wait, - (atomic_read(delayed_root-items) - BTRFS_DELAYED_BACKGROUND), - HZ); - return; + while (1) { + prepare_to_wait(delayed_root-wait, __wait, + TASK_INTERRUPTIBLE); + + if (refs_newer(delayed_root, seq, +BTRFS_DELAYED_BATCH) || + atomic_read(delayed_root-items) + BTRFS_DELAYED_BACKGROUND) { + break; + } + if (!signal_pending(current)) + schedule(); + else + break; + } + finish_wait(delayed_root-wait, __wait); } - btrfs_wq_run_delayed_node(delayed_root, root, 0); + btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); } There is a problem that we may introduce lots of btrfs_works, we need avoid it. Thanks Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: improve the delayed inode throttling
On wed, 6 Mar 2013 22:06:50 -0500, Chris Mason wrote: On Wed, Mar 06, 2013 at 06:39:30PM -0700, Miao Xie wrote: On wed, 6 Mar 2013 09:53:28 -0500, Chris Mason wrote: [SNIP] + async_work-delayed_root = delayed_root; + async_work-work.func = btrfs_async_run_delayed_root; + async_work-work.flags = 0; + if (nr) + async_work-nr = 0; + else + async_work-nr = nr; the code here is wrong. the argument nr is the number we want to deal with, if it is 0, we will deal with all. Whoops, thanks. I missed that when I was cleaning things up. - btrfs_wq_run_delayed_node(delayed_root, root, 0); + btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); } There is a problem that we may introduce lots of btrfs_works, we need avoid it. It is possible, but we won't make more than we used to. The real solution is to limit the workers per root, but the code isn't currently structured for that. Right now the workers will exit out if the number of pending items is below the delayed limit, which isn't perfect but I think it's the best I can do right now. Do you see better ways to improve it? How do you think about per-cpu btrfs_work? If btrfs_work on the current cpu is dealt with, we don't queue it, just update -nr if need and tell the workers that we need do flush again. (This way is a bit ugly because btrfs_work might not be handled on its cpu) Thanks Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: improve the delayed inode throttling
On tue, 5 Mar 2013 10:40:17 -0500, Chris Mason wrote: diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0b278b1..460d1a8 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -22,8 +22,8 @@ #include disk-io.h #include transaction.h -#define BTRFS_DELAYED_WRITEBACK 400 -#define BTRFS_DELAYED_BACKGROUND 100 +#define BTRFS_DELAYED_WRITEBACK 512 +#define BTRFS_DELAYED_BACKGROUND 128 static struct kmem_cache *delayed_node_cache; @@ -494,6 +494,15 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, BTRFS_DELAYED_DELETION_ITEM); } +static void finish_one_item(struct btrfs_delayed_root *delayed_root) +{ + int seq = atomic_inc_return(delayed_root-items_seq); + if ((atomic_dec_return(delayed_root-items) + BTRFS_DELAYED_BACKGROUND || seq % 16 == 0) + waitqueue_active(delayed_root-wait)) + wake_up(delayed_root-wait); +} + static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { struct rb_root *root; @@ -512,10 +521,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) rb_erase(delayed_item-rb_node, root); delayed_item-delayed_node-count--; - if (atomic_dec_return(delayed_root-items) - BTRFS_DELAYED_BACKGROUND - waitqueue_active(delayed_root-wait)) - wake_up(delayed_root-wait); + + finish_one_item(delayed_root); } static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) @@ -1056,10 +1063,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) delayed_node-count--; delayed_root = delayed_node-root-fs_info-delayed_root; - if (atomic_dec_return(delayed_root-items) - BTRFS_DELAYED_BACKGROUND - waitqueue_active(delayed_root-wait)) - wake_up(delayed_root-wait); + finish_one_item(delayed_root); } } @@ -1304,35 +1308,55 @@ void btrfs_remove_delayed_node(struct inode *inode) btrfs_release_delayed_node(delayed_node); } +#if 0 struct btrfs_async_delayed_node { struct btrfs_root *root; struct btrfs_delayed_node 
*delayed_node; struct btrfs_work work; }; +#endif + +struct btrfs_async_delayed_work { + struct btrfs_delayed_root *delayed_root; + int nr; + struct btrfs_work work; +}; -static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) +static void btrfs_async_run_delayed_root(struct btrfs_work *work) { - struct btrfs_async_delayed_node *async_node; + struct btrfs_async_delayed_work *async_work; + struct btrfs_delayed_root *delayed_root; struct btrfs_trans_handle *trans; struct btrfs_path *path; struct btrfs_delayed_node *delayed_node = NULL; struct btrfs_root *root; struct btrfs_block_rsv *block_rsv; int need_requeue = 0; + int total_done = 0; - async_node = container_of(work, struct btrfs_async_delayed_node, work); + async_work = container_of(work, struct btrfs_async_delayed_work, work); + delayed_root = async_work-delayed_root; path = btrfs_alloc_path(); if (!path) goto out; - path-leave_spinning = 1; - delayed_node = async_node-delayed_node; +again: + if (atomic_read(delayed_root-items) BTRFS_DELAYED_BACKGROUND / 2) + goto free_path; + + delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + if (!delayed_node) + goto free_path; + +requeue: + path-leave_spinning = 1; + need_requeue = 0; root = delayed_node-root; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) - goto free_path; + goto release_path; block_rsv = trans-block_rsv; trans-block_rsv = root-fs_info-delayed_block_rsv; @@ -1373,47 +1397,48 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) trans-block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); btrfs_btree_balance_dirty_nodelay(root); + +release_path: + btrfs_release_path(path); + total_done++; + + if (need_requeue) { + goto requeue; We re-queue the node just when there are some delayed items in the current node. But if the node still has delayed items after we deal with it, that is to say someone is accessing the node. So it is better to release it and deal with it later. 
In this way, we can amass more items and deal with them in batches. + } else { + btrfs_release_prepared_delayed_node(delayed_node); + if (async_work-nr == 0 || total_done async_work-nr) + goto again; If joining transaction fails, we should
[PATCH 1/2] Btrfs: fix wrong handle at error path of create_snapshot() when the commit fails
There are several bugs at error path of create_snapshot() when the transaction commitment failed. - access the freed transaction handler. At the end of the transaction commitment, the transaction handler was freed, so we should not access it after the transaction commitment. - we were not aware of the error which happened during the snapshot creation if we submitted a async transaction commitment. - pending snapshot access vs pending snapshot free. when something wrong happened after we submitted a async transaction commitment, the transaction committer would cleanup the pending snapshots and free them. But the snapshot creators were not aware of it, they would access the freed pending snapshots. This patch fixes the above problems by: - remove the dangerous code that accessed the freed handler - assign -error if the error happens during the snapshot creation - the transaction committer doesn't free the pending snapshots, just assigns the error number and evicts them before we unblock the transaction. 
Reported-by: Dan Carpenter dan.carpen...@oracle.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 16 +--- fs/btrfs/ioctl.c |6 + fs/btrfs/transaction.c | 58 +++ 3 files changed, 41 insertions(+), 39 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 02369a3..7d84651 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -62,7 +62,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t); static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); static int btrfs_destroy_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, @@ -3687,7 +3687,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t) { struct btrfs_pending_snapshot *snapshot; struct list_head splice; @@ -3700,10 +3700,8 @@ static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) snapshot = list_entry(splice.next, struct btrfs_pending_snapshot, list); - + snapshot-error = -ECANCELED; list_del_init(snapshot-list); - - kfree(snapshot); } } @@ -3840,6 +3838,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans-blocked = 1; wake_up(root-fs_info-transaction_blocked_wait); + btrfs_evict_pending_snapshots(cur_trans); + cur_trans-blocked = 0; wake_up(root-fs_info-transaction_wait); @@ -3849,8 +3849,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); - 
btrfs_destroy_pending_snapshots(cur_trans); - btrfs_destroy_marked_extents(root, cur_trans-dirty_pages, EXTENT_DIRTY); btrfs_destroy_pinned_extent(root, @@ -3894,6 +3892,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) if (waitqueue_active(root-fs_info-transaction_blocked_wait)) wake_up(root-fs_info-transaction_blocked_wait); + btrfs_evict_pending_snapshots(t); + t-blocked = 0; smp_mb(); if (waitqueue_active(root-fs_info-transaction_wait)) @@ -3907,8 +3907,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); - btrfs_destroy_pending_snapshots(t); - btrfs_destroy_delalloc_inodes(root); spin_lock(root-fs_info-trans_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b908960..94c0e42 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -596,12 +596,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, ret = btrfs_commit_transaction(trans, root-fs_info-extent_root); } - if (ret) { - /* cleanup_transaction has freed this for us */ - if (trans-aborted) - pending_snapshot = NULL; + if (ret) goto fail; - } ret = pending_snapshot-error; if (ret
[PATCH 2/2] Btrfs: fix unclosed transaction handler when the async transaction commitment fails
If the async transaction commitment failed, we need to close the current transaction handle, or the current transaction will be blocked from committing because of this orphaned handle. We fix the problem by doing a sync transaction commitment, that is, by invoking btrfs_commit_transaction(). Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 94c0e42..3fdfabc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -527,6 +527,8 @@ fail: if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); + if (err) + err = btrfs_commit_transaction(trans, root); } else { err = btrfs_commit_transaction(trans, root); } @@ -592,6 +594,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, *async_transid = trans->transid; ret = btrfs_commit_transaction_async(trans, root->fs_info->extent_root, 1); + if (ret) + ret = btrfs_commit_transaction(trans, root); } else { ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); -- 1.6.5.2 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] Btrfs: fix wrong handle at error path of create_snapshot() when the commit fails
On Mon, 4 Mar 2013 18:54:02 +0800, Liu Bo wrote: On Mon, Mar 04, 2013 at 05:44:29PM +0800, Miao Xie wrote: There are several bugs at error path of create_snapshot() when the transaction commitment failed. - access the freed transaction handler. At the end of the transaction commitment, the transaction handler was freed, so we should not access it after the transaction commitment. - we were not aware of the error which happened during the snapshot creation if we submitted a async transaction commitment. - pending snapshot access vs pending snapshot free. when something wrong happened after we submitted a async transaction commitment, the transaction committer would cleanup the pending snapshots and free them. But the snapshot creators were not aware of it, they would access the freed pending snapshots. This patch fixes the above problems by: - remove the dangerous code that accessed the freed handler - assign -error if the error happens during the snapshot creation - the transaction committer doesn't free the pending snapshots, just assigns the error number and evicts them before we unblock the transaction. 
Reported-by: Dan Carpenter dan.carpen...@oracle.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 16 +--- fs/btrfs/ioctl.c |6 + fs/btrfs/transaction.c | 58 +++ 3 files changed, 41 insertions(+), 39 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 02369a3..7d84651 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -62,7 +62,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t); static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); static int btrfs_destroy_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, @@ -3687,7 +3687,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t) { struct btrfs_pending_snapshot *snapshot; struct list_head splice; @@ -3700,10 +3700,8 @@ static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) snapshot = list_entry(splice.next, struct btrfs_pending_snapshot, list); - +snapshot-error = -ECANCELED; ECANCELED or EROFS? Now that EROFS is why we're here. If trans-blocks_used is not 0, the file system may not be set to read-only, so I chose ECANCELED, this error number is proper, I think. Thanks Miao Others look good. 
thanks, liubo list_del_init(snapshot-list); - -kfree(snapshot); } } @@ -3840,6 +3838,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans-blocked = 1; wake_up(root-fs_info-transaction_blocked_wait); +btrfs_evict_pending_snapshots(cur_trans); + cur_trans-blocked = 0; wake_up(root-fs_info-transaction_wait); @@ -3849,8 +3849,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); -btrfs_destroy_pending_snapshots(cur_trans); - btrfs_destroy_marked_extents(root, cur_trans-dirty_pages, EXTENT_DIRTY); btrfs_destroy_pinned_extent(root, @@ -3894,6 +3892,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) if (waitqueue_active(root-fs_info-transaction_blocked_wait)) wake_up(root-fs_info-transaction_blocked_wait); +btrfs_evict_pending_snapshots(t); + t-blocked = 0; smp_mb(); if (waitqueue_active(root-fs_info-transaction_wait)) @@ -3907,8 +3907,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); -btrfs_destroy_pending_snapshots(t); - btrfs_destroy_delalloc_inodes(root); spin_lock(root-fs_info-trans_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b908960..94c0e42 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -596,12 +596,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, ret = btrfs_commit_transaction(trans
[PATCH 1/3] Btrfs: remove unnecessary dget_parent/dput when creating the pending snapshot
Since we have grabbed the parent inode at the beginning of the snapshot creation, and both sync and async snapshot creation release it after the pending snapshots are actually created, it is safe to access the parent inode directly during the snapshot creation, we needn't use dget_parent/dput to fix the parent dentry and get the dir inode. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c | 10 ++ fs/btrfs/transaction.c |5 + fs/btrfs/transaction.h |1 + 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2bbbed5..75c551d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -530,9 +530,10 @@ fail: return ret; } -static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen, u64 *async_transid, - bool readonly, struct btrfs_qgroup_inherit *inherit) +static int create_snapshot(struct btrfs_root *root, struct inode *dir, + struct dentry *dentry, char *name, int namelen, + u64 *async_transid, bool readonly, + struct btrfs_qgroup_inherit *inherit) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; @@ -551,6 +552,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, pending_snapshot-dentry = dentry; pending_snapshot-root = root; pending_snapshot-readonly = readonly; + pending_snapshot-dir = dir; pending_snapshot-inherit = inherit; trans = btrfs_start_transaction(root-fs_info-extent_root, 6); @@ -728,7 +730,7 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_up_read; if (snap_src) { - error = create_snapshot(snap_src, dentry, name, namelen, + error = create_snapshot(snap_src, dir, dentry, name, namelen, async_transid, readonly, inherit); } else { error = create_subvol(BTRFS_I(dir)-root, dentry, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 955204c..63390a3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1061,7 +1061,6 @@ static noinline int 
create_pending_snapshot(struct btrfs_trans_handle *trans, struct inode *parent_inode; struct btrfs_path *path; struct btrfs_dir_item *dir_item; - struct dentry *parent; struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; @@ -1118,8 +1117,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trans-block_rsv = pending-block_rsv; dentry = pending-dentry; - parent = dget_parent(dentry); - parent_inode = parent-d_inode; + parent_inode = pending-dir; parent_root = BTRFS_I(parent_inode)-root; record_root_in_trans(trans, parent_root); @@ -1267,7 +1265,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, root, ret); fail: - dput(parent); trans-block_rsv = rsv; no_free_objectid: kfree(new_root_item); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 5afd7b1..5f67fba 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -84,6 +84,7 @@ struct btrfs_trans_handle { struct btrfs_pending_snapshot { struct dentry *dentry; + struct inode *dir; struct btrfs_root *root; struct btrfs_root *snap; struct btrfs_qgroup_inherit *inherit; -- 1.6.5.2 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] Btrfs: fix wrong reserved space in qgroup during snap/subv creation
There are two problems in the space reservation of the snapshot/ subvolume creation. - don't reserve the space for the root item insertion - the space which is reserved in the qgroup is different with the free space reservation. we need reserve free space for 7 items, but in qgroup reservation, we need reserve space only for 3 items. So we implement new metadata reservation functions for the snapshot/subvolume creation. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h |9 +- fs/btrfs/extent-tree.c | 65 +++- fs/btrfs/ioctl.c | 62 +++-- fs/btrfs/transaction.c |4 +-- fs/btrfs/transaction.h |1 + 5 files changed, 105 insertions(+), 36 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b355bb4..b98c451 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3068,8 +3068,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_release_metadata(struct inode *inode); -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, +struct btrfs_block_rsv *rsv, +int nitems, +u64 *qgroup_reserved); +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved); int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 88831fa..b795ed9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4496,19 +4496,60 @@ void btrfs_orphan_release_metadata(struct inode *inode) btrfs_block_rsv_release(root, root-orphan_block_rsv, num_bytes); } -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, - struct 
btrfs_pending_snapshot *pending) +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need do reservation + * qgroup_reserved: used to return the reserved size in qgroup + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. Those operations are different with the + * common file/directory operations, they change two fs/file trees + * and root tree, the number of items that the qgroup reserves is + * different with the free space reservation. So we can not use + * the space reseravtion mechanism in start_transaction(). + */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, +struct btrfs_block_rsv *rsv, +int items, +u64 *qgroup_reserved) { - struct btrfs_root *root = pending-root; - struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); - struct btrfs_block_rsv *dst_rsv = pending-block_rsv; - /* -* two for root back/forward refs, two for directory entries, -* one for root of the snapshot and one for parent inode. 
-*/ - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); - dst_rsv-space_info = src_rsv-space_info; - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); + u64 num_bytes; + int ret; + + if (root-fs_info-quota_enabled) { + /* One for parent inode, two for dir entries */ + num_bytes = 3 * root-leafsize; + ret = btrfs_qgroup_reserve(root, num_bytes); + if (ret) + return ret; + } else { + num_bytes = 0; + } + + *qgroup_reserved = num_bytes; + + num_bytes = btrfs_calc_trans_metadata_size(root, items); + rsv-space_info = __find_space_info(root-fs_info, + BTRFS_BLOCK_GROUP_METADATA); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + if (*qgroup_reserved) + btrfs_qgroup_free(root, *qgroup_reserved); + } + + return ret; +} + +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved) +{ + btrfs_block_rsv_release(root, rsv
[PATCH 3/3] Btrfs: fix wrong reserved space when deleting a snapshot/subvolume
When deleting a snapshot/subvolume, we need remove root ref/backref, dir entries and update the dir inode, so we must reserve free space for those operations. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c | 21 +++-- 1 files changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8046cfc..0b46081 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2064,6 +2064,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_root *dest = NULL; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; + struct btrfs_block_rsv block_rsv; + u64 qgroup_reserved; int namelen; int ret; int err = 0; @@ -2153,12 +2155,23 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out_up_write; + btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* +* One for dir inode, two for dir entries, two for root +* ref/backref. +*/ + err = btrfs_subvolume_reserve_metadata(root, block_rsv, + 5, qgroup_reserved); + if (err) + goto out_up_write; + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); - goto out_up_write; + goto out_release; } - trans-block_rsv = root-fs_info-global_block_rsv; + trans-block_rsv = block_rsv; + trans-bytes_reserved = block_rsv.size; ret = btrfs_unlink_subvol(trans, root, dir, dest-root_key.objectid, @@ -2188,10 +2201,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, } } out_end_trans: + trans-block_rsv = NULL; + trans-bytes_reserved = 0; ret = btrfs_end_transaction(trans, root); if (ret !err) err = ret; inode-i_flags |= S_DEAD; +out_release: + btrfs_subvolume_release_metadata(root, block_rsv, qgroup_reserved); out_up_write: up_write(root-fs_info-subvol_sem); out_unlock: -- 1.6.5.2 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] Btrfs: fix the deadlock between the transaction start/attach and commit
On sun, 24 Feb 2013 21:49:55 +0200, Alex Lyakas wrote: Hi Miao, can you please explain your solution a bit more. On Wed, Feb 20, 2013 at 11:16 AM, Miao Xie mi...@cn.fujitsu.com wrote: Now btrfs_commit_transaction() does this ret = btrfs_run_ordered_operations(root, 0) which async flushes all inodes on the ordered operations list, it introduced a deadlock that transaction-start task, transaction-commit task and the flush workers waited for each other. (See the following URL to get the detail http://marc.info/?l=linux-btrfsm=136070705732646w=2) As we know, if -in_commit is set, it means someone is committing the current transaction, we should not try to join it if we are not JOIN or JOIN_NOLOCK, wait is the best choice for it. In this way, we can avoid the above problem. In this way, there is another benefit: there is no new transaction handle to block the transaction which is on the way of commit, once we set -in_commit. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 17 - 1 files changed, 16 insertions(+), 1 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bc2f2d1..71b7e2e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -51,6 +51,14 @@ static noinline void switch_commit_root(struct btrfs_root *root) root-commit_root = btrfs_root_node(root); } +static inline int can_join_transaction(struct btrfs_transaction *trans, + int type) +{ + return !(trans-in_commit +type != TRANS_JOIN +type != TRANS_JOIN_NOLOCK); +} + /* * either allocate a new transaction or hop into the existing one */ @@ -86,6 +94,10 @@ loop: spin_unlock(fs_info-trans_lock); return cur_trans-aborted; } + if (!can_join_transaction(cur_trans, type)) { + spin_unlock(fs_info-trans_lock); + return -EBUSY; + } atomic_inc(cur_trans-use_count); atomic_inc(cur_trans-num_writers); cur_trans-num_joined++; @@ -360,8 +372,11 @@ again: do { ret = join_transaction(root, type); - if (ret == -EBUSY) + if (ret == -EBUSY) { 
wait_current_trans(root); + if (unlikely(type == TRANS_ATTACH)) + ret = -ENOENT; + } So I understand that instead of incrementing num_writes and joining the current transaction, you do not join and wait for the current transaction to unblock. More specifically,TRANS_START、TRANS_USERSPACE and TRANS_ATTACH can not join and just wait for the current transaction to unblock if -in_commit is set. Which task in Josef's example http://marc.info/?l=linux-btrfsm=136070705732646w=2 task 1, task 2 or task 3 is the one that will not join the transaction, but instead wait? Task1 will not join the transaction, in this way, async inode flush won't run, and then task3 won't do anything. Before applying the patch: Start/Attach_Trans_Task Commit_Task Flush_Worker (Task1) (Task2) (Task3) -- the name in Josef's example btrfs_start_transaction() |-may_wait_transaction() | (return 0) | btrfs_commit_transaction() | |-set -in_commit and | | blocked to 1 | |-wait writers to be 1 | | (writers is 1) |-join_transaction() | | (writers is 2) | |-btrfs_commit_transaction() | | |-set trans_no_join to 1 | | (close join transaction) |-btrfs_run_ordered_operations | (Those ordered operations| are added when releasing| file) | |-async inode flush() | |-wait_flush_comlete() | | work_loop() | |-run_work() | |-btrfs_join_transaction() | |-wait_current_trans() |-wait writers to be 1 This three tasks waited for each other. After applying this patch: Start/Attach_Trans_Task Commit_Task Flush_Worker (Task1) (Task2
Re: [PATCH] Btrfs: update inode flags when renaming
On mon, 25 Feb 2013 11:50:01 +0800, Liu Bo wrote: On Fri, Feb 22, 2013 at 11:04:40PM +0100, David Sterba wrote: On Fri, Feb 22, 2013 at 05:34:47PM +0800, Miao Xie wrote: On fri, 22 Feb 2013 16:40:35 +0800, Liu Bo wrote: On Fri, Feb 22, 2013 at 03:32:50AM -0500, Marios Titas wrote: Sorry, but the bug persists even with the above patch. touch test chattr +C test lsattr test mv test test2 lsattr test2 In the above scenario test2 will not have the C flag. What do you expect? IMO it's right that test2 does not have the C flag. No, it's not right. For the users, they expect the C flag is not lost because they just do a rename operation. but fixup_inode_flags() re-sets the flags by the parent directory's flag. I think we should inherit the flags from the parent just when we create a new file/directory, in the other cases, just give a option to the users. How do you think about? I agree with that. The COW status of a file should not be changed at all when renamed. The typical users are database files and vm images, losing the NOCOW flag just from moving here and back is quite unexpected. david Yeah, I agree to remove this bad 'change in rename', will send a patch to address it. I think we can add a mount option, if the option is set, when we move a file to a new directory, or create a new file, we will inherit the flags of the parent. If not set, we inherit the flags only when create a new file. How do you think about it? Thanks Miao thanks, liubo -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: use reserved space for creating a snapshot
On fri, 22 Feb 2013 12:33:36 +0800, Liu Bo wrote: While inserting dir index and updating inode for a snapshot, we'd add delayed items which consume trans-block_rsv, if we don't have any space reserved in this trans handle, we either just return or reserve space again. But before creating pending snapshots during committing transaction, we've done a release on this trans handle, so we don't have space reserved in it at this stage. What we're using is block_rsv of pending snapshots which has already reserved well enough space for both inserting dir index and updating inode, so we need to set trans handle to indicate that we have space now. Signed-off-by: Liu Bo bo.li@oracle.com Reviewed-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fc03aa6..5878bb4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1063,6 +1063,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, rsv = trans-block_rsv; trans-block_rsv = pending-block_rsv; + trans-bytes_reserved = trans-block_rsv-reserved; dentry = pending-dentry; parent = dget_parent(dentry); @@ -1216,6 +1217,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, fail: dput(parent); trans-block_rsv = rsv; + trans-bytes_reserved = 0; no_free_objectid: kfree(new_root_item); root_item_alloc_fail: -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: fix wrong outstanding_extents when doing DIO write
When running the 083rd case of xfstests on a filesystem with compress-force=lzo, the following WARNINGs were triggered. WARNING: at fs/btrfs/inode.c:7908 WARNING: at fs/btrfs/inode.c:7909 WARNING: at fs/btrfs/inode.c:7911 WARNING: at fs/btrfs/extent-tree.c:4510 WARNING: at fs/btrfs/extent-tree.c:4511 This problem was introduced by the patch "Btrfs: fix deadlock due to unsubmitted". In that patch, there are two bugs which caused the above problem. The 1st one is an off-by-one bug: if the DIO write returns 0, it is also a short write, and we need to release the reserved space for it. But we didn't do that in that patch. Fix it by changing "ret > 0" to "ret >= 0". The 2nd one is that ->outstanding_extents was increased twice when a short write happened. As we know, ->outstanding_extents is a counter to keep track of the number of extent items we may use due to delalloc. When we reserve the free space for a delalloc write, we assume that the write will introduce just one extent item, so we increase ->outstanding_extents by 1 at that time. And then we increase it every time we split the write; this is done at the beginning of btrfs_get_blocks_direct(). So when a short write happens, we needn't increase ->outstanding_extents again. But that patch did. In order to fix the 2nd problem, I rewrote the logic for the ->outstanding_extents operation. We don't increase it at the beginning of btrfs_get_blocks_direct(); instead, we just increase it when the split actually happens. 
Reported-by: Mitch Harder mitch.har...@sabayonlinux.org Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/inode.c | 20 +--- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b009fb5..9a1cc04 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6067,12 +6067,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, int unlock_bits = EXTENT_LOCKED; int ret = 0; - if (create) { - spin_lock(BTRFS_I(inode)-lock); - BTRFS_I(inode)-outstanding_extents++; - spin_unlock(BTRFS_I(inode)-lock); + if (create) unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; - } else + else len = min_t(u64, len, root-sectorsize); lockstart = start; @@ -6214,6 +6211,10 @@ unlock: if (start + len i_size_read(inode)) i_size_write(inode, start + len); + spin_lock(BTRFS_I(inode)-lock); + BTRFS_I(inode)-outstanding_extents++; + spin_unlock(BTRFS_I(inode)-lock); + ret = set_extent_bit(BTRFS_I(inode)-io_tree, lockstart, lockstart + len - 1, EXTENT_DELALLOC, NULL, cached_state, GFP_NOFS); @@ -6716,14 +6717,11 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, if (rw WRITE) { if (ret 0 ret != -EIOCBQUEUED) btrfs_delalloc_release_space(inode, count); - else if (ret 0 (size_t)ret count) { - spin_lock(BTRFS_I(inode)-lock); - BTRFS_I(inode)-outstanding_extents++; - spin_unlock(BTRFS_I(inode)-lock); + else if (ret = 0 (size_t)ret count) btrfs_delalloc_release_space(inode, count - (size_t)ret); - } - btrfs_delalloc_release_metadata(inode, 0); + else + btrfs_delalloc_release_metadata(inode, 0); } out: if (wakeup) -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Kernel WARNINGs on btrfs-next
hi, On wed, 20 Feb 2013 23:35:36 -0600, Mitch Harder wrote: I'm getting a series of kernel WARNING messages when testing Josef's btrfs-next and Chris' next branch running xfstests 083 when mounted with compress-force=lzo. I'm not seeing any other indications of problems other than the WARNINGs on xfstests 083, so this may be some sort of false positive. Here are the messages against Chris' -next branch (the same warnings are being generated against josef's branch, except against a 3.7.x kernel): I sent a patch to fix this problem as a reply of this mail, could you test it for me? Thanks Miao [ 553.194991] [ cut here ] [ 553.195002] WARNING: at fs/btrfs/inode.c:7908 btrfs_destroy_inode+0x67/0x25b [btrfs]() [ 553.195043] Hardware name: OptiPlex 745 [ 553.195046] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep ppdev parport_pc snd_pcm snd_page_alloc snd_timer snd floppy sr_mod i2c_i801 tg3 ptp iTCO_wdt pps_core iTCO_vendor_support ehci_pci parport lpc_ich microcode serio_raw pcspkr ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [ 553.195099] Pid: 4674, comm: rm Not tainted 3.8.0-mason-next+ #1 [ 553.195102] Call Trace: [ 553.195112] [81030522] warn_slowpath_common+0x83/0x9b [ 553.195118] [81030554] warn_slowpath_null+0x1a/0x1c [ 553.195135] [a018d69e] btrfs_destroy_inode+0x67/0x25b [btrfs] [ 553.195141] [8111759a] destroy_inode+0x3b/0x54 [ 553.195145] [811176fc] evict+0x149/0x151 [ 553.195149] [81117f82] iput+0x12c/0x135 [ 553.195166] [a0187f42] ? btrfs_unlink_inode+0x38/0x40 [btrfs] [ 553.195171] [8110de10] do_unlinkat+0x145/0x1df [ 553.195177] [81106e9f] ? 
sys_newfstatat+0x2a/0x33 [ 553.195191] [8110fce5] sys_unlinkat+0x29/0x2b [ 553.195212] [81607746] system_call_fastpath+0x1a/0x1f [ 553.195224] ---[ end trace 0adc4db1ad1a6634 ]--- [ 553.195231] [ cut here ] [ 553.195247] WARNING: at fs/btrfs/inode.c:7909 btrfs_destroy_inode+0x7e/0x25b [btrfs]() [ 553.195249] Hardware name: OptiPlex 745 [ 553.195251] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep ppdev parport_pc snd_pcm snd_page_alloc snd_timer snd floppy sr_mod i2c_i801 tg3 ptp iTCO_wdt pps_core iTCO_vendor_support ehci_pci parport lpc_ich microcode serio_raw pcspkr ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [ 553.195296] Pid: 4674, comm: rm Tainted: GW3.8.0-mason-next+ #1 [ 553.195298] Call Trace: [ 553.195304] [81030522] warn_slowpath_common+0x83/0x9b [ 553.195308] [81030554] warn_slowpath_null+0x1a/0x1c [ 553.195324] [a018d6b5] btrfs_destroy_inode+0x7e/0x25b [btrfs] [ 553.195329] [8111759a] destroy_inode+0x3b/0x54 [ 553.195333] [811176fc] evict+0x149/0x151 [ 553.195336] [81117f82] iput+0x12c/0x135 [ 553.195352] [a0187f42] ? btrfs_unlink_inode+0x38/0x40 [btrfs] [ 553.195356] [8110de10] do_unlinkat+0x145/0x1df [ 553.195360] [81106e9f] ? 
sys_newfstatat+0x2a/0x33 [ 553.195364] [8110fce5] sys_unlinkat+0x29/0x2b [ 553.195368] [81607746] system_call_fastpath+0x1a/0x1f [ 553.195371] ---[ end trace 0adc4db1ad1a6635 ]--- [ 553.195373] [ cut here ] [ 553.195389] WARNING: at fs/btrfs/inode.c:7911 btrfs_destroy_inode+0xae/0x25b [btrfs]() [ 553.195391] Hardware name: OptiPlex 745 [ 553.195393] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep ppdev parport_pc snd_pcm snd_page_alloc snd_timer snd floppy sr_mod i2c_i801 tg3 ptp iTCO_wdt pps_core iTCO_vendor_support ehci_pci parport lpc_ich microcode serio_raw pcspkr ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [ 553.195437] Pid: 4674, comm: rm Tainted: GW3.8.0-mason-next+ #1 [ 553.195439] Call Trace: [ 553.195444] [81030522] warn_slowpath_common+0x83/0x9b [ 553.195449] [81030554] warn_slowpath_null+0x1a/0x1c [ 553.195463] [a018d6e5] btrfs_destroy_inode+0xae/0x25b [btrfs] [ 553.195470] [8111759a] destroy_inode+0x3b/0x54 [ 553.195474] [811176fc] evict+0x149/0x151 [ 553.195480] [81117f82] iput+0x12c/0x135 [ 553.195495] [a0187f42] ? btrfs_unlink_inode+0x38/0x40 [btrfs] [ 553.195499] [8110de10] do_unlinkat+0x145/0x1df [ 553.195504]
[PATCH 1/3] Btrfs: fix the qgroup reserved space is released prematurely
In start_transaction(), we will try to join the transaction again after the current transaction is committed, so we should not release the reserved space of the qgroup. Fix it. Cc: Arne Jansen sensi...@gmx.net Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fc03aa6..bc2f2d1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -383,7 +383,7 @@ again: h->block_rsv = NULL; h->orig_rsv = NULL; h->aborted = 0; - h->qgroup_reserved = qgroup_reserved; + h->qgroup_reserved = 0; h->delayed_ref_elem.seq = 0; h->type = type; INIT_LIST_HEAD(&h->qgroup_ref_list); @@ -401,6 +401,7 @@ again: h->block_rsv = root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } + h->qgroup_reserved = qgroup_reserved; got_it: btrfs_record_root_in_trans(h, root); -- 1.6.5.2 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] Btrfs: fix the deadlock between the transaction start/attach and commit
Now btrfs_commit_transaction() does this ret = btrfs_run_ordered_operations(root, 0) which async flushes all inodes on the ordered operations list, it introduced a deadlock that transaction-start task, transaction-commit task and the flush workers waited for each other. (See the following URL to get the detail http://marc.info/?l=linux-btrfsm=136070705732646w=2) As we know, if -in_commit is set, it means someone is committing the current transaction, we should not try to join it if we are not JOIN or JOIN_NOLOCK, wait is the best choice for it. In this way, we can avoid the above problem. In this way, there is another benefit: there is no new transaction handle to block the transaction which is on the way of commit, once we set -in_commit. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 17 - 1 files changed, 16 insertions(+), 1 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bc2f2d1..71b7e2e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -51,6 +51,14 @@ static noinline void switch_commit_root(struct btrfs_root *root) root-commit_root = btrfs_root_node(root); } +static inline int can_join_transaction(struct btrfs_transaction *trans, + int type) +{ + return !(trans-in_commit +type != TRANS_JOIN +type != TRANS_JOIN_NOLOCK); +} + /* * either allocate a new transaction or hop into the existing one */ @@ -86,6 +94,10 @@ loop: spin_unlock(fs_info-trans_lock); return cur_trans-aborted; } + if (!can_join_transaction(cur_trans, type)) { + spin_unlock(fs_info-trans_lock); + return -EBUSY; + } atomic_inc(cur_trans-use_count); atomic_inc(cur_trans-num_writers); cur_trans-num_joined++; @@ -360,8 +372,11 @@ again: do { ret = join_transaction(root, type); - if (ret == -EBUSY) + if (ret == -EBUSY) { wait_current_trans(root); + if (unlikely(type == TRANS_ATTACH)) + ret = -ENOENT; + } } while (ret == -EBUSY); if (ret 0) { -- 1.6.5.2 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in 
the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] Btrfs: fix uncompleted transaction
In some cases, we need commit the current transaction, but don't want to start a new one if there is no running transaction, so we introduce the function - btrfs_attach_transaction(), which can catch the current transaction, and return -ENOENT if there is no running transaction. But no running transaction doesn't mean the current transction completely, because we removed the running transaction before it completes. In some cases, it doesn't matter. But in some special cases, such as freeze fs, we hope the transaction is fully on disk, it will introduce some bugs, for example, we may feeze the fs and dump the data in the disk, if the transction doesn't complete, we would dump inconsistent data. So we need fix the above problem for those cases. We fixes this problem by introducing a function: btrfs_attach_transaction_barrier() if we hope all the transaction is fully on the disk, even they are not running, we can use this function. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c |2 +- fs/btrfs/super.c |4 ++-- fs/btrfs/transaction.c | 32 fs/btrfs/transaction.h |2 ++ 4 files changed, 37 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a31cd93..7cbbc2a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3111,7 +3111,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, u64 transid; int ret; - trans = btrfs_attach_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) return PTR_ERR(trans); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d8982e9..74328f7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -876,7 +876,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_wait_ordered_extents(root, 0); - trans = btrfs_attach_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ if (PTR_ERR(trans) == -ENOENT) @@ -1559,7 +1559,7 @@ static int 
btrfs_freeze(struct super_block *sb) struct btrfs_trans_handle *trans; struct btrfs_root *root = btrfs_sb(sb)-tree_root; - trans = btrfs_attach_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ if (PTR_ERR(trans) == -ENOENT) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 71b7e2e..257f320 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -468,11 +468,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root return start_transaction(root, 0, TRANS_USERSPACE, 0); } +/* + * btrfs_attach_transaction() - catch the running transaction + * + * It is used when we want to commit the current the transaction, but + * don't want to start a new one. + * + * Note: If this function return -ENOENT, it just means there is no + * running transaction. But it is possible that the inactive transaction + * is still in the memory, not fully on disk. If you hope there is no + * inactive transaction in the fs when -ENOENT is returned, you should + * invoke + * btrfs_attach_transaction_barrier() + */ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_ATTACH, 0); } +/* + * btrfs_attach_transaction() - catch the running transaction + * + * It is similar to the above function, the differentia is this one + * will wait for all the inactive transactions until they fully + * complete. 
+ */ +struct btrfs_trans_handle * +btrfs_attach_transaction_barrier(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + + trans = start_transaction(root, 0, TRANS_ATTACH, 0); + if (IS_ERR(trans) PTR_ERR(trans) == -ENOENT) + btrfs_wait_for_commit(root, 0); + + return trans; +} + /* wait for a transaction commit to be fully complete */ static noinline void wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0e8aa1e..422a865 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -110,6 +110,8 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush( struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_attach_transaction_barrier( + struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root
[PATCH V2] Btrfs: fix remount vs autodefrag
If we remount the fs to close the auto defragment or make the fs R/O, we should stop the auto defragment. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v1 - v2: - don't use -s_umount to avoid R/W-R/O remounting during the defragment. Instead We add a new state that tell thedefragger the fs is under remount, then the defragger pauses. --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/file.c | 5 + fs/btrfs/super.c | 40 ++-- 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1679051..b355bb4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -339,6 +339,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) * File system states */ #define BTRFS_FS_STATE_ERROR 0 +#define BTRFS_FS_STATE_REMOUNTING 1 /* Super block flags */ /* Errors detected */ @@ -1864,6 +1865,7 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_clear_opt(o, opt)((o) = ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt) ((o) BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)-fs_info-mount_opt \ BTRFS_MOUNT_##opt) /* diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b12ba52..32b5cff 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -374,6 +374,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) atomic_inc(fs_info-defrag_running); while(1) { + /* Pause the auto defragger. 
*/ + if (test_bit(BTRFS_FS_STATE_REMOUNTING, +fs_info-fs_state)) + break; + if (!__need_auto_defrag(fs_info-tree_root)) break; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index db1ba9a..68a29a1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1202,6 +1202,38 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); } +static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info, +unsigned long old_opts, int flags) +{ + set_bit(BTRFS_FS_STATE_REMOUNTING, fs_info-fs_state); + + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) + (!btrfs_raw_test_opt(fs_info-mount_opt, AUTO_DEFRAG) || +(flags MS_RDONLY))) { + /* wait for any defraggers to finish */ + wait_event(fs_info-transaction_wait, + (atomic_read(fs_info-defrag_running) == 0)); + if (flags MS_RDONLY) + sync_filesystem(fs_info-sb); + } +} + +static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, +unsigned long old_opts) +{ + /* +* We need cleanup all defragable inodes if the autodefragment is +* close or the fs is R/O. 
+*/ + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) + (!btrfs_raw_test_opt(fs_info-mount_opt, AUTO_DEFRAG) || +(fs_info-sb-s_flags MS_RDONLY))) { + btrfs_cleanup_defrag_inodes(fs_info); + } + + clear_bit(BTRFS_FS_STATE_REMOUNTING, fs_info-fs_state); +} + static int btrfs_remount(struct super_block *sb, int *flags, char *data) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -1215,6 +1247,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned int old_metadata_ratio = fs_info-metadata_ratio; int ret; + btrfs_remount_prepare(fs_info, old_opts, *flags); + ret = btrfs_parse_options(root, data); if (ret) { ret = -EINVAL; @@ -1225,7 +1259,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) fs_info-thread_pool_size, old_thread_pool_size); if ((*flags MS_RDONLY) == (sb-s_flags MS_RDONLY)) - return 0; + goto out; if (*flags MS_RDONLY) { /* @@ -1280,7 +1314,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } sb-s_flags = ~MS_RDONLY; } - +out: + btrfs_remount_cleanup(fs_info, old_opts); return 0; restore: @@ -1297,6 +1332,7 @@ restore: btrfs_resize_thread_pool(fs_info, old_thread_pool_size, fs_info-thread_pool_size); fs_info-metadata_ratio = old_metadata_ratio; + btrfs_remount_cleanup(fs_info, old_opts); return ret; } -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: fix the deadlock between the transaction attach and commit
(Sorry for the late reply, I was on my vacation of the Spring Festival last week.) On Tue, 12 Feb 2013 13:56:32 +0100, David Sterba wrote: On Mon, Feb 11, 2013 at 03:35:37PM -0500, Josef Bacik wrote: or something like that. Me and kdave reproduced by running 274 in a loop, it happened pretty quick. I'd fix it myself but I have to leave my house for people to come look at it. If you haven't fixed this by tomorrow I'll fix it up. Thanks, I found 224 stuck with this [SNIP] mounted with noatime,space_cache Thanks for your test. My test skipped the 274th case because it always fails, and all the other cases passed, so I didn't hit this problem. Anyways, very sorry for my stupid patch. (I have reviewed Josef's fix patch, and commented on it, please see the reply of that patch) Thanks Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: place ordered operations on a per transaction list
On wed, 13 Feb 2013 11:13:22 -0500, Josef Bacik wrote: Miao made the ordered operations stuff run async, which introduced a deadlock where we could get somebody (sync) racing in and committing the transaction while a commit was already happening. The new committer would try and flush ordered operations which would hang waiting for the commit to finish because it is done asynchronously and no longer inherits the callers trans handle. To fix this we need to make the ordered operations list a per transaction list. We can get new inodes added to the ordered operation list by truncating them and then having another process writing to them, so this makes it so that anybody trying to add an ordered operation _must_ start a transaction in order to add itself to the list, which will keep new inodes from getting added to the ordered operations list after we start committing. This should fix the deadlock and also keeps us from doing a lot more work than we need to during commit. Thanks, Firstly, thanks to deal with the bug which was introduced by my patch. But comparing with this fix method, I prefer the following one because: - we won't worry the similar problem if we add more work during commit in the future. - it is unnecessary to get a new handle and commit it if the transaction is under the commit. Thanks Miao diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fc03aa6..c449cb5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -277,7 +277,8 @@ static void wait_current_trans(struct btrfs_root *root) } } -static int may_wait_transaction(struct btrfs_root *root, int type) +static int may_wait_transaction(struct btrfs_root *root, int type, + bool is_joined) { if (root-fs_info-log_root_recovering) return 0; @@ -285,6 +286,14 @@ static int may_wait_transaction(struct btrfs_root *root, int type) if (type == TRANS_USERSPACE) return 1; + /* +* If we are ATTACH, it means we just want to catch the current +* transaction and commit it. 
So if someone is committing the +* current transaction now, it is very glad to wait it. +*/ + if (is_joined type == TRANS_ATTACH) + return 1; + if (type == TRANS_START !atomic_read(root-fs_info-open_ioctl_trans)) return 1; @@ -355,7 +364,7 @@ again: if (type TRANS_JOIN_NOLOCK) sb_start_intwrite(root-fs_info-sb); - if (may_wait_transaction(root, type)) + if (may_wait_transaction(root, type, false)) wait_current_trans(root); do { @@ -383,16 +392,26 @@ again: h-block_rsv = NULL; h-orig_rsv = NULL; h-aborted = 0; - h-qgroup_reserved = qgroup_reserved; + h-qgroup_reserved = 0; h-delayed_ref_elem.seq = 0; h-type = type; INIT_LIST_HEAD(h-qgroup_ref_list); INIT_LIST_HEAD(h-new_bgs); smp_mb(); - if (cur_trans-blocked may_wait_transaction(root, type)) { - btrfs_commit_transaction(h, root); - goto again; + if (cur_trans-blocked may_wait_transaction(root, type, true)) { + if (cur_trans-in_commit) { + btrfs_end_transaction(h, root); + wait_current_trans(root); + } else { + btrfs_commit_transaction(h, root); + } + if (unlikely(type == TRANS_ATTACH)) { + ret = -ENOENT; + goto alloc_fail; + } else { + goto again; + } } if (num_bytes) { @@ -401,6 +420,7 @@ again: h-block_rsv = root-fs_info-trans_block_rsv; h-bytes_reserved = num_bytes; } + h-qgroup_reserved = qgroup_reserved; got_it: btrfs_record_root_in_trans(h, root); -- 1.6.5.2 Signed-off-by: Josef Bacik jba...@fusionio.com --- fs/btrfs/ctree.h|7 --- fs/btrfs/disk-io.c | 11 ++- fs/btrfs/file.c | 15 ++- fs/btrfs/ordered-data.c | 13 - fs/btrfs/ordered-data.h |3 ++- fs/btrfs/transaction.c |5 +++-- fs/btrfs/transaction.h |1 + 7 files changed, 34 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0c4e4df..9f72ec8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1408,13 +1408,6 @@ struct btrfs_fs_info { struct list_head delalloc_inodes; /* - * special rename and truncate targets that must be on disk before - * we're allowed to commit. This is basically the ext3 style - * data=ordered list. 
- */ - struct list_head ordered_operations; - - /* * there is a pool of worker threads for checksumming during writes * and a pool for checksumming
Re: [PATCH 2/2] Btrfs: fix memory leak of pending_snapshot-inherit
On Thu, 07 Feb 2013 09:43:47 +0100, Arne Jansen wrote: On 02/07/13 07:02, Miao Xie wrote: The argument inherit of btrfs_ioctl_snap_create_transid() was assigned to NULL during we created the snapshots, so we didn't free it though we called kfree() in the caller. But since we are sure the snapshot creation is done after the function - btrfs_ioctl_snap_create_transid() - completes, it is safe that we don't assign the pointer inherit to NULL, and just free it in the caller of btrfs_ioctl_snap_create_transid(). In this way, the code can become more readable. NAK. The snapshot creation is triggered from btrfs_commit_transaction, I don't want to implicitly rely on commit_transaction being called for each snapshot created. I'm not even sure the async path really commits the transaction. The responsibility for the creation is passed to the pending_snapshot data structure, and so should the responsibility for the inherit struct. I don't agree with you. We are sure the async path really commits the transaction because we pass 1 as the value of the third argument into btrfs_commit_transaction_async(). It means we must wait for the completion of the current transaction. So Freeing the inherit struct in the caller is safe. Besides that, the pending_snapshot data structure is also allocated and freed by the same function in fact, why not use this style for the inherit struct. I think it is more readable. Assigning a pointer to be NULL and freeing it in the caller is very strange for the people who reads the code. (It is also the reason why I made the mistake at the beginning.) So I think my patch is reasonable. 
Thanks Miao -Arne Reported-by: Alex Lyakas alex.bt...@zadarastorage.com Cc: Arne Jansen sensi...@gmx.net Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c | 18 +++--- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 02d3035..40f2fbf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -367,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, u64 *async_transid, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { struct btrfs_trans_handle *trans; struct btrfs_key key; @@ -401,8 +401,7 @@ static noinline int create_subvol(struct btrfs_root *root, if (IS_ERR(trans)) return PTR_ERR(trans); -ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid, - inherit ? *inherit : NULL); +ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid, inherit); if (ret) goto fail; @@ -530,7 +529,7 @@ fail: static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, u64 *async_transid, - bool readonly, struct btrfs_qgroup_inherit **inherit) + bool readonly, struct btrfs_qgroup_inherit *inherit) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; @@ -549,10 +548,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, pending_snapshot-dentry = dentry; pending_snapshot-root = root; pending_snapshot-readonly = readonly; -if (inherit) { -pending_snapshot-inherit = *inherit; -*inherit = NULL;/* take responsibility to free it */ -} +pending_snapshot-inherit = inherit; trans = btrfs_start_transaction(root-fs_info-extent_root, 6); if (IS_ERR(trans)) { @@ -692,7 +688,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, u64 *async_transid, bool readonly, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { struct inode *dir = parent-dentry-d_inode; 
struct dentry *dentry; @@ -1454,7 +1450,7 @@ out: static noinline int btrfs_ioctl_snap_create_transid(struct file *file, char *name, unsigned long fd, int subvol, u64 *transid, bool readonly, -struct btrfs_qgroup_inherit **inherit) +struct btrfs_qgroup_inherit *inherit) { int namelen; int ret = 0; @@ -1563,7 +1559,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, ret = btrfs_ioctl_snap_create_transid(file, vol_args-name, vol_args-fd, subvol, ptr
[RFC][PATCH] Btrfs: fix deadlock due to unsubmitted
The deadlock problem happened when running fsstress (a test program in LTP). Steps to reproduce: # mkfs.btrfs -b 100M partition # mount partition mnt # Path/fsstress -p 3 -n 1000 -d mnt The reason is: btrfs_direct_IO() |-do_direct_IO() |-get_page() |-get_blocks() | |-btrfs_delalloc_reserve_space() | |-btrfs_add_ordered_extent() --- Add a new ordered extent |-dio_send_cur_page(page0) -- We didn't submit bio here |-get_page() |-get_blocks() |-btrfs_delalloc_reserve_space() |-flush_space() |-btrfs_start_ordered_extent() |-wait_event() -- Wait the completion of the ordered extent that is mentioned above But because we didn't submit the bio that is mentioned above, the ordered extent can not complete, we would wait for its completion forever. There are two methods which can fix this deadlock problem: 1. submit the bio before we invoke get_blocks() 2. reserve the space before we do dio Though the 1st is the simplest way, we need modify the code of VFS, and it is likely to break contiguous requests, and introduce performance regression for the other filesystems. So we have to choose the 2nd way.
Signed-off-by: Miao Xie mi...@cn.fujitsu.com Cc: Josef Bacik jba...@fusionio.com --- fs/btrfs/extent-tree.c |3 +- fs/btrfs/inode.c | 81 --- 2 files changed, 43 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 85b8454..ca9afc4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4670,7 +4670,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) spin_lock(BTRFS_I(inode)-lock); dropped = drop_outstanding_extent(inode); - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + if (num_bytes) + to_free = calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(BTRFS_I(inode)-lock); if (dropped 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ca7ace7..c5d829d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6004,16 +6004,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 len = bh_result-b_size; struct btrfs_trans_handle *trans; int unlock_bits = EXTENT_LOCKED; - int ret; + int ret = 0; if (create) { - ret = btrfs_delalloc_reserve_space(inode, len); - if (ret) - return ret; + spin_lock(BTRFS_I(inode)-lock); + BTRFS_I(inode)-outstanding_extents++; + spin_unlock(BTRFS_I(inode)-lock); unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; - } else { + } else len = min_t(u64, len, root-sectorsize); - } lockstart = start; lockend = start + len - 1; @@ -6025,14 +6024,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (lock_extent_direct(inode, lockstart, lockend, cached_state, create)) return -ENOTBLK; - if (create) { - ret = set_extent_bit(BTRFS_I(inode)-io_tree, lockstart, -lockend, EXTENT_DELALLOC, NULL, -cached_state, GFP_NOFS); - if (ret) - goto unlock_err; - } - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -6064,7 +6055,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if 
(!create (em-block_start == EXTENT_MAP_HOLE || test_bit(EXTENT_FLAG_PREALLOC, em-flags))) { free_extent_map(em); - ret = 0; goto unlock_err; } @@ -6162,6 +6152,11 @@ unlock: */ if (start + len i_size_read(inode)) i_size_write(inode, start + len); + + ret = set_extent_bit(BTRFS_I(inode)-io_tree, lockstart, +lockstart + len - 1, EXTENT_DELALLOC, NULL, +cached_state, GFP_NOFS); + BUG_ON(ret); } /* @@ -6170,24 +6165,9 @@ unlock: * aren't using if there is any left over space. */ if (lockstart lockend) { - if (create len lockend - lockstart) { - clear_extent_bit(BTRFS_I(inode)-io_tree, lockstart, -lockstart + len - 1, -unlock_bits | EXTENT_DEFRAG, 1, 0, -cached_state
[PATCH] Btrfs: fix the deadlock between the transaction attach and commit
Here is the whole story: Trans_Attach_Task Trans_Commit_Task btrfs_commit_transaction() |-wait writers to be 1 btrfs_attach_transaction() | btrfs_commit_transaction() | | |-set trans_no_join to 1 | | (close join transaction) |-btrfs_run_ordered_operations | (Those ordered operations| are added when releasing| file) | |-btrfs_join_transaction() | |-wait_commit() | |-wait writers to be 1 Then these two tasks waited for each other. As we know, btrfs_attach_transaction() is used to catch the current transaction, and commit it, so if someone has committed the transaction, it is unnecessary to join it and commit it, wait is the best choice for it. In this way, we can fix the above problem. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c |8 1 files changed, 8 insertions(+), 0 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f154946..7be9d5e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -285,6 +285,14 @@ static int may_wait_transaction(struct btrfs_root *root, int type) if (type == TRANS_USERSPACE) return 1; + /* +* If we are ATTACH, it means we just want to catch the current +* transaction and commit it. So if someone is committing the +* current transaction now, it is very glad to wait it. +*/ + if (type == TRANS_ATTACH) + return 1; + if (type == TRANS_START !atomic_read(root-fs_info-open_ioctl_trans)) return 1; -- 1.6.5.2 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V2 1/2] Btrfs: serialize unlocked dio reads with truncate
Currently, we can do unlocked dio reads, but the following race is possible: dio_read_task truncate_task -btrfs_setattr() -btrfs_direct_IO -__blockdev_direct_IO -btrfs_get_block -btrfs_truncate() #alloc truncated blocks #to other inode -submit_io() #INFORMATION LEAK In order to avoid this problem, we must serialize unlocked dio reads with truncate. There are two approaches: - use extent lock to protect the extent that we truncate - use inode_dio_wait() to make sure the truncating task will wait for the read DIO. If we use the 1st one, we will meet the endless truncation problem due to the nonlocked read DIO after we implement the nonlocked write DIO. It is because we still need invoke inode_dio_wait() avoid the race between write DIO and truncation. By that time, we have to introduce btrfs_inode_{block, resume}_nolock_dio() again. That is we have to implement this patch again, so I choose the 2nd way to fix the problem. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changlog v1 - v2: - Rebase the patch against the following one: [RFC][PATCH] Btrfs: fix deadlock due to unsubmitted - Modify the changelog to explain why we don't choose the extent lock to fix the bug --- fs/btrfs/btrfs_inode.h | 19 +++ fs/btrfs/inode.c | 23 +-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 2a8c242..00e2601 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -40,6 +40,7 @@ #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 #define BTRFS_INODE_NEEDS_FULL_SYNC7 #define BTRFS_INODE_COPY_EVERYTHING8 +#define BTRFS_INODE_READDIO_NEED_LOCK 9 /* in memory btrfs inode */ struct btrfs_inode { @@ -216,4 +217,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) return 0; } +/* + * Disable DIO read nolock optimization, so new dio readers will be forced + * to grab i_mutex. It is used to avoid the endless truncate due to + * nonlocked dio read. 
+ */ +static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) +{ + set_bit(BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_I(inode)-runtime_flags); + smp_mb(); +} + +static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb__before_clear_bit(); + clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, + BTRFS_I(inode)-runtime_flags); +} + #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c5d829d..a49be05 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3832,6 +3832,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); + + /* Disable nonlocked read DIO to avoid the end less truncate */ + btrfs_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + btrfs_inode_resume_unlocked_dio(inode); + ret = btrfs_truncate(inode); if (ret inode-i_nlink) btrfs_orphan_del(NULL, inode); @@ -6615,6 +6621,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, struct file *file = iocb-ki_filp; struct inode *inode = file-f_mapping-host; size_t count = 0; + int flags = 0; + bool wakeup = false; ssize_t ret; if (check_direct_IO(BTRFS_I(inode)-root, rw, iocb, iov, @@ -6626,13 +6634,22 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = btrfs_delalloc_reserve_space(inode, count); if (ret) return ret; + } else { + atomic_inc(inode-i_dio_count); + smp_mb__after_atomic_inc(); + if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + BTRFS_I(inode)-runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + } else { + wakeup = true; + } } ret = __blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev, iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, 0); - + btrfs_submit_direct, flags); if (rw WRITE) { if (ret 0 ret != -EIOCBQUEUED) btrfs_delalloc_release_space(inode, count); @@ -6645,6 +6662,8 @@ static ssize_t btrfs_direct_IO(int rw, 
struct kiocb *iocb, } btrfs_delalloc_release_metadata(inode, 0
[PATCH V2 2/2] Btrfs: implement unlocked dio write
This idea is from ext4. By this patch, we can make the dio write parallel, and improve the performance. But because we can not update isize without i_mutex, the unlocked dio write just can be done in front of the EOF. We needn't worry about the race between dio write and truncate, because the truncate need wait untill all the dio write end. And we also needn't worry about the race between dio write and punch hole, because we have extent lock to protect our operation. I ran fio to test the performance of this feature. == Hardware == CPU: Intel(R) Core(TM)2 Duo CPU E7500 @ 2.93GHz Mem: 2GB SSD: Intel X25-M 120GB (Test Partition: 60GB) == config file == [global] ioengine=psync direct=1 bs=4k size=32G runtime=60 directory=/mnt/btrfs/ filename=testfile group_reporting thread [file1] numjobs=1 # 2 4 rw=randwrite == result (KBps) == write 1 2 4 lock24936 24738 24726 nolock 24962 30866 32101 == result (iops) == write 1 2 4 lock623461846181 nolock 624077168025 Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v1 - v2: - don't do nolocked DIO write if it is beyond the EOF --- fs/btrfs/inode.c | 35 +++ 1 files changed, 23 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a49be05..2948123 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6622,28 +6622,36 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file-f_mapping-host; size_t count = 0; int flags = 0; - bool wakeup = false; + bool wakeup = true; + bool relock = false; ssize_t ret; if (check_direct_IO(BTRFS_I(inode)-root, rw, iocb, iov, offset, nr_segs)) return 0; + atomic_inc(inode-i_dio_count); + smp_mb__after_atomic_inc(); + if (rw WRITE) { count = iov_length(iov, nr_segs); + /* +* If the write DIO is beyond the EOF, we need update +* the isize, but it is protected by i_mutex. So we can +* not unlock the i_mutex at this case. 
+*/ + if (offset + count = inode-i_size) { + mutex_unlock(inode-i_mutex); + relock = true; + } ret = btrfs_delalloc_reserve_space(inode, count); if (ret) - return ret; - } else { - atomic_inc(inode-i_dio_count); - smp_mb__after_atomic_inc(); - if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - BTRFS_I(inode)-runtime_flags))) { - inode_dio_done(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - } else { - wakeup = true; - } + goto out; + } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, +BTRFS_I(inode)-runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; } ret = __blockdev_direct_IO(rw, iocb, inode, @@ -6662,8 +6670,11 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, } btrfs_delalloc_release_metadata(inode, 0); } +out: if (wakeup) inode_dio_done(inode); + if (relock) + mutex_lock(inode-i_mutex); return ret; } -- 1.6.5.2 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Leaking btrfs_qgroup_inherit on snapshot creation?
On Wed, 06 Feb 2013 13:14:23 +0100, Arne Jansen wrote: Hi Alex, On 02/06/13 12:18, Alex Lyakas wrote: Hi Jan, Arne, I see this code in create_snapshot: if (inherit) { pending_snapshot-inherit = *inherit; *inherit = NULL;/* take responsibility to free it */ } So, first thing I think it should be: if (*inherit) because in btrfs_ioctl_snap_create_v2() we have: struct btrfs_qgroup_inherit *inherit = NULL; ... btrfs_ioctl_snap_create_transid(..., inherit) so the current check is very unlikely to be NULL. But in btrfs_ioctl_snap_create it is called with NULL, so *inherit would dereference a NULL pointer. Second, I don't see anybody freeing pending_snapshot-inherit. I guess it should be freed after calling btrfs_qgroup_inherit() and also in btrfs_destroy_pending_snapshots(). You're right. In our original version (6f72c7e20dbaea5) it was still there, in transaction.c. It has been removed in 6fa9700e734: commit 6fa9700e734275de2acbcb0e99414bd7ddfc60f1 Author: Miao Xie mi...@cn.fujitsu.com Date: Thu Sep 6 04:00:32 2012 -0600 Btrfs: fix error path in create_pending_snapshot() This patch fixes the following problem: - If we failed to deal with the delayed dir items, we should abort transaction, just as its comment said. Fix it. - If root reference or root back reference insertion failed, we should abort transaction. Fix it. - Fix the double free problem of pending-inherit. - Do not restore the trans-rsv if we doesn't change it. - make the error path more clearly. Signed-off-by: Miao Xie mi...@cn.fujitsu.com Miao, can you please explain where you see a double free? Sorry, I misread the code; I didn't notice that the pointer had been assigned to NULL. But I think we can make the code more readable and easier to maintain: we can free the memory in the caller (btrfs_ioctl_snap_create_v2()) since we are sure the snapshot creation is done after btrfs_ioctl_snap_create_transid() completes. 
Thanks Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] Btrfs: fix the race between bio and btrfs_stop_workers
open_ctree() need read the metadata to initialize the global information of btrfs. But it may fail after it submit some bio, and then it will jump to the error path. Unfortunately, it doesn't check if there are some bios in flight, and just stop all the worker threads. As a result, when the submitted bios end, they can not find any worker thread which can deal with subsequent work, then oops happen. kernel BUG at fs/btrfs/async-thread.c:605! Fix this problem by invoking invalidate_inode_pages2() before we stop the worker threads. This function will wait until the bio end because it need lock the pages which are going to be invalidated, and if a page is under disk read IO, it must be locked. invalidate_inode_pages2() need wait until end bio handler to unlocked it. Reported-and-Tested-by: Tsutomu Itoh t-i...@jp.fujitsu.com Signed-off-by: Eric Sandeen sand...@redhat.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0c31d07..d8fd711 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2728,13 +2728,13 @@ fail_cleaner: * kthreads */ filemap_write_and_wait(fs_info-btree_inode-i_mapping); - invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_block_groups: btrfs_free_block_groups(fs_info); fail_tree_roots: free_root_pointers(fs_info, 1); + invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_sb_buffer: btrfs_stop_workers(fs_info-generic_worker); @@ -2755,7 +2755,6 @@ fail_alloc: fail_iput: btrfs_mapping_tree_free(fs_info-mapping_tree); - invalidate_inode_pages2(fs_info-btree_inode-i_mapping); iput(fs_info-btree_inode); fail_bdi: bdi_destroy(fs_info-bdi); -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] Btrfs: fix memory leak of pending_snapshot-inherit
The argument inherit of btrfs_ioctl_snap_create_transid() was assigned to NULL during we created the snapshots, so we didn't free it though we called kfree() in the caller. But since we are sure the snapshot creation is done after the function - btrfs_ioctl_snap_create_transid() - completes, it is safe that we don't assign the pointer inherit to NULL, and just free it in the caller of btrfs_ioctl_snap_create_transid(). In this way, the code can become more readable. Reported-by: Alex Lyakas alex.bt...@zadarastorage.com Cc: Arne Jansen sensi...@gmx.net Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c | 18 +++--- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 02d3035..40f2fbf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -367,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, u64 *async_transid, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { struct btrfs_trans_handle *trans; struct btrfs_key key; @@ -401,8 +401,7 @@ static noinline int create_subvol(struct btrfs_root *root, if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid, - inherit ? 
*inherit : NULL); + ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid, inherit); if (ret) goto fail; @@ -530,7 +529,7 @@ fail: static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, u64 *async_transid, - bool readonly, struct btrfs_qgroup_inherit **inherit) + bool readonly, struct btrfs_qgroup_inherit *inherit) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; @@ -549,10 +548,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, pending_snapshot-dentry = dentry; pending_snapshot-root = root; pending_snapshot-readonly = readonly; - if (inherit) { - pending_snapshot-inherit = *inherit; - *inherit = NULL;/* take responsibility to free it */ - } + pending_snapshot-inherit = inherit; trans = btrfs_start_transaction(root-fs_info-extent_root, 6); if (IS_ERR(trans)) { @@ -692,7 +688,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, u64 *async_transid, bool readonly, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { struct inode *dir = parent-dentry-d_inode; struct dentry *dentry; @@ -1454,7 +1450,7 @@ out: static noinline int btrfs_ioctl_snap_create_transid(struct file *file, char *name, unsigned long fd, int subvol, u64 *transid, bool readonly, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { int namelen; int ret = 0; @@ -1563,7 +1559,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, ret = btrfs_ioctl_snap_create_transid(file, vol_args-name, vol_args-fd, subvol, ptr, - readonly, inherit); + readonly, inherit); if (ret == 0 ptr copy_to_user(arg + -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [BUG] kernel BUG at fs/btrfs/async-thread.c:605!
Hi, Eric, I want to send out my fix patch — could I add your Signed-off-by, because you found the key to solving the problem? Thanks Miao On Fri, 01 Feb 2013 14:53:09 +0900, Tsutomu Itoh wrote: Can you please explain similar problems, Miao? Before the missing device check, there are several places where we read the metadata, such as reading the chunk tree root, btrfs_read_chunk_tree; those functions may fail after submitting a bio. If we don't wait until the bio ends, and just stop the workers, the same problem will happen. (invalidate_inode_pages2() will wait until the bio ends, because it needs to lock the pages which are going to be invalidated, and a page is locked if it is under disk read IO) I understood. My reproducer does not reproduce this problem yet. But the following messages were displayed when the 'rmmod btrfs' command was executed. [76378.723481] = [76378.723901] BUG btrfs_extent_buffer (Tainted: G B ): Objects remaining in btrfs_extent_buffer on kmem_cache_close() [76378.724333] - [76378.724333] [76378.724959] INFO: Slab 0xea00065c3280 objects=23 used=2 fp=0x8801970caac0 flags=0x80004080 [76378.725391] Pid: 9156, comm: rmmod Tainted: G B3.8.0-rc5 #1 [76378.725397] Call Trace: [76378.725403] [8111bc23] slab_err+0xb0/0xd2 I think that this message means there is a possibility that I/O did not end normally. And, after Miao's patch was applied, this message is not displayed when rmmod is executed. So, Miao's patch seems to fix the problem for me. 
[SNIP] diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0c31d07..d8fd711 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2728,13 +2728,13 @@ fail_cleaner: * kthreads */ filemap_write_and_wait(fs_info-btree_inode-i_mapping); - invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_block_groups: btrfs_free_block_groups(fs_info); fail_tree_roots: free_root_pointers(fs_info, 1); + invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_sb_buffer: btrfs_stop_workers(fs_info-generic_worker); @@ -2755,7 +2755,6 @@ fail_alloc: fail_iput: btrfs_mapping_tree_free(fs_info-mapping_tree); - invalidate_inode_pages2(fs_info-btree_inode-i_mapping); iput(fs_info-btree_inode); fail_bdi: bdi_destroy(fs_info-bdi); -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] Btrfs: serialize unlocked dio reads with truncate
Currently, we can do unlocked dio reads, but the following race is possible: dio_read_task truncate_task -btrfs_setattr() -btrfs_direct_IO -__blockdev_direct_IO -btrfs_get_block -btrfs_truncate() #alloc truncated blocks #to other inode -submit_io() #INFORMATION LEAK In order to avoid this problem, we must serialize unlocked dio reads with truncate by inode_dio_wait(). Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/btrfs_inode.h | 19 +++ fs/btrfs/inode.c | 31 +++ 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 2a8c242..00e2601 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -40,6 +40,7 @@ #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 #define BTRFS_INODE_NEEDS_FULL_SYNC7 #define BTRFS_INODE_COPY_EVERYTHING8 +#define BTRFS_INODE_READDIO_NEED_LOCK 9 /* in memory btrfs inode */ struct btrfs_inode { @@ -216,4 +217,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) return 0; } +/* + * Disable DIO read nolock optimization, so new dio readers will be forced + * to grab i_mutex. It is used to avoid the endless truncate due to + * nonlocked dio read. 
+ */ +static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) +{ + set_bit(BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_I(inode)-runtime_flags); + smp_mb(); +} + +static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb__before_clear_bit(); + clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, + BTRFS_I(inode)-runtime_flags); +} + #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 97f4c30..d17a04b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3785,6 +3785,11 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); + + btrfs_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + btrfs_inode_resume_unlocked_dio(inode); + ret = btrfs_truncate(inode); } @@ -6583,15 +6588,33 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb-ki_filp; struct inode *inode = file-f_mapping-host; + int flags = 0; + bool wakeup = false; + int ret; if (check_direct_IO(BTRFS_I(inode)-root, rw, iocb, iov, offset, nr_segs)) return 0; - return __blockdev_direct_IO(rw, iocb, inode, - BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev, - iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, 0); + if (rw == READ) { + atomic_inc(inode-i_dio_count); + smp_mb__after_atomic_inc(); + if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + BTRFS_I(inode)-runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + } else { + wakeup = true; + } + } + + ret = __blockdev_direct_IO(rw, iocb, inode, + BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev, + iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, flags); + if (wakeup) + inode_dio_done(inode); + return ret; } #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More 
majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC][PATCH 2/2] Btrfs: implement unlocked dio write
This idea is from ext4. By this patch, we can make the dio write parallel, and improve the performance. We needn't worry about the race between dio write and truncate, because the truncate need wait untill all the dio write end. And we also needn't worry about the race between dio write and punch hole, because we have extent lock to protect our operation. I ran fio to test the performance of this feature. == Hardware == CPU: Intel(R) Core(TM)2 Duo CPU E7500 @ 2.93GHz Mem: 2GB SSD: Intel X25-M 120GB (Test Partition: 60GB) == config file == [global] ioengine=psync direct=1 bs=4k size=32G runtime=60 directory=/mnt/btrfs/ filename=testfile group_reporting thread [file1] numjobs=1 # 2 4 rw=randwrite == result (KBps) == write 1 2 4 lock24936 24738 24726 nolock 24962 30866 32101 == result (iops) == write 1 2 4 lock623461846181 nolock 624077168025 Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/inode.c | 24 +--- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d17a04b..091593a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6589,31 +6589,33 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, struct file *file = iocb-ki_filp; struct inode *inode = file-f_mapping-host; int flags = 0; - bool wakeup = false; + bool wakeup = true; int ret; if (check_direct_IO(BTRFS_I(inode)-root, rw, iocb, iov, offset, nr_segs)) return 0; - if (rw == READ) { - atomic_inc(inode-i_dio_count); - smp_mb__after_atomic_inc(); - if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - BTRFS_I(inode)-runtime_flags))) { - inode_dio_done(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - } else { - wakeup = true; - } + atomic_inc(inode-i_dio_count); + smp_mb__after_atomic_inc(); + if (rw == WRITE) { + mutex_unlock(inode-i_mutex); + } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, +BTRFS_I(inode)-runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; } ret = 
__blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev, iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); + if (wakeup) inode_dio_done(inode); + if (rw == WRITE) + mutex_lock(inode-i_mutex); return ret; } -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [BUG] kernel BUG at fs/btrfs/async-thread.c:605!
On Fri, 01 Feb 2013 09:31:33 +0900, Tsutomu Itoh wrote: Hi, On 2013/01/31 16:58, Miao Xie wrote: On wed, 30 Jan 2013 23:55:34 -0600, Eric Sandeen wrote: if you move the fail_block_groups: target above the comment, does that fix it? (although I don't know yet what started IO . . . ) like this: From: Eric Sandeen sand...@redhat.com Make sure that we are always done with the btree_inode's mapping before we shut down the worker threads in open_ctree() error cases. I reviewed your patch again, and found it just fix the above problem, it still have similar problems which are not fixed. How about this one? Thanks Eric and Miao. But I can not reproduce this problem, yet. ('Btrfs: too many missing devices, writeable mount is not allowed' messages was displayed, but not panic) So, I can not test your patch, sorry. Can you please explain similar problems, Miao? Before missing device check, there are several places where we read the metadata, such as reading chunk tree root, btrfs_read_chunk_tree, those functions may fail after submit a bio. If we don't wait until the bio end, and just stop the workers, the same problem will happen. 
(invalidate_inode_pages2() will wait until the bio end, because it need lock the pages which are going to be invalidated, and the page is locked if it is under disk read IO) Thanks Miao Thanks, Tsutomu diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0c31d07..d8fd711 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2728,13 +2728,13 @@ fail_cleaner: * kthreads */ filemap_write_and_wait(fs_info-btree_inode-i_mapping); -invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_block_groups: btrfs_free_block_groups(fs_info); fail_tree_roots: free_root_pointers(fs_info, 1); +invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_sb_buffer: btrfs_stop_workers(fs_info-generic_worker); @@ -2755,7 +2755,6 @@ fail_alloc: fail_iput: btrfs_mapping_tree_free(fs_info-mapping_tree); -invalidate_inode_pages2(fs_info-btree_inode-i_mapping); iput(fs_info-btree_inode); fail_bdi: bdi_destroy(fs_info-bdi); -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH 2/2] Btrfs: implement unlocked dio write
On fri, 1 Feb 2013 10:53:30 +0800, Liu Bo wrote: On Thu, Jan 31, 2013 at 05:39:03PM +0800, Miao Xie wrote: This idea is from ext4. By this patch, we can make the dio write parallel, and improve the performance. Interesting, AFAIK, ext4 can only do nolock dio write on some conditions(should be a overwrite, file size remains unchanged, no aligned/buffer io in flight), btrfs is ok without any conditions? ext4 don't have extent lock, it can not avoid 2 AIO threads are at work on the same unwritten block, so it can not use unlocked dio write for unaligned dio/aio. But btrfs has extent lock, it can avoid this problem. And ext4 need take write lock of -i_data_sem, when it allocate the free space, but in order to avoid truncation and hole punch during dio, it need take the read lock of -i_data_sem before it release -i_mutex, that is if it isn't a overwrite, deadlock will happen, so the unlocked dio of ext4 should be a overwrite. But btrfs doesn't have such limitation. Thanks Miao thanks, liubo We needn't worry about the race between dio write and truncate, because the truncate need wait untill all the dio write end. And we also needn't worry about the race between dio write and punch hole, because we have extent lock to protect our operation. I ran fio to test the performance of this feature. 
== Hardware == CPU: Intel(R) Core(TM)2 Duo CPU E7500 @ 2.93GHz Mem: 2GB SSD: Intel X25-M 120GB (Test Partition: 60GB) == config file == [global] ioengine=psync direct=1 bs=4k size=32G runtime=60 directory=/mnt/btrfs/ filename=testfile group_reporting thread [file1] numjobs=1 # 2 4 rw=randwrite == result (KBps) == write1 2 4 lock 24936 24738 24726 nolock 24962 30866 32101 == result (iops) == write1 2 4 lock 623461846181 nolock 624077168025 Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/inode.c | 24 +--- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d17a04b..091593a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6589,31 +6589,33 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, struct file *file = iocb-ki_filp; struct inode *inode = file-f_mapping-host; int flags = 0; -bool wakeup = false; +bool wakeup = true; int ret; if (check_direct_IO(BTRFS_I(inode)-root, rw, iocb, iov, offset, nr_segs)) return 0; -if (rw == READ) { -atomic_inc(inode-i_dio_count); -smp_mb__after_atomic_inc(); -if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - BTRFS_I(inode)-runtime_flags))) { -inode_dio_done(inode); -flags = DIO_LOCKING | DIO_SKIP_HOLES; -} else { -wakeup = true; -} +atomic_inc(inode-i_dio_count); +smp_mb__after_atomic_inc(); +if (rw == WRITE) { +mutex_unlock(inode-i_mutex); +} else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + BTRFS_I(inode)-runtime_flags))) { +inode_dio_done(inode); +flags = DIO_LOCKING | DIO_SKIP_HOLES; +wakeup = false; } ret = __blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev, iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); + if (wakeup) inode_dio_done(inode); +if (rw == WRITE) +mutex_lock(inode-i_mutex); return ret; } -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] Btrfs: serialize unlocked dio reads with truncate
On Thu, 31 Jan 2013 11:40:41 -0500, Josef Bacik wrote: On Thu, Jan 31, 2013 at 02:23:19AM -0700, Miao Xie wrote: Currently, we can do unlocked dio reads, but the following race is possible: dio_read_tasktruncate_task -btrfs_setattr() -btrfs_direct_IO -__blockdev_direct_IO -btrfs_get_block -btrfs_truncate() #alloc truncated blocks #to other inode -submit_io() #INFORMATION LEAK In order to avoid this problem, we must serialize unlocked dio reads with truncate by inode_dio_wait(). So I had thinking about this, are we sure we don't want to just lock the extent range when we truncate? I'm good with this, but it seems like we might as well and be consistent and use the extent locks. What do you think? Thanks, But comparing with the current approach, the extent lock has the following problem: Dio_Read_Task Truncate_task truncate file set isize to 4096 drop pages lock extent[4096, 8191] read extent[4096, 8191] unlock extent[4096, 8191] lock extent[4096, -1ULL] truncate item unlock extent[4096, -1ULL] lock extent[8192, ...] read extent[8192, ...] no extent item zero the buffer unlock extent[8192, ...] we get the data that is mixed with new data.(Punch hole also has this problem, we need fix) Thanks Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH 2/2] Btrfs: implement unlocked dio write
On fri, 01 Feb 2013 12:08:25 +0800, Miao Xie wrote: Onfri, 1 Feb 2013 10:53:30 +0800, Liu Bo wrote: On Thu, Jan 31, 2013 at 05:39:03PM +0800, Miao Xie wrote: This idea is from ext4. By this patch, we can make the dio write parallel, and improve the performance. Interesting, AFAIK, ext4 can only do nolock dio write on some conditions(should be a overwrite, file size remains unchanged, no aligned/buffer io in flight), btrfs is ok without any conditions? ext4 don't have extent lock, it can not avoid 2 AIO threads are at work on the same unwritten block, so it can not use unlocked dio write for unaligned dio/aio. But btrfs has extent lock, it can avoid this problem. Besides that, btrfs doesn't allow doing a unaligned dio/aio. I read the code again, found there is a race that several tasks may update i_size at the same time. There are two methods to fix this problem: 1. just like ext4, don't do unlocked write dio if it is beyond the end of the file 2. use a spin lock to protect i_size update I want to choose the 2nd one. Thanks Miao And ext4 need take write lock of -i_data_sem, when it allocate the free space, but in order to avoid truncation and hole punch during dio, it need take the read lock of -i_data_sem before it release -i_mutex, that is if it isn't a overwrite, deadlock will happen, so the unlocked dio of ext4 should be a overwrite. But btrfs doesn't have such limitation. Thanks Miao thanks, liubo We needn't worry about the race between dio write and truncate, because the truncate need wait untill all the dio write end. And we also needn't worry about the race between dio write and punch hole, because we have extent lock to protect our operation. I ran fio to test the performance of this feature. 
== Hardware == CPU: Intel(R) Core(TM)2 Duo CPU E7500 @ 2.93GHz Mem: 2GB SSD: Intel X25-M 120GB (Test Partition: 60GB) == config file == [global] ioengine=psync direct=1 bs=4k size=32G runtime=60 directory=/mnt/btrfs/ filename=testfile group_reporting thread [file1] numjobs=1 # 2 4 rw=randwrite == result (KBps) == write 1 2 4 lock24936 24738 24726 nolock 24962 30866 32101 == result (iops) == write 1 2 4 lock623461846181 nolock 624077168025 Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/inode.c | 24 +--- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d17a04b..091593a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6589,31 +6589,33 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, struct file *file = iocb-ki_filp; struct inode *inode = file-f_mapping-host; int flags = 0; - bool wakeup = false; + bool wakeup = true; int ret; if (check_direct_IO(BTRFS_I(inode)-root, rw, iocb, iov, offset, nr_segs)) return 0; - if (rw == READ) { - atomic_inc(inode-i_dio_count); - smp_mb__after_atomic_inc(); - if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - BTRFS_I(inode)-runtime_flags))) { - inode_dio_done(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - } else { - wakeup = true; - } + atomic_inc(inode-i_dio_count); + smp_mb__after_atomic_inc(); + if (rw == WRITE) { + mutex_unlock(inode-i_mutex); + } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, +BTRFS_I(inode)-runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; } ret = __blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev, iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); + if (wakeup) inode_dio_done(inode); + if (rw == WRITE) + mutex_lock(inode-i_mutex); return ret; } -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More 
majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info
Re: [PATCH 5/5] Btrfs: fix remount vs autodefrag
Any comments about this patch? Thanks Miao On mon, 26 Nov 2012 17:28:13 +0800, Miao Xie wrote: If we remount the fs to close the auto defragment or make the fs R/O, we should stop the auto defragment. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 + fs/btrfs/file.c | 13 + fs/btrfs/super.c | 29 + 3 files changed, 43 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4ce24ce..01d671c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1759,6 +1759,7 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_clear_opt(o, opt) ((o) = ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt)((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt) ((o) BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt)((root)-fs_info-mount_opt \ BTRFS_MOUNT_##opt) /* diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 40b17d0..7aaae56 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -320,8 +320,21 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, range.start = defrag-last_offset; sb_start_write(fs_info-sb); + + /* Avoid defraging files on R/O fs */ + if (!down_write_trylock(fs_info-sb-s_umount)) { + sb_end_write(fs_info-sb); + btrfs_requeue_inode_defrag(inode, defrag); + iput(inode); + return -EBUSY; + } + + BUG_ON(fs_info-sb-s_flags MS_RDONLY); + num_defrag = btrfs_defrag_file(inode, NULL, range, defrag-transid, BTRFS_DEFRAG_BATCH); + + up_write(fs_info-sb-s_umount); sb_end_write(fs_info-sb); /* * if we filled the whole defrag batch, there diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b3b041a..2e7beee 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1189,6 +1189,32 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_set_max_workers(fs_info-scrub_workers, new_pool_size); } +static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info, + unsigned long old_opts, int flags) +{ + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) + (!btrfs_raw_test_opt(fs_info-mount_opt, 
AUTO_DEFRAG) || + (flags MS_RDONLY))) { + /* wait for any defraggers to finish */ + wait_event(fs_info-transaction_wait, +(atomic_read(fs_info-defrag_running) == 0)); + } +} + +static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, + unsigned long old_opts, int flags) +{ + /* + * We remount the fs successfully, then we need cleanup all defragable + * inodes if the autodefragment is close or the fs is R/O. + */ + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) + (!btrfs_raw_test_opt(fs_info-mount_opt, AUTO_DEFRAG) || + (flags MS_RDONLY))) + btrfs_cleanup_defrag_inodes(fs_info); + +} + static int btrfs_remount(struct super_block *sb, int *flags, char *data) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -1214,6 +1240,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if ((*flags MS_RDONLY) == (sb-s_flags MS_RDONLY)) return 0; + btrfs_remount_prepare(fs_info, old_opts, *flags); + if (*flags MS_RDONLY) { sb-s_flags |= MS_RDONLY; @@ -1247,6 +1275,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) sb-s_flags = ~MS_RDONLY; } + btrfs_remount_cleanup(fs_info, old_opts, *flags); return 0; restore: -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [BUG] kernel BUG at fs/btrfs/async-thread.c:605!
On thu, 31 Jan 2013 12:37:49 +0900, Tsutomu Itoh wrote: Hi, In kernel 3.8-rc5, the following panics occurred when the mount was done by the degraded option. # btrfs fi sh /dev/sdc8 Label: none uuid: fc63cd80-5ae2-4fbe-8795-2d526c937a56 Total devices 3 FS bytes used 20.98GB devid1 size 9.31GB used 9.31GB path /dev/sdd8 devid2 size 9.31GB used 9.31GB path /dev/sdc8 *** Some devices missing Btrfs v0.20-rc1-37-g91d9eec # mount -o degraded /dev/sdc8 /test1 564 static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) 565 { ... ... 595 fallback: 596 fallback = NULL; 597 /* 598 * we have failed to find any workers, just 599 * return the first one we can find. 600 */ 601 if (!list_empty(workers-worker_list)) 602 fallback = workers-worker_list.next; 603 if (!list_empty(workers-idle_list)) 604 fallback = workers-idle_list.next; 605 BUG_ON(!fallback); -- this ! 606 worker = list_entry(fallback, 607 struct btrfs_worker_thread, worker_list); If worker_list is not empty, we get a worker from this list; if worker_list is empty, it means all the workers in idle_list, we get the worker from idle_list. So the above bug is introduced by the second if sentence. it should be else if. Thanks Miao -Tsutomu === [ 7913.075890] btrfs: allowing degraded mounts [ 7913.075893] btrfs: disk space caching is enabled [ 7913.092031] Btrfs: too many missing devices, writeable mount is not allowed [ 7913.092297] [ cut here ] [ 7913.092313] kernel BUG at fs/btrfs/async-thread.c:605! 
[ 7913.092326] invalid opcode: [#1] SMP [ 7913.092342] Modules linked in: btrfs zlib_deflate crc32c libcrc32c nfsd lockd nfs_acl auth_rpcgss sunrpc 8021q garp stp llc cpufreq_ondemand cachefiles fscache ipv6 ext3 jbd dm_mirror dm_region_hash dm_log dm_mod uinput ppdev iTCO_wdt iTCO_vendor_support parport_pc parport sg acpi_cpufreq freq_table mperf coretemp kvm pcspkr i2c_i801 i2c_core lpc_ich mfd_core tg3 ptp pps_core shpchp pci_hotplug i3000_edac edac_core ext4 mbcache jbd2 crc16 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_piix libata megaraid_sas scsi_mod floppy [last unloaded: microcode] [ 7913.092575] CPU 0 [ 7913.092584] Pid: 3673, comm: btrfs-endio-wri Not tainted 3.8.0-rc5 #1 FUJITSU-SV PRIMERGY/D2399 [ 7913.092608] RIP: 0010:[a04670ef] [a04670ef] btrfs_queue_worker+0x10e/0x236 [btrfs] [ 7913.092663] RSP: 0018:88019fc03c10 EFLAGS: 00010046 [ 7913.092676] RAX: RBX: 8801967b8a58 RCX: [ 7913.092894] RDX: RSI: 8801961239b8 RDI: 8801967b8ab8 [ 7913.093116] RBP: 88019fc03c50 R08: R09: 880198801180 [ 7913.093247] R10: a045fda7 R11: 0003 R12: [ 7913.093247] R13: 8801961239b8 R14: 8801967b8ab8 R15: 0246 [ 7913.093247] FS: () GS:88019fc0() knlGS: [ 7913.093247] CS: 0010 DS: ES: CR0: 8005003b [ 7913.093247] CR2: ff600400 CR3: 00019575d000 CR4: 07f0 [ 7913.093247] DR0: DR1: DR2: [ 7913.093247] DR3: DR6: 0ff0 DR7: 0400 [ 7913.093247] Process btrfs-endio-wri (pid: 3673, threadinfo 8801939ca000, task 880195795b00) [ 7913.093247] Stack: [ 7913.093247] 8801967b8a88 8801967b8a78 88003fa0a600 8801965ad0c0 [ 7913.093247] 88003fa0a600 [ 7913.096183] 88019fc03c60 a043e357 88019fc03c70 811526aa [ 7913.096183] Call Trace: [ 7913.096183] IRQ [ 7913.096183] [ 7913.096183] [a043e357] end_workqueue_bio+0x79/0x7b [btrfs] [ 7913.096183] [811526aa] bio_endio+0x2d/0x2f [ 7913.096183] [a045fdb2] btrfs_end_bio+0x10b/0x122 [btrfs] [ 7913.096183] [811526aa] bio_endio+0x2d/0x2f [ 7913.096183] [811c5e3f] req_bio_endio+0x96/0x9f [ 7913.096183] [811c601d] blk_update_request+0x1d5/0x3a4 [ 
7913.096183] [811c620c] blk_update_bidi_request+0x20/0x6f [ 7913.096183] [811c7a59] blk_end_bidi_request+0x1f/0x5d [ 7913.096183] [811c7ad3] blk_end_request+0x10/0x12 [ 7913.096183] [a001db50] scsi_io_completion+0x207/0x4f3 [scsi_mod] [ 7913.096183] [a0016df9] scsi_finish_command+0xec/0xf5 [scsi_mod] [ 7913.096183] [a001df50] scsi_softirq_done+0xff/0x108 [scsi_mod] [ 7913.096183] [811ccb3a] blk_done_softirq+0x7a/0x8e [ 7913.096183] [810475c3]
Re: [BUG] kernel BUG at fs/btrfs/async-thread.c:605!
On Thu, 31 Jan 2013 01:19:41 -0500 (est), Eric Sandeen wrote: On Jan 31, 2013, at 12:13 AM, Miao Xie mi...@cn.fujitsu.com wrote: On thu, 31 Jan 2013 12:37:49 +0900, Tsutomu Itoh wrote: Hi, In kernel 3.8-rc5, the following panics occurred when the mount was done by the degraded option. # btrfs fi sh /dev/sdc8 Label: none uuid: fc63cd80-5ae2-4fbe-8795-2d526c937a56 Total devices 3 FS bytes used 20.98GB devid1 size 9.31GB used 9.31GB path /dev/sdd8 devid2 size 9.31GB used 9.31GB path /dev/sdc8 *** Some devices missing Btrfs v0.20-rc1-37-g91d9eec # mount -o degraded /dev/sdc8 /test1 564 static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) 565 { ... ... 595 fallback: 596 fallback = NULL; 597 /* 598 * we have failed to find any workers, just 599 * return the first one we can find. 600 */ 601 if (!list_empty(workers-worker_list)) 602 fallback = workers-worker_list.next; 603 if (!list_empty(workers-idle_list)) 604 fallback = workers-idle_list.next; 605 BUG_ON(!fallback); -- this ! 606 worker = list_entry(fallback, 607 struct btrfs_worker_thread, worker_list); If worker_list is not empty, we get a worker from this list; if worker_list is empty, it means all the workers in idle_list, we get the worker from idle_list. So the above bug is introduced by the second if sentence. it should be else if. else if makes sense, but we cannot reach the BUG_ON unless both lists are empty, correct? You are right, I misread the code. Thanks Miao -Eric Thanks Miao -Tsutomu === [ 7913.075890] btrfs: allowing degraded mounts [ 7913.075893] btrfs: disk space caching is enabled [ 7913.092031] Btrfs: too many missing devices, writeable mount is not allowed [ 7913.092297] [ cut here ] [ 7913.092313] kernel BUG at fs/btrfs/async-thread.c:605! 
[ 7913.092326] invalid opcode: [#1] SMP [ 7913.092342] Modules linked in: btrfs zlib_deflate crc32c libcrc32c nfsd lockd nfs_acl auth_rpcgss sunrpc 8021q garp stp llc cpufreq_ondemand cachefiles fscache ipv6 ext3 jbd dm_mirror dm_region_hash dm_log dm_mod uinput ppdev iTCO_wdt iTCO_vendor_support parport_pc parport sg acpi_cpufreq freq_table mperf coretemp kvm pcspkr i2c_i801 i2c_core lpc_ich mfd_core tg3 ptp pps_core shpchp pci_hotplug i3000_edac edac_core ext4 mbcache jbd2 crc16 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_piix libata megaraid_sas scsi_mod floppy [last unloaded: microcode] [ 7913.092575] CPU 0 [ 7913.092584] Pid: 3673, comm: btrfs-endio-wri Not tainted 3.8.0-rc5 #1 FUJITSU-SV PRIMERGY/D2399 [ 7913.092608] RIP: 0010:[a04670ef] [a04670ef] btrfs_queue_worker+0x10e/0x236 [btrfs] [ 7913.092663] RSP: 0018:88019fc03c10 EFLAGS: 00010046 [ 7913.092676] RAX: RBX: 8801967b8a58 RCX: [ 7913.092894] RDX: RSI: 8801961239b8 RDI: 8801967b8ab8 [ 7913.093116] RBP: 88019fc03c50 R08: R09: 880198801180 [ 7913.093247] R10: a045fda7 R11: 0003 R12: [ 7913.093247] R13: 8801961239b8 R14: 8801967b8ab8 R15: 0246 [ 7913.093247] FS: () GS:88019fc0() knlGS: [ 7913.093247] CS: 0010 DS: ES: CR0: 8005003b [ 7913.093247] CR2: ff600400 CR3: 00019575d000 CR4: 07f0 [ 7913.093247] DR0: DR1: DR2: [ 7913.093247] DR3: DR6: 0ff0 DR7: 0400 [ 7913.093247] Process btrfs-endio-wri (pid: 3673, threadinfo 8801939ca000, task 880195795b00) [ 7913.093247] Stack: [ 7913.093247] 8801967b8a88 8801967b8a78 88003fa0a600 8801965ad0c0 [ 7913.093247] 88003fa0a600 [ 7913.096183] 88019fc03c60 a043e357 88019fc03c70 811526aa [ 7913.096183] Call Trace: [ 7913.096183] IRQ [ 7913.096183] [ 7913.096183] [a043e357] end_workqueue_bio+0x79/0x7b [btrfs] [ 7913.096183] [811526aa] bio_endio+0x2d/0x2f [ 7913.096183] [a045fdb2] btrfs_end_bio+0x10b/0x122 [btrfs] [ 7913.096183] [811526aa] bio_endio+0x2d/0x2f [ 7913.096183] [811c5e3f] req_bio_endio+0x96/0x9f [ 7913.096183] [811c601d] blk_update_request+0x1d5/0x3a4 [ 
7913.096183] [811c620c] blk_update_bidi_request+0x20/0x6f [ 7913.096183] [811c7a59] blk_end_bidi_request+0x1f/0x5d [ 7913.096183] [811c7ad3] blk_end_request+0x10/0x12 [ 7913.096183] [a001db50] scsi_io_completion+0x207
Re: [BUG] kernel BUG at fs/btrfs/async-thread.c:605!
On wed, 30 Jan 2013 23:55:34 -0600, Eric Sandeen wrote: === [ 7913.075890] btrfs: allowing degraded mounts [ 7913.075893] btrfs: disk space caching is enabled [ 7913.092031] Btrfs: too many missing devices, writeable mount is not allowed so this was supposed to fail the mount in open_ctree; it jumps to shutting down the worker threads. Which might result in no threads available. [ 7913.092297] [ cut here ] [ 7913.092313] kernel BUG at fs/btrfs/async-thread.c:605! [ 7913.092326] invalid opcode: [#1] SMP [ 7913.092342] Modules linked in: btrfs zlib_deflate crc32c libcrc32c nfsd lockd nfs_acl auth_rpcgss sunrpc 8021q garp stp llc cpufreq_ondemand cachefiles fscache ipv6 ext3 jbd dm_mirror dm_region_hash dm_log dm_mod uinput ppdev iTCO_wdt iTCO_vendor_support parport_pc parport sg acpi_cpufreq freq_table mperf coretemp kvm pcspkr i2c_i801 i2c_core lpc_ich mfd_core tg3 ptp pps_core shpchp pci_hotplug i3000_edac edac_core ext4 mbcache jbd2 crc16 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_piix libata megaraid_sas scsi_mod floppy [last unloaded: microcode] [ 7913.092575] CPU 0 [ 7913.092584] Pid: 3673, comm: btrfs-endio-wri Not tainted 3.8.0-rc5 #1 FUJITSU-SV PRIMERGY/D2399 [ 7913.092608] RIP: 0010:[a04670ef] [a04670ef] btrfs_queue_worker+0x10e/0x236 [btrfs] but this is already trying to do work, and has no workers to handle it. The place we jump to is fail_block_groups, and before it is this comment: /* * make sure we're done with the btree inode before we stop our * kthreads */ filemap_write_and_wait(fs_info-btree_inode-i_mapping); invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_block_groups: btrfs_free_block_groups(fs_info); if you move the fail_block_groups: target above the comment, does that fix it? (although I don't know yet what started IO . . . ) Reading the metadata of the tree root and Reading block group information started IO. so, I think this patch can fix the problem. 
like this: From: Eric Sandeen sand...@redhat.com Make sure that we are always done with the btree_inode's mapping before we shut down the worker threads in open_ctree() error cases. Signed-off-by: Eric Sandeen sand...@redhat.com diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d89da40..1e2abda 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2689,6 +2689,7 @@ fail_trans_kthread: fail_cleaner: kthread_stop(fs_info-cleaner_kthread); +fail_block_groups: /* * make sure we're done with the btree inode before we stop our * kthreads @@ -2696,7 +2697,6 @@ fail_cleaner: filemap_write_and_wait(fs_info-btree_inode-i_mapping); invalidate_inode_pages2(fs_info-btree_inode-i_mapping); -fail_block_groups: btrfs_free_block_groups(fs_info); fail_tree_roots: Just a guess; but I don't know what would have started writes already... I don't think it was write IO. It was just a soft interrupt caused by a metadata read IO, and this soft interrupt happened while btrfs-endio-write-workers was going to stop. Thanks Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [BUG] kernel BUG at fs/btrfs/async-thread.c:605!
On wed, 30 Jan 2013 23:55:34 -0600, Eric Sandeen wrote: if you move the fail_block_groups: target above the comment, does that fix it? (although I don't know yet what started IO . . . ) like this: From: Eric Sandeen sand...@redhat.com Make sure that we are always done with the btree_inode's mapping before we shut down the worker threads in open_ctree() error cases. I reviewed your patch again, and found it just fix the above problem, it still have similar problems which are not fixed. How about this one? diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0c31d07..d8fd711 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2728,13 +2728,13 @@ fail_cleaner: * kthreads */ filemap_write_and_wait(fs_info-btree_inode-i_mapping); - invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_block_groups: btrfs_free_block_groups(fs_info); fail_tree_roots: free_root_pointers(fs_info, 1); + invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_sb_buffer: btrfs_stop_workers(fs_info-generic_worker); @@ -2755,7 +2755,6 @@ fail_alloc: fail_iput: btrfs_mapping_tree_free(fs_info-mapping_tree); - invalidate_inode_pages2(fs_info-btree_inode-i_mapping); iput(fs_info-btree_inode); fail_bdi: bdi_destroy(fs_info-bdi); -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V2 01/10] Btrfs: use atomic for btrfs_fs_info-generation
fs_info->generation is a 64-bit variable, and it can be accessed by multiple tasks; if there is no lock or other method to protect it, we might read a wrong value, especially on a 32-bit machine.
blocksize = btrfs_level_size(root, parent_level - 1); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 547b7b0..c3edb22 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1278,7 +1278,7 @@ struct btrfs_fs_info { struct btrfs_block_rsv empty_block_rsv; - u64 generation; + atomic64_t generation; u64 last_trans_committed; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 65f0367..f03aebc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1200,7 +1200,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, memset(root-root_item, 0, sizeof(root-root_item)); memset(root-defrag_progress, 0, sizeof(root-defrag_progress)); memset(root-root_kobj, 0, sizeof(root-root_kobj)); - root-defrag_trans_start = fs_info-generation; + root-defrag_trans_start = atomic64_read(fs_info-generation); init_completion(root-kobj_unregister); root-defrag_running = 0; root-root_key.objectid = objectid; @@ -2501,7 +2501,7 @@ retry_root_backup: fs_info-pending_quota_state = 1; } - fs_info-generation = generation; + atomic64_set(fs_info-generation, generation); fs_info-last_trans_committed = generation; ret = btrfs_recover_balance(fs_info); @@ -3436,12 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) int was_dirty; btrfs_assert_tree_locked(buf); - if (transid != root-fs_info-generation) + if (transid != atomic64_read(root-fs_info-generation)) WARN(1, KERN_CRIT btrfs transid mismatch buffer %llu, found %llu running %llu\n, (unsigned long long)buf-start, (unsigned long long)transid, - (unsigned long long)root-fs_info-generation); + (u64)atomic64_read(root-fs_info-generation)); was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) { spin_lock(root-fs_info-delalloc_lock); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 841cfe3..02409b6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1588,7 +1588,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, * otherwise subsequent syncs to a file that's been synced in 
this * transaction will appear to have already occured. */ - BTRFS_I(inode)-last_trans = root-fs_info-generation + 1
[PATCH V2 02/10] Btrfs: use atomic for fs_info-last_trans_committed
fs_info->last_trans_committed is a 64-bit variable, and it can be accessed by multiple tasks; if there is no lock or other method to protect it, we might read a wrong value, especially on a 32-bit machine.
(btrfs_inode_in_log(inode, atomic64_read(root-fs_info-generation)) || BTRFS_I(inode)-last_trans = - root-fs_info-last_trans_committed) { + atomic64_read(root-fs_info-last_trans_committed)) { BTRFS_I(inode)-last_trans = 0; /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index afbf3ac..3b6c339 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3114,7 +3114,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, return PTR_ERR(trans); /* No running transaction, don't bother */ - transid = root-fs_info-last_trans_committed; + transid = atomic64_read(root-fs_info-last_trans_committed); goto out; } transid = trans-transid; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index f107312..f376621 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -975,7 +975,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, * if this file hasn't been changed since the last transaction * commit, we can safely return without doing anything */ - if (last_mod root-fs_info-last_trans_committed) + if (last_mod atomic64_read(root-fs_info-last_trans_committed)) return; spin_lock(root-fs_info-ordered_extent_lock); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bdbb94f..af0b566 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2703,7 +2703,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (root-fs_info-fs_state BTRFS_SUPER_FLAG_ERROR) return -EIO; - gen = root-fs_info-last_trans_committed; + gen = atomic64_read(root-fs_info-last_trans_committed); for (i = 0; i BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 105d642..29fdf1c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -459,7 +459,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) int ret = 0; if (transid) { - if (transid = root-fs_info-last_trans_committed) + if (transid = + 
atomic64_read(root-fs_info-last_trans_committed)) goto out; ret = -EINVAL; @@ -1730,7 +1731,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, cur_trans-commit_done = 1; - root-fs_info-last_trans_committed = cur_trans-transid; + atomic64_set(root-fs_info
[PATCH V2 03/10] Btrfs: use atomic for fs_info-last_trans_log_full_commit
fs_info->last_trans_log_full_commit is a 64-bit variable, and it can be accessed by multiple tasks; if there is no lock or other method to protect it, we might read a wrong value, especially on a 32-bit machine.
btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. */ - root-fs_info-last_trans_log_full_commit = trans-transid; + atomic64_set(root-fs_info-last_trans_log_full_commit, +trans-transid); } else { ret = btrfs_insert_inode_ref(trans, dest, new_dentry-d_name.name, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7f42a53..bb7c01b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2227,14 +2227,14 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, wait, TASK_UNINTERRUPTIBLE); mutex_unlock(root-log_mutex); - if (root-fs_info-last_trans_log_full_commit != + if (atomic64_read(root-fs_info-last_trans_log_full_commit) != trans-transid root-log_transid transid + 2 atomic_read(root-log_commit[index])) schedule(); finish_wait(root-log_commit_wait[index], wait); mutex_lock(root-log_mutex); - } while (root-fs_info-last_trans_log_full_commit != + } while (atomic64_read(root-fs_info-last_trans_log_full_commit) != trans-transid root-log_transid transid + 2 atomic_read(root-log_commit[index])); return 0; @@ -2244,12 +2244,12 @@ static void wait_for_writer(struct btrfs_trans_handle *trans, struct btrfs_root *root) { DEFINE_WAIT(wait); - while (root-fs_info-last_trans_log_full_commit != + while (atomic64_read(root-fs_info-last_trans_log_full_commit) != trans-transid atomic_read(root-log_writers)) { prepare_to_wait(root-log_writer_wait, wait, TASK_UNINTERRUPTIBLE); mutex_unlock(root-log_mutex); - if (root-fs_info-last_trans_log_full_commit != + if (atomic64_read(root-fs_info-last_trans_log_full_commit) != trans-transid atomic_read(root-log_writers)) schedule(); mutex_lock(root-log_mutex); @@ -2306,7 +2306,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } /* bail out if we need to do a full commit */ - if (root-fs_info-last_trans_log_full_commit == trans-transid) { + if (atomic64_read(root-fs_info-last_trans_log_full_commit) == + 
trans-transid) { ret
[PATCH V2 04/10] Btrfs: add a comment for fs_info-max_inline
Though ->max_inline is a 64-bit variable and may be accessed by multiple tasks, it is just a suggestive number, so we needn't add anything to protect fs_info->max_inline; we just add a comment to explain why we don't use a lock to protect it.
[PATCH V2 05/10] Btrfs: protect fs_info-alloc_start
fs_info->alloc_start is a 64-bit variable that can be accessed by multiple tasks, but it is not strictly protected; it can be changed while we are accessing it.
+*/ u64 alloc_start; struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d8982e9..c96f132 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_alloc_start: num = match_strdup(args[0]); if (num) { + mutex_lock(info-chunk_mutex); info-alloc_start = memparse(num, NULL); + mutex_unlock(info-chunk_mutex); kfree(num); printk(KERN_INFO btrfs: allocations start at %llu\n, @@ -1289,7 +1291,9 @@ restore: fs_info-mount_opt = old_opts; fs_info-compress_type = old_compress_type; fs_info-max_inline = old_max_inline; + mutex_lock(fs_info-chunk_mutex); fs_info-alloc_start = old_alloc_start; + mutex_unlock(fs_info-chunk_mutex); btrfs_resize_thread_pool(fs_info, old_thread_pool_size, fs_info-thread_pool_size); fs_info-metadata_ratio = old_metadata_ratio; -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V2 06/10] Btrfs: use percpu counter for dirty metadata count
-dirty_metadata_bytes is accessed very frequently, so use percpu counter instead of the u64 variant to reduce the contention of the lock. This patch also fixed the problem that we access it without lock protection in __btrfs_btree_balance_dirty(), which may cause we skip the dirty pages flush. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v1 - v2: - modify the changelog and make it more clear and stringency. --- fs/btrfs/ctree.h | 9 fs/btrfs/disk-io.c | 64 fs/btrfs/extent_io.c | 9 +++- 3 files changed, 42 insertions(+), 40 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 201be7d..1dcbbfd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 }; /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) +#define BTRFS_DIRTY_METADATA_THRESH(32 * 1024 * 1024) + /* * The key defines the order in the tree, and so it also defines (optimal) * block layout. 
@@ -1439,10 +1441,9 @@ struct btrfs_fs_info { u64 total_pinned; - /* protected by the delalloc lock, used to keep from writing -* metadata until there is a nice batch -*/ - u64 dirty_metadata_bytes; + /* used to keep from writing metadata until there is a nice batch */ + struct percpu_counter dirty_metadata_bytes; + s32 dirty_metadata_batch; struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 87ed05a..961ac58 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -946,18 +946,20 @@ static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct extent_io_tree *tree; + struct btrfs_fs_info *fs_info; + int ret; + tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { - struct btrfs_root *root = BTRFS_I(mapping->host)->root; - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; if (wbc->for_kupdate) return 0; + fs_info = BTRFS_I(mapping->host)->root->fs_info; /* this is a bit racy, but that's ok */ - num_dirty = root->fs_info->dirty_metadata_bytes; - if (num_dirty < thresh) + ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH); + if (ret < 0) return 0; } return btree_write_cache_pages(mapping, wbc); @@ -1125,24 +1127,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { + struct btrfs_fs_info *fs_info = root->fs_info; + if (btrfs_header_generation(buf) == - root->fs_info->running_transaction->transid) { + fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= buf->len) - root->fs_info->dirty_metadata_bytes -= buf->len; - else { - spin_unlock(&root->fs_info->delalloc_lock); - btrfs_panic(root->fs_info, -EOVERFLOW, - "Can't
clear %lu bytes from " - "dirty_mdatadata_bytes (%llu)", - buf->len, - root->fs_info->dirty_metadata_bytes); - } - spin_unlock(&root->fs_info->delalloc_lock); - + __percpu_counter_add(&fs_info->dirty_metadata_bytes, + -buf->len, + fs_info->dirty_metadata_batch); /* ugh, clear_extent_buffer_dirty needs to lock the page */ btrfs_set_lock_blocking(buf); clear_extent_buffer_dirty(buf); @@ -2004,10 +1998,18 @@ int open_ctree(struct super_block *sb, goto fail_srcu; } + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); + if (ret) { + err = ret; + goto fail_bdi; + } + fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * + (1 + ilog2(nr_cpu_ids)); + fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM