Several qgroup flush related bugs have been fixed recently, all of them
caused by the fact that we can trigger qgroup metadata space
reservation while holding a transaction handle.

Thankfully the only situation that triggers such a reservation is
btrfs_dirty_inode().

Currently btrfs_dirty_inode() first tries to join a transaction, then
updates the inode.
If btrfs_update_inode() fails with -ENOSPC, it retries by starting a
transaction, which reserves the metadata space.
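
For reference, this is roughly the current flow (condensed from the
inode.c hunk below; error handling trimmed):

    trans = btrfs_join_transaction(root);   /* no metadata space reserved */
    ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    if (ret == -ENOSPC) {
            /* Retry with metadata space reserved up front */
            btrfs_end_transaction(trans);
            trans = btrfs_start_transaction(root, 1);
            ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    }
    btrfs_end_transaction(trans);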

This not only forces us to reserve metadata space while holding a
transaction handle, but also can't handle other errors like -EDQUOT.
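
The underlying hazard, sketched as an abbreviated call chain (all
function names appear in this patch; intermediate calls are elided):

    btrfs_dirty_inode()                         <- trans handle held
      btrfs_update_inode()
        btrfs_delayed_inode_reserve_metadata()
          btrfs_qgroup_reserve_meta_prealloc()
            try_flush_qgroup()
              btrfs_commit_transaction()        <- can't commit while our
                                                   own handle is still open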

This patch makes btrfs_dirty_inode() call btrfs_start_transaction()
directly, instead of joining first and only starting a transaction on
failure, so that try_flush_qgroup() is never entered with a transaction
handle held.
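
After this patch the flow collapses to a single attempt with space
reserved up front (see the inode.c hunk below):

    trans = btrfs_start_transaction(root, 1);   /* reserves metadata space */
    if (IS_ERR(trans))
            return PTR_ERR(trans);
    ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    btrfs_end_transaction(trans);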

This will slow down btrfs_dirty_inode(), but my fstests runs don't show
much difference for most test cases, thus it may be worth skipping such
a performance "optimization".

Signed-off-by: Qu Wenruo <w...@suse.com>
---
 fs/btrfs/delayed-inode.c | 41 +++++++---------------------------------
 fs/btrfs/inode.c         | 11 +----------
 fs/btrfs/qgroup.c        | 34 +++++++++------------------------
 3 files changed, 17 insertions(+), 69 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 70c0340d839c..0f2454de0b2f 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -617,42 +617,15 @@ static int btrfs_delayed_inode_reserve_metadata(
        num_bytes = btrfs_calc_metadata_size(fs_info, 1);
 
        /*
-        * btrfs_dirty_inode will update the inode under btrfs_join_transaction
-        * which doesn't reserve space for speed.  This is a problem since we
-        * still need to reserve space for this update, so try to reserve the
-        * space.
+        * All of our callers should have metadata space reserved, either
+        * through btrfs_start_transaction(), by reserving space before
+        * btrfs_join_transaction(), or by stealing from delalloc_block_rsv.
         *
-        * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
-        * we always reserve enough to update the inode item.
+        * This ensures we have all of our metadata space reserved, without
+        * needing to reserve qgroup metadata space while holding a transaction.
         */
-       if (!src_rsv || (!trans->bytes_reserved &&
-                        src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
-               ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
-               if (ret < 0)
-                       return ret;
-               ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
-                                         BTRFS_RESERVE_NO_FLUSH);
-               /*
-                * Since we're under a transaction reserve_metadata_bytes could
-                * try to commit the transaction which will make it return
-                * EAGAIN to make us stop the transaction we have, so return
-                * ENOSPC instead so that btrfs_dirty_inode knows what to do.
-                */
-               if (ret == -EAGAIN) {
-                       ret = -ENOSPC;
-                       btrfs_qgroup_free_meta_prealloc(root, num_bytes);
-               }
-               if (!ret) {
-                       node->bytes_reserved = num_bytes;
-                       trace_btrfs_space_reservation(fs_info,
-                                                     "delayed_inode",
-                                                     btrfs_ino(inode),
-                                                     num_bytes, 1);
-               } else {
-               btrfs_qgroup_free_meta_prealloc(root, fs_info->nodesize);
-               }
-               return ret;
-       }
+       ASSERT(src_rsv && (trans->bytes_reserved ||
+                          src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC));
 
        ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
        if (!ret) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9c2800fa80c6..da8e1c24ab22 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5911,20 +5911,11 @@ static int btrfs_dirty_inode(struct inode *inode)
        if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
                return 0;
 
-       trans = btrfs_join_transaction(root);
+       trans = btrfs_start_transaction(root, 1);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
        ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
-       if (ret && ret == -ENOSPC) {
-               /* whoops, lets try again with the full transaction */
-               btrfs_end_transaction(trans);
-               trans = btrfs_start_transaction(root, 1);
-               if (IS_ERR(trans))
-                       return PTR_ERR(trans);
-
-               ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
-       }
        btrfs_end_transaction(trans);
        if (BTRFS_I(inode)->delayed_node)
                btrfs_balance_delayed_items(fs_info);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 808370ada888..872a5fed9e1a 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3535,37 +3535,24 @@ static int try_flush_qgroup(struct btrfs_root *root)
 {
        struct btrfs_trans_handle *trans;
        int ret;
-       bool can_commit = true;
 
        /*
-        * If current process holds a transaction, we shouldn't flush, as we
-        * assume all space reservation happens before a transaction handle is
-        * held.
+        * We shouldn't hold any transaction handle here, or we could
+        * deadlock when flushing.
         *
-        * But there are cases like btrfs_delayed_item_reserve_metadata() where
-        * we try to reserve space with one transction handle already held.
-        * In that case we can't commit transaction, but at least try to end it
-        * and hope the started data writes can free some space.
+        * The if branch below is for kernels built without CONFIG_BTRFS_ASSERT.
         */
-       if (current->journal_info &&
-           current->journal_info != BTRFS_SEND_TRANS_STUB)
-               can_commit = false;
+       ASSERT(current->journal_info == NULL ||
+              current->journal_info == BTRFS_SEND_TRANS_STUB);
+       if (unlikely(current->journal_info &&
+                    current->journal_info != BTRFS_SEND_TRANS_STUB))
+               return 0;
 
        /*
         * We don't want to run flush again and again, so if there is a running
         * one, we won't try to start a new flush, but exit directly.
         */
        if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
-               /*
-                * We are already holding a transaction, thus we can block other
-                * threads from flushing.  So exit right now. This increases
-                * the chance of EDQUOT for heavy load and near limit cases.
-                * But we can argue that if we're already near limit, EDQUOT is
-                * unavoidable anyway.
-                */
-               if (!can_commit)
-                       return 0;
-
                wait_event(root->qgroup_flush_wait,
                        !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
                return 0;
@@ -3582,10 +3569,7 @@ static int try_flush_qgroup(struct btrfs_root *root)
                goto out;
        }
 
-       if (can_commit)
-               ret = btrfs_commit_transaction(trans);
-       else
-               ret = btrfs_end_transaction(trans);
+       ret = btrfs_commit_transaction(trans);
 out:
        clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
        wake_up(&root->qgroup_flush_wait);
-- 
2.29.2
