The dedup reference is a special kind of delayed refs, and the delayed refs are batched to be processed later.
If we find a matched dedup extent, then we queue an ADD delayed ref on it within endio work, but there is already a DROP delayed ref queued, t1 t2 t3 ->writepage commit transaction ->run_delalloc_dedup find_dedup ------------------------------------------------------------------------------ process_delayed refs (it deletes the dedup extent) add ordered extent | submit pages | finish ordered io | insert file extents | queue delayed refs | queue dedup ref | "process delayed refs" continues (insert a ref on an extent deleted by the above) This senario ends up with a crash because we're going to insert a ref on a deleted extent. To avoid the race, we need to check if there is a ADD delayed ref on deleting the extent and protect this job with lock. Signed-off-by: Liu Bo <bo.li....@oracle.com> --- fs/btrfs/ctree.h | 3 ++- fs/btrfs/extent-tree.c | 35 +++++++++++++++++++---------------- fs/btrfs/file-item.c | 36 +++++++++++++++++++++++++++++++++++- fs/btrfs/inode.c | 10 ++-------- 4 files changed, 58 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 65adf07..48f587e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3764,7 +3764,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); int noinline_for_stack -btrfs_find_dedup_extent(struct btrfs_root *root, struct btrfs_dedup_hash *hash); +btrfs_find_dedup_extent(struct btrfs_root *root, struct btrfs_dedup_hash *hash, + struct inode *inode, u64 file_pos); int noinline_for_stack btrfs_insert_dedup_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 191f0a7..a8da7aa 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5937,9 +5937,23 @@ again: goto again; } } else { - if (!dedup_hash && is_data && - root_objectid == BTRFS_DEDUP_TREE_OBJECTID) - dedup_hash = extent_data_ref_offset(root, path, iref); + if (is_data && root_objectid == BTRFS_DEDUP_TREE_OBJECTID) { + if (!dedup_hash) + dedup_hash = extent_data_ref_offset(root, + path, iref); + + ret = btrfs_free_dedup_extent(trans, root, + dedup_hash, bytenr); + if (ret) { + if (ret == -EAGAIN) + ret = 0; + else + btrfs_abort_transaction(trans, + extent_root, + ret); + goto out; + } + } if (found_extent) { BUG_ON(is_data && refs_to_drop != @@ -5964,21 +5978,10 @@ again: if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, + extent_root, ret); goto out; } - - if (root_objectid == BTRFS_DEDUP_TREE_OBJECTID) { - ret = btrfs_free_dedup_extent(trans, root, - dedup_hash, - bytenr); - if (ret) { - btrfs_abort_transaction(trans, - extent_root, - ret); - goto out; - } - } } ret = update_block_group(root, bytenr, num_bytes, 0); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 068112c..d842cf8 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -887,13 +887,15 @@ fail_unlock: /* 1 means we find one, 0 means we dont. */ int noinline_for_stack -btrfs_find_dedup_extent(struct btrfs_root *root, struct btrfs_dedup_hash *hash) +btrfs_find_dedup_extent(struct btrfs_root *root, struct btrfs_dedup_hash *hash, + struct inode *inode, u64 file_pos) { struct btrfs_key key; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_root *dedup_root; struct btrfs_dedup_item *item; + struct btrfs_trans_handle *trans; u64 hash_value; u64 length; u64 dedup_size; @@ -916,6 +918,12 @@ btrfs_find_dedup_extent(struct btrfs_root *root, struct btrfs_dedup_hash *hash) if (!path) return 0; + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + trans = NULL; + goto out; + } + /* * For SHA256 dedup algorithm, we store the last 64bit as the * key.objectid, and the rest in the tree item. @@ -972,8 +980,16 @@ prev_slot: hash->num_bytes = length; hash->compression = compression; found = 1; + + ret = btrfs_inc_extent_ref(trans, root, key.offset, length, 0, + BTRFS_I(inode)->root->root_key.objectid, + btrfs_ino(inode), + file_pos, /* file_pos - 0 */ + 0); out: btrfs_free_path(path); + if (trans) + btrfs_end_transaction(trans, root); return found; } @@ -1055,6 +1071,8 @@ btrfs_free_dedup_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_root *dedup_root; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *head; int ret = 0; if (!root->fs_info->dedup_root) @@ -1088,6 +1106,22 @@ btrfs_free_dedup_extent(struct btrfs_trans_handle *trans, if (key.objectid != hash || key.offset != bytenr) goto out; + ret = 0; + + /* check if ADD_DELAYED delayed refs exist */ + delayed_refs = &trans->transaction->delayed_refs; + + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + + /* the mutex has been acquired by the caller */ + if (head && head->add_cnt) { + spin_unlock(&delayed_refs->lock); + ret = -EAGAIN; + goto out; + } + spin_unlock(&delayed_refs->lock); + ret = btrfs_del_item(trans, dedup_root, path); WARN_ON(ret); out: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d32b066..9c8bea9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -988,7 +988,8 @@ run_delalloc_dedup(struct inode *inode, struct page *locked_page, u64 start, found = 0; compr = BTRFS_COMPRESS_NONE; } else { - found = btrfs_find_dedup_extent(root, hash); + found = btrfs_find_dedup_extent(root, hash, + inode, start); compr = hash->compression; } @@ -2388,13 +2389,6 @@ static int __insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_ino(inode), hash->hash[index], 0); } - } else { - ret = btrfs_inc_extent_ref(trans, root, ins.objectid, - ins.offset, 0, - root->root_key.objectid, - btrfs_ino(inode), - file_pos, /* file_pos - 0 */ - 0); } out: btrfs_free_path(path); -- 1.8.2.1 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html