4.16-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Nikolay Borisov <nbori...@suse.com>

commit 5e388e95815408c27f3612190d089afc0774b870 upstream.

When the delayed refs for a head are all run, eventually
cleanup_ref_head is called which (in case of deletion) obtains a
reference for the relevant btrfs_space_info struct by querying the bg
for the range. This is problematic because when the last extent of a
bg is deleted a race window emerges between removal of that bg and the
subsequent invocation of cleanup_ref_head. This can result in cache being null
and either a null pointer dereference or assertion failure.

        task: ffff8d04d31ed080 task.stack: ffff9e5dc10cc000
        RIP: 0010:assfail.constprop.78+0x18/0x1a [btrfs]
        RSP: 0018:ffff9e5dc10cfbe8 EFLAGS: 00010292
        RAX: 0000000000000044 RBX: 0000000000000000 RCX: 0000000000000000
        RDX: ffff8d04ffc1f868 RSI: ffff8d04ffc178c8 RDI: ffff8d04ffc178c8
        RBP: ffff8d04d29e5ea0 R08: 00000000000001f0 R09: 0000000000000001
        R10: ffff9e5dc0507d58 R11: 0000000000000001 R12: ffff8d04d29e5ea0
        R13: ffff8d04d29e5f08 R14: ffff8d04efe29b40 R15: ffff8d04efe203e0
        FS:  00007fbf58ead500(0000) GS:ffff8d04ffc00000(0000) 
knlGS:0000000000000000
        CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
        CR2: 00007fe6c6975648 CR3: 0000000013b2a000 CR4: 00000000000006f0
        DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
        DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
        Call Trace:
         __btrfs_run_delayed_refs+0x10e7/0x12c0 [btrfs]
         btrfs_run_delayed_refs+0x68/0x250 [btrfs]
         btrfs_should_end_transaction+0x42/0x60 [btrfs]
         btrfs_truncate_inode_items+0xaac/0xfc0 [btrfs]
         btrfs_evict_inode+0x4c6/0x5c0 [btrfs]
         evict+0xc6/0x190
         do_unlinkat+0x19c/0x300
         do_syscall_64+0x74/0x140
         entry_SYSCALL_64_after_hwframe+0x3d/0xa2
        RIP: 0033:0x7fbf589c57a7

To fix this, introduce a new flag "is_system" to head_ref structs,
which is populated at insertion time. This allows to decouple the
querying for the spaceinfo from querying the possibly deleted bg.

Fixes: d7eae3403f46 ("Btrfs: rework delayed ref total_bytes_pinned accounting")
CC: sta...@vger.kernel.org # 4.14+
Suggested-by: Omar Sandoval <osan...@osandov.com>
Signed-off-by: Nikolay Borisov <nbori...@suse.com>
Reviewed-by: Omar Sandoval <osan...@fb.com>
Signed-off-by: David Sterba <dste...@suse.com>
Signed-off-by: Greg Kroah-Hartman <gre...@linuxfoundation.org>

---
 fs/btrfs/delayed-ref.c |   19 ++++++++++++++-----
 fs/btrfs/delayed-ref.h |    1 +
 fs/btrfs/extent-tree.c |   16 +++++++++++-----
 3 files changed, 26 insertions(+), 10 deletions(-)

--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -553,8 +553,10 @@ add_delayed_ref_head(struct btrfs_fs_inf
                     struct btrfs_delayed_ref_head *head_ref,
                     struct btrfs_qgroup_extent_record *qrecord,
                     u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
-                    int action, int is_data, int *qrecord_inserted_ret,
+                    int action, int is_data, int is_system,
+                    int *qrecord_inserted_ret,
                     int *old_ref_mod, int *new_ref_mod)
+
 {
        struct btrfs_delayed_ref_head *existing;
        struct btrfs_delayed_ref_root *delayed_refs;
@@ -598,6 +600,7 @@ add_delayed_ref_head(struct btrfs_fs_inf
        head_ref->ref_mod = count_mod;
        head_ref->must_insert_reserved = must_insert_reserved;
        head_ref->is_data = is_data;
+       head_ref->is_system = is_system;
        head_ref->ref_tree = RB_ROOT;
        INIT_LIST_HEAD(&head_ref->ref_add_list);
        RB_CLEAR_NODE(&head_ref->href_node);
@@ -785,6 +788,7 @@ int btrfs_add_delayed_tree_ref(struct bt
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_qgroup_extent_record *record = NULL;
        int qrecord_inserted;
+       int is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID);
 
        BUG_ON(extent_op && extent_op->is_data);
        ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
@@ -813,8 +817,8 @@ int btrfs_add_delayed_tree_ref(struct bt
         */
        head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
                                        bytenr, num_bytes, 0, 0, action, 0,
-                                       &qrecord_inserted, old_ref_mod,
-                                       new_ref_mod);
+                                       is_system, &qrecord_inserted,
+                                       old_ref_mod, new_ref_mod);
 
        add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
                             num_bytes, parent, ref_root, level, action);
@@ -881,7 +885,7 @@ int btrfs_add_delayed_data_ref(struct bt
         */
        head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
                                        bytenr, num_bytes, ref_root, reserved,
-                                       action, 1, &qrecord_inserted,
+                                       action, 1, 0, &qrecord_inserted,
                                        old_ref_mod, new_ref_mod);
 
        add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
@@ -911,9 +915,14 @@ int btrfs_add_delayed_extent_op(struct b
        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
 
+       /*
+        * extent_ops just modify the flags of an extent and they don't result
+        * in ref count changes, hence it's safe to pass false/0 for is_system
+        * argument
+        */
        add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr,
                             num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
-                            extent_op->is_data, NULL, NULL, NULL);
+                            extent_op->is_data, 0, NULL, NULL, NULL);
 
        spin_unlock(&delayed_refs->lock);
        return 0;
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -139,6 +139,7 @@ struct btrfs_delayed_ref_head {
         */
        unsigned int must_insert_reserved:1;
        unsigned int is_data:1;
+       unsigned int is_system:1;
        unsigned int processing:1;
 };
 
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2615,13 +2615,19 @@ static int cleanup_ref_head(struct btrfs
        trace_run_delayed_ref_head(fs_info, head, 0);
 
        if (head->total_ref_mod < 0) {
-               struct btrfs_block_group_cache *cache;
+               struct btrfs_space_info *space_info;
+               u64 flags;
 
-               cache = btrfs_lookup_block_group(fs_info, head->bytenr);
-               ASSERT(cache);
-               percpu_counter_add(&cache->space_info->total_bytes_pinned,
+               if (head->is_data)
+                       flags = BTRFS_BLOCK_GROUP_DATA;
+               else if (head->is_system)
+                       flags = BTRFS_BLOCK_GROUP_SYSTEM;
+               else
+                       flags = BTRFS_BLOCK_GROUP_METADATA;
+               space_info = __find_space_info(fs_info, flags);
+               ASSERT(space_info);
+               percpu_counter_add(&space_info->total_bytes_pinned,
                                   -head->num_bytes);
-               btrfs_put_block_group(cache);
 
                if (head->is_data) {
                        spin_lock(&delayed_refs->lock);


Reply via email to