The tree mod log records modifications made to fs-tree nodes. Most of
these modifications are caused by automatic rebalancing of the tree.
Changes are recorded for as long as at least one blocker element is
registered; when the last blocker is released, the corresponding log
entries are removed.

With the tree modification log, it is possible to reconstruct a
consistent old state of the tree. This is required to do backref
walking on a busy file system.
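As a usage illustration only (no such caller is added by this patch), a
reader of the log is expected to bracket its operation with a get/put
pair on a seq_list element; log entries recorded while the element is
held stay available for lookup. The sketch below is hypothetical,
including the function name, and the backref walk itself is only hinted
at in a comment:

static void walk_old_state(struct btrfs_fs_info *fs_info, u64 start)
{
	struct seq_list elem = {};

	/* register a blocker; log entries recorded from now on are kept */
	btrfs_get_tree_mod_seq(fs_info, &elem);

	/*
	 * walk backrefs here; tree_mod_log_search(fs_info, start, elem.seq)
	 * finds the most recent log entry for the block at 'start' while
	 * ignoring entries older than elem.seq (hypothetical usage; the
	 * real consumer is expected to follow in later patches)
	 */

	/* drop the blocker; entries nobody blocks on anymore are freed */
	btrfs_put_tree_mod_seq(fs_info, &elem);
}

Keeping the element on tree_mod_seq_list is what stops
btrfs_put_tree_mod_seq() from pruning log entries with higher sequence
numbers.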
Signed-off-by: Jan Schmidt <list.bt...@jan-o-sch.net>
---
 fs/btrfs/ctree.c   |  409 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ctree.h   |    7 +-
 fs/btrfs/disk-io.c |    2 +-
 3 files changed, 416 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 56485b3..6420638 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/rbtree.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -288,6 +289,414 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+enum mod_log_op {
+	MOD_LOG_KEY_REPLACE,
+	MOD_LOG_KEY_ADD,
+	MOD_LOG_KEY_REMOVE,
+	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+	MOD_LOG_MOVE_KEYS,
+	MOD_LOG_ROOT_REPLACE,
+};
+
+struct tree_mod_move {
+	int dst_slot;
+	int nr_items;
+};
+
+struct tree_mod_root {
+	u64 logical;
+	u8 level;
+};
+
+struct tree_mod_elem {
+	struct rb_node node;
+	u64 index;		/* shifted logical */
+	struct seq_list elem;
+	enum mod_log_op op;
+
+	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
+	int slot;
+
+	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
+	u64 generation;
+
+	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
+	struct btrfs_disk_key key;
+	u64 blockptr;
+
+	/* this is used for op == MOD_LOG_MOVE_KEYS */
+	struct tree_mod_move move;
+
+	/* this is used for op == MOD_LOG_ROOT_REPLACE */
+	struct tree_mod_root old_root;
+};
+
+static inline void
+__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
+{
+	elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
+	list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+}
+
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			    struct seq_list *elem)
+{
+	elem->flags = 1;
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	__get_tree_mod_seq(fs_info, elem);
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+}
+
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			    struct seq_list *elem)
+{
+	struct rb_root *tm_root;
+	struct rb_node *node;
+	struct rb_node *next;
+	struct seq_list *cur_elem;
+	struct tree_mod_elem *tm;
+	u64 min_seq = (u64)-1;
+	u64 seq_putting = elem->seq;
+
+	if (!seq_putting)
+		return;
+
+	BUG_ON(!(elem->flags & 1));
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	list_del(&elem->list);
+
+	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
+		if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
+			if (seq_putting > cur_elem->seq) {
+				/*
+				 * blocker with lower sequence number exists, we
+				 * cannot remove anything from the log
+				 */
+				goto out;
+			}
+			min_seq = cur_elem->seq;
+		}
+	}
+
+	/*
+	 * anything that's lower than the lowest existing (read: blocked)
+	 * sequence number can be removed from the tree.
+	 */
+	write_lock(&fs_info->tree_mod_log_lock);
+	tm_root = &fs_info->tree_mod_log;
+	for (node = rb_first(tm_root); node; node = next) {
+		next = rb_next(node);
+		tm = container_of(node, struct tree_mod_elem, node);
+		if (tm->elem.seq > min_seq)
+			continue;
+		rb_erase(node, tm_root);
+		list_del(&tm->elem.list);
+		kfree(tm);
+	}
+	write_unlock(&fs_info->tree_mod_log_lock);
+out:
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+}
+
+/*
+ * key order of the log:
+ *  index -> sequence
+ *
+ * the index is the shifted logical of the *new* root node for root replace
+ * operations, or the shifted logical of the affected block for all other
+ * operations.
+ */
+static noinline int
+__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
+{
+	struct rb_root *tm_root;
+	struct rb_node **new;
+	struct rb_node *parent = NULL;
+	struct tree_mod_elem *cur;
+
+	BUG_ON(!tm || !tm->elem.seq);
+
+	write_lock(&fs_info->tree_mod_log_lock);
+	tm_root = &fs_info->tree_mod_log;
+	new = &tm_root->rb_node;
+	while (*new) {
+		cur = container_of(*new, struct tree_mod_elem, node);
+		parent = *new;
+		if (cur->index < tm->index)
+			new = &((*new)->rb_left);
+		else if (cur->index > tm->index)
+			new = &((*new)->rb_right);
+		else if (cur->elem.seq < tm->elem.seq)
+			new = &((*new)->rb_left);
+		else if (cur->elem.seq > tm->elem.seq)
+			new = &((*new)->rb_right);
+		else {
+			/* duplicate entry; drop the lock before returning */
+			kfree(tm);
+			write_unlock(&fs_info->tree_mod_log_lock);
+			return -EEXIST;
+		}
+	}
+
+	rb_link_node(&tm->node, parent, new);
+	rb_insert_color(&tm->node, tm_root);
+	write_unlock(&fs_info->tree_mod_log_lock);
+
+	return 0;
+}
+
+int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
+		   struct tree_mod_elem **tm_ret)
+{
+	struct tree_mod_elem *tm;
+	u64 seq = 0;
+
+	/*
+	 * we want to avoid a malloc/free cycle if there's no blocker in the
+	 * list.
+	 * we also want to avoid atomic malloc. so we must drop the spinlock
+	 * before calling kzalloc and recheck afterwards.
+	 */
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	if (list_empty(&fs_info->tree_mod_seq_list))
+		goto out;
+
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+	tm = *tm_ret = kzalloc(sizeof(*tm), flags);
+	if (!tm)
+		return -ENOMEM;
+
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	if (list_empty(&fs_info->tree_mod_seq_list)) {
+		kfree(tm);
+		goto out;
+	}
+
+	__get_tree_mod_seq(fs_info, &tm->elem);
+	seq = tm->elem.seq;
+	tm->elem.flags = 0;
+
+out:
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+	return seq;
+}
+
+static noinline int
+tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
+			     struct extent_buffer *eb, int slot,
+			     enum mod_log_op op, gfp_t flags)
+{
+	struct tree_mod_elem *tm;
+	int ret;
+
+	ret = tree_mod_alloc(fs_info, flags, &tm);
+	if (ret <= 0)
+		return ret;
+
+	tm->index = eb->start >> PAGE_CACHE_SHIFT;
+	if (op != MOD_LOG_KEY_ADD) {
+		btrfs_node_key(eb, &tm->key, slot);
+		tm->blockptr = btrfs_node_blockptr(eb, slot);
+	}
+	tm->op = op;
+	tm->slot = slot;
+	tm->generation = btrfs_node_ptr_generation(eb, slot);
+
+	return __tree_mod_log_insert(fs_info, tm);
+}
+
+static noinline int
+tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+			int slot, enum mod_log_op op)
+{
+	return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
+}
+
+static noinline int
+tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
+			 struct extent_buffer *eb, int dst_slot, int src_slot,
+			 int nr_items, gfp_t flags)
+{
+	struct tree_mod_elem *tm;
+	int ret;
+
+	ret = tree_mod_alloc(fs_info, flags, &tm);
+	if (ret <= 0)
+		return ret;
+
+	tm->index = eb->start >> PAGE_CACHE_SHIFT;
+	tm->slot = src_slot;
+	tm->move.dst_slot = dst_slot;
+	tm->move.nr_items = nr_items;
+	tm->op = MOD_LOG_MOVE_KEYS;
+
+	return __tree_mod_log_insert(fs_info, tm);
+}
+
+static noinline int
+tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
+			 struct extent_buffer *old_root,
+			 struct extent_buffer *new_root, gfp_t flags)
+{
+	struct tree_mod_elem *tm;
+	int ret;
+
+	ret = tree_mod_alloc(fs_info, flags, &tm);
+	if (ret <= 0)
+		return ret;
+
+	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
+	tm->old_root.logical = old_root->start;
+	tm->old_root.level = btrfs_header_level(old_root);
+	tm->generation = btrfs_header_generation(old_root);
+	tm->op = MOD_LOG_ROOT_REPLACE;
+
+	return __tree_mod_log_insert(fs_info, tm);
+}
+
+static struct tree_mod_elem *
+__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
+		      int smallest)
+{
+	struct rb_root *tm_root;
+	struct rb_node *node;
+	struct tree_mod_elem *cur = NULL;
+	struct tree_mod_elem *found = NULL;
+	u64 index = start >> PAGE_CACHE_SHIFT;
+
+	read_lock(&fs_info->tree_mod_log_lock);
+	tm_root = &fs_info->tree_mod_log;
+	node = tm_root->rb_node;
+	while (node) {
+		cur = container_of(node, struct tree_mod_elem, node);
+		if (cur->index < index) {
+			node = node->rb_left;
+		} else if (cur->index > index) {
+			node = node->rb_right;
+		} else if (cur->elem.seq < min_seq) {
+			node = node->rb_left;
+		} else if (!smallest) {
+			/* we want the node with the highest seq */
+			if (found)
+				BUG_ON(found->elem.seq > cur->elem.seq);
+			found = cur;
+			node = node->rb_left;
+		} else if (cur->elem.seq > min_seq) {
+			/* we want the node with the smallest seq */
+			if (found)
+				BUG_ON(found->elem.seq < cur->elem.seq);
+			found = cur;
+			node = node->rb_right;
+		} else {
+			/* exact match; break so the read lock is dropped */
+			found = cur;
+			break;
+		}
+	}
+	read_unlock(&fs_info->tree_mod_log_lock);
+
+	return found;
+}
+
+/*
+ * this returns the element from the log with the smallest time sequence
+ * value that's in the log (the oldest log item). any element with a time
+ * sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
+			   u64 min_seq)
+{
+	return __tree_mod_log_search(fs_info, start, min_seq, 1);
+}
+
+/*
+ * this returns the element from the log with the largest time sequence
+ * value that's in the log (the most recent log item). any element with
+ * a time sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
+{
+	return __tree_mod_log_search(fs_info, start, min_seq, 0);
+}
+
+static inline void
+__copy_extent_buffer_log(struct btrfs_fs_info *fs_info,
+			 struct extent_buffer *dst, struct extent_buffer *src,
+			 unsigned long dst_offset, unsigned long src_offset,
+			 int nr_items, size_t item_size)
+{
+	int ret;
+	int i;
+
+	/* speed this up by single seq for all operations? */
+	for (i = 0; i < nr_items; i++) {
+		ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
+					      MOD_LOG_KEY_REMOVE);
+		BUG_ON(ret < 0);
+		ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
+					      MOD_LOG_KEY_ADD);
+		BUG_ON(ret < 0);
+	}
+
+	copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_offset),
+			   btrfs_node_key_ptr_offset(src_offset),
+			   nr_items * item_size);
+}
+
+static inline void
+__memmove_extent_buffer_log(struct btrfs_fs_info *fs_info,
+			    struct extent_buffer *dst,
+			    int dst_offset, int src_offset, int nr_items,
+			    size_t item_size, int tree_mod_log)
+{
+	int ret;
+	if (tree_mod_log) {
+		ret = tree_mod_log_insert_move(fs_info, dst, dst_offset,
+					       src_offset, nr_items, GFP_NOFS);
+		BUG_ON(ret < 0);
+	}
+	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst_offset),
+			      btrfs_node_key_ptr_offset(src_offset),
+			      nr_items * item_size);
+}
+
+static inline void
+__set_node_key_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+		   struct btrfs_disk_key *disk_key, int nr, int atomic)
+{
+	int ret;
+
+	ret = tree_mod_log_insert_key_mask(fs_info, eb, nr, MOD_LOG_KEY_REPLACE,
+					   atomic ? GFP_ATOMIC : GFP_NOFS);
+	BUG_ON(ret < 0);
+
+	btrfs_set_node_key(eb, disk_key, nr);
+}
+
+static void __log_cleaning(struct btrfs_fs_info *fs_info,
+			   struct extent_buffer *eb)
+{
+	int i;
+	int ret;
+	u32 nritems;
+
+	nritems = btrfs_header_nritems(eb);
+	for (i = nritems - 1; i >= 0; i--) {
+		ret = tree_mod_log_insert_key(fs_info, eb, i,
+					      MOD_LOG_KEY_REMOVE_WHILE_FREEING);
+		BUG_ON(ret < 0);
+	}
+}
+
+static inline void
+set_root_pointer(struct btrfs_root *root, struct extent_buffer *new_root_node)
+{
+	int ret;
+	__log_cleaning(root->fs_info, root->node);
+	ret = tree_mod_log_insert_root(root->fs_info, root->node,
+				       new_root_node, GFP_NOFS);
+	BUG_ON(ret < 0);
+	rcu_assign_pointer(root->node, new_root_node);
+}
+
 /*
  * check if the tree block can be shared by multiple trees
  */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6774821..e53bfb9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1132,7 +1132,7 @@ struct btrfs_fs_info {
 	/* this protects tree_mod_seq_list */
 	spinlock_t tree_mod_seq_lock;
 	atomic_t tree_mod_seq;
-	struct list_head tree_mod_list;
+	struct list_head tree_mod_seq_list;
 
 	/* this protects tree_mod_log */
 	rwlock_t tree_mod_log_lock;
@@ -3114,4 +3114,9 @@ struct seq_list {
 	u32 flags;
 };
 
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			    struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			    struct seq_list *elem);
+
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6aec7c6..f51ad84 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1921,7 +1921,7 @@ int open_ctree(struct super_block *sb,
 	init_completion(&fs_info->kobj_unregister);
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
-	INIT_LIST_HEAD(&fs_info->tree_mod_list);
+	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	btrfs_init_block_rsv(&fs_info->global_block_rsv);
 	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
-- 
1.7.3.4