On Thu, Sep 06, 2012 at 09:10:52AM +0800, Liu Bo wrote: > This comes from one of btrfs's project ideas, > As we defragment files, we break any sharing from other snapshots. > The balancing code will preserve the sharing, and defrag needs to grow this > as well. > > Now we're able to fill the blank with this patch, in which we make full use of > backref walking stuff. > > Here is the basic idea, > o set the writeback ranges started by defragment with flag EXTENT_DEFRAG > o at endio, after we finish updating fs tree, we use backref walking to find > all parents of the ranges and re-link them with the new COWed file layout > by > adding corresponding backrefs. > > Original-Signed-off-by: Li Zefan <[email protected]>
This is a non-standard tag, I think Li will not object to Signed-off-by as his original submission covers most of this patch. And more S-O-B lines are allowed and understood. > Signed-off-by: Liu Bo <[email protected]> > --- > v2: rebase against the latest btrfs-repo > > fs/btrfs/inode.c | 617 > ++++++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 files changed, 617 insertions(+), 0 deletions(-) > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index 55857eb..0470802 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -54,6 +54,7 @@ > #include "locking.h" > #include "free-space-cache.h" > #include "inode-map.h" > +#include "backref.h" > > struct btrfs_iget_args { > u64 ino; > @@ -1846,6 +1847,600 @@ out: > return ret; > } > > +struct extent_backref { > + struct rb_node node; > + struct old_extent *old; > + u64 root_id; > + u64 inum; > + u64 file_pos; > + u64 extent_offset; > + u64 num_bytes; > + u64 generation; > +}; > + > +struct old_extent { this is a very generic name, though it's a local structure, one would wonder what an old_extent/new_extent would mean when it appears in a bloated file like inode.c > + struct list_head list; > + struct new_extent *new; > + > + u64 extent_offset; > + u64 bytenr; > + u64 offset; > + u64 len; > + int count; > +}; > + > +struct new_extent { > + struct rb_root root; > + struct list_head head; > + struct btrfs_path *path; > + struct inode *inode; > + u64 file_pos; > + u64 len; > + u64 bytenr; > + u64 disk_len; > + u64 compress_type; you'll be fine with u8 for compress type > +}; > + > +struct relink_work { > + struct work_struct work; > + struct new_extent *new; > +}; > + > +static int backref_comp(struct extent_backref *b1, struct extent_backref *b2) > +{ > + if (b1->root_id < b2->root_id) > + return -1; > + else if (b1->root_id > b2->root_id) > + return 1; > + > + if (b1->inum < b2->inum) > + return -1; > + else if (b1->inum > b2->inum) > + return 1; > + > + if (b1->file_pos < b2->file_pos) > + return 
-1; > + else if (b1->file_pos > b2->file_pos) > + return 1; > + > + WARN_ON(1); this rather belongs to the caller where you know that two equal backrefs may be a bad thing. otherwise the function does what it's asked for -- compare two backrefs > + return 0; > +} > + > +static void backref_insert(struct rb_root *root, struct extent_backref > *backref) > +{ > + struct rb_node **p = &root->rb_node; > + struct rb_node *parent = NULL; > + struct extent_backref *entry; > + int ret; > + > + while (*p) { > + parent = *p; > + entry = rb_entry(parent, struct extent_backref, node); > + > + ret = backref_comp(backref, entry); > + if (ret < 0) > + p = &(*p)->rb_left; > + else > + p = &(*p)->rb_right; so the backref == entry case is here and some sort of fallback behaviour should be done here (even if it is a BUG_ON) > + } > + > + rb_link_node(&backref->node, parent, p); > + rb_insert_color(&backref->node, root); > +} > + > +/* > + * Note the backref might has changed, and in this case we just return 0. 
> + */ > +static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, > + void *ctx) > +{ > + struct btrfs_file_extent_item *extent; > + struct btrfs_fs_info *fs_info; > + struct old_extent *old = ctx; > + struct new_extent *new = old->new; > + struct btrfs_path *path = new->path; > + struct btrfs_key key; > + struct btrfs_root *root; > + struct extent_backref *backref; > + struct extent_buffer *leaf; > + struct inode *inode = new->inode; > + int slot; > + int ret; > + u64 extent_offset; > + u64 num_bytes; > + > + if (BTRFS_I(inode)->root->root_key.objectid == root_id && > + inum == btrfs_ino(inode)) > + return 0; > + > + key.objectid = root_id; > + key.type = BTRFS_ROOT_ITEM_KEY; > + key.offset = (u64)-1; > + > + fs_info = BTRFS_I(inode)->root->fs_info; > + root = btrfs_read_fs_root_no_name(fs_info, &key); > + if (IS_ERR(root)) { > + if (PTR_ERR(root) == -ENOENT) > + return 0; > + WARN_ON(1); please put some debugging printk ("%d", PTR_ERR(root)) here, otherwise it's pointless to just dump the stack > + return PTR_ERR(root); > + } > + > + key.objectid = inum; > + key.type = BTRFS_EXTENT_DATA_KEY; > + if (offset > (u64)-1 << 32) magic code needs a non-magic comment > + key.offset = 0; > + else > + key.offset = offset; > + > + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); > + if (ret < 0) { > + WARN_ON(1); do you need finer error checking here? like when ret > 0 and when ret == 0 ? 
> + return ret; > + } > + > + while (1) { > + leaf = path->nodes[0]; > + slot = path->slots[0]; > + > + if (slot >= btrfs_header_nritems(leaf)) { > + ret = btrfs_next_leaf(root, path); > + if (ret < 0) { > + goto out; > + } else if (ret > 0) { > + ret = 0; > + goto out; > + } > + continue; > + } > + > + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); > + > + if (key.objectid != inum || key.type != BTRFS_EXTENT_DATA_KEY) > + goto next; > + > + extent = btrfs_item_ptr(leaf, slot, > + struct btrfs_file_extent_item); > + > + if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) > + goto next; > + > + if (key.offset - btrfs_file_extent_offset(leaf, extent) != > + offset) > + goto next; > + > + break; oh, could this be turned into a more structured block? > +next: usually we've seen a cond_resched() in similar places, as it's a while(1) loop without any apparent scheduling point > + path->slots[0]++; > + } > + > + extent_offset = btrfs_file_extent_offset(leaf, extent); > + num_bytes = btrfs_file_extent_num_bytes(leaf, extent); > + > + if (extent_offset >= old->extent_offset + old->offset + old->len || > + extent_offset + num_bytes < old->extent_offset + old->offset) > + goto out; > + > + backref = kmalloc(sizeof(*backref), GFP_NOFS); > + if (!backref) { > + ret = -ENOENT; > + goto out; > + } > + > + backref->root_id = root_id; > + backref->inum = inum; > + backref->file_pos = offset + extent_offset; > + backref->num_bytes = num_bytes; > + backref->extent_offset = extent_offset; > + backref->generation = btrfs_file_extent_generation(leaf, extent); > + backref->old = old; > + backref_insert(&new->root, backref); > + old->count++; > +out: > + btrfs_release_path(path); > + WARN_ON(ret); > + return ret; > +} > + > +static noinline bool record_extent_backrefs(struct btrfs_path *path, > + struct new_extent *new) > +{ > + struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info; > + struct old_extent *old, *tmp; > + int ret; > + bool del = false; del is 
effectively unused throughout this function > + > + new->path = path; > + > + list_for_each_entry_safe(old, tmp, &new->head, list) { > + if (del) > + goto del; > + > + ret = iterate_inodes_from_logical(old->bytenr, fs_info, > + path, record_one_backref, > + old); > + WARN_ON(ret < 0); > +del: > + /* no backref to be processed for this extent */ > + if (!old->count) { > + list_del(&old->list); > + kfree(old); > + } > + } > + > + if (list_empty(&new->head)) > + return false; > + > + return true; > +} > + > +/* > + * Note the backref might has changed, and in this case we just return 0. > + */ > +static noinline int relink_extent_backref(struct btrfs_path *path, > + struct extent_backref *prev, > + struct extent_backref *backref) > +{ > + struct btrfs_file_extent_item *extent; > + struct btrfs_file_extent_item *item; > + struct btrfs_ordered_extent *ordered; > + struct btrfs_trans_handle *trans; > + struct btrfs_fs_info *fs_info; > + struct btrfs_root *root; > + struct btrfs_key key; > + struct extent_buffer *leaf; > + struct old_extent *old = backref->old; > + struct new_extent *new = old->new; > + struct inode *src_inode = new->inode; > + struct inode *inode; > + struct extent_state *cached = NULL; > + int ret = 0; > + u64 hint_byte; > + u64 start; > + u64 len; > + bool merge = false; > + > + if (prev && prev->root_id == backref->root_id && > + prev->inum == backref->inum && > + prev->extent_offset == backref->extent_offset && > + prev->file_pos + prev->num_bytes == backref->file_pos) > + merge = true; > + > + key.objectid = backref->root_id; > + key.type = BTRFS_ROOT_ITEM_KEY; > + key.offset = (u64)-1; > + > + fs_info = BTRFS_I(src_inode)->root->fs_info; > + root = btrfs_read_fs_root_no_name(fs_info, &key); > + if (IS_ERR(root)) { > + if (PTR_ERR(root) == -ENOENT) > + return 0; > + return PTR_ERR(root); > + } > + > + key.objectid = backref->inum; > + key.type = BTRFS_INODE_ITEM_KEY; > + key.offset = 0; > + > + inode = btrfs_iget(fs_info->sb, &key, root, NULL); > + 
if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) { > + if (inode && !IS_ERR(inode)) > + iput(inode); > + return 0; > + } > + > + lock_extent_bits(&BTRFS_I(inode)->io_tree, backref->file_pos, > + backref->file_pos + backref->num_bytes, 0, &cached); > + > + ordered = btrfs_lookup_first_ordered_extent(inode, > + backref->file_pos + > + backref->num_bytes); > + if (ordered) { > + btrfs_put_ordered_extent(ordered); > + goto out_unlock; > + } > + > + trans = btrfs_start_transaction(root, 3); please comment why 3 is the right number here > + if (IS_ERR(trans)) { > + ret = PTR_ERR(trans); > + goto out_unlock; > + } > + > + key.objectid = backref->inum; > + key.type = BTRFS_EXTENT_DATA_KEY; > + key.offset = backref->file_pos; > + > + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); > + if (ret < 0) { > + goto out_free_path; > + } else if (ret > 0) { > + ret = 0; > + goto out_free_path; > + } > + > + extent = btrfs_item_ptr(path->nodes[0], path->slots[0], > + struct btrfs_file_extent_item); > + > + if (btrfs_file_extent_generation(path->nodes[0], extent) != > + backref->generation) > + goto out_free_path; > + > + btrfs_release_path(path); > + > + start = backref->file_pos; > + if (backref->extent_offset < old->extent_offset + old->offset) > + start += old->extent_offset + old->offset - > + backref->extent_offset; > + > + len = min(backref->extent_offset + backref->num_bytes, > + old->extent_offset + old->offset + old->len); > + len -= max(backref->extent_offset, old->extent_offset + old->offset); > + > + ret = btrfs_drop_extents(trans, inode, start, > + start + len, &hint_byte, 1); > + if (ret) > + goto out_free_path; > +again: > + key.objectid = btrfs_ino(inode); > + key.type = BTRFS_EXTENT_DATA_KEY; > + key.offset = start; > + > + if (merge) { > + struct btrfs_file_extent_item *fi; > + u64 extent_len; > + struct btrfs_key found_key; > + > + ret = btrfs_search_slot(trans, root, &key, path, 1, 1); > + if (ret < 0) > + goto out_free_path; > + > + path->slots[0]--; > + 
leaf = path->nodes[0]; > + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); > + > + fi = btrfs_item_ptr(leaf, path->slots[0], > + struct btrfs_file_extent_item); > + extent_len = btrfs_file_extent_num_bytes(leaf, fi); > + > + if (btrfs_file_extent_disk_bytenr(leaf, fi) == new->bytenr && > + btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_REG && > + !btrfs_file_extent_compression(leaf, fi) && > + !btrfs_file_extent_encryption(leaf, fi) && > + !btrfs_file_extent_other_encoding(leaf, fi) && > + extent_len + found_key.offset == start) { so no cow-aware defrag for compressed files? > + btrfs_set_file_extent_num_bytes(leaf, fi, > + extent_len + len); > + btrfs_mark_buffer_dirty(leaf); > + inode_add_bytes(inode, len); > + > + ret = 1; > + goto out_free_path; > + } else { > + merge = false; > + btrfs_release_path(path); > + goto again; > + } > + } > + > + ret = btrfs_insert_empty_item(trans, root, path, &key, > + sizeof(*extent)); > + BUG_ON(ret); looking at other calls, we do handle the return code either by cleaning up and returning or in connection with transaction abort. please try to fix it up more gracefully. 
> + > + leaf = path->nodes[0]; > + item = btrfs_item_ptr(leaf, path->slots[0], > + struct btrfs_file_extent_item); > + btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); > + btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); > + btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); > + btrfs_set_file_extent_num_bytes(leaf, item, len); > + btrfs_set_file_extent_ram_bytes(leaf, item, new->len); > + btrfs_set_file_extent_generation(leaf, item, trans->transid); > + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); > + btrfs_set_file_extent_compression(leaf, item, new->compress_type); > + btrfs_set_file_extent_encryption(leaf, item, 0); > + btrfs_set_file_extent_other_encoding(leaf, item, 0); > + > + btrfs_mark_buffer_dirty(leaf); > + inode_add_bytes(inode, len); > + > + ret = btrfs_inc_extent_ref(trans, root, new->bytenr, > + new->disk_len, 0, > + backref->root_id, backref->inum, > + start, 0); > + BUG_ON(ret); > + > + ret = 1; > +out_free_path: > + btrfs_release_path(path); > + btrfs_end_transaction(trans, root); > +out_unlock: > + unlock_extent_cached(&BTRFS_I(inode)->io_tree, backref->file_pos, > + backref->file_pos + backref->num_bytes, > + &cached, GFP_NOFS); > + iput(inode); > + return ret; > +} > + > +static void relink_file_extents(struct work_struct *work) > +{ > + struct relink_work *rwork; > + struct btrfs_path *path; > + struct new_extent *new; > + struct old_extent *old, *tmp; > + struct extent_backref *backref; > + struct extent_backref *prev = NULL; > + struct inode *inode; > + struct btrfs_root *root; > + struct rb_node *node; > + struct extent_state *cached = NULL; > + int ret; > + > + rwork = container_of(work, struct relink_work, work); > + new = rwork->new; > + inode = new->inode; > + root = BTRFS_I(inode)->root; > + > + path = btrfs_alloc_path(); > + if (!path) > + return; > + > + if (!record_extent_backrefs(path, new)) { > + btrfs_free_path(path); > + goto out; > + } > + btrfs_release_path(path); > + 
> + lock_extent_bits(&BTRFS_I(inode)->io_tree, new->file_pos, > + new->file_pos + new->len, 0, &cached); > + > + while (1) { > + node = rb_first(&new->root); > + if (!node) > + break; > + rb_erase(node, &new->root); > + > + backref = rb_entry(node, struct extent_backref, node); > + > + ret = relink_extent_backref(path, prev, backref); > + WARN_ON(ret < 0); > + > + kfree(prev); > + > + if (ret == 1) > + prev = backref; > + else > + prev = NULL; cond_resched() > + }; drop the ; > + > + kfree(prev); > + > + unlock_extent_cached(&BTRFS_I(inode)->io_tree, new->file_pos, > + new->file_pos + new->len, &cached, GFP_NOFS); > + > + btrfs_free_path(path); > + > + list_for_each_entry_safe(old, tmp, &new->head, list) { > + list_del(&old->list); > + kfree(old); > + } > +out: > + atomic_dec(&root->fs_info->defrag_running); > + wake_up(&root->fs_info->transaction_wait); > + > + kfree(new); > + kfree(rwork); > +} > + > +static struct new_extent * > +record_old_file_extents(struct inode *inode, > + struct btrfs_ordered_extent *ordered) > +{ > + struct btrfs_root *root = BTRFS_I(inode)->root; > + struct btrfs_path *path; > + struct btrfs_key key; > + struct old_extent *old, *tmp; > + struct new_extent *new; > + int ret; > + > + new = kmalloc(sizeof(*new), GFP_NOFS); > + if (!new) > + return NULL; > + > + new->inode = inode; > + new->file_pos = ordered->file_offset; > + new->len = ordered->len; > + new->bytenr = ordered->start; > + new->disk_len = ordered->disk_len; > + new->compress_type = ordered->compress_type; > + new->root = RB_ROOT; > + INIT_LIST_HEAD(&new->head); > + > + path = btrfs_alloc_path(); > + if (!path) > + goto out_kfree; > + > + key.objectid = btrfs_ino(inode); > + key.type = BTRFS_EXTENT_DATA_KEY; > + key.offset = new->file_pos; > + > + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); > + if (ret < 0) > + goto out_free_path; > + if (ret > 0 && path->slots[0] > 0) > + path->slots[0]--; > + > + /* find out all the old extents for the file range */ > + while (1) 
{ > + struct btrfs_file_extent_item *extent; > + struct extent_buffer *l; > + int slot; > + u64 num_bytes; > + u64 offset; > + u64 end; > + > + l = path->nodes[0]; > + slot = path->slots[0]; > + > + if (slot >= btrfs_header_nritems(l)) { > + ret = btrfs_next_leaf(root, path); > + if (ret < 0) > + goto out_free_list; > + else if (ret > 0) > + break; > + continue; > + } > + > + btrfs_item_key_to_cpu(l, &key, slot); > + > + if (key.objectid != btrfs_ino(inode)) > + break; > + if (key.type != BTRFS_EXTENT_DATA_KEY) > + break; > + if (key.offset >= new->file_pos + new->len) > + break; > + > + extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); > + > + num_bytes = btrfs_file_extent_num_bytes(l, extent); > + if (key.offset + num_bytes < new->file_pos) > + goto next; > + > + old = kmalloc(sizeof(*old), GFP_NOFS); > + if (!old) > + goto out_free_list; > + > + offset = max(new->file_pos, key.offset); > + end = min(new->file_pos + new->len, key.offset + num_bytes); > + > + old->bytenr = btrfs_file_extent_disk_bytenr(l, extent); > + old->extent_offset = btrfs_file_extent_offset(l, extent); > + old->offset = offset - key.offset; > + old->len = end - offset; > + old->new = new; > + old->count = 0; > + list_add_tail(&old->list, &new->head); > +next: maybe another cond_resched() > + path->slots[0]++; > + } > + > + btrfs_free_path(path); > + atomic_inc(&root->fs_info->defrag_running); > + > + return new; > + > +out_free_list: > + list_for_each_entry_safe(old, tmp, &new->head, list) { > + list_del(&old->list); > + kfree(old); > + } > +out_free_path: > + btrfs_free_path(path); > +out_kfree: > + kfree(new); > + return NULL; > +} > + > /* > * helper function for btrfs_finish_ordered_io, this > * just reads in some of the csum leaves to prime them into ram > @@ -1863,6 +2458,7 @@ static int btrfs_finish_ordered_io(struct > btrfs_ordered_extent *ordered_extent) > struct btrfs_trans_handle *trans = NULL; > struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; > struct 
extent_state *cached_state = NULL; > + struct relink_work *work = NULL; > int compress_type = 0; > int ret; > bool nolock; > @@ -1899,6 +2495,23 @@ static int btrfs_finish_ordered_io(struct > btrfs_ordered_extent *ordered_extent) > ordered_extent->file_offset + ordered_extent->len - 1, > 0, &cached_state); > > + ret = test_range_bit(io_tree, ordered_extent->file_offset, > + ordered_extent->file_offset + ordered_extent->len - 1, > + EXTENT_DEFRAG, 1, cached_state); > + if (ret && (btrfs_root_last_snapshot(&root->root_item) >= > + BTRFS_I(inode)->generation)) { unnecessary (...) around the 2nd condition > + /* the inode is shared */ > + work = kmalloc(sizeof(*work), GFP_NOFS); > + if (work) { > + work->new = record_old_file_extents(inode, > + ordered_extent); > + if (!work->new) { > + kfree(work); > + work = NULL; > + } > + } > + } > + > if (nolock) > trans = btrfs_join_transaction_nolock(root); > else > @@ -1975,6 +2588,10 @@ out: > */ > btrfs_remove_ordered_extent(inode, ordered_extent); > > + /* for snapshot-aware defrag */ > + if (work) > + relink_file_extents(&work->work); > + > /* once for us */ > btrfs_put_ordered_extent(ordered_extent); > /* once for the tree */ > --- general notes: * the error handling or reporting could be improved, I wouldn't mind more WARN_ONs or BUG_ONs for testing purposes, but for a finalized version the practice of transaction abort or at least an attempt to minimize the number of BUG_ONs should be done * I didn't review the math over the extent lengths and starts otherwise ok, passed 1st round :) david -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to [email protected] More majordomo info at http://vger.kernel.org/majordomo-info.html
