[PATCH] btrfs: reinitialize scrub workers
Scrub starts the workers each time a scrub starts and stops them after it finished. This patch adds an initialization for the workers before each start, otherwise the workers behave strangely. Signed-off-by: Arne Jansen sensi...@gmx.net --- fs/btrfs/disk-io.c |2 -- fs/btrfs/scrub.c |6 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a203d36..7bbbfeb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1668,8 +1668,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(fs_info-scrub_pause_wait); init_rwsem(fs_info-scrub_super_lock); fs_info-scrub_workers_refcnt = 0; - btrfs_init_workers(fs_info-scrub_workers, scrub, - fs_info-thread_pool_size, fs_info-generic_worker); sb-s_blocksize = 4096; sb-s_blocksize_bits = blksize_bits(4096); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index d5a4108..92cac19 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1166,8 +1166,12 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root-fs_info; mutex_lock(fs_info-scrub_lock); - if (fs_info-scrub_workers_refcnt == 0) + if (fs_info-scrub_workers_refcnt == 0) { + btrfs_init_workers(fs_info-scrub_workers, scrub, + fs_info-thread_pool_size, fs_info-generic_worker); + fs_info-scrub_workers.idle_thresh = 4; btrfs_start_workers(fs_info-scrub_workers, 1); + } ++fs_info-scrub_workers_refcnt; mutex_unlock(fs_info-scrub_lock); -- 1.7.3.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 1/6] btrfs: add an extra wait mode to read_extent_buffer_pages
read_extent_buffer_pages currently has two modes, either trigger a read without waiting for anything, or wait for the I/O to finish. The former also bails when it's unable to lock the page. This patch now adds an additional parameter to allow it to block on page lock, but don't wait for completion. Signed-off-by: Arne Jansen sensi...@gmx.net --- fs/btrfs/disk-io.c |4 ++-- fs/btrfs/extent_io.c |6 +++--- fs/btrfs/extent_io.h |3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7bbbfeb..0339cc0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -325,7 +325,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, clear_bit(EXTENT_BUFFER_CORRUPT, eb-bflags); io_tree = BTRFS_I(root-fs_info-btree_inode)-io_tree; while (1) { - ret = read_extent_buffer_pages(io_tree, eb, start, 1, + ret = read_extent_buffer_pages(io_tree, eb, start, 1, 1, btree_get_extent, mirror_num); if (!ret !verify_parent_transid(io_tree, eb, parent_transid)) @@ -940,7 +940,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, if (!buf) return 0; read_extent_buffer_pages(BTRFS_I(btree_inode)-io_tree, -buf, 0, 0, btree_get_extent, 0); +buf, 0, 0, 0, btree_get_extent, 0); free_extent_buffer(buf); return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b181a94..b7f0b9b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3358,7 +3358,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, -u64 start, int wait, +u64 start, int wait_lock, int wait_complete, get_extent_t *get_extent, int mirror_num) { unsigned long i; @@ -3392,7 +3392,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, num_pages = num_extent_pages(eb-start, eb-len); for (i = start_i; i num_pages; i++) { page = extent_buffer_page(eb, i); - if (!wait) { + if (!wait_lock) { if (!trylock_page(page)) goto unlock_exit; } else { @@ -3436,7 +3436,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (bio) submit_one_bio(READ, bio, mirror_num, bio_flags); - if (ret || !wait) + if (ret || !wait_complete) return ret; for (i = start_i; i num_pages; i++) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4e8445a..118c30b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -241,7 +241,8 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); int read_extent_buffer_pages(struct extent_io_tree *tree, -struct extent_buffer *eb, u64 start, int wait, +struct extent_buffer *eb, u64 start, +int wait_lock, int wait_complete, get_extent_t *get_extent, int mirror_num); static inline void extent_buffer_get(struct extent_buffer *eb) -- 1.7.3.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 3/6] btrfs: state information for readahead
Add state information for readahead to btrfs_fs_info and btrfs_device Changes v2: - don't wait in radix_trees - add own set of workers for readahead Signed-off-by: Arne Jansen sensi...@gmx.net --- fs/btrfs/ctree.h |5 + fs/btrfs/disk-io.c | 11 +++ fs/btrfs/volumes.c |8 fs/btrfs/volumes.h |8 4 files changed, 32 insertions(+), 0 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8490ee0..6023de6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1027,6 +1027,7 @@ struct btrfs_fs_info { struct btrfs_workers endio_write_workers; struct btrfs_workers endio_freespace_worker; struct btrfs_workers submit_workers; + struct btrfs_workers readahead_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens @@ -1109,6 +1110,10 @@ struct btrfs_fs_info { u64 fs_state; struct btrfs_delayed_root *delayed_root; + + /* readahead tree */ + spinlock_t reada_lock; + struct radix_tree_root reada_tree; }; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0195172..904c4b3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1679,6 +1679,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info-defrag_inodes = RB_ROOT; fs_info-trans_no_join = 0; + /* readahead state */ + INIT_RADIX_TREE(fs_info-reada_tree, GFP_NOFS ~__GFP_WAIT); + spin_lock_init(fs_info-reada_lock); + fs_info-thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1868,6 +1872,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_workers(fs_info-delayed_workers, delayed-meta, fs_info-thread_pool_size, fs_info-generic_worker); + btrfs_init_workers(fs_info-readahead_workers, readahead, + fs_info-thread_pool_size, + fs_info-generic_worker); /* * endios are largely parallel and should have a very @@ -1878,6 +1885,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info-endio_write_workers.idle_thresh = 2; fs_info-endio_meta_write_workers.idle_thresh = 2; + fs_info-readahead_workers.idle_thresh = 2; btrfs_start_workers(fs_info-workers, 1); btrfs_start_workers(fs_info-generic_worker, 1); @@ -1890,6 +1898,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_start_workers(fs_info-endio_write_workers, 1); btrfs_start_workers(fs_info-endio_freespace_worker, 1); btrfs_start_workers(fs_info-delayed_workers, 1); + btrfs_start_workers(fs_info-readahead_workers, 1); fs_info-bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info-bdi.ra_pages = max(fs_info-bdi.ra_pages, @@ -2147,6 +2156,7 @@ fail_sb_buffer: btrfs_stop_workers(fs_info-endio_freespace_worker); btrfs_stop_workers(fs_info-submit_workers); btrfs_stop_workers(fs_info-delayed_workers); + btrfs_stop_workers(fs_info-readahead_workers); fail_alloc: kfree(fs_info-delayed_root); fail_iput: @@ -2614,6 +2624,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(fs_info-endio_freespace_worker); btrfs_stop_workers(fs_info-submit_workers); btrfs_stop_workers(fs_info-delayed_workers); + btrfs_stop_workers(fs_info-readahead_workers); btrfs_close_devices(fs_info-fs_devices); btrfs_mapping_tree_free(fs_info-mapping_tree); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index da541df..81ffd9b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -349,6 +349,14 @@ static noinline int device_list_add(const char *path, } INIT_LIST_HEAD(device-dev_alloc_list); + /* init readahead state */ + spin_lock_init(device-reada_lock); + device-reada_curr_zone = NULL; + atomic_set(device-reada_in_flight, 0); + device-reada_next = 0; + INIT_RADIX_TREE(device-reada_zones, GFP_NOFS ~__GFP_WAIT); + INIT_RADIX_TREE(device-reada_extents, GFP_NOFS ~__GFP_WAIT); + mutex_lock(fs_devices-device_list_mutex); list_add_rcu(device-dev_list, fs_devices-devices); mutex_unlock(fs_devices-device_list_mutex); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7c12d61..63ac242 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -91,6 +91,14 @@ struct btrfs_device { struct btrfs_work work; struct rcu_head rcu; struct work_struct rcu_work; + + /* readahead state */ + spinlock_t reada_lock; + atomic_t reada_in_flight; + u64 reada_next; + struct reada_zone
[PATCH v2 0/6] btrfs: generic readeahead interface
This series introduces a generic readahead interface for btrfs trees. The intention is to use it to speed up scrub in a first run, but balance is another hot candidate. In general, every tree walk could be accompanied by a readahead. Deletion of large files comes to mind, where the fetching of the csums takes most of the time. Also the initial build-ups of free-space-caches and inode-allocator-caches could be sped up. To make testing easier, a simple ioctl interface is added to trigger a read- ahead from user mode. It also implements a tree walk in the traditional way. A tool to send the ioctl follows shortly. A simple demonstration from my 7-disk test btrfs: - enumerating the extent tree (traditional): 351s - enumerating the extent tree (readahead): 41s - enumerating extents+csum tree (readahead): 49s The implementation is also tested with this tool in various combinations of parallel reads of the same and of different trees. The main changes from v1 are: - Switch from extent_state flags to extent_buffer flags. - Fix a race when triggering the read. - Fix a bug where only parts of the requested range where actually prefetched. The hit only when requesting parts of a tree, so the above numbers doesn't change. Arne Jansen (6): btrfs: add an extra wait mode to read_extent_buffer_pages btrfs: add READAHEAD extent buffer flag btrfs: state information for readahead btrfs: initial readahead code and prototypes btrfs: hooks for readahead btrfs: test ioctl for readahead fs/btrfs/Makefile|3 +- fs/btrfs/ctree.h | 13 + fs/btrfs/disk-io.c | 84 - fs/btrfs/disk-io.h |2 + fs/btrfs/extent_io.c |8 +- fs/btrfs/extent_io.h |4 +- fs/btrfs/ioctl.c | 93 +- fs/btrfs/ioctl.h | 16 + fs/btrfs/reada.c | 997 ++ fs/btrfs/volumes.c |8 + fs/btrfs/volumes.h |8 + 11 files changed, 1225 insertions(+), 11 deletions(-) create mode 100644 fs/btrfs/reada.c -- 1.7.3.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 2/6] btrfs: add READAHEAD extent buffer flag
Add a READAHEAD extent buffer flag. Add a function to trigger a read with this flag set. Changes v2: - use extent buffer flags instead of extent state flags Signed-off-by: Arne Jansen sensi...@gmx.net --- fs/btrfs/disk-io.c | 32 fs/btrfs/disk-io.h |2 ++ fs/btrfs/extent_io.h |1 + 3 files changed, 35 insertions(+), 0 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0339cc0..0195172 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -945,6 +945,38 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, return ret; } +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, +int mirror_num, struct extent_buffer **eb) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root-fs_info-btree_inode; + struct extent_io_tree *io_tree = BTRFS_I(btree_inode)-io_tree; + int ret; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return 0; + + set_bit(EXTENT_BUFFER_READAHEAD, buf-bflags); + + ret = read_extent_buffer_pages(io_tree, buf, 0, 0, 1, btree_get_extent, +mirror_num); + if (ret) { + free_extent_buffer(buf); + return ret; + } + + if (test_bit(EXTENT_BUFFER_CORRUPT, buf-bflags)) { + *eb = buf; + return -EIO; + } else if (extent_buffer_uptodate(io_tree, buf, NULL)) { + *eb = buf; + } else { + free_extent_buffer(buf); + } + return 0; +} + struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a0b610a..fb35c4e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, +int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int clean_tree_block(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 118c30b..e0a09e8 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -32,6 +32,7 @@ #define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 #define EXTENT_BUFFER_CORRUPT 3 +#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 -- 1.7.3.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 4/6] btrfs: initial readahead code and prototypes
This is the implementation for the generic read ahead framework. To trigger a readahead, btrfs_reada_add must be called. It will start a read ahead for the given range [start, end) on tree root. The returned handle can either be used to wait on the readahead to finish (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). The read ahead works as follows: On btrfs_reada_add, the root of the tree is inserted into a radix_tree. reada_start_machine will then search for extents to prefetch and trigger some reads. When a read finishes for a node, all contained node/leaf pointers that lie in the given range will also be enqueued. The reads will be triggered in sequential order, thus giving a big win over a naive enumeration. It will also make use of multi-device layouts. Each disk will have its on read pointer and all disks will by utilized in parallel. Also will no two disks read both sides of a mirror simultaneously, as this would waste seeking capacity. Instead both disks will read different parts of the filesystem. Any number of readaheads can be started in parallel. The read order will be determined globally, i.e. 2 parallel readaheads will normally finish faster than the 2 started one after another. Changes v2: - protect root-node by transaction instead of node_lock - fix missed branches: The readahead had a too simple check to determine if a branch from a node should be checked or not. It now also records the upper bound of each node to see if the requested RA range lies within. - use KERN_CONT to debug output, to avoid line breaks - defer reada_start_machine to worker to avoid deadlock Signed-off-by: Arne Jansen sensi...@gmx.net --- fs/btrfs/Makefile |3 +- fs/btrfs/ctree.h |8 + fs/btrfs/reada.c | 997 + 3 files changed, 1007 insertions(+), 1 deletions(-) diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9b72dcf..58302ca 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,4 +7,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \ - compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o + compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ + reada.o diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6023de6..2f7d506 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2668,4 +2668,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); +/* reada.c */ +void *btrfs_reada_add(struct btrfs_root *root, struct btrfs_key *start, + struct btrfs_key *end); +int btrfs_reada_wait(void *handle); +void btrfs_reada_detach(void *handle); +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, +u64 start, int err); + #endif diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c new file mode 100644 index 000..99ea583 --- /dev/null +++ b/fs/btrfs/reada.c @@ -0,0 +1,997 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include linux/sched.h +#include linux/pagemap.h +#include linux/writeback.h +#include linux/blkdev.h +#include linux/rbtree.h +#include linux/slab.h +#include linux/workqueue.h +#include ctree.h +#include volumes.h +#include disk-io.h +#include transaction.h + +#undef DEBUG + +/* + * This is the implementation for the generic read ahead framework. + * + * To trigger a readahead, btrfs_reada_add must be called. It will start + * a read ahead for the given range [start, end) on tree root. The returned + * handle can either be used to wait on the readahead to finish + * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). + * + * The read ahead works as follows: + * On btrfs_reada_add, the root of the tree is inserted into a radix_tree. + * reada_start_machine will then search for extents to prefetch and trigger + * some reads. When a read finishes for a node, all contained node/leaf + *
[PATCH v2 6/6] btrfs: test ioctl for readahead
This ioctl is added to trigger a readahead from user mode. It implements a readahead using the new interface and also a traditional tree walk. This way it's possible to measure the two side by side. Signed-off-by: Arne Jansen sensi...@gmx.net --- fs/btrfs/ioctl.c | 93 -- fs/btrfs/ioctl.h | 16 + 2 files changed, 106 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ac37040..77de17c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2777,6 +2777,93 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) return btrfs_wait_for_commit(root, transid); } +static noinline long btrfs_ioctl_reada_test(struct btrfs_fs_info *fs_info, + void __user *argp) +{ + struct btrfs_key start = { 0 }; + struct btrfs_key end = { + .objectid = (u64)-1, + .type = (u8)-1, + .offset = (u64)-1 + }; + struct btrfs_ioctl_reada_args reada_args; + struct btrfs_key key; + struct btrfs_root *root = NULL; + + if (!argp) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(reada_args, + (struct btrfs_ioctl_reada_args __user *)argp, + sizeof(reada_args))) + return -EFAULT; + + start.objectid = reada_args.start_objectid; + start.type = reada_args.start_type; + start.offset = reada_args.start_offset; + end.objectid = reada_args.end_objectid; + end.type = reada_args.end_type; + end.offset = reada_args.end_offset; + + key.objectid = reada_args.tree; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(fs_info, key); + if (IS_ERR(root)) + return -ENOENT; + + if (!(reada_args.flags BTRFS_READA_IOC_FLAGS_TRAD)) { + void *handle; + + handle = btrfs_reada_add(root, start, end); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (reada_args.flags BTRFS_READA_IOC_FLAGS_WAIT) + btrfs_reada_wait(handle); + else + btrfs_reada_detach(handle); + } else { + struct btrfs_path *path; + struct extent_buffer *leaf; + int slot; + int ret; + + /* +* enumerate the tree the traditional way +*/ + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path-reada = 2; + + ret = btrfs_search_slot(NULL, root, start, path, 0, 0); + if (ret 0) + goto out; + + do { + leaf = path-nodes[0]; + slot = path-slots[0]; + btrfs_item_key_to_cpu(leaf, key, slot); + + if (key.objectid end.objectid) + break; + if (key.objectid == end.objectid key.type end.type) + break; + if (key.objectid == end.objectid + key.type == end.type key.offset end.offset) + break; + } while ((ret = btrfs_next_leaf(root, path)) == 0); +out: + btrfs_free_path(path); + return ret = 0 ? 0 : ret; + } + return 0; +} + static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) { int ret; @@ -2829,8 +2916,7 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, return ret; } -long btrfs_ioctl(struct file *file, unsigned int - cmd, unsigned long arg) +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct btrfs_root *root = BTRFS_I(fdentry(file)-d_inode)-root; void __user *argp = (void __user *)arg; @@ -2901,7 +2987,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_scrub_cancel(root, argp); case BTRFS_IOC_SCRUB_PROGRESS: return btrfs_ioctl_scrub_progress(root, argp); + case BTRFS_IOC_READA_TEST: + return btrfs_ioctl_reada_test(root-fs_info, argp); } - return -ENOTTY; } diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index ad1ea78..ee83259 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -193,6 +193,20 @@ struct btrfs_ioctl_space_args { struct btrfs_ioctl_space_info spaces[0]; }; +#define BTRFS_READA_IOC_FLAGS_WAIT 1 +#define BTRFS_READA_IOC_FLAGS_TRAD 2 +struct btrfs_ioctl_reada_args { + __u64 flags; + __u64 tree; + __u64 start_objectid; + __u8
[PATCH v2 5/6] btrfs: hooks for readahead
This adds the hooks needed for readahead. In the readpage_end_io_hook, the extent state is checked for the EXTENT_READAHEAD flag. Only in this case the readahead hook is called, to keep the impact on non-ra as low as possible. Additionally, a hook for a failed IO is added, otherwise readahead would wait indefinitely for the extent to finish. Changes for v2: - eliminate race condition Signed-off-by: Arne Jansen sensi...@gmx.net --- fs/btrfs/disk-io.c | 37 + fs/btrfs/extent_io.c |2 +- 2 files changed, 38 insertions(+), 1 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 904c4b3..f5b7b79 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -574,11 +574,47 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, end = min_t(u64, eb-len, PAGE_CACHE_SIZE); end = eb-start + end - 1; err: + if (test_bit(EXTENT_BUFFER_READAHEAD, eb-bflags)) { + clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags); + btree_readahead_hook(root, eb, eb-start, ret); + } + free_extent_buffer(eb); out: return ret; } +static int btree_io_failed_hook(struct bio *failed_bio, +struct page *page, u64 start, u64 end, +struct extent_state *state) +{ + struct extent_io_tree *tree; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page-mapping-host)-root; + + tree = BTRFS_I(page-mapping-host)-io_tree; + if (page-private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page-private) + goto out; + + len = page-private 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page); + if (eb == NULL) + goto out; + + if (test_bit(EXTENT_BUFFER_READAHEAD, eb-bflags)) { + clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags); + btree_readahead_hook(root, eb, eb-start, -EIO); + } + +out: + return -EIO;/* we fixed nothing */ +} + static void end_workqueue_bio(struct bio *bio, int err) { struct end_io_wq *end_io_wq = bio-bi_private; @@ -3132,6 +3168,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, + .readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b7f0b9b..19ca500 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1774,7 +1774,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (!uptodate tree-ops tree-ops-readpage_io_failed_hook) { ret = tree-ops-readpage_io_failed_hook(bio, page, -start, end, NULL); +start, end, state); if (ret == 0) { uptodate = test_bit(BIO_UPTODATE, bio-bi_flags); -- 1.7.3.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] btrfs: remove unneeded includes from scrub.c
Signed-off-by: Arne Jansen sensi...@gmx.net --- fs/btrfs/scrub.c |6 -- 1 files changed, 0 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 92cac19..a8d03d5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -16,13 +16,7 @@ * Boston, MA 021110-1307, USA. */ -#include linux/sched.h -#include linux/pagemap.h -#include linux/writeback.h #include linux/blkdev.h -#include linux/rbtree.h -#include linux/slab.h -#include linux/workqueue.h #include ctree.h #include volumes.h #include disk-io.h -- 1.7.3.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] btrfs: use readahead API for scrub
Scrub uses a simple tree-enumeration to bring the relevant portions of the extent- and csum-tree into the page cache before starting the scrub-I/O. This is now replaced by using the new readahead-API. During readahead the scrub is being accounted as paused, so it won't hold off transaction commits. This change raises the average disk bandwith utilisation on my test volume from 70% to 90%. On another volume, the time for a test run went down from 89s to 43s. Signed-off-by: Arne Jansen sensi...@gmx.net --- fs/btrfs/scrub.c | 116 -- 1 files changed, 52 insertions(+), 64 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5..ce5208c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -29,15 +29,12 @@ * any can be found. * * Future enhancements: - * - To enhance the performance, better read-ahead strategies for the - *extent-tree can be employed. * - In case an unrepairable extent is encountered, track which files are *affected and report them * - In case of a read error on files with nodatasum, map the file and read *the extent to trigger a writeback of the good copy * - track and record media errors, throw out bad devices * - add a mode to also read unallocated space - * - make the prefetch cancellable */ struct scrub_bio; @@ -741,13 +738,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, int slot; int i; u64 nstripes; - int start_stripe; struct extent_buffer *l; struct btrfs_key key; u64 physical; u64 logical; u64 generation; u64 mirror_num; + void *reada1; + void *reada2; + struct btrfs_key key_start; + struct btrfs_key key_end; u64 increment = map-stripe_len; u64 offset; @@ -779,81 +779,67 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (!path) return -ENOMEM; - path-reada = 2; path-search_commit_root = 1; path-skip_locking = 1; /* -* find all extents for each stripe and just read them to get -* them into the page cache -* FIXME: we can do better. build a more intelligent prefetching +* trigger the readahead for extent tree csum tree and wait for +* completion. During readahead, the scrub is officially paused +* to not hold off transaction commits */ logical = base + offset; - physical = map-stripes[num].physical; - ret = 0; - for (i = 0; i nstripes; ++i) { - key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)0; - - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret 0) - goto out_noplug; - - /* -* we might miss half an extent here, but that doesn't matter, -* as it's only the prefetch -*/ - while (1) { - l = path-nodes[0]; - slot = path-slots[0]; - if (slot = btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret 0) - goto out_noplug; - break; - } - btrfs_item_key_to_cpu(l, key, slot); + wait_event(sdev-list_wait, + atomic_read(sdev-in_flight) == 0); + atomic_inc(fs_info-scrubs_paused); + wake_up(fs_info-scrub_pause_wait); - if (key.objectid = logical + map-stripe_len) - break; + /* FIXME it might be better to start readahead at commit root */ + key_start.objectid = logical; + key_start.type = BTRFS_EXTENT_ITEM_KEY; + key_start.offset = (u64)0; + key_end.objectid = base + offset + nstripes * increment; + key_end.type = BTRFS_EXTENT_ITEM_KEY; + key_end.offset = (u64)0; + reada1 = btrfs_reada_add(root, key_start, key_end); + + key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_start.type = BTRFS_EXTENT_CSUM_KEY; + key_start.offset = logical; + key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_end.type = BTRFS_EXTENT_CSUM_KEY; + key_end.offset = base + offset + nstripes * increment; + reada2 = btrfs_reada_add(csum_root, key_start, key_end); + + if (!IS_ERR(reada1)) + btrfs_reada_wait(reada1); + if (!IS_ERR(reada2)) + btrfs_reada_wait(reada2); - path-slots[0]++; - } - btrfs_release_path(path); - logical += increment; -
Re: kernel BUG at fs/btrfs/inode.c:4676!
On 06/09/2011 10:06 PM, Daniel J Blueman wrote: On 10 June 2011 09:57, Andy Lutomirski l...@mit.edu wrote: On 06/06/2011 06:19 AM, Marek Otahal wrote: Hello, the issue happens every time when i have to hard power-off my notebook (suspend problems). With kernel 2.6.39 the partition is unmountable, solution is to boot 2.6.38 kernel which 1/ is able to mount the partition, 2/ by doing that fixes the problem so later .39 (after clean shutdown) can mount it also. Same problem here. Mounting with 2.6.38 says: [ 41.906259] Btrfs loaded [ 41.906747] device fsid e040a9d60da49596-66c0275e348878bf devid 1 transid 69217 /dev/mapper/vg_midnight_ssd-home [ 41.908767] btrfs: disk space caching is enabled [ 42.232185] btrfs: unlinked 17 orphans [ 42.232189] btrfs: truncated 2 orphans dmesg in 2.6.39.1 says: [] [ 15.004255] kernel BUG at fs/btrfs/inode.c:4676! [] I've been experiencing the same issue also. Josef/Chris, would an metadata snapshot or full block snapshot help debug this regression? I can probably setup a small testcase to trigger this. If you can come up with a testcase to reproduce I would love you forever ;). If I get done what I wanted to do today I will try and reproduce. Thanks, Josef -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected
David Sterba wrote: Hi, a candidate fix: http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-unstable.git;a=commit;h=aa0467d8d2a00e75b2bb6a56a4ee6d70c5d1928f With Linus' tree, today's linux-next build (powercp ppc64_defconfig) produced this warning: fs/btrfs/delayed-inode.c: In function 'btrfs_delayed_update_inode': fs/btrfs/delayed-inode.c:1598:6: warning: 'ret' may be used uninitialized in this function Introduced by commit 16cdcec736cd (btrfs: implement delayed inode items operation). This fixes a bug in btrfs_update_inode(): if the returned value from btrfs_delayed_update_inode is a nonzero garbage, inode stat data are not updated and several call paths may hit a BUG_ON or fail with strange code. if you can reproduce it reliably, add this patch on top of the delayed inodes. I cherry-picked aa0467d8d2a00e on top of 16cdcec736cd21, which gave me the following instead of a BUG: [ 246.986087] [ cut here ] [ 246.990714] WARNING: at mm/page_alloc.c:2032 __alloc_pages_slowpath+0x54/0x3c5() [ 246.998100] Hardware name: PowerEdge 1950 [ 247.002110] Modules linked in: btrfs zlib_deflate lzo_compress ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp i2c_dev i2c_core ext3 jbd scsi_transport_iscsi rds ib_ipoib rdma_ucm rdma_cm ib_ucm ib_uverbs ib_umad ib_cm iw_cm ib_addr ipv6 ib_sa dm_mirror dm_region_hash dm_log dm_multipath scsi_dh dm_mod video sbs sbshc pci_slot battery acpi_pad ac kvm sg ses sd_mod enclosure megaraid_sas ide_cd_mod cdrom ib_mthca qla2xxx serio_raw ib_mad ib_core scsi_transport_fc button scsi_tgt ata_piix libata scsi_mod dcdbas i5k_amb tpm_tis tpm ioatdma hwmon tpm_bios ehci_hcd dca i5000_edac pcspkr iTCO_wdt uhci_hcd iTCO_vendor_support edac_core rtc nfs nfs_acl auth_rpcgss fscache lockd sunrpc tg3 bnx2 e1000 [last unloaded: freq_table] [ 247.076305] Pid: 6044, comm: cosd Not tainted 2.6.39-2-gf47e9fd #25 [ 247.082911] Call Trace: [ 247.085358] [810ccf8a] ? __alloc_pages_slowpath+0x54/0x3c5 [ 247.091793] [81049379] ? warn_slowpath_common+0x85/0x9e [ 247.097974] [810493ac] ? warn_slowpath_null+0x1a/0x1c [ 247.103974] [810ccf8a] ? __alloc_pages_slowpath+0x54/0x3c5 [ 247.110409] [810ccae2] ? get_page_from_freelist+0x166/0x198 [ 247.116928] [810cd3c5] ? __alloc_pages_nodemask+0xca/0xf4 [ 247.123297] [a0702884] ? unmap_extent_buffer+0x11/0x13 [btrfs] [ 247.130079] [810fd957] ? alloc_pages_current+0xa3/0xac [ 247.136166] [810cc5f7] ? alloc_pages+0xe/0x10 [ 247.141472] [810cc607] ? __get_free_pages+0xe/0x4b [ 247.147218] [811061eb] ? kmalloc_order_trace+0x27/0x55 [ 247.153304] [8110664e] ? __kmalloc+0x37/0x100 [ 247.158625] [a072622a] ? btrfs_batch_insert_items+0xe0/0x229 [btrfs] [ 247.165933] [a06d454b] ? btrfs_block_rsv_release+0x39/0x3b [btrfs] [ 247.173072] [a07265e7] ? btrfs_insert_delayed_items+0xac/0xef [btrfs] [ 247.180472] [a0726798] ? btrfs_run_delayed_items+0x68/0xd9 [btrfs] [ 247.187610] [a06e85fc] ? btrfs_commit_transaction+0x266/0x5c9 [btrfs] [ 247.195000] [81066118] ? list_del_init+0x21/0x21 [ 247.200583] [a0710c4c] ? create_subvol+0x420/0x440 [btrfs] [ 247.207018] [810362ec] ? need_resched+0x23/0x2d [ 247.212511] [a0710d7a] ? btrfs_mksubvol+0x10e/0x167 [btrfs] [ 247.219043] [a071129f] ? btrfs_ioctl_snap_create_transid+0x9c/0x121 [btrfs] [ 247.226962] [a071145e] ? btrfs_ioctl_snap_create+0x50/0x67 [btrfs] [ 247.234101] [a0712c5a] ? btrfs_ioctl+0x1d0/0x2c6 [btrfs] [ 247.240364] [8111fe12] ? vfs_ioctl+0x1d/0x34 [ 247.245582] [8112048d] ? do_vfs_ioctl+0x171/0x17a [ 247.251242] [811130a2] ? fget_light+0x69/0x81 [ 247.256549] [811204f2] ? sys_ioctl+0x5c/0x7c [ 247.261770] [8111d48d] ? putname+0x33/0x37 [ 247.266819] [813b21eb] ? system_call_fastpath+0x16/0x1b [ 247.272993] ---[ end trace 9c75d74017f060f5 ]--- The mkcephfs command I was attempting succeeded, so I'm not sure if the above matters; mm/page_alloc.c:2032 is /* * In the slowpath, we sanity check order to avoid ever trying to * reclaim = MAX_ORDER areas which will never succeed. Callers may * be using allocators in order of preference for an area that is * too large. */ if (order = MAX_ORDER) { WARN_ON_ONCE(!(gfp_mask __GFP_NOWARN)); return NULL; } When I did my bisection, my criteria for success/failure was did mkcephfs succeed?. When I apply this criteria to a recent linus kernel (e.g. 06e86849cf4019), which includes the fix you mentioned (aa0467d8d2a00e), I get still a different failure mode, which doesn't actually reference btrfs: [ 276.364178]
Re: [PATCH 2/2] Btrfs: serialize flushers in reserve_metadata_bytes
On Thu, Jun 09, 2011 at 08:00:15PM +0200, David Sterba wrote: On Thu, Jun 09, 2011 at 10:00:42AM -0400, Josef Bacik wrote: We're not trying to be perfect here, we're trying to be fast :). Be even faster with smp_rmb() :) Arne made me think about this again. Let's analyze it in more detail: The read side, if(delalloc_bytes), utilizes a full barrier, which will force all cpus to flush pending reads and writes. This will ensure 'if' will see a fresh value. However, there is no pairing write barrier and the write flush will happen at some point in time, (delalloc_bytes += len), but completely unsynchronized with the read side. The smp_mb barrier has no desired synchonization effect. Moreover, it has a performance hit. Doing it right with barriers would mean to add smp_rmb before the if(...) and smp_wmb after the delalloc_bytes =, but only in the case the variable is solely synchronized via barriers. Not our case. There is the spinlock. As strict correctness is not needed here, you admit that delalloc_bytes might not correspond to the state of fs_info-delalloc_inodes and this is not a problem. Fine. But then you do not need the smp_mb. The value of delalloc_bytes will be random (ie. unsynchronized), with or without the barrier. Please drop it from the patch. david -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] Btrfs: serialize flushers in reserve_metadata_bytes
On 06/10/2011 01:47 PM, David Sterba wrote: On Thu, Jun 09, 2011 at 08:00:15PM +0200, David Sterba wrote: On Thu, Jun 09, 2011 at 10:00:42AM -0400, Josef Bacik wrote: We're not trying to be perfect here, we're trying to be fast :). Be even faster with smp_rmb() :) Arne made me think about this again. Let's analyze it in more detail: The read side, if(delalloc_bytes), utilizes a full barrier, which will force all cpus to flush pending reads and writes. This will ensure 'if' will see a fresh value. However, there is no pairing write barrier and the write flush will happen at some point in time, (delalloc_bytes += len), but completely unsynchronized with the read side. The smp_mb barrier has no desired synchonization effect. Moreover, it has a performance hit. Doing it right with barriers would mean to add smp_rmb before the if(...) and smp_wmb after the delalloc_bytes =, but only in the case the variable is solely synchronized via barriers. Not our case. There is the spinlock. As strict correctness is not needed here, you admit that delalloc_bytes might not correspond to the state of fs_info-delalloc_inodes and this is not a problem. Fine. But then you do not need the smp_mb. The value of delalloc_bytes will be random (ie. unsynchronized), with or without the barrier. Please drop it from the patch. I just used the spin lock. Josef -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected
Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400: [ two different btrfs crashes ] I think your two crashes in btrfs were from the uninit variables and those should be fixed in rc2. When I did my bisection, my criteria for success/failure was did mkcephfs succeed?. When I apply this criteria to a recent linus kernel (e.g. 06e86849cf4019), which includes the fix you mentioned (aa0467d8d2a00e), I get still a different failure mode, which doesn't actually reference btrfs: [ 276.364178] BUG: unable to handle kernel NULL pointer dereference at 000a [ 276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd] Looking at the resulting code in the oops, we're here in journal_start: if (handle) { J_ASSERT(handle-h_transaction-t_journal == journal); handle comes from current-journal_info, and we're doing a deref on handle-h_transaction, which is probably 0xa. So, we're leaving crud in current-journal_info and ext3 is finding it. Perhaps its from ceph starting a transaction but leaving it running? The bug came with Josef's transaction performance fixes, but it is probably a mixture of his code with the ioctls ceph is using. [ rest of the oops below for context ] -chris [ 276.365127] PGD 1e4469067 PUD 1e1658067 PMD 0 [ 276.365127] Oops: [#1] SMP [ 276.365127] CPU 2 [ 276.365127] Modules linked in: btrfs zlib_deflate lzo_compress ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp i2c_dev i2c_core ext3 jbd scsi_transport_iscsi rds ib_ipoib rdma_ucm rdma_cm ib_ucm ib_uverbs ib_umad ib_cm iw_cm ib_addr ipv6 ib_sa dm_mirror dm_region_hash dm_log dm_multipath scsi_dh dm_mod video sbs sbshc pci_slot battery acpi_pad ac kvm sg ses sd_mod enclosure megaraid_sas ide_cd_mod cdrom ib_mthca ib_mad qla2xxx button ib_core serio_raw scsi_transport_fc scsi_tgt dcdbas ata_piix libata tpm_tis tpm i5k_amb ioatdma tpm_bios hwmon iTCO_wdt scsi_mod i5000_edac iTCO_vendor_support ehci_hcd dca edac_core uhci_hcd pcspkr rtc nfs nfs_acl auth_rpcgss fscache lockd sunrpc tg3 bnx2 e1000 [last unloaded: freq_table] [ 276.365127] [ 276.365127] Pid: 6076, comm: cosd Not tainted 3.0.0-rc2-00196-g06e8684 #26 Dell Inc. PowerEdge 1950/0DT097 [ 276.365127] RIP: 0010:[a05434b1] [a05434b1] journal_start+0x3e/0x9c [jbd] [ 276.365127] RSP: 0018:8801e2897b28 EFLAGS: 00010286 [ 276.365127] RAX: 000a RBX: 8801de8e1090 RCX: 0002 [ 276.365127] RDX: 19b2d000 RSI: 000e RDI: 000e [ 276.365127] RBP: 8801e2897b48 R08: 0003 R09: 8801e2897c38 [ 276.365127] R10: 8801e2897ed8 R11: 0001 R12: 880223ff4400 [ 276.365127] R13: 880218522d60 R14: 0ec6 R15: 88021f54d878 [ 276.365127] FS: 7f8ff0bbb710() GS:88022fc8() knlGS: [ 276.365127] CS: 0010 DS: ES: CR0: 8005003b [ 276.365127] CR2: 000a CR3: 00021744f000 CR4: 06e0 [ 276.365127] DR0: DR1: DR2: [ 276.365127] DR3: DR6: 0ff0 DR7: 0400 [ 276.365127] Process cosd (pid: 6076, threadinfo 8801e2896000, task 880218522d60) [ 276.365127] Stack: [ 276.365127] 8801e2897b68 ea000756e788 88021f54d728 8801e2897c78 [ 276.365127] 8801e2897b58 a05670ce 8801e2897b68 a055c72d [ 276.365127] 8801e2897be8 a055f044 8801e2897c38 0074 [ 276.365127] Call Trace: [ 276.365127] [a05670ce] ext3_journal_start_sb+0x4f/0x51 [ext3] [ 276.365127] [a055c72d] ext3_journal_start+0x12/0x14 [ext3] [ 276.365127] [a055f044] ext3_write_begin+0x93/0x1a1 [ext3] [ 276.365127] [810c6f0e] ? __kunmap_atomic+0xe/0x10 [ 276.365127] [810c75e5] generic_perform_write+0xb1/0x172 [ 276.365127] [81036a33] ? need_resched+0x23/0x2d [ 276.365127] [810c76ea] generic_file_buffered_write+0x44/0x6f [ 276.365127] [810c91f5] __generic_file_aio_write+0x253/0x2a8 [ 276.365127] [810c92ad] generic_file_aio_write+0x63/0xb8 [ 276.365127] [81113b26] do_sync_write+0xc7/0x10b [ 276.365127] [81036a4b] ? should_resched+0xe/0x2f [ 276.365127] [813b0faf] ? _cond_resched+0xe/0x22 [ 276.365127] [811986c3] ? security_file_permission+0x2c/0x31 [ 276.365127] [81113d21] ? rw_verify_area+0xac/0xdb [ 276.365127] [81114253] vfs_write+0xac/0xe4 [ 276.365127] [8111434f] sys_write+0x4c/0x71 [ 276.365127] [813b8beb] system_call_fastpath+0x16/0x1b [ 276.365127] Code: 89 fc 48 c7 c3 e2 ff ff ff 89 f7 65 4c 8b 2c 25 c0 b5 00 00 4d 85 e4 49 8b 85 48 06 00 00 74
Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected
On Fri, 10 Jun 2011, Chris Mason wrote: Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400: [ two different btrfs crashes ] I think your two crashes in btrfs were from the uninit variables and those should be fixed in rc2. When I did my bisection, my criteria for success/failure was did mkcephfs succeed?. When I apply this criteria to a recent linus kernel (e.g. 06e86849cf4019), which includes the fix you mentioned (aa0467d8d2a00e), I get still a different failure mode, which doesn't actually reference btrfs: [ 276.364178] BUG: unable to handle kernel NULL pointer dereference at 000a [ 276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd] Looking at the resulting code in the oops, we're here in journal_start: if (handle) { J_ASSERT(handle-h_transaction-t_journal == journal); handle comes from current-journal_info, and we're doing a deref on handle-h_transaction, which is probably 0xa. So, we're leaving crud in current-journal_info and ext3 is finding it. Perhaps its from ceph starting a transaction but leaving it running? The bug came with Josef's transaction performance fixes, but it is probably a mixture of his code with the ioctls ceph is using. Ah, yeah, that's the problem. We saw a similar problem a while back with the start/stop transaction ioctls. In this case, create_snapshot is doing trans = btrfs_start_transaction(root-fs_info-extent_root, 5); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; } which sets current-journal_info. Then ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); BUG_ON(ret); list_add(pending_snapshot-list, trans-transaction-pending_snapshots); if (async_transid) { *async_transid = trans-transid; ret = btrfs_commit_transaction_async(trans, root-fs_info-extent_root, 1); } else { ret = btrfs_commit_transaction(trans, root-fs_info-extent_root); } but the async snap creation ioctl takes the async path, which runs btrfs_commit_transaction in a worker thread. I'm not sure what the right thing to do is here is... can whatever is in journal_info be attached to trans instead in btrfs_commit_transaction_async()? sage [ rest of the oops below for context ] -chris [ 276.365127] PGD 1e4469067 PUD 1e1658067 PMD 0 [ 276.365127] Oops: [#1] SMP [ 276.365127] CPU 2 [ 276.365127] Modules linked in: btrfs zlib_deflate lzo_compress ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp i2c_dev i2c_core ext3 jbd scsi_transport_iscsi rds ib_ipoib rdma_ucm rdma_cm ib_ucm ib_uverbs ib_umad ib_cm iw_cm ib_addr ipv6 ib_sa dm_mirror dm_region_hash dm_log dm_multipath scsi_dh dm_mod video sbs sbshc pci_slot battery acpi_pad ac kvm sg ses sd_mod enclosure megaraid_sas ide_cd_mod cdrom ib_mthca ib_mad qla2xxx button ib_core serio_raw scsi_transport_fc scsi_tgt dcdbas ata_piix libata tpm_tis tpm i5k_amb ioatdma tpm_bios hwmon iTCO_wdt scsi_mod i5000_edac iTCO_vendor_support ehci_hcd dca edac_core uhci_hcd pcspkr rtc nfs nfs_acl auth_rpcgss fscache lockd sunrpc tg3 bnx2 e1000 [last unloaded: freq_table] [ 276.365127] [ 276.365127] Pid: 6076, comm: cosd Not tainted 3.0.0-rc2-00196-g06e8684 #26 Dell Inc. PowerEdge 1950/0DT097 [ 276.365127] RIP: 0010:[a05434b1] [a05434b1] journal_start+0x3e/0x9c [jbd] [ 276.365127] RSP: 0018:8801e2897b28 EFLAGS: 00010286 [ 276.365127] RAX: 000a RBX: 8801de8e1090 RCX: 0002 [ 276.365127] RDX: 19b2d000 RSI: 000e RDI: 000e [ 276.365127] RBP: 8801e2897b48 R08: 0003 R09: 8801e2897c38 [ 276.365127] R10: 8801e2897ed8 R11: 0001 R12: 880223ff4400 [ 276.365127] R13: 880218522d60 R14: 0ec6 R15: 88021f54d878 [ 276.365127] FS: 7f8ff0bbb710() GS:88022fc8() knlGS: [ 276.365127] CS: 0010 DS: ES: CR0: 8005003b [ 276.365127] CR2: 000a CR3: 00021744f000 CR4: 06e0 [ 276.365127] DR0: DR1: DR2: [ 276.365127] DR3: DR6: 0ff0 DR7: 0400 [ 276.365127] Process cosd (pid: 6076, threadinfo 8801e2896000, task 880218522d60) [ 276.365127] Stack: [ 276.365127] 8801e2897b68 ea000756e788 88021f54d728 8801e2897c78 [ 276.365127] 8801e2897b58 a05670ce 8801e2897b68 a055c72d [ 276.365127] 8801e2897be8 a055f044
Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected
On Fri, 10 Jun 2011, Sage Weil wrote: On Fri, 10 Jun 2011, Chris Mason wrote: Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400: [ two different btrfs crashes ] I think your two crashes in btrfs were from the uninit variables and those should be fixed in rc2. When I did my bisection, my criteria for success/failure was did mkcephfs succeed?. When I apply this criteria to a recent linus kernel (e.g. 06e86849cf4019), which includes the fix you mentioned (aa0467d8d2a00e), I get still a different failure mode, which doesn't actually reference btrfs: [ 276.364178] BUG: unable to handle kernel NULL pointer dereference at 000a [ 276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd] Looking at the resulting code in the oops, we're here in journal_start: if (handle) { J_ASSERT(handle-h_transaction-t_journal == journal); handle comes from current-journal_info, and we're doing a deref on handle-h_transaction, which is probably 0xa. So, we're leaving crud in current-journal_info and ext3 is finding it. Perhaps its from ceph starting a transaction but leaving it running? The bug came with Josef's transaction performance fixes, but it is probably a mixture of his code with the ioctls ceph is using. Ah, yeah, that's the problem. We saw a similar problem a while back with the start/stop transaction ioctls. In this case, create_snapshot is doing trans = btrfs_start_transaction(root-fs_info-extent_root, 5); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; } which sets current-journal_info. Then ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); BUG_ON(ret); list_add(pending_snapshot-list, trans-transaction-pending_snapshots); if (async_transid) { *async_transid = trans-transid; ret = btrfs_commit_transaction_async(trans, root-fs_info-extent_root, 1); } else { ret = btrfs_commit_transaction(trans, root-fs_info-extent_root); } but the async snap creation ioctl takes the async path, which runs btrfs_commit_transaction in a worker thread. I'm not sure what the right thing to do is here is... can whatever is in journal_info be attached to trans instead in btrfs_commit_transaction_async()? It looks like it's not used for anything in btrfs, actually; it's just set and cleared. What's the point of that? Anyway, assuming it's useful, I think the below would fix the problem. Want to give it a shot, Jim? Thanks! sage diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c571734..fd04ad7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1196,6 +1196,9 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, put_transaction(cur_trans); mutex_unlock(root-fs_info-trans_mutex); + if (current-journal_info == trans) + current-journal_info = NULL; + return 0; } -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected
On 06/10/2011 02:14 PM, Sage Weil wrote: On Fri, 10 Jun 2011, Sage Weil wrote: On Fri, 10 Jun 2011, Chris Mason wrote: Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400: [ two different btrfs crashes ] I think your two crashes in btrfs were from the uninit variables and those should be fixed in rc2. When I did my bisection, my criteria for success/failure was did mkcephfs succeed?. When I apply this criteria to a recent linus kernel (e.g. 06e86849cf4019), which includes the fix you mentioned (aa0467d8d2a00e), I get still a different failure mode, which doesn't actually reference btrfs: [ 276.364178] BUG: unable to handle kernel NULL pointer dereference at 000a [ 276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd] Looking at the resulting code in the oops, we're here in journal_start: if (handle) { J_ASSERT(handle-h_transaction-t_journal == journal); handle comes from current-journal_info, and we're doing a deref on handle-h_transaction, which is probably 0xa. So, we're leaving crud in current-journal_info and ext3 is finding it. Perhaps its from ceph starting a transaction but leaving it running? The bug came with Josef's transaction performance fixes, but it is probably a mixture of his code with the ioctls ceph is using. Ah, yeah, that's the problem. We saw a similar problem a while back with the start/stop transaction ioctls. In this case, create_snapshot is doing trans = btrfs_start_transaction(root-fs_info-extent_root, 5); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; } which sets current-journal_info. Then ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); BUG_ON(ret); list_add(pending_snapshot-list, trans-transaction-pending_snapshots); if (async_transid) { *async_transid = trans-transid; ret = btrfs_commit_transaction_async(trans, root-fs_info-extent_root, 1); } else { ret = btrfs_commit_transaction(trans, root-fs_info-extent_root); } but the async snap creation ioctl takes the async path, which runs btrfs_commit_transaction in a worker thread. I'm not sure what the right thing to do is here is... can whatever is in journal_info be attached to trans instead in btrfs_commit_transaction_async()? It looks like it's not used for anything in btrfs, actually; it's just set and cleared. What's the point of that? It is used now, check the beginning of start_transaction(). Thanks, Josef -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected
On Fri, 10 Jun 2011, Josef Bacik wrote: On 06/10/2011 02:14 PM, Sage Weil wrote: On Fri, 10 Jun 2011, Sage Weil wrote: On Fri, 10 Jun 2011, Chris Mason wrote: Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400: [ two different btrfs crashes ] I think your two crashes in btrfs were from the uninit variables and those should be fixed in rc2. When I did my bisection, my criteria for success/failure was did mkcephfs succeed?. When I apply this criteria to a recent linus kernel (e.g. 06e86849cf4019), which includes the fix you mentioned (aa0467d8d2a00e), I get still a different failure mode, which doesn't actually reference btrfs: [ 276.364178] BUG: unable to handle kernel NULL pointer dereference at 000a [ 276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd] Looking at the resulting code in the oops, we're here in journal_start: if (handle) { J_ASSERT(handle-h_transaction-t_journal == journal); handle comes from current-journal_info, and we're doing a deref on handle-h_transaction, which is probably 0xa. So, we're leaving crud in current-journal_info and ext3 is finding it. Perhaps its from ceph starting a transaction but leaving it running? The bug came with Josef's transaction performance fixes, but it is probably a mixture of his code with the ioctls ceph is using. Ah, yeah, that's the problem. We saw a similar problem a while back with the start/stop transaction ioctls. In this case, create_snapshot is doing trans = btrfs_start_transaction(root-fs_info-extent_root, 5); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; } which sets current-journal_info. Then ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); BUG_ON(ret); list_add(pending_snapshot-list, trans-transaction-pending_snapshots); if (async_transid) { *async_transid = trans-transid; ret = btrfs_commit_transaction_async(trans, root-fs_info-extent_root, 1); } else { ret = btrfs_commit_transaction(trans, root-fs_info-extent_root); } but the async snap creation ioctl takes the async path, which runs btrfs_commit_transaction in a worker thread. I'm not sure what the right thing to do is here is... can whatever is in journal_info be attached to trans instead in btrfs_commit_transaction_async()? It looks like it's not used for anything in btrfs, actually; it's just set and cleared. What's the point of that? It is used now, check the beginning of start_transaction(). Thanks, Oh I see, okay. So clearing it in btrfs_commit_transaction_async should be fine then, right? When btrfs_commit_transaction runs in the other thread it won't care that current-journal_info is NULL. sage -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected
On 06/10/2011 02:35 PM, Sage Weil wrote: On Fri, 10 Jun 2011, Josef Bacik wrote: On 06/10/2011 02:14 PM, Sage Weil wrote: On Fri, 10 Jun 2011, Sage Weil wrote: On Fri, 10 Jun 2011, Chris Mason wrote: Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400: [ two different btrfs crashes ] I think your two crashes in btrfs were from the uninit variables and those should be fixed in rc2. When I did my bisection, my criteria for success/failure was did mkcephfs succeed?. When I apply this criteria to a recent linus kernel (e.g. 06e86849cf4019), which includes the fix you mentioned (aa0467d8d2a00e), I get still a different failure mode, which doesn't actually reference btrfs: [ 276.364178] BUG: unable to handle kernel NULL pointer dereference at 000a [ 276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd] Looking at the resulting code in the oops, we're here in journal_start: if (handle) { J_ASSERT(handle-h_transaction-t_journal == journal); handle comes from current-journal_info, and we're doing a deref on handle-h_transaction, which is probably 0xa. So, we're leaving crud in current-journal_info and ext3 is finding it. Perhaps its from ceph starting a transaction but leaving it running? The bug came with Josef's transaction performance fixes, but it is probably a mixture of his code with the ioctls ceph is using. Ah, yeah, that's the problem. We saw a similar problem a while back with the start/stop transaction ioctls. In this case, create_snapshot is doing trans = btrfs_start_transaction(root-fs_info-extent_root, 5); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; } which sets current-journal_info. Then ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); BUG_ON(ret); list_add(pending_snapshot-list, trans-transaction-pending_snapshots); if (async_transid) { *async_transid = trans-transid; ret = btrfs_commit_transaction_async(trans, root-fs_info-extent_root, 1); } else { ret = btrfs_commit_transaction(trans, root-fs_info-extent_root); } but the async snap creation ioctl takes the async path, which runs btrfs_commit_transaction in a worker thread. I'm not sure what the right thing to do is here is... can whatever is in journal_info be attached to trans instead in btrfs_commit_transaction_async()? It looks like it's not used for anything in btrfs, actually; it's just set and cleared. What's the point of that? It is used now, check the beginning of start_transaction(). Thanks, Oh I see, okay. So clearing it in btrfs_commit_transaction_async should be fine then, right? When btrfs_commit_transaction runs in the other thread it won't care that current-journal_info is NULL. Oh yeah your patch is good :), Josef -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected
On Fri, 10 Jun 2011, Josef Bacik wrote: On 06/10/2011 02:35 PM, Sage Weil wrote: On Fri, 10 Jun 2011, Josef Bacik wrote: On 06/10/2011 02:14 PM, Sage Weil wrote: On Fri, 10 Jun 2011, Sage Weil wrote: On Fri, 10 Jun 2011, Chris Mason wrote: Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400: [ two different btrfs crashes ] I think your two crashes in btrfs were from the uninit variables and those should be fixed in rc2. When I did my bisection, my criteria for success/failure was did mkcephfs succeed?. When I apply this criteria to a recent linus kernel (e.g. 06e86849cf4019), which includes the fix you mentioned (aa0467d8d2a00e), I get still a different failure mode, which doesn't actually reference btrfs: [ 276.364178] BUG: unable to handle kernel NULL pointer dereference at 000a [ 276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd] Looking at the resulting code in the oops, we're here in journal_start: if (handle) { J_ASSERT(handle-h_transaction-t_journal == journal); handle comes from current-journal_info, and we're doing a deref on handle-h_transaction, which is probably 0xa. So, we're leaving crud in current-journal_info and ext3 is finding it. Perhaps its from ceph starting a transaction but leaving it running? The bug came with Josef's transaction performance fixes, but it is probably a mixture of his code with the ioctls ceph is using. Ah, yeah, that's the problem. We saw a similar problem a while back with the start/stop transaction ioctls. In this case, create_snapshot is doing trans = btrfs_start_transaction(root-fs_info-extent_root, 5); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; } which sets current-journal_info. Then ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); BUG_ON(ret); list_add(pending_snapshot-list, trans-transaction-pending_snapshots); if (async_transid) { *async_transid = trans-transid; ret = btrfs_commit_transaction_async(trans, root-fs_info-extent_root, 1); } else { ret = btrfs_commit_transaction(trans, root-fs_info-extent_root); } but the async snap creation ioctl takes the async path, which runs btrfs_commit_transaction in a worker thread. I'm not sure what the right thing to do is here is... can whatever is in journal_info be attached to trans instead in btrfs_commit_transaction_async()? It looks like it's not used for anything in btrfs, actually; it's just set and cleared. What's the point of that? It is used now, check the beginning of start_transaction(). Thanks, Oh I see, okay. So clearing it in btrfs_commit_transaction_async should be fine then, right? When btrfs_commit_transaction runs in the other thread it won't care that current-journal_info is NULL. Oh yeah your patch is good :), Okay cool. Here's the fix with a proper changelog and a little use-after-free paranoia. Thanks! sage From 9881c0752293769d5133c01dff3ab04c0c24c61b Mon Sep 17 00:00:00 2001 From: Sage Weil s...@newdream.net Date: Fri, 10 Jun 2011 11:41:25 -0700 Subject: [PATCH] Btrfs: clear current-journal_info on async transaction commit Normally current-jouranl_info is cleared by commit_transaction. For an async snap or subvol creation, though, it runs in a work queue. Clear it in btrfs_commit_transaction_async() to avoid leaking a non-NULL journal_info when we return to userspace. When the actual commit runs in the other thread it won't care that it's current-journal_info is already NULL. Signed-off-by: Sage Weil s...@newdream.net --- fs/btrfs/transaction.c |5 - 1 files changed, 4 insertions(+), 1 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dd71966..9d516ed 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1118,8 +1118,11 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, wait_current_trans_commit_start_and_unblock(root, cur_trans); else wait_current_trans_commit_start(root, cur_trans); - put_transaction(cur_trans); + if (current-journal_info == trans) + current-journal_info = NULL; + + put_transaction(cur_trans); return 0; } -- 1.7.0 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected
Excerpts from Josef Bacik's message of 2011-06-10 14:34:21 -0400: On 06/10/2011 02:35 PM, Sage Weil wrote: On Fri, 10 Jun 2011, Josef Bacik wrote: On 06/10/2011 02:14 PM, Sage Weil wrote: On Fri, 10 Jun 2011, Sage Weil wrote: On Fri, 10 Jun 2011, Chris Mason wrote: Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400: [ two different btrfs crashes ] I think your two crashes in btrfs were from the uninit variables and those should be fixed in rc2. When I did my bisection, my criteria for success/failure was did mkcephfs succeed?. When I apply this criteria to a recent linus kernel (e.g. 06e86849cf4019), which includes the fix you mentioned (aa0467d8d2a00e), I get still a different failure mode, which doesn't actually reference btrfs: [ 276.364178] BUG: unable to handle kernel NULL pointer dereference at 000a [ 276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd] Looking at the resulting code in the oops, we're here in journal_start: if (handle) { J_ASSERT(handle-h_transaction-t_journal == journal); handle comes from current-journal_info, and we're doing a deref on handle-h_transaction, which is probably 0xa. So, we're leaving crud in current-journal_info and ext3 is finding it. Perhaps its from ceph starting a transaction but leaving it running? The bug came with Josef's transaction performance fixes, but it is probably a mixture of his code with the ioctls ceph is using. Ah, yeah, that's the problem. We saw a similar problem a while back with the start/stop transaction ioctls. In this case, create_snapshot is doing trans = btrfs_start_transaction(root-fs_info-extent_root, 5); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; } which sets current-journal_info. Then ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); BUG_ON(ret); list_add(pending_snapshot-list, trans-transaction-pending_snapshots); if (async_transid) { *async_transid = trans-transid; ret = btrfs_commit_transaction_async(trans, root-fs_info-extent_root, 1); } else { ret = btrfs_commit_transaction(trans, root-fs_info-extent_root); } but the async snap creation ioctl takes the async path, which runs btrfs_commit_transaction in a worker thread. I'm not sure what the right thing to do is here is... can whatever is in journal_info be attached to trans instead in btrfs_commit_transaction_async()? It looks like it's not used for anything in btrfs, actually; it's just set and cleared. What's the point of that? It is used now, check the beginning of start_transaction(). Thanks, Oh I see, okay. So clearing it in btrfs_commit_transaction_async should be fine then, right? When btrfs_commit_transaction runs in the other thread it won't care that current-journal_info is NULL. Oh yeah your patch is good :), Thanks everyone (especially Jim). -chris -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kernel BUG at fs/btrfs/inode.c:4676!
On Fri, Jun 10, 2011 at 2:43 PM, Marek Otahal markota...@gmail.com wrote: The test-case is quite easy, 1. mount the FS, just with compress-force=lzo option // I didn't try without, but on my other btrfs partition that doesn't use compression the err never happened ...so, can the others who experience the bug confirm compress=lzo used? Yes, I use compress=lzo. --Andy -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected
Sage Weil wrote: On Fri, 10 Jun 2011, Josef Bacik wrote: On 06/10/2011 02:35 PM, Sage Weil wrote: On Fri, 10 Jun 2011, Josef Bacik wrote: On 06/10/2011 02:14 PM, Sage Weil wrote: On Fri, 10 Jun 2011, Sage Weil wrote: On Fri, 10 Jun 2011, Chris Mason wrote: Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400: [ two different btrfs crashes ] I think your two crashes in btrfs were from the uninit variables and those should be fixed in rc2. When I did my bisection, my criteria for success/failure was did mkcephfs succeed?. When I apply this criteria to a recent linus kernel (e.g. 06e86849cf4019), which includes the fix you mentioned (aa0467d8d2a00e), I get still a different failure mode, which doesn't actually reference btrfs: [ 276.364178] BUG: unable to handle kernel NULL pointer dereference at 000a [ 276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd] Looking at the resulting code in the oops, we're here in journal_start: if (handle) { J_ASSERT(handle-h_transaction-t_journal == journal); handle comes from current-journal_info, and we're doing a deref on handle-h_transaction, which is probably 0xa. So, we're leaving crud in current-journal_info and ext3 is finding it. Perhaps its from ceph starting a transaction but leaving it running? The bug came with Josef's transaction performance fixes, but it is probably a mixture of his code with the ioctls ceph is using. Ah, yeah, that's the problem. We saw a similar problem a while back with the start/stop transaction ioctls. In this case, create_snapshot is doing trans = btrfs_start_transaction(root-fs_info-extent_root, 5); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; } which sets current-journal_info. Then ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); BUG_ON(ret); list_add(pending_snapshot-list, trans-transaction-pending_snapshots); if (async_transid) { *async_transid = trans-transid; ret = btrfs_commit_transaction_async(trans, root-fs_info-extent_root, 1); } else { ret = btrfs_commit_transaction(trans, root-fs_info-extent_root); } but the async snap creation ioctl takes the async path, which runs btrfs_commit_transaction in a worker thread. I'm not sure what the right thing to do is here is... can whatever is in journal_info be attached to trans instead in btrfs_commit_transaction_async()? It looks like it's not used for anything in btrfs, actually; it's just set and cleared. What's the point of that? It is used now, check the beginning of start_transaction(). Thanks, Oh I see, okay. So clearing it in btrfs_commit_transaction_async should be fine then, right? When btrfs_commit_transaction runs in the other thread it won't care that current-journal_info is NULL. Oh yeah your patch is good :), Okay cool. Here's the fix with a proper changelog and a little use-after-free paranoia. This patch solves my issue - thanks a lot. Tested-by: Jim Schutt jasc...@sandia.gov -- Jim Thanks! sage From 9881c0752293769d5133c01dff3ab04c0c24c61b Mon Sep 17 00:00:00 2001 From: Sage Weil s...@newdream.net Date: Fri, 10 Jun 2011 11:41:25 -0700 Subject: [PATCH] Btrfs: clear current-journal_info on async transaction commit Normally current-jouranl_info is cleared by commit_transaction. For an async snap or subvol creation, though, it runs in a work queue. Clear it in btrfs_commit_transaction_async() to avoid leaking a non-NULL journal_info when we return to userspace. When the actual commit runs in the other thread it won't care that it's current-journal_info is already NULL. Signed-off-by: Sage Weil s...@newdream.net --- fs/btrfs/transaction.c |5 - 1 files changed, 4 insertions(+), 1 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dd71966..9d516ed 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1118,8 +1118,11 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, wait_current_trans_commit_start_and_unblock(root, cur_trans); else wait_current_trans_commit_start(root, cur_trans); - put_transaction(cur_trans); + if (current-journal_info == trans) + current-journal_info = NULL; + + put_transaction(cur_trans); return 0; } -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: use the normal checksumming infrastructure for free space cache
We used to store the checksums of the space cache directly in the space cache, however that doesn't work out too well if we have more space than we can fit the checksums into the first page. So instead use the normal checksumming infrastructure. There were problems with doing this originally but those problems don't exist now so this works out fine. Thanks, Signed-off-by: Josef Bacik jo...@redhat.com --- fs/btrfs/free-space-cache.c | 137 +-- 1 files changed, 28 insertions(+), 109 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 38f3fd9..17c26aa 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -98,6 +98,12 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return inode; spin_lock(block_group-lock); + if (BTRFS_I(inode)-flags BTRFS_INODE_NODATASUM) { + printk(KERN_INFO Old style space inode found, converting.\n); + BTRFS_I(inode)-flags = ~BTRFS_INODE_NODATASUM; + block_group-disk_cache_state = BTRFS_DC_CLEAR; + } + if (!btrfs_fs_closing(root-fs_info)) { block_group-inode = igrab(inode); block_group-iref = 1; @@ -135,7 +141,7 @@ int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_gid(leaf, inode_item, 0); btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | - BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM); + BTRFS_INODE_PREALLOC); btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans-transid); btrfs_set_inode_block_group(leaf, inode_item, offset); @@ -239,17 +245,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_header *header; struct extent_buffer *leaf; struct page *page; - u32 *checksums = NULL, *crc; - char *disk_crcs = NULL; struct btrfs_key key; struct list_head bitmaps; u64 num_entries; u64 num_bitmaps; u64 generation; - u32 cur_crc = ~(u32)0; pgoff_t index = 0; - unsigned long first_page_offset; - int num_checksums; int ret = 0; INIT_LIST_HEAD(bitmaps); @@ -292,16 +293,6 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries) goto out; - /* Setup everything for doing checksumming */ - num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; - checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); - if (!checksums) - goto out; - first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); - disk_crcs = kzalloc(first_page_offset, GFP_NOFS); - if (!disk_crcs) - goto out; - ret = readahead_cache(inode); if (ret) goto out; @@ -311,17 +302,11 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space *e; void *addr; unsigned long offset = 0; - unsigned long start_offset = 0; int need_loop = 0; if (!num_entries !num_bitmaps) break; - if (index == 0) { - start_offset = first_page_offset; - offset = start_offset; - } - page = grab_cache_page(inode-i_mapping, index); if (!page) goto free_cache; @@ -342,8 +327,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (index == 0) { u64 *gen; - memcpy(disk_crcs, addr, first_page_offset); - gen = addr + (sizeof(u32) * num_checksums); + gen = addr; if (*gen != BTRFS_I(inode)-generation) { printk(KERN_ERR btrfs: space cache generation (%llu) does not match inode (%llu)\n, @@ -355,24 +339,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, page_cache_release(page); goto free_cache; } - crc = (u32 *)disk_crcs; - } - entry = addr + start_offset; - - /* First lets check our crc before we do anything fun */ - cur_crc = ~(u32)0; - cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc, - PAGE_CACHE_SIZE - start_offset); - btrfs_csum_final(cur_crc, (char *)cur_crc); -
Re: kernel BUG at fs/btrfs/inode.c:4676!
On 06/10/2011 02:43 PM, Marek Otahal wrote: On Friday 10 of June 2011 15:33:20 Josef Bacik wrote: On 06/09/2011 10:06 PM, Daniel J Blueman wrote: On 10 June 2011 09:57, Andy Lutomirski l...@mit.edu wrote: On 06/06/2011 06:19 AM, Marek Otahal wrote: Hello, the issue happens every time when i have to hard power-off my notebook (suspend problems). With kernel 2.6.39 the partition is unmountable, solution is to boot 2.6.38 kernel which 1/ is able to mount the partition, 2/ by doing that fixes the problem so later .39 (after clean shutdown) can mount it also. Same problem here. Mounting with 2.6.38 says: [ 41.906259] Btrfs loaded [ 41.906747] device fsid e040a9d60da49596-66c0275e348878bf devid 1 transid 69217 /dev/mapper/vg_midnight_ssd-home [ 41.908767] btrfs: disk space caching is enabled [ 42.232185] btrfs: unlinked 17 orphans [ 42.232189] btrfs: truncated 2 orphans dmesg in 2.6.39.1 says: [] [ 15.004255] kernel BUG at fs/btrfs/inode.c:4676! [] I've been experiencing the same issue also. Josef/Chris, would an metadata snapshot or full block snapshot help debug this regression? I can probably setup a small testcase to trigger this. If you can come up with a testcase to reproduce I would love you forever ;). If I get done what I wanted to do today I will try and reproduce. Thanks, Josef ...I was getting ready for you eternal love, Josef :P...but I can't reproduce it 100%, like 70% success-rate. The test-case is quite easy, 1. mount the FS, just with compress-force=lzo option // I didn't try without, but on my other btrfs partition that doesn't use compression the err never happened ...so, can the others who experience the bug confirm compress=lzo used? 2. cd to it create a file (not sure if needed) 3. hard power-off To reproduce my tests: dd /dev/zero /btrfstest bs=1M count=256 (min required for default mksf.btrfs) losetup /dev/loop0 /btrfstest mkfs.btrfs /dev/loop0 mount -o compress-force=lzo /dev/loop0 /mnt/tmp vim /mnt/tmp/hello.txt ---power off! How long do you wait between these two steps? I've not been able to reproduce this and I've done it maybe 5 times. Either I've fixed it in my tree (yay!) or I'm doing something wrong (boo!). Thanks, Josef -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kernel BUG at fs/btrfs/inode.c:4676!
On Friday 10 of June 2011 16:52:36 Josef Bacik wrote: On 06/10/2011 02:43 PM, Marek Otahal wrote: On Friday 10 of June 2011 15:33:20 Josef Bacik wrote: On 06/09/2011 10:06 PM, Daniel J Blueman wrote: On 10 June 2011 09:57, Andy Lutomirski l...@mit.edu wrote: On 06/06/2011 06:19 AM, Marek Otahal wrote: Hello, the issue happens every time when i have to hard power-off my notebook (suspend problems). With kernel 2.6.39 the partition is unmountable, solution is to boot 2.6.38 kernel which 1/ is able to mount the partition, 2/ by doing that fixes the problem so later .39 (after clean shutdown) can mount it also. Same problem here. Mounting with 2.6.38 says: [ 41.906259] Btrfs loaded [ 41.906747] device fsid e040a9d60da49596-66c0275e348878bf devid 1 transid 69217 /dev/mapper/vg_midnight_ssd-home [ 41.908767] btrfs: disk space caching is enabled [ 42.232185] btrfs: unlinked 17 orphans [ 42.232189] btrfs: truncated 2 orphans dmesg in 2.6.39.1 says: [] [ 15.004255] kernel BUG at fs/btrfs/inode.c:4676! [] I've been experiencing the same issue also. Josef/Chris, would an metadata snapshot or full block snapshot help debug this regression? I can probably setup a small testcase to trigger this. If you can come up with a testcase to reproduce I would love you forever ;). If I get done what I wanted to do today I will try and reproduce. Thanks, Josef ...I was getting ready for you eternal love, Josef :P...but I can't reproduce it 100%, like 70% success-rate. The test-case is quite easy, 1. mount the FS, just with compress-force=lzo option // I didn't try without, but on my other btrfs partition that doesn't use compression the err never happened ...so, can the others who experience the bug confirm compress=lzo used? 2. cd to it create a file (not sure if needed) 3. hard power-off To reproduce my tests: dd /dev/zero /btrfstest bs=1M count=256 (min required for default mksf.btrfs) losetup /dev/loop0 /btrfstest mkfs.btrfs /dev/loop0 mount -o compress-force=lzo /dev/loop0 /mnt/tmp vim /mnt/tmp/hello.txt ---power off! How long do you wait between these two steps? I've not been able to reproduce this and I've done it maybe 5 times. Either I've fixed it in my tree (yay!) or I'm doing something wrong (boo!). Thanks, Josef Not much but not immediately too, I'd say like ~5s. Did ls, df and quit. Tomorrow I'll try if I can spot a difference. Btw, is there a way to simulate power-off on a loopback-fs? Like to kill the loopback device while fs is mounted or some way? So I don't have to stress the poor hw :) Thank you, Mark -- Marek Otahal :o) signature.asc Description: This is a digitally signed message part.
ext3 and btrfs various Oops and kernel BUGs
Dear all, (please Cc) yesterday I had two bugs with btrfs and ext3 occurrences, always happening when plugging in an external USB btrfs disk. Today I had a BUG which is purely ext3 related. The last bug was with yesterdays latest git checkout. Here are the three bugs/oops: that one was with 3.0.0-rc2 (exactely) Jun 10 14:50:23 mithrandir kernel: [40871.704129] BUG: unable to handle kernel NULL pointer dereference at 0030 Jun 10 14:50:23 mithrandir kernel: [40871.704185] IP: [a0d7b6e5] btrfs_print_leaf+0x17/0x75c [btrfs] Jun 10 14:50:23 mithrandir kernel: [40871.704241] PGD 12d5e9067 PUD 12d5e8067 PMD 0 Jun 10 14:50:23 mithrandir kernel: [40871.704277] Oops: [#1] PREEMPT SMP Jun 10 14:50:23 mithrandir kernel: [40871.704311] CPU 1 Jun 10 14:50:23 mithrandir kernel: [40871.704324] Modules linked in: usb_storage rfcomm bnep snd_hrtimer vboxnetadp vboxnetflt vboxdrv binfmt_misc dm_crypt dm_mod isofs btrfs zlib_deflate crc32c libcrc32c vfat fat hso fuse loop uinput snd_hda_codec_realtek snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi arc4 snd_rawmidi iwlagn mac80211 snd_seq_midi_event cfg80211 snd_seq snd_timer snd_seq_device snd btusb bluetooth sony_laptop tpm_infineon soundcore mxm_wmi joydev firewire_ohci crc16 firewire_core snd_page_alloc rfkill crc_itu_t Jun 10 14:50:23 mithrandir kernel: [40871.704733] Jun 10 14:50:23 mithrandir kernel: [40871.704745] Pid: 18862, comm: btrfs-transacti Tainted: P3.0.0-rc2+ #33 Sony Corporation VGN-Z11VN_B/VAIO Jun 10 14:50:23 mithrandir kernel: [40871.704812] RIP: 0010:[a0d7b6e5] [a0d7b6e5] btrfs_print_leaf+0x17/0x75c [btrfs] Jun 10 14:50:23 mithrandir kernel: [40871.704875] RSP: 0018:880103eadb10 EFLAGS: 00010282 Jun 10 14:50:23 mithrandir kernel: [40871.704906] RAX: fffb RBX: RCX: 88012431a828 Jun 10 14:50:23 mithrandir kernel: [40871.704945] RDX: 0044824bb000 RSI: RDI: 88012bd11800 Jun 10 14:50:23 mithrandir kernel: [40871.704985] RBP: 880103eadb90 R08: 0001 R09: Jun 10 14:50:23 mithrandir kernel: [40871.705024] R10: R11: 0001 R12: 88012bd11800 Jun 10 14:50:23 mithrandir kernel: [40871.705063] R13: 004482495000 R14: 0007 R15: Jun 10 14:50:23 mithrandir kernel: [40871.705103] FS: () GS:88013fd0() knlGS: Jun 10 14:50:23 mithrandir kernel: [40871.705149] CS: 0010 DS: ES: CR0: 8005003b Jun 10 14:50:23 mithrandir kernel: [40871.705180] CR2: 0030 CR3: 0001270df000 CR4: 06e0 Jun 10 14:50:23 mithrandir kernel: [40871.705220] DR0: DR1: DR2: Jun 10 14:50:23 mithrandir kernel: [40871.705259] DR3: DR6: 0ff0 DR7: 0400 Jun 10 14:50:23 mithrandir kernel: [40871.705299] Process btrfs-transacti (pid: 18862, threadinfo 880103eac000, task 88013f5e7750) Jun 10 14:50:23 mithrandir kernel: [40871.705350] Stack: Jun 10 14:50:23 mithrandir kernel: [40871.705364] 880103eadb20 004482495000 880103eadb50 88012bd11800 Jun 10 14:50:23 mithrandir kernel: [40871.705415] 8050 0206 88013f7a64c0 Jun 10 14:50:23 mithrandir kernel: [40871.705465] a80044824950 1000 8800374bf140 880102dcbd00 Jun 10 14:50:23 mithrandir kernel: [40871.705516] Call Trace: Jun 10 14:50:23 mithrandir kernel: [40871.705539] [a0d7584e] __btrfs_free_extent+0x24b/0x54f [btrfs] Jun 10 14:50:23 mithrandir kernel: [40871.705579] [810781cc] ? __delayacct_blkio_end+0x39/0x3b Jun 10 14:50:23 mithrandir kernel: [40871.705616] [81025a4e] ? delayacct_blkio_end+0x1c/0x3c Jun 10 14:50:23 mithrandir kernel: [40871.705652] [810d2a0e] ? wait_on_buffer+0x12/0x12 Jun 10 14:50:23 mithrandir kernel: [40871.705692] [a0d77d22] run_clustered_refs+0x616/0x660 [btrfs] Jun 10 14:50:23 mithrandir kernel: [40871.705733] [81025042] ? set_next_entity+0x3b/0x60 Jun 10 14:50:23 mithrandir kernel: [40871.705776] [a0db768f] ? btrfs_find_ref_cluster+0xf6/0x137 [btrfs] Jun 10 14:50:23 mithrandir kernel: [40871.705822] [a0d77e38] btrfs_run_delayed_refs+0xcc/0x17c [btrfs] Jun 10 14:50:23 mithrandir kernel: [40871.705871] [a0d85775] btrfs_commit_transaction+0x72/0x655 [btrfs] Jun 10 14:50:23 mithrandir kernel: [40871.705920] [a0d84d89] ? join_transaction.isra.23+0x203/0x20a [btrfs] Jun 10 14:50:23 mithrandir kernel: [40871.705964] [810481d4] ? add_wait_queue+0x44/0x44 Jun 10 14:50:23 mithrandir kernel: [40871.706006] [a0d80180] transaction_kthread+0x174/0x22b [btrfs] Jun 10 14:50:23 mithrandir kernel: [40871.706053] [a0d8000c] ?