[PATCH] btrfs: reinitialize scrub workers

2011-06-10 Thread Arne Jansen
Scrub starts the workers each time a scrub starts and stops them after it
finished. This patch adds an initialization for the workers before each
start, otherwise the workers behave strangely.

Signed-off-by: Arne Jansen sensi...@gmx.net
---
 fs/btrfs/disk-io.c |2 --
 fs/btrfs/scrub.c   |6 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a203d36..7bbbfeb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1668,8 +1668,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
init_waitqueue_head(fs_info-scrub_pause_wait);
init_rwsem(fs_info-scrub_super_lock);
fs_info-scrub_workers_refcnt = 0;
-   btrfs_init_workers(fs_info-scrub_workers, scrub,
-  fs_info-thread_pool_size, fs_info-generic_worker);
 
sb-s_blocksize = 4096;
sb-s_blocksize_bits = blksize_bits(4096);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index d5a4108..92cac19 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1166,8 +1166,12 @@ static noinline_for_stack int scrub_workers_get(struct 
btrfs_root *root)
struct btrfs_fs_info *fs_info = root-fs_info;
 
mutex_lock(fs_info-scrub_lock);
-   if (fs_info-scrub_workers_refcnt == 0)
+   if (fs_info-scrub_workers_refcnt == 0) {
+   btrfs_init_workers(fs_info-scrub_workers, scrub,
+  fs_info-thread_pool_size, fs_info-generic_worker);
+   fs_info-scrub_workers.idle_thresh = 4;
btrfs_start_workers(fs_info-scrub_workers, 1);
+   }
++fs_info-scrub_workers_refcnt;
mutex_unlock(fs_info-scrub_lock);
 
-- 
1.7.3.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/6] btrfs: add an extra wait mode to read_extent_buffer_pages

2011-06-10 Thread Arne Jansen
read_extent_buffer_pages currently has two modes, either trigger a read
without waiting for anything, or wait for the I/O to finish. The former
also bails when it's unable to lock the page. This patch now adds an
additional parameter to allow it to block on page lock, but don't wait
for completion.

Signed-off-by: Arne Jansen sensi...@gmx.net
---
 fs/btrfs/disk-io.c   |4 ++--
 fs/btrfs/extent_io.c |6 +++---
 fs/btrfs/extent_io.h |3 ++-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7bbbfeb..0339cc0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -325,7 +325,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root 
*root,
clear_bit(EXTENT_BUFFER_CORRUPT, eb-bflags);
io_tree = BTRFS_I(root-fs_info-btree_inode)-io_tree;
while (1) {
-   ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+   ret = read_extent_buffer_pages(io_tree, eb, start, 1, 1,
   btree_get_extent, mirror_num);
if (!ret 
!verify_parent_transid(io_tree, eb, parent_transid))
@@ -940,7 +940,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 
bytenr, u32 blocksize,
if (!buf)
return 0;
read_extent_buffer_pages(BTRFS_I(btree_inode)-io_tree,
-buf, 0, 0, btree_get_extent, 0);
+buf, 0, 0, 0, btree_get_extent, 0);
free_extent_buffer(buf);
return ret;
 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b181a94..b7f0b9b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3358,7 +3358,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 struct extent_buffer *eb,
-u64 start, int wait,
+u64 start, int wait_lock, int wait_complete,
 get_extent_t *get_extent, int mirror_num)
 {
unsigned long i;
@@ -3392,7 +3392,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
num_pages = num_extent_pages(eb-start, eb-len);
for (i = start_i; i  num_pages; i++) {
page = extent_buffer_page(eb, i);
-   if (!wait) {
+   if (!wait_lock) {
if (!trylock_page(page))
goto unlock_exit;
} else {
@@ -3436,7 +3436,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
if (bio)
submit_one_bio(READ, bio, mirror_num, bio_flags);
 
-   if (ret || !wait)
+   if (ret || !wait_complete)
return ret;
 
for (i = start_i; i  num_pages; i++) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4e8445a..118c30b 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -241,7 +241,8 @@ struct extent_buffer *find_extent_buffer(struct 
extent_io_tree *tree,
 u64 start, unsigned long len);
 void free_extent_buffer(struct extent_buffer *eb);
 int read_extent_buffer_pages(struct extent_io_tree *tree,
-struct extent_buffer *eb, u64 start, int wait,
+struct extent_buffer *eb, u64 start,
+int wait_lock, int wait_complete,
 get_extent_t *get_extent, int mirror_num);
 
 static inline void extent_buffer_get(struct extent_buffer *eb)
-- 
1.7.3.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/6] btrfs: state information for readahead

2011-06-10 Thread Arne Jansen
Add state information for readahead to btrfs_fs_info and btrfs_device

Changes v2:
 - don't wait in radix_trees
 - add own set of workers for readahead

Signed-off-by: Arne Jansen sensi...@gmx.net
---
 fs/btrfs/ctree.h   |5 +
 fs/btrfs/disk-io.c |   11 +++
 fs/btrfs/volumes.c |8 
 fs/btrfs/volumes.h |8 
 4 files changed, 32 insertions(+), 0 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8490ee0..6023de6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1027,6 +1027,7 @@ struct btrfs_fs_info {
struct btrfs_workers endio_write_workers;
struct btrfs_workers endio_freespace_worker;
struct btrfs_workers submit_workers;
+   struct btrfs_workers readahead_workers;
/*
 * fixup workers take dirty pages that didn't properly go through
 * the cow mechanism and make them safe to write.  It happens
@@ -1109,6 +1110,10 @@ struct btrfs_fs_info {
u64 fs_state;
 
struct btrfs_delayed_root *delayed_root;
+
+   /* readahead tree */
+   spinlock_t reada_lock;
+   struct radix_tree_root reada_tree;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0195172..904c4b3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1679,6 +1679,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info-defrag_inodes = RB_ROOT;
fs_info-trans_no_join = 0;
 
+   /* readahead state */
+   INIT_RADIX_TREE(fs_info-reada_tree, GFP_NOFS  ~__GFP_WAIT);
+   spin_lock_init(fs_info-reada_lock);
+
fs_info-thread_pool_size = min_t(unsigned long,
  num_online_cpus() + 2, 8);
 
@@ -1868,6 +1872,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_init_workers(fs_info-delayed_workers, delayed-meta,
   fs_info-thread_pool_size,
   fs_info-generic_worker);
+   btrfs_init_workers(fs_info-readahead_workers, readahead,
+  fs_info-thread_pool_size,
+  fs_info-generic_worker);
 
/*
 * endios are largely parallel and should have a very
@@ -1878,6 +1885,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
fs_info-endio_write_workers.idle_thresh = 2;
fs_info-endio_meta_write_workers.idle_thresh = 2;
+   fs_info-readahead_workers.idle_thresh = 2;
 
btrfs_start_workers(fs_info-workers, 1);
btrfs_start_workers(fs_info-generic_worker, 1);
@@ -1890,6 +1898,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_start_workers(fs_info-endio_write_workers, 1);
btrfs_start_workers(fs_info-endio_freespace_worker, 1);
btrfs_start_workers(fs_info-delayed_workers, 1);
+   btrfs_start_workers(fs_info-readahead_workers, 1);
 
fs_info-bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info-bdi.ra_pages = max(fs_info-bdi.ra_pages,
@@ -2147,6 +2156,7 @@ fail_sb_buffer:
btrfs_stop_workers(fs_info-endio_freespace_worker);
btrfs_stop_workers(fs_info-submit_workers);
btrfs_stop_workers(fs_info-delayed_workers);
+   btrfs_stop_workers(fs_info-readahead_workers);
 fail_alloc:
kfree(fs_info-delayed_root);
 fail_iput:
@@ -2614,6 +2624,7 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(fs_info-endio_freespace_worker);
btrfs_stop_workers(fs_info-submit_workers);
btrfs_stop_workers(fs_info-delayed_workers);
+   btrfs_stop_workers(fs_info-readahead_workers);
 
btrfs_close_devices(fs_info-fs_devices);
btrfs_mapping_tree_free(fs_info-mapping_tree);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index da541df..81ffd9b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -349,6 +349,14 @@ static noinline int device_list_add(const char *path,
}
INIT_LIST_HEAD(device-dev_alloc_list);
 
+   /* init readahead state */
+   spin_lock_init(device-reada_lock);
+   device-reada_curr_zone = NULL;
+   atomic_set(device-reada_in_flight, 0);
+   device-reada_next = 0;
+   INIT_RADIX_TREE(device-reada_zones, GFP_NOFS  ~__GFP_WAIT);
+   INIT_RADIX_TREE(device-reada_extents, GFP_NOFS  ~__GFP_WAIT);
+
mutex_lock(fs_devices-device_list_mutex);
list_add_rcu(device-dev_list, fs_devices-devices);
mutex_unlock(fs_devices-device_list_mutex);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7c12d61..63ac242 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -91,6 +91,14 @@ struct btrfs_device {
struct btrfs_work work;
struct rcu_head rcu;
struct work_struct rcu_work;
+
+   /* readahead state */
+   spinlock_t reada_lock;
+   atomic_t reada_in_flight;
+   u64 reada_next;
+   struct reada_zone 

[PATCH v2 0/6] btrfs: generic readeahead interface

2011-06-10 Thread Arne Jansen
This series introduces a generic readahead interface for btrfs trees.
The intention is to use it to speed up scrub in a first run, but balance
is another hot candidate. In general, every tree walk could be accompanied
by a readahead. Deletion of large files comes to mind, where the fetching
of the csums takes most of the time.
Also the initial build-ups of free-space-caches and inode-allocator-caches
could be sped up.

To make testing easier, a simple ioctl interface is added to trigger a read-
ahead from user mode. It also implements a tree walk in the traditional way.
A tool to send the ioctl follows shortly.

A simple demonstration from my 7-disk test btrfs:
 - enumerating the extent tree (traditional): 351s
 - enumerating the extent tree (readahead): 41s
 - enumerating extents+csum tree (readahead): 49s

The implementation is also tested with this tool in various combinations of
parallel reads of the same and of different trees.

The main changes from v1 are:
 - Switch from extent_state flags to extent_buffer flags.
 - Fix a race when triggering the read.
 - Fix a bug where only parts of the requested range where actually prefetched.
   The hit only when requesting parts of a tree, so the above numbers doesn't
   change.

Arne Jansen (6):
  btrfs: add an extra wait mode to read_extent_buffer_pages
  btrfs: add READAHEAD extent buffer flag
  btrfs: state information for readahead
  btrfs: initial readahead code and prototypes
  btrfs: hooks for readahead
  btrfs: test ioctl for readahead

 fs/btrfs/Makefile|3 +-
 fs/btrfs/ctree.h |   13 +
 fs/btrfs/disk-io.c   |   84 -
 fs/btrfs/disk-io.h   |2 +
 fs/btrfs/extent_io.c |8 +-
 fs/btrfs/extent_io.h |4 +-
 fs/btrfs/ioctl.c |   93 +-
 fs/btrfs/ioctl.h |   16 +
 fs/btrfs/reada.c |  997 ++
 fs/btrfs/volumes.c   |8 +
 fs/btrfs/volumes.h   |8 +
 11 files changed, 1225 insertions(+), 11 deletions(-)
 create mode 100644 fs/btrfs/reada.c

-- 
1.7.3.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/6] btrfs: add READAHEAD extent buffer flag

2011-06-10 Thread Arne Jansen
Add a READAHEAD extent buffer flag.
Add a function to trigger a read with this flag set.

Changes v2:
 - use extent buffer flags instead of extent state flags

Signed-off-by: Arne Jansen sensi...@gmx.net
---
 fs/btrfs/disk-io.c   |   32 
 fs/btrfs/disk-io.h   |2 ++
 fs/btrfs/extent_io.h |1 +
 3 files changed, 35 insertions(+), 0 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0339cc0..0195172 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -945,6 +945,38 @@ int readahead_tree_block(struct btrfs_root *root, u64 
bytenr, u32 blocksize,
return ret;
 }
 
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 
blocksize,
+int mirror_num, struct extent_buffer **eb)
+{
+   struct extent_buffer *buf = NULL;
+   struct inode *btree_inode = root-fs_info-btree_inode;
+   struct extent_io_tree *io_tree = BTRFS_I(btree_inode)-io_tree;
+   int ret;
+
+   buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+   if (!buf)
+   return 0;
+
+   set_bit(EXTENT_BUFFER_READAHEAD, buf-bflags);
+
+   ret = read_extent_buffer_pages(io_tree, buf, 0, 0, 1, btree_get_extent,
+mirror_num);
+   if (ret) {
+   free_extent_buffer(buf);
+   return ret;
+   }
+
+   if (test_bit(EXTENT_BUFFER_CORRUPT, buf-bflags)) {
+   *eb = buf;
+   return -EIO;
+   } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
+   *eb = buf;
+   } else {
+   free_extent_buffer(buf);
+   }
+   return 0;
+}
+
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
 {
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index a0b610a..fb35c4e 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root 
*root, u64 bytenr,
  u32 blocksize, u64 parent_transid);
 int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 u64 parent_transid);
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 
blocksize,
+int mirror_num, struct extent_buffer **eb);
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
   u64 bytenr, u32 blocksize);
 int clean_tree_block(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 118c30b..e0a09e8 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -32,6 +32,7 @@
 #define EXTENT_BUFFER_BLOCKING 1
 #define EXTENT_BUFFER_DIRTY 2
 #define EXTENT_BUFFER_CORRUPT 3
+#define EXTENT_BUFFER_READAHEAD 4  /* this got triggered by readahead */
 
 /* these are flags for extent_clear_unlock_delalloc */
 #define EXTENT_CLEAR_UNLOCK_PAGE 0x1
-- 
1.7.3.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 4/6] btrfs: initial readahead code and prototypes

2011-06-10 Thread Arne Jansen
This is the implementation for the generic read ahead framework.

To trigger a readahead, btrfs_reada_add must be called. It will start
a read ahead for the given range [start, end) on tree root. The returned
handle can either be used to wait on the readahead to finish
(btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).

The read ahead works as follows:
On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
reada_start_machine will then search for extents to prefetch and trigger
some reads. When a read finishes for a node, all contained node/leaf
pointers that lie in the given range will also be enqueued. The reads will
be triggered in sequential order, thus giving a big win over a naive
enumeration. It will also make use of multi-device layouts. Each disk
will have its on read pointer and all disks will by utilized in parallel.
Also will no two disks read both sides of a mirror simultaneously, as this
would waste seeking capacity. Instead both disks will read different parts
of the filesystem.
Any number of readaheads can be started in parallel. The read order will be
determined globally, i.e. 2 parallel readaheads will normally finish faster
than the 2 started one after another.

Changes v2:
 - protect root-node by transaction instead of node_lock
 - fix missed branches:
The readahead had a too simple check to determine if a branch from
a node should be checked or not. It now also records the upper bound
of each node to see if the requested RA range lies within.
 - use KERN_CONT to debug output, to avoid line breaks
 - defer reada_start_machine to worker to avoid deadlock

Signed-off-by: Arne Jansen sensi...@gmx.net
---
 fs/btrfs/Makefile |3 +-
 fs/btrfs/ctree.h  |8 +
 fs/btrfs/reada.c  |  997 +
 3 files changed, 1007 insertions(+), 1 deletions(-)

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9b72dcf..58302ca 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,4 +7,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o 
root-tree.o dir-item.o \
   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
   export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
-  compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+  compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+  reada.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6023de6..2f7d506 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2668,4 +2668,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, 
u64 devid);
 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 struct btrfs_scrub_progress *progress);
 
+/* reada.c */
+void *btrfs_reada_add(struct btrfs_root *root, struct btrfs_key *start,
+ struct btrfs_key *end);
+int btrfs_reada_wait(void *handle);
+void btrfs_reada_detach(void *handle);
+int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+u64 start, int err);
+
 #endif
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 000..99ea583
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,997 @@
+/*
+ * Copyright (C) 2011 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include linux/sched.h
+#include linux/pagemap.h
+#include linux/writeback.h
+#include linux/blkdev.h
+#include linux/rbtree.h
+#include linux/slab.h
+#include linux/workqueue.h
+#include ctree.h
+#include volumes.h
+#include disk-io.h
+#include transaction.h
+
+#undef DEBUG
+
+/*
+ * This is the implementation for the generic read ahead framework.
+ *
+ * To trigger a readahead, btrfs_reada_add must be called. It will start
+ * a read ahead for the given range [start, end) on tree root. The returned
+ * handle can either be used to wait on the readahead to finish
+ * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
+ *
+ * The read ahead works as follows:
+ * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
+ * reada_start_machine will then search for extents to prefetch and trigger
+ * some reads. When a read finishes for a node, all contained node/leaf
+ * 

[PATCH v2 6/6] btrfs: test ioctl for readahead

2011-06-10 Thread Arne Jansen
This ioctl is added to trigger a readahead from user mode. It implements
a readahead using the new interface and also a traditional tree walk.
This way it's possible to measure the two side by side.

Signed-off-by: Arne Jansen sensi...@gmx.net
---
 fs/btrfs/ioctl.c |   93 --
 fs/btrfs/ioctl.h |   16 +
 2 files changed, 106 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ac37040..77de17c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2777,6 +2777,93 @@ static noinline long btrfs_ioctl_wait_sync(struct file 
*file, void __user *argp)
return btrfs_wait_for_commit(root, transid);
 }
 
+static noinline long btrfs_ioctl_reada_test(struct btrfs_fs_info *fs_info,
+   void __user *argp)
+{
+   struct btrfs_key start = { 0 };
+   struct btrfs_key end = {
+   .objectid = (u64)-1,
+   .type = (u8)-1,
+   .offset = (u64)-1
+   };
+   struct btrfs_ioctl_reada_args reada_args;
+   struct btrfs_key key;
+   struct btrfs_root *root = NULL;
+
+   if (!argp)
+   return -EINVAL;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   if (copy_from_user(reada_args,
+  (struct btrfs_ioctl_reada_args __user *)argp,
+  sizeof(reada_args)))
+   return -EFAULT;
+
+   start.objectid = reada_args.start_objectid;
+   start.type = reada_args.start_type;
+   start.offset = reada_args.start_offset;
+   end.objectid = reada_args.end_objectid;
+   end.type = reada_args.end_type;
+   end.offset = reada_args.end_offset;
+
+   key.objectid = reada_args.tree;
+   key.type = BTRFS_ROOT_ITEM_KEY;
+   key.offset = (u64)-1;
+   root = btrfs_read_fs_root_no_name(fs_info, key);
+   if (IS_ERR(root))
+   return -ENOENT;
+
+   if (!(reada_args.flags  BTRFS_READA_IOC_FLAGS_TRAD)) {
+   void *handle;
+
+   handle = btrfs_reada_add(root, start, end);
+   if (IS_ERR(handle))
+   return PTR_ERR(handle);
+
+   if (reada_args.flags  BTRFS_READA_IOC_FLAGS_WAIT)
+   btrfs_reada_wait(handle);
+   else
+   btrfs_reada_detach(handle);
+   } else {
+   struct btrfs_path *path;
+   struct extent_buffer *leaf;
+   int slot;
+   int ret;
+
+   /*
+* enumerate the tree the traditional way
+*/
+   path = btrfs_alloc_path();
+   if (!path)
+   return -ENOMEM;
+   path-reada = 2;
+
+   ret = btrfs_search_slot(NULL, root, start, path, 0, 0);
+   if (ret  0)
+   goto out;
+
+   do {
+   leaf = path-nodes[0];
+   slot = path-slots[0];
+   btrfs_item_key_to_cpu(leaf, key, slot);
+
+   if (key.objectid  end.objectid)
+   break;
+   if (key.objectid == end.objectid  key.type  end.type)
+   break;
+   if (key.objectid == end.objectid 
+   key.type == end.type  key.offset  end.offset)
+   break;
+   } while ((ret = btrfs_next_leaf(root, path)) == 0);
+out:
+   btrfs_free_path(path);
+   return ret = 0 ? 0 : ret;
+   }
+   return 0;
+}
+
 static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
 {
int ret;
@@ -2829,8 +2916,7 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root 
*root,
return ret;
 }
 
-long btrfs_ioctl(struct file *file, unsigned int
-   cmd, unsigned long arg)
+long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
struct btrfs_root *root = BTRFS_I(fdentry(file)-d_inode)-root;
void __user *argp = (void __user *)arg;
@@ -2901,7 +2987,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_scrub_cancel(root, argp);
case BTRFS_IOC_SCRUB_PROGRESS:
return btrfs_ioctl_scrub_progress(root, argp);
+   case BTRFS_IOC_READA_TEST:
+   return btrfs_ioctl_reada_test(root-fs_info, argp);
}
-
return -ENOTTY;
 }
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index ad1ea78..ee83259 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -193,6 +193,20 @@ struct btrfs_ioctl_space_args {
struct btrfs_ioctl_space_info spaces[0];
 };
 
+#define BTRFS_READA_IOC_FLAGS_WAIT 1
+#define BTRFS_READA_IOC_FLAGS_TRAD 2
+struct btrfs_ioctl_reada_args {
+   __u64 flags;
+   __u64 tree;
+   __u64 start_objectid;
+   __u8 

[PATCH v2 5/6] btrfs: hooks for readahead

2011-06-10 Thread Arne Jansen
This adds the hooks needed for readahead. In the readpage_end_io_hook,
the extent state is checked for the EXTENT_READAHEAD flag. Only in this
case the readahead hook is called, to keep the impact on non-ra as low
as possible.
Additionally, a hook for a failed IO is added, otherwise readahead would
wait indefinitely for the extent to finish.

Changes for v2:
 - eliminate race condition

Signed-off-by: Arne Jansen sensi...@gmx.net
---
 fs/btrfs/disk-io.c   |   37 +
 fs/btrfs/extent_io.c |2 +-
 2 files changed, 38 insertions(+), 1 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 904c4b3..f5b7b79 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -574,11 +574,47 @@ static int btree_readpage_end_io_hook(struct page *page, 
u64 start, u64 end,
end = min_t(u64, eb-len, PAGE_CACHE_SIZE);
end = eb-start + end - 1;
 err:
+   if (test_bit(EXTENT_BUFFER_READAHEAD, eb-bflags)) {
+   clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags);
+   btree_readahead_hook(root, eb, eb-start, ret);
+   }
+
free_extent_buffer(eb);
 out:
return ret;
 }
 
+static int btree_io_failed_hook(struct bio *failed_bio,
+struct page *page, u64 start, u64 end,
+struct extent_state *state)
+{
+   struct extent_io_tree *tree;
+   unsigned long len;
+   struct extent_buffer *eb;
+   struct btrfs_root *root = BTRFS_I(page-mapping-host)-root;
+
+   tree = BTRFS_I(page-mapping-host)-io_tree;
+   if (page-private == EXTENT_PAGE_PRIVATE)
+   goto out;
+   if (!page-private)
+   goto out;
+
+   len = page-private  2;
+   WARN_ON(len == 0);
+
+   eb = alloc_extent_buffer(tree, start, len, page);
+   if (eb == NULL)
+   goto out;
+
+   if (test_bit(EXTENT_BUFFER_READAHEAD, eb-bflags)) {
+   clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags);
+   btree_readahead_hook(root, eb, eb-start, -EIO);
+   }
+
+out:
+   return -EIO;/* we fixed nothing */
+}
+
 static void end_workqueue_bio(struct bio *bio, int err)
 {
struct end_io_wq *end_io_wq = bio-bi_private;
@@ -3132,6 +3168,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root 
*root)
 static struct extent_io_ops btree_extent_io_ops = {
.write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
+   .readpage_io_failed_hook = btree_io_failed_hook,
.submit_bio_hook = btree_submit_bio_hook,
/* note we're sharing with inode.c for the merge bio hook */
.merge_bio_hook = btrfs_merge_bio_hook,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b7f0b9b..19ca500 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1774,7 +1774,7 @@ static void end_bio_extent_readpage(struct bio *bio, int 
err)
if (!uptodate  tree-ops 
tree-ops-readpage_io_failed_hook) {
ret = tree-ops-readpage_io_failed_hook(bio, page,
-start, end, NULL);
+start, end, state);
if (ret == 0) {
uptodate =
test_bit(BIO_UPTODATE, bio-bi_flags);
-- 
1.7.3.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: remove unneeded includes from scrub.c

2011-06-10 Thread Arne Jansen
Signed-off-by: Arne Jansen sensi...@gmx.net
---
 fs/btrfs/scrub.c |6 --
 1 files changed, 0 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 92cac19..a8d03d5 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -16,13 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include linux/sched.h
-#include linux/pagemap.h
-#include linux/writeback.h
 #include linux/blkdev.h
-#include linux/rbtree.h
-#include linux/slab.h
-#include linux/workqueue.h
 #include ctree.h
 #include volumes.h
 #include disk-io.h
-- 
1.7.3.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: use readahead API for scrub

2011-06-10 Thread Arne Jansen
Scrub uses a simple tree-enumeration to bring the relevant portions
of the extent- and csum-tree into the page cache before starting the
scrub-I/O. This is now replaced by using the new readahead-API.
During readahead the scrub is being accounted as paused, so it won't
hold off transaction commits.

This change raises the average disk bandwith utilisation on my test
volume from 70% to 90%. On another volume, the time for a test run
went down from 89s to 43s.

Signed-off-by: Arne Jansen sensi...@gmx.net
---
 fs/btrfs/scrub.c |  116 --
 1 files changed, 52 insertions(+), 64 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5..ce5208c 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -29,15 +29,12 @@
  * any can be found.
  *
  * Future enhancements:
- *  - To enhance the performance, better read-ahead strategies for the
- *extent-tree can be employed.
  *  - In case an unrepairable extent is encountered, track which files are
  *affected and report them
  *  - In case of a read error on files with nodatasum, map the file and read
  *the extent to trigger a writeback of the good copy
  *  - track and record media errors, throw out bad devices
  *  - add a mode to also read unallocated space
- *  - make the prefetch cancellable
  */
 
 struct scrub_bio;
@@ -741,13 +738,16 @@ static noinline_for_stack int scrub_stripe(struct 
scrub_dev *sdev,
int slot;
int i;
u64 nstripes;
-   int start_stripe;
struct extent_buffer *l;
struct btrfs_key key;
u64 physical;
u64 logical;
u64 generation;
u64 mirror_num;
+   void *reada1;
+   void *reada2;
+   struct btrfs_key key_start;
+   struct btrfs_key key_end;
 
u64 increment = map-stripe_len;
u64 offset;
@@ -779,81 +779,67 @@ static noinline_for_stack int scrub_stripe(struct 
scrub_dev *sdev,
if (!path)
return -ENOMEM;
 
-   path-reada = 2;
path-search_commit_root = 1;
path-skip_locking = 1;
 
/*
-* find all extents for each stripe and just read them to get
-* them into the page cache
-* FIXME: we can do better. build a more intelligent prefetching
+* trigger the readahead for extent tree csum tree and wait for
+* completion. During readahead, the scrub is officially paused
+* to not hold off transaction commits
 */
logical = base + offset;
-   physical = map-stripes[num].physical;
-   ret = 0;
-   for (i = 0; i  nstripes; ++i) {
-   key.objectid = logical;
-   key.type = BTRFS_EXTENT_ITEM_KEY;
-   key.offset = (u64)0;
-
-   ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
-   if (ret  0)
-   goto out_noplug;
-
-   /*
-* we might miss half an extent here, but that doesn't matter,
-* as it's only the prefetch
-*/
-   while (1) {
-   l = path-nodes[0];
-   slot = path-slots[0];
-   if (slot = btrfs_header_nritems(l)) {
-   ret = btrfs_next_leaf(root, path);
-   if (ret == 0)
-   continue;
-   if (ret  0)
-   goto out_noplug;
 
-   break;
-   }
-   btrfs_item_key_to_cpu(l, key, slot);
+   wait_event(sdev-list_wait,
+  atomic_read(sdev-in_flight) == 0);
+   atomic_inc(fs_info-scrubs_paused);
+   wake_up(fs_info-scrub_pause_wait);
 
-   if (key.objectid = logical + map-stripe_len)
-   break;
+   /* FIXME it might be better to start readahead at commit root */
+   key_start.objectid = logical;
+   key_start.type = BTRFS_EXTENT_ITEM_KEY;
+   key_start.offset = (u64)0;
+   key_end.objectid = base + offset + nstripes * increment;
+   key_end.type = BTRFS_EXTENT_ITEM_KEY;
+   key_end.offset = (u64)0;
+   reada1 = btrfs_reada_add(root, key_start, key_end);
+
+   key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+   key_start.type = BTRFS_EXTENT_CSUM_KEY;
+   key_start.offset = logical;
+   key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+   key_end.type = BTRFS_EXTENT_CSUM_KEY;
+   key_end.offset = base + offset + nstripes * increment;
+   reada2 = btrfs_reada_add(csum_root, key_start, key_end);
+
+   if (!IS_ERR(reada1))
+   btrfs_reada_wait(reada1);
+   if (!IS_ERR(reada2))
+   btrfs_reada_wait(reada2);
 
-   path-slots[0]++;
-   }
-   btrfs_release_path(path);
-   logical += increment;
-   

Re: kernel BUG at fs/btrfs/inode.c:4676!

2011-06-10 Thread Josef Bacik
On 06/09/2011 10:06 PM, Daniel J Blueman wrote:
 On 10 June 2011 09:57, Andy Lutomirski l...@mit.edu wrote:
 On 06/06/2011 06:19 AM, Marek Otahal wrote:

 Hello,
 the issue happens every time when i have to hard power-off my notebook
 (suspend problems).
 With kernel 2.6.39 the partition is unmountable, solution is to boot
 2.6.38 kernel which
 1/ is able to mount the partition,
 2/ by doing that fixes the problem so later .39 (after clean shutdown) can
 mount it also.

 Same problem here.  Mounting with 2.6.38 says:

 [   41.906259] Btrfs loaded
 [   41.906747] device fsid e040a9d60da49596-66c0275e348878bf devid 1 transid
 69217 /dev/mapper/vg_midnight_ssd-home
 [   41.908767] btrfs: disk space caching is enabled
 [   42.232185] btrfs: unlinked 17 orphans
 [   42.232189] btrfs: truncated 2 orphans

 dmesg in 2.6.39.1 says:
 []
 [   15.004255] kernel BUG at fs/btrfs/inode.c:4676!
 []
 
 I've been experiencing the same issue also.
 
 Josef/Chris, would an metadata snapshot or full block snapshot help
 debug this regression? I can probably setup a small testcase to
 trigger this.
 

If you can come up with a testcase to reproduce I would love you forever
;).  If I get done what I wanted to do today I will try and reproduce.
Thanks,

Josef
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected

2011-06-10 Thread Jim Schutt

David Sterba wrote:

Hi,

a candidate fix:

http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-unstable.git;a=commit;h=aa0467d8d2a00e75b2bb6a56a4ee6d70c5d1928f


 With Linus' tree, today's linux-next build (powercp ppc64_defconfig)
 produced this warning:

 fs/btrfs/delayed-inode.c: In function 'btrfs_delayed_update_inode':
 fs/btrfs/delayed-inode.c:1598:6: warning: 'ret' may be used
 uninitialized in this function
 Introduced by commit 16cdcec736cd (btrfs: implement delayed inode items
 operation).

 This fixes a bug in btrfs_update_inode(): if the returned value from
 btrfs_delayed_update_inode is a nonzero garbage, inode stat data are not
 updated and several call paths may hit a BUG_ON or fail with strange
 code.


if you can reproduce it reliably, add this patch on top of the delayed inodes.



I cherry-picked aa0467d8d2a00e on top of 16cdcec736cd21, which
gave me the following instead of a BUG:

[  246.986087] [ cut here ]
[  246.990714] WARNING: at mm/page_alloc.c:2032 
__alloc_pages_slowpath+0x54/0x3c5()
[  246.998100] Hardware name: PowerEdge 1950
[  247.002110] Modules linked in: btrfs zlib_deflate lzo_compress 
ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state 
nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp 
i2c_dev i2c_core ext3 jbd scsi_transport_iscsi rds ib_ipoib rdma_ucm rdma_cm 
ib_ucm ib_uverbs ib_umad ib_cm iw_cm ib_addr ipv6 ib_sa dm_mirror 
dm_region_hash dm_log dm_multipath scsi_dh dm_mod video sbs sbshc pci_slot 
battery acpi_pad ac kvm sg ses sd_mod enclosure megaraid_sas ide_cd_mod cdrom 
ib_mthca qla2xxx serio_raw ib_mad ib_core scsi_transport_fc button scsi_tgt 
ata_piix libata scsi_mod dcdbas i5k_amb tpm_tis tpm ioatdma hwmon tpm_bios 
ehci_hcd dca i5000_edac pcspkr iTCO_wdt uhci_hcd iTCO_vendor_support edac_core 
rtc nfs nfs_acl auth_rpcgss fscache lockd sunrpc tg3 bnx2 e1000 [last unloaded: 
freq_table]
[  247.076305] Pid: 6044, comm: cosd Not tainted 2.6.39-2-gf47e9fd #25
[  247.082911] Call Trace:
[  247.085358]  [810ccf8a] ? __alloc_pages_slowpath+0x54/0x3c5
[  247.091793]  [81049379] ? warn_slowpath_common+0x85/0x9e
[  247.097974]  [810493ac] ? warn_slowpath_null+0x1a/0x1c
[  247.103974]  [810ccf8a] ? __alloc_pages_slowpath+0x54/0x3c5
[  247.110409]  [810ccae2] ? get_page_from_freelist+0x166/0x198
[  247.116928]  [810cd3c5] ? __alloc_pages_nodemask+0xca/0xf4
[  247.123297]  [a0702884] ? unmap_extent_buffer+0x11/0x13 [btrfs]
[  247.130079]  [810fd957] ? alloc_pages_current+0xa3/0xac
[  247.136166]  [810cc5f7] ? alloc_pages+0xe/0x10
[  247.141472]  [810cc607] ? __get_free_pages+0xe/0x4b
[  247.147218]  [811061eb] ? kmalloc_order_trace+0x27/0x55
[  247.153304]  [8110664e] ? __kmalloc+0x37/0x100
[  247.158625]  [a072622a] ? btrfs_batch_insert_items+0xe0/0x229 
[btrfs]
[  247.165933]  [a06d454b] ? btrfs_block_rsv_release+0x39/0x3b [btrfs]
[  247.173072]  [a07265e7] ? btrfs_insert_delayed_items+0xac/0xef 
[btrfs]
[  247.180472]  [a0726798] ? btrfs_run_delayed_items+0x68/0xd9 [btrfs]
[  247.187610]  [a06e85fc] ? btrfs_commit_transaction+0x266/0x5c9 
[btrfs]
[  247.195000]  [81066118] ? list_del_init+0x21/0x21
[  247.200583]  [a0710c4c] ? create_subvol+0x420/0x440 [btrfs]
[  247.207018]  [810362ec] ? need_resched+0x23/0x2d
[  247.212511]  [a0710d7a] ? btrfs_mksubvol+0x10e/0x167 [btrfs]
[  247.219043]  [a071129f] ? 
btrfs_ioctl_snap_create_transid+0x9c/0x121 [btrfs]
[  247.226962]  [a071145e] ? btrfs_ioctl_snap_create+0x50/0x67 [btrfs]
[  247.234101]  [a0712c5a] ? btrfs_ioctl+0x1d0/0x2c6 [btrfs]
[  247.240364]  [8111fe12] ? vfs_ioctl+0x1d/0x34
[  247.245582]  [8112048d] ? do_vfs_ioctl+0x171/0x17a
[  247.251242]  [811130a2] ? fget_light+0x69/0x81
[  247.256549]  [811204f2] ? sys_ioctl+0x5c/0x7c
[  247.261770]  [8111d48d] ? putname+0x33/0x37
[  247.266819]  [813b21eb] ? system_call_fastpath+0x16/0x1b
[  247.272993] ---[ end trace 9c75d74017f060f5 ]---

The mkcephfs command I was attempting succeeded, so I'm
not sure if the above matters; mm/page_alloc.c:2032 is

/*
 * In the slowpath, we sanity check order to avoid ever trying to
 * reclaim = MAX_ORDER areas which will never succeed. Callers may
 * be using allocators in order of preference for an area that is
 * too large.
 */
if (order = MAX_ORDER) {
WARN_ON_ONCE(!(gfp_mask  __GFP_NOWARN));
return NULL;
}

When I did my bisection, my criteria for success/failure was
did mkcephfs succeed?.  When I apply this criteria to a recent
linus kernel (e.g. 06e86849cf4019), which includes the fix you
mentioned (aa0467d8d2a00e), I get still a different failure mode,
which doesn't actually reference btrfs:

[  276.364178] 

Re: [PATCH 2/2] Btrfs: serialize flushers in reserve_metadata_bytes

2011-06-10 Thread David Sterba
On Thu, Jun 09, 2011 at 08:00:15PM +0200, David Sterba wrote:
 On Thu, Jun 09, 2011 at 10:00:42AM -0400, Josef Bacik wrote:
  We're not trying to be perfect here, we're trying to be fast :).
 
 Be even faster with smp_rmb() :)

Arne made me think about this again. Let's analyze it in more detail:

The read side, if(delalloc_bytes), utilizes a full barrier, which will
force all cpus to flush pending reads and writes. This will ensure 'if'
will see a fresh value.

However, there is no pairing write barrier and the write flush will
happen at some point in time, (delalloc_bytes += len), but completely
unsynchronized with the read side.

The smp_mb barrier has no desired synchonization effect. Moreover, it
has a performance hit.


Doing it right with barriers would mean to add smp_rmb before the
if(...) and smp_wmb after the delalloc_bytes =, but only in the case
the variable is solely synchronized via barriers. Not our case. There is
the spinlock.

As strict correctness is not needed here, you admit that delalloc_bytes
might not correspond to the state of fs_info-delalloc_inodes and this
is not a problem. Fine. But then you do not need the smp_mb. The value
of delalloc_bytes will be random (ie. unsynchronized), with or without
the barrier. Please drop it from the patch.


david
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] Btrfs: serialize flushers in reserve_metadata_bytes

2011-06-10 Thread Josef Bacik
On 06/10/2011 01:47 PM, David Sterba wrote:
 On Thu, Jun 09, 2011 at 08:00:15PM +0200, David Sterba wrote:
 On Thu, Jun 09, 2011 at 10:00:42AM -0400, Josef Bacik wrote:
 We're not trying to be perfect here, we're trying to be fast :).

 Be even faster with smp_rmb() :)
 
 Arne made me think about this again. Let's analyze it in more detail:
 
 The read side, if(delalloc_bytes), utilizes a full barrier, which will
 force all cpus to flush pending reads and writes. This will ensure 'if'
 will see a fresh value.
 
 However, there is no pairing write barrier and the write flush will
 happen at some point in time, (delalloc_bytes += len), but completely
 unsynchronized with the read side.
 
 The smp_mb barrier has no desired synchonization effect. Moreover, it
 has a performance hit.
 
 
 Doing it right with barriers would mean to add smp_rmb before the
 if(...) and smp_wmb after the delalloc_bytes =, but only in the case
 the variable is solely synchronized via barriers. Not our case. There is
 the spinlock.
 
 As strict correctness is not needed here, you admit that delalloc_bytes
 might not correspond to the state of fs_info-delalloc_inodes and this
 is not a problem. Fine. But then you do not need the smp_mb. The value
 of delalloc_bytes will be random (ie. unsynchronized), with or without
 the barrier. Please drop it from the patch.

I just used the spin lock.

Josef
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected

2011-06-10 Thread Chris Mason
Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400:

[ two different btrfs crashes ]

I think your two crashes in btrfs were from the uninit variables and
those should be fixed in rc2.

 When I did my bisection, my criteria for success/failure was
 did mkcephfs succeed?.  When I apply this criteria to a recent
 linus kernel (e.g. 06e86849cf4019), which includes the fix you
 mentioned (aa0467d8d2a00e), I get still a different failure mode,
 which doesn't actually reference btrfs:
 
 [  276.364178] BUG: unable to handle kernel NULL pointer dereference at 
 000a
 [  276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd]

Looking at the resulting code in the oops, we're here in journal_start:

if (handle) {
J_ASSERT(handle-h_transaction-t_journal == journal);

handle comes from current-journal_info, and we're doing a deref on
handle-h_transaction, which is probably 0xa.

So, we're leaving crud in current-journal_info and ext3 is finding it.

Perhaps its from ceph starting a transaction but leaving it running?
The bug came with Josef's transaction performance fixes, but it is
probably a mixture of his code with the ioctls ceph is using.

[ rest of the oops below for context ]

-chris

 [  276.365127] PGD 1e4469067 PUD 1e1658067 PMD 0
 [  276.365127] Oops:  [#1] SMP
 [  276.365127] CPU 2
 [  276.365127] Modules linked in: btrfs zlib_deflate lzo_compress 
 ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state 
 nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge 
 stp i2c_dev i2c_core ext3 jbd scsi_transport_iscsi rds ib_ipoib rdma_ucm 
 rdma_cm ib_ucm ib_uverbs ib_umad ib_cm iw_cm ib_addr ipv6 ib_sa dm_mirror 
 dm_region_hash dm_log dm_multipath scsi_dh dm_mod video sbs sbshc pci_slot 
 battery acpi_pad ac kvm sg ses sd_mod enclosure megaraid_sas ide_cd_mod cdrom 
 ib_mthca ib_mad qla2xxx button ib_core serio_raw scsi_transport_fc scsi_tgt 
 dcdbas ata_piix libata tpm_tis tpm i5k_amb ioatdma tpm_bios hwmon iTCO_wdt 
 scsi_mod i5000_edac iTCO_vendor_support ehci_hcd dca edac_core uhci_hcd 
 pcspkr rtc nfs nfs_acl auth_rpcgss fscache lockd sunrpc tg3 bnx2 e1000 [last 
 unloaded: freq_table]
 [  276.365127]
 [  276.365127] Pid: 6076, comm: cosd Not tainted 3.0.0-rc2-00196-g06e8684 #26 
 Dell Inc. PowerEdge 1950/0DT097
 [  276.365127] RIP: 0010:[a05434b1]  [a05434b1] 
 journal_start+0x3e/0x9c [jbd]
 [  276.365127] RSP: 0018:8801e2897b28  EFLAGS: 00010286
 [  276.365127] RAX: 000a RBX: 8801de8e1090 RCX: 
 0002
 [  276.365127] RDX: 19b2d000 RSI: 000e RDI: 
 000e
 [  276.365127] RBP: 8801e2897b48 R08: 0003 R09: 
 8801e2897c38
 [  276.365127] R10: 8801e2897ed8 R11: 0001 R12: 
 880223ff4400
 [  276.365127] R13: 880218522d60 R14: 0ec6 R15: 
 88021f54d878
 [  276.365127] FS:  7f8ff0bbb710() GS:88022fc8() 
 knlGS:
 [  276.365127] CS:  0010 DS:  ES:  CR0: 8005003b
 [  276.365127] CR2: 000a CR3: 00021744f000 CR4: 
 06e0
 [  276.365127] DR0:  DR1:  DR2: 
 
 [  276.365127] DR3:  DR6: 0ff0 DR7: 
 0400
 [  276.365127] Process cosd (pid: 6076, threadinfo 8801e2896000, task 
 880218522d60)
 [  276.365127] Stack:
 [  276.365127]  8801e2897b68 ea000756e788 88021f54d728 
 8801e2897c78
 [  276.365127]  8801e2897b58 a05670ce 8801e2897b68 
 a055c72d
 [  276.365127]  8801e2897be8 a055f044 8801e2897c38 
 0074
 [  276.365127] Call Trace:
 [  276.365127]  [a05670ce] ext3_journal_start_sb+0x4f/0x51 [ext3]
 [  276.365127]  [a055c72d] ext3_journal_start+0x12/0x14 [ext3]
 [  276.365127]  [a055f044] ext3_write_begin+0x93/0x1a1 [ext3]
 [  276.365127]  [810c6f0e] ? __kunmap_atomic+0xe/0x10
 [  276.365127]  [810c75e5] generic_perform_write+0xb1/0x172
 [  276.365127]  [81036a33] ? need_resched+0x23/0x2d
 [  276.365127]  [810c76ea] generic_file_buffered_write+0x44/0x6f
 [  276.365127]  [810c91f5] __generic_file_aio_write+0x253/0x2a8
 [  276.365127]  [810c92ad] generic_file_aio_write+0x63/0xb8
 [  276.365127]  [81113b26] do_sync_write+0xc7/0x10b
 [  276.365127]  [81036a4b] ? should_resched+0xe/0x2f
 [  276.365127]  [813b0faf] ? _cond_resched+0xe/0x22
 [  276.365127]  [811986c3] ? security_file_permission+0x2c/0x31
 [  276.365127]  [81113d21] ? rw_verify_area+0xac/0xdb
 [  276.365127]  [81114253] vfs_write+0xac/0xe4
 [  276.365127]  [8111434f] sys_write+0x4c/0x71
 [  276.365127]  [813b8beb] system_call_fastpath+0x16/0x1b
 [  276.365127] Code: 89 fc 48 c7 c3 e2 ff ff ff 89 f7 65 4c 8b 2c 25 c0 b5 00 
 00 4d 85 e4 49 8b 85 48 06 00 00 74 

Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected

2011-06-10 Thread Sage Weil
On Fri, 10 Jun 2011, Chris Mason wrote:
 Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400:
 
 [ two different btrfs crashes ]
 
 I think your two crashes in btrfs were from the uninit variables and
 those should be fixed in rc2.
 
  When I did my bisection, my criteria for success/failure was
  did mkcephfs succeed?.  When I apply this criteria to a recent
  linus kernel (e.g. 06e86849cf4019), which includes the fix you
  mentioned (aa0467d8d2a00e), I get still a different failure mode,
  which doesn't actually reference btrfs:
  
  [  276.364178] BUG: unable to handle kernel NULL pointer dereference at 
  000a
  [  276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd]
 
 Looking at the resulting code in the oops, we're here in journal_start:
 
 if (handle) {
 J_ASSERT(handle-h_transaction-t_journal == journal);
 
 handle comes from current-journal_info, and we're doing a deref on
 handle-h_transaction, which is probably 0xa.
 
 So, we're leaving crud in current-journal_info and ext3 is finding it.
 
 Perhaps its from ceph starting a transaction but leaving it running?
 The bug came with Josef's transaction performance fixes, but it is
 probably a mixture of his code with the ioctls ceph is using.

Ah, yeah, that's the problem.  We saw a similar problem a while back with 
the start/stop transaction ioctls.  In this case, create_snapshot is doing

trans = btrfs_start_transaction(root-fs_info-extent_root, 5);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto fail;
}

which sets current-journal_info.  Then

ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
BUG_ON(ret);

list_add(pending_snapshot-list,
 trans-transaction-pending_snapshots);
if (async_transid) {
*async_transid = trans-transid;
ret = btrfs_commit_transaction_async(trans,
 root-fs_info-extent_root, 1);
} else {
ret = btrfs_commit_transaction(trans,
   root-fs_info-extent_root);
}

but the async snap creation ioctl takes the async path, which runs 
btrfs_commit_transaction in a worker thread.

I'm not sure what the right thing to do is here is... can whatever is in 
journal_info be attached to trans instead in 
btrfs_commit_transaction_async()?

sage






 
 [ rest of the oops below for context ]
 
 -chris
 
  [  276.365127] PGD 1e4469067 PUD 1e1658067 PMD 0
  [  276.365127] Oops:  [#1] SMP
  [  276.365127] CPU 2
  [  276.365127] Modules linked in: btrfs zlib_deflate lzo_compress 
  ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state 
  nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge 
  stp i2c_dev i2c_core ext3 jbd scsi_transport_iscsi rds ib_ipoib rdma_ucm 
  rdma_cm ib_ucm ib_uverbs ib_umad ib_cm iw_cm ib_addr ipv6 ib_sa dm_mirror 
  dm_region_hash dm_log dm_multipath scsi_dh dm_mod video sbs sbshc pci_slot 
  battery acpi_pad ac kvm sg ses sd_mod enclosure megaraid_sas ide_cd_mod 
  cdrom ib_mthca ib_mad qla2xxx button ib_core serio_raw scsi_transport_fc 
  scsi_tgt dcdbas ata_piix libata tpm_tis tpm i5k_amb ioatdma tpm_bios hwmon 
  iTCO_wdt scsi_mod i5000_edac iTCO_vendor_support ehci_hcd dca edac_core 
  uhci_hcd pcspkr rtc nfs nfs_acl auth_rpcgss fscache lockd sunrpc tg3 bnx2 
  e1000 [last unloaded: freq_table]
  [  276.365127]
  [  276.365127] Pid: 6076, comm: cosd Not tainted 3.0.0-rc2-00196-g06e8684 
  #26 Dell Inc. PowerEdge 1950/0DT097
  [  276.365127] RIP: 0010:[a05434b1]  [a05434b1] 
  journal_start+0x3e/0x9c [jbd]
  [  276.365127] RSP: 0018:8801e2897b28  EFLAGS: 00010286
  [  276.365127] RAX: 000a RBX: 8801de8e1090 RCX: 
  0002
  [  276.365127] RDX: 19b2d000 RSI: 000e RDI: 
  000e
  [  276.365127] RBP: 8801e2897b48 R08: 0003 R09: 
  8801e2897c38
  [  276.365127] R10: 8801e2897ed8 R11: 0001 R12: 
  880223ff4400
  [  276.365127] R13: 880218522d60 R14: 0ec6 R15: 
  88021f54d878
  [  276.365127] FS:  7f8ff0bbb710() GS:88022fc8() 
  knlGS:
  [  276.365127] CS:  0010 DS:  ES:  CR0: 8005003b
  [  276.365127] CR2: 000a CR3: 00021744f000 CR4: 
  06e0
  [  276.365127] DR0:  DR1:  DR2: 
  
  [  276.365127] DR3:  DR6: 0ff0 DR7: 
  0400
  [  276.365127] Process cosd (pid: 6076, threadinfo 8801e2896000, task 
  880218522d60)
  [  276.365127] Stack:
  [  276.365127]  8801e2897b68 ea000756e788 88021f54d728 
  8801e2897c78
  [  276.365127]  8801e2897b58 a05670ce 8801e2897b68 
  a055c72d
  [  276.365127]  8801e2897be8 a055f044 

Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected

2011-06-10 Thread Sage Weil
On Fri, 10 Jun 2011, Sage Weil wrote:
 On Fri, 10 Jun 2011, Chris Mason wrote:
  Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400:
  
  [ two different btrfs crashes ]
  
  I think your two crashes in btrfs were from the uninit variables and
  those should be fixed in rc2.
  
   When I did my bisection, my criteria for success/failure was
   did mkcephfs succeed?.  When I apply this criteria to a recent
   linus kernel (e.g. 06e86849cf4019), which includes the fix you
   mentioned (aa0467d8d2a00e), I get still a different failure mode,
   which doesn't actually reference btrfs:
   
   [  276.364178] BUG: unable to handle kernel NULL pointer dereference at 
   000a
   [  276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd]
  
  Looking at the resulting code in the oops, we're here in journal_start:
  
  if (handle) {
  J_ASSERT(handle-h_transaction-t_journal == journal);
  
  handle comes from current-journal_info, and we're doing a deref on
  handle-h_transaction, which is probably 0xa.
  
  So, we're leaving crud in current-journal_info and ext3 is finding it.
  
  Perhaps its from ceph starting a transaction but leaving it running?
  The bug came with Josef's transaction performance fixes, but it is
  probably a mixture of his code with the ioctls ceph is using.
 
 Ah, yeah, that's the problem.  We saw a similar problem a while back with 
 the start/stop transaction ioctls.  In this case, create_snapshot is doing
 
   trans = btrfs_start_transaction(root-fs_info-extent_root, 5);
   if (IS_ERR(trans)) {
   ret = PTR_ERR(trans);
   goto fail;
   }
 
 which sets current-journal_info.  Then
 
   ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
   BUG_ON(ret);
 
   list_add(pending_snapshot-list,
trans-transaction-pending_snapshots);
   if (async_transid) {
   *async_transid = trans-transid;
   ret = btrfs_commit_transaction_async(trans,
root-fs_info-extent_root, 1);
   } else {
   ret = btrfs_commit_transaction(trans,
  root-fs_info-extent_root);
   }
 
 but the async snap creation ioctl takes the async path, which runs 
 btrfs_commit_transaction in a worker thread.
 
 I'm not sure what the right thing to do is here is... can whatever is in 
 journal_info be attached to trans instead in 
 btrfs_commit_transaction_async()?

It looks like it's not used for anything in btrfs, actually; it's just set 
and cleared.  What's the point of that?

Anyway, assuming it's useful, I think the below would fix the problem.  
Want to give it a shot, Jim?

Thanks!
sage


diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c571734..fd04ad7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1196,6 +1196,9 @@ int btrfs_commit_transaction_async(struct 
btrfs_trans_handle *trans,
put_transaction(cur_trans);
mutex_unlock(root-fs_info-trans_mutex);
 
+   if (current-journal_info == trans)
+   current-journal_info = NULL;
+
return 0;
 }
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected

2011-06-10 Thread Josef Bacik
On 06/10/2011 02:14 PM, Sage Weil wrote:
 On Fri, 10 Jun 2011, Sage Weil wrote:
 On Fri, 10 Jun 2011, Chris Mason wrote:
 Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400:

 [ two different btrfs crashes ]

 I think your two crashes in btrfs were from the uninit variables and
 those should be fixed in rc2.

 When I did my bisection, my criteria for success/failure was
 did mkcephfs succeed?.  When I apply this criteria to a recent
 linus kernel (e.g. 06e86849cf4019), which includes the fix you
 mentioned (aa0467d8d2a00e), I get still a different failure mode,
 which doesn't actually reference btrfs:

 [  276.364178] BUG: unable to handle kernel NULL pointer dereference at 
 000a
 [  276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd]

 Looking at the resulting code in the oops, we're here in journal_start:

 if (handle) {
 J_ASSERT(handle-h_transaction-t_journal == journal);

 handle comes from current-journal_info, and we're doing a deref on
 handle-h_transaction, which is probably 0xa.

 So, we're leaving crud in current-journal_info and ext3 is finding it.

 Perhaps its from ceph starting a transaction but leaving it running?
 The bug came with Josef's transaction performance fixes, but it is
 probably a mixture of his code with the ioctls ceph is using.

 Ah, yeah, that's the problem.  We saw a similar problem a while back with 
 the start/stop transaction ioctls.  In this case, create_snapshot is doing

  trans = btrfs_start_transaction(root-fs_info-extent_root, 5);
  if (IS_ERR(trans)) {
  ret = PTR_ERR(trans);
  goto fail;
  }

 which sets current-journal_info.  Then

  ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
  BUG_ON(ret);

  list_add(pending_snapshot-list,
   trans-transaction-pending_snapshots);
  if (async_transid) {
  *async_transid = trans-transid;
  ret = btrfs_commit_transaction_async(trans,
   root-fs_info-extent_root, 1);
  } else {
  ret = btrfs_commit_transaction(trans,
 root-fs_info-extent_root);
  }

 but the async snap creation ioctl takes the async path, which runs 
 btrfs_commit_transaction in a worker thread.

 I'm not sure what the right thing to do is here is... can whatever is in 
 journal_info be attached to trans instead in 
 btrfs_commit_transaction_async()?
 
 It looks like it's not used for anything in btrfs, actually; it's just set 
 and cleared.  What's the point of that?
 

It is used now, check the beginning of start_transaction().  Thanks,

Josef
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected

2011-06-10 Thread Sage Weil
On Fri, 10 Jun 2011, Josef Bacik wrote:
 On 06/10/2011 02:14 PM, Sage Weil wrote:
  On Fri, 10 Jun 2011, Sage Weil wrote:
  On Fri, 10 Jun 2011, Chris Mason wrote:
  Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400:
 
  [ two different btrfs crashes ]
 
  I think your two crashes in btrfs were from the uninit variables and
  those should be fixed in rc2.
 
  When I did my bisection, my criteria for success/failure was
  did mkcephfs succeed?.  When I apply this criteria to a recent
  linus kernel (e.g. 06e86849cf4019), which includes the fix you
  mentioned (aa0467d8d2a00e), I get still a different failure mode,
  which doesn't actually reference btrfs:
 
  [  276.364178] BUG: unable to handle kernel NULL pointer dereference at 
  000a
  [  276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd]
 
  Looking at the resulting code in the oops, we're here in journal_start:
 
  if (handle) {
  J_ASSERT(handle-h_transaction-t_journal == journal);
 
  handle comes from current-journal_info, and we're doing a deref on
  handle-h_transaction, which is probably 0xa.
 
  So, we're leaving crud in current-journal_info and ext3 is finding it.
 
  Perhaps its from ceph starting a transaction but leaving it running?
  The bug came with Josef's transaction performance fixes, but it is
  probably a mixture of his code with the ioctls ceph is using.
 
  Ah, yeah, that's the problem.  We saw a similar problem a while back with 
  the start/stop transaction ioctls.  In this case, create_snapshot is doing
 
 trans = btrfs_start_transaction(root-fs_info-extent_root, 5);
 if (IS_ERR(trans)) {
 ret = PTR_ERR(trans);
 goto fail;
 }
 
  which sets current-journal_info.  Then
 
 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
 BUG_ON(ret);
 
 list_add(pending_snapshot-list,
  trans-transaction-pending_snapshots);
 if (async_transid) {
 *async_transid = trans-transid;
 ret = btrfs_commit_transaction_async(trans,
  root-fs_info-extent_root, 1);
 } else {
 ret = btrfs_commit_transaction(trans,
root-fs_info-extent_root);
 }
 
  but the async snap creation ioctl takes the async path, which runs 
  btrfs_commit_transaction in a worker thread.
 
  I'm not sure what the right thing to do is here is... can whatever is in 
  journal_info be attached to trans instead in 
  btrfs_commit_transaction_async()?
  
  It looks like it's not used for anything in btrfs, actually; it's just set 
  and cleared.  What's the point of that?
  
 
 It is used now, check the beginning of start_transaction().  Thanks,

Oh I see, okay.

So clearing it in btrfs_commit_transaction_async should be fine then, 
right?  When btrfs_commit_transaction runs in the other thread it won't 
care that current-journal_info is NULL.

sage
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected

2011-06-10 Thread Josef Bacik
On 06/10/2011 02:35 PM, Sage Weil wrote:
 On Fri, 10 Jun 2011, Josef Bacik wrote:
 On 06/10/2011 02:14 PM, Sage Weil wrote:
 On Fri, 10 Jun 2011, Sage Weil wrote:
 On Fri, 10 Jun 2011, Chris Mason wrote:
 Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400:

 [ two different btrfs crashes ]

 I think your two crashes in btrfs were from the uninit variables and
 those should be fixed in rc2.

 When I did my bisection, my criteria for success/failure was
 did mkcephfs succeed?.  When I apply this criteria to a recent
 linus kernel (e.g. 06e86849cf4019), which includes the fix you
 mentioned (aa0467d8d2a00e), I get still a different failure mode,
 which doesn't actually reference btrfs:

 [  276.364178] BUG: unable to handle kernel NULL pointer dereference at 
 000a
 [  276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd]

 Looking at the resulting code in the oops, we're here in journal_start:

 if (handle) {
 J_ASSERT(handle-h_transaction-t_journal == journal);

 handle comes from current-journal_info, and we're doing a deref on
 handle-h_transaction, which is probably 0xa.

 So, we're leaving crud in current-journal_info and ext3 is finding it.

 Perhaps its from ceph starting a transaction but leaving it running?
 The bug came with Josef's transaction performance fixes, but it is
 probably a mixture of his code with the ioctls ceph is using.

 Ah, yeah, that's the problem.  We saw a similar problem a while back with 
 the start/stop transaction ioctls.  In this case, create_snapshot is doing

trans = btrfs_start_transaction(root-fs_info-extent_root, 5);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto fail;
}

 which sets current-journal_info.  Then

ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
BUG_ON(ret);

list_add(pending_snapshot-list,
 trans-transaction-pending_snapshots);
if (async_transid) {
*async_transid = trans-transid;
ret = btrfs_commit_transaction_async(trans,
 root-fs_info-extent_root, 1);
} else {
ret = btrfs_commit_transaction(trans,
   root-fs_info-extent_root);
}

 but the async snap creation ioctl takes the async path, which runs 
 btrfs_commit_transaction in a worker thread.

 I'm not sure what the right thing to do is here is... can whatever is in 
 journal_info be attached to trans instead in 
 btrfs_commit_transaction_async()?

 It looks like it's not used for anything in btrfs, actually; it's just set 
 and cleared.  What's the point of that?


 It is used now, check the beginning of start_transaction().  Thanks,
 
 Oh I see, okay.
 
 So clearing it in btrfs_commit_transaction_async should be fine then, 
 right?  When btrfs_commit_transaction runs in the other thread it won't 
 care that current-journal_info is NULL.
 

Oh yeah your patch is good :),

Josef
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected

2011-06-10 Thread Sage Weil
On Fri, 10 Jun 2011, Josef Bacik wrote:
 On 06/10/2011 02:35 PM, Sage Weil wrote:
  On Fri, 10 Jun 2011, Josef Bacik wrote:
  On 06/10/2011 02:14 PM, Sage Weil wrote:
  On Fri, 10 Jun 2011, Sage Weil wrote:
  On Fri, 10 Jun 2011, Chris Mason wrote:
  Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400:
 
  [ two different btrfs crashes ]
 
  I think your two crashes in btrfs were from the uninit variables and
  those should be fixed in rc2.
 
  When I did my bisection, my criteria for success/failure was
  did mkcephfs succeed?.  When I apply this criteria to a recent
  linus kernel (e.g. 06e86849cf4019), which includes the fix you
  mentioned (aa0467d8d2a00e), I get still a different failure mode,
  which doesn't actually reference btrfs:
 
  [  276.364178] BUG: unable to handle kernel NULL pointer dereference 
  at 000a
  [  276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd]
 
  Looking at the resulting code in the oops, we're here in journal_start:
 
  if (handle) {
  J_ASSERT(handle-h_transaction-t_journal == journal);
 
  handle comes from current-journal_info, and we're doing a deref on
  handle-h_transaction, which is probably 0xa.
 
  So, we're leaving crud in current-journal_info and ext3 is finding it.
 
  Perhaps its from ceph starting a transaction but leaving it running?
  The bug came with Josef's transaction performance fixes, but it is
  probably a mixture of his code with the ioctls ceph is using.
 
  Ah, yeah, that's the problem.  We saw a similar problem a while back 
  with 
  the start/stop transaction ioctls.  In this case, create_snapshot is 
  doing
 
   trans = btrfs_start_transaction(root-fs_info-extent_root, 5);
   if (IS_ERR(trans)) {
   ret = PTR_ERR(trans);
   goto fail;
   }
 
  which sets current-journal_info.  Then
 
   ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
   BUG_ON(ret);
 
   list_add(pending_snapshot-list,
trans-transaction-pending_snapshots);
   if (async_transid) {
   *async_transid = trans-transid;
   ret = btrfs_commit_transaction_async(trans,
root-fs_info-extent_root, 1);
   } else {
   ret = btrfs_commit_transaction(trans,
  root-fs_info-extent_root);
   }
 
  but the async snap creation ioctl takes the async path, which runs 
  btrfs_commit_transaction in a worker thread.
 
  I'm not sure what the right thing to do is here is... can whatever is in 
  journal_info be attached to trans instead in 
  btrfs_commit_transaction_async()?
 
  It looks like it's not used for anything in btrfs, actually; it's just 
  set 
  and cleared.  What's the point of that?
 
 
  It is used now, check the beginning of start_transaction().  Thanks,
  
  Oh I see, okay.
  
  So clearing it in btrfs_commit_transaction_async should be fine then, 
  right?  When btrfs_commit_transaction runs in the other thread it won't 
  care that current-journal_info is NULL.
  
 
 Oh yeah your patch is good :),

Okay cool.  Here's the fix with a proper changelog and a little 
use-after-free paranoia.

Thanks!
sage


From 9881c0752293769d5133c01dff3ab04c0c24c61b Mon Sep 17 00:00:00 2001
From: Sage Weil s...@newdream.net
Date: Fri, 10 Jun 2011 11:41:25 -0700
Subject: [PATCH] Btrfs: clear current-journal_info on async transaction commit

Normally current-jouranl_info is cleared by commit_transaction.  For an
async snap or subvol creation, though, it runs in a work queue.  Clear
it in btrfs_commit_transaction_async() to avoid leaking a non-NULL
journal_info when we return to userspace.  When the actual commit runs in
the other thread it won't care that it's current-journal_info is already
NULL.

Signed-off-by: Sage Weil s...@newdream.net
---
 fs/btrfs/transaction.c |5 -
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dd71966..9d516ed 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1118,8 +1118,11 @@ int btrfs_commit_transaction_async(struct 
btrfs_trans_handle *trans,
wait_current_trans_commit_start_and_unblock(root, cur_trans);
else
wait_current_trans_commit_start(root, cur_trans);
-   put_transaction(cur_trans);
 
+   if (current-journal_info == trans)
+   current-journal_info = NULL;
+
+   put_transaction(cur_trans);
return 0;
 }
 
-- 
1.7.0


--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected

2011-06-10 Thread Chris Mason
Excerpts from Josef Bacik's message of 2011-06-10 14:34:21 -0400:
 On 06/10/2011 02:35 PM, Sage Weil wrote:
  On Fri, 10 Jun 2011, Josef Bacik wrote:
  On 06/10/2011 02:14 PM, Sage Weil wrote:
  On Fri, 10 Jun 2011, Sage Weil wrote:
  On Fri, 10 Jun 2011, Chris Mason wrote:
  Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400:
 
  [ two different btrfs crashes ]
 
  I think your two crashes in btrfs were from the uninit variables and
  those should be fixed in rc2.
 
  When I did my bisection, my criteria for success/failure was
  did mkcephfs succeed?.  When I apply this criteria to a recent
  linus kernel (e.g. 06e86849cf4019), which includes the fix you
  mentioned (aa0467d8d2a00e), I get still a different failure mode,
  which doesn't actually reference btrfs:
 
  [  276.364178] BUG: unable to handle kernel NULL pointer dereference 
  at 000a
  [  276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd]
 
  Looking at the resulting code in the oops, we're here in journal_start:
 
  if (handle) {
  J_ASSERT(handle-h_transaction-t_journal == journal);
 
  handle comes from current-journal_info, and we're doing a deref on
  handle-h_transaction, which is probably 0xa.
 
  So, we're leaving crud in current-journal_info and ext3 is finding it.
 
  Perhaps its from ceph starting a transaction but leaving it running?
  The bug came with Josef's transaction performance fixes, but it is
  probably a mixture of his code with the ioctls ceph is using.
 
  Ah, yeah, that's the problem.  We saw a similar problem a while back 
  with 
  the start/stop transaction ioctls.  In this case, create_snapshot is 
  doing
 
  trans = btrfs_start_transaction(root-fs_info-extent_root, 5);
  if (IS_ERR(trans)) {
  ret = PTR_ERR(trans);
  goto fail;
  }
 
  which sets current-journal_info.  Then
 
  ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
  BUG_ON(ret);
 
  list_add(pending_snapshot-list,
   trans-transaction-pending_snapshots);
  if (async_transid) {
  *async_transid = trans-transid;
  ret = btrfs_commit_transaction_async(trans,
   root-fs_info-extent_root, 1);
  } else {
  ret = btrfs_commit_transaction(trans,
 root-fs_info-extent_root);
  }
 
  but the async snap creation ioctl takes the async path, which runs 
  btrfs_commit_transaction in a worker thread.
 
  I'm not sure what the right thing to do is here is... can whatever is in 
  journal_info be attached to trans instead in 
  btrfs_commit_transaction_async()?
 
  It looks like it's not used for anything in btrfs, actually; it's just 
  set 
  and cleared.  What's the point of that?
 
 
  It is used now, check the beginning of start_transaction().  Thanks,
  
  Oh I see, okay.
  
  So clearing it in btrfs_commit_transaction_async should be fine then, 
  right?  When btrfs_commit_transaction runs in the other thread it won't 
  care that current-journal_info is NULL.
  
 
 Oh yeah your patch is good :),

Thanks everyone (especially Jim).

-chris
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kernel BUG at fs/btrfs/inode.c:4676!

2011-06-10 Thread Andrew Lutomirski
On Fri, Jun 10, 2011 at 2:43 PM, Marek Otahal markota...@gmail.com wrote:

 The test-case is quite easy,
 1. mount the FS, just with compress-force=lzo option // I didn't try without, 
 but on my other btrfs partition that doesn't use compression the err never 
 happened ...so, can the others who experience the bug confirm compress=lzo 
 used?

Yes, I use compress=lzo.

--Andy
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected

2011-06-10 Thread Jim Schutt

Sage Weil wrote:

On Fri, 10 Jun 2011, Josef Bacik wrote:

On 06/10/2011 02:35 PM, Sage Weil wrote:

On Fri, 10 Jun 2011, Josef Bacik wrote:

On 06/10/2011 02:14 PM, Sage Weil wrote:

On Fri, 10 Jun 2011, Sage Weil wrote:

On Fri, 10 Jun 2011, Chris Mason wrote:

Excerpts from Jim Schutt's message of 2011-06-10 13:06:22 -0400:

[ two different btrfs crashes ]

I think your two crashes in btrfs were from the uninit variables and
those should be fixed in rc2.


When I did my bisection, my criteria for success/failure was
did mkcephfs succeed?.  When I apply this criteria to a recent
linus kernel (e.g. 06e86849cf4019), which includes the fix you
mentioned (aa0467d8d2a00e), I get still a different failure mode,
which doesn't actually reference btrfs:

[  276.364178] BUG: unable to handle kernel NULL pointer dereference at 
000a
[  276.365127] IP: [a05434b1] journal_start+0x3e/0x9c [jbd]

Looking at the resulting code in the oops, we're here in journal_start:

if (handle) {
J_ASSERT(handle-h_transaction-t_journal == journal);

handle comes from current-journal_info, and we're doing a deref on
handle-h_transaction, which is probably 0xa.

So, we're leaving crud in current-journal_info and ext3 is finding it.

Perhaps its from ceph starting a transaction but leaving it running?
The bug came with Josef's transaction performance fixes, but it is
probably a mixture of his code with the ioctls ceph is using.
Ah, yeah, that's the problem.  We saw a similar problem a while back with 
the start/stop transaction ioctls.  In this case, create_snapshot is doing


trans = btrfs_start_transaction(root-fs_info-extent_root, 5);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto fail;
}

which sets current-journal_info.  Then

ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
BUG_ON(ret);

list_add(pending_snapshot-list,
 trans-transaction-pending_snapshots);
if (async_transid) {
*async_transid = trans-transid;
ret = btrfs_commit_transaction_async(trans,
 root-fs_info-extent_root, 1);
} else {
ret = btrfs_commit_transaction(trans,
   root-fs_info-extent_root);
}

but the async snap creation ioctl takes the async path, which runs 
btrfs_commit_transaction in a worker thread.


I'm not sure what the right thing to do is here is... can whatever is in 
journal_info be attached to trans instead in 
btrfs_commit_transaction_async()?
It looks like it's not used for anything in btrfs, actually; it's just set 
and cleared.  What's the point of that?



It is used now, check the beginning of start_transaction().  Thanks,

Oh I see, okay.

So clearing it in btrfs_commit_transaction_async should be fine then, 
right?  When btrfs_commit_transaction runs in the other thread it won't 
care that current-journal_info is NULL.



Oh yeah your patch is good :),


Okay cool.  Here's the fix with a proper changelog and a little 
use-after-free paranoia.


This patch solves my issue - thanks a lot.

Tested-by: Jim Schutt jasc...@sandia.gov

-- Jim



Thanks!
sage



From 9881c0752293769d5133c01dff3ab04c0c24c61b Mon Sep 17 00:00:00 2001

From: Sage Weil s...@newdream.net
Date: Fri, 10 Jun 2011 11:41:25 -0700
Subject: [PATCH] Btrfs: clear current-journal_info on async transaction commit

Normally current-jouranl_info is cleared by commit_transaction.  For an
async snap or subvol creation, though, it runs in a work queue.  Clear
it in btrfs_commit_transaction_async() to avoid leaking a non-NULL
journal_info when we return to userspace.  When the actual commit runs in
the other thread it won't care that it's current-journal_info is already
NULL.

Signed-off-by: Sage Weil s...@newdream.net
---
 fs/btrfs/transaction.c |5 -
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dd71966..9d516ed 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1118,8 +1118,11 @@ int btrfs_commit_transaction_async(struct 
btrfs_trans_handle *trans,
wait_current_trans_commit_start_and_unblock(root, cur_trans);
else
wait_current_trans_commit_start(root, cur_trans);
-   put_transaction(cur_trans);
 
+	if (current-journal_info == trans)

+   current-journal_info = NULL;
+
+   put_transaction(cur_trans);
return 0;
 }
 



--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: use the normal checksumming infrastructure for free space cache

2011-06-10 Thread Josef Bacik
We used to store the checksums of the space cache directly in the space cache,
however that doesn't work out too well if we have more space than we can fit the
checksums into the first page.  So instead use the normal checksumming
infrastructure.  There were problems with doing this originally but those
problems don't exist now so this works out fine.  Thanks,

Signed-off-by: Josef Bacik jo...@redhat.com
---
 fs/btrfs/free-space-cache.c |  137 +--
 1 files changed, 28 insertions(+), 109 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 38f3fd9..17c26aa 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -98,6 +98,12 @@ struct inode *lookup_free_space_inode(struct btrfs_root 
*root,
return inode;
 
spin_lock(block_group-lock);
+   if (BTRFS_I(inode)-flags  BTRFS_INODE_NODATASUM) {
+   printk(KERN_INFO Old style space inode found, converting.\n);
+   BTRFS_I(inode)-flags = ~BTRFS_INODE_NODATASUM;
+   block_group-disk_cache_state = BTRFS_DC_CLEAR;
+   }
+
if (!btrfs_fs_closing(root-fs_info)) {
block_group-inode = igrab(inode);
block_group-iref = 1;
@@ -135,7 +141,7 @@ int __create_free_space_inode(struct btrfs_root *root,
btrfs_set_inode_gid(leaf, inode_item, 0);
btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
- BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
+ BTRFS_INODE_PREALLOC);
btrfs_set_inode_nlink(leaf, inode_item, 1);
btrfs_set_inode_transid(leaf, inode_item, trans-transid);
btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -239,17 +245,12 @@ int __load_free_space_cache(struct btrfs_root *root, 
struct inode *inode,
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
struct page *page;
-   u32 *checksums = NULL, *crc;
-   char *disk_crcs = NULL;
struct btrfs_key key;
struct list_head bitmaps;
u64 num_entries;
u64 num_bitmaps;
u64 generation;
-   u32 cur_crc = ~(u32)0;
pgoff_t index = 0;
-   unsigned long first_page_offset;
-   int num_checksums;
int ret = 0;
 
INIT_LIST_HEAD(bitmaps);
@@ -292,16 +293,6 @@ int __load_free_space_cache(struct btrfs_root *root, 
struct inode *inode,
if (!num_entries)
goto out;
 
-   /* Setup everything for doing checksumming */
-   num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
-   checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
-   if (!checksums)
-   goto out;
-   first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
-   disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
-   if (!disk_crcs)
-   goto out;
-
ret = readahead_cache(inode);
if (ret)
goto out;
@@ -311,17 +302,11 @@ int __load_free_space_cache(struct btrfs_root *root, 
struct inode *inode,
struct btrfs_free_space *e;
void *addr;
unsigned long offset = 0;
-   unsigned long start_offset = 0;
int need_loop = 0;
 
if (!num_entries  !num_bitmaps)
break;
 
-   if (index == 0) {
-   start_offset = first_page_offset;
-   offset = start_offset;
-   }
-
page = grab_cache_page(inode-i_mapping, index);
if (!page)
goto free_cache;
@@ -342,8 +327,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct 
inode *inode,
if (index == 0) {
u64 *gen;
 
-   memcpy(disk_crcs, addr, first_page_offset);
-   gen = addr + (sizeof(u32) * num_checksums);
+   gen = addr;
if (*gen != BTRFS_I(inode)-generation) {
printk(KERN_ERR btrfs: space cache generation
(%llu) does not match inode (%llu)\n,
@@ -355,24 +339,10 @@ int __load_free_space_cache(struct btrfs_root *root, 
struct inode *inode,
page_cache_release(page);
goto free_cache;
}
-   crc = (u32 *)disk_crcs;
-   }
-   entry = addr + start_offset;
-
-   /* First lets check our crc before we do anything fun */
-   cur_crc = ~(u32)0;
-   cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
- PAGE_CACHE_SIZE - start_offset);
-   btrfs_csum_final(cur_crc, (char *)cur_crc);
-   

Re: kernel BUG at fs/btrfs/inode.c:4676!

2011-06-10 Thread Josef Bacik
On 06/10/2011 02:43 PM, Marek Otahal wrote:
 On Friday 10 of June 2011 15:33:20 Josef Bacik wrote:
 On 06/09/2011 10:06 PM, Daniel J Blueman wrote:
 On 10 June 2011 09:57, Andy Lutomirski l...@mit.edu wrote:
 On 06/06/2011 06:19 AM, Marek Otahal wrote:

 Hello,
 the issue happens every time when i have to hard power-off my notebook
 (suspend problems).
 With kernel 2.6.39 the partition is unmountable, solution is to boot
 2.6.38 kernel which
 1/ is able to mount the partition,
 2/ by doing that fixes the problem so later .39 (after clean shutdown) can
 mount it also.

 Same problem here.  Mounting with 2.6.38 says:

 [   41.906259] Btrfs loaded
 [   41.906747] device fsid e040a9d60da49596-66c0275e348878bf devid 1 
 transid
 69217 /dev/mapper/vg_midnight_ssd-home
 [   41.908767] btrfs: disk space caching is enabled
 [   42.232185] btrfs: unlinked 17 orphans
 [   42.232189] btrfs: truncated 2 orphans

 dmesg in 2.6.39.1 says:
 []
 [   15.004255] kernel BUG at fs/btrfs/inode.c:4676!
 []

 I've been experiencing the same issue also.

 Josef/Chris, would an metadata snapshot or full block snapshot help
 debug this regression? I can probably setup a small testcase to
 trigger this.


 If you can come up with a testcase to reproduce I would love you forever
 ;).  If I get done what I wanted to do today I will try and reproduce.
 Thanks,

 Josef

 ...I was getting ready for you eternal love, Josef :P...but I can't reproduce 
 it 100%, like 70% success-rate. 
 
 The test-case is quite easy, 
 1. mount the FS, just with compress-force=lzo option // I didn't try without, 
 but on my other btrfs partition that doesn't use compression the err never 
 happened ...so, can the others who experience the bug confirm compress=lzo 
 used?   
 2. cd to it  create a file (not sure if needed)
 3. hard power-off
 
 To reproduce my tests: 
 dd /dev/zero /btrfstest bs=1M count=256 (min required for default mksf.btrfs)
 losetup /dev/loop0 /btrfstest
 mkfs.btrfs /dev/loop0
 mount -o compress-force=lzo /dev/loop0 /mnt/tmp
 vim /mnt/tmp/hello.txt
 ---power off!

How long do you wait between these two steps?  I've not been able to
reproduce this and I've done it maybe 5 times.  Either I've fixed it in
my tree (yay!) or I'm doing something wrong (boo!).  Thanks,

Josef
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kernel BUG at fs/btrfs/inode.c:4676!

2011-06-10 Thread Marek Otahal
On Friday 10 of June 2011 16:52:36 Josef Bacik wrote:
 On 06/10/2011 02:43 PM, Marek Otahal wrote:
  On Friday 10 of June 2011 15:33:20 Josef Bacik wrote:
  On 06/09/2011 10:06 PM, Daniel J Blueman wrote:
  On 10 June 2011 09:57, Andy Lutomirski l...@mit.edu wrote:
  On 06/06/2011 06:19 AM, Marek Otahal wrote:
 
  Hello,
  the issue happens every time when i have to hard power-off my notebook
  (suspend problems).
  With kernel 2.6.39 the partition is unmountable, solution is to boot
  2.6.38 kernel which
  1/ is able to mount the partition,
  2/ by doing that fixes the problem so later .39 (after clean shutdown) 
  can
  mount it also.
 
  Same problem here.  Mounting with 2.6.38 says:
 
  [   41.906259] Btrfs loaded
  [   41.906747] device fsid e040a9d60da49596-66c0275e348878bf devid 1 
  transid
  69217 /dev/mapper/vg_midnight_ssd-home
  [   41.908767] btrfs: disk space caching is enabled
  [   42.232185] btrfs: unlinked 17 orphans
  [   42.232189] btrfs: truncated 2 orphans
 
  dmesg in 2.6.39.1 says:
  []
  [   15.004255] kernel BUG at fs/btrfs/inode.c:4676!
  []
 
  I've been experiencing the same issue also.
 
  Josef/Chris, would an metadata snapshot or full block snapshot help
  debug this regression? I can probably setup a small testcase to
  trigger this.
 
 
  If you can come up with a testcase to reproduce I would love you forever
  ;).  If I get done what I wanted to do today I will try and reproduce.
  Thanks,
 
  Josef
 
  ...I was getting ready for you eternal love, Josef :P...but I can't 
  reproduce it 100%, like 70% 
success-rate. 
  
  The test-case is quite easy, 
  1. mount the FS, just with compress-force=lzo option // I didn't try 
  without, but on my other 
btrfs partition that doesn't use compression the err never happened ...so, can 
the others who 
experience the bug confirm compress=lzo used?   
  2. cd to it  create a file (not sure if needed)
  3. hard power-off
  
  To reproduce my tests: 
  dd /dev/zero /btrfstest bs=1M count=256 (min required for default 
  mksf.btrfs)
  losetup /dev/loop0 /btrfstest
  mkfs.btrfs /dev/loop0
  mount -o compress-force=lzo /dev/loop0 /mnt/tmp
  vim /mnt/tmp/hello.txt
  ---power off!
 
 How long do you wait between these two steps?  I've not been able to
 reproduce this and I've done it maybe 5 times.  Either I've fixed it in
 my tree (yay!) or I'm doing something wrong (boo!).  Thanks,
 
 Josef
Not much but not immediately too, I'd say like ~5s. Did ls, df and quit. 
Tomorrow I'll try if I can spot a difference. 
Btw, is there a way to simulate power-off on a loopback-fs? Like to kill the 
loopback device while 
fs is mounted or some way? So I don't have to stress the poor hw :) 
Thank you, Mark

-- 

Marek Otahal :o)

signature.asc
Description: This is a digitally signed message part.


ext3 and btrfs various Oops and kernel BUGs

2011-06-10 Thread Norbert Preining
Dear all,

(please Cc)

yesterday I had two bugs with btrfs and ext3 occurrences, always happening
when plugging in an external USB btrfs disk.

Today I had a BUG which is purely ext3 related.

The last bug was with yesterdays latest git checkout.

Here are the three bugs/oops:

that one was with 3.0.0-rc2 (exactely)

Jun 10 14:50:23 mithrandir kernel: [40871.704129] BUG: unable to handle kernel 
NULL pointer dereference at 0030
Jun 10 14:50:23 mithrandir kernel: [40871.704185] IP: [a0d7b6e5] 
btrfs_print_leaf+0x17/0x75c [btrfs]
Jun 10 14:50:23 mithrandir kernel: [40871.704241] PGD 12d5e9067 PUD 12d5e8067 
PMD 0 
Jun 10 14:50:23 mithrandir kernel: [40871.704277] Oops:  [#1] PREEMPT SMP 
Jun 10 14:50:23 mithrandir kernel: [40871.704311] CPU 1 
Jun 10 14:50:23 mithrandir kernel: [40871.704324] Modules linked in: 
usb_storage rfcomm bnep snd_hrtimer vboxnetadp vboxnetflt vboxdrv binfmt_misc 
dm_crypt dm_mod isofs btrfs zlib_deflate crc32c libcrc32c vfat fat hso fuse 
loop uinput snd_hda_codec_realtek snd_hda_intel snd_hda_codec snd_hwdep 
snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi arc4 
snd_rawmidi iwlagn mac80211 snd_seq_midi_event cfg80211 snd_seq snd_timer 
snd_seq_device snd btusb bluetooth sony_laptop tpm_infineon soundcore mxm_wmi 
joydev firewire_ohci crc16 firewire_core snd_page_alloc rfkill crc_itu_t
Jun 10 14:50:23 mithrandir kernel: [40871.704733] 
Jun 10 14:50:23 mithrandir kernel: [40871.704745] Pid: 18862, comm: 
btrfs-transacti Tainted: P3.0.0-rc2+ #33 Sony Corporation 
VGN-Z11VN_B/VAIO
Jun 10 14:50:23 mithrandir kernel: [40871.704812] RIP: 
0010:[a0d7b6e5]  [a0d7b6e5] btrfs_print_leaf+0x17/0x75c 
[btrfs]
Jun 10 14:50:23 mithrandir kernel: [40871.704875] RSP: 0018:880103eadb10  
EFLAGS: 00010282
Jun 10 14:50:23 mithrandir kernel: [40871.704906] RAX: fffb RBX: 
 RCX: 88012431a828
Jun 10 14:50:23 mithrandir kernel: [40871.704945] RDX: 0044824bb000 RSI: 
 RDI: 88012bd11800
Jun 10 14:50:23 mithrandir kernel: [40871.704985] RBP: 880103eadb90 R08: 
0001 R09: 
Jun 10 14:50:23 mithrandir kernel: [40871.705024] R10:  R11: 
0001 R12: 88012bd11800
Jun 10 14:50:23 mithrandir kernel: [40871.705063] R13: 004482495000 R14: 
0007 R15: 
Jun 10 14:50:23 mithrandir kernel: [40871.705103] FS:  () 
GS:88013fd0() knlGS:
Jun 10 14:50:23 mithrandir kernel: [40871.705149] CS:  0010 DS:  ES:  
CR0: 8005003b
Jun 10 14:50:23 mithrandir kernel: [40871.705180] CR2: 0030 CR3: 
0001270df000 CR4: 06e0
Jun 10 14:50:23 mithrandir kernel: [40871.705220] DR0:  DR1: 
 DR2: 
Jun 10 14:50:23 mithrandir kernel: [40871.705259] DR3:  DR6: 
0ff0 DR7: 0400
Jun 10 14:50:23 mithrandir kernel: [40871.705299] Process btrfs-transacti (pid: 
18862, threadinfo 880103eac000, task 88013f5e7750)
Jun 10 14:50:23 mithrandir kernel: [40871.705350] Stack:
Jun 10 14:50:23 mithrandir kernel: [40871.705364]  880103eadb20 
004482495000 880103eadb50 88012bd11800
Jun 10 14:50:23 mithrandir kernel: [40871.705415]  8050 
0206 88013f7a64c0 
Jun 10 14:50:23 mithrandir kernel: [40871.705465]  a80044824950 
1000 8800374bf140 880102dcbd00
Jun 10 14:50:23 mithrandir kernel: [40871.705516] Call Trace:
Jun 10 14:50:23 mithrandir kernel: [40871.705539]  [a0d7584e] 
__btrfs_free_extent+0x24b/0x54f [btrfs]
Jun 10 14:50:23 mithrandir kernel: [40871.705579]  [810781cc] ? 
__delayacct_blkio_end+0x39/0x3b
Jun 10 14:50:23 mithrandir kernel: [40871.705616]  [81025a4e] ? 
delayacct_blkio_end+0x1c/0x3c
Jun 10 14:50:23 mithrandir kernel: [40871.705652]  [810d2a0e] ? 
wait_on_buffer+0x12/0x12
Jun 10 14:50:23 mithrandir kernel: [40871.705692]  [a0d77d22] 
run_clustered_refs+0x616/0x660 [btrfs]
Jun 10 14:50:23 mithrandir kernel: [40871.705733]  [81025042] ? 
set_next_entity+0x3b/0x60
Jun 10 14:50:23 mithrandir kernel: [40871.705776]  [a0db768f] ? 
btrfs_find_ref_cluster+0xf6/0x137 [btrfs]
Jun 10 14:50:23 mithrandir kernel: [40871.705822]  [a0d77e38] 
btrfs_run_delayed_refs+0xcc/0x17c [btrfs]
Jun 10 14:50:23 mithrandir kernel: [40871.705871]  [a0d85775] 
btrfs_commit_transaction+0x72/0x655 [btrfs]
Jun 10 14:50:23 mithrandir kernel: [40871.705920]  [a0d84d89] ? 
join_transaction.isra.23+0x203/0x20a [btrfs]
Jun 10 14:50:23 mithrandir kernel: [40871.705964]  [810481d4] ? 
add_wait_queue+0x44/0x44
Jun 10 14:50:23 mithrandir kernel: [40871.706006]  [a0d80180] 
transaction_kthread+0x174/0x22b [btrfs]
Jun 10 14:50:23 mithrandir kernel: [40871.706053]  [a0d8000c] ?