date:20130515

Re: subvol copying

2013-05-15 Thread Gabriel de Perthuis

 A user of a workstation has a home directory /home/john as a subvolume.  I 
 wrote a cron job to make read-only snapshots of it under /home/john/backup 
 which was fortunate as they just ran a script that did something like
 rm -rf ~.
 
 Apart from copying dozens of gigs of data back, is there a good way of 
 recovering it all?  Whatever you suggest isn't going to work for this time 
 (the copy is almost done) but will be useful for next time.
 
 Should I have put the backups under /backup instead so that I could just 
 delete the corrupted subvol and make a read-write snapshot of the last good 
 one?

You can move subvolumes at any time, as if they were regular directories.

For example: move the backups to an external location, move what's left
of the home to another location out of the way, and make a snapshot to
restore.


--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 01/17] Btrfs: fix accessing a freed tree root

2013-05-15 Thread Miao Xie

inode_tree_del() will move the tree root into the dead root list, and
then the tree will be destroyed by the cleaner. So if we remove the
delayed node which is cached in the inode after inode_tree_del(),
we may access a freed tree root. Fix it.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1669c3b..7f6e78a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4723,6 +4723,7 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
 no_delete:
+   btrfs_remove_delayed_node(inode);
clear_inode(inode);
return;
 }
@@ -7978,7 +7979,6 @@ void btrfs_destroy_inode(struct inode *inode)
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 free:
-   btrfs_remove_delayed_node(inode);
call_rcu(inode-i_rcu, btrfs_i_callback);
 }
 
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 02/17] Btrfs: fix unprotected root node of the subvolume's inode rb-tree

2013-05-15 Thread Miao Xie

The root node of the rb-tree may be changed, so we should get it under
the lock. Fix it.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/inode.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7f6e78a..bf5c399 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4839,14 +4839,13 @@ static void inode_tree_add(struct inode *inode)
struct rb_node **p;
struct rb_node *parent;
u64 ino = btrfs_ino(inode);
-again:
-   p = root-inode_tree.rb_node;
-   parent = NULL;
 
if (inode_unhashed(inode))
return;
-
+again:
+   parent = NULL;
spin_lock(root-inode_lock);
+   p = root-inode_tree.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct btrfs_inode, rb_node);
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 07/17] Btrfs: don't invoke btrfs_invalidate_inodes() in the spin lock context

2013-05-15 Thread Miao Xie

btrfs_invalidate_inodes() may sleep, so we should not invoke it in the
spin lock context. Fix it.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/disk-io.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 642c861..724a0da 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3685,8 +3685,11 @@ static void btrfs_destroy_ordered_operations(struct 
btrfs_transaction *t,
 ordered_operations);
 
list_del_init(btrfs_inode-ordered_operations);
+   spin_unlock(root-fs_info-ordered_extent_lock);
 
btrfs_invalidate_inodes(btrfs_inode-root);
+
+   spin_lock(root-fs_info-ordered_extent_lock);
}
 
spin_unlock(root-fs_info-ordered_extent_lock);
@@ -3808,8 +3811,11 @@ static void btrfs_destroy_delalloc_inodes(struct 
btrfs_root *root)
list_del_init(btrfs_inode-delalloc_inodes);
clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
  btrfs_inode-runtime_flags);
+   spin_unlock(root-fs_info-delalloc_lock);
 
btrfs_invalidate_inodes(btrfs_inode-root);
+
+   spin_lock(root-fs_info-delalloc_lock);
}
 
spin_unlock(root-fs_info-delalloc_lock);
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 08/17] Btrfs: introduce per-subvolume delalloc inode list

2013-05-15 Thread Miao Xie

When we create a snapshot, we need flush all delalloc inodes in the
fs, just flushing the inodes in the source tree is OK. So we introduce
per-subvolume delalloc inode list.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/ctree.h   |  22 ---
 fs/btrfs/dev-replace.c |   2 +-
 fs/btrfs/disk-io.c |  49 ---
 fs/btrfs/extent-tree.c |   6 +-
 fs/btrfs/inode.c   | 167 -
 fs/btrfs/relocation.c  |   2 +-
 fs/btrfs/transaction.c |   2 +-
 7 files changed, 183 insertions(+), 67 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 958ce6c..067233f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1449,13 +1449,9 @@ struct btrfs_fs_info {
 */
struct list_head ordered_extents;
 
-   spinlock_t delalloc_lock;
-   /*
-* all of the inodes that have delalloc bytes.  It is possible for
-* this list to be empty even when there is still dirty data=ordered
-* extents waiting to finish IO.
-*/
-   struct list_head delalloc_inodes;
+   spinlock_t delalloc_root_lock;
+   /* all fs/file tree roots that have delalloc inodes. */
+   struct list_head delalloc_roots;
 
/*
 * there is a pool of worker threads for checksumming during writes
@@ -1740,6 +1736,16 @@ struct btrfs_root {
 
spinlock_t root_item_lock;
atomic_t refs;
+
+   spinlock_t delalloc_lock;
+   /*
+* all of the inodes that have delalloc bytes.  It is possible for
+* this list to be empty even when there is still dirty data=ordered
+* extents waiting to finish IO.
+*/
+   struct list_head delalloc_inodes;
+   struct list_head delalloc_root;
+   u64 nr_delalloc_inodes;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -3543,6 +3549,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle 
*trans,
   u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+   int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
  struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7ba7b39..aca77ad 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -465,7 +465,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info 
*fs_info,
 * flush all outstanding I/O and inode extent mappings before the
 * copy operation is declared as being finished
 */
-   ret = btrfs_start_delalloc_inodes(root, 0);
+   ret = btrfs_start_all_delalloc_inodes(root-fs_info, 0);
if (ret) {
mutex_unlock(dev_replace-lock_finishing_cancel_unmount);
return ret;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 724a0da..13dddba 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1191,6 +1191,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
root-objectid = objectid;
root-last_trans = 0;
root-highest_objectid = 0;
+   root-nr_delalloc_inodes = 0;
root-name = NULL;
root-inode_tree = RB_ROOT;
INIT_RADIX_TREE(root-delayed_nodes_tree, GFP_ATOMIC);
@@ -1199,10 +1200,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, 
u32 sectorsize,
 
INIT_LIST_HEAD(root-dirty_list);
INIT_LIST_HEAD(root-root_list);
+   INIT_LIST_HEAD(root-delalloc_inodes);
+   INIT_LIST_HEAD(root-delalloc_root);
INIT_LIST_HEAD(root-logged_list[0]);
INIT_LIST_HEAD(root-logged_list[1]);
spin_lock_init(root-orphan_lock);
spin_lock_init(root-inode_lock);
+   spin_lock_init(root-delalloc_lock);
spin_lock_init(root-accounting_lock);
spin_lock_init(root-log_extents_lock[0]);
spin_lock_init(root-log_extents_lock[1]);
@@ -2137,9 +2141,9 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(fs_info-trans_list);
INIT_LIST_HEAD(fs_info-dead_roots);
INIT_LIST_HEAD(fs_info-delayed_iputs);
-   INIT_LIST_HEAD(fs_info-delalloc_inodes);
+   INIT_LIST_HEAD(fs_info-delalloc_roots);
INIT_LIST_HEAD(fs_info-caching_block_groups);
-   spin_lock_init(fs_info-delalloc_lock);
+   spin_lock_init(fs_info-delalloc_root_lock);
spin_lock_init(fs_info-trans_lock);
spin_lock_init(fs_info-fs_roots_radix_lock);
spin_lock_init(fs_info-delayed_iput_lock);
@@ -3801,24 +3805,49 @@ static void btrfs_destroy_delalloc_inodes(struct 
btrfs_root *root)
 
INIT_LIST_HEAD(splice);
 
-   spin_lock(root-fs_info-delalloc_lock);
-   list_splice_init(root-fs_info-delalloc_inodes, splice);
+   spin_lock(root-delalloc_lock);
+

[PATCH 05/17] Btrfs: cleanup the similar code of the fs root read

2013-05-15 Thread Miao Xie

There are several functions whose code is similar, such as
  btrfs_find_last_root()
  btrfs_read_fs_root_no_radix()

Besides that, some functions are invoked twice, it is unnecessary,
for example, we are sure that all roots which is found in
  btrfs_find_orphan_roots()
have their orphan items, so it is unnecessary to check the orphan
item again.

So cleanup it.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/ctree.h   |   6 +-
 fs/btrfs/disk-io.c | 282 +
 fs/btrfs/disk-io.h |  11 +-
 fs/btrfs/extent-tree.c |   6 +-
 fs/btrfs/relocation.c  |   5 +-
 fs/btrfs/root-tree.c   | 170 ++---
 fs/btrfs/tree-log.c|   3 +-
 fs/btrfs/volumes.c |  13 ++-
 fs/btrfs/volumes.h |   1 +
 9 files changed, 228 insertions(+), 269 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f9a48fd..845b77f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3369,9 +3369,9 @@ int __must_check btrfs_update_root(struct 
btrfs_trans_handle *trans,
   struct btrfs_root_item *item);
 void btrfs_read_root_item(struct extent_buffer *eb, int slot,
  struct btrfs_root_item *item);
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
-btrfs_root_item *item, struct btrfs_key *key);
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+   struct btrfs_path *path, struct btrfs_root_item *root_item,
+   struct btrfs_key *root_key);
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 void btrfs_set_root_node(struct btrfs_root_item *item,
 struct extent_buffer *node);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8c1e4fb..42d6ba2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1234,39 +1234,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
spin_lock_init(root-root_item_lock);
 }
 
-static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
-   struct btrfs_fs_info *fs_info,
-   u64 objectid,
-   struct btrfs_root *root)
-{
-   int ret;
-   u32 blocksize;
-   u64 generation;
-
-   __setup_root(tree_root-nodesize, tree_root-leafsize,
-tree_root-sectorsize, tree_root-stripesize,
-root, fs_info, objectid);
-   ret = btrfs_find_last_root(tree_root, objectid,
-  root-root_item, root-root_key);
-   if (ret  0)
-   return -ENOENT;
-   else if (ret  0)
-   return ret;
-
-   generation = btrfs_root_generation(root-root_item);
-   blocksize = btrfs_level_size(root, btrfs_root_level(root-root_item));
-   root-commit_root = NULL;
-   root-node = read_tree_block(root, btrfs_root_bytenr(root-root_item),
-blocksize, generation);
-   if (!root-node || !btrfs_buffer_uptodate(root-node, generation, 0)) {
-   free_extent_buffer(root-node);
-   root-node = NULL;
-   return -EIO;
-   }
-   root-commit_root = btrfs_root_node(root);
-   return 0;
-}
-
 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
 {
struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
@@ -1451,70 +1418,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
return 0;
 }
 
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
-  struct btrfs_key *location)
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+   struct btrfs_key *key)
 {
struct btrfs_root *root;
struct btrfs_fs_info *fs_info = tree_root-fs_info;
struct btrfs_path *path;
-   struct extent_buffer *l;
u64 generation;
u32 blocksize;
-   int ret = 0;
-   int slot;
+   int ret;
 
-   root = btrfs_alloc_root(fs_info);
-   if (!root)
+   path = btrfs_alloc_path();
+   if (!path)
return ERR_PTR(-ENOMEM);
-   if (location-offset == (u64)-1) {
-   ret = find_and_setup_root(tree_root, fs_info,
- location-objectid, root);
-   if (ret) {
-   kfree(root);
-   return ERR_PTR(ret);
-   }
-   goto out;
+
+   root = btrfs_alloc_root(fs_info);
+   if (!root) {
+   ret = -ENOMEM;
+   goto alloc_fail;
}
 
__setup_root(tree_root-nodesize, tree_root-leafsize,
 tree_root-sectorsize, tree_root-stripesize,
-

[PATCH 14/17] Btrfs: don't flush the delalloc inodes in the while loop if flushoncommit is set

2013-05-15 Thread Miao Xie

It is unnecessary to flush the delalloc inodes again and again because
we don't care the dirty pages which are introduced after the flush, and
they will be flush in the transaction commit.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/transaction.c | 26 ++
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index fd319b2..265db57 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1518,16 +1518,8 @@ static void cleanup_transaction(struct 
btrfs_trans_handle *trans,
 static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
  struct btrfs_root *root)
 {
-   int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
int ret;
 
-   if (flush_on_commit) {
-   ret = btrfs_start_all_delalloc_inodes(root-fs_info, 1);
-   if (ret)
-   return ret;
-   btrfs_wait_all_ordered_extents(root-fs_info, 1);
-   }
-
ret = btrfs_run_delayed_items(trans, root);
if (ret)
return ret;
@@ -1551,6 +1543,19 @@ static int btrfs_flush_all_pending_stuffs(struct 
btrfs_trans_handle *trans,
return ret;
 }
 
+static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+   if (btrfs_test_opt(fs_info-tree_root, FLUSHONCOMMIT))
+   return btrfs_start_all_delalloc_inodes(fs_info, 1);
+   return 0;
+}
+
+static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+   if (btrfs_test_opt(fs_info-tree_root, FLUSHONCOMMIT))
+   btrfs_wait_all_ordered_extents(fs_info, 1);
+}
+
 /*
  * btrfs_transaction state sequence:
  *in_commit = 0, blocked = 0  (initial)
@@ -1654,6 +1659,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle 
*trans,
 
extwriter_counter_dec(cur_trans, trans-type);
 
+   ret = btrfs_start_delalloc_flush(root-fs_info);
+   if (ret)
+   goto cleanup_transaction;
+
if (!btrfs_test_opt(root, SSD) 
(now  cur_trans-start_time || now - cur_trans-start_time  1))
should_grow = 1;
@@ -1683,6 +1692,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle 
*trans,
if (ret)
goto cleanup_transaction;
 
+   btrfs_wait_delalloc_flush(root-fs_info);
/*
 * Ok now we need to make sure to block out any other joins while we
 * commit the transaction.  We could have started a join before setting
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 16/17] Btrfs: remove the time check in btrfs_commit_transaction()

2013-05-15 Thread Miao Xie

We checked the commit time to avoid committing the transaction
frequently, but it is unnecessary because:
- It made the transaction commit spend more time, and delayed the
  operation of the external writers(TRANS_START/TRANS_USERSPACE).
- Except the space that we have to commit transaction, such as
  snapshot creation, btrfs doesn't commit the transaction on its
  own initiative.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/transaction.c | 29 ++---
 1 file changed, 6 insertions(+), 23 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 75e7b15..5e75ff4 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1566,10 +1566,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle 
*trans,
 {
struct btrfs_transaction *cur_trans = trans-transaction;
struct btrfs_transaction *prev_trans = NULL;
-   DEFINE_WAIT(wait);
int ret;
-   int should_grow = 0;
-   unsigned long now = get_seconds();
 
ret = btrfs_run_ordered_operations(trans, root, 0);
if (ret) {
@@ -1660,28 +1657,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle 
*trans,
if (ret)
goto cleanup_transaction;
 
-   if (!btrfs_test_opt(root, SSD) 
-   (now  cur_trans-start_time || now - cur_trans-start_time  1))
-   should_grow = 1;
-
-   do {
-   WARN_ON(cur_trans != trans-transaction);
-
-   ret = btrfs_flush_all_pending_stuffs(trans, root);
-   if (ret)
-   goto cleanup_transaction;
-
-   prepare_to_wait(cur_trans-writer_wait, wait,
-   TASK_UNINTERRUPTIBLE);
-
-   if (extwriter_counter_read(cur_trans)  0)
-   schedule();
-   else if (should_grow)
-   schedule_timeout(1);
+   ret = btrfs_flush_all_pending_stuffs(trans, root);
+   if (ret)
+   goto cleanup_transaction;
 
-   finish_wait(cur_trans-writer_wait, wait);
-   } while (extwriter_counter_read(cur_trans)  0);
+   wait_event(cur_trans-writer_wait,
+  extwriter_counter_read(cur_trans) == 0);
 
+   /* some pending stuffs might be added after the previous flush. */
ret = btrfs_flush_all_pending_stuffs(trans, root);
if (ret)
goto cleanup_transaction;
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 13/17] Btrfs: don't wait for all the writers circularly during the transaction commit

2013-05-15 Thread Miao Xie

btrfs_commit_transaction has the following loop before we commit the
transaction.

do {
// attempt to do some useful stuff and/or sleep
} while (atomic_read(cur_trans-num_writers)  1 ||
 (should_grow  cur_trans-num_joined != joined));

This is used to prevent from the TRANS_START to get in the way of a
committing transaction. But it does not prevent from TRANS_JOIN, that
is we would do this loop for a long time if some writers JOIN the
current transaction endlessly.

Because we need join the current transaction to do some useful stuff,
we can not block TRANS_JOIN here. So we introduce a external writer
counter, which is used to count the TRANS_USERSPACE/TRANS_START writers.
If the external writer counter is zero, we can break the above loop.

In order to make the code more clear, we don't use enum variant
to define the type of the transaction handle, use bitmask instead.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/transaction.c | 55 ++
 fs/btrfs/transaction.h | 31 
 2 files changed, 65 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index cf8706c..fd319b2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -51,17 +51,41 @@ static noinline void switch_commit_root(struct btrfs_root 
*root)
 }
 
 static inline int can_join_transaction(struct btrfs_transaction *trans,
-  int type)
+  unsigned int type)
 {
return !(trans-in_commit 
-type != TRANS_JOIN 
-type != TRANS_JOIN_NOLOCK);
+(type  TRANS_EXTWRITERS));
+}
+
+static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
+unsigned int type)
+{
+   if (type  TRANS_EXTWRITERS)
+   atomic_inc(trans-num_extwriters);
+}
+
+static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
+unsigned int type)
+{
+   if (type  TRANS_EXTWRITERS)
+   atomic_dec(trans-num_extwriters);
+}
+
+static inline void extwriter_counter_init(struct btrfs_transaction *trans,
+ unsigned int type)
+{
+   atomic_set(trans-num_extwriters, ((type  TRANS_EXTWRITERS) ? 1 : 0));
+}
+
+static inline int extwriter_counter_read(struct btrfs_transaction *trans)
+{
+   return atomic_read(trans-num_extwriters);
 }
 
 /*
  * either allocate a new transaction or hop into the existing one
  */
-static noinline int join_transaction(struct btrfs_root *root, int type)
+static noinline int join_transaction(struct btrfs_root *root, unsigned int 
type)
 {
struct btrfs_transaction *cur_trans;
struct btrfs_fs_info *fs_info = root-fs_info;
@@ -99,6 +123,7 @@ loop:
}
atomic_inc(cur_trans-use_count);
atomic_inc(cur_trans-num_writers);
+   extwriter_counter_inc(cur_trans, type);
cur_trans-num_joined++;
spin_unlock(fs_info-trans_lock);
return 0;
@@ -131,6 +156,7 @@ loop:
}
 
atomic_set(cur_trans-num_writers, 1);
+   extwriter_counter_init(cur_trans, type);
cur_trans-num_joined = 0;
init_waitqueue_head(cur_trans-writer_wait);
init_waitqueue_head(cur_trans-commit_wait);
@@ -307,7 +333,7 @@ static int may_wait_transaction(struct btrfs_root *root, 
int type)
 }
 
 static struct btrfs_trans_handle *
-start_transaction(struct btrfs_root *root, u64 num_items, int type,
+start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
  enum btrfs_reserve_flush_enum flush)
 {
struct btrfs_trans_handle *h;
@@ -320,7 +346,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, 
int type,
return ERR_PTR(-EROFS);
 
if (current-journal_info) {
-   WARN_ON(type != TRANS_JOIN  type != TRANS_JOIN_NOLOCK);
+   WARN_ON(type  TRANS_EXTWRITERS);
h = current-journal_info;
h-use_count++;
WARN_ON(h-use_count  2);
@@ -366,7 +392,7 @@ again:
 * If we are ATTACH, it means we just want to catch the current
 * transaction and commit it, so we needn't do sb_start_intwrite(). 
 */
-   if (type  TRANS_JOIN_NOLOCK)
+   if (type  __TRANS_FREEZABLE)
sb_start_intwrite(root-fs_info-sb);
 
if (may_wait_transaction(root, type))
@@ -429,7 +455,7 @@ got_it:
return h;
 
 join_fail:
-   if (type  TRANS_JOIN_NOLOCK)
+   if (type  __TRANS_FREEZABLE)
sb_end_intwrite(root-fs_info-sb);
kmem_cache_free(btrfs_trans_handle_cachep, h);
 alloc_fail:
@@ -677,12 +703,13 @@ static int __btrfs_end_transaction(struct 
btrfs_trans_handle *trans,
}
}
 
-   if (trans-type

[PATCH 17/17] Btrfs: make the state of the transaction more readable

2013-05-15 Thread Miao Xie

We used 3 variants to track the state of the transaction, it was complex
and wasted the memory space. Besides that, it was hard to understand that
which types of the transaction handles should be blocked in each transaction
state, so the developers often made mistakes.

This patch improved the above problem. In this patch, we define 6 states
for the transaction,
  enum btrfs_trans_state {
TRANS_STATE_RUNNING = 0,
TRANS_STATE_BLOCKED = 1,
TRANS_STATE_COMMIT_START= 2,
TRANS_STATE_COMMIT_DOING= 3,
TRANS_STATE_UNBLOCKED   = 4,
TRANS_STATE_COMPLETED   = 5,
TRANS_STATE_MAX = 6,
  }
and just use 1 variant to track those state.

In order to make the blocked handle types for each state more clear,
we introduce a array:
  unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
[TRANS_STATE_RUNNING]   = 0U,
[TRANS_STATE_BLOCKED]   = (__TRANS_USERSPACE |
   __TRANS_START),
[TRANS_STATE_COMMIT_START]  = (__TRANS_USERSPACE |
   __TRANS_START |
   __TRANS_ATTACH),
[TRANS_STATE_COMMIT_DOING]  = (__TRANS_USERSPACE |
   __TRANS_START |
   __TRANS_ATTACH |
   __TRANS_JOIN),
[TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE |
   __TRANS_START |
   __TRANS_ATTACH |
   __TRANS_JOIN |
   __TRANS_JOIN_NOLOCK),
[TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE |
   __TRANS_START |
   __TRANS_ATTACH |
   __TRANS_JOIN |
   __TRANS_JOIN_NOLOCK),
  }
it is very intuitionistic.

Besides that, because we remove -in_commit in transaction structure, so
the lock -commit_lock which was used to protect it is unnecessary, remove
-commit_lock.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/ctree.h   |   1 -
 fs/btrfs/disk-io.c |  36 ++--
 fs/btrfs/transaction.c | 156 ++---
 fs/btrfs/transaction.h |  16 +++--
 4 files changed, 114 insertions(+), 95 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a7e71ff..bf92302 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1496,7 +1496,6 @@ struct btrfs_fs_info {
int closing;
int log_root_recovering;
int enospc_unlink;
-   int trans_no_join;
 
u64 total_pinned;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6bb3f3d..530e3c0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1747,7 +1747,7 @@ static int transaction_kthread(void *arg)
}
 
now = get_seconds();
-   if (!cur-blocked 
+   if (cur-state  TRANS_STATE_BLOCKED 
(now  cur-start_time || now - cur-start_time  30)) {
spin_unlock(root-fs_info-trans_lock);
delay = HZ * 5;
@@ -2183,7 +2183,6 @@ int open_ctree(struct super_block *sb,
fs_info-max_inline = 8192 * 1024;
fs_info-metadata_ratio = 0;
fs_info-defrag_inodes = RB_ROOT;
-   fs_info-trans_no_join = 0;
fs_info-free_chunk_space = 0;
fs_info-tree_mod_log = RB_ROOT;
 
@@ -3956,19 +3955,14 @@ void btrfs_cleanup_one_transaction(struct 
btrfs_transaction *cur_trans,
btrfs_block_rsv_release(root, root-fs_info-trans_block_rsv,
cur_trans-dirty_pages.dirty_bytes);
 
-   /* FIXME: cleanup wait for commit */
-   cur_trans-in_commit = 1;
-   cur_trans-blocked = 1;
+   cur_trans-state = TRANS_STATE_COMMIT_START;
wake_up(root-fs_info-transaction_blocked_wait);
 
btrfs_evict_pending_snapshots(cur_trans);
 
-   cur_trans-blocked = 0;
+   cur_trans-state = TRANS_STATE_UNBLOCKED;
wake_up(root-fs_info-transaction_wait);
 
-   cur_trans-commit_done = 1;
-   wake_up(cur_trans-commit_wait);
-
btrfs_destroy_delayed_inodes(root);
btrfs_assert_delayed_root_empty(root);
 
@@ -3977,6 +3971,9 @@ void btrfs_cleanup_one_transaction(struct 
btrfs_transaction *cur_trans,
btrfs_destroy_pinned_extent(root,
root-fs_info-pinned_extents);
 
+   cur_trans-state =TRANS_STATE_COMPLETED;
+   wake_up(cur_trans-commit_wait);
+
/*
memset(cur_trans, 0, sizeof(*cur_trans));
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
@@ -4004,25 +4001,23 @@ static int

[PATCH 12/17] Btrfs: remove the code for the impossible case in cleanup_transaction()

2013-05-15 Thread Miao Xie

If the transaction is removed from the transaction list, it means the
transaction has been committed successfully. So it is impossible to
call cleanup_transaction(), otherwise there is something wrong with
the code logic. Thus, we use BUG_ON() instead of the original handle.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/transaction.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index bc22be9..cf8706c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1450,11 +1450,12 @@ static void cleanup_transaction(struct 
btrfs_trans_handle *trans,
 
spin_lock(root-fs_info-trans_lock);
 
-   if (list_empty(cur_trans-list)) {
-   spin_unlock(root-fs_info-trans_lock);
-   btrfs_end_transaction(trans, root);
-   return;
-   }
+   /*
+* If the transaction is removed from the list, it means this
+* transaction has been committed successfully, so it is impossible
+* to call the cleanup function.
+*/
+   BUG_ON(list_empty(cur_trans-list));
 
list_del_init(cur_trans-list);
if (cur_trans == root-fs_info-running_transaction) {
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 03/17] Btrfs: pause the space balance when remounting to R/O

2013-05-15 Thread Miao Xie

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/super.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a4807ce..f0857e0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1263,6 +1263,7 @@ static int btrfs_remount(struct super_block *sb, int 
*flags, char *data)
 
btrfs_dev_replace_suspend_for_unmount(fs_info);
btrfs_scrub_cancel(fs_info);
+   btrfs_pause_balance(fs_info);
 
ret = btrfs_commit_super(root);
if (ret)
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 10/17] Btrfs: just flush the delalloc inodes in the source tree before snapshot creation

2013-05-15 Thread Miao Xie

Before applying this patch, we need flush all the delalloc inodes in
the fs when we want to create a snapshot, it wastes time, and make
the transaction commit be blocked for a long time. It means some other
user operation would also be blocked for a long time.

This patch improves this problem, we just flush the delalloc inodes that
in the source trees before snapshot creation, so the transaction commit
will complete quickly.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/ioctl.c   |  6 ++
 fs/btrfs/transaction.c | 10 +-
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0de4a2f..2677dcc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct 
inode *dir,
if (!root-ref_cows)
return -EINVAL;
 
+   ret = btrfs_start_delalloc_inodes(root, 0);
+   if (ret)
+   return ret;
+
+   btrfs_wait_ordered_extents(root, 0);
+
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
if (!pending_snapshot)
return -ENOMEM;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2b17213..bc22be9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1491,17 +1491,9 @@ static int btrfs_flush_all_pending_stuffs(struct 
btrfs_trans_handle *trans,
  struct btrfs_root *root)
 {
int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
-   int snap_pending = 0;
int ret;
 
-   if (!flush_on_commit) {
-   spin_lock(root-fs_info-trans_lock);
-   if (!list_empty(trans-transaction-pending_snapshots))
-   snap_pending = 1;
-   spin_unlock(root-fs_info-trans_lock);
-   }
-
-   if (flush_on_commit || snap_pending) {
+   if (flush_on_commit) {
ret = btrfs_start_all_delalloc_inodes(root-fs_info, 1);
if (ret)
return ret;
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 09/17] Btrfs: introduce per-subvolume ordered extent list

2013-05-15 Thread Miao Xie

The reason we introduce per-subvolume ordered extent list is the same
as the per-subvolume delalloc inode list.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/ctree.h|  25 ---
 fs/btrfs/dev-replace.c  |   4 +-
 fs/btrfs/disk-io.c  |  45 +++-
 fs/btrfs/extent-tree.c  |   6 +--
 fs/btrfs/inode.c|   4 +-
 fs/btrfs/ordered-data.c | 109 +---
 fs/btrfs/ordered-data.h |   2 +
 fs/btrfs/relocation.c   |   2 +-
 fs/btrfs/super.c|   2 +-
 fs/btrfs/transaction.c  |   2 +-
 10 files changed, 143 insertions(+), 58 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 067233f..a7e71ff 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1437,17 +1437,18 @@ struct btrfs_fs_info {
atomic_t open_ioctl_trans;
 
/*
-* this is used by the balancing code to wait for all the pending
-* ordered extents
+* this is used to protect the following list -- ordered_roots.
 */
-   spinlock_t ordered_extent_lock;
+   spinlock_t ordered_root_lock;
 
/*
-* all of the data=ordered extents pending writeback
+* all fs/file tree roots in which there are data=ordered extents
+* pending writeback are added into this list.
+*
 * these can span multiple transactions and basically include
 * every dirty data page that isn't from nodatacow
 */
-   struct list_head ordered_extents;
+   struct list_head ordered_roots;
 
spinlock_t delalloc_root_lock;
/* all fs/file tree roots that have delalloc inodes. */
@@ -1746,6 +1747,20 @@ struct btrfs_root {
struct list_head delalloc_inodes;
struct list_head delalloc_root;
u64 nr_delalloc_inodes;
+   /*
+* this is used by the balancing code to wait for all the pending
+* ordered extents
+*/
+   spinlock_t ordered_extent_lock;
+
+   /*
+* all of the data=ordered extents pending writeback
+* these can span multiple transactions and basically include
+* every dirty data page that isn't from nodatacow
+*/
+   struct list_head ordered_extents;
+   struct list_head ordered_root;
+   u64 nr_ordered_extents;
 };
 
 struct btrfs_ioctl_defrag_range_args {
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index aca77ad..4254da8 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -395,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args-result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
btrfs_dev_replace_unlock(dev_replace);
 
-   btrfs_wait_ordered_extents(root, 0);
+   btrfs_wait_all_ordered_extents(root-fs_info, 0);
 
/* force writing the updated state information to disk */
trans = btrfs_start_transaction(root, 0);
@@ -470,7 +470,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info 
*fs_info,
mutex_unlock(dev_replace-lock_finishing_cancel_unmount);
return ret;
}
-   btrfs_wait_ordered_extents(root, 0);
+   btrfs_wait_all_ordered_extents(root-fs_info, 0);
 
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 13dddba..44d5a86 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1192,6 +1192,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
root-last_trans = 0;
root-highest_objectid = 0;
root-nr_delalloc_inodes = 0;
+   root-nr_ordered_extents = 0;
root-name = NULL;
root-inode_tree = RB_ROOT;
INIT_RADIX_TREE(root-delayed_nodes_tree, GFP_ATOMIC);
@@ -1202,11 +1203,14 @@ static void __setup_root(u32 nodesize, u32 leafsize, 
u32 sectorsize,
INIT_LIST_HEAD(root-root_list);
INIT_LIST_HEAD(root-delalloc_inodes);
INIT_LIST_HEAD(root-delalloc_root);
+   INIT_LIST_HEAD(root-ordered_extents);
+   INIT_LIST_HEAD(root-ordered_root);
INIT_LIST_HEAD(root-logged_list[0]);
INIT_LIST_HEAD(root-logged_list[1]);
spin_lock_init(root-orphan_lock);
spin_lock_init(root-inode_lock);
spin_lock_init(root-delalloc_lock);
+   spin_lock_init(root-ordered_extent_lock);
spin_lock_init(root-accounting_lock);
spin_lock_init(root-log_extents_lock[0]);
spin_lock_init(root-log_extents_lock[1]);
@@ -2190,8 +2194,8 @@ int open_ctree(struct super_block *sb,
fs_info-thread_pool_size = min_t(unsigned long,
  num_online_cpus() + 2, 8);
 
-   INIT_LIST_HEAD(fs_info-ordered_extents);
-   spin_lock_init(fs_info-ordered_extent_lock);
+   INIT_LIST_HEAD(fs_info-ordered_roots);
+   spin_lock_init(fs_info-ordered_root_lock);
fs_info-delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),

[PATCH 11/17] Btrfs: cleanup unnecessary assignment when cleaning up all the residual transaction

2013-05-15 Thread Miao Xie

When we umount a fs with serious errors, we will invoke 
btrfs_cleanup_transactions()
to clean up the residual transaction. At this time, It is impossible to start a 
new
transaction, so we needn't assign trans_no_join to 1, and also needn't clear 
running
transaction every time we destroy a residual transaction.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/disk-io.c | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 44d5a86..6bb3f3d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3992,7 +3992,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root 
*root)
 
spin_lock(root-fs_info-trans_lock);
list_splice_init(root-fs_info-trans_list, list);
-   root-fs_info-trans_no_join = 1;
+   root-fs_info-running_transaction = NULL;
spin_unlock(root-fs_info-trans_lock);
 
while (!list_empty(list)) {
@@ -4028,10 +4028,6 @@ static int btrfs_cleanup_transaction(struct btrfs_root 
*root)
 
btrfs_destroy_all_delalloc_inodes(root-fs_info);
 
-   spin_lock(root-fs_info-trans_lock);
-   root-fs_info-running_transaction = NULL;
-   spin_unlock(root-fs_info-trans_lock);
-
btrfs_destroy_marked_extents(root, t-dirty_pages,
 EXTENT_DIRTY);
 
@@ -4044,9 +4040,6 @@ static int btrfs_cleanup_transaction(struct btrfs_root 
*root)
kmem_cache_free(btrfs_transaction_cachep, t);
}
 
-   spin_lock(root-fs_info-trans_lock);
-   root-fs_info-trans_no_join = 0;
-   spin_unlock(root-fs_info-trans_lock);
mutex_unlock(root-fs_info-transaction_kthread_mutex);
 
return 0;
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 04/17] Btrfs: remove BUG_ON() in btrfs_read_fs_tree_no_radix()

2013-05-15 Thread Miao Xie

We have checked if -node is NULL or not, so it is unnecessary to
use BUG_ON() to check again. Remove it.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/disk-io.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2a9ae38..8c1e4fb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1513,7 +1513,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct 
btrfs_root *tree_root,
}
 
root-commit_root = btrfs_root_node(root);
-   BUG_ON(!root-node); /* -ENOMEM */
 out:
if (location-objectid != BTRFS_TREE_LOG_OBJECTID) {
root-ref_cows = 1;
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 15/17] Btrfs: remove unnecessary varient -num_joined in btrfs_transaction structure

2013-05-15 Thread Miao Xie

We used -num_joined track if there were some writers which join the current
transaction when the committer was sleeping. If some writers joined the current
transaction, we has to continue the while loop to do some necessary stuff, such
as flush the ordered operations. But it is unnecessary because we will do it
after the while loop.

Besides that, tracking -num_joined would make the committer drop into the while
loop when there are lots of internal writers(TRANS_JOIN).

So we remove -num_joined and don't track if there are some writers which join
the current transaction when the committer is sleeping.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/transaction.c | 8 +---
 fs/btrfs/transaction.h | 2 --
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 265db57..75e7b15 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -124,7 +124,6 @@ loop:
atomic_inc(cur_trans-use_count);
atomic_inc(cur_trans-num_writers);
extwriter_counter_inc(cur_trans, type);
-   cur_trans-num_joined++;
spin_unlock(fs_info-trans_lock);
return 0;
}
@@ -157,7 +156,6 @@ loop:
 
atomic_set(cur_trans-num_writers, 1);
extwriter_counter_init(cur_trans, type);
-   cur_trans-num_joined = 0;
init_waitqueue_head(cur_trans-writer_wait);
init_waitqueue_head(cur_trans-commit_wait);
cur_trans-in_commit = 0;
@@ -1566,7 +1564,6 @@ static inline void btrfs_wait_delalloc_flush(struct 
btrfs_fs_info *fs_info)
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 struct btrfs_root *root)
 {
-   unsigned long joined = 0;
struct btrfs_transaction *cur_trans = trans-transaction;
struct btrfs_transaction *prev_trans = NULL;
DEFINE_WAIT(wait);
@@ -1668,8 +1665,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle 
*trans,
should_grow = 1;
 
do {
-   joined = cur_trans-num_joined;
-
WARN_ON(cur_trans != trans-transaction);
 
ret = btrfs_flush_all_pending_stuffs(trans, root);
@@ -1685,8 +1680,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle 
*trans,
schedule_timeout(1);
 
finish_wait(cur_trans-writer_wait, wait);
-   } while (extwriter_counter_read(cur_trans)  0 ||
-(should_grow  cur_trans-num_joined != joined));
+   } while (extwriter_counter_read(cur_trans)  0);
 
ret = btrfs_flush_all_pending_stuffs(trans, root);
if (ret)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 5cc77b0..0fc45e2 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -37,8 +37,6 @@ struct btrfs_transaction {
atomic_t num_writers;
atomic_t use_count;
 
-   unsigned long num_joined;
-
spinlock_t commit_lock;
int in_commit;
int commit_done;
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 00/17] improve the block time during the transaction commit

2013-05-15 Thread Miao Xie

This patchset improve the problem that the transaction may be blocked for a
long time when it is being committed if there is heavy I/O.

In this patchset,
- 0001-0005, 0007, 0011-0012 are random fix or code cleanup patch.
- 0006, 0008-0010 introduce per-subvolume delalloc inode list and ordered
  extent list, which can reduce the flush time when we create snapshots.
- 0013-0016 improve the block time during the transaction commit by removing
  the while loop at the beginning of the transaction commit.
- 0017 improves the readability of the code.

Miao Xie (17):
  Btrfs: fix accessing a freed tree root
  Btrfs: fix unprotected root node of the subvolume's inode rb-tree
  Btrfs: pause the space balance when remounting to R/O
  Btrfs: remove BUG_ON() in btrfs_read_fs_tree_no_radix()
  Btrfs: cleanup the similar code of the fs root read
  Btrfs: introduce grab/put functions for the root of the fs/file tree
  Btrfs: don't invoke btrfs_invalidate_inodes() in the spin lock context
  Btrfs: introduce per-subvolume delalloc inode list
  Btrfs: introduce per-subvolume ordered extent list
  Btrfs: just flush the delalloc inodes in the source tree before snapshot 
creation
  Btrfs: cleanup unnecessary assignment when cleaning up all the residual 
transaction
  Btrfs: remove the code for the impossible case in cleanup_transaction()
  Btrfs: don't wait for all the writers circularly during the transaction commit
  Btrfs: don't flush the delalloc inodes in the while loop if flushoncommit is 
set
  Btrfs: remove unnecessary varient -num_joined in btrfs_transaction structure
  Btrfs: remove the time check in btrfs_commit_transaction()
  Btrfs: make the state of the transaction more readable

 fs/btrfs/ctree.h|  55 +--
 fs/btrfs/dev-replace.c  |   6 +-
 fs/btrfs/disk-io.c  | 425 +++-
 fs/btrfs/disk-io.h  |  32 +++-
 fs/btrfs/extent-tree.c  |  20 +--
 fs/btrfs/inode.c| 180 ++--
 fs/btrfs/ioctl.c|   6 +
 fs/btrfs/ordered-data.c | 109 +
 fs/btrfs/ordered-data.h |   2 +
 fs/btrfs/relocation.c   |   9 +-
 fs/btrfs/root-tree.c| 170 ++-
 fs/btrfs/super.c|   3 +-
 fs/btrfs/transaction.c  | 271 --
 fs/btrfs/transaction.h  |  49 --
 fs/btrfs/tree-log.c |   3 +-
 fs/btrfs/volumes.c  |  13 +-
 fs/btrfs/volumes.h  |   1 +
 17 files changed, 791 insertions(+), 563 deletions(-)

-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree

2013-05-15 Thread Miao Xie

The grab/put funtions will be used in the next patch, which need grab
the root object and ensure it is not freed. We use reference counter
instead of the srcu lock is to aovid blocking the memory reclaim task,
which invokes synchronize_srcu().

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/disk-io.c |  5 +++--
 fs/btrfs/disk-io.h | 21 +
 fs/btrfs/extent-tree.c |  2 +-
 4 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 845b77f..958ce6c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1739,6 +1739,7 @@ struct btrfs_root {
int force_cow;
 
spinlock_t root_item_lock;
+   atomic_t refs;
 };
 
 struct btrfs_ioctl_defrag_range_args {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 42d6ba2..642c861 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
atomic_set(root-log_writers, 0);
atomic_set(root-log_batch, 0);
atomic_set(root-orphan_inodes, 0);
+   atomic_set(root-refs, 1);
root-log_transid = 0;
root-last_log_commit = 0;
extent_io_tree_init(root-dirty_log_pages,
@@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
} else {
free_extent_buffer(gang[0]-node);
free_extent_buffer(gang[0]-commit_root);
-   kfree(gang[0]);
+   btrfs_put_fs_root(gang[0]);
}
}
 
@@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root)
kfree(root-free_ino_ctl);
kfree(root-free_ino_pinned);
kfree(root-name);
-   kfree(root);
+   btrfs_put_fs_root(root);
 }
 
 void btrfs_free_fs_root(struct btrfs_root *root)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 534d583..b71acd6e 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root 
*root);
 void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
 struct btrfs_root *root);
 void btrfs_free_fs_root(struct btrfs_root *root);
+
+/*
+ * This function is used to grab the root, and avoid it is freed when we
+ * access it. But it doesn't ensure that the tree is not dropped.
+ *
+ * If you want to ensure the whole tree is safe, you should use
+ * fs_info-subvol_srcu
+ */
+static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
+{
+   if (atomic_inc_not_zero(root-refs))
+   return root;
+   return NULL;
+}
+
+static inline void btrfs_put_fs_root(struct btrfs_root *root)
+{
+   if (atomic_dec_and_test(root-refs))
+   kfree(root);
+}
+
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
  int atomic);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 08e42c8..08f9862 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7463,7 +7463,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
} else {
free_extent_buffer(root-node);
free_extent_buffer(root-commit_root);
-   kfree(root);
+   btrfs_put_fs_root(root);
}
 out_end_trans:
btrfs_end_transaction_throttle(trans, tree_root);
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 0/8] Btrfs: introduce a tree for UUID to subvol ID mapping

2013-05-15 Thread Stefan Behrens

On Tue, 14 May 2013 19:11:54 +0200, Stefan Behrens wrote:
 On Tue, 14 May 2013 18:55:23 +0800, Liu Bo wrote:
 On Tue, May 14, 2013 at 11:36:52AM +0200, Stefan Behrens wrote:
 Mapping UUIDs to subvolume IDs is an operation with a high effort
 today. Today, the algorithm even has quadratic effort (based on the
 number of existing subvolumes), which means, that it takes minutes
 to send/receive a single subvolume if 10,000 subvolumes exist. But
 even linear effort would be too much since it is a waste. And these
 data structures to allow mapping UUIDs to subvolume IDs are created
 every time a btrfs send/receive instance is started.

 So the issue to address is that Btrfs send / receive does not work
 as it is today when a high number of subvolumes exist.

 It is much more efficient to maintain a searchable persistent data
 structure in the filesystem, one that is updated whenever a
 subvolume/snapshot is created and deleted, and when the received
 subvolume UUID is set by the btrfs-receive tool.

 Therefore kernel code is added that is able to maintain data
 structures in the filesystem that allow to quickly search for a
 given UUID and to retrieve the subvol ID.

 Now follows the lengthy justification, why a new tree was added
 instead of using the existing root tree:

 The first approach was to not create another tree that holds UUID
 items. Instead, the items should just go into the top root tree.
 Unfortunately this confused the algorithm to assign the objectid
 of subvolumes and snapshots. The reason is that
 btrfs_find_free_objectid() calls btrfs_find_highest_objectid() for
 the first created subvol or snapshot after mounting a filesystem,
 and this function simply searches for the largest used objectid in
 the root tree keys to pick the next objectid to assign. Of course,
 the UUID keys have always been the ones with the highest offset
 value, and the next assigned subvol ID was wastefully huge.

 To use any other existing tree did not look proper. To apply a
 workaround such as setting the objectid to zero in the UUID item
 key and to implement collision handling would either add
 limitations (in case of a btrfs_extend_item() approach to handle
 the collisions) or a lot of complexity and source code (in case a
 key would be looked up that is free of collisions). Adding new code
 that introduces limitations is not good, and adding code that is
 complex and lengthy for no good reason is also not good. That's the
 justification why a completely new tree was introduced.

 I'd appreciate if some performance number appear here since it's a speedup.
 
 That's a good idea. The numbers are below in the table and there's also a 
 link to a chart.
 
 I stopped the measurement with the old version after 1 subvolumes because 
 it already took almost 13 minutes to send a single, empty subvolume. All the 
 time is spent building a database, and this is done each time the btrfs send 
 or receive tool is started to send or receive a single subvolume.
 
 The table shows the time it takes to send a single, empty subvolume depending 
 on the number of subvolume that exist in the filesystem.
 
 # of subvols  | without| with
 in filesystem | UUID tree  | UUID tree
 --++--
 2 |  0m00.004s | 0m00.003s
  1000 |  0m07.010s | 0m00.004s
  2000 |  0m28.210s | 0m00.004s
  3000 |  1m04.872s | 0m00.004s
  4000 |  1m56.059s | 0m00.004s
  5000 |  3m00.489s | 0m00.004s
  6000 |  4m27.376s | 0m00.004s
  7000 |  6m08.938s | 0m00.004s
  8000 |  7m54.020s | 0m00.004s
  9000 | 10m05.108s | 0m00.004s
 1 | 12m47.406s | 0m00.004s
 
 Or as a chart:
 http://btrfs.giantdisaster.de/Btrfs-send-recv-perf.pdf

The table goes on like this for larger number of subvolumes (and the time value 
is always the time to transfer just _one_ of the subvolumes):

# of subvols  | without| with
in filesystem | UUID tree  | UUID tree
--++--
2 |  0m00.004s | 0m00.003s
 1000 |  0m07.010s | 0m00.004s
 2000 |  0m28.210s | 0m00.004s
 3000 |  1m04.872s | 0m00.004s
 4000 |  1m56.059s | 0m00.004s
 5000 |  3m00.489s | 0m00.004s
 6000 |  4m27.376s | 0m00.004s
 7000 |  6m08.938s | 0m00.004s
 8000 |  7m54.020s | 0m00.004s
 9000 | 10m05.108s | 0m00.004s
1 | 12m47.406s | 0m00.004s
11000 | 15m05.800s | 0m00.004s
12000 | 18m00.170s | 0m00.004s
13000 | 21m39.438s | 0m00.004s
14000 | 24m54.681s | 0m00.004s
15000 | 28m09.096s | 0m00.004s
16000 | 33m08.856s | 0m00.004s
17000 | 37m10.562s | 0m00.004s
18000 | 41m44.727s | 0m00.004s
19000 | 46m14.335s | 0m00.004s
2 | 51m55.100s | 0m00.004s
21000 | 56m54.346s | 0m00.004s
22000 | 62m53.466s | 0m00.004s
23000 | 66m57.328s | 0m00.004s
24000 | 73m59.687s |

Re: [PATCH] Btrfs: increase the max global reserve size to 1gig

2013-05-15 Thread David Sterba

On Fri, May 03, 2013 at 11:28:35PM +0800, Liu Bo wrote:
 On Fri, May 03, 2013 at 08:56:54AM -0400, Josef Bacik wrote:
  Apparently 512mb was too small, with a fs_mark command we could get so much
  delayed work built up that we'd never trip the lets commit the transaction
  logic until we'd gotten too much delayed refs built up.  Increasing this to 
  1
  gig makes us much safer and we no longer abort with Dave's fs_mark tester.
  Thanks,
 
 I remember that last time I made a similar commit, but users complains that
 they cannot boot their system on root btrfs partition due to lacking space and
 Chris eventually got to revert that one...

That was long time ago (3.5) and enospc has been updated since, so this
needs re-evaluation if it still leads to early enospc.

This patch is not in next queue, i wonder if your concern is the reason
of that.

david
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] xfstests btrfs/284: shorten duration, fix output

2013-05-15 Thread Rich Johnston


On 04/26/2013 01:45 PM, Eric Sandeen wrote:

test 284 had... some issues.




diff --git a/tests/btrfs/284 b/tests/btrfs/284
old mode 100644
new mode 100755
index d952977..67161a3
--- a/tests/btrfs/284
+++ b/tests/btrfs/284





This patch has been committed:

commit 91f87e3b89c0f7350a56d397ba7255d0e3d6b486
Author: Eric Sandeen sand...@redhat.com
Date:   Wed May 15 07:18:07 2013 -0500

xfstests btrfs/284: shorten duration, fix output

Signed-off-by: Eric Sandeen sand...@redhat.com
Reviewed-by: Josef Bacik jba...@fusionio.com
Reviewed-by: Liu Bo bo.li@oracle.com

Thanks
--Rich
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 0/8] Btrfs: introduce a tree for UUID to subvol ID mapping

2013-05-15 Thread David Sterba

On Tue, May 14, 2013 at 07:11:54PM +0200, Stefan Behrens wrote:
 # of subvols  | without| with
 in filesystem | UUID tree  | UUID tree
 --++--
 2 |  0m00.004s | 0m00.003s
  1000 |  0m07.010s | 0m00.004s
  2000 |  0m28.210s | 0m00.004s
  3000 |  1m04.872s | 0m00.004s
  4000 |  1m56.059s | 0m00.004s
  5000 |  3m00.489s | 0m00.004s
  6000 |  4m27.376s | 0m00.004s
  7000 |  6m08.938s | 0m00.004s
  8000 |  7m54.020s | 0m00.004s
  9000 | 10m05.108s | 0m00.004s
 1 | 12m47.406s | 0m00.004s

It looks like a copy paste error in the 3rd column. No really, this
is a great improvement!

david
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: I/O errors block the entire filesystem

2013-05-15 Thread Josef Bacik

On Sat, May 11, 2013 at 01:16:38AM -0600, Alexandre Oliva wrote:
 On Apr  4, 2013, Alexandre Oliva ol...@gnu.org wrote:
 
  I've been trying to figure out the btrfs I/O stack to try to understand
  why, sometimes (but not always), after a failure to read a (data
  non-replicated) block from the disk, the file being accessed becomes
  permanently locked, and the filesystem, unmountable.
 
 So, after some further investigation, we could determine that the
 problem was that end_bio_extent_readpage would unlock_extent_cached only
 part of the page, because it had previously computed whole_page as zero
 because of the nonzero bv_offset.
 
 So I started hunting for some place that would set up the bio with
 partial pages, and I failed.
 
 I was already suspecting some race condition or other form of corruption
 of the bvec before it got to end_bio_extent_readpage when I realized
 that the bv_offset was always a multiple of 512 bytes, and it
 represented the offset into the 4KiB page that the sector that failed to
 read was going to occupy.
 
 So I started hunting for places that modified bv_offset, and I found
 blk_update_request in fs/blk-core.c, where the very error message
 reporting the failed sector was output.
 
 The conclusion is that we cannot assume bvec is unmodified between our
 submitting the bio and our getting an error back.
 
 OTOH, I don't see that we ever set up bvecs that do not correspond to
 whole pages.  Indeed, my attempts to catch such situations with a
 wrapper around bio_add_page got no hits whatsoever, which suggests we
 could just do away with the whole_page computation, and take
 bv_offset+bv_len == PAGE_CACHE_SIZE as the requested read size.
 
 With this patch, after a read error, I get an EIO rather than a process
 hang that causes further attempts to access the file to hang, generally
 in a non-interruptible way.  Yay!
 
 

 btrfs: do away with non-whole_page extent I/O
 
 From: Alexandre Oliva ol...@gnu.org
 
 end_bio_extent_readpage computes whole_page based on bv_offset and
 bv_len, without taking into account that blk_update_request may modify
 them when some of the blocks to be read into a page produce a read
 error.  This would cause the read to unlock only part of the file
 range associated with the page, which would in turn leave the entire
 page locked, which would not only keep the process blocked instead of
 returning -EIO to it, but also prevent any further access to the file.
 
 It turns out that btrfs always issues whole-page reads and writes.
 The special handling of non-whole_page appears to be a mistake or a
 left-over from a time when this wasn't the case.  Indeed,
 end_bio_extent_writepage distinguished between whole_page and
 non-whole_page writes but behaved identically in both cases!
 
 I've replaced the whole_page computations with warnings, just to be
 sure that we're not issuing partial page reads or writes.  The
 warnings should probably just go away some time.
 
 Signed-off-by: Alexandre Oliva ol...@gnu.org
 ---
  fs/btrfs/extent_io.c |   85 
 ++
  1 file changed, 30 insertions(+), 55 deletions(-)
 
 diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
 index cdee391..f44b033 100644
 --- a/fs/btrfs/extent_io.c
 +++ b/fs/btrfs/extent_io.c
 @@ -1873,28 +1873,6 @@ static void check_page_uptodate(struct extent_io_tree 
 *tree, struct page *page)
  }
  
  /*
 - * helper function to unlock a page if all the extents in the tree
 - * for that page are unlocked
 - */
 -static void check_page_locked(struct extent_io_tree *tree, struct page *page)
 -{
 - u64 start = page_offset(page);
 - u64 end = start + PAGE_CACHE_SIZE - 1;
 - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
 - unlock_page(page);
 -}
 -
 -/*
 - * helper function to end page writeback if all the extents
 - * in the tree for that page are done with writeback
 - */
 -static void check_page_writeback(struct extent_io_tree *tree,
 -  struct page *page)
 -{
 - end_page_writeback(page);
 -}
 -
 -/*
   * When IO fails, either with EIO or csum verification fails, we
   * try other mirrors that might have a good copy of the data.  This
   * io_failure_record is used to record state as we go through all the
 @@ -2323,19 +2301,24 @@ static void end_bio_extent_writepage(struct bio *bio, 
 int err)
   struct extent_io_tree *tree;
   u64 start;
   u64 end;
 - int whole_page;
  
   do {
   struct page *page = bvec-bv_page;
   tree = BTRFS_I(page-mapping-host)-io_tree;
  
 - start = page_offset(page) + bvec-bv_offset;
 - end = start + bvec-bv_len - 1;
 + /* We always issue full-page reads, but if some block
 +  * in a page fails to read, blk_update_request() will
 +  * advance bv_offset and adjust bv_len to compensate.
 +  * Print a warning for nonzero offsets, and an error
 +

[PATCH] Btrfs-progs: detect when scrub is started twice

2013-05-15 Thread Stefan Behrens

Check whether any involved device is already busy running a
scrub. This would cause damaged status messages and the state
aborted without the explanation that a scrub was already
running. Therefore check it first, prevent it and give some
feedback to the user if scrub is already running.
Note that if scrub is started with a block device as the
parameter, only that particular block device is checked. It
is a normal mode of operation to start scrub on multiple
single devices, there is no reason to prevent this.

Here is an example:
/mnt2 is the mountpoint of a filesystem.
/dev/sdk and /dev/sdl are the block devices for that filesystem.

case 1:
btrfs scrub start /mnt2
btrfs scrub start /mnt2
- complain

case 1:
btrfs scrub start /dev/sdk
btrfs scrub start /dev/sdk
- complain

case 3:
btrfs scrub start /dev/sdk
btrfs scrub start /dev/sdl
- don't complain

case 4:
btrfs scrub start /dev/sdk
btrfs scrub start /mnt2
- complain

case 5:
btrfs scrub start /mnt2
btrfs scrub start /dev/sdk
- complain if the scrub on /dev/sdk is still running.
- don't complain if the scrub on /dev/sdk is finished, the
   status messages will be fine.

Reported-by: David Sterba dste...@suse.cz
Signed-off-by: Stefan Behrens sbehr...@giantdisaster.de
---
 cmds-scrub.c | 42 ++
 1 file changed, 42 insertions(+)

diff --git a/cmds-scrub.c b/cmds-scrub.c
index c0dc584..95dfee3 100644
--- a/cmds-scrub.c
+++ b/cmds-scrub.c
@@ -1025,6 +1025,27 @@ int mkdir_p(char *path)
return 0;
 }
 
+static int is_scrub_running_on_fs(struct btrfs_ioctl_fs_info_args *fi_args,
+ struct btrfs_ioctl_dev_info_args *di_args,
+ struct scrub_file_record **past_scrubs)
+{
+   int i;
+
+   if (!fi_args || !di_args || !past_scrubs)
+   return 0;
+
+   for (i = 0; i  fi_args-num_devices; i++) {
+   struct scrub_file_record *sfr =
+   last_dev_scrub(past_scrubs, di_args[i].devid);
+
+   if (!sfr)
+   continue;
+   if (!(sfr-stats.finished || sfr-stats.canceled))
+   return 1;
+   }
+   return 0;
+}
+
 static const char * const cmd_scrub_start_usage[];
 static const char * const cmd_scrub_resume_usage[];
 
@@ -1162,6 +1183,27 @@ static int scrub_start(int argc, char **argv, int resume)
close(fdres);
}
 
+   /*
+* check whether any involved device is already busy running a
+* scrub. This would cause damaged status messages and the state
+* aborted without the explanation that a scrub was already
+* running. Therefore check it first, prevent it and give some
+* feedback to the user if scrub is already running.
+* Note that if scrub is started with a block device as the
+* parameter, only that particular block device is checked. It
+* is a normal mode of operation to start scrub on multiple
+* single devices, there is no reason to prevent this.
+*/
+   if (is_scrub_running_on_fs(fi_args, di_args, past_scrubs)) {
+   ERR(!do_quiet,
+   ERROR: scrub is already running.\n
+   To cancel use 'btrfs scrub cancel %s'.\n
+   To see the status use 'btrfs scrub status [-d] %s'.\n,
+   path, path);
+   err = 1;
+   goto out;
+   }
+
t_devs = malloc(fi_args.num_devices * sizeof(*t_devs));
sp = calloc(fi_args.num_devices, sizeof(*sp));
spc.progress = calloc(fi_args.num_devices * 2, sizeof(*spc.progress));
-- 
1.8.2.3

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] Btrfs: increase the max global reserve size to 1gig

2013-05-15 Thread David Sterba

On Fri, May 03, 2013 at 08:56:54AM -0400, Josef Bacik wrote:
 Apparently 512mb was too small, with a fs_mark command we could get so much
 delayed work built up that we'd never trip the lets commit the transaction
 logic until we'd gotten too much delayed refs built up.  Increasing this to 1
 gig makes us much safer and we no longer abort with Dave's fs_mark tester.

With this patch on top of today's next I still hit the abort and will
file a bug.

david
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

kernel BUG at fs/btrfs/free-space-cache.c:1567!, Kernel 3.9.1

2013-05-15 Thread Philipp Dreimann

Hello,

my btrfs filesystem was not mountable anymore after a loss of power:

kernel BUG at fs/btrfs/free-space-cache.c:1567!
invalid opcode:  [#1] SMP
Modules linked in: btrfs libcrc32c xor zlib_deflate raid6_pq i915(+)
i2c_algo_bit drm_kms_helper drm i2c_core video uinput
CPU 3
Pid: 147, comm: mount Not tainted 3.9.1-301.fc19.x86_64 #1 LENOVO
4290W4H/4290W4H
RIP: 0010:[a01c5984]  [a01c5984]
remove_from_bitmap+0x1f4/0x210 [btrfs]
RSP: 0018:88020fddd698  EFLAGS: 00010287
RAX: 001454248000 RBX: 880211399180 RCX: 001454253000
RDX: 00013000 RSI: 51c0 RDI: 2e40
RBP: 88020fddd6e8 R08: 88020fecf5c8 R09: 00145940
R10: 8802124ce240 R11: 880211399198 R12: 880210025240
R13: 88020fddd708 R14: 88020fddd700 R15: 8802113991a8
FS:  7f019f213880() GS:88021e2c() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 7f4cdd4223f0 CR3: 00020ff03000 CR4: 000407e0
DR0:  DR1:  DR2: 
DR3:  DR6: 0ff0 DR7: 0400
Process mount (pid: 147, threadinfo 88020fddc000, task 8802100b4650)
Stack:
 88020fddd6e8 00145940 880210025264 001454253000
 00013000 880210025240 880210025264 
 880210025248 88020ff4ec00 88020fddd738 a01c714b
Call Trace:
 [a01c714b] btrfs_remove_free_space+0x5b/0x290 [btrfs]
 [810810e0] ? wake_up_bit+0x30/0x30
 [a0176bff] btrfs_alloc_logged_file_extent+0x1af/0x1d0 [btrfs]
 [a0162e86] ? btrfs_free_path+0x26/0x30 [btrfs]
 [a01c1ef7] replay_one_extent+0x607/0x690 [btrfs]
 [a01c224b] replay_one_buffer+0x2cb/0x390 [btrfs]
 [a01a77b4] ? alloc_extent_buffer+0x94/0x3f0 [btrfs]
 [a01bd484] walk_down_log_tree+0x224/0x410 [btrfs]
 [a01bd95f] walk_log_tree+0xbf/0x1f0 [btrfs]
 [a01c43c1] btrfs_recover_log_trees+0x1f1/0x360 [btrfs]
 [a01c1f80] ? replay_one_extent+0x690/0x690 [btrfs]
 [a0186a1b] open_ctree+0x18eb/0x1f90 [btrfs]
 [812dac54] ? disk_name+0x54/0xb0
 [a015f94e] btrfs_mount+0x5ce/0x6c0 [btrfs]
 [8129525d] ? selinux_sb_copy_data+0x14d/0x220
 [8119d1a9] mount_fs+0x39/0x1b0
 [811b635f] vfs_kern_mount+0x5f/0xf0
 [a015f4fc] btrfs_mount+0x17c/0x6c0 [btrfs]
 [8129525d] ? selinux_sb_copy_data+0x14d/0x220
 [8119d1a9] mount_fs+0x39/0x1b0
 [811b635f] vfs_kern_mount+0x5f/0xf0
 [811b862e] do_mount+0x23e/0xa20
 [811b8276] ? copy_mount_options+0x36/0x170
 [811b8e93] sys_mount+0x83/0xc0
 [8164db19] system_call_fastpath+0x16/0x1b
 Code: 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 1f 40 00 31 c0 48 83 7b 20
00 75 e4 48 89 de 4c 89 e7 89 45 c0 e8 b1 ed ff ff 8b 45 c0 eb d1 0f
0b e8 69 a4 02 00 0f 1f 44 00 00 e8 59 a4 02 00 66 66 2e 0f
 RIP  [a01c5984] remove_from_bitmap+0x1f4/0x210 [btrfs]
 RSP 88020fddd698

Mounting the fs with -o recovery,ro, and using btrfsck did not work.
btrfsck segfaulted, but I can not provide the error message.
btrfs-zero-log allowed me to mount the fs again.

BR,
 Philipp Dreimann
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] Btrfs-progs: fix missing recow roots when making btrfs filesystem

2013-05-15 Thread David Sterba

On Tue, May 14, 2013 at 07:50:28PM +0800, Wang Shilong wrote:
 When making btrfs filesystem. we firstly write root leaf to
 specified filed, and then we recow the root. If we don't recow,
 some trees are not in the correct block group.
 
 Steps to reproduce:
   dd if=/dev/zero of=test.img bs=1M count=100
   mkfs.btrfs -f test.img
   btrfs-debug-tree test.img

Very simple reproducer, I guess that it gets fixed with first update of
the misplaced blocks.

 --- a/mkfs.c
 +++ b/mkfs.c
 @@ -151,37 +151,55 @@ static int recow_roots(struct btrfs_trans_handle *trans,
   int ret;
   struct extent_buffer *tmp;
   struct btrfs_fs_info *info = root-fs_info;
 + u64 generation;
  
 - ret = __btrfs_cow_block(trans, info-fs_root, info-fs_root-node,
 - NULL, 0, tmp, 0, 0);
 - BUG_ON(ret);
 - free_extent_buffer(tmp);
 + generation = btrfs_root_generation(info-fs_root-root_item);
 + if (generation != trans-transid) {
 + ret = __btrfs_cow_block(trans, info-fs_root,
 + info-fs_root-node, NULL, 0, tmp, 0, 0);
 + BUG_ON(ret);
 + free_extent_buffer(tmp);
 + }

This gets repeated, please use wrapper for that.

david
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: kernel BUG at fs/btrfs/free-space-cache.c:1567!, Kernel 3.9.1

2013-05-15 Thread Harald Glatt

On Wed, May 15, 2013 at 4:19 PM, Philipp Dreimann phil...@dreimann.net wrote:
 Hello,

 my btrfs filesystem was not mountable anymore after a loss of power:

 kernel BUG at fs/btrfs/free-space-cache.c:1567!
 invalid opcode:  [#1] SMP
 Modules linked in: btrfs libcrc32c xor zlib_deflate raid6_pq i915(+)
 i2c_algo_bit drm_kms_helper drm i2c_core video uinput
 CPU 3
 Pid: 147, comm: mount Not tainted 3.9.1-301.fc19.x86_64 #1 LENOVO
 4290W4H/4290W4H
 RIP: 0010:[a01c5984]  [a01c5984]
 remove_from_bitmap+0x1f4/0x210 [btrfs]
 RSP: 0018:88020fddd698  EFLAGS: 00010287
 RAX: 001454248000 RBX: 880211399180 RCX: 001454253000
 RDX: 00013000 RSI: 51c0 RDI: 2e40
 RBP: 88020fddd6e8 R08: 88020fecf5c8 R09: 00145940
 R10: 8802124ce240 R11: 880211399198 R12: 880210025240
 R13: 88020fddd708 R14: 88020fddd700 R15: 8802113991a8
 FS:  7f019f213880() GS:88021e2c() knlGS:
 CS:  0010 DS:  ES:  CR0: 80050033
 CR2: 7f4cdd4223f0 CR3: 00020ff03000 CR4: 000407e0
 DR0:  DR1:  DR2: 
 DR3:  DR6: 0ff0 DR7: 0400
 Process mount (pid: 147, threadinfo 88020fddc000, task 8802100b4650)
 Stack:
  88020fddd6e8 00145940 880210025264 001454253000
  00013000 880210025240 880210025264 
  880210025248 88020ff4ec00 88020fddd738 a01c714b
 Call Trace:
  [a01c714b] btrfs_remove_free_space+0x5b/0x290 [btrfs]
  [810810e0] ? wake_up_bit+0x30/0x30
  [a0176bff] btrfs_alloc_logged_file_extent+0x1af/0x1d0 [btrfs]
  [a0162e86] ? btrfs_free_path+0x26/0x30 [btrfs]
  [a01c1ef7] replay_one_extent+0x607/0x690 [btrfs]
  [a01c224b] replay_one_buffer+0x2cb/0x390 [btrfs]
  [a01a77b4] ? alloc_extent_buffer+0x94/0x3f0 [btrfs]
  [a01bd484] walk_down_log_tree+0x224/0x410 [btrfs]
  [a01bd95f] walk_log_tree+0xbf/0x1f0 [btrfs]
  [a01c43c1] btrfs_recover_log_trees+0x1f1/0x360 [btrfs]
  [a01c1f80] ? replay_one_extent+0x690/0x690 [btrfs]
  [a0186a1b] open_ctree+0x18eb/0x1f90 [btrfs]
  [812dac54] ? disk_name+0x54/0xb0
  [a015f94e] btrfs_mount+0x5ce/0x6c0 [btrfs]
  [8129525d] ? selinux_sb_copy_data+0x14d/0x220
  [8119d1a9] mount_fs+0x39/0x1b0
  [811b635f] vfs_kern_mount+0x5f/0xf0
  [a015f4fc] btrfs_mount+0x17c/0x6c0 [btrfs]
  [8129525d] ? selinux_sb_copy_data+0x14d/0x220
  [8119d1a9] mount_fs+0x39/0x1b0
  [811b635f] vfs_kern_mount+0x5f/0xf0
  [811b862e] do_mount+0x23e/0xa20
  [811b8276] ? copy_mount_options+0x36/0x170
  [811b8e93] sys_mount+0x83/0xc0
  [8164db19] system_call_fastpath+0x16/0x1b
  Code: 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 1f 40 00 31 c0 48 83 7b 20
 00 75 e4 48 89 de 4c 89 e7 89 45 c0 e8 b1 ed ff ff 8b 45 c0 eb d1 0f
 0b e8 69 a4 02 00 0f 1f 44 00 00 e8 59 a4 02 00 66 66 2e 0f
  RIP  [a01c5984] remove_from_bitmap+0x1f4/0x210 [btrfs]
  RSP 88020fddd698

 Mounting the fs with -o recovery,ro, and using btrfsck did not work.
 btrfsck segfaulted, but I can not provide the error message.
 btrfs-zero-log allowed me to mount the fs again.

 BR,
  Philipp Dreimann
 --
 To unsubscribe from this list: send the line unsubscribe linux-btrfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

Looks to me like you should have been able to mount with -o
clear_cache. Did you try that? Does it work again now?
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: kernel BUG at fs/btrfs/free-space-cache.c:1567!, Kernel 3.9.1

2013-05-15 Thread Philipp Dreimann

On 15 May 2013 17:00, Harald Glatt m...@hachre.de wrote:
 On Wed, May 15, 2013 at 4:19 PM, Philipp Dreimann phil...@dreimann.net 
 wrote:
 Hello,

 my btrfs filesystem was not mountable anymore after a loss of power:

 kernel BUG at fs/btrfs/free-space-cache.c:1567!
 invalid opcode:  [#1] SMP
 Modules linked in: btrfs libcrc32c xor zlib_deflate raid6_pq i915(+)
 i2c_algo_bit drm_kms_helper drm i2c_core video uinput
 CPU 3
 Pid: 147, comm: mount Not tainted 3.9.1-301.fc19.x86_64 #1 LENOVO
 4290W4H/4290W4H
 RIP: 0010:[a01c5984]  [a01c5984]
 remove_from_bitmap+0x1f4/0x210 [btrfs]
 RSP: 0018:88020fddd698  EFLAGS: 00010287
 RAX: 001454248000 RBX: 880211399180 RCX: 001454253000
 RDX: 00013000 RSI: 51c0 RDI: 2e40
 RBP: 88020fddd6e8 R08: 88020fecf5c8 R09: 00145940
 R10: 8802124ce240 R11: 880211399198 R12: 880210025240
 R13: 88020fddd708 R14: 88020fddd700 R15: 8802113991a8
 FS:  7f019f213880() GS:88021e2c() knlGS:
 CS:  0010 DS:  ES:  CR0: 80050033
 CR2: 7f4cdd4223f0 CR3: 00020ff03000 CR4: 000407e0
 DR0:  DR1:  DR2: 
 DR3:  DR6: 0ff0 DR7: 0400
 Process mount (pid: 147, threadinfo 88020fddc000, task 8802100b4650)
 Stack:
  88020fddd6e8 00145940 880210025264 001454253000
  00013000 880210025240 880210025264 
  880210025248 88020ff4ec00 88020fddd738 a01c714b
 Call Trace:
  [a01c714b] btrfs_remove_free_space+0x5b/0x290 [btrfs]
  [810810e0] ? wake_up_bit+0x30/0x30
  [a0176bff] btrfs_alloc_logged_file_extent+0x1af/0x1d0 [btrfs]
  [a0162e86] ? btrfs_free_path+0x26/0x30 [btrfs]
  [a01c1ef7] replay_one_extent+0x607/0x690 [btrfs]
  [a01c224b] replay_one_buffer+0x2cb/0x390 [btrfs]
  [a01a77b4] ? alloc_extent_buffer+0x94/0x3f0 [btrfs]
  [a01bd484] walk_down_log_tree+0x224/0x410 [btrfs]
  [a01bd95f] walk_log_tree+0xbf/0x1f0 [btrfs]
  [a01c43c1] btrfs_recover_log_trees+0x1f1/0x360 [btrfs]
  [a01c1f80] ? replay_one_extent+0x690/0x690 [btrfs]
  [a0186a1b] open_ctree+0x18eb/0x1f90 [btrfs]
  [812dac54] ? disk_name+0x54/0xb0
  [a015f94e] btrfs_mount+0x5ce/0x6c0 [btrfs]
  [8129525d] ? selinux_sb_copy_data+0x14d/0x220
  [8119d1a9] mount_fs+0x39/0x1b0
  [811b635f] vfs_kern_mount+0x5f/0xf0
  [a015f4fc] btrfs_mount+0x17c/0x6c0 [btrfs]
  [8129525d] ? selinux_sb_copy_data+0x14d/0x220
  [8119d1a9] mount_fs+0x39/0x1b0
  [811b635f] vfs_kern_mount+0x5f/0xf0
  [811b862e] do_mount+0x23e/0xa20
  [811b8276] ? copy_mount_options+0x36/0x170
  [811b8e93] sys_mount+0x83/0xc0
  [8164db19] system_call_fastpath+0x16/0x1b
  Code: 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 1f 40 00 31 c0 48 83 7b 20
 00 75 e4 48 89 de 4c 89 e7 89 45 c0 e8 b1 ed ff ff 8b 45 c0 eb d1 0f
 0b e8 69 a4 02 00 0f 1f 44 00 00 e8 59 a4 02 00 66 66 2e 0f
  RIP  [a01c5984] remove_from_bitmap+0x1f4/0x210 [btrfs]
  RSP 88020fddd698

 Mounting the fs with -o recovery,ro, and using btrfsck did not work.
 btrfsck segfaulted, but I can not provide the error message.
 btrfs-zero-log allowed me to mount the fs again.

 BR,
  Philipp Dreimann
 --
 To unsubscribe from this list: send the line unsubscribe linux-btrfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

 Looks to me like you should have been able to mount with -o
 clear_cache. Did you try that? Does it work again now?

I did not try that. The filesystem works since a btrfs-zero-log execution again!
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 4/8] Btrfs: maintain subvolume items in the UUID tree

2013-05-15 Thread Stefan Behrens

On Tue, 14 May 2013 18:44:11 +0800, Liu Bo wrote:
 On Tue, May 14, 2013 at 11:36:56AM +0200, Stefan Behrens wrote:
 @@ -396,7 +403,7 @@ static noinline int create_subvol(struct inode *dir,
   * of create_snapshot().
   */
  ret = btrfs_subvolume_reserve_metadata(root, block_rsv,
 -   7, qgroup_reserved);
 +   8, qgroup_reserved);
  if (ret)
  return ret;
 
 This uuid_root will not use trans-block_rsv but empty_rsv since it does not 
 set
 ref_cow, so you don't need to add one more to block_rsv, and same for
 the below cases.

Hi Liu Bo,

Thanks for your review comments!

You are right, this won't work because the empty_block_rsv is used for
the UUID tree as it is now.

I need to avoid ENOSPC in the middle of the transaction. Can you please
acknowledge or comment the following addition to get_block_rsv()? And
the plus 1 that I have added to the transaction reservations for
subvolume/snapshot creation/destruction will work correctly afterwards.

--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4259,6 +4259,9 @@ static struct btrfs_block_rsv *get_block_rsv(
if (root == root-fs_info-csum_root  trans-adding_csums)
block_rsv = trans-block_rsv;

+   if (root == root-fs_info-uuid_root)
+   block_rsv = trans-block_rsv;
+
if (!block_rsv)
block_rsv = root-block_rsv;



 @@ -567,9 +578,10 @@ static int create_snapshot(struct btrfs_root *root, 
 struct inode *dir,
   * 1 - root item
   * 2 - root ref/backref
   * 1 - root of snapshot
 + * 1 - UUID item
   */
  ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)-root,
 -pending_snapshot-block_rsv, 7,
 +pending_snapshot-block_rsv, 8,
  pending_snapshot-qgroup_reserved);
  if (ret)
  goto out;

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: subvol copying

2013-05-15 Thread Chris Murphy


On May 15, 2013, at 1:40 AM, Gabriel de Perthuis g2p.c...@gmail.com wrote:

 
 You can move subvolumes at any time, as if they were regular directories.

In the example case, the subvolumes are read-only. So is it possible to make a 
read-only subvolume (snapshot) read-writable? And is it possible to make a 
read-writeable snapshot of a read-only subvolume?


Chris Murphy

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: subvol copying

2013-05-15 Thread Harald Glatt

On Wed, May 15, 2013 at 6:43 PM, Chris Murphy li...@colorremedies.com wrote:

 On May 15, 2013, at 1:40 AM, Gabriel de Perthuis g2p.c...@gmail.com wrote:


 You can move subvolumes at any time, as if they were regular directories.

 In the example case, the subvolumes are read-only. So is it possible to make 
 a read-only subvolume (snapshot) read-writable? And is it possible to make a 
 read-writeable snapshot of a read-only subvolume?


 Chris Murphy

 --
 To unsubscribe from this list: send the line unsubscribe linux-btrfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

You make a ro snapshot rw by creating a snapshot of it that is rw. So
yes to both questions, by doing the same thing in both cases.
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: subvol copying

2013-05-15 Thread Chris Murphy


On May 15, 2013, at 10:44 AM, Harald Glatt m...@hachre.de wrote:

 
 You make a ro snapshot rw by creating a snapshot of it that is rw. So
 yes to both questions, by doing the same thing in both cases.

In other words, a normal snapshot (without -r) of a read-only snapshot will 
create a rw snapshot?

In any case, for the OP, making a rw snapshot of the last backup ro snapshot, 
and putting it in place of the home folder is the way to do the rollback.


Chris Murphy--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: subvol copying

2013-05-15 Thread Harald Glatt

On Wed, May 15, 2013 at 7:28 PM, Chris Murphy li...@colorremedies.com wrote:

 On May 15, 2013, at 10:44 AM, Harald Glatt m...@hachre.de wrote:


 You make a ro snapshot rw by creating a snapshot of it that is rw. So
 yes to both questions, by doing the same thing in both cases.

 In other words, a normal snapshot (without -r) of a read-only snapshot will 
 create a rw snapshot?

Yes
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Btrfs-progs: add a newline to a free space cache message

2013-05-15 Thread Josef Bacik

Left out a newline in the generation check printf.

Signed-off-by: Josef Bacik jba...@fusionio.com
---
 free-space-cache.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/free-space-cache.c b/free-space-cache.c
index 5fb8ece..a30438b 100644
--- a/free-space-cache.c
+++ b/free-space-cache.c
@@ -314,7 +314,7 @@ int __load_free_space_cache(struct btrfs_root *root,
struct btrfs_inode_item);
if (btrfs_inode_generation(leaf, inode_item) != generation) {
printf(free space inode generation (%llu) did not match 
-  free space cache generation (%llu),
+  free space cache generation (%llu)\n,
   (unsigned long long)btrfs_inode_generation(leaf,
  inode_item),
   (unsigned long long)generation);
-- 
1.7.7.6

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: I/O errors block the entire filesystem

2013-05-15 Thread Alexandre Oliva

On May 14, 2013, Liu Bo bo.li@oracle.com wrote:

 In one of the failures that caused machine load spikes, I tried to
 collect info on active processes with perf top and SysRq-T, but nothing
 there seemed to explain the spike.  Thoughts on how to figure out what's
 causing this?

 Although I've seen your solution patch in this thread, I'm still curious
 about this senario, could you please share the reproducer script or
 something?

I'm afraid I don't have one.  I just use the filesystem on various
disks, with ceph osds and other non-ceph subvolumes and files, and
occasionally I run into one of these bad blocks and the filesystem gets
into these odd states.

 I guess that you're using '-l 64k -n 64k' for mkfs.btrfs

That is correct, but IIUC this should only affect metadata, and metadata
recovery from the DUP block works.  It's data (single copy) that fails
as described.

-- 
Alexandre Oliva, freedom fighterhttp://FSFLA.org/~lxoliva/
You must be the change you wish to see in the world. -- Gandhi
Be Free! -- http://FSFLA.org/   FSF Latin America board member
Free Software Evangelist  Red Hat Brazil Compiler Engineer
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: I/O errors block the entire filesystem

2013-05-15 Thread Alexandre Oliva

On May 15, 2013, Josef Bacik jba...@fusionio.com wrote:

 So this should only happen in the case that you are on a dm device it looks
 like, is that how you are running?

That was my first thought, but no, I'm using partitions out of the SATA
disks directly.  I even checked for stray dm out of fake raid or
somesuch, but the dm modules were not even loaded, and perusing
/sys/block confirms the “scsi” devices are actual ATA disks.

Further investigation suggested that when individual 512-byte blocks are
read from a disk (that's the block size reported by the kernel), the
underlying disk driver is supposed to inform the upper layer about what
it could read by updating the bio_vec bits in precisely the observed
way.

-- 
Alexandre Oliva, freedom fighterhttp://FSFLA.org/~lxoliva/
You must be the change you wish to see in the world. -- Gandhi
Be Free! -- http://FSFLA.org/   FSF Latin America board member
Free Software Evangelist  Red Hat Brazil Compiler Engineer
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point

2013-05-15 Thread Eric Wong

Zach Brown z...@redhat.com wrote:
 This adds a syscall and vfs entry point for clone_range which offloads
 data copying between existing files.
 
 The syscall is a thin wrapper around the vfs entry point.  Its arguments
 are inspired by sys_splice().

Why introduce a new syscall instead of extending sys_splice?

Perhaps adding a new SPLICE_F_COPYRANGE flag to avoid compatibility
problems with userspace which may expect ESPIPE when given two non-pipes
would be useful.

If the user doesn't need a out offset, then sendfile() should also be
able to transparently utilize COPY/CLONE_RANGE, too.
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v0 1/4] vfs: add copy_range syscall and vfs entry point

2013-05-15 Thread Zach Brown

On Wed, May 15, 2013 at 07:44:05PM +, Eric Wong wrote:
 Why introduce a new syscall instead of extending sys_splice?

Personally, I think it's ugly to have different operations use the same
syscall just because their arguments match.

But that preference aside, sure, if the consensus is that we'd rather
use the splice() entry point then I can duck tape the pieces together to
make it work.

 If the user doesn't need a out offset, then sendfile() should also be
 able to transparently utilize COPY/CLONE_RANGE, too.

Perhaps, yeah.

- z
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v0 4/4] nfs, nfsd: rough sys_copy_range and COPY support

2013-05-15 Thread J. Bruce Fields

On Tue, May 14, 2013 at 02:15:26PM -0700, Zach Brown wrote:
 This crude patch illustrates the simplest plumbing involved in
 supporting sys_call_range with the NFS COPY operation that's pending in
 the 4.2 draft spec.
 
 The patch is based on a previous prototype that used the COPY op to
 implement sys_copyfileat which created a new file (based on the ocfs2
 reflink ioctl).  By contrast, this copies file contents between existing
 files.
 
 There's still a lot of implementation and testing to do, but this can
 get discussion going.

I'm using:

git://github.com/loghyr/NFSv4.2

as my reference for the draft protocol.

On a quick skim, one thing this is missing before it complies is a
client implementation of CB_OFFLOAD: If a client desires an
intra-server file copy, then it MUST support the COPY and CB_OFFLOAD
operations.

The server doesn't have to implement CB_OFFLOAD, though, so we should
ditch these todo's:

 +/*
 + * XXX:
 + *  - do something with stateids :)
 + *  - implement callback results and OFFLOAD_ABORT
 + *  - inter-server copies?
 + */

...

 + /* don't support async callbacks yet */

...

lest someone go try to implement them for no reason.  (Stranger things
have happened.)

Nits, possibly to ignore for now:

 + copy-u.ok.cr_callback_id_length = 0;
 +
 + return status;
 +}
 +
  /* This routine never returns NFS_OK!  If there are no other errors, it
   * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
   * attributes matched.  VERIFY is implemented by mapping NFSERR_SAME
 @@ -1798,6 +1829,10 @@ static struct nfsd4_operation nfsd4_ops[] = {
   .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
   .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
   },
 + [OP_COPY] = {
 + .op_func = (nfsd4op_func)nfsd4_copy,
 + .op_name = OP_COPY,
 + },

There's some more boilerplate to fill in (see other ops).

 +static __be32
  nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
  {
   return nfs_ok;
 @@ -1557,6 +1577,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
   [OP_WANT_DELEGATION]= (nfsd4_dec)nfsd4_decode_notsupp,
   [OP_DESTROY_CLIENTID]   = (nfsd4_dec)nfsd4_decode_destroy_clientid,
   [OP_RECLAIM_COMPLETE]   = (nfsd4_dec)nfsd4_decode_reclaim_complete,
 + [OP_COPY]   = (nfsd4_dec)nfsd4_decode_copy,

And this should be made 4.2-specific.

  };
  
  struct nfsd4_minorversion_ops {
 @@ -3394,6 +3415,27 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres 
 *resp, __be32 nfserr,
  }
  
  static __be32
 +nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
 + struct nfsd4_copy *copy)
 +{
 + __be32 *p;
 +
 + if (!nfserr) {
 + RESERVE_SPACE(4);
 + WRITE32(copy-u.ok.cr_callback_id_length);
 + ADJUST_ARGS();
 + if (copy-u.ok.cr_callback_id_length == 1)
 + nfsd4_encode_stateid(resp, copy-u.ok.cr_callback_id);
 + } else {
 + RESERVE_SPACE(8);
 + WRITE64(copy-u.cr_bytes_copied);
 + ADJUST_ARGS();
 + }
 +
 + return nfserr;
 +}
 +
 +static __be32
  nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
  {
   return nfserr;
 @@ -3465,6 +3507,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
   [OP_WANT_DELEGATION]= (nfsd4_enc)nfsd4_encode_noop,
   [OP_DESTROY_CLIENTID]   = (nfsd4_enc)nfsd4_encode_noop,
   [OP_RECLAIM_COMPLETE]   = (nfsd4_enc)nfsd4_encode_noop,
 + [OP_COPY]   = (nfsd4_enc)nfsd4_encode_copy,
  };
  
  /*
 diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
 index 84ce601..0c1b427 100644
 --- a/fs/nfsd/vfs.c
 +++ b/fs/nfsd/vfs.c
 @@ -28,6 +28,8 @@
  #include asm/uaccess.h
  #include linux/exportfs.h
  #include linux/writeback.h
 +#include linux/fs_struct.h
 +#include linux/kmod.h
  
  #ifdef CONFIG_NFSD_V3
  #include xdr3.h
 @@ -621,6 +623,45 @@ int nfsd4_is_junction(struct dentry *dentry)
   return 0;
   return 1;
  }
 +
 +__be32
 +nfsd_copy_range(struct svc_rqst *rqstp, struct svc_fh *fhp_in, u64 pos_in,
 + struct svc_fh *fhp_out, u64 pos_out, u64 count)
 +{
 + struct file *filp_in = NULL;
 + struct file *filp_out = NULL;
 + int err;
 +
 + /* XXX verify pos and count within sane limits? */
 +
 + err = nfsd_open(rqstp, fhp_in, S_IFREG, NFSD_MAY_READ, filp_in);
 + if (err)
 + goto out;
 +
 + err = nfsd_open(rqstp, fhp_out, S_IFREG, NFSD_MAY_WRITE, filp_out);
 + if (err)
 + goto out;

Looking at the xdr... the COPY operation takes stateid's, which nfsd can
use to look up files, so the opens shouldn't be required.

--b.

 +
 + err = vfs_copy_range(filp_in, pos_in, filp_out, pos_out, count);
 + /* fall back if .copy_range isn't supported */
 +
 + if (!err  EX_ISSYNC(fhp_out-fh_export))
 + err = vfs_fsync_range(filp_out, pos_out, pos_out + count-1,

Re: [RFC v0 4/4] nfs, nfsd: rough sys_copy_range and COPY support

2013-05-15 Thread Myklebust, Trond

On Wed, 2013-05-15 at 16:19 -0400, J. Bruce Fields wrote:
 On Tue, May 14, 2013 at 02:15:26PM -0700, Zach Brown wrote:
  This crude patch illustrates the simplest plumbing involved in
  supporting sys_call_range with the NFS COPY operation that's pending in
  the 4.2 draft spec.
  
  The patch is based on a previous prototype that used the COPY op to
  implement sys_copyfileat which created a new file (based on the ocfs2
  reflink ioctl).  By contrast, this copies file contents between existing
  files.
  
  There's still a lot of implementation and testing to do, but this can
  get discussion going.
 
 I'm using:
 
   git://github.com/loghyr/NFSv4.2
 
 as my reference for the draft protocol.
 
 On a quick skim, one thing this is missing before it complies is a
 client implementation of CB_OFFLOAD: If a client desires an
 intra-server file copy, then it MUST support the COPY and CB_OFFLOAD
 operations.

Note that Bryan is currently working on updating the NFS implementation
to match the draft protocol.

-- 
Trond Myklebust
Linux NFS client maintainer

NetApp
trond.mykleb...@netapp.com
www.netapp.com
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v0 4/4] nfs, nfsd: rough sys_copy_range and COPY support

2013-05-15 Thread J. Bruce Fields

On Wed, May 15, 2013 at 08:21:54PM +, Myklebust, Trond wrote:
 On Wed, 2013-05-15 at 16:19 -0400, J. Bruce Fields wrote:
  On Tue, May 14, 2013 at 02:15:26PM -0700, Zach Brown wrote:
   This crude patch illustrates the simplest plumbing involved in
   supporting sys_call_range with the NFS COPY operation that's pending in
   the 4.2 draft spec.
   
   The patch is based on a previous prototype that used the COPY op to
   implement sys_copyfileat which created a new file (based on the ocfs2
   reflink ioctl).  By contrast, this copies file contents between existing
   files.
   
   There's still a lot of implementation and testing to do, but this can
   get discussion going.
  
  I'm using:
  
  git://github.com/loghyr/NFSv4.2
  
  as my reference for the draft protocol.
  
  On a quick skim, one thing this is missing before it complies is a
  client implementation of CB_OFFLOAD: If a client desires an
  intra-server file copy, then it MUST support the COPY and CB_OFFLOAD
  operations.
 
 Note that Bryan is currently working on updating the NFS implementation
 to match the draft protocol.

OK, good.--b.
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] btrfs-progs: restore: use long option for the path regex

2013-05-15 Thread David Sterba

Current way of specifying the path to match is not very comfortable, but
the feature itself is very useful. Let's save the short option -m for a
more user friendly syntax and keep a long option --path-regex with the
current syntax.

CC: Peter Stuge pe...@stuge.se
Signed-off-by: David Sterba dste...@suse.cz
---
 cmds-restore.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/cmds-restore.c b/cmds-restore.c
index 52852bc..8a4f75a 100644
--- a/cmds-restore.c
+++ b/cmds-restore.c
@@ -32,6 +32,7 @@
 #include lzo/lzo1x.h
 #include zlib.h
 #include regex.h
+#include getopt.h
 
 #include ctree.h
 #include disk-io.h
@@ -924,6 +925,11 @@ out:
return ret;
 }
 
+static struct option long_options[] = {
+   { path-regex, 1, NULL, 256},
+   { 0, 0, 0, 0}
+};
+
 const char * const cmd_restore_usage[] = {
btrfs restore [options] device,
Try to restore files from a damaged filesystem (unmounted),
@@ -936,6 +942,10 @@ const char * const cmd_restore_usage[] = {
-f offset filesystem location,
-u block  super mirror,
-d  find dir,
+   --path-regex regex,
+   restore only filenames matching regex,,
+   you have to use following syntax (possibly quoted):,
+   ^/(|home(|/username(|/Desktop(|/.*$,
NULL
 };
 
@@ -950,6 +960,7 @@ int cmd_restore(int argc, char **argv)
int len;
int ret;
int opt;
+   int option_index = 0;
int super_mirror = 0;
int find_dir = 0;
int list_roots = 0;
@@ -958,7 +969,8 @@ int cmd_restore(int argc, char **argv)
regex_t match_reg, *mreg = NULL;
char reg_err[256];
 
-   while ((opt = getopt(argc, argv, sviot:u:df:r:lcm:)) != -1) {
+   while ((opt = getopt_long(argc, argv, sviot:u:df:r:lc, long_options,
+   option_index)) != -1) {
 
switch (opt) {
case 's':
@@ -1016,7 +1028,8 @@ int cmd_restore(int argc, char **argv)
case 'c':
match_cflags |= REG_ICASE;
break;
-   case 'm':
+   /* long option without single letter alternative */
+   case 256:
match_regstr = optarg;
break;
default:
-- 
1.8.2

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 4/8] Btrfs: maintain subvolume items in the UUID tree

2013-05-15 Thread Liu Bo

On Wed, May 15, 2013 at 05:39:35PM +0200, Stefan Behrens wrote:
 On Tue, 14 May 2013 18:44:11 +0800, Liu Bo wrote:
  On Tue, May 14, 2013 at 11:36:56AM +0200, Stefan Behrens wrote:
  @@ -396,7 +403,7 @@ static noinline int create_subvol(struct inode *dir,
  * of create_snapshot().
  */
 ret = btrfs_subvolume_reserve_metadata(root, block_rsv,
  - 7, qgroup_reserved);
  + 8, qgroup_reserved);
 if (ret)
 return ret;
  
  This uuid_root will not use trans-block_rsv but empty_rsv since it does 
  not set
  ref_cow, so you don't need to add one more to block_rsv, and same for
  the below cases.
 
 Hi Liu Bo,
 
 Thanks for your review comments!
 
 You are right, this won't work because the empty_block_rsv is used for
 the UUID tree as it is now.
 
 I need to avoid ENOSPC in the middle of the transaction. Can you please
 acknowledge or comment the following addition to get_block_rsv()? And
 the plus 1 that I have added to the transaction reservations for
 subvolume/snapshot creation/destruction will work correctly afterwards.

Looks good.

(Although I'm more looking forward to Josef's metadata enospc killer
patch, that way we won't ever need to deal with all of this buggy stuff ;))

thanks,
liubo

 
 --- a/fs/btrfs/extent-tree.c
 +++ b/fs/btrfs/extent-tree.c
 @@ -4259,6 +4259,9 @@ static struct btrfs_block_rsv *get_block_rsv(
 if (root == root-fs_info-csum_root  trans-adding_csums)
 block_rsv = trans-block_rsv;
 
 +   if (root == root-fs_info-uuid_root)
 +   block_rsv = trans-block_rsv;
 +
 if (!block_rsv)
 block_rsv = root-block_rsv;
 
 
 
  @@ -567,9 +578,10 @@ static int create_snapshot(struct btrfs_root *root, 
  struct inode *dir,
  * 1 - root item
  * 2 - root ref/backref
  * 1 - root of snapshot
  +   * 1 - UUID item
  */
 ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)-root,
  -  pending_snapshot-block_rsv, 7,
  +  pending_snapshot-block_rsv, 8,
 pending_snapshot-qgroup_reserved);
 if (ret)
 goto out;
 
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 10/17] Btrfs: just flush the delalloc inodes in the source tree before snapshot creation

2013-05-15 Thread Liu Bo

On Wed, May 15, 2013 at 03:48:24PM +0800, Miao Xie wrote:
 Before applying this patch, we need flush all the delalloc inodes in
 the fs when we want to create a snapshot, it wastes time, and make
 the transaction commit be blocked for a long time. It means some other
 user operation would also be blocked for a long time.
 
 This patch improves this problem, we just flush the delalloc inodes that
 in the source trees before snapshot creation, so the transaction commit
 will complete quickly.
 
 Signed-off-by: Miao Xie mi...@cn.fujitsu.com
 ---
  fs/btrfs/ioctl.c   |  6 ++
  fs/btrfs/transaction.c | 10 +-
  2 files changed, 7 insertions(+), 9 deletions(-)
 
 diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
 index 0de4a2f..2677dcc 100644
 --- a/fs/btrfs/ioctl.c
 +++ b/fs/btrfs/ioctl.c
 @@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, 
 struct inode *dir,
   if (!root-ref_cows)
   return -EINVAL;
  
 + ret = btrfs_start_delalloc_inodes(root, 0);
 + if (ret)
 + return ret;
 +
 + btrfs_wait_ordered_extents(root, 0);
 +

Does this look too radical?  Does this snapshot creation ioctl block all
writes on its src root?
No, we can only be sure that there is no ordered extents being added until
setting trans_no_join, and then it's safe to create pending snapshots.

thanks,
liubo

   pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
   if (!pending_snapshot)
   return -ENOMEM;
 diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
 index 2b17213..bc22be9 100644
 --- a/fs/btrfs/transaction.c
 +++ b/fs/btrfs/transaction.c
 @@ -1491,17 +1491,9 @@ static int btrfs_flush_all_pending_stuffs(struct 
 btrfs_trans_handle *trans,
 struct btrfs_root *root)
  {
   int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
 - int snap_pending = 0;
   int ret;
  
 - if (!flush_on_commit) {
 - spin_lock(root-fs_info-trans_lock);
 - if (!list_empty(trans-transaction-pending_snapshots))
 - snap_pending = 1;
 - spin_unlock(root-fs_info-trans_lock);
 - }
 -
 - if (flush_on_commit || snap_pending) {
 + if (flush_on_commit) {
   ret = btrfs_start_all_delalloc_inodes(root-fs_info, 1);
   if (ret)
   return ret;
 -- 
 1.8.1.4
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-btrfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree

2013-05-15 Thread Liu Bo

On Wed, May 15, 2013 at 03:48:20PM +0800, Miao Xie wrote:
 The grab/put funtions will be used in the next patch, which need grab
 the root object and ensure it is not freed. We use reference counter
 instead of the srcu lock is to aovid blocking the memory reclaim task,
 which invokes synchronize_srcu().
 

I don't think this is necessary, we put 'kfree(root)' because we really
need to free them at the very end time, when there should be no inodes
linking on the root(we should have cleaned all inodes out from it).

So when we flush delalloc inodes and wait for ordered extents to finish,
the root should be valid, otherwise someone is doing wrong things.

And even with this grab_fs_root to avoid freeing root, it's just the
root that remains in memory, all its attributes, like root-node,
commit_root, root-inode_tree, are already NULL or empty.

thanks,
liubo

 Signed-off-by: Miao Xie mi...@cn.fujitsu.com
 ---
  fs/btrfs/ctree.h   |  1 +
  fs/btrfs/disk-io.c |  5 +++--
  fs/btrfs/disk-io.h | 21 +
  fs/btrfs/extent-tree.c |  2 +-
  4 files changed, 26 insertions(+), 3 deletions(-)
 
 diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
 index 845b77f..958ce6c 100644
 --- a/fs/btrfs/ctree.h
 +++ b/fs/btrfs/ctree.h
 @@ -1739,6 +1739,7 @@ struct btrfs_root {
   int force_cow;
  
   spinlock_t root_item_lock;
 + atomic_t refs;
  };
  
  struct btrfs_ioctl_defrag_range_args {
 diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
 index 42d6ba2..642c861 100644
 --- a/fs/btrfs/disk-io.c
 +++ b/fs/btrfs/disk-io.c
 @@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, 
 u32 sectorsize,
   atomic_set(root-log_writers, 0);
   atomic_set(root-log_batch, 0);
   atomic_set(root-orphan_inodes, 0);
 + atomic_set(root-refs, 1);
   root-log_transid = 0;
   root-last_log_commit = 0;
   extent_io_tree_init(root-dirty_log_pages,
 @@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
   } else {
   free_extent_buffer(gang[0]-node);
   free_extent_buffer(gang[0]-commit_root);
 - kfree(gang[0]);
 + btrfs_put_fs_root(gang[0]);
   }
   }
  
 @@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root)
   kfree(root-free_ino_ctl);
   kfree(root-free_ino_pinned);
   kfree(root-name);
 - kfree(root);
 + btrfs_put_fs_root(root);
  }
  
  void btrfs_free_fs_root(struct btrfs_root *root)
 diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
 index 534d583..b71acd6e 100644
 --- a/fs/btrfs/disk-io.h
 +++ b/fs/btrfs/disk-io.h
 @@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root 
 *root);
  void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
  void btrfs_free_fs_root(struct btrfs_root *root);
 +
 +/*
 + * This function is used to grab the root, and avoid it is freed when we
 + * access it. But it doesn't ensure that the tree is not dropped.
 + *
 + * If you want to ensure the whole tree is safe, you should use
 + *   fs_info-subvol_srcu
 + */
 +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
 +{
 + if (atomic_inc_not_zero(root-refs))
 + return root;
 + return NULL;
 +}
 +
 +static inline void btrfs_put_fs_root(struct btrfs_root *root)
 +{
 + if (atomic_dec_and_test(root-refs))
 + kfree(root);
 +}
 +
  void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
  int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
 int atomic);
 diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
 index 08e42c8..08f9862 100644
 --- a/fs/btrfs/extent-tree.c
 +++ b/fs/btrfs/extent-tree.c
 @@ -7463,7 +7463,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
   } else {
   free_extent_buffer(root-node);
   free_extent_buffer(root-commit_root);
 - kfree(root);
 + btrfs_put_fs_root(root);
   }
  out_end_trans:
   btrfs_end_transaction_throttle(trans, tree_root);
 -- 
 1.8.1.4
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-btrfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 10/17] Btrfs: just flush the delalloc inodes in the source tree before snapshot creation

2013-05-15 Thread Miao Xie

On Thu, 16 May 2013 11:20:39 +0800, Liu Bo wrote:
 On Wed, May 15, 2013 at 03:48:24PM +0800, Miao Xie wrote:
 Before applying this patch, we need flush all the delalloc inodes in
 the fs when we want to create a snapshot, it wastes time, and make
 the transaction commit be blocked for a long time. It means some other
 user operation would also be blocked for a long time.

 This patch improves this problem, we just flush the delalloc inodes that
 in the source trees before snapshot creation, so the transaction commit
 will complete quickly.

 Signed-off-by: Miao Xie mi...@cn.fujitsu.com
 ---
  fs/btrfs/ioctl.c   |  6 ++
  fs/btrfs/transaction.c | 10 +-
  2 files changed, 7 insertions(+), 9 deletions(-)

 diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
 index 0de4a2f..2677dcc 100644
 --- a/fs/btrfs/ioctl.c
 +++ b/fs/btrfs/ioctl.c
 @@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, 
 struct inode *dir,
  if (!root-ref_cows)
  return -EINVAL;
  
 +ret = btrfs_start_delalloc_inodes(root, 0);
 +if (ret)
 +return ret;
 +
 +btrfs_wait_ordered_extents(root, 0);
 +
 
 Does this look too radical?  Does this snapshot creation ioctl block all
 writes on its src root?

I don't think it is radical, and I think flushing delalloc inodes during the 
transaction commit
is stupid, especially flushing all the inodes including the roots which are not 
going to be snapshoted.
Because it will block the operations of the users (such as mkdir, rmdir, create 
and so on) for
a long time if there are lots of dirty pages.

And The snapshot creation now doesn't block the writes of the source root at 
all, there is no
appreciable difference between this way and the background flusher.

 No, we can only be sure that there is no ordered extents being added until
 setting trans_no_join, and then it's safe to create pending snapshots.

Actually, we can not avoid that the new ordered extents are added before 
trans_no_join is set.
But for the users, the 1st case below must be handled correctly, but the 2nd 
one can be ignored
because we can see the write of the 2nd case as the one that happens after the 
snapshot creation.

1st case:
Task
write data into a file
make a snapshot

2nd case:
Task0   Task1
make a snapshot
  flush delalloc inodes
write data into a file
  commit transaction
create_pending_snapshot

Thanks
Miao

 
 thanks,
 liubo
 
  pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
  if (!pending_snapshot)
  return -ENOMEM;
 diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
 index 2b17213..bc22be9 100644
 --- a/fs/btrfs/transaction.c
 +++ b/fs/btrfs/transaction.c
 @@ -1491,17 +1491,9 @@ static int btrfs_flush_all_pending_stuffs(struct 
 btrfs_trans_handle *trans,
struct btrfs_root *root)
  {
  int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
 -int snap_pending = 0;
  int ret;
  
 -if (!flush_on_commit) {
 -spin_lock(root-fs_info-trans_lock);
 -if (!list_empty(trans-transaction-pending_snapshots))
 -snap_pending = 1;
 -spin_unlock(root-fs_info-trans_lock);
 -}
 -
 -if (flush_on_commit || snap_pending) {
 +if (flush_on_commit) {
  ret = btrfs_start_all_delalloc_inodes(root-fs_info, 1);
  if (ret)
  return ret;
 -- 
 1.8.1.4

 --
 To unsubscribe from this list: send the line unsubscribe linux-btrfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree

2013-05-15 Thread Miao Xie

On thu, 16 May 2013 11:36:46 +0800, Liu Bo wrote:
 On Wed, May 15, 2013 at 03:48:20PM +0800, Miao Xie wrote:
 The grab/put funtions will be used in the next patch, which need grab
 the root object and ensure it is not freed. We use reference counter
 instead of the srcu lock is to aovid blocking the memory reclaim task,
 which invokes synchronize_srcu().

 
 I don't think this is necessary, we put 'kfree(root)' because we really
 need to free them at the very end time, when there should be no inodes
 linking on the root(we should have cleaned all inodes out from it).
 
 So when we flush delalloc inodes and wait for ordered extents to finish,
 the root should be valid, otherwise someone is doing wrong things.
 
 And even with this grab_fs_root to avoid freeing root, it's just the
 root that remains in memory, all its attributes, like root-node,
 commit_root, root-inode_tree, are already NULL or empty.

Please consider the case:
Task1   Task2   Cleaner
get the root
flush all delalloc inodes
drop subvolume
iput the last inode
  move the root into the dead list
drop 
subvolume

kfree(root)
If Task1 accesses the root now, oops will happen.

I introduced there two functions just to protect the access of the root object, 
not its
attributes, so don't worry about the attributes. (Please see the first sentence 
of the
changelog.)

Thanks
Miao

 
 thanks,
 liubo
 
 Signed-off-by: Miao Xie mi...@cn.fujitsu.com
 ---
  fs/btrfs/ctree.h   |  1 +
  fs/btrfs/disk-io.c |  5 +++--
  fs/btrfs/disk-io.h | 21 +
  fs/btrfs/extent-tree.c |  2 +-
  4 files changed, 26 insertions(+), 3 deletions(-)

 diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
 index 845b77f..958ce6c 100644
 --- a/fs/btrfs/ctree.h
 +++ b/fs/btrfs/ctree.h
 @@ -1739,6 +1739,7 @@ struct btrfs_root {
  int force_cow;
  
  spinlock_t root_item_lock;
 +atomic_t refs;
  };
  
  struct btrfs_ioctl_defrag_range_args {
 diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
 index 42d6ba2..642c861 100644
 --- a/fs/btrfs/disk-io.c
 +++ b/fs/btrfs/disk-io.c
 @@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, 
 u32 sectorsize,
  atomic_set(root-log_writers, 0);
  atomic_set(root-log_batch, 0);
  atomic_set(root-orphan_inodes, 0);
 +atomic_set(root-refs, 1);
  root-log_transid = 0;
  root-last_log_commit = 0;
  extent_io_tree_init(root-dirty_log_pages,
 @@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
  } else {
  free_extent_buffer(gang[0]-node);
  free_extent_buffer(gang[0]-commit_root);
 -kfree(gang[0]);
 +btrfs_put_fs_root(gang[0]);
  }
  }
  
 @@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root)
  kfree(root-free_ino_ctl);
  kfree(root-free_ino_pinned);
  kfree(root-name);
 -kfree(root);
 +btrfs_put_fs_root(root);
  }
  
  void btrfs_free_fs_root(struct btrfs_root *root)
 diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
 index 534d583..b71acd6e 100644
 --- a/fs/btrfs/disk-io.h
 +++ b/fs/btrfs/disk-io.h
 @@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root 
 *root);
  void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
   struct btrfs_root *root);
  void btrfs_free_fs_root(struct btrfs_root *root);
 +
 +/*
 + * This function is used to grab the root, and avoid it is freed when we
 + * access it. But it doesn't ensure that the tree is not dropped.
 + *
 + * If you want to ensure the whole tree is safe, you should use
 + *  fs_info-subvol_srcu
 + */
 +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
 +{
 +if (atomic_inc_not_zero(root-refs))
 +return root;
 +return NULL;
 +}
 +
 +static inline void btrfs_put_fs_root(struct btrfs_root *root)
 +{
 +if (atomic_dec_and_test(root-refs))
 +kfree(root);
 +}
 +
  void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
  int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
 diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
 index 08e42c8..08f9862 100644
 --- a/fs/btrfs/extent-tree.c
 +++ b/fs/btrfs/extent-tree.c
 @@ -7463,7 +7463,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
  } else {
  free_extent_buffer(root-node);
  free_extent_buffer(root-commit_root);
 -kfree(root);
 +btrfs_put_fs_root(root);
  }
  out_end_trans:
  btrfs_end_transaction_throttle(trans, tree_root);

Re: [PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree

2013-05-15 Thread Liu Bo

On Thu, May 16, 2013 at 12:31:11PM +0800, Miao Xie wrote:
 On thu, 16 May 2013 11:36:46 +0800, Liu Bo wrote:
  On Wed, May 15, 2013 at 03:48:20PM +0800, Miao Xie wrote:
  The grab/put funtions will be used in the next patch, which need grab
  the root object and ensure it is not freed. We use reference counter
  instead of the srcu lock is to aovid blocking the memory reclaim task,
  which invokes synchronize_srcu().
 
  
  I don't think this is necessary, we put 'kfree(root)' because we really
  need to free them at the very end time, when there should be no inodes
  linking on the root(we should have cleaned all inodes out from it).
  
  So when we flush delalloc inodes and wait for ordered extents to finish,
  the root should be valid, otherwise someone is doing wrong things.
  
  And even with this grab_fs_root to avoid freeing root, it's just the
  root that remains in memory, all its attributes, like root-node,
  commit_root, root-inode_tree, are already NULL or empty.
 
 Please consider the case:
   Task1   Task2   Cleaner
   get the root
   flush all delalloc inodes
   drop subvolume
   iput the last inode
 move the root into the dead list
   drop 
 subvolume
   
 kfree(root)
 If Task1 accesses the root now, oops will happen.

Then it's task1's fault, why it is not protected by subvol_srcu section when
it's possible that someone like task2 sets root's refs to 0?

synchronize_srcu(subvol_srcu) before adding root into dead root list is
just for this race case, why do we need another?

thanks,
liubo

 
 I introduced there two functions just to protect the access of the root 
 object, not its
 attributes, so don't worry about the attributes. (Please see the first 
 sentence of the
 changelog.)
 
 Thanks
 Miao
 
  
  thanks,
  liubo
  
  Signed-off-by: Miao Xie mi...@cn.fujitsu.com
  ---
   fs/btrfs/ctree.h   |  1 +
   fs/btrfs/disk-io.c |  5 +++--
   fs/btrfs/disk-io.h | 21 +
   fs/btrfs/extent-tree.c |  2 +-
   4 files changed, 26 insertions(+), 3 deletions(-)
 
  diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
  index 845b77f..958ce6c 100644
  --- a/fs/btrfs/ctree.h
  +++ b/fs/btrfs/ctree.h
  @@ -1739,6 +1739,7 @@ struct btrfs_root {
 int force_cow;
   
 spinlock_t root_item_lock;
  +  atomic_t refs;
   };
   
   struct btrfs_ioctl_defrag_range_args {
  diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
  index 42d6ba2..642c861 100644
  --- a/fs/btrfs/disk-io.c
  +++ b/fs/btrfs/disk-io.c
  @@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, 
  u32 sectorsize,
 atomic_set(root-log_writers, 0);
 atomic_set(root-log_batch, 0);
 atomic_set(root-orphan_inodes, 0);
  +  atomic_set(root-refs, 1);
 root-log_transid = 0;
 root-last_log_commit = 0;
 extent_io_tree_init(root-dirty_log_pages,
  @@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info 
  *fs_info)
 } else {
 free_extent_buffer(gang[0]-node);
 free_extent_buffer(gang[0]-commit_root);
  -  kfree(gang[0]);
  +  btrfs_put_fs_root(gang[0]);
 }
 }
   
  @@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root)
 kfree(root-free_ino_ctl);
 kfree(root-free_ino_pinned);
 kfree(root-name);
  -  kfree(root);
  +  btrfs_put_fs_root(root);
   }
   
   void btrfs_free_fs_root(struct btrfs_root *root)
  diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
  index 534d583..b71acd6e 100644
  --- a/fs/btrfs/disk-io.h
  +++ b/fs/btrfs/disk-io.h
  @@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct 
  btrfs_root *root);
   void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
  struct btrfs_root *root);
   void btrfs_free_fs_root(struct btrfs_root *root);
  +
  +/*
  + * This function is used to grab the root, and avoid it is freed when we
  + * access it. But it doesn't ensure that the tree is not dropped.
  + *
  + * If you want to ensure the whole tree is safe, you should use
  + *fs_info-subvol_srcu
  + */
  +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root 
  *root)
  +{
  +  if (atomic_inc_not_zero(root-refs))
  +  return root;
  +  return NULL;
  +}
  +
  +static inline void btrfs_put_fs_root(struct btrfs_root *root)
  +{
  +  if (atomic_dec_and_test(root-refs))
  +  kfree(root);
  +}
  +
   void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
   int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
   int atomic);
  diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
  index

Re: [PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree

2013-05-15 Thread Miao Xie

On Thu, 16 May 2013 13:15:57 +0800, Liu Bo wrote:
 On Thu, May 16, 2013 at 12:31:11PM +0800, Miao Xie wrote:
 On thu, 16 May 2013 11:36:46 +0800, Liu Bo wrote:
 On Wed, May 15, 2013 at 03:48:20PM +0800, Miao Xie wrote:
 The grab/put funtions will be used in the next patch, which need grab
 the root object and ensure it is not freed. We use reference counter
 instead of the srcu lock is to aovid blocking the memory reclaim task,
 which invokes synchronize_srcu().


 I don't think this is necessary, we put 'kfree(root)' because we really
 need to free them at the very end time, when there should be no inodes
 linking on the root(we should have cleaned all inodes out from it).

 So when we flush delalloc inodes and wait for ordered extents to finish,
 the root should be valid, otherwise someone is doing wrong things.

 And even with this grab_fs_root to avoid freeing root, it's just the
 root that remains in memory, all its attributes, like root-node,
 commit_root, root-inode_tree, are already NULL or empty.

 Please consider the case:
  Task1   Task2   Cleaner
  get the root
  flush all delalloc inodes
  drop subvolume
  iput the last inode
move the root into the dead list
  drop 
 subvolume
  
 kfree(root)
 If Task1 accesses the root now, oops will happen.
 
 Then it's task1's fault, why it is not protected by subvol_srcu section when
 it's possible that someone like task2 sets root's refs to 0?
 
 synchronize_srcu(subvol_srcu) before adding root into dead root list is
 just for this race case, why do we need another?

Please read my changelog.

Miao

 
 thanks,
 liubo
 

 I introduced there two functions just to protect the access of the root 
 object, not its
 attributes, so don't worry about the attributes. (Please see the first 
 sentence of the
 changelog.)

 Thanks
 Miao


 thanks,
 liubo

 Signed-off-by: Miao Xie mi...@cn.fujitsu.com
 ---
  fs/btrfs/ctree.h   |  1 +
  fs/btrfs/disk-io.c |  5 +++--
  fs/btrfs/disk-io.h | 21 +
  fs/btrfs/extent-tree.c |  2 +-
  4 files changed, 26 insertions(+), 3 deletions(-)

 diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
 index 845b77f..958ce6c 100644
 --- a/fs/btrfs/ctree.h
 +++ b/fs/btrfs/ctree.h
 @@ -1739,6 +1739,7 @@ struct btrfs_root {
int force_cow;
  
spinlock_t root_item_lock;
 +  atomic_t refs;
  };
  
  struct btrfs_ioctl_defrag_range_args {
 diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
 index 42d6ba2..642c861 100644
 --- a/fs/btrfs/disk-io.c
 +++ b/fs/btrfs/disk-io.c
 @@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, 
 u32 sectorsize,
atomic_set(root-log_writers, 0);
atomic_set(root-log_batch, 0);
atomic_set(root-orphan_inodes, 0);
 +  atomic_set(root-refs, 1);
root-log_transid = 0;
root-last_log_commit = 0;
extent_io_tree_init(root-dirty_log_pages,
 @@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info 
 *fs_info)
} else {
free_extent_buffer(gang[0]-node);
free_extent_buffer(gang[0]-commit_root);
 -  kfree(gang[0]);
 +  btrfs_put_fs_root(gang[0]);
}
}
  
 @@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root)
kfree(root-free_ino_ctl);
kfree(root-free_ino_pinned);
kfree(root-name);
 -  kfree(root);
 +  btrfs_put_fs_root(root);
  }
  
  void btrfs_free_fs_root(struct btrfs_root *root)
 diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
 index 534d583..b71acd6e 100644
 --- a/fs/btrfs/disk-io.h
 +++ b/fs/btrfs/disk-io.h
 @@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct 
 btrfs_root *root);
  void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
 struct btrfs_root *root);
  void btrfs_free_fs_root(struct btrfs_root *root);
 +
 +/*
 + * This function is used to grab the root, and avoid it is freed when we
 + * access it. But it doesn't ensure that the tree is not dropped.
 + *
 + * If you want to ensure the whole tree is safe, you should use
 + *fs_info-subvol_srcu
 + */
 +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root 
 *root)
 +{
 +  if (atomic_inc_not_zero(root-refs))
 +  return root;
 +  return NULL;
 +}
 +
 +static inline void btrfs_put_fs_root(struct btrfs_root *root)
 +{
 +  if (atomic_dec_and_test(root-refs))
 +  kfree(root);
 +}
 +
  void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
  int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
  int atomic);
 diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
 index 08e42c8..08f9862 100644
 ---

51 matches

Mail list logo