[PATCH 13/17] Btrfs: don't wait for all the writers circularly during the transaction commit
btrfs_commit_transaction has the following loop before we commit the transaction. do { // attempt to do some useful stuff and/or sleep } while (atomic_read(cur_trans-num_writers) 1 || (should_grow cur_trans-num_joined != joined)); This is used to prevent from the TRANS_START to get in the way of a committing transaction. But it does not prevent from TRANS_JOIN, that is we would do this loop for a long time if some writers JOIN the current transaction endlessly. Because we need join the current transaction to do some useful stuff, we can not block TRANS_JOIN here. So we introduce a external writer counter, which is used to count the TRANS_USERSPACE/TRANS_START writers. If the external writer counter is zero, we can break the above loop. In order to make the code more clear, we don't use enum variant to define the type of the transaction handle, use bitmask instead. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 55 ++ fs/btrfs/transaction.h | 31 2 files changed, 65 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index cf8706c..fd319b2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -51,17 +51,41 @@ static noinline void switch_commit_root(struct btrfs_root *root) } static inline int can_join_transaction(struct btrfs_transaction *trans, - int type) + unsigned int type) { return !(trans-in_commit -type != TRANS_JOIN -type != TRANS_JOIN_NOLOCK); +(type TRANS_EXTWRITERS)); +} + +static inline void extwriter_counter_inc(struct btrfs_transaction *trans, +unsigned int type) +{ + if (type TRANS_EXTWRITERS) + atomic_inc(trans-num_extwriters); +} + +static inline void extwriter_counter_dec(struct btrfs_transaction *trans, +unsigned int type) +{ + if (type TRANS_EXTWRITERS) + atomic_dec(trans-num_extwriters); +} + +static inline void extwriter_counter_init(struct btrfs_transaction *trans, + unsigned int type) +{ + atomic_set(trans-num_extwriters, ((type TRANS_EXTWRITERS) ? 
1 : 0)); +} + +static inline int extwriter_counter_read(struct btrfs_transaction *trans) +{ + return atomic_read(trans-num_extwriters); } /* * either allocate a new transaction or hop into the existing one */ -static noinline int join_transaction(struct btrfs_root *root, int type) +static noinline int join_transaction(struct btrfs_root *root, unsigned int type) { struct btrfs_transaction *cur_trans; struct btrfs_fs_info *fs_info = root-fs_info; @@ -99,6 +123,7 @@ loop: } atomic_inc(cur_trans-use_count); atomic_inc(cur_trans-num_writers); + extwriter_counter_inc(cur_trans, type); cur_trans-num_joined++; spin_unlock(fs_info-trans_lock); return 0; @@ -131,6 +156,7 @@ loop: } atomic_set(cur_trans-num_writers, 1); + extwriter_counter_init(cur_trans, type); cur_trans-num_joined = 0; init_waitqueue_head(cur_trans-writer_wait); init_waitqueue_head(cur_trans-commit_wait); @@ -307,7 +333,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type) } static struct btrfs_trans_handle * -start_transaction(struct btrfs_root *root, u64 num_items, int type, +start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, enum btrfs_reserve_flush_enum flush) { struct btrfs_trans_handle *h; @@ -320,7 +346,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, return ERR_PTR(-EROFS); if (current-journal_info) { - WARN_ON(type != TRANS_JOIN type != TRANS_JOIN_NOLOCK); + WARN_ON(type TRANS_EXTWRITERS); h = current-journal_info; h-use_count++; WARN_ON(h-use_count 2); @@ -366,7 +392,7 @@ again: * If we are ATTACH, it means we just want to catch the current * transaction and commit it, so we needn't do sb_start_intwrite(). 
*/ - if (type TRANS_JOIN_NOLOCK) + if (type __TRANS_FREEZABLE) sb_start_intwrite(root-fs_info-sb); if (may_wait_transaction(root, type)) @@ -429,7 +455,7 @@ got_it: return h; join_fail: - if (type TRANS_JOIN_NOLOCK) + if (type __TRANS_FREEZABLE) sb_end_intwrite(root-fs_info-sb); kmem_cache_free(btrfs_trans_handle_cachep, h); alloc_fail: @@ -677,12 +703,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } } - if (trans-type
[PATCH 17/17] Btrfs: make the state of the transaction more readable
We used 3 variants to track the state of the transaction, it was complex and wasted the memory space. Besides that, it was hard to understand that which types of the transaction handles should be blocked in each transaction state, so the developers often made mistakes. This patch improved the above problem. In this patch, we define 6 states for the transaction, enum btrfs_trans_state { TRANS_STATE_RUNNING = 0, TRANS_STATE_BLOCKED = 1, TRANS_STATE_COMMIT_START= 2, TRANS_STATE_COMMIT_DOING= 3, TRANS_STATE_UNBLOCKED = 4, TRANS_STATE_COMPLETED = 5, TRANS_STATE_MAX = 6, } and just use 1 variant to track those state. In order to make the blocked handle types for each state more clear, we introduce a array: unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE | __TRANS_START), [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE | __TRANS_START | __TRANS_ATTACH), [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE | __TRANS_START | __TRANS_ATTACH | __TRANS_JOIN), [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE | __TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK), [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE | __TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK), } it is very intuitionistic. Besides that, because we remove -in_commit in transaction structure, so the lock -commit_lock which was used to protect it is unnecessary, remove -commit_lock. 
Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 - fs/btrfs/disk-io.c | 36 ++-- fs/btrfs/transaction.c | 156 ++--- fs/btrfs/transaction.h | 16 +++-- 4 files changed, 114 insertions(+), 95 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a7e71ff..bf92302 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1496,7 +1496,6 @@ struct btrfs_fs_info { int closing; int log_root_recovering; int enospc_unlink; - int trans_no_join; u64 total_pinned; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6bb3f3d..530e3c0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1747,7 +1747,7 @@ static int transaction_kthread(void *arg) } now = get_seconds(); - if (!cur-blocked + if (cur-state TRANS_STATE_BLOCKED (now cur-start_time || now - cur-start_time 30)) { spin_unlock(root-fs_info-trans_lock); delay = HZ * 5; @@ -2183,7 +2183,6 @@ int open_ctree(struct super_block *sb, fs_info-max_inline = 8192 * 1024; fs_info-metadata_ratio = 0; fs_info-defrag_inodes = RB_ROOT; - fs_info-trans_no_join = 0; fs_info-free_chunk_space = 0; fs_info-tree_mod_log = RB_ROOT; @@ -3956,19 +3955,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_block_rsv_release(root, root-fs_info-trans_block_rsv, cur_trans-dirty_pages.dirty_bytes); - /* FIXME: cleanup wait for commit */ - cur_trans-in_commit = 1; - cur_trans-blocked = 1; + cur_trans-state = TRANS_STATE_COMMIT_START; wake_up(root-fs_info-transaction_blocked_wait); btrfs_evict_pending_snapshots(cur_trans); - cur_trans-blocked = 0; + cur_trans-state = TRANS_STATE_UNBLOCKED; wake_up(root-fs_info-transaction_wait); - cur_trans-commit_done = 1; - wake_up(cur_trans-commit_wait); - btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); @@ -3977,6 +3971,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_pinned_extent(root, root-fs_info-pinned_extents); + cur_trans-state =TRANS_STATE_COMPLETED; + 
wake_up(cur_trans-commit_wait); + /* memset(cur_trans, 0, sizeof(*cur_trans)); kmem_cache_free(btrfs_transaction_cachep, cur_trans); @@ -4004,25 +4001,23 @@ static int
[PATCH 12/17] Btrfs: remove the code for the impossible case in cleanup_transaction()
If the transaction is removed from the transaction list, it means the transaction has been committed successfully. So it is impossible to call cleanup_transaction(), otherwise there is something wrong with the code logic. Thus, we use BUG_ON() instead of the original handling. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bc22be9..cf8706c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1450,11 +1450,12 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->trans_lock); - if (list_empty(&cur_trans->list)) { - spin_unlock(&root->fs_info->trans_lock); - btrfs_end_transaction(trans, root); - return; - } + /* +* If the transaction is removed from the list, it means this +* transaction has been committed successfully, so it is impossible +* to call the cleanup function. +*/ + BUG_ON(list_empty(&cur_trans->list)); list_del_init(&cur_trans->list); if (cur_trans == root->fs_info->running_transaction) { -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/17] Btrfs: pause the space balance when remounting to R/O
Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a4807ce..f0857e0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1263,6 +1263,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); + btrfs_pause_balance(fs_info); ret = btrfs_commit_super(root); if (ret) -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 10/17] Btrfs: just flush the delalloc inodes in the source tree before snapshot creation
Before applying this patch, we need flush all the delalloc inodes in the fs when we want to create a snapshot, it wastes time, and make the transaction commit be blocked for a long time. It means some other user operation would also be blocked for a long time. This patch improves this problem, we just flush the delalloc inodes that in the source trees before snapshot creation, so the transaction commit will complete quickly. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c | 6 ++ fs/btrfs/transaction.c | 10 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0de4a2f..2677dcc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!root-ref_cows) return -EINVAL; + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) + return ret; + + btrfs_wait_ordered_extents(root, 0); + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) return -ENOMEM; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2b17213..bc22be9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1491,17 +1491,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); - int snap_pending = 0; int ret; - if (!flush_on_commit) { - spin_lock(root-fs_info-trans_lock); - if (!list_empty(trans-transaction-pending_snapshots)) - snap_pending = 1; - spin_unlock(root-fs_info-trans_lock); - } - - if (flush_on_commit || snap_pending) { + if (flush_on_commit) { ret = btrfs_start_all_delalloc_inodes(root-fs_info, 1); if (ret) return ret; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/17] Btrfs: introduce per-subvolume ordered extent list
The reason we introduce per-subvolume ordered extent list is the same as the per-subvolume delalloc inode list. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h| 25 --- fs/btrfs/dev-replace.c | 4 +- fs/btrfs/disk-io.c | 45 +++- fs/btrfs/extent-tree.c | 6 +-- fs/btrfs/inode.c| 4 +- fs/btrfs/ordered-data.c | 109 +--- fs/btrfs/ordered-data.h | 2 + fs/btrfs/relocation.c | 2 +- fs/btrfs/super.c| 2 +- fs/btrfs/transaction.c | 2 +- 10 files changed, 143 insertions(+), 58 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 067233f..a7e71ff 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1437,17 +1437,18 @@ struct btrfs_fs_info { atomic_t open_ioctl_trans; /* -* this is used by the balancing code to wait for all the pending -* ordered extents +* this is used to protect the following list -- ordered_roots. */ - spinlock_t ordered_extent_lock; + spinlock_t ordered_root_lock; /* -* all of the data=ordered extents pending writeback +* all fs/file tree roots in which there are data=ordered extents +* pending writeback are added into this list. +* * these can span multiple transactions and basically include * every dirty data page that isn't from nodatacow */ - struct list_head ordered_extents; + struct list_head ordered_roots; spinlock_t delalloc_root_lock; /* all fs/file tree roots that have delalloc inodes. 
*/ @@ -1746,6 +1747,20 @@ struct btrfs_root { struct list_head delalloc_inodes; struct list_head delalloc_root; u64 nr_delalloc_inodes; + /* +* this is used by the balancing code to wait for all the pending +* ordered extents +*/ + spinlock_t ordered_extent_lock; + + /* +* all of the data=ordered extents pending writeback +* these can span multiple transactions and basically include +* every dirty data page that isn't from nodatacow +*/ + struct list_head ordered_extents; + struct list_head ordered_root; + u64 nr_ordered_extents; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index aca77ad..4254da8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -395,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args-result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace); - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root-fs_info, 0); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -470,7 +470,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(dev_replace-lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root-fs_info, 0); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 13dddba..44d5a86 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1192,6 +1192,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root-last_trans = 0; root-highest_objectid = 0; root-nr_delalloc_inodes = 0; + root-nr_ordered_extents = 0; root-name = NULL; root-inode_tree = RB_ROOT; INIT_RADIX_TREE(root-delayed_nodes_tree, GFP_ATOMIC); @@ -1202,11 +1203,14 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, INIT_LIST_HEAD(root-root_list); INIT_LIST_HEAD(root-delalloc_inodes); 
INIT_LIST_HEAD(root-delalloc_root); + INIT_LIST_HEAD(root-ordered_extents); + INIT_LIST_HEAD(root-ordered_root); INIT_LIST_HEAD(root-logged_list[0]); INIT_LIST_HEAD(root-logged_list[1]); spin_lock_init(root-orphan_lock); spin_lock_init(root-inode_lock); spin_lock_init(root-delalloc_lock); + spin_lock_init(root-ordered_extent_lock); spin_lock_init(root-accounting_lock); spin_lock_init(root-log_extents_lock[0]); spin_lock_init(root-log_extents_lock[1]); @@ -2190,8 +2194,8 @@ int open_ctree(struct super_block *sb, fs_info-thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); - INIT_LIST_HEAD(fs_info-ordered_extents); - spin_lock_init(fs_info-ordered_extent_lock); + INIT_LIST_HEAD(fs_info-ordered_roots); + spin_lock_init(fs_info-ordered_root_lock); fs_info-delayed_root = kmalloc(sizeof(struct btrfs_delayed_root
[PATCH 11/17] Btrfs: cleanup unnecessary assignment when cleaning up all the residual transaction
When we umount a fs with serious errors, we will invoke btrfs_cleanup_transactions() to clean up the residual transaction. At this time, it is impossible to start a new transaction, so we needn't assign trans_no_join to 1, and also needn't clear running transaction every time we destroy a residual transaction. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 9 + 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 44d5a86..6bb3f3d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3992,7 +3992,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) spin_lock(&root->fs_info->trans_lock); list_splice_init(&root->fs_info->trans_list, &list); - root->fs_info->trans_no_join = 1; + root->fs_info->running_transaction = NULL; spin_unlock(&root->fs_info->trans_lock); while (!list_empty(&list)) { @@ -4028,10 +4028,6 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_all_delalloc_inodes(root->fs_info); - spin_lock(&root->fs_info->trans_lock); - root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->trans_lock); - btrfs_destroy_marked_extents(root, &t->dirty_pages, EXTENT_DIRTY); @@ -4044,9 +4040,6 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) kmem_cache_free(btrfs_transaction_cachep, t); } - spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 0; - spin_unlock(&root->fs_info->trans_lock); mutex_unlock(&root->fs_info->transaction_kthread_mutex); return 0; -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 04/17] Btrfs: remove BUG_ON() in btrfs_read_fs_root_no_radix()
We have checked if ->node is NULL or not, so it is unnecessary to use BUG_ON() to check again. Remove it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2a9ae38..8c1e4fb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1513,7 +1513,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, } root->commit_root = btrfs_root_node(root); - BUG_ON(!root->node); /* -ENOMEM */ out: if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { root->ref_cows = 1; -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 15/17] Btrfs: remove unnecessary variant ->num_joined in btrfs_transaction structure
We used -num_joined track if there were some writers which join the current transaction when the committer was sleeping. If some writers joined the current transaction, we has to continue the while loop to do some necessary stuff, such as flush the ordered operations. But it is unnecessary because we will do it after the while loop. Besides that, tracking -num_joined would make the committer drop into the while loop when there are lots of internal writers(TRANS_JOIN). So we remove -num_joined and don't track if there are some writers which join the current transaction when the committer is sleeping. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 8 +--- fs/btrfs/transaction.h | 2 -- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 265db57..75e7b15 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -124,7 +124,6 @@ loop: atomic_inc(cur_trans-use_count); atomic_inc(cur_trans-num_writers); extwriter_counter_inc(cur_trans, type); - cur_trans-num_joined++; spin_unlock(fs_info-trans_lock); return 0; } @@ -157,7 +156,6 @@ loop: atomic_set(cur_trans-num_writers, 1); extwriter_counter_init(cur_trans, type); - cur_trans-num_joined = 0; init_waitqueue_head(cur_trans-writer_wait); init_waitqueue_head(cur_trans-commit_wait); cur_trans-in_commit = 0; @@ -1566,7 +1564,6 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - unsigned long joined = 0; struct btrfs_transaction *cur_trans = trans-transaction; struct btrfs_transaction *prev_trans = NULL; DEFINE_WAIT(wait); @@ -1668,8 +1665,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, should_grow = 1; do { - joined = cur_trans-num_joined; - WARN_ON(cur_trans != trans-transaction); ret = btrfs_flush_all_pending_stuffs(trans, root); @@ -1685,8 +1680,7 @@ int btrfs_commit_transaction(struct 
btrfs_trans_handle *trans, schedule_timeout(1); finish_wait(cur_trans-writer_wait, wait); - } while (extwriter_counter_read(cur_trans) 0 || -(should_grow cur_trans-num_joined != joined)); + } while (extwriter_counter_read(cur_trans) 0); ret = btrfs_flush_all_pending_stuffs(trans, root); if (ret) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 5cc77b0..0fc45e2 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -37,8 +37,6 @@ struct btrfs_transaction { atomic_t num_writers; atomic_t use_count; - unsigned long num_joined; - spinlock_t commit_lock; int in_commit; int commit_done; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 00/17] improve the block time during the transaction commit
This patchset improve the problem that the transaction may be blocked for a long time when it is being committed if there is heavy I/O. In this patchset, - 0001-0005, 0007, 0011-0012 are random fix or code cleanup patch. - 0006, 0008-0010 introduce per-subvolume delalloc inode list and ordered extent list, which can reduce the flush time when we create snapshots. - 0013-0016 improve the block time during the transaction commit by removing the while loop at the beginning of the transaction commit. - 0017 improves the readability of the code. Miao Xie (17): Btrfs: fix accessing a freed tree root Btrfs: fix unprotected root node of the subvolume's inode rb-tree Btrfs: pause the space balance when remounting to R/O Btrfs: remove BUG_ON() in btrfs_read_fs_tree_no_radix() Btrfs: cleanup the similar code of the fs root read Btrfs: introduce grab/put functions for the root of the fs/file tree Btrfs: don't invoke btrfs_invalidate_inodes() in the spin lock context Btrfs: introduce per-subvolume delalloc inode list Btrfs: introduce per-subvolume ordered extent list Btrfs: just flush the delalloc inodes in the source tree before snapshot creation Btrfs: cleanup unnecessary assignment when cleaning up all the residual transaction Btrfs: remove the code for the impossible case in cleanup_transaction() Btrfs: don't wait for all the writers circularly during the transaction commit Btrfs: don't flush the delalloc inodes in the while loop if flushoncommit is set Btrfs: remove unnecessary varient -num_joined in btrfs_transaction structure Btrfs: remove the time check in btrfs_commit_transaction() Btrfs: make the state of the transaction more readable fs/btrfs/ctree.h| 55 +-- fs/btrfs/dev-replace.c | 6 +- fs/btrfs/disk-io.c | 425 +++- fs/btrfs/disk-io.h | 32 +++- fs/btrfs/extent-tree.c | 20 +-- fs/btrfs/inode.c| 180 ++-- fs/btrfs/ioctl.c| 6 + fs/btrfs/ordered-data.c | 109 + fs/btrfs/ordered-data.h | 2 + fs/btrfs/relocation.c | 9 +- fs/btrfs/root-tree.c| 170 ++- fs/btrfs/super.c| 3 +- 
fs/btrfs/transaction.c | 271 -- fs/btrfs/transaction.h | 49 -- fs/btrfs/tree-log.c | 3 +- fs/btrfs/volumes.c | 13 +- fs/btrfs/volumes.h | 1 + 17 files changed, 791 insertions(+), 563 deletions(-) -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree
The grab/put funtions will be used in the next patch, which need grab the root object and ensure it is not freed. We use reference counter instead of the srcu lock is to aovid blocking the memory reclaim task, which invokes synchronize_srcu(). Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/disk-io.h | 21 + fs/btrfs/extent-tree.c | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 845b77f..958ce6c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1739,6 +1739,7 @@ struct btrfs_root { int force_cow; spinlock_t root_item_lock; + atomic_t refs; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42d6ba2..642c861 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(root-log_writers, 0); atomic_set(root-log_batch, 0); atomic_set(root-orphan_inodes, 0); + atomic_set(root-refs, 1); root-log_transid = 0; root-last_log_commit = 0; extent_io_tree_init(root-dirty_log_pages, @@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) } else { free_extent_buffer(gang[0]-node); free_extent_buffer(gang[0]-commit_root); - kfree(gang[0]); + btrfs_put_fs_root(gang[0]); } } @@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root) kfree(root-free_ino_ctl); kfree(root-free_ino_pinned); kfree(root-name); - kfree(root); + btrfs_put_fs_root(root); } void btrfs_free_fs_root(struct btrfs_root *root) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 534d583..b71acd6e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_root *root); + +/* + * This function is 
used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. + * + * If you want to ensure the whole tree is safe, you should use + * fs_info-subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ + if (atomic_inc_not_zero(root-refs)) + return root; + return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ + if (atomic_dec_and_test(root-refs)) + kfree(root); +} + void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 08e42c8..08f9862 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7463,7 +7463,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } else { free_extent_buffer(root-node); free_extent_buffer(root-commit_root); - kfree(root); + btrfs_put_fs_root(root); } out_end_trans: btrfs_end_transaction_throttle(trans, tree_root); -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 10/17] Btrfs: just flush the delalloc inodes in the source tree before snapshot creation
On Thu, 16 May 2013 11:20:39 +0800, Liu Bo wrote: On Wed, May 15, 2013 at 03:48:24PM +0800, Miao Xie wrote: Before applying this patch, we need flush all the delalloc inodes in the fs when we want to create a snapshot, it wastes time, and make the transaction commit be blocked for a long time. It means some other user operation would also be blocked for a long time. This patch improves this problem, we just flush the delalloc inodes that in the source trees before snapshot creation, so the transaction commit will complete quickly. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c | 6 ++ fs/btrfs/transaction.c | 10 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0de4a2f..2677dcc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!root-ref_cows) return -EINVAL; +ret = btrfs_start_delalloc_inodes(root, 0); +if (ret) +return ret; + +btrfs_wait_ordered_extents(root, 0); + Does this look too radical? Does this snapshot creation ioctl block all writes on its src root? I don't think it is radical, and I think flushing delalloc inodes during the transaction commit is stupid, especially flushing all the inodes including the roots which are not going to be snapshoted. Because it will block the operations of the users (such as mkdir, rmdir, create and so on) for a long time if there are lots of dirty pages. And The snapshot creation now doesn't block the writes of the source root at all, there is no appreciable difference between this way and the background flusher. No, we can only be sure that there is no ordered extents being added until setting trans_no_join, and then it's safe to create pending snapshots. Actually, we can not avoid that the new ordered extents are added before trans_no_join is set. 
But for the users, the 1st case below must be handled correctly, but the 2nd one can be ignored because we can see the write of the 2nd case as the one that happens after the snapshot creation. 1st case: Task write data into a file make a snapshot 2nd case: Task0 Task1 make a snapshot flush delalloc inodes write data into a file commit transaction create_pending_snapshot Thanks Miao thanks, liubo pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) return -ENOMEM; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2b17213..bc22be9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1491,17 +1491,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); -int snap_pending = 0; int ret; -if (!flush_on_commit) { -spin_lock(root-fs_info-trans_lock); -if (!list_empty(trans-transaction-pending_snapshots)) -snap_pending = 1; -spin_unlock(root-fs_info-trans_lock); -} - -if (flush_on_commit || snap_pending) { +if (flush_on_commit) { ret = btrfs_start_all_delalloc_inodes(root-fs_info, 1); if (ret) return ret; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree
On thu, 16 May 2013 11:36:46 +0800, Liu Bo wrote: On Wed, May 15, 2013 at 03:48:20PM +0800, Miao Xie wrote: The grab/put funtions will be used in the next patch, which need grab the root object and ensure it is not freed. We use reference counter instead of the srcu lock is to aovid blocking the memory reclaim task, which invokes synchronize_srcu(). I don't think this is necessary, we put 'kfree(root)' because we really need to free them at the very end time, when there should be no inodes linking on the root(we should have cleaned all inodes out from it). So when we flush delalloc inodes and wait for ordered extents to finish, the root should be valid, otherwise someone is doing wrong things. And even with this grab_fs_root to avoid freeing root, it's just the root that remains in memory, all its attributes, like root-node, commit_root, root-inode_tree, are already NULL or empty. Please consider the case: Task1 Task2 Cleaner get the root flush all delalloc inodes drop subvolume iput the last inode move the root into the dead list drop subvolume kfree(root) If Task1 accesses the root now, oops will happen. I introduced there two functions just to protect the access of the root object, not its attributes, so don't worry about the attributes. (Please see the first sentence of the changelog.) 
Thanks Miao thanks, liubo Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/disk-io.h | 21 + fs/btrfs/extent-tree.c | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 845b77f..958ce6c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1739,6 +1739,7 @@ struct btrfs_root { int force_cow; spinlock_t root_item_lock; +atomic_t refs; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42d6ba2..642c861 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(root-log_writers, 0); atomic_set(root-log_batch, 0); atomic_set(root-orphan_inodes, 0); +atomic_set(root-refs, 1); root-log_transid = 0; root-last_log_commit = 0; extent_io_tree_init(root-dirty_log_pages, @@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) } else { free_extent_buffer(gang[0]-node); free_extent_buffer(gang[0]-commit_root); -kfree(gang[0]); +btrfs_put_fs_root(gang[0]); } } @@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root) kfree(root-free_ino_ctl); kfree(root-free_ino_pinned); kfree(root-name); -kfree(root); +btrfs_put_fs_root(root); } void btrfs_free_fs_root(struct btrfs_root *root) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 534d583..b71acd6e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_root *root); + +/* + * This function is used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. 
+ * + * If you want to ensure the whole tree is safe, you should use + * fs_info-subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ +if (atomic_inc_not_zero(root-refs)) +return root; +return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ +if (atomic_dec_and_test(root-refs)) +kfree(root); +} + void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 08e42c8..08f9862 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7463,7 +7463,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } else { free_extent_buffer(root-node); free_extent_buffer(root-commit_root); -kfree(root); +btrfs_put_fs_root(root); } out_end_trans: btrfs_end_transaction_throttle(trans, tree_root
Re: [PATCH 06/17] Btrfs: introduce grab/put functions for the root of the fs/file tree
On Thu, 16 May 2013 13:15:57 +0800, Liu Bo wrote: On Thu, May 16, 2013 at 12:31:11PM +0800, Miao Xie wrote: On thu, 16 May 2013 11:36:46 +0800, Liu Bo wrote: On Wed, May 15, 2013 at 03:48:20PM +0800, Miao Xie wrote: The grab/put funtions will be used in the next patch, which need grab the root object and ensure it is not freed. We use reference counter instead of the srcu lock is to aovid blocking the memory reclaim task, which invokes synchronize_srcu(). I don't think this is necessary, we put 'kfree(root)' because we really need to free them at the very end time, when there should be no inodes linking on the root(we should have cleaned all inodes out from it). So when we flush delalloc inodes and wait for ordered extents to finish, the root should be valid, otherwise someone is doing wrong things. And even with this grab_fs_root to avoid freeing root, it's just the root that remains in memory, all its attributes, like root-node, commit_root, root-inode_tree, are already NULL or empty. Please consider the case: Task1 Task2 Cleaner get the root flush all delalloc inodes drop subvolume iput the last inode move the root into the dead list drop subvolume kfree(root) If Task1 accesses the root now, oops will happen. Then it's task1's fault, why it is not protected by subvol_srcu section when it's possible that someone like task2 sets root's refs to 0? synchronize_srcu(subvol_srcu) before adding root into dead root list is just for this race case, why do we need another? Please read my changelog. Miao thanks, liubo I introduced there two functions just to protect the access of the root object, not its attributes, so don't worry about the attributes. (Please see the first sentence of the changelog.) 
Thanks Miao thanks, liubo Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/disk-io.h | 21 + fs/btrfs/extent-tree.c | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 845b77f..958ce6c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1739,6 +1739,7 @@ struct btrfs_root { int force_cow; spinlock_t root_item_lock; + atomic_t refs; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42d6ba2..642c861 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1216,6 +1216,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(root-log_writers, 0); atomic_set(root-log_batch, 0); atomic_set(root-orphan_inodes, 0); + atomic_set(root-refs, 1); root-log_transid = 0; root-last_log_commit = 0; extent_io_tree_init(root-dirty_log_pages, @@ -2049,7 +2050,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) } else { free_extent_buffer(gang[0]-node); free_extent_buffer(gang[0]-commit_root); - kfree(gang[0]); + btrfs_put_fs_root(gang[0]); } } @@ -3415,7 +3416,7 @@ static void free_fs_root(struct btrfs_root *root) kfree(root-free_ino_ctl); kfree(root-free_ino_pinned); kfree(root-name); - kfree(root); + btrfs_put_fs_root(root); } void btrfs_free_fs_root(struct btrfs_root *root) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 534d583..b71acd6e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,6 +76,27 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_root(struct btrfs_root *root); + +/* + * This function is used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. 
+ * + * If you want to ensure the whole tree is safe, you should use + *fs_info-subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ + if (atomic_inc_not_zero(root-refs)) + return root; + return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ + if (atomic_dec_and_test(root-refs)) + kfree(root); +} + void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 08e42c8..08f9862 100644 --- a/fs/btrfs
Re: [PATCH v3] btrfs: clean snapshots one by one
On tue, 7 May 2013 13:54:49 +0200, David Sterba wrote: On Mon, May 06, 2013 at 08:41:06PM -0400, Chris Mason wrote: diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 988b860..4de2351 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1690,15 +1690,19 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; do { + int again = 0; + if (!(root-fs_info-sb-s_flags MS_RDONLY) + down_read_trylock(root-fs_info-sb-s_umount) mutex_trylock(root-fs_info-cleaner_mutex)) { btrfs_run_delayed_iputs(root); - btrfs_clean_old_snapshots(root); + again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(root-fs_info-cleaner_mutex); btrfs_run_defrag_inodes(root-fs_info); + up_read(root-fs_info-sb-s_umount); Can we use just the cleaner mutex for this? We're deadlocking during 068 with autodefrag on because the cleaner is holding s_umount while autodefrag is trying to bump the writer count. I have now reproduced the deadlock and see where it's stuck. It did not happen with running 068 in a loop, but after interrupting the test. If unmount takes the cleaner mutex once it should wait long enough for the cleaner to stop. You mean removing s_umount from here completely? I'm not sure about other mis-interaction, eg with remount + autodefrag. Miao sent a patch for that case http://www.spinics.net/lists/linux-btrfs/msg16634.html (but it would not fix this deadlock). I have given up this patch and fix this problem by the other way. http://marc.info/?l=linux-btrfsm=136142833013628w=2 I think we need use s_umount here, all things we need do is to check R/O in cleaner_mutex. Or we may continue to delete the dead tree after the fs is remounted to be R/O. Thanks Miao I'm for keeping the clean-by-one patch for 3.10, we can fix other regressions during rc cycle. 
david -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/4] Btrfs: remove unnecessary -s_umount in cleaner_kthread()
In order to avoid the R/O remount, we acquired -s_umount lock during we deleted the dead snapshots and subvolumes. But it is unnecessary, because we have cleaner_mutex. We use cleaner_mutex to protect the process of the dead snapshots/subvolumes deletion. And when we remount the fs to be R/O, we also acquire this mutex to do cleanup after we change the status of the fs. That is this lock can serialize the above operations, the cleaner can be aware of the status of the fs, and if the cleaner is deleting the dead snapshots/subvolumes, the remount task will wait for it. So it is safe to remove -s_umount in cleaner_kthread(). Cc: David Sterba dste...@suse.cz Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 40 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a9df562..cb2bfd1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1676,24 +1676,40 @@ static void end_workqueue_fn(struct btrfs_work *work) bio_endio(bio, error); } +/* + * If we remount the fs to be R/O, the cleaner needn't do anything except + * sleeping. This function is used to check the status of the fs. + */ +static inline int need_cleaner_sleep(struct btrfs_root *root) +{ + return root-fs_info-sb-s_flags MS_RDONLY; +} + static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; + int again; do { - int again = 0; - - if (!(root-fs_info-sb-s_flags MS_RDONLY) - down_read_trylock(root-fs_info-sb-s_umount)) { - if (mutex_trylock(root-fs_info-cleaner_mutex)) { - btrfs_run_delayed_iputs(root); - again = btrfs_clean_one_deleted_snapshot(root); - mutex_unlock(root-fs_info-cleaner_mutex); - } - btrfs_run_defrag_inodes(root-fs_info); - up_read(root-fs_info-sb-s_umount); - } + again = 0; + /* Make the cleaner go to sleep early. 
*/ + if (need_cleaner_sleep(root)) + goto sleep; + + if (!mutex_trylock(root-fs_info-cleaner_mutex)) + goto sleep; + + btrfs_run_delayed_iputs(root); + again = btrfs_clean_one_deleted_snapshot(root); + mutex_unlock(root-fs_info-cleaner_mutex); + + /* +* The defragger has dealt with the R/O remount, needn't +* do anything special here. +*/ + btrfs_run_defrag_inodes(root-fs_info); +sleep: if (!try_to_freeze() !again) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/4] Btrfs: move the R/O check out of btrfs_clean_one_deleted_snapshot()
If the fs is remounted to be R/O, it is unnecessary to call btrfs_clean_one_deleted_snapshot(), so move the R/O check out of this function. And besides that, it can make the check logic in the caller more clear. Cc: David Sterba dste...@suse.cz Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 9 + fs/btrfs/transaction.c | 5 - 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 927da1a..c69ff46 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1702,6 +1702,15 @@ static int cleaner_kthread(void *arg) if (!mutex_trylock(root-fs_info-cleaner_mutex)) goto sleep; + /* +* Avoid the problem that we change the status of the fs +* during the above check and trylock. +*/ + if (need_cleaner_sleep(root)) { + mutex_unlock(root-fs_info-cleaner_mutex); + goto sleep; + } + btrfs_run_delayed_iputs(root); again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(root-fs_info-cleaner_mutex); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 89fad06..4b63111 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1885,11 +1885,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) int ret; struct btrfs_fs_info *fs_info = root-fs_info; - if (fs_info-sb-s_flags MS_RDONLY) { - pr_debug(btrfs: cleaner called for RO fs!\n); - return 0; - } - spin_lock(fs_info-trans_lock); if (list_empty(fs_info-dead_roots)) { spin_unlock(fs_info-trans_lock); -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/4] Btrfs: make the cleaner complete early when the fs is going to be umounted
Cc: David Sterba dste...@suse.cz Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 12 +++- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index cb2bfd1..927da1a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1677,12 +1677,14 @@ static void end_workqueue_fn(struct btrfs_work *work) } /* - * If we remount the fs to be R/O, the cleaner needn't do anything except - * sleeping. This function is used to check the status of the fs. + * If we remount the fs to be R/O or umount the fs, the cleaner needn't do + * anything except sleeping. This function is used to check the status of + * the fs. */ static inline int need_cleaner_sleep(struct btrfs_root *root) { - return root-fs_info-sb-s_flags MS_RDONLY; + return (root-fs_info-sb-s_flags MS_RDONLY || + btrfs_fs_closing(root-fs_info)); } static int cleaner_kthread(void *arg) @@ -1705,8 +1707,8 @@ static int cleaner_kthread(void *arg) mutex_unlock(root-fs_info-cleaner_mutex); /* -* The defragger has dealt with the R/O remount, needn't -* do anything special here. +* The defragger has dealt with the R/O remount and umount, +* needn't do anything special here. */ btrfs_run_defrag_inodes(root-fs_info); sleep: -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/4] Btrfs: make the snap/subv deletion end more early when the fs is R/O
The snapshot/subvolume deletion might spend lots of time, it would make the remount task wait for a long time. This patch improve this problem, we will break the deletion if the fs is remounted to be R/O. It will make the users happy. Cc: David Sterba dste...@suse.cz Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 12 fs/btrfs/disk-io.c | 15 ++- fs/btrfs/extent-tree.c | 2 +- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index df9824b..067233f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3318,6 +3318,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) smp_mb(); return fs_info-closing; } + +/* + * If we remount the fs to be R/O or umount the fs, the cleaner needn't do + * anything except sleeping. This function is used to check the status of + * the fs. + */ +static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root) +{ + return (root-fs_info-sb-s_flags MS_RDONLY || + btrfs_fs_closing(root-fs_info)); +} + static inline void free_fs_info(struct btrfs_fs_info *fs_info) { kfree(fs_info-balance_ctl); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c69ff46..78e2dfb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1676,17 +1676,6 @@ static void end_workqueue_fn(struct btrfs_work *work) bio_endio(bio, error); } -/* - * If we remount the fs to be R/O or umount the fs, the cleaner needn't do - * anything except sleeping. This function is used to check the status of - * the fs. - */ -static inline int need_cleaner_sleep(struct btrfs_root *root) -{ - return (root-fs_info-sb-s_flags MS_RDONLY || - btrfs_fs_closing(root-fs_info)); -} - static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; @@ -1696,7 +1685,7 @@ static int cleaner_kthread(void *arg) again = 0; /* Make the cleaner go to sleep early. 
*/ - if (need_cleaner_sleep(root)) + if (btrfs_need_cleaner_sleep(root)) goto sleep; if (!mutex_trylock(root-fs_info-cleaner_mutex)) @@ -1706,7 +1695,7 @@ static int cleaner_kthread(void *arg) * Avoid the problem that we change the status of the fs * during the above check and trylock. */ - if (need_cleaner_sleep(root)) { + if (btrfs_need_cleaner_sleep(root)) { mutex_unlock(root-fs_info-cleaner_mutex); goto sleep; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fbeb0c0..455117a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7378,7 +7378,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, wc-reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { - if (!for_reloc btrfs_fs_closing(root-fs_info)) { + if (!for_reloc btrfs_need_cleaner_sleep(root)) { pr_debug(btrfs: drop snapshot early exit\n); err = -EAGAIN; goto out_end_trans; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/4] Btrfs: remove unnecessary -s_umount in cleaner_kthread()
On tue, 14 May 2013 18:20:40 +0800, Miao Xie wrote: In order to avoid the R/O remount, we acquired -s_umount lock during we deleted the dead snapshots and subvolumes. But it is unnecessary, because we have cleaner_mutex. We use cleaner_mutex to protect the process of the dead snapshots/subvolumes deletion. And when we remount the fs to be R/O, we also acquire this mutex to do cleanup after we change the status of the fs. That is this lock can serialize the above operations, the cleaner can be aware of the status of the fs, and if the cleaner is deleting the dead snapshots/subvolumes, the remount task will wait for it. So it is safe to remove -s_umount in cleaner_kthread(). According to my test, this patch can also fix the deadlock problem which is caused by the race between autodefragger and freeze(xfstest 068). Thanks Miao Cc: David Sterba dste...@suse.cz Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 40 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a9df562..cb2bfd1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1676,24 +1676,40 @@ static void end_workqueue_fn(struct btrfs_work *work) bio_endio(bio, error); } +/* + * If we remount the fs to be R/O, the cleaner needn't do anything except + * sleeping. This function is used to check the status of the fs. 
+ */ +static inline int need_cleaner_sleep(struct btrfs_root *root) +{ + return root-fs_info-sb-s_flags MS_RDONLY; +} + static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; + int again; do { - int again = 0; - - if (!(root-fs_info-sb-s_flags MS_RDONLY) - down_read_trylock(root-fs_info-sb-s_umount)) { - if (mutex_trylock(root-fs_info-cleaner_mutex)) { - btrfs_run_delayed_iputs(root); - again = btrfs_clean_one_deleted_snapshot(root); - mutex_unlock(root-fs_info-cleaner_mutex); - } - btrfs_run_defrag_inodes(root-fs_info); - up_read(root-fs_info-sb-s_umount); - } + again = 0; + /* Make the cleaner go to sleep early. */ + if (need_cleaner_sleep(root)) + goto sleep; + + if (!mutex_trylock(root-fs_info-cleaner_mutex)) + goto sleep; + + btrfs_run_delayed_iputs(root); + again = btrfs_clean_one_deleted_snapshot(root); + mutex_unlock(root-fs_info-cleaner_mutex); + + /* + * The defragger has dealt with the R/O remount, needn't + * do anything special here. + */ + btrfs_run_defrag_inodes(root-fs_info); +sleep: if (!try_to_freeze() !again) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/5] Btrfs: don't abort the current transaction if there is no enough space for inode cache
The filesystem with inode cache was forced to be read-only when we umounted it. Steps to reproduce: # mkfs.btrfs -f ${DEV} # mount -o inode_cache ${DEV} ${MNT} # dd if=/dev/zero of=${MNT}/file1 bs=1M count=8192 # btrfs fi syn ${MNT} # dd if=${MNT}/file1 of=/dev/null bs=1M # rm -f ${MNT}/file1 # btrfs fi syn ${MNT} # umount ${MNT} It is because there was no enough space to do inode cache truncation, and then we aborted the current transaction. But no space error is not a serious problem when we write out the inode cache, and it is safe that we just skip this step if we meet this problem. So we need not abort the current transaction. Reported-by: Tsutomu Itoh t-i...@jp.fujitsu.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/inode-map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d26f67a..9818d4a 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -468,7 +468,8 @@ again: if (i_size_read(inode) 0) { ret = btrfs_truncate_free_space_cache(root, trans, path, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + if (ret != -ENOSPC) + btrfs_abort_transaction(trans, root, ret); goto out_put; } } -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/5] Btrfs: don't steal the reserved space from the global reserve if their space type is different
If the type of the space we need is different with the global reserve, we can not steal the space from the global reserve, because we can not allocate the space from the free space cache that the global reserve points to. Cc: Tsutomu Itoh t-i...@jp.fujitsu.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/extent-tree.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e34e268..c48e1bd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6684,9 +6684,11 @@ try_reserve: return block_rsv; /* * If we couldn't reserve metadata bytes try and use some from -* the global reserve. +* the global reserve if its space type is the same as the global +* reservation. */ - if (block_rsv-type != BTRFS_BLOCK_RSV_GLOBAL) { + if (block_rsv-type != BTRFS_BLOCK_RSV_GLOBAL + block_rsv-space_info == global_rsv-space_info) { ret = block_rsv_use_bytes(global_rsv, blocksize); if (!ret) return global_rsv; -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/5] Btrfs: don't use global block reservation for inode cache truncation
It is very likely that there are lots of subvolumes/snapshots in the filesystem, so if we use global block reservation to do inode cache truncation, we may hog all the free space that is reserved in global rsv. So it is better that we do the free space reservation for inode cache truncation by ourselves. Cc: Tsutomu Itoh t-i...@jp.fujitsu.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/extent-tree.c | 5 + fs/btrfs/free-space-cache.c | 39 +++ fs/btrfs/free-space-cache.h | 2 ++ fs/btrfs/inode-map.c| 5 +++-- fs/btrfs/relocation.c | 5 + 5 files changed, 34 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2305b5c..43afa77 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3109,6 +3109,11 @@ again: WARN_ON(ret); if (i_size_read(inode) 0) { + ret = btrfs_check_trunc_cache_free_space(root, + root-fs_info-global_block_rsv); + if (ret) + goto out_put; + ret = btrfs_truncate_free_space_cache(root, trans, path, inode); if (ret) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ecca6c7..a1948f4 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -197,30 +197,32 @@ int create_free_space_inode(struct btrfs_root *root, block_group-key.objectid); } -int btrfs_truncate_free_space_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct inode *inode) +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) { - struct btrfs_block_rsv *rsv; u64 needed_bytes; - loff_t oldsize; - int ret = 0; - - rsv = trans-block_rsv; - trans-block_rsv = root-fs_info-global_block_rsv; + int ret; /* 1 for slack space, 1 for updating the inode */ needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + btrfs_calc_trans_metadata_size(root, 1); - spin_lock(trans-block_rsv-lock); - if (trans-block_rsv-reserved needed_bytes) { - spin_unlock(trans-block_rsv-lock); - trans-block_rsv = rsv; - 
return -ENOSPC; - } - spin_unlock(trans-block_rsv-lock); + spin_lock(rsv-lock); + if (rsv-reserved needed_bytes) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(rsv-lock); + return 0; +} + +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode) +{ + loff_t oldsize; + int ret = 0; oldsize = i_size_read(inode); btrfs_i_size_write(inode, 0); @@ -232,9 +234,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - if (ret) { - trans-block_rsv = rsv; btrfs_abort_transaction(trans, root, ret); return ret; } @@ -242,7 +242,6 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, ret = btrfs_update_inode(trans, root, inode); if (ret) btrfs_abort_transaction(trans, root, ret); - trans-block_rsv = rsv; return ret; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 4dc17d8..8b7f19f 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -54,6 +54,8 @@ int create_free_space_inode(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, struct btrfs_path *path, diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 9818d4a..2c66ddb 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -429,11 +429,12 @@ int btrfs_save_ino_cache(struct btrfs_root *root, num_bytes = trans-bytes_reserved; /* * 1 item for inode item insertion if need -* 3 items for inode item update (in the worst case) +* 4 items for inode item update (in the worst case
[PATCH 5/5] Btrfs: update the global reserve if it is empty
Before applying this patch, we reserved the space for the global reserve by the minimum unit if we found it is empty, it was unreasonable and inefficient, because if the global reserve space was depleted, it implied that the size of the global reserve was too small. In this case, we should update the global reserve and fill it. Cc: Tsutomu Itoh t-i...@jp.fujitsu.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/extent-tree.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c48e1bd..c75fe11 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6656,12 +6656,13 @@ use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *global_rsv = root->fs_info->global_block_rsv; int ret; + bool global_updated = false; block_rsv = get_block_rsv(trans, root); if (unlikely(block_rsv->size == 0)) goto try_reserve; - +again: ret = block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; @@ -6669,6 +6670,12 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (block_rsv->failfast) return ERR_PTR(ret); + if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { + global_updated = true; + update_global_block_rsv(root->fs_info); + goto again; + } + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/5] Btrfs: optimize the error handle of use_block_rsv()
cc: Tsutomu Itoh t-i...@jp.fujitsu.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/extent-tree.c | 65 ++ 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 43afa77..e34e268 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6659,48 +6659,39 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); - if (block_rsv-size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, -BTRFS_RESERVE_NO_FLUSH); - /* -* If we couldn't reserve metadata bytes try and use some from -* the global reserve. -*/ - if (ret block_rsv != global_rsv) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - return ERR_PTR(ret); - } else if (ret) { - return ERR_PTR(ret); - } - return block_rsv; - } + if (unlikely(block_rsv-size == 0)) + goto try_reserve; ret = block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; - if (ret !block_rsv-failfast) { - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL * 10, - /*DEFAULT_RATELIMIT_BURST*/ 1); - if (__ratelimit(_rs)) - WARN(1, KERN_DEBUG - btrfs: block rsv returned %d\n, ret); - } - ret = reserve_metadata_bytes(root, block_rsv, blocksize, -BTRFS_RESERVE_NO_FLUSH); - if (!ret) { - return block_rsv; - } else if (ret block_rsv != global_rsv) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - } - } - return ERR_PTR(-ENOSPC); + if (block_rsv-failfast) + return ERR_PTR(ret); + + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(_rs)) + WARN(1, KERN_DEBUG + btrfs: block rsv returned %d\n, ret); + } +try_reserve: + ret = reserve_metadata_bytes(root, block_rsv, blocksize, +BTRFS_RESERVE_NO_FLUSH); + if (!ret) + return block_rsv; + /* +* If we couldn't reserve 
metadata bytes try and use some from +* the global reserve. +*/ + if (block_rsv-type != BTRFS_BLOCK_RSV_GLOBAL) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + return ERR_PTR(ret); } static void unuse_block_rsv(struct btrfs_fs_info *fs_info, -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V3] Btrfs: remove btrfs_sector_sum structure
Using the structure btrfs_sector_sum to keep the checksum value is unnecessary, because the extents that btrfs_sector_sum points to are continuous, we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, there is only one member in the structure, so it makes no sense to keep the structure, just remove it, and use a u32 array to store the checksum value. By this change, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time, it improved the performance by ~74% on my SSD (31MB/s - 54MB/s). test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v2 - v3: - address the problem that the csums was inserted into the wrong range, this bug was reported by Josef. Changelog v1 - v2: - modify the changelog and the title which can not explain this patch clearly - fix the 64bit division problem on 32bit machine --- fs/btrfs/file-item.c| 144 ++-- fs/btrfs/ordered-data.c | 19 +++ fs/btrfs/ordered-data.h | 25 ++--- fs/btrfs/relocation.c | 10 fs/btrfs/scrub.c| 16 ++ 5 files changed, 73 insertions(+), 141 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b193bf3..a7bfc95 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,8 +34,7 @@ #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)-sectorsize - (r)-sectorsize) + sizeof(u32) * (r)-sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -297,7 +296,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; LIST_HEAD(tmplist); unsigned long offset; @@ -368,34 +366,28 
@@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); +MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); + GFP_NOFS); if (!sums) { ret = -ENOMEM; goto fail; } - sector_sum = sums-sums; sums-bytenr = start; - sums-len = size; + sums-len = (int)size; offset = (start - key.offset) root-fs_info-sb-s_blocksize_bits; offset *= csum_size; + size = root-fs_info-sb-s_blocksize_bits; - while (size 0) { - read_extent_buffer(path-nodes[0], - sector_sum-sum, - ((unsigned long)item) + - offset, csum_size); - sector_sum-bytenr = start; - - size -= root-sectorsize; - start += root-sectorsize; - offset += csum_size; - sector_sum++; - } + read_extent_buffer(path-nodes[0], + sums-sums, + ((unsigned long)item) + offset, + csum_size * size); + + start += root-sectorsize * size; list_add_tail(sums-list, tmplist); } path-slots[0]++; @@ -417,23 +409,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; char *data; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; + int index; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; - u64
Re: [PATCH V2 2/2] Btrfs: remove btrfs_sector_sum structure
On tue, 23 Apr 2013 16:54:35 -0400, Josef Bacik wrote: On Wed, Apr 03, 2013 at 03:14:56AM -0600, Miao Xie wrote: Using the structure btrfs_sector_sum to keep the checksum value is unnecessary, because the extents that btrfs_sector_sum points to are continuous, we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, there is only one member in the structure, so it makes no sense to keep the structure, just remove it, and use a u32 array to store the checksum value. By this change, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time, it improved the performance by ~74% on my SSD (31MB/s - 54MB/s). test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync Signed-off-by: Miao Xie mi...@cn.fujitsu.com Reviewed-by: Liu Bo bo.li@oracle.com [SNIP] next_offset = (u64)-1; found_next = 0; + bytenr = sums-bytenr + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = sector_sum-bytenr; - bytenr = sector_sum-bytenr; + file_key.offset = bytenr; btrfs_set_key_type(file_key, BTRFS_EXTENT_CSUM_KEY); - item = btrfs_lookup_csum(trans, root, path, sector_sum-bytenr, 1); + item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { - leaf = path-nodes[0]; - ret = 0; - goto found; + csum_offset = 0; + goto csum; Ok I've just spent the last 3 hours tracking down an fsync() problem that turned out to be because of this patch. btrfs_lookup_csum() assumes you are just going in 4k chunks, but we could be going in larger chunks. So as long as the bytenr falls inside of this csum item it thinks its good. So what I'm seeing is this, we have item [0-8k] and we are csumming [4k-12k] and then we're adding our new csum into the old one, the sizes match but the bytenrs don't match. If you want a reproducer just run my fsync xfstest that I just posted. 
I'm dropping this patch for now and I'll wait for you to fix it. Thanks, Is the reproducer is the 311th case of xfstests? ([PATCH] xfstests 311: test fsync with dm flakey V2) If yes, I'm so sorry that we didn't reproduce the problem you said above. Could you give me your test option? Thanks Miao From d8b9c06ecb4aa5cb2aca5be96a8b65af1afb1992 Mon Sep 17 00:00:00 2001 From: Miao Xie mi...@cn.fujitsu.com Date: Sat, 16 Mar 2013 01:06:03 +0800 Subject: [PATCH] Btrfs: remove btrfs_sector_sum structure Using the structure btrfs_sector_sum to keep the checksum value is unnecessary, because the extents that btrfs_sector_sum points to are continuous, we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, there is only one member in the structure, so it makes no sense to keep the structure, just remove it, and use a u32 array to store the checksum value. By this change, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time, it improved the performance by ~74% on my SSD (31MB/s - 54MB/s). 
test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/file-item.c| 142 ++-- fs/btrfs/ordered-data.c | 19 +++ fs/btrfs/ordered-data.h | 25 ++--- fs/btrfs/relocation.c | 10 fs/btrfs/scrub.c| 16 ++ 5 files changed, 70 insertions(+), 142 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 769eb86..f5f6629 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,8 +34,7 @@ #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)-sectorsize - (r)-sectorsize) + sizeof(u32) * (r)-sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -317,7 +316,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; LIST_HEAD(tmplist); unsigned long offset; @@ -388,34 +386,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); + MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); + GFP_NOFS); if (!sums) { ret = -ENOMEM; goto fail; } - sector_sum
Re: [PATCH V2 2/2] Btrfs: remove btrfs_sector_sum structure
On Fri, 26 Apr 2013 16:58:18 +0800, Miao Xie wrote: On tue, 23 Apr 2013 16:54:35 -0400, Josef Bacik wrote: On Wed, Apr 03, 2013 at 03:14:56AM -0600, Miao Xie wrote: Using the structure btrfs_sector_sum to keep the checksum value is unnecessary, because the extents that btrfs_sector_sum points to are continuous, we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, there is only one member in the structure, so it makes no sense to keep the structure, just remove it, and use a u32 array to store the checksum value. By this change, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time, it improved the performance by ~74% on my SSD (31MB/s - 54MB/s). test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync Signed-off-by: Miao Xie mi...@cn.fujitsu.com Reviewed-by: Liu Bo bo.li@oracle.com [SNIP] next_offset = (u64)-1; found_next = 0; + bytenr = sums-bytenr + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = sector_sum-bytenr; - bytenr = sector_sum-bytenr; + file_key.offset = bytenr; btrfs_set_key_type(file_key, BTRFS_EXTENT_CSUM_KEY); - item = btrfs_lookup_csum(trans, root, path, sector_sum-bytenr, 1); + item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { - leaf = path-nodes[0]; - ret = 0; - goto found; + csum_offset = 0; + goto csum; Ok I've just spent the last 3 hours tracking down an fsync() problem that turned out to be because of this patch. btrfs_lookup_csum() assumes you are just going in 4k chunks, but we could be going in larger chunks. So as long as the bytenr falls inside of this csum item it thinks its good. So what I'm seeing is this, we have item [0-8k] and we are csumming [4k-12k] and then we're adding our new csum into the old one, the sizes match but the bytenrs don't match. 
If you want a reproducer just run my fsync xfstest that I just posted. I'm dropping this patch for now and I'll wait for you to fix it. Thanks, Is the reproducer the 311th case of xfstests? ([PATCH] xfstests 311: test fsync with dm flakey V2) If yes, I'm so sorry that we didn't reproduce the problem you said above. Could you give me your test option? Please ignore the attached patch, I sent it out by mistake. Thanks Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: allocate new chunks if the space is not enough for global rsv
When running the 208th of xfstests, the fs returned the enospc error when there was lots of free space in the disk. By bisect debug, we found it was introduced by commit 96f1bb5777. This commit makes the space check for the global reservation in can_overcommit() be inconsistent with should_alloc_chunk(). can_overcommit() requires that the free space is 2 times the size of the global reservation, or we can't do overcommit. And instead, we need reclaim some reserved space, and if we still don't have enough free space, we need allocate a new chunk. But unfortunately, should_alloc_chunk() just requires that the free space is 1 time the size of the global reservation, that is we would not try to allocate a new chunk if the free space size is in the middle of these two requires, and just return the enospc error. Fix it. Cc: Jim Schutt jasc...@sandia.gov Cc: Josef Bacik jba...@fusionio.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/extent-tree.c | 19 +++ 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0d84787..4976f93 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3557,6 +3557,11 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } +static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) +{ + return (global-size 1); +} + static int should_alloc_chunk(struct btrfs_root *root, struct btrfs_space_info *sinfo, int force) { @@ -3574,7 +3579,7 @@ static int should_alloc_chunk(struct btrfs_root *root, * global_rsv, it doesn't change except when the transaction commits. 
*/ if (sinfo-flags BTRFS_BLOCK_GROUP_METADATA) - num_allocated += global_rsv-size; + num_allocated += calc_global_rsv_need_space(global_rsv); /* * in limited mode, we want to have some free space up to @@ -3746,7 +3751,7 @@ static int can_overcommit(struct btrfs_root *root, { struct btrfs_block_rsv *global_rsv = root-fs_info-global_block_rsv; u64 profile = btrfs_get_alloc_profile(root, 0); - u64 rsv_size = 0; + u64 space_size; u64 avail; u64 used; u64 to_add; @@ -3754,18 +3759,16 @@ static int can_overcommit(struct btrfs_root *root, used = space_info-bytes_used + space_info-bytes_reserved + space_info-bytes_pinned + space_info-bytes_readonly; - spin_lock(global_rsv-lock); - rsv_size = global_rsv-size; - spin_unlock(global_rsv-lock); - /* * We only want to allow over committing if we have lots of actual space * free, but if we don't have enough space to handle the global reserve * space then we could end up having a real enospc problem when trying * to allocate a chunk or some other such important allocation. */ - rsv_size = 1; - if (used + rsv_size = space_info-total_bytes) + spin_lock(global_rsv-lock); + space_size = calc_global_rsv_need_space(global_rsv); + spin_unlock(global_rsv-lock); + if (used + space_size = space_info-total_bytes) return 0; used += space_info-bytes_may_use; -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] Btrfs: introduce noextiref mount option
On Mon, 15 Apr 2013 19:20:51 +0200, David Sterba wrote: On Fri, Apr 12, 2013 at 12:01:19PM -0500, Eric Sandeen wrote: On 4/11/13 5:35 AM, Miao Xie wrote: Now, we set incompat flag EXTEND_IREF when we actually need insert a extend inode reference, not when making a fs. But some users may hope that the fs still can be mounted on the old kernel, and don't hope we insert any extend inode references. So we introduce noextiref mount option to close this function. I'd really rather not have yet another work-around mount option. Wouldn't it be better to say: if you don't want extended irefs, turn that feature off on the filesystem itself, either at mkfs time or via btrfstune after the fact. I agree with this, and hope the inconsistency around extref is only temporary so the mount option is not required in the long term. The code reverting extref set by default in mkfs is in integration branch. The preferred solution is the -O option where we can put all the fs features in one go at mkfs time. All right, let's add an option for mkfs, and throw away this patchset. Thanks. Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] Btrfs: use a lock to protect incompat/compat flag of the super block
On thu, 18 Apr 2013 00:17:11 +0200, David Sterba wrote: On Thu, Apr 11, 2013 at 06:30:16PM +0800, Miao Xie wrote: In order to avoid this problem, we introduce a lock named super_lock into the btrfs_fs_info structure. If we want to update incompat/compat flags of the super block, we must hold it. +/* + * Used to protect the incompat_flags, compat_flags, compat_ro_flags + * when they are updated. +spinlock_t super_lock; The lock name is too general for protecting just *_flags, do you have plans to add more items from superblock under this lock? If no, I suggest to pick a different name. Yes, I want to add more items from super block under this lock. @@ -3663,8 +3674,15 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, disk_super = fs_info-super_copy; features = btrfs_super_incompat_flags(disk_super); if (!(features flag)) { -features |= flag; -btrfs_set_super_incompat_flags(disk_super, features); +spin_lock(fs_info-super_lock); +features = btrfs_super_incompat_flags(disk_super); +if (!(features flag)) { +features |= flag; +btrfs_set_super_incompat_flags(disk_super, features); +printk(KERN_INFO btrfs: setting %llu feature flag\n, + flag); flag is u64, please use (unsigned long long)flag and possibly the new btrfs_info replacement of printks. OK, I'll modify my patch. Thanks for your view. Miao +} +spin_unlock(fs_info-super_lock); } } otherwise ok. Reviewed-by: David Sterba dste...@suse.cz -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] Btrfs: introduce noextiref mount option
On Fri, 12 Apr 2013 09:02:34 +0200, Jan Schmidt wrote: +static int btrfs_close_extend_iref(struct btrfs_fs_info *fs_info, + unsigned long old_opts) The name irritated me, it's more like unset instead of close, isn't it? Maybe btrfs_set_no_extend_iref() is better, the other developers might think we will clear BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF. I think we should use the exact name of the mount option, so btrfs_set_noextiref is probably least ambiguous. Or even btrfs_set_mntflag_noextiref. Much better than mine. +{ + struct btrfs_trans_handle *trans; + int ret; + + if (btrfs_raw_test_opt(old_opts, NOEXTIREF) || + !btrfs_raw_test_opt(fs_info-mount_opt, NOEXTIREF)) + return 0; + + trans = btrfs_attach_transaction(fs_info-tree_root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + return PTR_ERR(trans); + } else { + ret = btrfs_commit_transaction(trans, fs_info-tree_root); + if (ret) + return ret; + } Huh? I don't see why we need to commit the transaction here. Can you please explain? We need avoid the case that we check incompat flag is set or not between the extended iref insertion and incompat flag set. Task1 Task2 start_transaction() insert extended iref set NOEXTIREF check incompat flag set incompat flag checking incompat flag after transaction commit can make sure our check happens after the flag is set. Understood. However, in my understanding of transaction.c, btrfs_join_transaction, btrfs_attach_transaction and btrfs_commit_transaction are special and need justification. If you only need the transaction for synchronization purposes, which seems to be the case here, btrfs_start_transaction and btrfs_end_transaction are the right choice. btrfs_end_transaction() does not wait for/force the other tasks to end their transaction, so it is not right here. 
Thanks Miao Thanks, -Jan Thanks Miao Thanks, -Jan + + if (btrfs_super_incompat_flags(fs_info-super_copy) + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) { + printk(KERN_ERR BTRFS: could not close extend iref.\n); + return -EINVAL; + } + + return 0; +} + static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) { set_bit(BTRFS_FS_STATE_REMOUNTING, fs_info-fs_state); @@ -1259,6 +1293,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } btrfs_remount_begin(fs_info, old_opts, *flags); + + ret = btrfs_close_extend_iref(fs_info, old_opts); + if (ret) + goto restore; + btrfs_resize_thread_pool(fs_info, fs_info-thread_pool_size, old_thread_pool_size); -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] Btrfs: fix unblocked autodefraggers when remount
The new mount option is set after parsing the remount arguments, so it is wrong that checking the autodefrag is close or not at btrfs_remount_prepare(). Fix it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/super.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 68a29a1..0f03569 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1202,11 +1202,14 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); } -static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info, -unsigned long old_opts, int flags) +static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) { set_bit(BTRFS_FS_STATE_REMOUNTING, fs_info-fs_state); +} +static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, + unsigned long old_opts, int flags) +{ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) (!btrfs_raw_test_opt(fs_info-mount_opt, AUTO_DEFRAG) || (flags MS_RDONLY))) { @@ -1247,7 +1250,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned int old_metadata_ratio = fs_info-metadata_ratio; int ret; - btrfs_remount_prepare(fs_info, old_opts, *flags); + btrfs_remount_prepare(fs_info); ret = btrfs_parse_options(root, data); if (ret) { @@ -1255,6 +1258,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info, fs_info-thread_pool_size, old_thread_pool_size); -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] Btrfs: use a lock to protect incompat/compat flag of the super block
The following case will make the incompat/compat flag of the super block be recovered. Task1 |Task2 flags = btrfs_super_incompat_flags(); | |flags = btrfs_super_incompat_flags(); flags |= new_flag1;| |flags |= new_flag2; btrfs_set_super_incompat_flags(flags); | |btrfs_set_super_incompat_flags(flags); the new_flag1 is recovered. In order to avoid this problem, we introduce a lock named super_lock into the btrfs_fs_info structure. If we want to update incompat/compat flags of the super block, we must hold it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 22 -- fs/btrfs/disk-io.c | 5 + fs/btrfs/volumes.c | 10 +- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0d82922..a883e47 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1360,6 +1360,17 @@ struct btrfs_fs_info { wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; + /* +* Used to protect the incompat_flags, compat_flags, compat_ro_flags +* when they are updated. +* +* Because we do not clear the flags for ever, so we needn't use +* the lock on the read side. +* +* We also needn't use the lock when we mount the fs, because +* there is no other task which will update the flag. 
+*/ + spinlock_t super_lock; struct btrfs_super_block *super_copy; struct btrfs_super_block *super_for_commit; struct block_device *__bdev; @@ -3663,8 +3674,15 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, disk_super = fs_info-super_copy; features = btrfs_super_incompat_flags(disk_super); if (!(features flag)) { - features |= flag; - btrfs_set_super_incompat_flags(disk_super, features); + spin_lock(fs_info-super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (!(features flag)) { + features |= flag; + btrfs_set_super_incompat_flags(disk_super, features); + printk(KERN_INFO btrfs: setting %llu feature flag\n, +flag); + } + spin_unlock(fs_info-super_lock); } } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6d19a0a..ab8ef37 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2060,6 +2060,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(fs_info-defrag_inodes_lock); spin_lock_init(fs_info-free_chunk_lock); spin_lock_init(fs_info-tree_mod_seq_lock); + spin_lock_init(fs_info-super_lock); rwlock_init(fs_info-tree_mod_log_lock); mutex_init(fs_info-reloc_mutex); seqlock_init(fs_info-profiles_lock); @@ -2319,6 +2320,10 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } + /* +* Needn't use the lock because there is no other task which will +* update the flag. 
+*/ btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2854c82..e710db4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3674,18 +3674,10 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) { - u64 features; - if (!(type (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) return; - features = btrfs_super_incompat_flags(info-super_copy); - if (features BTRFS_FEATURE_INCOMPAT_RAID56) - return; - - features |= BTRFS_FEATURE_INCOMPAT_RAID56; - btrfs_set_super_incompat_flags(info-super_copy, features); - printk(KERN_INFO btrfs: setting RAID5/6 feature flag\n); + btrfs_set_fs_incompat(info, RAID56); } static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/2] do not open the extend inode reference at the beginning
In most cases, we do not insert so many inode references, so it is better that we don't set incompat flag -- EXTEND_IREF -- when we make a fs. Otherwise we can not mount the fs on the old kernel though there is no extend iref in fact. And some users may not hope to inserting the extend inode reference because of the incompatible problem. In this case, we introduce a mount option named noextiref. Note, if the extend inode reference function is enabled, we will fail to mount a fs with this option because there might be some extend irefs in the fs, we should not close this function. This patchset is against: [PATCH 1/2] Btrfs: fix unblocked autodefraggers when remount [PATCH 2/2] Btrfs: use a lock to protect incompat/compat flag of the super block Miao Xie (2): Btrfs: set the INCOMPAT_EXTENDED_IREF when the extended iref is inserted Btrfs: introduce noextiref mount option fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c| 9 + fs/btrfs/inode-item.c | 20 ++-- fs/btrfs/super.c | 41 - 4 files changed, 60 insertions(+), 11 deletions(-) -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] Btrfs: set the INCOMPAT_EXTENDED_IREF when the extended iref is inserted
We needn't set the INCOMAT_EXTENDED_IREF when making a new fs, just do it after we insert a extended iref successfully. Otherwise, we can not mount the fs in which there is no extended iref in fact on the old kernel, it is not so flexible for the users. Signed-off-by: Miao Xie mi...@cn.fujitsu.com Cc: Mark Fasheh mfas...@suse.de --- fs/btrfs/inode-item.c | 18 +- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 48b8fda..f07eb45 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -443,15 +443,15 @@ out: btrfs_free_path(path); if (ret == -EMLINK) { - struct btrfs_super_block *disk_super = root-fs_info-super_copy; - /* We ran out of space in the ref array. Need to -* add an extended ref. */ - if (btrfs_super_incompat_flags(disk_super) -BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) - ret = btrfs_insert_inode_extref(trans, root, name, - name_len, - inode_objectid, - ref_objectid, index); + /* +* We ran out of space in the ref array. Need to add an +* extended ref. +*/ + ret = btrfs_insert_inode_extref(trans, root, name, name_len, + inode_objectid, ref_objectid, + index); + if (!ret) + btrfs_set_fs_incompat(root-fs_info, EXTENDED_IREF); } return ret; -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] Btrfs: introduce noextiref mount option
Now, we set incompat flag EXTEND_IREF when we actually need insert a extend inode reference, not when making a fs. But some users may hope that the fs still can be mounted on the old kernel, and don't hope we insert any extend inode references. So we introduce noextiref mount option to close this function. Signed-off-by: Miao Xie mi...@cn.fujitsu.com Cc: Mark Fasheh mfas...@suse.de --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c| 9 + fs/btrfs/inode-item.c | 2 +- fs/btrfs/super.c | 41 - 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a883e47..db88963 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1911,6 +1911,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY(1 20) #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 22) +#define BTRFS_MOUNT_NOEXTIREF (1 23) #define btrfs_clear_opt(o, opt)((o) = ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ab8ef37..ee00448 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2269,6 +2269,15 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } + if ((btrfs_super_incompat_flags(disk_super) +BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) + btrfs_test_opt(tree_root, NOEXTIREF)) { + printk(KERN_ERR BTRFS: couldn't mount because the extend iref + can not be close.\n); + err = -EINVAL; + goto fail_alloc; + } + if (btrfs_super_leafsize(disk_super) != btrfs_super_nodesize(disk_super)) { printk(KERN_ERR BTRFS: couldn't mount because metadata diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index f07eb45..7c4f880 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -442,7 +442,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); - if (ret == -EMLINK) { + if (ret == -EMLINK !btrfs_test_opt(root, NOEXTIREF)) { /* * We ran out of space in 
the ref array. Need to add an * extended ref. diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0f03569..fd375b3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -315,7 +315,7 @@ enum { Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_compress_type, Opt_compress_force, Opt_compress_force_type, - Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, + Opt_notreelog, Opt_noextiref, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_skip_balance, @@ -344,6 +344,7 @@ static match_table_t tokens = { {Opt_nossd, nossd}, {Opt_noacl, noacl}, {Opt_notreelog, notreelog}, + {Opt_noextiref, noextiref}, {Opt_flushoncommit, flushoncommit}, {Opt_ratio, metadata_ratio=%d}, {Opt_discard, discard}, @@ -535,6 +536,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO btrfs: disabling tree log\n); btrfs_set_opt(info-mount_opt, NOTREELOG); break; + case Opt_noextiref: + printk(KERN_INFO btrfs: disabling extend inode ref\n); + btrfs_set_opt(info-mount_opt, NOEXTIREF); + break; case Opt_flushoncommit: printk(KERN_INFO btrfs: turning on flush-on-commit\n); btrfs_set_opt(info-mount_opt, FLUSHONCOMMIT); @@ -1202,6 +1207,35 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); } +static int btrfs_close_extend_iref(struct btrfs_fs_info *fs_info, + unsigned long old_opts) +{ + struct btrfs_trans_handle *trans; + int ret; + + if (btrfs_raw_test_opt(old_opts, NOEXTIREF) || + !btrfs_raw_test_opt(fs_info-mount_opt, NOEXTIREF)) + return 0; + + trans = btrfs_attach_transaction(fs_info-tree_root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + return PTR_ERR(trans); + } else { + ret = btrfs_commit_transaction(trans, fs_info-tree_root); + if (ret) + return ret; 
+ } + + if (btrfs_super_incompat_flags(fs_info-super_copy
[PATCH] Btrfs-progs: don't set INCOMPAT_EXTENDED_IREF flag when making a new fs
There is no extended irefs in the new fs, and we can mount it on the old kernel without extended iref function safely. So we needn't set INCOMPAT_EXTENDED_IREF flag when making a new fs, and just set it when we actually insert a extended iref. Signed-off-by: Miao Xie mi...@cn.fujitsu.com Cc: Mark Fasheh mfas...@suse.de --- mkfs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mkfs.c b/mkfs.c index c8cb395..aca6e46 100644 --- a/mkfs.c +++ b/mkfs.c @@ -1654,8 +1654,6 @@ raid_groups: super = root-fs_info-super_copy; flags = btrfs_super_incompat_flags(super); - flags |= BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF; - if (mixed) flags |= BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS; -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs-progs: don't set INCOMPAT_EXTENDED_IREF flag when making a new fs
On Thu, 11 Apr 2013 16:28:11 +0200, Jan Schmidt wrote: On Thu, April 11, 2013 at 12:28 (+0200), Miao Xie wrote: There are no extended irefs in the new fs, and we can mount it on the old kernel without extended iref function safely. So we needn't set INCOMPAT_EXTENDED_IREF flag when making a new fs, and just set it when we actually insert an extended iref. Signed-off-by: Miao Xie mi...@cn.fujitsu.com Cc: Mark Fasheh mfas...@suse.de --- mkfs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mkfs.c b/mkfs.c index c8cb395..aca6e46 100644 --- a/mkfs.c +++ b/mkfs.c @@ -1654,8 +1654,6 @@ raid_groups: super = root-fs_info-super_copy; flags = btrfs_super_incompat_flags(super); -flags |= BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF; - if (mixed) flags |= BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS; This one should have a large *** do not apply until kernel patches from [PATCH 0/2] do not open the extend *** inode reference at the beginning have been merged. tag. Otherwise, extended irefs are disabled entirely for all new file systems in environments where they have been working so far. Yes, thanks for pointing it out. Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] Btrfs: introduce noextiref mount option
On thu, 11 Apr 2013 16:29:48 +0200, Jan Schmidt wrote: On Thu, April 11, 2013 at 12:35 (+0200), Miao Xie wrote: Now, we set incompat flag EXTEND_IREF when we actually need insert a extend inode reference, not when making a fs. But some users may hope that the fs still can be mounted on the old kernel, and don't hope we insert any extend inode references. So we introduce noextiref mount option to close this function. That's a much better approach compared to setting the flag on mkfs, I agree. Signed-off-by: Miao Xie mi...@cn.fujitsu.com Cc: Mark Fasheh mfas...@suse.de --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c| 9 + fs/btrfs/inode-item.c | 2 +- fs/btrfs/super.c | 41 - 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a883e47..db88963 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1911,6 +1911,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY (1 20) #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR(1 22) +#define BTRFS_MOUNT_NOEXTIREF (1 23) #define btrfs_clear_opt(o, opt) ((o) = ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ab8ef37..ee00448 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2269,6 +2269,15 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } +if ((btrfs_super_incompat_flags(disk_super) + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) +btrfs_test_opt(tree_root, NOEXTIREF)) { +printk(KERN_ERR BTRFS: couldn't mount because the extend iref + can not be close.\n); +err = -EINVAL; +goto fail_alloc; +} + if (btrfs_super_leafsize(disk_super) != btrfs_super_nodesize(disk_super)) { printk(KERN_ERR BTRFS: couldn't mount because metadata diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index f07eb45..7c4f880 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -442,7 +442,7 @@ int 
btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); -if (ret == -EMLINK) { +if (ret == -EMLINK !btrfs_test_opt(root, NOEXTIREF)) { /* * We ran out of space in the ref array. Need to add an * extended ref. diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0f03569..fd375b3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -315,7 +315,7 @@ enum { Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_compress_type, Opt_compress_force, Opt_compress_force_type, -Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, +Opt_notreelog, Opt_noextiref, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_skip_balance, @@ -344,6 +344,7 @@ static match_table_t tokens = { {Opt_nossd, nossd}, {Opt_noacl, noacl}, {Opt_notreelog, notreelog}, +{Opt_noextiref, noextiref}, {Opt_flushoncommit, flushoncommit}, {Opt_ratio, metadata_ratio=%d}, {Opt_discard, discard}, @@ -535,6 +536,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO btrfs: disabling tree log\n); btrfs_set_opt(info-mount_opt, NOTREELOG); break; +case Opt_noextiref: +printk(KERN_INFO btrfs: disabling extend inode ref\n); +btrfs_set_opt(info-mount_opt, NOEXTIREF); +break; case Opt_flushoncommit: printk(KERN_INFO btrfs: turning on flush-on-commit\n); btrfs_set_opt(info-mount_opt, FLUSHONCOMMIT); @@ -1202,6 +1207,35 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); } +static int btrfs_close_extend_iref(struct btrfs_fs_info *fs_info, + unsigned long old_opts) The name irritated me, it's more like unset instead of close, isn't it? Maybe btrfs_set_no_extend_iref() is better, the other developers might think we will clear BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF. 
+{ +struct btrfs_trans_handle *trans; +int ret; + +if (btrfs_raw_test_opt(old_opts, NOEXTIREF) || +!btrfs_raw_test_opt(fs_info->mount_opt, NOEXTIREF)) +return 0; + +trans = btrfs_attach_transaction(fs_info->tree_root
Re: [PATCH 2/3] Btrfs: fix the deadlock between the transaction start/attach and commit
On wed, 10 Apr 2013 21:45:43 +0300, Alex Lyakas wrote: Hi Miao, I attempted to fix the issue by not joining a transaction that has trans-in_commit set. I did something similar to what wait_current_trans() does, but I did: smp_rmb(); if (cur_trans cur_trans-in_commit) { ... wait_event(root-fs_info-transaction_wait, !cur_trans-blocked); ... But it will introduce deadlock if we need flush some dirty pages, for example: run ordered operation. I also had to change the order of setting in_commit and blocked in btrfs_commit_transaction: trans-transaction-blocked = 1; trans-transaction-in_commit = 1; smp_wmb(); to make sure that if in_commit is set, then blocked cannot be 0, because btrfs_commit_transaction haven't set it yet to 1. we need smp_wmb() between trans-transaction-blocked = 1; and trans-transaction-in_commit = 1; Or the cpu may set blocked after in_commmit. However, with this fix I observe two issues: # With large trees and heavy commits, join_transaction() is delayed sometimes by 1-3 seconds. This delays the host IO by too much. # With this fix, I think too many transactions happen. Basically with this fix, once transaction-in_commit is set, then I insist to open a new transaction and not to join the current one. It has some bad influence on host response times pattern, but I cannot exactly tell why is that. Did you have other fix in mind? Without the fix, I observe sometimes commits that take like 80 seconds, out of which like 50 seconds are spent in the do-while loop of btrfs_commit_transaction. I'm making the patch to fix this problem, my fix is: - don't flush the dirty page during the commit if we create a snapshot - introduce a new counter to count the external writers(TRANS_USERSPACE/TRANS_START) and if this counter is zero, we will break the while loop. - if flushoncommit is set, we start delalloc flush before the while loop, not in the loop, so we don't flush the dirty pages again and again. 
- introduce a new transaction handle type named TRANS_JOIN_ENDIO, which is used in the endio process. - introduce a new state for transaction commit, at this state, we block TRANS_JOIN, but don't block TRANS_JOIN_ENDIO. Thanks Miao Thanks, Alex. On Mon, Mar 25, 2013 at 11:11 AM, Alex Lyakas alex.bt...@zadarastorage.com wrote: Hi Miao, On Mon, Mar 25, 2013 at 3:51 AM, Miao Xie mi...@cn.fujitsu.com wrote: On Sun, 24 Mar 2013 13:13:22 +0200, Alex Lyakas wrote: Hi Miao, I am seeing another issue. Your fix prevents from TRANS_START to get in the way of a committing transaction. But it does not prevent from TRANS_JOIN. On the other hand, btrfs_commit_transaction has the following loop: do { // attempt to do some useful stuff and/or sleep } while (atomic_read(cur_trans-num_writers) 1 || (should_grow cur_trans-num_joined != joined)); What I see is basically that new writers join the transaction, while btrfs_commit_transaction() does this loop. I see cur_trans-num_writers decreasing, but then it increases, then decreases etc. This can go for several seconds during heavy IO load. There is nothing to prevent new TRANS_JOIN writers coming and joining a transaction over and over, thus delaying transaction commit. The IO path uses TRANS_JOIN; for example run_delalloc_nocow() does that. Do you observe such behavior? Do you believe it's problematic? I know this behavior, there is no problem with it, the latter code will prevent from TRANS_JOIN. 1672 spin_lock(root-fs_info-trans_lock); 1673 root-fs_info-trans_no_join = 1; 1674 spin_unlock(root-fs_info-trans_lock); 1675 wait_event(cur_trans-writer_wait, 1676atomic_read(cur_trans-num_writers) == 1); Yes, this code prevents anybody from joining, but before btrfs_commit_transaction() gets to this code, it may spend sometimes 10 seconds (in my tests) in the do-while loop, while new writers come and go. Basically, it is not deterministic when the do-while loop will exit, it depends on the IO pattern. 
And if we block the TRANS_JOIN at the place you point out, the deadlock will happen because we need deal with the ordered operations which will use TRANS_JOIN here. (I am dealing with the problem you said above by adding a new type of TRANS_* now) Thanks. Alex. Thanks Miao Thanks, Alex. On Mon, Feb 25, 2013 at 12:20 PM, Miao Xie mi...@cn.fujitsu.com wrote: On sun, 24 Feb 2013 21:49:55 +0200, Alex Lyakas wrote: Hi Miao, can you please explain your solution a bit more. On Wed, Feb 20, 2013 at 11:16 AM, Miao Xie mi...@cn.fujitsu.com wrote: Now btrfs_commit_transaction() does this ret = btrfs_run_ordered_operations(root, 0) which async flushes all inodes on the ordered operations list, it introduced a deadlock that transaction-start task, transaction-commit task and the flush workers waited for each other. (See the following URL to get
Re: [PATCH 1/2] Btrfs: online data deduplication
On mon, 8 Apr 2013 22:16:26 +0800, Liu Bo wrote: On Mon, Apr 08, 2013 at 08:54:50AM -0400, Josef Bacik wrote: On Sun, Apr 07, 2013 at 07:12:48AM -0600, Liu Bo wrote: (NOTE: This leads to a FORMAT CHANGE, DO NOT use it on real data.) This introduce the online data deduplication feature for btrfs. (1) WHY do we need deduplication? To improve our storage effiency. (2) WHAT is deduplication? Two key ways for practical deduplication implementations, * When the data is deduplicated (inband vs background) * The granularity of the deduplication. (block level vs file level) For btrfs, we choose * inband(synchronous) * block level We choose them because of the same reason as how zfs does. a) To get an immediate benefit. b) To remove redundant parts within a file. So we have an inband, block level data deduplication here. (3) HOW does deduplication works? This makes full use of file extent back reference, the same way as IOCTL_CLONE, which lets us easily store multiple copies of a set of data as a single copy along with an index of references to the copy. Here we have a) a new dedicated tree(DEDUP tree) and b) a new key(BTRFS_DEDUP_ITEM_KEY), which consists of (stop 64bits of hash, type, disk offset), * stop 64bits of hash It comes from sha256, which is very helpful on avoiding collision. And we take the stop 64bits as the index. * disk offset It helps to find where the data is stored. So the whole deduplication process works as, 1) write something, 2) calculate the hash of this something, 3) try to find the match of hash value by searching DEDUP keys in a dedicated tree, DEDUP tree. 4) if found, skip real IO and link to the existing copy if not, do real IO and insert a DEDUP key to the DEDUP tree. For now, we limit the deduplication unit to PAGESIZE, 4096, and we're going to increase this unit dynamically in the future. 
Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/ctree.h| 53 fs/btrfs/disk-io.c | 33 +- fs/btrfs/extent-tree.c | 22 +++- fs/btrfs/extent_io.c|8 +- fs/btrfs/extent_io.h| 11 ++ fs/btrfs/file-item.c| 186 ++ fs/btrfs/file.c |6 +- fs/btrfs/inode.c| 330 +++ fs/btrfs/ioctl.c| 34 +- fs/btrfs/ordered-data.c | 25 +++- fs/btrfs/ordered-data.h |9 ++ fs/btrfs/print-tree.c |6 +- fs/btrfs/super.c|7 +- 13 files changed, 685 insertions(+), 45 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0d82922..59339bc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -32,6 +32,7 @@ #include asm/kmap_types.h #include linux/pagemap.h #include linux/btrfs.h +#include crypto/hash.h #include extent_io.h #include extent_map.h #include async-thread.h @@ -94,6 +95,9 @@ struct btrfs_ordered_sum; /* holds quota configuration and tracking */ #define BTRFS_QUOTA_TREE_OBJECTID 8ULL +/* dedup tree(experimental) */ +#define BTRFS_DEDUP_TREE_OBJECTID 9ULL + /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL @@ -884,12 +888,31 @@ struct btrfs_file_extent_item { */ __le64 num_bytes; + /* +* the stop 64bits of sha256 hash value, this helps us find the +* corresponding item in dedup tree. +*/ + __le64 dedup_hash; + } __attribute__ ((__packed__)); Please don't do this, do like what we do with the crc tree, just lookup based on the disk bytenr when we free the extent and drop refs that way. Don't further bloat the file extent item, we want it smaller not larger. Thanks, Josef So the real trouble is that I take this hash value as the first field of btrfs_key, and binary searching without the precise first field is not easy. Otherwise we may have to add another key type which replaces hash value with disk bytenr, ie. (disk bytenr, ANOTHER_KEY_TYPE, hash value), then we'll need to search the tree twice to free this one or drop refs. Why need we store refs in btrfs_dedup_item structure? 
I think the following one is better: key.objectid = the stop 64bits of sha256 hash value key.type = whatever key.offset = bytenr /* the bytenr of block */ struct btrfs_dedup_item { __le64 bytenr, /* the start bytenr of the extent */ __le64 len, } In this way, we use the refs in btrfs_extent_item to make sure the block is not freed. And when we truncate the file, all thing we need do is delete the dedup item when we free the extent just like checksum tree. Thanks Miao Either case is tradeoff, but as this is an initial version, we can try all of these knobs and choose the better one
Re: [PATCH 1/2] Btrfs: online data deduplication
On mon, 8 Apr 2013 15:47:27 +0200, David Sterba wrote: On Sun, Apr 07, 2013 at 09:12:48PM +0800, Liu Bo wrote: (2) WHAT is deduplication? Two key ways for practical deduplication implementations, * When the data is deduplicated (inband vs background) * The granularity of the deduplication. (block level vs file level) For btrfs, we choose * inband(synchronous) * block level Block level may be too fine grained leading to excessive fragmentation and increased metadata usage given that there's a much higher chance to find duplicate (4k) blocks here and there. There's always a tradeoff, the practical values that are considered for granularity range from 8k to 64, see eg. this paper for graphs and analyses http://static.usenix.org/event/fast11/tech/full_papers/Meyer.pdf . This also depends on file data type and access patterns, fixing the dedup basic chunk size to one block does not IMHO fit most usecases. Maybe we can make btrfs(including dedup) support the bigalloc just like ext4. Thanks Miao (3) HOW does deduplication works? ... Here we have a) a new dedicated tree(DEDUP tree) and b) a new key(BTRFS_DEDUP_ITEM_KEY), which consists of (stop 64bits of hash, type, disk offset), * stop 64bits of hash It comes from sha256, which is very helpful on avoiding collision. And we take the stop 64bits as the index. Is it safe to use just 64 bits? I'd like to see better reasoning why this is ok. The limitation of btrfs_key to store only 1-2 64bit items is clear and must be handled, but it's IMO a critical design point. * disk offset It helps to find where the data is stored. Does the disk offset also help to resolving block hash collisions? So the whole deduplication process works as, 1) write something, 2) calculate the hash of this something, 3) try to find the match of hash value by searching DEDUP keys in a dedicated tree, DEDUP tree. 4) if found, skip real IO and link to the existing copy if not, do real IO and insert a DEDUP key to the DEDUP tree. ... 
how are the hash collisions handled? Using part of a secure hash cannot be considered equally strong (given that there are no other safety checks like comparing the whole blocks). Last but not least, there was another dedup proposal (author CCed) http://thread.gmane.org/gmane.comp.file-systems.btrfs/21722 david -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V3 1/2] Btrfs: improve the performance of the csums lookup
It is very likely that there are several blocks in bio, it is very inefficient if we get their csums one by one. This patch improves this problem by getting the csums in batch. According to the result of the following test, the execute time of __btrfs_lookup_bio_sums() is down by ~28%(300us - 217us). # dd if=mnt/file of=/dev/null bs=1M count=1024 Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v2 - v3: - address the problem that the logical offset of the pages in the same bio is not contiguous. Changelog v1 - v2: - fix 64bit division problem on i386 machine --- fs/btrfs/extent_io.c| 58 + fs/btrfs/extent_io.h| 4 fs/btrfs/file-item.c| 49 + fs/btrfs/ordered-data.c | 28 ++-- fs/btrfs/ordered-data.h | 3 ++- 5 files changed, 111 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cdee391..19dd3da 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1777,6 +1777,64 @@ out: return ret; } +void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[], + int count) +{ + struct rb_node *node; + struct extent_state *state; + + spin_lock(tree-lock); + /* +* this search will find all the extents that end after +* our range starts. 
+*/ + node = tree_search(tree, start); + BUG_ON(!node); + + state = rb_entry(node, struct extent_state, rb_node); + BUG_ON(state-start != start); + + while (count) { + state-private = *csums++; + count--; + state = next_state(state); + } + spin_unlock(tree-lock); +} + +static inline u64 __btrfs_get_bio_offset(struct bio *bio, int bio_index) +{ + struct bio_vec *bvec = bio-bi_io_vec + bio_index; + + return page_offset(bvec-bv_page) + bvec-bv_offset; +} + +void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, int bio_index, + u32 csums[], int count) +{ + struct rb_node *node; + struct extent_state *state = NULL; + u64 start; + + spin_lock(tree-lock); + do { + start = __btrfs_get_bio_offset(bio, bio_index); + if (state == NULL || state-start != start) { + node = tree_search(tree, start); + BUG_ON(!node); + + state = rb_entry(node, struct extent_state, rb_node); + BUG_ON(state-start != start); + } + state-private = *csums++; + count--; + bio_index++; + + state = next_state(state); + } while (count); + spin_unlock(tree-lock); +} + int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) { struct rb_node *node; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 258c921..db009d8 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -261,6 +261,10 @@ int extent_readpages(struct extent_io_tree *tree, int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent); int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[], + int count); +void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, + int bvec_index, u32 csums[], int count); int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c4628a2..7e4df79 100644 --- 
a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -177,7 +177,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { - u32 sum; + u32 sum[16]; + int len; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; u64 offset = 0; @@ -186,7 +187,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 disk_bytenr; u32 diff; u16 csum_size = btrfs_super_csum_size(root-fs_info-super_copy); - int ret; + int count; struct btrfs_path *path; struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = BTRFS_I(inode)-io_tree; @@ -214,10 +215,12 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (dio) offset = logical_offset; while (bio_index bio-bi_vcnt) { + len = min_t(int, ARRAY_SIZE(sum), bio-bi_vcnt - bio_index); if (!dio) offset
[PATCH V2 1/2] Btrfs: improve the performance of the csums lookup
It is very likely that there are several blocks in bio, it is very inefficient if we get their csums one by one. This patch improves this problem by getting the csums in batch. According to the result of the following test, the execute time of __btrfs_lookup_bio_sums() is down by ~28%(300us - 217us). # dd if=mnt/file of=/dev/null bs=1M count=1024 Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v1 - v2: - fix 64bit division problem on i386 machine --- fs/btrfs/extent_io.c| 31 +++ fs/btrfs/extent_io.h| 2 ++ fs/btrfs/file-item.c| 45 ++--- fs/btrfs/ordered-data.c | 28 +--- fs/btrfs/ordered-data.h | 3 ++- 5 files changed, 78 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cdee391..fc4d3bc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1777,6 +1777,37 @@ out: return ret; } +void cache_csums(struct extent_io_tree *tree, u64 start, u32 csums[], +int count, int sectorsize) +{ + struct rb_node *node; + struct extent_state *state, *next; + + spin_lock(tree-lock); + /* +* this search will find all the extents that end after +* our range starts. 
+*/ + node = tree_search(tree, start); + BUG_ON(!node); + + state = rb_entry(node, struct extent_state, rb_node); + BUG_ON(state-start != start); + + while (count) { + BUG_ON(state-end + 1 - state-start != sectorsize); + + state-private = *csums++; + count--; + next = next_state(state); + + BUG_ON(count (!next || next-start != state-end + 1)); + + state = next; + } + spin_unlock(tree-lock); +} + int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) { struct rb_node *node; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 258c921..59819f0 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -261,6 +261,8 @@ int extent_readpages(struct extent_io_tree *tree, int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent); int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +void cache_csums(struct extent_io_tree *tree, u64 start, u32 csums[], +int count, int sectorsize); int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c4628a2..484017a 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -177,7 +177,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { - u32 sum; + u32 sum[16]; + int len; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; u64 offset = 0; @@ -186,7 +187,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 disk_bytenr; u32 diff; u16 csum_size = btrfs_super_csum_size(root-fs_info-super_copy); - int ret; + int count; + int index; struct btrfs_path *path; struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = BTRFS_I(inode)-io_tree; @@ -214,10 +216,11 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (dio) offset = logical_offset; while 
(bio_index bio-bi_vcnt) { + len = min_t(int, ARRAY_SIZE(sum), bio-bi_vcnt - bio_index); if (!dio) offset = page_offset(bvec-bv_page) + bvec-bv_offset; - ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, sum); - if (ret == 0) + count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, sum, len); + if (count) goto found; if (!item || disk_bytenr item_start_offset || @@ -230,10 +233,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, item = btrfs_lookup_csum(NULL, root-fs_info-csum_root, path, disk_bytenr, 0); if (IS_ERR(item)) { - ret = PTR_ERR(item); - if (ret == -ENOENT || ret == -EFBIG) - ret = 0; - sum = 0; + count = 1; + sum[0] = 0; if (BTRFS_I(inode)-root-root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID
[PATCH V2 2/2] Btrfs: remove btrfs_sector_sum structure
Using the structure btrfs_sector_sum to keep the checksum value is unnecessary, because the extents that btrfs_sector_sum points to are continuous, we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, there is only one member in the structure, so it makes no sense to keep the structure, just remove it, and use a u32 array to store the checksum value. By this change, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time, it improved the performance by ~74% on my SSD (31MB/s - 54MB/s). test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync Signed-off-by: Miao Xie mi...@cn.fujitsu.com Reviewed-by: Liu Bo bo.li@oracle.com --- Changelog v1 - v2: - modify the changelog and the title which can not explain this patch clearly - fix the 64bit division problem on 32bit machine --- fs/btrfs/file-item.c| 144 ++-- fs/btrfs/ordered-data.c | 19 +++ fs/btrfs/ordered-data.h | 25 ++--- fs/btrfs/relocation.c | 10 fs/btrfs/scrub.c| 16 ++ 5 files changed, 71 insertions(+), 143 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 484017a..8d653c2 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,8 +34,7 @@ #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)-sectorsize - (r)-sectorsize) + sizeof(u32) * (r)-sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -313,7 +312,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; LIST_HEAD(tmplist); unsigned long offset; @@ -387,34 +385,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct 
btrfs_csum_item); while (start csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); +MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); + GFP_NOFS); if (!sums) { ret = -ENOMEM; goto fail; } - sector_sum = sums-sums; sums-bytenr = start; - sums-len = size; + sums-len = (int)size; offset = (start - key.offset) root-fs_info-sb-s_blocksize_bits; offset *= csum_size; + size = root-fs_info-sb-s_blocksize_bits; - while (size 0) { - read_extent_buffer(path-nodes[0], - sector_sum-sum, - ((unsigned long)item) + - offset, csum_size); - sector_sum-bytenr = start; - - size -= root-sectorsize; - start += root-sectorsize; - offset += csum_size; - sector_sum++; - } + read_extent_buffer(path-nodes[0], + sums-sums, + ((unsigned long)item) + offset, + csum_size * size); + + start += root-sectorsize * size; list_add_tail(sums-list, tmplist); } path-slots[0]++; @@ -436,23 +428,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; char *data; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; + int index; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; - u64 disk_bytenr; WARN_ON(bio-bi_vcnt = 0); sums = kzalloc
[PATCH 1/4] Btrfs: fix wrong reservation of csums
We reserve the space for csums only when we write data into a file, in the other cases, such as tree log, log replay, we don't do reservation, so we can use the reservation of the transaction handle just for the former. And for the latter, we should use the tree's own reservation. But the function - btrfs_csum_file_blocks() didn't differentiate between these two types of the cases, fix it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/file-item.c | 2 -- fs/btrfs/inode.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index ec16020..b7e529d 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -728,7 +728,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, return -ENOMEM; sector_sum = sums-sums; - trans-adding_csums = 1; again: next_offset = (u64)-1; found_next = 0; @@ -899,7 +898,6 @@ next_sector: goto again; } out: - trans-adding_csums = 0; btrfs_free_path(path); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f26..63eec5c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1743,8 +1743,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sum; list_for_each_entry(sum, list, list) { + trans-adding_csums = 1; btrfs_csum_file_blocks(trans, BTRFS_I(inode)-root-fs_info-csum_root, sum); + trans-adding_csums = 0; } return 0; } -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/4] Btrfs: improve the performance of the csums lookup
It is very likely that there are several blocks in bio, it is very inefficient if we get their csums one by one. This patch improves this problem by getting the csums in batch. According to the result of the following test, the execute time of __btrfs_lookup_bio_sums() is down by ~28%(300us - 217us). # dd if=mnt/file of=/dev/null bs=1M count=1024 Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/extent_io.c| 31 +++ fs/btrfs/extent_io.h| 2 ++ fs/btrfs/file-item.c| 45 ++--- fs/btrfs/ordered-data.c | 24 ++-- fs/btrfs/ordered-data.h | 3 ++- 5 files changed, 75 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f173c5a..3da8da5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1744,6 +1744,37 @@ out: return ret; } +void cache_csums(struct extent_io_tree *tree, u64 start, u32 csums[], +int count, int sectorsize) +{ + struct rb_node *node; + struct extent_state *state, *next; + + spin_lock(tree-lock); + /* +* this search will find all the extents that end after +* our range starts. 
+*/ + node = tree_search(tree, start); + BUG_ON(!node); + + state = rb_entry(node, struct extent_state, rb_node); + BUG_ON(state-start != start); + + while (count) { + BUG_ON(state-end + 1 - state-start != sectorsize); + + state-private = *csums++; + count--; + next = next_state(state); + + BUG_ON(count (!next || next-start != state-end + 1)); + + state = next; + } + spin_unlock(tree-lock); +} + int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) { struct rb_node *node; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6068a19..b95fb6a 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -261,6 +261,8 @@ int extent_readpages(struct extent_io_tree *tree, int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent); int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +void cache_csums(struct extent_io_tree *tree, u64 start, u32 csums[], +int count, int sectorsize); int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b7e529d..3e2f080 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -175,7 +175,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { - u32 sum; + u32 sum[16]; + int len; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; u64 offset = 0; @@ -184,7 +185,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 disk_bytenr; u32 diff; u16 csum_size = btrfs_super_csum_size(root-fs_info-super_copy); - int ret; + int count; + int index; struct btrfs_path *path; struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = BTRFS_I(inode)-io_tree; @@ -212,10 +214,11 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (dio) offset = logical_offset; while 
(bio_index bio-bi_vcnt) { + len = min_t(int, ARRAY_SIZE(sum), bio-bi_vcnt - bio_index); if (!dio) offset = page_offset(bvec-bv_page) + bvec-bv_offset; - ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, sum); - if (ret == 0) + count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, sum, len); + if (count) goto found; if (!item || disk_bytenr item_start_offset || @@ -228,10 +231,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, item = btrfs_lookup_csum(NULL, root-fs_info-csum_root, path, disk_bytenr, 0); if (IS_ERR(item)) { - ret = PTR_ERR(item); - if (ret == -ENOENT || ret == -EFBIG) - ret = 0; - sum = 0; + count = 1; + sum[0] = 0; if (BTRFS_I(inode)-root-root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { set_extent_bits(io_tree, offset, @@ -267,19 +268,25 @@ static int
[PATCH 3/4] Btrfs: remove unnecessary variant in btrfs_sector_sum structure
bytenr in btrfs_sector_sum is unnecessary, because the extents that btrfs_sector_sum points to are continuous,we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time. By this way, the performance of write is improved by ~74% on my SSD (31MB/s - 54MB/s) test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/file-item.c| 144 ++-- fs/btrfs/ordered-data.c | 21 +++ fs/btrfs/ordered-data.h | 25 ++--- fs/btrfs/relocation.c | 10 fs/btrfs/scrub.c| 16 ++ 5 files changed, 72 insertions(+), 144 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 3e2f080..9a447bc 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,8 +34,7 @@ #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)-sectorsize - (r)-sectorsize) + sizeof(u32) * (r)-sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -311,7 +310,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; LIST_HEAD(tmplist); unsigned long offset; @@ -385,34 +383,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); +MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); + GFP_NOFS); if (!sums) { ret = -ENOMEM; goto fail; } - sector_sum = sums-sums; sums-bytenr = start; - sums-len = size; + sums-len = (int)size; offset = (start - 
key.offset) root-fs_info-sb-s_blocksize_bits; offset *= csum_size; + size = root-fs_info-sb-s_blocksize_bits; - while (size 0) { - read_extent_buffer(path-nodes[0], - sector_sum-sum, - ((unsigned long)item) + - offset, csum_size); - sector_sum-bytenr = start; - - size -= root-sectorsize; - start += root-sectorsize; - offset += csum_size; - sector_sum++; - } + read_extent_buffer(path-nodes[0], + sums-sums, + ((unsigned long)item) + offset, + csum_size * size); + + start += root-sectorsize * size; list_add_tail(sums-list, tmplist); } path-slots[0]++; @@ -434,23 +426,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; char *data; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; + int index; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; - u64 disk_bytenr; WARN_ON(bio-bi_vcnt = 0); sums = kzalloc(btrfs_ordered_sum_size(root, bio-bi_size), GFP_NOFS); if (!sums) return -ENOMEM; - sector_sum = sums-sums; - disk_bytenr = (u64)bio-bi_sector 9; sums-len = bio-bi_size; INIT_LIST_HEAD(sums-list); @@ -461,7 +450,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, ordered = btrfs_lookup_ordered_extent
[PATCH 4/4] Btrfs: fix wrong return value of btrfs_lookup_csum()
If we don't find the expected csum item, but find a csum item which is adjacent to the specified extent, we should return -EFBIG; otherwise we should return -ENOENT. But btrfs_lookup_csum() returns -EFBIG even when the csum item is not adjacent to the specified extent. Fix it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/file-item.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 9a447bc..bc89e2f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -117,9 +117,11 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); csums_in_item /= csum_size; - if (csum_offset >= csums_in_item) { + if (csum_offset == csums_in_item) { ret = -EFBIG; goto fail; + } else if (csum_offset > csums_in_item) { + goto fail; } } item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); -- 1.8.0.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V2 3/4] Btrfs: remove unnecessary variant in btrfs_sector_sum structure
bytenr in btrfs_sector_sum is unnecessary, because the extents that btrfs_sector_sum points to are continuous, we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time. By this way, the performance of write is improved by ~74% on my SSD (31MB/s - 54MB/s) test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v1 - v2: - fix messy code in the changelog. --- fs/btrfs/file-item.c| 144 ++-- fs/btrfs/ordered-data.c | 21 +++ fs/btrfs/ordered-data.h | 25 ++--- fs/btrfs/relocation.c | 10 fs/btrfs/scrub.c| 16 ++ 5 files changed, 72 insertions(+), 144 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 3e2f080..9a447bc 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,8 +34,7 @@ #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)-sectorsize - (r)-sectorsize) + sizeof(u32) * (r)-sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -311,7 +310,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; LIST_HEAD(tmplist); unsigned long offset; @@ -385,34 +383,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); +MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); + GFP_NOFS); if (!sums) { ret = -ENOMEM; goto fail; } - sector_sum = sums-sums; sums-bytenr = start; - 
sums-len = size; + sums-len = (int)size; offset = (start - key.offset) root-fs_info-sb-s_blocksize_bits; offset *= csum_size; + size = root-fs_info-sb-s_blocksize_bits; - while (size 0) { - read_extent_buffer(path-nodes[0], - sector_sum-sum, - ((unsigned long)item) + - offset, csum_size); - sector_sum-bytenr = start; - - size -= root-sectorsize; - start += root-sectorsize; - offset += csum_size; - sector_sum++; - } + read_extent_buffer(path-nodes[0], + sums-sums, + ((unsigned long)item) + offset, + csum_size * size); + + start += root-sectorsize * size; list_add_tail(sums-list, tmplist); } path-slots[0]++; @@ -434,23 +426,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; char *data; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; + int index; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; - u64 disk_bytenr; WARN_ON(bio-bi_vcnt = 0); sums = kzalloc(btrfs_ordered_sum_size(root, bio-bi_size), GFP_NOFS); if (!sums) return -ENOMEM; - sector_sum = sums-sums; - disk_bytenr = (u64)bio-bi_sector 9; sums-len = bio-bi_size; INIT_LIST_HEAD(sums-list); @@ -461,7 +450,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode
Re: [PATCH 3/4] Btrfs: remove unnecessary variant in btrfs_sector_sum structure
On Thu, 28 Mar 2013 22:41:50 +0800, Liu Bo wrote: On Thu, Mar 28, 2013 at 10:38:34PM +0800, Liu Bo wrote: On Thu, Mar 28, 2013 at 04:11:38PM +0800, Miao Xie wrote: bytenr in btrfs_sector_sum is unnecessary, because the extents that btrfs_sector_sum points to are continuous,we can find out the expected checksums by btrfs_ordered_sum's bytenr and the offset, so we can remove btrfs_sector_sum's bytenr. After removing bytenr, we don't use the while loop to get the checksums one by one. Now, we can get several checksum value at one time. By this way, the performance of write is improved by ~74% on my SSD (31MB/s - 54MB/s) test command: # dd if=/dev/zero of=/mnt/btrfs/file0 bs=1M count=1024 oflag=sync but the title is a bit confused because you've actually killed all of btrfs_sector_sum. I misused the old title, will change it later. Thanks Miao Looks good to me. Reviewed-by: Liu Bo bo.li@oracle.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/file-item.c| 144 ++-- fs/btrfs/ordered-data.c | 21 +++ fs/btrfs/ordered-data.h | 25 ++--- fs/btrfs/relocation.c | 10 fs/btrfs/scrub.c| 16 ++ 5 files changed, 72 insertions(+), 144 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 3e2f080..9a447bc 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,8 +34,7 @@ #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)-sectorsize - (r)-sectorsize) + sizeof(u32) * (r)-sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -311,7 +310,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; LIST_HEAD(tmplist); unsigned long offset; @@ -385,34 +383,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct 
btrfs_csum_item); while (start csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); +MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); + GFP_NOFS); if (!sums) { ret = -ENOMEM; goto fail; } - sector_sum = sums-sums; sums-bytenr = start; - sums-len = size; + sums-len = (int)size; offset = (start - key.offset) root-fs_info-sb-s_blocksize_bits; offset *= csum_size; + size = root-fs_info-sb-s_blocksize_bits; - while (size 0) { - read_extent_buffer(path-nodes[0], - sector_sum-sum, - ((unsigned long)item) + - offset, csum_size); - sector_sum-bytenr = start; - - size -= root-sectorsize; - start += root-sectorsize; - offset += csum_size; - sector_sum++; - } + read_extent_buffer(path-nodes[0], + sums-sums, + ((unsigned long)item) + offset, + csum_size * size); + + start += root-sectorsize * size; list_add_tail(sums-list, tmplist); } path-slots[0]++; @@ -434,23 +426,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; char *data; struct bio_vec *bvec = bio-bi_io_vec; int bio_index = 0; + int index; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; - u64 disk_bytenr; WARN_ON(bio-bi_vcnt = 0); sums = kzalloc(btrfs_ordered_sum_size(root, bio-bi_size), GFP_NOFS); if (!sums) return -ENOMEM; - sector_sum = sums-sums; - disk_bytenr = (u64)bio
Re: [PATCH 2/3] Btrfs: fix the deadlock between the transaction start/attach and commit
On Sun, 24 Mar 2013 13:13:22 +0200, Alex Lyakas wrote: Hi Miao, I am seeing another issue. Your fix prevents from TRANS_START to get in the way of a committing transaction. But it does not prevent from TRANS_JOIN. On the other hand, btrfs_commit_transaction has the following loop: do { // attempt to do some useful stuff and/or sleep } while (atomic_read(cur_trans-num_writers) 1 || (should_grow cur_trans-num_joined != joined)); What I see is basically that new writers join the transaction, while btrfs_commit_transaction() does this loop. I see cur_trans-num_writers decreasing, but then it increases, then decreases etc. This can go for several seconds during heavy IO load. There is nothing to prevent new TRANS_JOIN writers coming and joining a transaction over and over, thus delaying transaction commit. The IO path uses TRANS_JOIN; for example run_delalloc_nocow() does that. Do you observe such behavior? Do you believe it's problematic? I know this behavior, there is no problem with it, the latter code will prevent from TRANS_JOIN. 1672 spin_lock(root-fs_info-trans_lock); 1673 root-fs_info-trans_no_join = 1; 1674 spin_unlock(root-fs_info-trans_lock); 1675 wait_event(cur_trans-writer_wait, 1676atomic_read(cur_trans-num_writers) == 1); And if we block the TRANS_JOIN at the place you point out, the deadlock will happen because we need deal with the ordered operations which will use TRANS_JOIN here. (I am dealing with the problem you said above by adding a new type of TRANS_* now) Thanks Miao Thanks, Alex. On Mon, Feb 25, 2013 at 12:20 PM, Miao Xie mi...@cn.fujitsu.com wrote: On sun, 24 Feb 2013 21:49:55 +0200, Alex Lyakas wrote: Hi Miao, can you please explain your solution a bit more. 
On Wed, Feb 20, 2013 at 11:16 AM, Miao Xie mi...@cn.fujitsu.com wrote: Now btrfs_commit_transaction() does this ret = btrfs_run_ordered_operations(root, 0) which async flushes all inodes on the ordered operations list, it introduced a deadlock that transaction-start task, transaction-commit task and the flush workers waited for each other. (See the following URL to get the detail http://marc.info/?l=linux-btrfsm=136070705732646w=2) As we know, if -in_commit is set, it means someone is committing the current transaction, we should not try to join it if we are not JOIN or JOIN_NOLOCK, wait is the best choice for it. In this way, we can avoid the above problem. In this way, there is another benefit: there is no new transaction handle to block the transaction which is on the way of commit, once we set -in_commit. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 17 - 1 files changed, 16 insertions(+), 1 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bc2f2d1..71b7e2e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -51,6 +51,14 @@ static noinline void switch_commit_root(struct btrfs_root *root) root-commit_root = btrfs_root_node(root); } +static inline int can_join_transaction(struct btrfs_transaction *trans, + int type) +{ + return !(trans-in_commit +type != TRANS_JOIN +type != TRANS_JOIN_NOLOCK); +} + /* * either allocate a new transaction or hop into the existing one */ @@ -86,6 +94,10 @@ loop: spin_unlock(fs_info-trans_lock); return cur_trans-aborted; } + if (!can_join_transaction(cur_trans, type)) { + spin_unlock(fs_info-trans_lock); + return -EBUSY; + } atomic_inc(cur_trans-use_count); atomic_inc(cur_trans-num_writers); cur_trans-num_joined++; @@ -360,8 +372,11 @@ again: do { ret = join_transaction(root, type); - if (ret == -EBUSY) + if (ret == -EBUSY) { wait_current_trans(root); + if (unlikely(type == TRANS_ATTACH)) + ret = -ENOENT; + } So I understand that instead of incrementing 
num_writes and joining the current transaction, you do not join and wait for the current transaction to unblock. More specifically,TRANS_START、TRANS_USERSPACE and TRANS_ATTACH can not join and just wait for the current transaction to unblock if -in_commit is set. Which task in Josef's example http://marc.info/?l=linux-btrfsm=136070705732646w=2 task 1, task 2 or task 3 is the one that will not join the transaction, but instead wait? Task1 will not join the transaction, in this way, async inode flush won't run, and then task3 won't do anything. Before applying the patch: Start/Attach_Trans_Task Commit_Task
Re: [PATCH] Btrfs: improve the delayed inode throttling
On wed, 6 Mar 2013 09:53:28 -0500, Chris Mason wrote: On Tue, Mar 05, 2013 at 07:45:34PM -0700, Miao Xie wrote: We re-queue the node just when there are some delayed items in the current node. But if the node still has delayed items after we deal with it, that is to say someone is accessing the node. So it is better to release it and deal with it later. In this way, we can amass more items and deal with them in batches. Thanks, I've made this change. + } else { + btrfs_release_prepared_delayed_node(delayed_node); + if (async_work-nr == 0 || total_done async_work-nr) + goto again; If joining transaction fails, we should end the async handle. And for case -nr == 0 (it means there are too many items, we need flush all), we can set -blocked of the current transaction, in this way, the users can not insert any delayed item for a while, and will wait until the current transation is committed This one I've left out for now, the old code didn't block and I'd prefer that we test that change independently. V2 below, it also has the break Liu Bo mentioned. From: Chris Mason chris.ma...@fusionio.com Date: Mon, 4 Mar 2013 17:13:31 -0500 Subject: [PATCH] Btrfs: improve the delayed inode throttling The delayed inode code batches up changes to the btree in hopes of doing them in bulk. As the changes build up, processes kick off worker threads and wait for them to make progress. The current code kicks off an async work queue item for each delayed node, which creates a lot of churn. It also uses a fixed 1 HZ waiting period for the throttle, which allows us to build a lot of pending work and can slow down the commit. This changes us to watch a sequence counter as it is bumped during the operations. We kick off fewer work items and have each work item do more work. 
Signed-off-by: Chris Mason chris.ma...@fusionio.com --- fs/btrfs/delayed-inode.c | 152 --- fs/btrfs/delayed-inode.h | 2 + 2 files changed, 94 insertions(+), 60 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0b278b1..46f354a 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -22,8 +22,9 @@ #include disk-io.h #include transaction.h -#define BTRFS_DELAYED_WRITEBACK 400 -#define BTRFS_DELAYED_BACKGROUND 100 +#define BTRFS_DELAYED_WRITEBACK 512 +#define BTRFS_DELAYED_BACKGROUND 128 +#define BTRFS_DELAYED_BATCH 16 static struct kmem_cache *delayed_node_cache; @@ -494,6 +495,15 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, BTRFS_DELAYED_DELETION_ITEM); } +static void finish_one_item(struct btrfs_delayed_root *delayed_root) +{ + int seq = atomic_inc_return(delayed_root-items_seq); + if ((atomic_dec_return(delayed_root-items) + BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) + waitqueue_active(delayed_root-wait)) + wake_up(delayed_root-wait); +} + static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { struct rb_root *root; @@ -512,10 +522,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) rb_erase(delayed_item-rb_node, root); delayed_item-delayed_node-count--; - if (atomic_dec_return(delayed_root-items) - BTRFS_DELAYED_BACKGROUND - waitqueue_active(delayed_root-wait)) - wake_up(delayed_root-wait); + + finish_one_item(delayed_root); } static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) @@ -1056,10 +1064,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) delayed_node-count--; delayed_root = delayed_node-root-fs_info-delayed_root; - if (atomic_dec_return(delayed_root-items) - BTRFS_DELAYED_BACKGROUND - waitqueue_active(delayed_root-wait)) - wake_up(delayed_root-wait); + finish_one_item(delayed_root); } } @@ -1304,35 +1309,44 @@ void 
btrfs_remove_delayed_node(struct inode *inode) btrfs_release_delayed_node(delayed_node); } -struct btrfs_async_delayed_node { - struct btrfs_root *root; - struct btrfs_delayed_node *delayed_node; +struct btrfs_async_delayed_work { + struct btrfs_delayed_root *delayed_root; + int nr; struct btrfs_work work; }; -static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) +static void btrfs_async_run_delayed_root(struct btrfs_work *work) { - struct btrfs_async_delayed_node *async_node; + struct btrfs_async_delayed_work *async_work; + struct btrfs_delayed_root *delayed_root; struct btrfs_trans_handle *trans; struct
Re: [PATCH] Btrfs: improve the delayed inode throttling
On wed, 6 Mar 2013 09:53:28 -0500, Chris Mason wrote: [SNIP] static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, - struct btrfs_root *root, int all) + struct btrfs_root *root, int nr) { - struct btrfs_async_delayed_node *async_node; - struct btrfs_delayed_node *curr; - int count = 0; + struct btrfs_async_delayed_work *async_work; -again: - curr = btrfs_first_prepared_delayed_node(delayed_root); - if (!curr) + if (atomic_read(delayed_root-items) BTRFS_DELAYED_BACKGROUND) return 0; - async_node = kmalloc(sizeof(*async_node), GFP_NOFS); - if (!async_node) { - btrfs_release_prepared_delayed_node(curr); + async_work = kmalloc(sizeof(*async_work), GFP_NOFS); + if (!async_work) return -ENOMEM; - } - - async_node-root = root; - async_node-delayed_node = curr; - - async_node-work.func = btrfs_async_run_delayed_node_done; - async_node-work.flags = 0; - - btrfs_queue_worker(root-fs_info-delayed_workers, async_node-work); - count++; - if (all || count 4) - goto again; + async_work-delayed_root = delayed_root; + async_work-work.func = btrfs_async_run_delayed_root; + async_work-work.flags = 0; + if (nr) + async_work-nr = 0; + else + async_work-nr = nr; the code here is wrong. the argument nr is the number we want to deal with, if it is 0, we will deal with all. 
so - if (nr) - async_work-nr = 0; - else - async_work-nr = nr; + async_work-nr = nr; + btrfs_queue_worker(root-fs_info-delayed_workers, async_work-work); return 0; } @@ -1424,30 +1431,55 @@ void btrfs_assert_delayed_root_empty(struct btrfs_root *root) WARN_ON(btrfs_first_delayed_node(delayed_root)); } +static int refs_newer(struct btrfs_delayed_root *delayed_root, + int seq, int count) +{ + int val = atomic_read(delayed_root-items_seq); + + if (val seq || val = seq + count) + return 1; + return 0; +} + void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; + int seq; delayed_root = btrfs_get_delayed_root(root); if (atomic_read(delayed_root-items) BTRFS_DELAYED_BACKGROUND) return; + seq = atomic_read(delayed_root-items_seq); + if (atomic_read(delayed_root-items) = BTRFS_DELAYED_WRITEBACK) { int ret; + DEFINE_WAIT(__wait); + ret = btrfs_wq_run_delayed_node(delayed_root, root, 1); here - ret = btrfs_wq_run_delayed_node(delayed_root, root, 1); + ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); if (ret) return; - wait_event_interruptible_timeout( - delayed_root-wait, - (atomic_read(delayed_root-items) - BTRFS_DELAYED_BACKGROUND), - HZ); - return; + while (1) { + prepare_to_wait(delayed_root-wait, __wait, + TASK_INTERRUPTIBLE); + + if (refs_newer(delayed_root, seq, +BTRFS_DELAYED_BATCH) || + atomic_read(delayed_root-items) + BTRFS_DELAYED_BACKGROUND) { + break; + } + if (!signal_pending(current)) + schedule(); + else + break; + } + finish_wait(delayed_root-wait, __wait); } - btrfs_wq_run_delayed_node(delayed_root, root, 0); + btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); } There is a problem that we may introduce lots of btrfs_works, we need avoid it. Thanks Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: improve the delayed inode throttling
On wed, 6 Mar 2013 22:06:50 -0500, Chris Mason wrote: On Wed, Mar 06, 2013 at 06:39:30PM -0700, Miao Xie wrote: On wed, 6 Mar 2013 09:53:28 -0500, Chris Mason wrote: [SNIP] + async_work-delayed_root = delayed_root; + async_work-work.func = btrfs_async_run_delayed_root; + async_work-work.flags = 0; + if (nr) + async_work-nr = 0; + else + async_work-nr = nr; the code here is wrong. the argument nr is the number we want to deal with, if it is 0, we will deal with all. Whoops, thanks. I missed that when I was cleaning things up. - btrfs_wq_run_delayed_node(delayed_root, root, 0); + btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); } There is a problem that we may introduce lots of btrfs_works, we need avoid it. It is possible, but we won't make more than we used to. The real solution is to limit the workers per root, but the code isn't currently structured for that. Right now the workers will exit out if the number of pending items is below the delayed limit, which isn't perfect but I think it's the best I can do right now. Do you see better ways to improve it? How do you think about per-cpu btrfs_work? If btrfs_work on the current cpu is dealt with, we don't queue it, just update -nr if need and tell the workers that we need do flush again. (This way is a bit ugly because btrfs_work might not be handled on its cpu) Thanks Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: improve the delayed inode throttling
On tue, 5 Mar 2013 10:40:17 -0500, Chris Mason wrote: diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0b278b1..460d1a8 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -22,8 +22,8 @@ #include disk-io.h #include transaction.h -#define BTRFS_DELAYED_WRITEBACK 400 -#define BTRFS_DELAYED_BACKGROUND 100 +#define BTRFS_DELAYED_WRITEBACK 512 +#define BTRFS_DELAYED_BACKGROUND 128 static struct kmem_cache *delayed_node_cache; @@ -494,6 +494,15 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, BTRFS_DELAYED_DELETION_ITEM); } +static void finish_one_item(struct btrfs_delayed_root *delayed_root) +{ + int seq = atomic_inc_return(delayed_root-items_seq); + if ((atomic_dec_return(delayed_root-items) + BTRFS_DELAYED_BACKGROUND || seq % 16 == 0) + waitqueue_active(delayed_root-wait)) + wake_up(delayed_root-wait); +} + static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { struct rb_root *root; @@ -512,10 +521,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) rb_erase(delayed_item-rb_node, root); delayed_item-delayed_node-count--; - if (atomic_dec_return(delayed_root-items) - BTRFS_DELAYED_BACKGROUND - waitqueue_active(delayed_root-wait)) - wake_up(delayed_root-wait); + + finish_one_item(delayed_root); } static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) @@ -1056,10 +1063,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) delayed_node-count--; delayed_root = delayed_node-root-fs_info-delayed_root; - if (atomic_dec_return(delayed_root-items) - BTRFS_DELAYED_BACKGROUND - waitqueue_active(delayed_root-wait)) - wake_up(delayed_root-wait); + finish_one_item(delayed_root); } } @@ -1304,35 +1308,55 @@ void btrfs_remove_delayed_node(struct inode *inode) btrfs_release_delayed_node(delayed_node); } +#if 0 struct btrfs_async_delayed_node { struct btrfs_root *root; struct btrfs_delayed_node 
*delayed_node; struct btrfs_work work; }; +#endif + +struct btrfs_async_delayed_work { + struct btrfs_delayed_root *delayed_root; + int nr; + struct btrfs_work work; +}; -static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) +static void btrfs_async_run_delayed_root(struct btrfs_work *work) { - struct btrfs_async_delayed_node *async_node; + struct btrfs_async_delayed_work *async_work; + struct btrfs_delayed_root *delayed_root; struct btrfs_trans_handle *trans; struct btrfs_path *path; struct btrfs_delayed_node *delayed_node = NULL; struct btrfs_root *root; struct btrfs_block_rsv *block_rsv; int need_requeue = 0; + int total_done = 0; - async_node = container_of(work, struct btrfs_async_delayed_node, work); + async_work = container_of(work, struct btrfs_async_delayed_work, work); + delayed_root = async_work-delayed_root; path = btrfs_alloc_path(); if (!path) goto out; - path-leave_spinning = 1; - delayed_node = async_node-delayed_node; +again: + if (atomic_read(delayed_root-items) BTRFS_DELAYED_BACKGROUND / 2) + goto free_path; + + delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + if (!delayed_node) + goto free_path; + +requeue: + path-leave_spinning = 1; + need_requeue = 0; root = delayed_node-root; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) - goto free_path; + goto release_path; block_rsv = trans-block_rsv; trans-block_rsv = root-fs_info-delayed_block_rsv; @@ -1373,47 +1397,48 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) trans-block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); btrfs_btree_balance_dirty_nodelay(root); + +release_path: + btrfs_release_path(path); + total_done++; + + if (need_requeue) { + goto requeue; We re-queue the node just when there are some delayed items in the current node. But if the node still has delayed items after we deal with it, that is to say someone is accessing the node. So it is better to release it and deal with it later. 
In this way, we can amass more items and deal with them in batches. + } else { + btrfs_release_prepared_delayed_node(delayed_node); + if (async_work-nr == 0 || total_done async_work-nr) + goto again; If joining transaction fails, we should
[PATCH 1/2] Btrfs: fix wrong handle at error path of create_snapshot() when the commit fails
There are several bugs at error path of create_snapshot() when the transaction commitment failed. - access the freed transaction handler. At the end of the transaction commitment, the transaction handler was freed, so we should not access it after the transaction commitment. - we were not aware of the error which happened during the snapshot creation if we submitted a async transaction commitment. - pending snapshot access vs pending snapshot free. when something wrong happened after we submitted a async transaction commitment, the transaction committer would cleanup the pending snapshots and free them. But the snapshot creators were not aware of it, they would access the freed pending snapshots. This patch fixes the above problems by: - remove the dangerous code that accessed the freed handler - assign -error if the error happens during the snapshot creation - the transaction committer doesn't free the pending snapshots, just assigns the error number and evicts them before we unblock the transaction. 
Reported-by: Dan Carpenter dan.carpen...@oracle.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 16 +--- fs/btrfs/ioctl.c |6 + fs/btrfs/transaction.c | 58 +++ 3 files changed, 41 insertions(+), 39 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 02369a3..7d84651 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -62,7 +62,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t); static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); static int btrfs_destroy_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, @@ -3687,7 +3687,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t) { struct btrfs_pending_snapshot *snapshot; struct list_head splice; @@ -3700,10 +3700,8 @@ static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) snapshot = list_entry(splice.next, struct btrfs_pending_snapshot, list); - + snapshot-error = -ECANCELED; list_del_init(snapshot-list); - - kfree(snapshot); } } @@ -3840,6 +3838,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans-blocked = 1; wake_up(root-fs_info-transaction_blocked_wait); + btrfs_evict_pending_snapshots(cur_trans); + cur_trans-blocked = 0; wake_up(root-fs_info-transaction_wait); @@ -3849,8 +3849,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); - 
btrfs_destroy_pending_snapshots(cur_trans); - btrfs_destroy_marked_extents(root, cur_trans-dirty_pages, EXTENT_DIRTY); btrfs_destroy_pinned_extent(root, @@ -3894,6 +3892,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) if (waitqueue_active(root-fs_info-transaction_blocked_wait)) wake_up(root-fs_info-transaction_blocked_wait); + btrfs_evict_pending_snapshots(t); + t-blocked = 0; smp_mb(); if (waitqueue_active(root-fs_info-transaction_wait)) @@ -3907,8 +3907,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); - btrfs_destroy_pending_snapshots(t); - btrfs_destroy_delalloc_inodes(root); spin_lock(root-fs_info-trans_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b908960..94c0e42 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -596,12 +596,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, ret = btrfs_commit_transaction(trans, root-fs_info-extent_root); } - if (ret) { - /* cleanup_transaction has freed this for us */ - if (trans-aborted) - pending_snapshot = NULL; + if (ret) goto fail; - } ret = pending_snapshot-error; if (ret
[PATCH 2/2] Btrfs: fix unclosed transaction handler when the async transaction commitment fails
If the async transaction commitment failed, we need to close the current transaction handle, or the current transaction will be blocked from committing because of this orphaned handle. We fix the problem by doing a sync transaction commitment, that is, by invoking btrfs_commit_transaction(). Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 94c0e42..3fdfabc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -527,6 +527,8 @@ fail: if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); + if (err) + err = btrfs_commit_transaction(trans, root); } else { err = btrfs_commit_transaction(trans, root); } @@ -592,6 +594,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, *async_transid = trans->transid; ret = btrfs_commit_transaction_async(trans, root->fs_info->extent_root, 1); + if (ret) + ret = btrfs_commit_transaction(trans, root); } else { ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); -- 1.6.5.2 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] Btrfs: fix wrong handle at error path of create_snapshot() when the commit fails
On Mon, 4 Mar 2013 18:54:02 +0800, Liu Bo wrote: On Mon, Mar 04, 2013 at 05:44:29PM +0800, Miao Xie wrote: There are several bugs at error path of create_snapshot() when the transaction commitment failed. - access the freed transaction handler. At the end of the transaction commitment, the transaction handler was freed, so we should not access it after the transaction commitment. - we were not aware of the error which happened during the snapshot creation if we submitted a async transaction commitment. - pending snapshot access vs pending snapshot free. when something wrong happened after we submitted a async transaction commitment, the transaction committer would cleanup the pending snapshots and free them. But the snapshot creators were not aware of it, they would access the freed pending snapshots. This patch fixes the above problems by: - remove the dangerous code that accessed the freed handler - assign -error if the error happens during the snapshot creation - the transaction committer doesn't free the pending snapshots, just assigns the error number and evicts them before we unblock the transaction. 
Reported-by: Dan Carpenter dan.carpen...@oracle.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 16 +--- fs/btrfs/ioctl.c |6 + fs/btrfs/transaction.c | 58 +++ 3 files changed, 41 insertions(+), 39 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 02369a3..7d84651 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -62,7 +62,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t); static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); static int btrfs_destroy_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, @@ -3687,7 +3687,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t) { struct btrfs_pending_snapshot *snapshot; struct list_head splice; @@ -3700,10 +3700,8 @@ static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) snapshot = list_entry(splice.next, struct btrfs_pending_snapshot, list); - +snapshot-error = -ECANCELED; ECANCELED or EROFS? Now that EROFS is why we're here. If trans-blocks_used is not 0, the file system may not be set to read-only, so I chose ECANCELED, this error number is proper, I think. Thanks Miao Others look good. 
thanks, liubo list_del_init(snapshot-list); - -kfree(snapshot); } } @@ -3840,6 +3838,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans-blocked = 1; wake_up(root-fs_info-transaction_blocked_wait); +btrfs_evict_pending_snapshots(cur_trans); + cur_trans-blocked = 0; wake_up(root-fs_info-transaction_wait); @@ -3849,8 +3849,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); -btrfs_destroy_pending_snapshots(cur_trans); - btrfs_destroy_marked_extents(root, cur_trans-dirty_pages, EXTENT_DIRTY); btrfs_destroy_pinned_extent(root, @@ -3894,6 +3892,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) if (waitqueue_active(root-fs_info-transaction_blocked_wait)) wake_up(root-fs_info-transaction_blocked_wait); +btrfs_evict_pending_snapshots(t); + t-blocked = 0; smp_mb(); if (waitqueue_active(root-fs_info-transaction_wait)) @@ -3907,8 +3907,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); -btrfs_destroy_pending_snapshots(t); - btrfs_destroy_delalloc_inodes(root); spin_lock(root-fs_info-trans_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b908960..94c0e42 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -596,12 +596,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, ret = btrfs_commit_transaction(trans
[PATCH 1/3] Btrfs: remove unnecessary dget_parent/dput when creating the pending snapshot
Since we have grabbed the parent inode at the beginning of the snapshot creation, and both sync and async snapshot creation release it after the pending snapshots are actually created, it is safe to access the parent inode directly during the snapshot creation, we needn't use dget_parent/dput to fix the parent dentry and get the dir inode. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c | 10 ++ fs/btrfs/transaction.c |5 + fs/btrfs/transaction.h |1 + 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2bbbed5..75c551d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -530,9 +530,10 @@ fail: return ret; } -static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen, u64 *async_transid, - bool readonly, struct btrfs_qgroup_inherit *inherit) +static int create_snapshot(struct btrfs_root *root, struct inode *dir, + struct dentry *dentry, char *name, int namelen, + u64 *async_transid, bool readonly, + struct btrfs_qgroup_inherit *inherit) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; @@ -551,6 +552,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, pending_snapshot-dentry = dentry; pending_snapshot-root = root; pending_snapshot-readonly = readonly; + pending_snapshot-dir = dir; pending_snapshot-inherit = inherit; trans = btrfs_start_transaction(root-fs_info-extent_root, 6); @@ -728,7 +730,7 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_up_read; if (snap_src) { - error = create_snapshot(snap_src, dentry, name, namelen, + error = create_snapshot(snap_src, dir, dentry, name, namelen, async_transid, readonly, inherit); } else { error = create_subvol(BTRFS_I(dir)-root, dentry, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 955204c..63390a3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1061,7 +1061,6 @@ static noinline int 
create_pending_snapshot(struct btrfs_trans_handle *trans, struct inode *parent_inode; struct btrfs_path *path; struct btrfs_dir_item *dir_item; - struct dentry *parent; struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; @@ -1118,8 +1117,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trans-block_rsv = pending-block_rsv; dentry = pending-dentry; - parent = dget_parent(dentry); - parent_inode = parent-d_inode; + parent_inode = pending-dir; parent_root = BTRFS_I(parent_inode)-root; record_root_in_trans(trans, parent_root); @@ -1267,7 +1265,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, root, ret); fail: - dput(parent); trans-block_rsv = rsv; no_free_objectid: kfree(new_root_item); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 5afd7b1..5f67fba 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -84,6 +84,7 @@ struct btrfs_trans_handle { struct btrfs_pending_snapshot { struct dentry *dentry; + struct inode *dir; struct btrfs_root *root; struct btrfs_root *snap; struct btrfs_qgroup_inherit *inherit; -- 1.6.5.2 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] Btrfs: fix wrong reserved space in qgroup during snap/subv creation
There are two problems in the space reservation of the snapshot/ subvolume creation. - don't reserve the space for the root item insertion - the space which is reserved in the qgroup is different with the free space reservation. we need reserve free space for 7 items, but in qgroup reservation, we need reserve space only for 3 items. So we implement new metadata reservation functions for the snapshot/subvolume creation. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h |9 +- fs/btrfs/extent-tree.c | 65 +++- fs/btrfs/ioctl.c | 62 +++-- fs/btrfs/transaction.c |4 +-- fs/btrfs/transaction.h |1 + 5 files changed, 105 insertions(+), 36 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b355bb4..b98c451 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3068,8 +3068,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_release_metadata(struct inode *inode); -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, +struct btrfs_block_rsv *rsv, +int nitems, +u64 *qgroup_reserved); +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved); int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 88831fa..b795ed9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4496,19 +4496,60 @@ void btrfs_orphan_release_metadata(struct inode *inode) btrfs_block_rsv_release(root, root-orphan_block_rsv, num_bytes); } -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, - struct 
btrfs_pending_snapshot *pending) +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need do reservation + * qgroup_reserved: used to return the reserved size in qgroup + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. Those operations are different with the + * common file/directory operations, they change two fs/file trees + * and root tree, the number of items that the qgroup reserves is + * different with the free space reservation. So we can not use + * the space reseravtion mechanism in start_transaction(). + */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, +struct btrfs_block_rsv *rsv, +int items, +u64 *qgroup_reserved) { - struct btrfs_root *root = pending-root; - struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); - struct btrfs_block_rsv *dst_rsv = pending-block_rsv; - /* -* two for root back/forward refs, two for directory entries, -* one for root of the snapshot and one for parent inode. 
-*/ - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); - dst_rsv-space_info = src_rsv-space_info; - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); + u64 num_bytes; + int ret; + + if (root-fs_info-quota_enabled) { + /* One for parent inode, two for dir entries */ + num_bytes = 3 * root-leafsize; + ret = btrfs_qgroup_reserve(root, num_bytes); + if (ret) + return ret; + } else { + num_bytes = 0; + } + + *qgroup_reserved = num_bytes; + + num_bytes = btrfs_calc_trans_metadata_size(root, items); + rsv-space_info = __find_space_info(root-fs_info, + BTRFS_BLOCK_GROUP_METADATA); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + if (*qgroup_reserved) + btrfs_qgroup_free(root, *qgroup_reserved); + } + + return ret; +} + +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved) +{ + btrfs_block_rsv_release(root, rsv
[PATCH 3/3] Btrfs: fix wrong reserved space when deleting a snapshot/subvolume
When deleting a snapshot/subvolume, we need remove root ref/backref, dir entries and update the dir inode, so we must reserve free space for those operations. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c | 21 +++-- 1 files changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8046cfc..0b46081 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2064,6 +2064,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_root *dest = NULL; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; + struct btrfs_block_rsv block_rsv; + u64 qgroup_reserved; int namelen; int ret; int err = 0; @@ -2153,12 +2155,23 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out_up_write; + btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* +* One for dir inode, two for dir entries, two for root +* ref/backref. +*/ + err = btrfs_subvolume_reserve_metadata(root, block_rsv, + 5, qgroup_reserved); + if (err) + goto out_up_write; + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); - goto out_up_write; + goto out_release; } - trans-block_rsv = root-fs_info-global_block_rsv; + trans-block_rsv = block_rsv; + trans-bytes_reserved = block_rsv.size; ret = btrfs_unlink_subvol(trans, root, dir, dest-root_key.objectid, @@ -2188,10 +2201,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, } } out_end_trans: + trans-block_rsv = NULL; + trans-bytes_reserved = 0; ret = btrfs_end_transaction(trans, root); if (ret !err) err = ret; inode-i_flags |= S_DEAD; +out_release: + btrfs_subvolume_release_metadata(root, block_rsv, qgroup_reserved); out_up_write: up_write(root-fs_info-subvol_sem); out_unlock: -- 1.6.5.2 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] Btrfs: fix the deadlock between the transaction start/attach and commit
On sun, 24 Feb 2013 21:49:55 +0200, Alex Lyakas wrote: Hi Miao, can you please explain your solution a bit more. On Wed, Feb 20, 2013 at 11:16 AM, Miao Xie mi...@cn.fujitsu.com wrote: Now btrfs_commit_transaction() does this ret = btrfs_run_ordered_operations(root, 0) which async flushes all inodes on the ordered operations list, it introduced a deadlock that transaction-start task, transaction-commit task and the flush workers waited for each other. (See the following URL to get the detail http://marc.info/?l=linux-btrfsm=136070705732646w=2) As we know, if -in_commit is set, it means someone is committing the current transaction, we should not try to join it if we are not JOIN or JOIN_NOLOCK, wait is the best choice for it. In this way, we can avoid the above problem. In this way, there is another benefit: there is no new transaction handle to block the transaction which is on the way of commit, once we set -in_commit. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 17 - 1 files changed, 16 insertions(+), 1 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bc2f2d1..71b7e2e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -51,6 +51,14 @@ static noinline void switch_commit_root(struct btrfs_root *root) root-commit_root = btrfs_root_node(root); } +static inline int can_join_transaction(struct btrfs_transaction *trans, + int type) +{ + return !(trans-in_commit +type != TRANS_JOIN +type != TRANS_JOIN_NOLOCK); +} + /* * either allocate a new transaction or hop into the existing one */ @@ -86,6 +94,10 @@ loop: spin_unlock(fs_info-trans_lock); return cur_trans-aborted; } + if (!can_join_transaction(cur_trans, type)) { + spin_unlock(fs_info-trans_lock); + return -EBUSY; + } atomic_inc(cur_trans-use_count); atomic_inc(cur_trans-num_writers); cur_trans-num_joined++; @@ -360,8 +372,11 @@ again: do { ret = join_transaction(root, type); - if (ret == -EBUSY) + if (ret == -EBUSY) { 
wait_current_trans(root); + if (unlikely(type == TRANS_ATTACH)) + ret = -ENOENT; + } So I understand that instead of incrementing num_writes and joining the current transaction, you do not join and wait for the current transaction to unblock. More specifically,TRANS_START、TRANS_USERSPACE and TRANS_ATTACH can not join and just wait for the current transaction to unblock if -in_commit is set. Which task in Josef's example http://marc.info/?l=linux-btrfsm=136070705732646w=2 task 1, task 2 or task 3 is the one that will not join the transaction, but instead wait? Task1 will not join the transaction, in this way, async inode flush won't run, and then task3 won't do anything. Before applying the patch: Start/Attach_Trans_Task Commit_Task Flush_Worker (Task1) (Task2) (Task3) -- the name in Josef's example btrfs_start_transaction() |-may_wait_transaction() | (return 0) | btrfs_commit_transaction() | |-set -in_commit and | | blocked to 1 | |-wait writers to be 1 | | (writers is 1) |-join_transaction() | | (writers is 2) | |-btrfs_commit_transaction() | | |-set trans_no_join to 1 | | (close join transaction) |-btrfs_run_ordered_operations | (Those ordered operations| are added when releasing| file) | |-async inode flush() | |-wait_flush_comlete() | | work_loop() | |-run_work() | |-btrfs_join_transaction() | |-wait_current_trans() |-wait writers to be 1 This three tasks waited for each other. After applying this patch: Start/Attach_Trans_Task Commit_Task Flush_Worker (Task1) (Task2
Re: [PATCH] Btrfs: update inode flags when renaming
On mon, 25 Feb 2013 11:50:01 +0800, Liu Bo wrote: On Fri, Feb 22, 2013 at 11:04:40PM +0100, David Sterba wrote: On Fri, Feb 22, 2013 at 05:34:47PM +0800, Miao Xie wrote: On fri, 22 Feb 2013 16:40:35 +0800, Liu Bo wrote: On Fri, Feb 22, 2013 at 03:32:50AM -0500, Marios Titas wrote: Sorry, but the bug persists even with the above patch. touch test chattr +C test lsattr test mv test test2 lsattr test2 In the above scenario test2 will not have the C flag. What do you expect? IMO it's right that test2 does not have the C flag. No, it's not right. For the users, they expect the C flag is not lost because they just do a rename operation. but fixup_inode_flags() re-sets the flags by the parent directory's flag. I think we should inherit the flags from the parent just when we create a new file/directory, in the other cases, just give a option to the users. How do you think about? I agree with that. The COW status of a file should not be changed at all when renamed. The typical users are database files and vm images, losing the NOCOW flag just from moving here and back is quite unexpected. david Yeah, I agree to remove this bad 'change in rename', will send a patch to address it. I think we can add a mount option, if the option is set, when we move a file to a new directory, or create a new file, we will inherit the flags of the parent. If not set, we inherit the flags only when create a new file. How do you think about it? Thanks Miao thanks, liubo -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: use reserved space for creating a snapshot
On fri, 22 Feb 2013 12:33:36 +0800, Liu Bo wrote: While inserting dir index and updating inode for a snapshot, we'd add delayed items which consume trans-block_rsv, if we don't have any space reserved in this trans handle, we either just return or reserve space again. But before creating pending snapshots during committing transaction, we've done a release on this trans handle, so we don't have space reserved in it at this stage. What we're using is block_rsv of pending snapshots which has already reserved well enough space for both inserting dir index and updating inode, so we need to set trans handle to indicate that we have space now. Signed-off-by: Liu Bo bo.li@oracle.com Reviewed-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fc03aa6..5878bb4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1063,6 +1063,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, rsv = trans-block_rsv; trans-block_rsv = pending-block_rsv; + trans-bytes_reserved = trans-block_rsv-reserved; dentry = pending-dentry; parent = dget_parent(dentry); @@ -1216,6 +1217,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, fail: dput(parent); trans-block_rsv = rsv; + trans-bytes_reserved = 0; no_free_objectid: kfree(new_root_item); root_item_alloc_fail: -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: fix wrong outstanding_extents when doing DIO write
When running the 083rd case of xfstests on a filesystem with compress-force=lzo, the following WARNINGs were triggered. WARNING: at fs/btrfs/inode.c:7908 WARNING: at fs/btrfs/inode.c:7909 WARNING: at fs/btrfs/inode.c:7911 WARNING: at fs/btrfs/extent-tree.c:4510 WARNING: at fs/btrfs/extent-tree.c:4511 This problem was introduced by the patch "Btrfs: fix deadlock due to unsubmitted". In that patch, there are two bugs which caused the above problem. The 1st one is an off-by-one bug: if the DIO write returns 0, it is also a short write, and we need to release the reserved space for it. But we didn't do that in that patch. Fix it by changing "ret > 0" to "ret >= 0". The 2nd one is that ->outstanding_extents was increased twice when a short write happened. As we know, ->outstanding_extents is a counter to keep track of the number of extent items we may use due to delalloc. When we reserve the free space for a delalloc write, we assume that the write will introduce just one extent item, so we increase ->outstanding_extents by 1 at that time. And then we increase it every time we split the write; this is done at the beginning of btrfs_get_blocks_direct(). So when a short write happens, we needn't increase ->outstanding_extents again. But that patch did. In order to fix the 2nd problem, I rewrote the logic for the ->outstanding_extents operation. We don't increase it at the beginning of btrfs_get_blocks_direct(); instead, we just increase it when the split actually happens. 
Reported-by: Mitch Harder mitch.har...@sabayonlinux.org Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/inode.c | 20 +--- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b009fb5..9a1cc04 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6067,12 +6067,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, int unlock_bits = EXTENT_LOCKED; int ret = 0; - if (create) { - spin_lock(BTRFS_I(inode)-lock); - BTRFS_I(inode)-outstanding_extents++; - spin_unlock(BTRFS_I(inode)-lock); + if (create) unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; - } else + else len = min_t(u64, len, root-sectorsize); lockstart = start; @@ -6214,6 +6211,10 @@ unlock: if (start + len i_size_read(inode)) i_size_write(inode, start + len); + spin_lock(BTRFS_I(inode)-lock); + BTRFS_I(inode)-outstanding_extents++; + spin_unlock(BTRFS_I(inode)-lock); + ret = set_extent_bit(BTRFS_I(inode)-io_tree, lockstart, lockstart + len - 1, EXTENT_DELALLOC, NULL, cached_state, GFP_NOFS); @@ -6716,14 +6717,11 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, if (rw WRITE) { if (ret 0 ret != -EIOCBQUEUED) btrfs_delalloc_release_space(inode, count); - else if (ret 0 (size_t)ret count) { - spin_lock(BTRFS_I(inode)-lock); - BTRFS_I(inode)-outstanding_extents++; - spin_unlock(BTRFS_I(inode)-lock); + else if (ret = 0 (size_t)ret count) btrfs_delalloc_release_space(inode, count - (size_t)ret); - } - btrfs_delalloc_release_metadata(inode, 0); + else + btrfs_delalloc_release_metadata(inode, 0); } out: if (wakeup) -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Kernel WARNINGs on btrfs-next
hi, On wed, 20 Feb 2013 23:35:36 -0600, Mitch Harder wrote: I'm getting a series of kernel WARNING messages when testing Josef's btrfs-next and Chris' next branch running xfstests 083 when mounted with compress-force=lzo. I'm not seeing any other indications of problems other than the WARNINGs on xfstests 083, so this may be some sort of false positive. Here are the messages against Chris' -next branch (the same warnings are being generated against josef's branch, except against a 3.7.x kernel): I sent a patch to fix this problem as a reply of this mail, could you test it for me? Thanks Miao [ 553.194991] [ cut here ] [ 553.195002] WARNING: at fs/btrfs/inode.c:7908 btrfs_destroy_inode+0x67/0x25b [btrfs]() [ 553.195043] Hardware name: OptiPlex 745 [ 553.195046] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep ppdev parport_pc snd_pcm snd_page_alloc snd_timer snd floppy sr_mod i2c_i801 tg3 ptp iTCO_wdt pps_core iTCO_vendor_support ehci_pci parport lpc_ich microcode serio_raw pcspkr ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [ 553.195099] Pid: 4674, comm: rm Not tainted 3.8.0-mason-next+ #1 [ 553.195102] Call Trace: [ 553.195112] [81030522] warn_slowpath_common+0x83/0x9b [ 553.195118] [81030554] warn_slowpath_null+0x1a/0x1c [ 553.195135] [a018d69e] btrfs_destroy_inode+0x67/0x25b [btrfs] [ 553.195141] [8111759a] destroy_inode+0x3b/0x54 [ 553.195145] [811176fc] evict+0x149/0x151 [ 553.195149] [81117f82] iput+0x12c/0x135 [ 553.195166] [a0187f42] ? btrfs_unlink_inode+0x38/0x40 [btrfs] [ 553.195171] [8110de10] do_unlinkat+0x145/0x1df [ 553.195177] [81106e9f] ? 
sys_newfstatat+0x2a/0x33 [ 553.195191] [8110fce5] sys_unlinkat+0x29/0x2b [ 553.195212] [81607746] system_call_fastpath+0x1a/0x1f [ 553.195224] ---[ end trace 0adc4db1ad1a6634 ]--- [ 553.195231] [ cut here ] [ 553.195247] WARNING: at fs/btrfs/inode.c:7909 btrfs_destroy_inode+0x7e/0x25b [btrfs]() [ 553.195249] Hardware name: OptiPlex 745 [ 553.195251] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep ppdev parport_pc snd_pcm snd_page_alloc snd_timer snd floppy sr_mod i2c_i801 tg3 ptp iTCO_wdt pps_core iTCO_vendor_support ehci_pci parport lpc_ich microcode serio_raw pcspkr ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [ 553.195296] Pid: 4674, comm: rm Tainted: GW3.8.0-mason-next+ #1 [ 553.195298] Call Trace: [ 553.195304] [81030522] warn_slowpath_common+0x83/0x9b [ 553.195308] [81030554] warn_slowpath_null+0x1a/0x1c [ 553.195324] [a018d6b5] btrfs_destroy_inode+0x7e/0x25b [btrfs] [ 553.195329] [8111759a] destroy_inode+0x3b/0x54 [ 553.195333] [811176fc] evict+0x149/0x151 [ 553.195336] [81117f82] iput+0x12c/0x135 [ 553.195352] [a0187f42] ? btrfs_unlink_inode+0x38/0x40 [btrfs] [ 553.195356] [8110de10] do_unlinkat+0x145/0x1df [ 553.195360] [81106e9f] ? 
sys_newfstatat+0x2a/0x33 [ 553.195364] [8110fce5] sys_unlinkat+0x29/0x2b [ 553.195368] [81607746] system_call_fastpath+0x1a/0x1f [ 553.195371] ---[ end trace 0adc4db1ad1a6635 ]--- [ 553.195373] [ cut here ] [ 553.195389] WARNING: at fs/btrfs/inode.c:7911 btrfs_destroy_inode+0xae/0x25b [btrfs]() [ 553.195391] Hardware name: OptiPlex 745 [ 553.195393] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep ppdev parport_pc snd_pcm snd_page_alloc snd_timer snd floppy sr_mod i2c_i801 tg3 ptp iTCO_wdt pps_core iTCO_vendor_support ehci_pci parport lpc_ich microcode serio_raw pcspkr ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [ 553.195437] Pid: 4674, comm: rm Tainted: GW3.8.0-mason-next+ #1 [ 553.195439] Call Trace: [ 553.195444] [81030522] warn_slowpath_common+0x83/0x9b [ 553.195449] [81030554] warn_slowpath_null+0x1a/0x1c [ 553.195463] [a018d6e5] btrfs_destroy_inode+0xae/0x25b [btrfs] [ 553.195470] [8111759a] destroy_inode+0x3b/0x54 [ 553.195474] [811176fc] evict+0x149/0x151 [ 553.195480] [81117f82] iput+0x12c/0x135 [ 553.195495] [a0187f42] ? btrfs_unlink_inode+0x38/0x40 [btrfs] [ 553.195499] [8110de10] do_unlinkat+0x145/0x1df [ 553.195504]
[PATCH 1/3] Btrfs: fix the qgroup reserved space is released prematurely
In start_transaction(), we will try to join the transaction again after the current transaction is committed, so we should not release the reserved space of the qgroup. Fix it. Cc: Arne Jansen sensi...@gmx.net Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fc03aa6..bc2f2d1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -383,7 +383,7 @@ again: h->block_rsv = NULL; h->orig_rsv = NULL; h->aborted = 0; - h->qgroup_reserved = qgroup_reserved; + h->qgroup_reserved = 0; h->delayed_ref_elem.seq = 0; h->type = type; INIT_LIST_HEAD(&h->qgroup_ref_list); @@ -401,6 +401,7 @@ again: h->block_rsv = root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } + h->qgroup_reserved = qgroup_reserved; got_it: btrfs_record_root_in_trans(h, root); -- 1.6.5.2 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] Btrfs: fix the deadlock between the transaction start/attach and commit
Now btrfs_commit_transaction() does this ret = btrfs_run_ordered_operations(root, 0) which async flushes all inodes on the ordered operations list, it introduced a deadlock that transaction-start task, transaction-commit task and the flush workers waited for each other. (See the following URL to get the detail http://marc.info/?l=linux-btrfsm=136070705732646w=2) As we know, if -in_commit is set, it means someone is committing the current transaction, we should not try to join it if we are not JOIN or JOIN_NOLOCK, wait is the best choice for it. In this way, we can avoid the above problem. In this way, there is another benefit: there is no new transaction handle to block the transaction which is on the way of commit, once we set -in_commit. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c | 17 - 1 files changed, 16 insertions(+), 1 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bc2f2d1..71b7e2e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -51,6 +51,14 @@ static noinline void switch_commit_root(struct btrfs_root *root) root-commit_root = btrfs_root_node(root); } +static inline int can_join_transaction(struct btrfs_transaction *trans, + int type) +{ + return !(trans-in_commit +type != TRANS_JOIN +type != TRANS_JOIN_NOLOCK); +} + /* * either allocate a new transaction or hop into the existing one */ @@ -86,6 +94,10 @@ loop: spin_unlock(fs_info-trans_lock); return cur_trans-aborted; } + if (!can_join_transaction(cur_trans, type)) { + spin_unlock(fs_info-trans_lock); + return -EBUSY; + } atomic_inc(cur_trans-use_count); atomic_inc(cur_trans-num_writers); cur_trans-num_joined++; @@ -360,8 +372,11 @@ again: do { ret = join_transaction(root, type); - if (ret == -EBUSY) + if (ret == -EBUSY) { wait_current_trans(root); + if (unlikely(type == TRANS_ATTACH)) + ret = -ENOENT; + } } while (ret == -EBUSY); if (ret 0) { -- 1.6.5.2 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in 
the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] Btrfs: fix uncompleted transaction
In some cases, we need commit the current transaction, but don't want to start a new one if there is no running transaction, so we introduce the function - btrfs_attach_transaction(), which can catch the current transaction, and return -ENOENT if there is no running transaction. But no running transaction doesn't mean the current transction completely, because we removed the running transaction before it completes. In some cases, it doesn't matter. But in some special cases, such as freeze fs, we hope the transaction is fully on disk, it will introduce some bugs, for example, we may feeze the fs and dump the data in the disk, if the transction doesn't complete, we would dump inconsistent data. So we need fix the above problem for those cases. We fixes this problem by introducing a function: btrfs_attach_transaction_barrier() if we hope all the transaction is fully on the disk, even they are not running, we can use this function. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c |2 +- fs/btrfs/super.c |4 ++-- fs/btrfs/transaction.c | 32 fs/btrfs/transaction.h |2 ++ 4 files changed, 37 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a31cd93..7cbbc2a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3111,7 +3111,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, u64 transid; int ret; - trans = btrfs_attach_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) return PTR_ERR(trans); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d8982e9..74328f7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -876,7 +876,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_wait_ordered_extents(root, 0); - trans = btrfs_attach_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ if (PTR_ERR(trans) == -ENOENT) @@ -1559,7 +1559,7 @@ static int 
btrfs_freeze(struct super_block *sb) struct btrfs_trans_handle *trans; struct btrfs_root *root = btrfs_sb(sb)-tree_root; - trans = btrfs_attach_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ if (PTR_ERR(trans) == -ENOENT) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 71b7e2e..257f320 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -468,11 +468,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root return start_transaction(root, 0, TRANS_USERSPACE, 0); } +/* + * btrfs_attach_transaction() - catch the running transaction + * + * It is used when we want to commit the current the transaction, but + * don't want to start a new one. + * + * Note: If this function return -ENOENT, it just means there is no + * running transaction. But it is possible that the inactive transaction + * is still in the memory, not fully on disk. If you hope there is no + * inactive transaction in the fs when -ENOENT is returned, you should + * invoke + * btrfs_attach_transaction_barrier() + */ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_ATTACH, 0); } +/* + * btrfs_attach_transaction() - catch the running transaction + * + * It is similar to the above function, the differentia is this one + * will wait for all the inactive transactions until they fully + * complete. 
+ */ +struct btrfs_trans_handle * +btrfs_attach_transaction_barrier(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + + trans = start_transaction(root, 0, TRANS_ATTACH, 0); + if (IS_ERR(trans) PTR_ERR(trans) == -ENOENT) + btrfs_wait_for_commit(root, 0); + + return trans; +} + /* wait for a transaction commit to be fully complete */ static noinline void wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0e8aa1e..422a865 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -110,6 +110,8 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush( struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_attach_transaction_barrier( + struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root
[PATCH V2] Btrfs: fix remount vs autodefrag
If we remount the fs to close the auto defragment or make the fs R/O, we should stop the auto defragment. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v1 - v2: - don't use -s_umount to avoid R/W-R/O remounting during the defragment. Instead We add a new state that tell thedefragger the fs is under remount, then the defragger pauses. --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/file.c | 5 + fs/btrfs/super.c | 40 ++-- 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1679051..b355bb4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -339,6 +339,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) * File system states */ #define BTRFS_FS_STATE_ERROR 0 +#define BTRFS_FS_STATE_REMOUNTING 1 /* Super block flags */ /* Errors detected */ @@ -1864,6 +1865,7 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_clear_opt(o, opt)((o) = ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt) ((o) BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)-fs_info-mount_opt \ BTRFS_MOUNT_##opt) /* diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b12ba52..32b5cff 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -374,6 +374,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) atomic_inc(fs_info-defrag_running); while(1) { + /* Pause the auto defragger. 
*/ + if (test_bit(BTRFS_FS_STATE_REMOUNTING, +fs_info-fs_state)) + break; + if (!__need_auto_defrag(fs_info-tree_root)) break; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index db1ba9a..68a29a1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1202,6 +1202,38 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); } +static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info, +unsigned long old_opts, int flags) +{ + set_bit(BTRFS_FS_STATE_REMOUNTING, fs_info-fs_state); + + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) + (!btrfs_raw_test_opt(fs_info-mount_opt, AUTO_DEFRAG) || +(flags MS_RDONLY))) { + /* wait for any defraggers to finish */ + wait_event(fs_info-transaction_wait, + (atomic_read(fs_info-defrag_running) == 0)); + if (flags MS_RDONLY) + sync_filesystem(fs_info-sb); + } +} + +static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, +unsigned long old_opts) +{ + /* +* We need cleanup all defragable inodes if the autodefragment is +* close or the fs is R/O. 
+*/ + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) + (!btrfs_raw_test_opt(fs_info-mount_opt, AUTO_DEFRAG) || +(fs_info-sb-s_flags MS_RDONLY))) { + btrfs_cleanup_defrag_inodes(fs_info); + } + + clear_bit(BTRFS_FS_STATE_REMOUNTING, fs_info-fs_state); +} + static int btrfs_remount(struct super_block *sb, int *flags, char *data) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -1215,6 +1247,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned int old_metadata_ratio = fs_info-metadata_ratio; int ret; + btrfs_remount_prepare(fs_info, old_opts, *flags); + ret = btrfs_parse_options(root, data); if (ret) { ret = -EINVAL; @@ -1225,7 +1259,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) fs_info-thread_pool_size, old_thread_pool_size); if ((*flags MS_RDONLY) == (sb-s_flags MS_RDONLY)) - return 0; + goto out; if (*flags MS_RDONLY) { /* @@ -1280,7 +1314,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } sb-s_flags = ~MS_RDONLY; } - +out: + btrfs_remount_cleanup(fs_info, old_opts); return 0; restore: @@ -1297,6 +1332,7 @@ restore: btrfs_resize_thread_pool(fs_info, old_thread_pool_size, fs_info-thread_pool_size); fs_info-metadata_ratio = old_metadata_ratio; + btrfs_remount_cleanup(fs_info, old_opts); return ret; } -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: fix the deadlock between the transaction attach and commit
(Sorry for the late reply, I was on my vacation of the Spring Festival last week.) On Tue, 12 Feb 2013 13:56:32 +0100, David Sterba wrote: On Mon, Feb 11, 2013 at 03:35:37PM -0500, Josef Bacik wrote: or something like that. Me and kdave reproduced by running 274 in a loop, it happened pretty quick. I'd fix it myself but I have to leave my house for people to come look at it. If you haven't fixed this by tomorrow I'll fix it up. Thanks, I found 224 stuck with this [SNIP] mounted with noatime,space_cache Thanks for your test. My test skipped the 274th case because it always fails, and all the other cases passed, so I didn't hit this problem. Anyways, very sorry for my stupid patch. (I have reviewed Josef's fix patch, and commented on it, please see the reply of that patch) Thanks Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: place ordered operations on a per transaction list
On wed, 13 Feb 2013 11:13:22 -0500, Josef Bacik wrote: Miao made the ordered operations stuff run async, which introduced a deadlock where we could get somebody (sync) racing in and committing the transaction while a commit was already happening. The new committer would try and flush ordered operations which would hang waiting for the commit to finish because it is done asynchronously and no longer inherits the callers trans handle. To fix this we need to make the ordered operations list a per transaction list. We can get new inodes added to the ordered operation list by truncating them and then having another process writing to them, so this makes it so that anybody trying to add an ordered operation _must_ start a transaction in order to add itself to the list, which will keep new inodes from getting added to the ordered operations list after we start committing. This should fix the deadlock and also keeps us from doing a lot more work than we need to during commit. Thanks, Firstly, thanks to deal with the bug which was introduced by my patch. But comparing with this fix method, I prefer the following one because: - we won't worry the similar problem if we add more work during commit in the future. - it is unnecessary to get a new handle and commit it if the transaction is under the commit. Thanks Miao diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fc03aa6..c449cb5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -277,7 +277,8 @@ static void wait_current_trans(struct btrfs_root *root) } } -static int may_wait_transaction(struct btrfs_root *root, int type) +static int may_wait_transaction(struct btrfs_root *root, int type, + bool is_joined) { if (root-fs_info-log_root_recovering) return 0; @@ -285,6 +286,14 @@ static int may_wait_transaction(struct btrfs_root *root, int type) if (type == TRANS_USERSPACE) return 1; + /* +* If we are ATTACH, it means we just want to catch the current +* transaction and commit it. 
So if someone is committing the +* current transaction now, it is very glad to wait it. +*/ + if (is_joined type == TRANS_ATTACH) + return 1; + if (type == TRANS_START !atomic_read(root-fs_info-open_ioctl_trans)) return 1; @@ -355,7 +364,7 @@ again: if (type TRANS_JOIN_NOLOCK) sb_start_intwrite(root-fs_info-sb); - if (may_wait_transaction(root, type)) + if (may_wait_transaction(root, type, false)) wait_current_trans(root); do { @@ -383,16 +392,26 @@ again: h-block_rsv = NULL; h-orig_rsv = NULL; h-aborted = 0; - h-qgroup_reserved = qgroup_reserved; + h-qgroup_reserved = 0; h-delayed_ref_elem.seq = 0; h-type = type; INIT_LIST_HEAD(h-qgroup_ref_list); INIT_LIST_HEAD(h-new_bgs); smp_mb(); - if (cur_trans-blocked may_wait_transaction(root, type)) { - btrfs_commit_transaction(h, root); - goto again; + if (cur_trans-blocked may_wait_transaction(root, type, true)) { + if (cur_trans-in_commit) { + btrfs_end_transaction(h, root); + wait_current_trans(root); + } else { + btrfs_commit_transaction(h, root); + } + if (unlikely(type == TRANS_ATTACH)) { + ret = -ENOENT; + goto alloc_fail; + } else { + goto again; + } } if (num_bytes) { @@ -401,6 +420,7 @@ again: h-block_rsv = root-fs_info-trans_block_rsv; h-bytes_reserved = num_bytes; } + h-qgroup_reserved = qgroup_reserved; got_it: btrfs_record_root_in_trans(h, root); -- 1.6.5.2 Signed-off-by: Josef Bacik jba...@fusionio.com --- fs/btrfs/ctree.h|7 --- fs/btrfs/disk-io.c | 11 ++- fs/btrfs/file.c | 15 ++- fs/btrfs/ordered-data.c | 13 - fs/btrfs/ordered-data.h |3 ++- fs/btrfs/transaction.c |5 +++-- fs/btrfs/transaction.h |1 + 7 files changed, 34 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0c4e4df..9f72ec8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1408,13 +1408,6 @@ struct btrfs_fs_info { struct list_head delalloc_inodes; /* - * special rename and truncate targets that must be on disk before - * we're allowed to commit. This is basically the ext3 style - * data=ordered list. 
- */ - struct list_head ordered_operations; - - /* * there is a pool of worker threads for checksumming during writes * and a pool for checksumming
Re: [PATCH 2/2] Btrfs: fix memory leak of pending_snapshot-inherit
On Thu, 07 Feb 2013 09:43:47 +0100, Arne Jansen wrote: On 02/07/13 07:02, Miao Xie wrote: The argument inherit of btrfs_ioctl_snap_create_transid() was assigned to NULL during we created the snapshots, so we didn't free it though we called kfree() in the caller. But since we are sure the snapshot creation is done after the function - btrfs_ioctl_snap_create_transid() - completes, it is safe that we don't assign the pointer inherit to NULL, and just free it in the caller of btrfs_ioctl_snap_create_transid(). In this way, the code can become more readable. NAK. The snapshot creation is triggered from btrfs_commit_transaction, I don't want to implicitly rely on commit_transaction being called for each snapshot created. I'm not even sure the async path really commits the transaction. The responsibility for the creation is passed to the pending_snapshot data structure, and so should the responsibility for the inherit struct. I don't agree with you. We are sure the async path really commits the transaction because we pass 1 as the value of the third argument into btrfs_commit_transaction_async(). It means we must wait for the completion of the current transaction. So Freeing the inherit struct in the caller is safe. Besides that, the pending_snapshot data structure is also allocated and freed by the same function in fact, why not use this style for the inherit struct. I think it is more readable. Assigning a pointer to be NULL and freeing it in the caller is very strange for the people who reads the code. (It is also the reason why I made the mistake at the beginning.) So I think my patch is reasonable. 
Thanks Miao -Arne Reported-by: Alex Lyakas alex.bt...@zadarastorage.com Cc: Arne Jansen sensi...@gmx.net Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c | 18 +++--- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 02d3035..40f2fbf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -367,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, u64 *async_transid, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { struct btrfs_trans_handle *trans; struct btrfs_key key; @@ -401,8 +401,7 @@ static noinline int create_subvol(struct btrfs_root *root, if (IS_ERR(trans)) return PTR_ERR(trans); -ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid, - inherit ? *inherit : NULL); +ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid, inherit); if (ret) goto fail; @@ -530,7 +529,7 @@ fail: static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, u64 *async_transid, - bool readonly, struct btrfs_qgroup_inherit **inherit) + bool readonly, struct btrfs_qgroup_inherit *inherit) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; @@ -549,10 +548,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, pending_snapshot-dentry = dentry; pending_snapshot-root = root; pending_snapshot-readonly = readonly; -if (inherit) { -pending_snapshot-inherit = *inherit; -*inherit = NULL;/* take responsibility to free it */ -} +pending_snapshot-inherit = inherit; trans = btrfs_start_transaction(root-fs_info-extent_root, 6); if (IS_ERR(trans)) { @@ -692,7 +688,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, u64 *async_transid, bool readonly, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { struct inode *dir = parent-dentry-d_inode; 
struct dentry *dentry; @@ -1454,7 +1450,7 @@ out: static noinline int btrfs_ioctl_snap_create_transid(struct file *file, char *name, unsigned long fd, int subvol, u64 *transid, bool readonly, -struct btrfs_qgroup_inherit **inherit) +struct btrfs_qgroup_inherit *inherit) { int namelen; int ret = 0; @@ -1563,7 +1559,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, ret = btrfs_ioctl_snap_create_transid(file, vol_args-name, vol_args-fd, subvol, ptr
[RFC][PATCH] Btrfs: fix deadlock due to unsubmitted
The deadlock problem happened when running fsstress (a test program in LTP). Steps to reproduce: # mkfs.btrfs -b 100M partition # mount partition mnt # Path/fsstress -p 3 -n 1000 -d mnt The reason is: btrfs_direct_IO() |-do_direct_IO() |-get_page() |-get_blocks() | |-btrfs_delalloc_reserve_space() | |-btrfs_add_ordered_extent() --- Add a new ordered extent |-dio_send_cur_page(page0) -- We didn't submit bio here |-get_page() |-get_blocks() |-btrfs_delalloc_reserve_space() |-flush_space() |-btrfs_start_ordered_extent() |-wait_event() -- Wait the completion of the ordered extent that is mentioned above But because we didn't submit the bio that is mentioned above, the ordered extent can not complete, we would wait for its completion forever. There are two methods which can fix this deadlock problem: 1. submit the bio before we invoke get_blocks() 2. reserve the space before we do dio Though the 1st is the simplest way, we need modify the code of VFS, and it is likely to break contiguous requests, and introduce performance regression for the other filesystems. So we have to choose the 2nd way.
Signed-off-by: Miao Xie mi...@cn.fujitsu.com Cc: Josef Bacik jba...@fusionio.com --- fs/btrfs/extent-tree.c |3 +- fs/btrfs/inode.c | 81 --- 2 files changed, 43 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 85b8454..ca9afc4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4670,7 +4670,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) spin_lock(BTRFS_I(inode)-lock); dropped = drop_outstanding_extent(inode); - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + if (num_bytes) + to_free = calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(BTRFS_I(inode)-lock); if (dropped 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ca7ace7..c5d829d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6004,16 +6004,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 len = bh_result-b_size; struct btrfs_trans_handle *trans; int unlock_bits = EXTENT_LOCKED; - int ret; + int ret = 0; if (create) { - ret = btrfs_delalloc_reserve_space(inode, len); - if (ret) - return ret; + spin_lock(BTRFS_I(inode)-lock); + BTRFS_I(inode)-outstanding_extents++; + spin_unlock(BTRFS_I(inode)-lock); unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; - } else { + } else len = min_t(u64, len, root-sectorsize); - } lockstart = start; lockend = start + len - 1; @@ -6025,14 +6024,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (lock_extent_direct(inode, lockstart, lockend, cached_state, create)) return -ENOTBLK; - if (create) { - ret = set_extent_bit(BTRFS_I(inode)-io_tree, lockstart, -lockend, EXTENT_DELALLOC, NULL, -cached_state, GFP_NOFS); - if (ret) - goto unlock_err; - } - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -6064,7 +6055,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if 
(!create (em-block_start == EXTENT_MAP_HOLE || test_bit(EXTENT_FLAG_PREALLOC, em-flags))) { free_extent_map(em); - ret = 0; goto unlock_err; } @@ -6162,6 +6152,11 @@ unlock: */ if (start + len i_size_read(inode)) i_size_write(inode, start + len); + + ret = set_extent_bit(BTRFS_I(inode)-io_tree, lockstart, +lockstart + len - 1, EXTENT_DELALLOC, NULL, +cached_state, GFP_NOFS); + BUG_ON(ret); } /* @@ -6170,24 +6165,9 @@ unlock: * aren't using if there is any left over space. */ if (lockstart lockend) { - if (create len lockend - lockstart) { - clear_extent_bit(BTRFS_I(inode)-io_tree, lockstart, -lockstart + len - 1, -unlock_bits | EXTENT_DEFRAG, 1, 0, -cached_state
[PATCH] Btrfs: fix the deadlock between the transaction attach and commit
Here is the whole story: Trans_Attach_Task Trans_Commit_Task btrfs_commit_transaction() |-wait writers to be 1 btrfs_attach_transaction() | btrfs_commit_transaction() | | |-set trans_no_join to 1 | | (close join transaction) |-btrfs_run_ordered_operations | (Those ordered operations| are added when releasing| file) | |-btrfs_join_transaction() | |-wait_commit() | |-wait writers to be 1 Then these two tasks waited for each other. As we know, btrfs_attach_transaction() is used to catch the current transaction, and commit it, so if someone has committed the transaction, it is unnecessary to join it and commit it, wait is the best choice for it. In this way, we can fix the above problem. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/transaction.c |8 1 files changed, 8 insertions(+), 0 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f154946..7be9d5e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -285,6 +285,14 @@ static int may_wait_transaction(struct btrfs_root *root, int type) if (type == TRANS_USERSPACE) return 1; + /* +* If we are ATTACH, it means we just want to catch the current +* transaction and commit it. So if someone is committing the +* current transaction now, it is very glad to wait it. +*/ + if (type == TRANS_ATTACH) + return 1; + if (type == TRANS_START !atomic_read(root-fs_info-open_ioctl_trans)) return 1; -- 1.6.5.2 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V2 1/2] Btrfs: serialize unlocked dio reads with truncate
Currently, we can do unlocked dio reads, but the following race is possible: dio_read_task truncate_task -btrfs_setattr() -btrfs_direct_IO -__blockdev_direct_IO -btrfs_get_block -btrfs_truncate() #alloc truncated blocks #to other inode -submit_io() #INFORMATION LEAK In order to avoid this problem, we must serialize unlocked dio reads with truncate. There are two approaches: - use extent lock to protect the extent that we truncate - use inode_dio_wait() to make sure the truncating task will wait for the read DIO. If we use the 1st one, we will meet the endless truncation problem due to the nonlocked read DIO after we implement the nonlocked write DIO. It is because we still need invoke inode_dio_wait() avoid the race between write DIO and truncation. By that time, we have to introduce btrfs_inode_{block, resume}_nolock_dio() again. That is we have to implement this patch again, so I choose the 2nd way to fix the problem. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changlog v1 - v2: - Rebase the patch against the following one: [RFC][PATCH] Btrfs: fix deadlock due to unsubmitted - Modify the changelog to explain why we don't choose the extent lock to fix the bug --- fs/btrfs/btrfs_inode.h | 19 +++ fs/btrfs/inode.c | 23 +-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 2a8c242..00e2601 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -40,6 +40,7 @@ #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 #define BTRFS_INODE_NEEDS_FULL_SYNC7 #define BTRFS_INODE_COPY_EVERYTHING8 +#define BTRFS_INODE_READDIO_NEED_LOCK 9 /* in memory btrfs inode */ struct btrfs_inode { @@ -216,4 +217,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) return 0; } +/* + * Disable DIO read nolock optimization, so new dio readers will be forced + * to grab i_mutex. It is used to avoid the endless truncate due to + * nonlocked dio read. 
+ */ +static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) +{ + set_bit(BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_I(inode)-runtime_flags); + smp_mb(); +} + +static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb__before_clear_bit(); + clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, + BTRFS_I(inode)-runtime_flags); +} + #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c5d829d..a49be05 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3832,6 +3832,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); + + /* Disable nonlocked read DIO to avoid the end less truncate */ + btrfs_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + btrfs_inode_resume_unlocked_dio(inode); + ret = btrfs_truncate(inode); if (ret inode-i_nlink) btrfs_orphan_del(NULL, inode); @@ -6615,6 +6621,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, struct file *file = iocb-ki_filp; struct inode *inode = file-f_mapping-host; size_t count = 0; + int flags = 0; + bool wakeup = false; ssize_t ret; if (check_direct_IO(BTRFS_I(inode)-root, rw, iocb, iov, @@ -6626,13 +6634,22 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = btrfs_delalloc_reserve_space(inode, count); if (ret) return ret; + } else { + atomic_inc(inode-i_dio_count); + smp_mb__after_atomic_inc(); + if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + BTRFS_I(inode)-runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + } else { + wakeup = true; + } } ret = __blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev, iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, 0); - + btrfs_submit_direct, flags); if (rw WRITE) { if (ret 0 ret != -EIOCBQUEUED) btrfs_delalloc_release_space(inode, count); @@ -6645,6 +6662,8 @@ static ssize_t btrfs_direct_IO(int rw, 
struct kiocb *iocb, } btrfs_delalloc_release_metadata(inode, 0
[PATCH V2 2/2] Btrfs: implement unlocked dio write
This idea is from ext4. By this patch, we can make the dio write parallel, and improve the performance. But because we can not update isize without i_mutex, the unlocked dio write just can be done in front of the EOF. We needn't worry about the race between dio write and truncate, because the truncate need wait untill all the dio write end. And we also needn't worry about the race between dio write and punch hole, because we have extent lock to protect our operation. I ran fio to test the performance of this feature. == Hardware == CPU: Intel(R) Core(TM)2 Duo CPU E7500 @ 2.93GHz Mem: 2GB SSD: Intel X25-M 120GB (Test Partition: 60GB) == config file == [global] ioengine=psync direct=1 bs=4k size=32G runtime=60 directory=/mnt/btrfs/ filename=testfile group_reporting thread [file1] numjobs=1 # 2 4 rw=randwrite == result (KBps) == write 1 2 4 lock24936 24738 24726 nolock 24962 30866 32101 == result (iops) == write 1 2 4 lock623461846181 nolock 624077168025 Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v1 - v2: - don't do nolocked DIO write if it is beyond the EOF --- fs/btrfs/inode.c | 35 +++ 1 files changed, 23 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a49be05..2948123 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6622,28 +6622,36 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file-f_mapping-host; size_t count = 0; int flags = 0; - bool wakeup = false; + bool wakeup = true; + bool relock = false; ssize_t ret; if (check_direct_IO(BTRFS_I(inode)-root, rw, iocb, iov, offset, nr_segs)) return 0; + atomic_inc(inode-i_dio_count); + smp_mb__after_atomic_inc(); + if (rw WRITE) { count = iov_length(iov, nr_segs); + /* +* If the write DIO is beyond the EOF, we need update +* the isize, but it is protected by i_mutex. So we can +* not unlock the i_mutex at this case. 
+*/ + if (offset + count = inode-i_size) { + mutex_unlock(inode-i_mutex); + relock = true; + } ret = btrfs_delalloc_reserve_space(inode, count); if (ret) - return ret; - } else { - atomic_inc(inode-i_dio_count); - smp_mb__after_atomic_inc(); - if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - BTRFS_I(inode)-runtime_flags))) { - inode_dio_done(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - } else { - wakeup = true; - } + goto out; + } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, +BTRFS_I(inode)-runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; } ret = __blockdev_direct_IO(rw, iocb, inode, @@ -6662,8 +6670,11 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, } btrfs_delalloc_release_metadata(inode, 0); } +out: if (wakeup) inode_dio_done(inode); + if (relock) + mutex_lock(inode-i_mutex); return ret; } -- 1.6.5.2 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Leaking btrfs_qgroup_inherit on snapshot creation?
On Wed, 06 Feb 2013 13:14:23 +0100, Arne Jansen wrote: Hi Alex, On 02/06/13 12:18, Alex Lyakas wrote: Hi Jan, Arne, I see this code in create_snapshot: if (inherit) { pending_snapshot-inherit = *inherit; *inherit = NULL;/* take responsibility to free it */ } So, first thing I think it should be: if (*inherit) because in btrfs_ioctl_snap_create_v2() we have: struct btrfs_qgroup_inherit *inherit = NULL; ... btrfs_ioctl_snap_create_transid(..., inherit) so the current check is very unlikely to be NULL. But in btrfs_ioctl_snap_create it is called with NULL, so *inherit would dereference a NULL pointer. Second, I don't see anybody freeing pending_snapshot-inherit. I guess it should be freed after calling btrfs_qgroup_inherit() and also in btrfs_destroy_pending_snapshots(). You're right. In our original version (6f72c7e20dbaea5) it was still there, in transaction.c. It has been removed in 6fa9700e734: commit 6fa9700e734275de2acbcb0e99414bd7ddfc60f1 Author: Miao Xie mi...@cn.fujitsu.com Date: Thu Sep 6 04:00:32 2012 -0600 Btrfs: fix error path in create_pending_snapshot() This patch fixes the following problem: - If we failed to deal with the delayed dir items, we should abort transaction, just as its comment said. Fix it. - If root reference or root back reference insertion failed, we should abort transaction. Fix it. - Fix the double free problem of pending-inherit. - Do not restore the trans-rsv if we doesn't change it. - make the error path more clearly. Signed-off-by: Miao Xie mi...@cn.fujitsu.com Miao, can you please explain where you see a double free? Sorry, I misread the code; I didn't notice that the pointer had been assigned to NULL. But I think we can make the code more readable and easier to maintain: we can free the memory in the caller (btrfs_ioctl_snap_create_v2()) since we are sure the snapshot creation is done after btrfs_ioctl_snap_create_transid() completes. 
Thanks Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] Btrfs: fix the race between bio and btrfs_stop_workers
open_ctree() need read the metadata to initialize the global information of btrfs. But it may fail after it submit some bio, and then it will jump to the error path. Unfortunately, it doesn't check if there are some bios in flight, and just stop all the worker threads. As a result, when the submitted bios end, they can not find any worker thread which can deal with subsequent work, then oops happen. kernel BUG at fs/btrfs/async-thread.c:605! Fix this problem by invoking invalidate_inode_pages2() before we stop the worker threads. This function will wait until the bio end because it need lock the pages which are going to be invalidated, and if a page is under disk read IO, it must be locked. invalidate_inode_pages2() need wait until end bio handler to unlocked it. Reported-and-Tested-by: Tsutomu Itoh t-i...@jp.fujitsu.com Signed-off-by: Eric Sandeen sand...@redhat.com Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/disk-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0c31d07..d8fd711 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2728,13 +2728,13 @@ fail_cleaner: * kthreads */ filemap_write_and_wait(fs_info-btree_inode-i_mapping); - invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_block_groups: btrfs_free_block_groups(fs_info); fail_tree_roots: free_root_pointers(fs_info, 1); + invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_sb_buffer: btrfs_stop_workers(fs_info-generic_worker); @@ -2755,7 +2755,6 @@ fail_alloc: fail_iput: btrfs_mapping_tree_free(fs_info-mapping_tree); - invalidate_inode_pages2(fs_info-btree_inode-i_mapping); iput(fs_info-btree_inode); fail_bdi: bdi_destroy(fs_info-bdi); -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] Btrfs: fix memory leak of pending_snapshot-inherit
The argument inherit of btrfs_ioctl_snap_create_transid() was assigned to NULL during we created the snapshots, so we didn't free it though we called kfree() in the caller. But since we are sure the snapshot creation is done after the function - btrfs_ioctl_snap_create_transid() - completes, it is safe that we don't assign the pointer inherit to NULL, and just free it in the caller of btrfs_ioctl_snap_create_transid(). In this way, the code can become more readable. Reported-by: Alex Lyakas alex.bt...@zadarastorage.com Cc: Arne Jansen sensi...@gmx.net Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ioctl.c | 18 +++--- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 02d3035..40f2fbf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -367,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, u64 *async_transid, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { struct btrfs_trans_handle *trans; struct btrfs_key key; @@ -401,8 +401,7 @@ static noinline int create_subvol(struct btrfs_root *root, if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid, - inherit ? 
*inherit : NULL); + ret = btrfs_qgroup_inherit(trans, root-fs_info, 0, objectid, inherit); if (ret) goto fail; @@ -530,7 +529,7 @@ fail: static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, u64 *async_transid, - bool readonly, struct btrfs_qgroup_inherit **inherit) + bool readonly, struct btrfs_qgroup_inherit *inherit) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; @@ -549,10 +548,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, pending_snapshot-dentry = dentry; pending_snapshot-root = root; pending_snapshot-readonly = readonly; - if (inherit) { - pending_snapshot-inherit = *inherit; - *inherit = NULL;/* take responsibility to free it */ - } + pending_snapshot-inherit = inherit; trans = btrfs_start_transaction(root-fs_info-extent_root, 6); if (IS_ERR(trans)) { @@ -692,7 +688,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, u64 *async_transid, bool readonly, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { struct inode *dir = parent-dentry-d_inode; struct dentry *dentry; @@ -1454,7 +1450,7 @@ out: static noinline int btrfs_ioctl_snap_create_transid(struct file *file, char *name, unsigned long fd, int subvol, u64 *transid, bool readonly, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { int namelen; int ret = 0; @@ -1563,7 +1559,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, ret = btrfs_ioctl_snap_create_transid(file, vol_args-name, vol_args-fd, subvol, ptr, - readonly, inherit); + readonly, inherit); if (ret == 0 ptr copy_to_user(arg + -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [BUG] kernel BUG at fs/btrfs/async-thread.c:605!
Hi, Eric, I want to send out my fix patch — could I add your Signed-off-by, because you found the key to solving the problem? Thanks Miao On Fri, 01 Feb 2013 14:53:09 +0900, Tsutomu Itoh wrote: Can you please explain similar problems, Miao? Before the missing device check, there are several places where we read the metadata, such as reading the chunk tree root, btrfs_read_chunk_tree; those functions may fail after submitting a bio. If we don't wait until the bio ends, and just stop the workers, the same problem will happen. (invalidate_inode_pages2() will wait until the bio ends, because it needs to lock the pages which are going to be invalidated, and a page is locked if it is under disk read IO) I understood. My reproducer does not reproduce this problem yet. But the following messages were displayed when the 'rmmod btrfs' command was executed. [76378.723481] = [76378.723901] BUG btrfs_extent_buffer (Tainted: G B ): Objects remaining in btrfs_extent_buffer on kmem_cache_close() [76378.724333] - [76378.724333] [76378.724959] INFO: Slab 0xea00065c3280 objects=23 used=2 fp=0x8801970caac0 flags=0x80004080 [76378.725391] Pid: 9156, comm: rmmod Tainted: G B3.8.0-rc5 #1 [76378.725397] Call Trace: [76378.725403] [8111bc23] slab_err+0xb0/0xd2 I think that this message means there is a possibility that I/O did not end normally. And, after Miao's patch was applied, this message is not displayed when rmmod is executed. So, Miao's patch seems to fix the problem for me. 
[SNIP] diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0c31d07..d8fd711 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2728,13 +2728,13 @@ fail_cleaner: * kthreads */ filemap_write_and_wait(fs_info-btree_inode-i_mapping); - invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_block_groups: btrfs_free_block_groups(fs_info); fail_tree_roots: free_root_pointers(fs_info, 1); + invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_sb_buffer: btrfs_stop_workers(fs_info-generic_worker); @@ -2755,7 +2755,6 @@ fail_alloc: fail_iput: btrfs_mapping_tree_free(fs_info-mapping_tree); - invalidate_inode_pages2(fs_info-btree_inode-i_mapping); iput(fs_info-btree_inode); fail_bdi: bdi_destroy(fs_info-bdi); -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] Btrfs: serialize unlocked dio reads with truncate
Currently, we can do unlocked dio reads, but the following race is possible: dio_read_task truncate_task -btrfs_setattr() -btrfs_direct_IO -__blockdev_direct_IO -btrfs_get_block -btrfs_truncate() #alloc truncated blocks #to other inode -submit_io() #INFORMATION LEAK In order to avoid this problem, we must serialize unlocked dio reads with truncate by inode_dio_wait(). Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/btrfs_inode.h | 19 +++ fs/btrfs/inode.c | 31 +++ 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 2a8c242..00e2601 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -40,6 +40,7 @@ #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 #define BTRFS_INODE_NEEDS_FULL_SYNC7 #define BTRFS_INODE_COPY_EVERYTHING8 +#define BTRFS_INODE_READDIO_NEED_LOCK 9 /* in memory btrfs inode */ struct btrfs_inode { @@ -216,4 +217,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) return 0; } +/* + * Disable DIO read nolock optimization, so new dio readers will be forced + * to grab i_mutex. It is used to avoid the endless truncate due to + * nonlocked dio read. 
+ */ +static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) +{ + set_bit(BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_I(inode)-runtime_flags); + smp_mb(); +} + +static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb__before_clear_bit(); + clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, + BTRFS_I(inode)-runtime_flags); +} + #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 97f4c30..d17a04b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3785,6 +3785,11 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); + + btrfs_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + btrfs_inode_resume_unlocked_dio(inode); + ret = btrfs_truncate(inode); } @@ -6583,15 +6588,33 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb-ki_filp; struct inode *inode = file-f_mapping-host; + int flags = 0; + bool wakeup = false; + int ret; if (check_direct_IO(BTRFS_I(inode)-root, rw, iocb, iov, offset, nr_segs)) return 0; - return __blockdev_direct_IO(rw, iocb, inode, - BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev, - iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, 0); + if (rw == READ) { + atomic_inc(inode-i_dio_count); + smp_mb__after_atomic_inc(); + if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + BTRFS_I(inode)-runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + } else { + wakeup = true; + } + } + + ret = __blockdev_direct_IO(rw, iocb, inode, + BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev, + iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, flags); + if (wakeup) + inode_dio_done(inode); + return ret; } #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More 
majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC][PATCH 2/2] Btrfs: implement unlocked dio write
This idea is from ext4. By this patch, we can make the dio write parallel, and improve the performance. We needn't worry about the race between dio write and truncate, because the truncate need wait untill all the dio write end. And we also needn't worry about the race between dio write and punch hole, because we have extent lock to protect our operation. I ran fio to test the performance of this feature. == Hardware == CPU: Intel(R) Core(TM)2 Duo CPU E7500 @ 2.93GHz Mem: 2GB SSD: Intel X25-M 120GB (Test Partition: 60GB) == config file == [global] ioengine=psync direct=1 bs=4k size=32G runtime=60 directory=/mnt/btrfs/ filename=testfile group_reporting thread [file1] numjobs=1 # 2 4 rw=randwrite == result (KBps) == write 1 2 4 lock24936 24738 24726 nolock 24962 30866 32101 == result (iops) == write 1 2 4 lock623461846181 nolock 624077168025 Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/inode.c | 24 +--- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d17a04b..091593a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6589,31 +6589,33 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, struct file *file = iocb-ki_filp; struct inode *inode = file-f_mapping-host; int flags = 0; - bool wakeup = false; + bool wakeup = true; int ret; if (check_direct_IO(BTRFS_I(inode)-root, rw, iocb, iov, offset, nr_segs)) return 0; - if (rw == READ) { - atomic_inc(inode-i_dio_count); - smp_mb__after_atomic_inc(); - if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - BTRFS_I(inode)-runtime_flags))) { - inode_dio_done(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - } else { - wakeup = true; - } + atomic_inc(inode-i_dio_count); + smp_mb__after_atomic_inc(); + if (rw == WRITE) { + mutex_unlock(inode-i_mutex); + } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, +BTRFS_I(inode)-runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; } ret = 
__blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev, iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); + if (wakeup) inode_dio_done(inode); + if (rw == WRITE) + mutex_lock(inode-i_mutex); return ret; } -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [BUG] kernel BUG at fs/btrfs/async-thread.c:605!
On Fri, 01 Feb 2013 09:31:33 +0900, Tsutomu Itoh wrote: Hi, On 2013/01/31 16:58, Miao Xie wrote: On wed, 30 Jan 2013 23:55:34 -0600, Eric Sandeen wrote: if you move the fail_block_groups: target above the comment, does that fix it? (although I don't know yet what started IO . . . ) like this: From: Eric Sandeen sand...@redhat.com Make sure that we are always done with the btree_inode's mapping before we shut down the worker threads in open_ctree() error cases. I reviewed your patch again, and found it just fix the above problem, it still have similar problems which are not fixed. How about this one? Thanks Eric and Miao. But I can not reproduce this problem, yet. ('Btrfs: too many missing devices, writeable mount is not allowed' messages was displayed, but not panic) So, I can not test your patch, sorry. Can you please explain similar problems, Miao? Before missing device check, there are several places where we read the metadata, such as reading chunk tree root, btrfs_read_chunk_tree, those functions may fail after submit a bio. If we don't wait until the bio end, and just stop the workers, the same problem will happen. 
(invalidate_inode_pages2() will wait until the bio end, because it need lock the pages which are going to be invalidated, and the page is locked if it is under disk read IO) Thanks Miao Thanks, Tsutomu diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0c31d07..d8fd711 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2728,13 +2728,13 @@ fail_cleaner: * kthreads */ filemap_write_and_wait(fs_info-btree_inode-i_mapping); -invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_block_groups: btrfs_free_block_groups(fs_info); fail_tree_roots: free_root_pointers(fs_info, 1); +invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_sb_buffer: btrfs_stop_workers(fs_info-generic_worker); @@ -2755,7 +2755,6 @@ fail_alloc: fail_iput: btrfs_mapping_tree_free(fs_info-mapping_tree); -invalidate_inode_pages2(fs_info-btree_inode-i_mapping); iput(fs_info-btree_inode); fail_bdi: bdi_destroy(fs_info-bdi); -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH 2/2] Btrfs: implement unlocked dio write
On fri, 1 Feb 2013 10:53:30 +0800, Liu Bo wrote: On Thu, Jan 31, 2013 at 05:39:03PM +0800, Miao Xie wrote: This idea is from ext4. By this patch, we can make the dio write parallel, and improve the performance. Interesting, AFAIK, ext4 can only do nolock dio write on some conditions(should be a overwrite, file size remains unchanged, no aligned/buffer io in flight), btrfs is ok without any conditions? ext4 don't have extent lock, it can not avoid 2 AIO threads are at work on the same unwritten block, so it can not use unlocked dio write for unaligned dio/aio. But btrfs has extent lock, it can avoid this problem. And ext4 need take write lock of -i_data_sem, when it allocate the free space, but in order to avoid truncation and hole punch during dio, it need take the read lock of -i_data_sem before it release -i_mutex, that is if it isn't a overwrite, deadlock will happen, so the unlocked dio of ext4 should be a overwrite. But btrfs doesn't have such limitation. Thanks Miao thanks, liubo We needn't worry about the race between dio write and truncate, because the truncate need wait untill all the dio write end. And we also needn't worry about the race between dio write and punch hole, because we have extent lock to protect our operation. I ran fio to test the performance of this feature. 
== Hardware == CPU: Intel(R) Core(TM)2 Duo CPU E7500 @ 2.93GHz Mem: 2GB SSD: Intel X25-M 120GB (Test Partition: 60GB) == config file == [global] ioengine=psync direct=1 bs=4k size=32G runtime=60 directory=/mnt/btrfs/ filename=testfile group_reporting thread [file1] numjobs=1 # 2 4 rw=randwrite == result (KBps) == write1 2 4 lock 24936 24738 24726 nolock 24962 30866 32101 == result (iops) == write1 2 4 lock 623461846181 nolock 624077168025 Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/inode.c | 24 +--- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d17a04b..091593a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6589,31 +6589,33 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, struct file *file = iocb-ki_filp; struct inode *inode = file-f_mapping-host; int flags = 0; -bool wakeup = false; +bool wakeup = true; int ret; if (check_direct_IO(BTRFS_I(inode)-root, rw, iocb, iov, offset, nr_segs)) return 0; -if (rw == READ) { -atomic_inc(inode-i_dio_count); -smp_mb__after_atomic_inc(); -if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - BTRFS_I(inode)-runtime_flags))) { -inode_dio_done(inode); -flags = DIO_LOCKING | DIO_SKIP_HOLES; -} else { -wakeup = true; -} +atomic_inc(inode-i_dio_count); +smp_mb__after_atomic_inc(); +if (rw == WRITE) { +mutex_unlock(inode-i_mutex); +} else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + BTRFS_I(inode)-runtime_flags))) { +inode_dio_done(inode); +flags = DIO_LOCKING | DIO_SKIP_HOLES; +wakeup = false; } ret = __blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev, iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); + if (wakeup) inode_dio_done(inode); +if (rw == WRITE) +mutex_lock(inode-i_mutex); return ret; } -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] Btrfs: serialize unlocked dio reads with truncate
On Thu, 31 Jan 2013 11:40:41 -0500, Josef Bacik wrote: On Thu, Jan 31, 2013 at 02:23:19AM -0700, Miao Xie wrote: Currently, we can do unlocked dio reads, but the following race is possible: dio_read_tasktruncate_task -btrfs_setattr() -btrfs_direct_IO -__blockdev_direct_IO -btrfs_get_block -btrfs_truncate() #alloc truncated blocks #to other inode -submit_io() #INFORMATION LEAK In order to avoid this problem, we must serialize unlocked dio reads with truncate by inode_dio_wait(). So I had thinking about this, are we sure we don't want to just lock the extent range when we truncate? I'm good with this, but it seems like we might as well and be consistent and use the extent locks. What do you think? Thanks, But comparing with the current approach, the extent lock has the following problem: Dio_Read_Task Truncate_task truncate file set isize to 4096 drop pages lock extent[4096, 8191] read extent[4096, 8191] unlock extent[4096, 8191] lock extent[4096, -1ULL] truncate item unlock extent[4096, -1ULL] lock extent[8192, ...] read extent[8192, ...] no extent item zero the buffer unlock extent[8192, ...] we get the data that is mixed with new data.(Punch hole also has this problem, we need fix) Thanks Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH 2/2] Btrfs: implement unlocked dio write
On fri, 01 Feb 2013 12:08:25 +0800, Miao Xie wrote: Onfri, 1 Feb 2013 10:53:30 +0800, Liu Bo wrote: On Thu, Jan 31, 2013 at 05:39:03PM +0800, Miao Xie wrote: This idea is from ext4. By this patch, we can make the dio write parallel, and improve the performance. Interesting, AFAIK, ext4 can only do nolock dio write on some conditions(should be a overwrite, file size remains unchanged, no aligned/buffer io in flight), btrfs is ok without any conditions? ext4 don't have extent lock, it can not avoid 2 AIO threads are at work on the same unwritten block, so it can not use unlocked dio write for unaligned dio/aio. But btrfs has extent lock, it can avoid this problem. Besides that, btrfs doesn't allow doing a unaligned dio/aio. I read the code again, found there is a race that several tasks may update i_size at the same time. There are two methods to fix this problem: 1. just like ext4, don't do unlocked write dio if it is beyond the end of the file 2. use a spin lock to protect i_size update I want to choose the 2nd one. Thanks Miao And ext4 need take write lock of -i_data_sem, when it allocate the free space, but in order to avoid truncation and hole punch during dio, it need take the read lock of -i_data_sem before it release -i_mutex, that is if it isn't a overwrite, deadlock will happen, so the unlocked dio of ext4 should be a overwrite. But btrfs doesn't have such limitation. Thanks Miao thanks, liubo We needn't worry about the race between dio write and truncate, because the truncate need wait untill all the dio write end. And we also needn't worry about the race between dio write and punch hole, because we have extent lock to protect our operation. I ran fio to test the performance of this feature. 
== Hardware == CPU: Intel(R) Core(TM)2 Duo CPU E7500 @ 2.93GHz Mem: 2GB SSD: Intel X25-M 120GB (Test Partition: 60GB) == config file == [global] ioengine=psync direct=1 bs=4k size=32G runtime=60 directory=/mnt/btrfs/ filename=testfile group_reporting thread [file1] numjobs=1 # 2 4 rw=randwrite == result (KBps) == write 1 2 4 lock24936 24738 24726 nolock 24962 30866 32101 == result (iops) == write 1 2 4 lock623461846181 nolock 624077168025 Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/inode.c | 24 +--- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d17a04b..091593a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6589,31 +6589,33 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, struct file *file = iocb-ki_filp; struct inode *inode = file-f_mapping-host; int flags = 0; - bool wakeup = false; + bool wakeup = true; int ret; if (check_direct_IO(BTRFS_I(inode)-root, rw, iocb, iov, offset, nr_segs)) return 0; - if (rw == READ) { - atomic_inc(inode-i_dio_count); - smp_mb__after_atomic_inc(); - if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - BTRFS_I(inode)-runtime_flags))) { - inode_dio_done(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - } else { - wakeup = true; - } + atomic_inc(inode-i_dio_count); + smp_mb__after_atomic_inc(); + if (rw == WRITE) { + mutex_unlock(inode-i_mutex); + } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, +BTRFS_I(inode)-runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; } ret = __blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev, iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); + if (wakeup) inode_dio_done(inode); + if (rw == WRITE) + mutex_lock(inode-i_mutex); return ret; } -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More 
majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info
Re: [PATCH 5/5] Btrfs: fix remount vs autodefrag
Any comments about this patch? Thanks Miao On mon, 26 Nov 2012 17:28:13 +0800, Miao Xie wrote: If we remount the fs to close the auto defragment or make the fs R/O, we should stop the auto defragment. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h | 1 + fs/btrfs/file.c | 13 + fs/btrfs/super.c | 29 + 3 files changed, 43 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4ce24ce..01d671c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1759,6 +1759,7 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_clear_opt(o, opt) ((o) = ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt)((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt) ((o) BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt)((root)-fs_info-mount_opt \ BTRFS_MOUNT_##opt) /* diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 40b17d0..7aaae56 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -320,8 +320,21 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, range.start = defrag-last_offset; sb_start_write(fs_info-sb); + + /* Avoid defraging files on R/O fs */ + if (!down_write_trylock(fs_info-sb-s_umount)) { + sb_end_write(fs_info-sb); + btrfs_requeue_inode_defrag(inode, defrag); + iput(inode); + return -EBUSY; + } + + BUG_ON(fs_info-sb-s_flags MS_RDONLY); + num_defrag = btrfs_defrag_file(inode, NULL, range, defrag-transid, BTRFS_DEFRAG_BATCH); + + up_write(fs_info-sb-s_umount); sb_end_write(fs_info-sb); /* * if we filled the whole defrag batch, there diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b3b041a..2e7beee 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1189,6 +1189,32 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_set_max_workers(fs_info-scrub_workers, new_pool_size); } +static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info, + unsigned long old_opts, int flags) +{ + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) + (!btrfs_raw_test_opt(fs_info-mount_opt, 
AUTO_DEFRAG) || + (flags MS_RDONLY))) { + /* wait for any defraggers to finish */ + wait_event(fs_info-transaction_wait, +(atomic_read(fs_info-defrag_running) == 0)); + } +} + +static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, + unsigned long old_opts, int flags) +{ + /* + * We remount the fs successfully, then we need cleanup all defragable + * inodes if the autodefragment is close or the fs is R/O. + */ + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) + (!btrfs_raw_test_opt(fs_info-mount_opt, AUTO_DEFRAG) || + (flags MS_RDONLY))) + btrfs_cleanup_defrag_inodes(fs_info); + +} + static int btrfs_remount(struct super_block *sb, int *flags, char *data) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -1214,6 +1240,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if ((*flags MS_RDONLY) == (sb-s_flags MS_RDONLY)) return 0; + btrfs_remount_prepare(fs_info, old_opts, *flags); + if (*flags MS_RDONLY) { sb-s_flags |= MS_RDONLY; @@ -1247,6 +1275,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) sb-s_flags = ~MS_RDONLY; } + btrfs_remount_cleanup(fs_info, old_opts, *flags); return 0; restore: -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [BUG] kernel BUG at fs/btrfs/async-thread.c:605!
On thu, 31 Jan 2013 12:37:49 +0900, Tsutomu Itoh wrote: Hi, In kernel 3.8-rc5, the following panics occurred when the mount was done by the degraded option. # btrfs fi sh /dev/sdc8 Label: none uuid: fc63cd80-5ae2-4fbe-8795-2d526c937a56 Total devices 3 FS bytes used 20.98GB devid1 size 9.31GB used 9.31GB path /dev/sdd8 devid2 size 9.31GB used 9.31GB path /dev/sdc8 *** Some devices missing Btrfs v0.20-rc1-37-g91d9eec # mount -o degraded /dev/sdc8 /test1 564 static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) 565 { ... ... 595 fallback: 596 fallback = NULL; 597 /* 598 * we have failed to find any workers, just 599 * return the first one we can find. 600 */ 601 if (!list_empty(workers-worker_list)) 602 fallback = workers-worker_list.next; 603 if (!list_empty(workers-idle_list)) 604 fallback = workers-idle_list.next; 605 BUG_ON(!fallback); -- this ! 606 worker = list_entry(fallback, 607 struct btrfs_worker_thread, worker_list); If worker_list is not empty, we get a worker from this list; if worker_list is empty, it means all the workers in idle_list, we get the worker from idle_list. So the above bug is introduced by the second if sentence. it should be else if. Thanks Miao -Tsutomu === [ 7913.075890] btrfs: allowing degraded mounts [ 7913.075893] btrfs: disk space caching is enabled [ 7913.092031] Btrfs: too many missing devices, writeable mount is not allowed [ 7913.092297] [ cut here ] [ 7913.092313] kernel BUG at fs/btrfs/async-thread.c:605! 
[ 7913.092326] invalid opcode: [#1] SMP [ 7913.092342] Modules linked in: btrfs zlib_deflate crc32c libcrc32c nfsd lockd nfs_acl auth_rpcgss sunrpc 8021q garp stp llc cpufreq_ondemand cachefiles fscache ipv6 ext3 jbd dm_mirror dm_region_hash dm_log dm_mod uinput ppdev iTCO_wdt iTCO_vendor_support parport_pc parport sg acpi_cpufreq freq_table mperf coretemp kvm pcspkr i2c_i801 i2c_core lpc_ich mfd_core tg3 ptp pps_core shpchp pci_hotplug i3000_edac edac_core ext4 mbcache jbd2 crc16 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_piix libata megaraid_sas scsi_mod floppy [last unloaded: microcode] [ 7913.092575] CPU 0 [ 7913.092584] Pid: 3673, comm: btrfs-endio-wri Not tainted 3.8.0-rc5 #1 FUJITSU-SV PRIMERGY/D2399 [ 7913.092608] RIP: 0010:[a04670ef] [a04670ef] btrfs_queue_worker+0x10e/0x236 [btrfs] [ 7913.092663] RSP: 0018:88019fc03c10 EFLAGS: 00010046 [ 7913.092676] RAX: RBX: 8801967b8a58 RCX: [ 7913.092894] RDX: RSI: 8801961239b8 RDI: 8801967b8ab8 [ 7913.093116] RBP: 88019fc03c50 R08: R09: 880198801180 [ 7913.093247] R10: a045fda7 R11: 0003 R12: [ 7913.093247] R13: 8801961239b8 R14: 8801967b8ab8 R15: 0246 [ 7913.093247] FS: () GS:88019fc0() knlGS: [ 7913.093247] CS: 0010 DS: ES: CR0: 8005003b [ 7913.093247] CR2: ff600400 CR3: 00019575d000 CR4: 07f0 [ 7913.093247] DR0: DR1: DR2: [ 7913.093247] DR3: DR6: 0ff0 DR7: 0400 [ 7913.093247] Process btrfs-endio-wri (pid: 3673, threadinfo 8801939ca000, task 880195795b00) [ 7913.093247] Stack: [ 7913.093247] 8801967b8a88 8801967b8a78 88003fa0a600 8801965ad0c0 [ 7913.093247] 88003fa0a600 [ 7913.096183] 88019fc03c60 a043e357 88019fc03c70 811526aa [ 7913.096183] Call Trace: [ 7913.096183] IRQ [ 7913.096183] [ 7913.096183] [a043e357] end_workqueue_bio+0x79/0x7b [btrfs] [ 7913.096183] [811526aa] bio_endio+0x2d/0x2f [ 7913.096183] [a045fdb2] btrfs_end_bio+0x10b/0x122 [btrfs] [ 7913.096183] [811526aa] bio_endio+0x2d/0x2f [ 7913.096183] [811c5e3f] req_bio_endio+0x96/0x9f [ 7913.096183] [811c601d] blk_update_request+0x1d5/0x3a4 [ 
7913.096183] [811c620c] blk_update_bidi_request+0x20/0x6f [ 7913.096183] [811c7a59] blk_end_bidi_request+0x1f/0x5d [ 7913.096183] [811c7ad3] blk_end_request+0x10/0x12 [ 7913.096183] [a001db50] scsi_io_completion+0x207/0x4f3 [scsi_mod] [ 7913.096183] [a0016df9] scsi_finish_command+0xec/0xf5 [scsi_mod] [ 7913.096183] [a001df50] scsi_softirq_done+0xff/0x108 [scsi_mod] [ 7913.096183] [811ccb3a] blk_done_softirq+0x7a/0x8e [ 7913.096183] [810475c3]
Re: [BUG] kernel BUG at fs/btrfs/async-thread.c:605!
On Thu, 31 Jan 2013 01:19:41 -0500 (est), Eric Sandeen wrote: On Jan 31, 2013, at 12:13 AM, Miao Xie mi...@cn.fujitsu.com wrote: On thu, 31 Jan 2013 12:37:49 +0900, Tsutomu Itoh wrote: Hi, In kernel 3.8-rc5, the following panics occurred when the mount was done by the degraded option. # btrfs fi sh /dev/sdc8 Label: none uuid: fc63cd80-5ae2-4fbe-8795-2d526c937a56 Total devices 3 FS bytes used 20.98GB devid1 size 9.31GB used 9.31GB path /dev/sdd8 devid2 size 9.31GB used 9.31GB path /dev/sdc8 *** Some devices missing Btrfs v0.20-rc1-37-g91d9eec # mount -o degraded /dev/sdc8 /test1 564 static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) 565 { ... ... 595 fallback: 596 fallback = NULL; 597 /* 598 * we have failed to find any workers, just 599 * return the first one we can find. 600 */ 601 if (!list_empty(workers-worker_list)) 602 fallback = workers-worker_list.next; 603 if (!list_empty(workers-idle_list)) 604 fallback = workers-idle_list.next; 605 BUG_ON(!fallback); -- this ! 606 worker = list_entry(fallback, 607 struct btrfs_worker_thread, worker_list); If worker_list is not empty, we get a worker from this list; if worker_list is empty, it means all the workers in idle_list, we get the worker from idle_list. So the above bug is introduced by the second if sentence. it should be else if. else if makes sense, but we cannot reach the BUG_ON unless both lists are empty, correct? You are right, I misread the code. Thanks Miao -Eric Thanks Miao -Tsutomu === [ 7913.075890] btrfs: allowing degraded mounts [ 7913.075893] btrfs: disk space caching is enabled [ 7913.092031] Btrfs: too many missing devices, writeable mount is not allowed [ 7913.092297] [ cut here ] [ 7913.092313] kernel BUG at fs/btrfs/async-thread.c:605! 
[ 7913.092326] invalid opcode: [#1] SMP [ 7913.092342] Modules linked in: btrfs zlib_deflate crc32c libcrc32c nfsd lockd nfs_acl auth_rpcgss sunrpc 8021q garp stp llc cpufreq_ondemand cachefiles fscache ipv6 ext3 jbd dm_mirror dm_region_hash dm_log dm_mod uinput ppdev iTCO_wdt iTCO_vendor_support parport_pc parport sg acpi_cpufreq freq_table mperf coretemp kvm pcspkr i2c_i801 i2c_core lpc_ich mfd_core tg3 ptp pps_core shpchp pci_hotplug i3000_edac edac_core ext4 mbcache jbd2 crc16 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_piix libata megaraid_sas scsi_mod floppy [last unloaded: microcode] [ 7913.092575] CPU 0 [ 7913.092584] Pid: 3673, comm: btrfs-endio-wri Not tainted 3.8.0-rc5 #1 FUJITSU-SV PRIMERGY/D2399 [ 7913.092608] RIP: 0010:[a04670ef] [a04670ef] btrfs_queue_worker+0x10e/0x236 [btrfs] [ 7913.092663] RSP: 0018:88019fc03c10 EFLAGS: 00010046 [ 7913.092676] RAX: RBX: 8801967b8a58 RCX: [ 7913.092894] RDX: RSI: 8801961239b8 RDI: 8801967b8ab8 [ 7913.093116] RBP: 88019fc03c50 R08: R09: 880198801180 [ 7913.093247] R10: a045fda7 R11: 0003 R12: [ 7913.093247] R13: 8801961239b8 R14: 8801967b8ab8 R15: 0246 [ 7913.093247] FS: () GS:88019fc0() knlGS: [ 7913.093247] CS: 0010 DS: ES: CR0: 8005003b [ 7913.093247] CR2: ff600400 CR3: 00019575d000 CR4: 07f0 [ 7913.093247] DR0: DR1: DR2: [ 7913.093247] DR3: DR6: 0ff0 DR7: 0400 [ 7913.093247] Process btrfs-endio-wri (pid: 3673, threadinfo 8801939ca000, task 880195795b00) [ 7913.093247] Stack: [ 7913.093247] 8801967b8a88 8801967b8a78 88003fa0a600 8801965ad0c0 [ 7913.093247] 88003fa0a600 [ 7913.096183] 88019fc03c60 a043e357 88019fc03c70 811526aa [ 7913.096183] Call Trace: [ 7913.096183] IRQ [ 7913.096183] [ 7913.096183] [a043e357] end_workqueue_bio+0x79/0x7b [btrfs] [ 7913.096183] [811526aa] bio_endio+0x2d/0x2f [ 7913.096183] [a045fdb2] btrfs_end_bio+0x10b/0x122 [btrfs] [ 7913.096183] [811526aa] bio_endio+0x2d/0x2f [ 7913.096183] [811c5e3f] req_bio_endio+0x96/0x9f [ 7913.096183] [811c601d] blk_update_request+0x1d5/0x3a4 [ 
7913.096183] [811c620c] blk_update_bidi_request+0x20/0x6f [ 7913.096183] [811c7a59] blk_end_bidi_request+0x1f/0x5d [ 7913.096183] [811c7ad3] blk_end_request+0x10/0x12 [ 7913.096183] [a001db50] scsi_io_completion+0x207
Re: [BUG] kernel BUG at fs/btrfs/async-thread.c:605!
On wed, 30 Jan 2013 23:55:34 -0600, Eric Sandeen wrote: === [ 7913.075890] btrfs: allowing degraded mounts [ 7913.075893] btrfs: disk space caching is enabled [ 7913.092031] Btrfs: too many missing devices, writeable mount is not allowed so this was supposed to fail the mount in open_ctree; it jumps to shutting down the worker threads. Which might result in no threads available. [ 7913.092297] [ cut here ] [ 7913.092313] kernel BUG at fs/btrfs/async-thread.c:605! [ 7913.092326] invalid opcode: [#1] SMP [ 7913.092342] Modules linked in: btrfs zlib_deflate crc32c libcrc32c nfsd lockd nfs_acl auth_rpcgss sunrpc 8021q garp stp llc cpufreq_ondemand cachefiles fscache ipv6 ext3 jbd dm_mirror dm_region_hash dm_log dm_mod uinput ppdev iTCO_wdt iTCO_vendor_support parport_pc parport sg acpi_cpufreq freq_table mperf coretemp kvm pcspkr i2c_i801 i2c_core lpc_ich mfd_core tg3 ptp pps_core shpchp pci_hotplug i3000_edac edac_core ext4 mbcache jbd2 crc16 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_piix libata megaraid_sas scsi_mod floppy [last unloaded: microcode] [ 7913.092575] CPU 0 [ 7913.092584] Pid: 3673, comm: btrfs-endio-wri Not tainted 3.8.0-rc5 #1 FUJITSU-SV PRIMERGY/D2399 [ 7913.092608] RIP: 0010:[a04670ef] [a04670ef] btrfs_queue_worker+0x10e/0x236 [btrfs] but this is already trying to do work, and has no workers to handle it. The place we jump to is fail_block_groups, and before it is this comment: /* * make sure we're done with the btree inode before we stop our * kthreads */ filemap_write_and_wait(fs_info-btree_inode-i_mapping); invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_block_groups: btrfs_free_block_groups(fs_info); if you move the fail_block_groups: target above the comment, does that fix it? (although I don't know yet what started IO . . . ) Reading the metadata of the tree root and Reading block group information started IO. so, I think this patch can fix the problem. 
like this: From: Eric Sandeen sand...@redhat.com Make sure that we are always done with the btree_inode's mapping before we shut down the worker threads in open_ctree() error cases. Signed-off-by: Eric Sandeen sand...@redhat.com diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d89da40..1e2abda 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2689,6 +2689,7 @@ fail_trans_kthread: fail_cleaner: kthread_stop(fs_info-cleaner_kthread); +fail_block_groups: /* * make sure we're done with the btree inode before we stop our * kthreads @@ -2696,7 +2697,6 @@ fail_cleaner: filemap_write_and_wait(fs_info-btree_inode-i_mapping); invalidate_inode_pages2(fs_info-btree_inode-i_mapping); -fail_block_groups: btrfs_free_block_groups(fs_info); fail_tree_roots: Just a guess; but I don't know what would have started writes already... I don't think it was write IO. It was just a soft interrupt caused by a metadata read IO, and this soft interrupt happened while btrfs-endio-write-workers was going to stop. Thanks Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [BUG] kernel BUG at fs/btrfs/async-thread.c:605!
On wed, 30 Jan 2013 23:55:34 -0600, Eric Sandeen wrote: if you move the fail_block_groups: target above the comment, does that fix it? (although I don't know yet what started IO . . . ) like this: From: Eric Sandeen sand...@redhat.com Make sure that we are always done with the btree_inode's mapping before we shut down the worker threads in open_ctree() error cases. I reviewed your patch again, and found it just fix the above problem, it still have similar problems which are not fixed. How about this one? diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0c31d07..d8fd711 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2728,13 +2728,13 @@ fail_cleaner: * kthreads */ filemap_write_and_wait(fs_info-btree_inode-i_mapping); - invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_block_groups: btrfs_free_block_groups(fs_info); fail_tree_roots: free_root_pointers(fs_info, 1); + invalidate_inode_pages2(fs_info-btree_inode-i_mapping); fail_sb_buffer: btrfs_stop_workers(fs_info-generic_worker); @@ -2755,7 +2755,6 @@ fail_alloc: fail_iput: btrfs_mapping_tree_free(fs_info-mapping_tree); - invalidate_inode_pages2(fs_info-btree_inode-i_mapping); iput(fs_info-btree_inode); fail_bdi: bdi_destroy(fs_info-bdi); -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V2 01/10] Btrfs: use atomic for btrfs_fs_info-generation
fs_info->generation is a 64-bit variable, and it can be accessed by multiple tasks; if there is no lock or other method to protect it, we might read a wrong value, especially on a 32-bit machine.
blocksize = btrfs_level_size(root, parent_level - 1); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 547b7b0..c3edb22 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1278,7 +1278,7 @@ struct btrfs_fs_info { struct btrfs_block_rsv empty_block_rsv; - u64 generation; + atomic64_t generation; u64 last_trans_committed; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 65f0367..f03aebc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1200,7 +1200,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, memset(root-root_item, 0, sizeof(root-root_item)); memset(root-defrag_progress, 0, sizeof(root-defrag_progress)); memset(root-root_kobj, 0, sizeof(root-root_kobj)); - root-defrag_trans_start = fs_info-generation; + root-defrag_trans_start = atomic64_read(fs_info-generation); init_completion(root-kobj_unregister); root-defrag_running = 0; root-root_key.objectid = objectid; @@ -2501,7 +2501,7 @@ retry_root_backup: fs_info-pending_quota_state = 1; } - fs_info-generation = generation; + atomic64_set(fs_info-generation, generation); fs_info-last_trans_committed = generation; ret = btrfs_recover_balance(fs_info); @@ -3436,12 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) int was_dirty; btrfs_assert_tree_locked(buf); - if (transid != root-fs_info-generation) + if (transid != atomic64_read(root-fs_info-generation)) WARN(1, KERN_CRIT btrfs transid mismatch buffer %llu, found %llu running %llu\n, (unsigned long long)buf-start, (unsigned long long)transid, - (unsigned long long)root-fs_info-generation); + (u64)atomic64_read(root-fs_info-generation)); was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) { spin_lock(root-fs_info-delalloc_lock); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 841cfe3..02409b6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1588,7 +1588,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, * otherwise subsequent syncs to a file that's been synced in 
this * transaction will appear to have already occured. */ - BTRFS_I(inode)-last_trans = root-fs_info-generation + 1
[PATCH V2 02/10] Btrfs: use atomic for fs_info-last_trans_committed
fs_info->last_trans_committed is a 64-bit variable, and it can be accessed by multiple tasks; if there is no lock or other method to protect it, we might read a wrong value, especially on a 32-bit machine.
(btrfs_inode_in_log(inode, atomic64_read(root-fs_info-generation)) || BTRFS_I(inode)-last_trans = - root-fs_info-last_trans_committed) { + atomic64_read(root-fs_info-last_trans_committed)) { BTRFS_I(inode)-last_trans = 0; /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index afbf3ac..3b6c339 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3114,7 +3114,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, return PTR_ERR(trans); /* No running transaction, don't bother */ - transid = root-fs_info-last_trans_committed; + transid = atomic64_read(root-fs_info-last_trans_committed); goto out; } transid = trans-transid; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index f107312..f376621 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -975,7 +975,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, * if this file hasn't been changed since the last transaction * commit, we can safely return without doing anything */ - if (last_mod root-fs_info-last_trans_committed) + if (last_mod atomic64_read(root-fs_info-last_trans_committed)) return; spin_lock(root-fs_info-ordered_extent_lock); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bdbb94f..af0b566 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2703,7 +2703,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (root-fs_info-fs_state BTRFS_SUPER_FLAG_ERROR) return -EIO; - gen = root-fs_info-last_trans_committed; + gen = atomic64_read(root-fs_info-last_trans_committed); for (i = 0; i BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 105d642..29fdf1c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -459,7 +459,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) int ret = 0; if (transid) { - if (transid = root-fs_info-last_trans_committed) + if (transid = + 
atomic64_read(root-fs_info-last_trans_committed)) goto out; ret = -EINVAL; @@ -1730,7 +1731,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, cur_trans-commit_done = 1; - root-fs_info-last_trans_committed = cur_trans-transid; + atomic64_set(root-fs_info
[PATCH V2 03/10] Btrfs: use atomic for fs_info-last_trans_log_full_commit
fs_info->last_trans_log_full_commit is a 64-bit variable, and it can be accessed by multiple tasks; if there is no lock or other method to protect it, we might read a wrong value, especially on a 32-bit machine.
btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. */ - root-fs_info-last_trans_log_full_commit = trans-transid; + atomic64_set(root-fs_info-last_trans_log_full_commit, +trans-transid); } else { ret = btrfs_insert_inode_ref(trans, dest, new_dentry-d_name.name, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7f42a53..bb7c01b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2227,14 +2227,14 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, wait, TASK_UNINTERRUPTIBLE); mutex_unlock(root-log_mutex); - if (root-fs_info-last_trans_log_full_commit != + if (atomic64_read(root-fs_info-last_trans_log_full_commit) != trans-transid root-log_transid transid + 2 atomic_read(root-log_commit[index])) schedule(); finish_wait(root-log_commit_wait[index], wait); mutex_lock(root-log_mutex); - } while (root-fs_info-last_trans_log_full_commit != + } while (atomic64_read(root-fs_info-last_trans_log_full_commit) != trans-transid root-log_transid transid + 2 atomic_read(root-log_commit[index])); return 0; @@ -2244,12 +2244,12 @@ static void wait_for_writer(struct btrfs_trans_handle *trans, struct btrfs_root *root) { DEFINE_WAIT(wait); - while (root-fs_info-last_trans_log_full_commit != + while (atomic64_read(root-fs_info-last_trans_log_full_commit) != trans-transid atomic_read(root-log_writers)) { prepare_to_wait(root-log_writer_wait, wait, TASK_UNINTERRUPTIBLE); mutex_unlock(root-log_mutex); - if (root-fs_info-last_trans_log_full_commit != + if (atomic64_read(root-fs_info-last_trans_log_full_commit) != trans-transid atomic_read(root-log_writers)) schedule(); mutex_lock(root-log_mutex); @@ -2306,7 +2306,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } /* bail out if we need to do a full commit */ - if (root-fs_info-last_trans_log_full_commit == trans-transid) { + if (atomic64_read(root-fs_info-last_trans_log_full_commit) == + 
trans-transid) { ret
[PATCH V2 04/10] Btrfs: add a comment for fs_info-max_inline
Though ->max_inline is a 64-bit variable and may be accessed by multiple tasks, it is just a suggestive number, so we needn't add anything to protect fs_info->max_inline; we just add a comment to explain why we don't use a lock to protect it.
[PATCH V2 05/10] Btrfs: protect fs_info-alloc_start
fs_info->alloc_start is a 64-bit variable that can be accessed by multiple tasks, but it is not strictly protected; it can be changed while we are accessing it.
+*/ u64 alloc_start; struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d8982e9..c96f132 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_alloc_start: num = match_strdup(args[0]); if (num) { + mutex_lock(info-chunk_mutex); info-alloc_start = memparse(num, NULL); + mutex_unlock(info-chunk_mutex); kfree(num); printk(KERN_INFO btrfs: allocations start at %llu\n, @@ -1289,7 +1291,9 @@ restore: fs_info-mount_opt = old_opts; fs_info-compress_type = old_compress_type; fs_info-max_inline = old_max_inline; + mutex_lock(fs_info-chunk_mutex); fs_info-alloc_start = old_alloc_start; + mutex_unlock(fs_info-chunk_mutex); btrfs_resize_thread_pool(fs_info, old_thread_pool_size, fs_info-thread_pool_size); fs_info-metadata_ratio = old_metadata_ratio; -- 1.7.11.7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V2 06/10] Btrfs: use percpu counter for dirty metadata count
-dirty_metadata_bytes is accessed very frequently, so use percpu counter instead of the u64 variant to reduce the contention of the lock. This patch also fixed the problem that we access it without lock protection in __btrfs_btree_balance_dirty(), which may cause we skip the dirty pages flush. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v1 - v2: - modify the changelog and make it more clear and stringency. --- fs/btrfs/ctree.h | 9 fs/btrfs/disk-io.c | 64 fs/btrfs/extent_io.c | 9 +++- 3 files changed, 42 insertions(+), 40 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 201be7d..1dcbbfd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 }; /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) +#define BTRFS_DIRTY_METADATA_THRESH(32 * 1024 * 1024) + /* * The key defines the order in the tree, and so it also defines (optimal) * block layout. 
@@ -1439,10 +1441,9 @@ struct btrfs_fs_info { u64 total_pinned; - /* protected by the delalloc lock, used to keep from writing -* metadata until there is a nice batch -*/ - u64 dirty_metadata_bytes; + /* used to keep from writing metadata until there is a nice batch */ + struct percpu_counter dirty_metadata_bytes; + s32 dirty_metadata_batch; struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 87ed05a..961ac58 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -946,18 +946,20 @@ static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct extent_io_tree *tree; + struct btrfs_fs_info *fs_info; + int ret; + tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { - struct btrfs_root *root = BTRFS_I(mapping->host)->root; - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; if (wbc->for_kupdate) return 0; + fs_info = BTRFS_I(mapping->host)->root->fs_info; /* this is a bit racy, but that's ok */ - num_dirty = root->fs_info->dirty_metadata_bytes; - if (num_dirty < thresh) + ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH); + if (ret < 0) return 0; } return btree_write_cache_pages(mapping, wbc); @@ -1125,24 +1127,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { + struct btrfs_fs_info *fs_info = root->fs_info; + if (btrfs_header_generation(buf) == - root->fs_info->running_transaction->transid) { + fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= buf->len) - root->fs_info->dirty_metadata_bytes -= buf->len; - else { - spin_unlock(&root->fs_info->delalloc_lock); - btrfs_panic(root->fs_info, -EOVERFLOW, - "Can't
clear %lu bytes from " - "dirty_mdatadata_bytes (%llu)", - buf->len, - root->fs_info->dirty_metadata_bytes); - } - spin_unlock(&root->fs_info->delalloc_lock); - + __percpu_counter_add(&fs_info->dirty_metadata_bytes, + -buf->len, + fs_info->dirty_metadata_batch); /* ugh, clear_extent_buffer_dirty needs to lock the page */ btrfs_set_lock_blocking(buf); clear_extent_buffer_dirty(buf); @@ -2004,10 +1998,18 @@ int open_ctree(struct super_block *sb, goto fail_srcu; } + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); + if (ret) { + err = ret; + goto fail_bdi; + } + fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * + (1 + ilog2(nr_cpu_ids)); + fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM