Re: [PATCH] Btrfs: place ordered operations on a per transaction list

2013-02-18 Thread Miao Xie
On wed, 13 Feb 2013 11:13:22 -0500, Josef Bacik wrote:
 Miao made the ordered operations stuff run async, which introduced a
 deadlock where we could get somebody (sync) racing in and committing the
 transaction while a commit was already happening.  The new committer would
 try and flush ordered operations which would hang waiting for the commit to
 finish because it is done asynchronously and no longer inherits the callers
 trans handle.  To fix this we need to make the ordered operations list a per
 transaction list.  We can get new inodes added to the ordered operation list
 by truncating them and then having another process writing to them, so this
 makes it so that anybody trying to add an ordered operation _must_ start a
 transaction in order to add itself to the list, which will keep new inodes
 from getting added to the ordered operations list after we start committing.
 This should fix the deadlock and also keeps us from doing a lot more work
 than we need to during commit.  Thanks,

Firstly, thanks to deal with the bug which was introduced by my patch.

But comparing with this fix method, I prefer the following one because:
- we won't worry the similar problem if we add more work during commit
  in the future.
- it is unnecessary to get a new handle and commit it if the transaction
  is under the commit.

Thanks
Miao

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index fc03aa6..c449cb5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -277,7 +277,8 @@ static void wait_current_trans(struct btrfs_root *root)
}
 }
 
-static int may_wait_transaction(struct btrfs_root *root, int type)
+static int may_wait_transaction(struct btrfs_root *root, int type,
+   bool is_joined)
 {
if (root-fs_info-log_root_recovering)
return 0;
@@ -285,6 +286,14 @@ static int may_wait_transaction(struct btrfs_root *root, 
int type)
if (type == TRANS_USERSPACE)
return 1;
 
+   /*
+* If we are ATTACH, it means we just want to catch the current
+* transaction and commit it. So if someone is committing the
+* current transaction now, it is very glad to wait it.
+*/
+   if (is_joined  type == TRANS_ATTACH)
+   return 1;
+
if (type == TRANS_START 
!atomic_read(root-fs_info-open_ioctl_trans))
return 1;
@@ -355,7 +364,7 @@ again:
if (type  TRANS_JOIN_NOLOCK)
sb_start_intwrite(root-fs_info-sb);
 
-   if (may_wait_transaction(root, type))
+   if (may_wait_transaction(root, type, false))
wait_current_trans(root);
 
do {
@@ -383,16 +392,26 @@ again:
h-block_rsv = NULL;
h-orig_rsv = NULL;
h-aborted = 0;
-   h-qgroup_reserved = qgroup_reserved;
+   h-qgroup_reserved = 0;
h-delayed_ref_elem.seq = 0;
h-type = type;
INIT_LIST_HEAD(h-qgroup_ref_list);
INIT_LIST_HEAD(h-new_bgs);
 
smp_mb();
-   if (cur_trans-blocked  may_wait_transaction(root, type)) {
-   btrfs_commit_transaction(h, root);
-   goto again;
+   if (cur_trans-blocked  may_wait_transaction(root, type, true)) {
+   if (cur_trans-in_commit) {
+   btrfs_end_transaction(h, root);
+   wait_current_trans(root);
+   } else {
+   btrfs_commit_transaction(h, root);
+   }
+   if (unlikely(type == TRANS_ATTACH)) {
+   ret = -ENOENT;
+   goto alloc_fail;
+   } else {
+   goto again;
+   }
}
 
if (num_bytes) {
@@ -401,6 +420,7 @@ again:
h-block_rsv = root-fs_info-trans_block_rsv;
h-bytes_reserved = num_bytes;
}
+   h-qgroup_reserved = qgroup_reserved;
 
 got_it:
btrfs_record_root_in_trans(h, root);
-- 
1.6.5.2

 
 Signed-off-by: Josef Bacik jba...@fusionio.com
 ---
  fs/btrfs/ctree.h|7 ---
  fs/btrfs/disk-io.c  |   11 ++-
  fs/btrfs/file.c |   15 ++-
  fs/btrfs/ordered-data.c |   13 -
  fs/btrfs/ordered-data.h |3 ++-
  fs/btrfs/transaction.c  |5 +++--
  fs/btrfs/transaction.h  |1 +
  7 files changed, 34 insertions(+), 21 deletions(-)
 
 diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
 index 0c4e4df..9f72ec8 100644
 --- a/fs/btrfs/ctree.h
 +++ b/fs/btrfs/ctree.h
 @@ -1408,13 +1408,6 @@ struct btrfs_fs_info {
   struct list_head delalloc_inodes;
  
   /*
 -  * special rename and truncate targets that must be on disk before
 -  * we're allowed to commit.  This is basically the ext3 style
 -  * data=ordered list.
 -  */
 - struct list_head ordered_operations;
 -
 - /*
* there is a pool of worker threads for checksumming during writes
* and a pool for checksumming 

Re: [PATCH] Btrfs: place ordered operations on a per transaction list

2013-02-18 Thread Josef Bacik
On Mon, Feb 18, 2013 at 04:22:09AM -0700, Miao Xie wrote:
 On wed, 13 Feb 2013 11:13:22 -0500, Josef Bacik wrote:
  Miao made the ordered operations stuff run async, which introduced a
  deadlock where we could get somebody (sync) racing in and committing the
  transaction while a commit was already happening.  The new committer would
  try and flush ordered operations which would hang waiting for the commit to
  finish because it is done asynchronously and no longer inherits the callers
  trans handle.  To fix this we need to make the ordered operations list a per
  transaction list.  We can get new inodes added to the ordered operation list
  by truncating them and then having another process writing to them, so this
  makes it so that anybody trying to add an ordered operation _must_ start a
  transaction in order to add itself to the list, which will keep new inodes
  from getting added to the ordered operations list after we start committing.
  This should fix the deadlock and also keeps us from doing a lot more work
  than we need to during commit.  Thanks,
 
 Firstly, thanks to deal with the bug which was introduced by my patch.
 
 But comparing with this fix method, I prefer the following one because:
 - we won't worry the similar problem if we add more work during commit
   in the future.
 - it is unnecessary to get a new handle and commit it if the transaction
   is under the commit.

Mine has the benefit of not making a committing transaction flush more stuff
that it doesn't need to, so I think I'll keep mine as well, but I agree yours is
good for the attach case as well.  So can you send this along properly with a
signed off and such and we can have our cake and eat it too.  Thanks,

Josef
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: place ordered operations on a per transaction list

2013-02-13 Thread Josef Bacik
Miao made the ordered operations stuff run async, which introduced a
deadlock where we could get somebody (sync) racing in and committing the
transaction while a commit was already happening.  The new committer would
try and flush ordered operations which would hang waiting for the commit to
finish because it is done asynchronously and no longer inherits the callers
trans handle.  To fix this we need to make the ordered operations list a per
transaction list.  We can get new inodes added to the ordered operation list
by truncating them and then having another process writing to them, so this
makes it so that anybody trying to add an ordered operation _must_ start a
transaction in order to add itself to the list, which will keep new inodes
from getting added to the ordered operations list after we start committing.
This should fix the deadlock and also keeps us from doing a lot more work
than we need to during commit.  Thanks,

Signed-off-by: Josef Bacik jba...@fusionio.com
---
 fs/btrfs/ctree.h|7 ---
 fs/btrfs/disk-io.c  |   11 ++-
 fs/btrfs/file.c |   15 ++-
 fs/btrfs/ordered-data.c |   13 -
 fs/btrfs/ordered-data.h |3 ++-
 fs/btrfs/transaction.c  |5 +++--
 fs/btrfs/transaction.h  |1 +
 7 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0c4e4df..9f72ec8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1408,13 +1408,6 @@ struct btrfs_fs_info {
struct list_head delalloc_inodes;
 
/*
-* special rename and truncate targets that must be on disk before
-* we're allowed to commit.  This is basically the ext3 style
-* data=ordered list.
-*/
-   struct list_head ordered_operations;
-
-   /*
 * there is a pool of worker threads for checksumming during writes
 * and a pool for checksumming after reads.  This is because readers
 * can run with FS locks held, and the writers may be waiting for
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 52573d0..6baa77d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -56,7 +56,8 @@ static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only);
-static void btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+struct btrfs_root *root);
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
  struct btrfs_root *root);
@@ -2029,7 +2030,6 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(fs_info-dead_roots);
INIT_LIST_HEAD(fs_info-delayed_iputs);
INIT_LIST_HEAD(fs_info-delalloc_inodes);
-   INIT_LIST_HEAD(fs_info-ordered_operations);
INIT_LIST_HEAD(fs_info-caching_block_groups);
spin_lock_init(fs_info-delalloc_lock);
spin_lock_init(fs_info-trans_lock);
@@ -3538,7 +3538,8 @@ void btrfs_error_commit_super(struct btrfs_root *root)
btrfs_cleanup_transaction(root);
 }
 
-static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+struct btrfs_root *root)
 {
struct btrfs_inode *btrfs_inode;
struct list_head splice;
@@ -3548,7 +3549,7 @@ static void btrfs_destroy_ordered_operations(struct 
btrfs_root *root)
mutex_lock(root-fs_info-ordered_operations_mutex);
spin_lock(root-fs_info-ordered_extent_lock);
 
-   list_splice_init(root-fs_info-ordered_operations, splice);
+   list_splice_init(t-ordered_operations, splice);
while (!list_empty(splice)) {
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
 ordered_operations);
@@ -3829,7 +3830,7 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
while (!list_empty(list)) {
t = list_entry(list.next, struct btrfs_transaction, list);
 
-   btrfs_destroy_ordered_operations(root);
+   btrfs_destroy_ordered_operations(t, root);
 
btrfs_destroy_ordered_extents(root);
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 237bf92..848442c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1629,7 +1629,20 @@ int btrfs_release_file(struct inode *inode, struct file 
*filp)
 */
if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
   BTRFS_I(inode)-runtime_flags)) {
-   btrfs_add_ordered_operation(NULL, BTRFS_I(inode)-root, inode);
+   struct btrfs_trans_handle *trans;
+