[PATCH v2] Btrfs: setup free ino caching in a more asynchronous way

2011-05-26 Thread Li Zefan
For a filesystem that has lots of files in it, the first time we mount
it with free ino caching support, it can take quite a long time to
set up the caching before we can create new files.

Here we fill the cache with [highest_ino, BTRFS_LAST_FREE_OBJECTID]
before we start the caching thread to search through the extent tree.

Signed-off-by: Li Zefan l...@cn.fujitsu.com
---

based on the integration-test branch.

v2: fixed an off-by-one bug

---
 fs/btrfs/inode-map.c |   28 ++--
 1 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 0009705..3262cd1 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -60,12 +60,12 @@ again:
 
 	while (1) {
 		smp_mb();
-		if (fs_info->closing > 1)
+		if (fs_info->closing)
 			goto out;
 
 		leaf = path->nodes[0];
 		slot = path->slots[0];
-		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+		if (slot >= btrfs_header_nritems(leaf)) {
 			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
 				goto out;
@@ -100,7 +100,7 @@ again:
 		if (key.type != BTRFS_INODE_ITEM_KEY)
 			goto next;
 
-		if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
+		if (key.objectid >= root->highest_objectid)
 			break;
 
 		if (last != (u64)-1 && last + 1 != key.objectid) {
@@ -114,9 +114,9 @@ next:
 		path->slots[0]++;
 	}
 
-	if (last < BTRFS_LAST_FREE_OBJECTID - 1) {
+	if (last < root->highest_objectid - 1) {
 		__btrfs_add_free_space(ctl, last + 1,
-				       BTRFS_LAST_FREE_OBJECTID - last - 1);
+				       root->highest_objectid - last - 1);
 	}
 
 	spin_lock(&root->cache_lock);
@@ -136,8 +136,10 @@ out:
 
 static void start_caching(struct btrfs_root *root)
 {
+	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
 	struct task_struct *tsk;
 	int ret;
+	u64 objectid;
 
 	spin_lock(&root->cache_lock);
 	if (root->cached != BTRFS_CACHE_NO) {
@@ -156,6 +158,19 @@ static void start_caching(struct btrfs_root *root)
 		return;
 	}
 
+	/*
+	 * It can be quite time-consuming to fill the cache by searching
+	 * through the extent tree, and this can keep ino allocation path
+	 * waiting. Therefore at start we quickly find out the highest
+	 * inode number and we know we can use inode numbers which fall in
+	 * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID].
+	 */
+	ret = btrfs_find_free_objectid(root, &objectid);
+	if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
+		__btrfs_add_free_space(ctl, objectid,
+				       BTRFS_LAST_FREE_OBJECTID - objectid + 1);
+	}
+
 	tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n",
 			  root->root_key.objectid);
 	BUG_ON(IS_ERR(tsk));
@@ -209,7 +224,8 @@ again:
 
 	start_caching(root);
 
-	if (objectid <= root->cache_progress)
+	if (objectid <= root->cache_progress ||
+	    objectid > root->highest_objectid)
 		__btrfs_add_free_space(ctl, objectid, 1);
 	else
 		__btrfs_add_free_space(pinned, objectid, 1);
-- 
1.7.3.1


Re: [PATCH v2] Btrfs: setup free ino caching in a more asynchronous way

2011-05-26 Thread Arne Jansen
On 26.05.2011 08:38, Li Zefan wrote:
 For a filesystem that has lots of files in it, the first time we mount
 it with free ino caching support, it can take quite a long time to
 setup the caching before we can create new files.
 
 Here we fill the cache with [highest_ino, BTRFS_LAST_FREE_OBJECTID]
 before we start the caching thread to search through the extent tree.

This also sounds like a very good consumer for the readahead API.

-Arne

 
 [...]


[PATCH 02/11 v2] Btrfs: update block generation if should_cow_block fails

2011-05-26 Thread Liu Bo
Since we've added sub transactions, when btrfs decides not to COW a block we
still need to get the new sub transid recorded.  This is used by the log code
to find the most up-to-date file extents.

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/ctree.c |   34 +-
 1 files changed, 33 insertions(+), 1 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0c3b515..7e21fa9 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -484,6 +484,33 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static inline void update_block_generation(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct extent_buffer *buf,
+					   struct extent_buffer *parent,
+					   int slot)
+{
+	/*
+	 * If it does not need to cow this block, we still need to
+	 * update the block's generation, for transid may have been
+	 * changed during fsync.
+	 */
+	if (btrfs_header_generation(buf) == trans->transid)
+		return;
+
+	if (buf == root->node) {
+		btrfs_set_header_generation(buf, trans->transid);
+		btrfs_mark_buffer_dirty(buf);
+		add_root_to_dirty_list(root);
+	} else {
+		btrfs_set_node_ptr_generation(parent, slot,
+					      trans->transid);
+		btrfs_set_header_generation(buf, trans->transid);
+		btrfs_mark_buffer_dirty(parent);
+		btrfs_mark_buffer_dirty(buf);
+	}
+}
+
 static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct extent_buffer *buf)
@@ -524,6 +551,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	}
 
 	if (!should_cow_block(trans, root, buf)) {
+		update_block_generation(trans, root, buf, parent, parent_slot);
 		*cow_ret = buf;
 		return 0;
 	}
@@ -1639,8 +1667,12 @@ again:
 			 * then we don't want to set the path blocking,
 			 * so we test it here
 			 */
-			if (!should_cow_block(trans, root, b))
+			if (!should_cow_block(trans, root, b)) {
+				update_block_generation(trans, root, b,
+							p->nodes[level + 1],
+							p->slots[level + 1]);
 				goto cow_done;
+			}
 
 			btrfs_set_path_blocking(p);
 
-- 
1.6.5.2



[PATCH 04/11 v2] Btrfs: introduce first sub trans

2011-05-26 Thread Liu Bo
In multi-thread situations, writeback of a file may span several sub
transactions, so we introduce first_sub_trans to record the sub_transid of
the first sub transaction that modified the inode, so that the log code can
skip file extents which have already been logged or committed to disk.

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/btrfs_inode.h |9 +
 fs/btrfs/inode.c   |   13 -
 fs/btrfs/transaction.h |   17 -
 3 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 57c3bb2..fb5617a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -79,6 +79,15 @@ struct btrfs_inode {
 	/* sequence number for NFS changes */
 	u64 sequence;
 
+	/* used to avoid race of first_sub_trans */
+	spinlock_t sub_trans_lock;
+
+	/*
+	 * sub transid of the trans that first modified this inode before
+	 * a trans commit or a log sync
+	 */
+	u64 first_sub_trans;
+
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7242ebb..e1e5053 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6574,7 +6574,16 @@ again:
 	set_page_dirty(page);
 	SetPageUptodate(page);
 
-	BTRFS_I(inode)->last_trans = root->fs_info->generation;
+	spin_lock(&BTRFS_I(inode)->sub_trans_lock);
+
+	if (BTRFS_I(inode)->first_sub_trans > root->fs_info->sub_generation ||
+	    BTRFS_I(inode)->last_trans <= BTRFS_I(inode)->logged_trans ||
+	    BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed)
+		BTRFS_I(inode)->first_sub_trans = root->fs_info->sub_generation;
+
+	spin_unlock(&BTRFS_I(inode)->sub_trans_lock);
+
+	BTRFS_I(inode)->last_trans = root->fs_info->sub_generation;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
 
 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
@@ -6768,6 +6777,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->space_info = NULL;
 	ei->generation = 0;
 	ei->sequence = 0;
+	ei->first_sub_trans = 0;
 	ei->last_trans = 0;
 	ei->last_sub_trans = 0;
 	ei->logged_trans = 0;
@@ -6791,6 +6801,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
 	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
 	mutex_init(&ei->log_mutex);
+	spin_lock_init(&ei->sub_trans_lock);
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	INIT_LIST_HEAD(&ei->i_orphan);
 	INIT_LIST_HEAD(&ei->delalloc_inodes);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 6dcdd28..d531aea 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -83,7 +83,22 @@ static inline void btrfs_update_inode_block_group(
 static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 					      struct inode *inode)
 {
-	BTRFS_I(inode)->last_trans = trans->transaction->transid;
+	spin_lock(&BTRFS_I(inode)->sub_trans_lock);
+
+	/*
+	 * We have joined in a transaction, so btrfs_commit_transaction will
+	 * definitely wait for us and it does not need to add an extra
+	 * trans_mutex lock here.
+	 */
+	if (BTRFS_I(inode)->first_sub_trans > trans->transid ||
+	    BTRFS_I(inode)->last_trans <= BTRFS_I(inode)->logged_trans ||
+	    BTRFS_I(inode)->last_trans <=
+			BTRFS_I(inode)->root->fs_info->last_trans_committed)
+		BTRFS_I(inode)->first_sub_trans = trans->transid;
+
+	spin_unlock(&BTRFS_I(inode)->sub_trans_lock);
+
+	BTRFS_I(inode)->last_trans = trans->transid;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
 }
 
 
-- 
1.6.5.2



[PATCH 03/11 v2] Btrfs: modify btrfs_drop_extents API

2011-05-26 Thread Liu Bo
We want to use btrfs_drop_extents() in the log code.

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/ctree.h|3 ++-
 fs/btrfs/file.c |9 +++--
 fs/btrfs/inode.c|6 +++---
 fs/btrfs/ioctl.c|4 ++--
 fs/btrfs/tree-log.c |2 +-
 5 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ef68108..1ba3f91 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2575,7 +2575,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
 extern const struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
-		       u64 start, u64 end, u64 *hint_byte, int drop_cache);
+		       u64 start, u64 end, u64 *hint_byte, int drop_cache,
+		       int log);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 75899a0..d19cf3a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -290,7 +290,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
  * is deleted from the tree.
  */
 int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
-		       u64 start, u64 end, u64 *hint_byte, int drop_cache)
+		       u64 start, u64 end, u64 *hint_byte, int drop_cache,
+		       int log)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_buffer *leaf;
@@ -309,6 +310,10 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
 	int recow;
 	int ret;
 
+	/* drop the existed extents in log tree */
+	if (log)
+		root = root->log_root;
+
 	if (drop_cache)
 		btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
@@ -489,7 +494,7 @@ next_slot:
 						extent_end - key.offset);
 				extent_end = ALIGN(extent_end,
 						   root->sectorsize);
-			} else if (disk_bytenr > 0) {
+			} else if (disk_bytenr > 0 && !log) {
 				ret = btrfs_free_extent(trans, root,
 						disk_bytenr, num_bytes, 0,
 						root->root_key.objectid,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8957c5d..7242ebb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -244,7 +244,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 	}
 
 	ret = btrfs_drop_extents(trans, inode, start, aligned_end,
-				 &hint_byte, 1);
+				 &hint_byte, 1, 0);
 	BUG_ON(ret);
 
 	if (isize > actual_end)
@@ -1640,7 +1640,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	 * with the others.
 	 */
 	ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
-				 &hint, 0);
+				 &hint, 0, 0);
 	BUG_ON(ret);
 
 	ins.objectid = inode->i_ino;
@@ -3650,7 +3650,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 
 		err = btrfs_drop_extents(trans, inode, cur_offset,
 					 cur_offset + hole_size,
-					 &hint_byte, 1);
+					 &hint_byte, 1, 0);
 		if (err)
 			break;
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cab08fa..5135579 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2020,7 +2020,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			ret = btrfs_drop_extents(trans, inode,
 						 new_key.offset,
 						 new_key.offset + datal,
-						 &hint_byte, 1);
+						 &hint_byte, 1, 0);
 			BUG_ON(ret);
 
 			ret = btrfs_insert_empty_item(trans, root, path,
@@ -2075,7 +2075,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			ret = btrfs_drop_extents(trans, inode,
 						 new_key.offset,
 						 new_key.offset + datal,
-

[PATCH 01/11 v2] Btrfs: introduce sub transaction stuff

2011-05-26 Thread Liu Bo
Introduce a new concept, the sub transaction; the relation between
transactions and sub transactions is

transaction A       ---> transid = x
   sub trans a(1)   ---> sub_transid = x+1
   sub trans a(2)   ---> sub_transid = x+2
     ... ...
   sub trans a(n-1) ---> sub_transid = x+n-1
   sub trans a(n)   ---> sub_transid = x+n
transaction B       ---> transid = x+n+1
     ... ...

And the most important points are:
a) a trans handle's transid now gets its value from the sub transid instead
   of the transid.
b) when a transaction commits, the transid may not be incremented by 1, but
   depends on the biggest sub transid of the previous transaction, i.e.
	B->transid = a(n)->transid + 1,
	(B->transid - A->transid) >= 1
c) we start a new sub transaction after a fsync.
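
To make the numbering concrete, here is a toy model of the counter
(illustrative only, not the patch's code; the real series keeps the counter
in fs_info->sub_generation):

	/* every open of a transaction or sub transaction bumps one
	 * global counter; A gets x, a(1)..a(n) get x+1..x+n, and B
	 * then opens with x+n+1 */
	struct toy_fs {
		u64 sub_generation;	/* last handed-out (sub) transid */
	};

	static u64 toy_next_transid(struct toy_fs *fs)
	{
		return ++fs->sub_generation;
	}

So with x = 100 and two fsyncs inside transaction A, the sub transactions
get transids 101 and 102, and B opens with transid 103.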

We also switch some 'trans->transid' uses to 'trans->transaction->transid'
to ensure btrfs works well and to get rid of WARNings.

These are used for the new log code.

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/ctree.c   |   35 ++-
 fs/btrfs/ctree.h   |1 +
 fs/btrfs/disk-io.c |7 ---
 fs/btrfs/extent-tree.c |   10 ++
 fs/btrfs/inode.c   |4 ++--
 fs/btrfs/ioctl.c   |2 +-
 fs/btrfs/relocation.c  |6 +++---
 fs/btrfs/transaction.c |   13 +
 fs/btrfs/transaction.h |1 +
 fs/btrfs/tree-defrag.c |2 +-
 fs/btrfs/tree-log.c|   16 ++--
 11 files changed, 60 insertions(+), 37 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 84d7ca1..0c3b515 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -201,9 +201,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	int level;
 	struct btrfs_disk_key disk_key;
 
-	WARN_ON(root->ref_cows && trans->transid !=
+	WARN_ON(root->ref_cows && trans->transaction->transid !=
 		root->fs_info->running_transaction->transid);
-	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+	WARN_ON(root->ref_cows && trans->transid < root->last_trans);
 
 	level = btrfs_header_level(buf);
 	if (level == 0)
@@ -398,9 +398,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
 	btrfs_assert_tree_locked(buf);
 
-	WARN_ON(root->ref_cows && trans->transid !=
+	WARN_ON(root->ref_cows && trans->transaction->transid !=
 		root->fs_info->running_transaction->transid);
-	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+	WARN_ON(root->ref_cows && trans->transid < root->last_trans);
 
 	level = btrfs_header_level(buf);
 
@@ -466,7 +466,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		else
 			parent_start = 0;
 
-		WARN_ON(trans->transid != btrfs_header_generation(parent));
+		WARN_ON(btrfs_header_generation(parent) >
+			trans->transaction->transid);
 		btrfs_set_node_blockptr(parent, parent_slot,
 					cow->start);
 		btrfs_set_node_ptr_generation(parent, parent_slot,
@@ -487,7 +488,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct extent_buffer *buf)
 {
-	if (btrfs_header_generation(buf) == trans->transid &&
+	if (btrfs_header_generation(buf) >= trans->transaction->transid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
 	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
@@ -515,7 +516,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		       root->fs_info->running_transaction->transid);
 		WARN_ON(1);
 	}
-	if (trans->transid != root->fs_info->generation) {
+	if (trans->transaction->transid != root->fs_info->generation) {
 		printk(KERN_CRIT "trans %llu running %llu\n",
 		       (unsigned long long)trans->transid,
 		       (unsigned long long)root->fs_info->generation);
@@ -618,7 +619,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 
 	if (trans->transaction != root->fs_info->running_transaction)
 		WARN_ON(1);
-	if (trans->transid != root->fs_info->generation)
+	if (trans->transaction->transid != root->fs_info->generation)
 		WARN_ON(1);
 
 	parent_nritems = btrfs_header_nritems(parent);
@@ -898,7 +899,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	mid = path->nodes[level];
 
 	WARN_ON(!path->locks[level]);
-	WARN_ON(btrfs_header_generation(mid) != trans->transid);
+	WARN_ON(btrfs_header_generation(mid) > trans->transaction->transid);
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
@@ -1105,7 +1106,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
  

[PATCH 00/11 v2] Btrfs: improve write ahead log with sub transaction

2011-05-26 Thread Liu Bo
I've been working to try to improve the write-ahead log's performance,
and I found that the bottleneck lies in the checksum items, especially
when we make a random write on a large file, e.g. a 4G file.

An idea for this, suggested by Chris, is to use sub transaction ids and just
log the part of the inode that has changed since either the last log commit
or the last transaction commit.  And as we also push the sub transid into the
btree blocks, we'll get much faster tree walks.  As a result, we abandon the
original brute force approach, which is to delete all items of the inode in
the log to make sure we get the most up-to-date copies of everything, and
instead we manage to find and merge, i.e. find extents in the log tree and
merge in the new extents from the file.
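
In rough pseudo-C, the per-inode logging step becomes something like this
(a sketch only; for_each_changed_file_extent() and copy_extent_to_log() are
made-up names standing in for the real loops in patch 6):

	/* log only what changed since the last log/transaction commit */
	for_each_changed_file_extent(inode, since_transid, start, extent_end) {
		/* punch out whatever overlaps this range in the log tree */
		btrfs_drop_extents(trans, inode, start, extent_end,
				   &hint, 0 /* drop_cache */, 1 /* log */);
		/* then copy the new extent item into the log */
		copy_extent_to_log(trans, inode, start, extent_end);
	}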

This patchset puts the above idea into code, and although the code is now more
complex, it brings us a great deal of performance improvement.

Besides the log improvement, patch 8 fixes a small but critical bug in the
log code with sub transactions.

Here are some test results; I used sysbench to do random write + fsync.

===
sysbench --test=fileio --num-threads=1 --file-num=2 --file-block-size=4K 
--file-total-size=8G --file-test-mode=rndwr --file-io-mode=sync 
--file-extra-flags=  [prepare, run]
===

Sysbench args:
  - Number of threads: 1
  - Extra file open flags: 0
  - 2 files, 4Gb each
  - Block size 4Kb
  - Number of random requests for random IO: 10000
  - Read/Write ratio for combined random IO test: 1.50
  - Periodic FSYNC enabled, calling fsync() each 100 requests.
  - Calling fsync() at the end of test, Enabled.
  - Using synchronous I/O mode
  - Doing random write test

Sysbench results:
===
   Operations performed:  0 Read, 10000 Write, 200 Other = 10200 Total
   Read 0b  Written 39.062Mb  Total transferred 39.062Mb
===
a) without patch:  (*SPEED* : 451.01Kb/sec)
   112.75 Requests/sec executed

b) with patch: (*SPEED* : 4.3621Mb/sec)
   1116.71 Requests/sec executed

v1->v2: fix an EEXIST caused by logged_trans and a mismatch caused by log root generation

Liu Bo (11):
  Btrfs: introduce sub transaction stuff
  Btrfs: update block generation if should_cow_block fails
  Btrfs: modify btrfs_drop_extents API
  Btrfs: introduce first sub trans
  Btrfs: still update inode trans stuff when size remains unchanged
  Btrfs: improve log with sub transaction
  Btrfs: add checksum check for log
  Btrfs: fix a bug of log check
  Btrfs: kick off useless code
  Btrfs: deal with EEXIST after iput
  Btrfs: use the right generation number to read log_root_tree

 fs/btrfs/btrfs_inode.h |   12 ++-
 fs/btrfs/ctree.c   |   69 +
 fs/btrfs/ctree.h   |5 +-
 fs/btrfs/disk-io.c |   12 +-
 fs/btrfs/extent-tree.c |   10 +-
 fs/btrfs/file.c|   22 ++---
 fs/btrfs/inode.c   |   33 ---
 fs/btrfs/ioctl.c   |6 +-
 fs/btrfs/relocation.c  |6 +-
 fs/btrfs/transaction.c |   13 ++-
 fs/btrfs/transaction.h |   19 +++-
 fs/btrfs/tree-defrag.c |2 +-
 fs/btrfs/tree-log.c|  267 +++-
 13 files changed, 330 insertions(+), 146 deletions(-)



[PATCH 08/11 v2] Btrfs: fix a bug of log check

2011-05-26 Thread Liu Bo
The current code uses struct root's last_log_commit to check if an inode
has been logged, but the problem is that root->last_log_commit is shared
among files.  Say we have N inodes to be logged: after the first inode,
root->last_log_commit is updated and the remaining N-1 will not be logged.

As we've introduced sub transactions and filled the inode's last_trans and
logged_trans with the sub_transid instead of the transaction id, we can just
compare last_trans with logged_trans to determine if the inode being
processed has been logged.  More importantly, these two values are
per-inode, so they will not interfere with other inodes.
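
In other words, the check reduces to a pure per-inode comparison, roughly
(a sketch of the idea, not the patch's literal code):

	/*
	 * The inode is already in the log iff nothing modified it since
	 * the last time it was logged; both values are sub transids.
	 */
	if (BTRFS_I(inode)->last_trans <= BTRFS_I(inode)->logged_trans)
		return 1;	/* already in the log, nothing to do */
	return 0;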

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/btrfs_inode.h |5 -
 fs/btrfs/ctree.h   |1 -
 fs/btrfs/disk-io.c |2 --
 fs/btrfs/inode.c   |2 --
 fs/btrfs/transaction.h |1 -
 fs/btrfs/tree-log.c|   16 +++-
 6 files changed, 3 insertions(+), 24 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index fb5617a..d3a570c 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -94,11 +94,6 @@ struct btrfs_inode {
 	u64 last_trans;
 
 	/*
-	 * log transid when this inode was last modified
-	 */
-	u64 last_sub_trans;
-
-	/*
 	 * transid that last logged this inode
 	 */
 	u64 logged_trans;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1ba3f91..73aa36b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1114,7 +1114,6 @@ struct btrfs_root {
 	atomic_t log_writers;
 	atomic_t log_commit[2];
 	unsigned long log_transid;
-	unsigned long last_log_commit;
 	unsigned long log_batch;
 	pid_t log_start_pid;
 	bool log_multiple_pids;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a51c13c..5271365 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1079,7 +1079,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	atomic_set(&root->log_writers, 0);
 	root->log_batch = 0;
 	root->log_transid = 0;
-	root->last_log_commit = 0;
 	extent_io_tree_init(&root->dirty_log_pages,
 			    fs_info->btree_inode->i_mapping, GFP_NOFS);
 
@@ -1216,7 +1215,6 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	WARN_ON(root->log_root);
 	root->log_root = log_root;
 	root->log_transid = 0;
-	root->last_log_commit = 0;
 	return 0;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1ba98fd..8db16fa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6585,7 +6585,6 @@ again:
 	spin_unlock(&BTRFS_I(inode)->sub_trans_lock);
 
 	BTRFS_I(inode)->last_trans = root->fs_info->sub_generation;
-	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
 
 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
@@ -6780,7 +6779,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->sequence = 0;
 	ei->first_sub_trans = 0;
 	ei->last_trans = 0;
-	ei->last_sub_trans = 0;
 	ei->logged_trans = 0;
 	ei->delalloc_bytes = 0;
 	ei->reserved_bytes = 0;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d531aea..e169553 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -99,7 +99,6 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 	spin_unlock(&BTRFS_I(inode)->sub_trans_lock);
 
 	BTRFS_I(inode)->last_trans = trans->transid;
-	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
 }
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ba014ea..8bedfb8 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1967,7 +1967,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
-	unsigned long log_transid = 0;
 
 	mutex_lock(&root->log_mutex);
 	index1 = root->log_transid % 2;
@@ -2002,8 +2001,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	log_transid = root->log_transid;
-	if (log_transid % 2 == 0)
+	if (root->log_transid % 2 == 0)
 		mark = EXTENT_DIRTY;
 	else
 		mark = EXTENT_NEW;
@@ -2108,11 +2106,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	write_ctree_super(trans, root->fs_info->tree_root, 1);
 	ret = 0;
 
-	mutex_lock(&root->log_mutex);
-	if (root->last_log_commit < log_transid)
-		root->last_log_commit = log_transid;
-	mutex_unlock(&root->log_mutex);
-
 out_wake_log_root:
 	atomic_set(&log_root_tree->log_commit[index2], 0);
 	smp_mb();
@@ -3045,14 +3038,11 @@ out:
 static int inode_in_log(struct btrfs_trans_handle *trans,
 

[PATCH 11/11 v2] Btrfs: use the right generation number to read log_root_tree

2011-05-26 Thread Liu Bo
Currently we use the generation number of the super to read in the log
tree root after a crash.  This doesn't always match the sub trans id and
so it doesn't always match the transid stored in the btree blocks.

We can use log_root_transid to record the log_root_tree's generation so
that when we recover from a crash, we can match log_root_tree's btree
blocks.

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/disk-io.c  |3 ++-
 fs/btrfs/tree-log.c |2 ++
 2 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5271365..517655f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2103,6 +2103,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (btrfs_super_log_root(disk_super) != 0 &&
 	    !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
 		u64 bytenr = btrfs_super_log_root(disk_super);
+		u64 log_root_transid = btrfs_super_log_root_transid(disk_super);
 
 		if (fs_devices->rw_devices == 0) {
 			printk(KERN_WARNING "Btrfs log replay required "
@@ -2125,7 +2126,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 		log_tree_root->node = read_tree_block(tree_root, bytenr,
 						      blocksize,
-						      generation + 1);
+						      log_root_transid);
 		ret = btrfs_recover_log_trees(log_tree_root);
 		BUG_ON(ret);
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fea4f39..b033ba3 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2089,6 +2089,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 			       log_root_tree->node->start);
 	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
 				       btrfs_header_level(log_root_tree->node));
+	btrfs_set_super_log_root_transid(root->fs_info->super_for_commit,
+					 trans->transid);
 
 	log_root_tree->log_batch = 0;
 	log_root_tree->log_transid++;
-- 
1.6.5.2



[PATCH 10/11 v2] Btrfs: deal with EEXIST after iput

2011-05-26 Thread Liu Bo
There are two cases when BTRFS_I(inode)->logged_trans is zero:
a) the inode has just been allocated;
b) the inode was iput and then reread.

However, in case b), if the transaction has not been committed yet, this
inode _may_ still remain in the log tree.

So we need to check the log tree to give logged_trans the right value, in
case we hit an EEXIST while logging.

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/inode.c|9 +++--
 fs/btrfs/tree-log.c |   43 +++
 2 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8db16fa..e310b5b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1770,12 +1770,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
-	ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
-	if (!ret) {
-		ret = btrfs_update_inode(trans, root, inode);
-		BUG_ON(ret);
-	} else
-		btrfs_set_inode_last_trans(trans, inode);
+	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+	ret = btrfs_update_inode(trans, root, inode);
+	BUG_ON(ret);
 	ret = 0;
 out:
 	if (nolock) {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8bedfb8..fea4f39 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3035,6 +3035,37 @@ out:
 	return ret;
 }
 
+static int check_logged_trans(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root, struct inode *inode)
+{
+	struct btrfs_inode_item *inode_item;
+	struct btrfs_path *path;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(trans, root,
+				&BTRFS_I(inode)->location, path, 0, 0);
+	if (ret) {
+		if (ret > 0)
+			ret = 0;
+		goto out;
+	}
+
+	btrfs_unlock_up_safe(path, 1);
+	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				    struct btrfs_inode_item);
+
+	BTRFS_I(inode)->logged_trans = btrfs_inode_transid(path->nodes[0],
+							   inode_item);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+
 static int inode_in_log(struct btrfs_trans_handle *trans,
 			struct inode *inode)
 {
@@ -3087,6 +3118,18 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto end_no_trans;
 
+	/*
+	 * After we iput an inode and reread it from disk, logged_trans is 0.
+	 * However, this inode _may_ still remain in the log tree and not be
+	 * committed yet.
+	 * So we need to check the log tree to give logged_trans the right
+	 * value.
+	 */
+	if (!BTRFS_I(inode)->logged_trans && root->log_root) {
+		ret = check_logged_trans(trans, root->log_root, inode);
+		if (ret)
+			goto end_no_trans;
+	}
+
 	if (inode_in_log(trans, inode)) {
 		ret = BTRFS_NO_LOG_SYNC;
 		goto end_no_trans;
-- 
1.6.5.2



[PATCH 06/11 v2] Btrfs: improve log with sub transaction

2011-05-26 Thread Liu Bo
When logging an inode _A_, current btrfs will
a) clear all items belonging to _A_ in the log,
b) copy all items belonging to _A_ from the fs/file tree to the log tree,
and this just wastes a lot of time, especially when logging big files.

So we want to use a smarter approach, i.e. find and merge.
The file extent items are the most numerous, so we focus on them.
Thanks to sub transactions, we can now find those file extent items which
changed after the last _transaction commit_ or the last _log commit_, and
then merge them with the existing ones in the log tree.

This is of great help to fsync performance, since the common case is
making changes to a _part_ of an inode.

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/tree-log.c |  177 ---
 1 files changed, 126 insertions(+), 51 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fa0e8e4..28c3190 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2564,60 +2564,106 @@ again:
 }
 
 /*
- * a helper function to drop items from the log before we relog an
- * inode.  max_key_type indicates the highest item type to remove.
- * This cannot be run for file data extents because it does not
- * free the extents they point to.
+ * a helper function to drop items from the log before we merge
+ * the uptodate items into the log tree.
  */
-static int drop_objectid_items(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *log,
-			       struct btrfs_path *path,
-			       u64 objectid, int max_key_type)
+static int prepare_for_merge_items(struct btrfs_trans_handle *trans,
+				   struct inode *inode,
+				   struct extent_buffer *eb,
+				   int slot, int nr)
 {
-	int ret;
-	struct btrfs_key key;
+	struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
+	struct btrfs_path *path;
 	struct btrfs_key found_key;
+	struct btrfs_key key;
+	int i;
+	int ret;
 
-	key.objectid = objectid;
-	key.type = max_key_type;
-	key.offset = (u64)-1;
+	/* There are no relative items of the inode in log. */
+	if (BTRFS_I(inode)->logged_trans < trans->transaction->transid)
+		return 0;
 
-	while (1) {
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	for (i = 0; i < nr; i++) {
+		btrfs_item_key_to_cpu(eb, &key, i + slot);
+
+		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+			struct btrfs_file_extent_item *fi;
+			int found_type;
+			u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+			u64 start = key.offset;
+			u64 extent_end;
+			u64 hint;
+			unsigned long size;
+
+			fi = btrfs_item_ptr(eb, slot + i,
+					    struct btrfs_file_extent_item);
+			found_type = btrfs_file_extent_type(eb, fi);
+
+			if (found_type == BTRFS_FILE_EXTENT_REG ||
+			    found_type == BTRFS_FILE_EXTENT_PREALLOC)
+				extent_end = start +
+					btrfs_file_extent_num_bytes(eb, fi);
+			else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+				size = btrfs_file_extent_inline_len(eb, fi);
+				extent_end = (start + size + mask) & ~mask;
+			} else
+				BUG_ON(1);
+
+			/* drop any overlapping extents */
+			ret = btrfs_drop_extents(trans, inode, start,
+						 extent_end, &hint, 0, 1);
+			BUG_ON(ret);
+
+			continue;
+		}
+
+		/* non file extent */
 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
-		BUG_ON(ret == 0);
 		if (ret < 0)
 			break;
 
-		if (path->slots[0] == 0)
+		/* empty log! */
+		if (ret > 0 && path->slots[0] == 0)
 			break;
 
-		path->slots[0]--;
+		if (ret > 0) {
+			btrfs_release_path(log, path);
+			continue;
+		}
+
 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 				      path->slots[0]);
 
-		if (found_key.objectid != objectid)
-			break;
+		if (btrfs_comp_cpu_keys(&found_key, &key))
+			BUG_ON(1);
 
 		ret = btrfs_del_item(trans, log, path);
 		BUG_ON(ret);
 		btrfs_release_path(log, 

[PATCH 07/11 v2] Btrfs: add checksum check for log

2011-05-26 Thread Liu Bo
If an inode is a BTRFS_INODE_NODATASUM one, we need not look for csum items
any more.

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/tree-log.c |   13 -
 1 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 28c3190..ba014ea 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2655,7 +2655,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 			       struct inode *inode,
 			       struct btrfs_path *dst_path,
 			       struct extent_buffer *src,
-			       int start_slot, int nr, int inode_only)
+			       int start_slot, int nr, int inode_only,
+			       int csum)
 {
 	unsigned long src_offset;
 	unsigned long dst_offset;
@@ -2722,7 +2723,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 		 * or deletes of this inode don't have to relog the inode
 		 * again
 		 */
-		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
+		if (btrfs_key_type(ins_keys + i) ==
+		    BTRFS_EXTENT_DATA_KEY && csum) {
 			int found_type;
 			extent = btrfs_item_ptr(src, start_slot + i,
 						struct btrfs_file_extent_item);
@@ -2836,6 +2838,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	int ins_start_slot = 0;
 	int ins_nr;
 	u64 transid;
+	int csum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ? 0 : 1;
 
 	/*
 	 * We use transid in btrfs_search_forward() as a filter, in order to
@@ -2906,7 +2909,7 @@ filter:
 			if (ins_nr) {
 				ret = copy_items(trans, inode, dst_path, src,
 						 ins_start_slot,
-						 ins_nr, inode_only);
+						 ins_nr, inode_only, csum);
 				if (ret) {
 					err = ret;
 					goto out_unlock;
@@ -2925,7 +2928,7 @@ next_slot:
 		if (ins_nr) {
 			ret = copy_items(trans, inode, dst_path, src,
 					 ins_start_slot,
-					 ins_nr, inode_only);
+					 ins_nr, inode_only, csum);
 			if (ret) {
 				err = ret;
 				goto out_unlock;
@@ -2946,7 +2949,7 @@ next_slot:
 	if (ins_nr) {
 		ret = copy_items(trans, inode, dst_path, src,
 				 ins_start_slot,
-				 ins_nr, inode_only);
+				 ins_nr, inode_only, csum);
 		if (ret) {
 			err = ret;
 			goto out_unlock;
-- 
1.6.5.2



[PATCH 09/11 v2] Btrfs: kick off useless code

2011-05-26 Thread Liu Bo
fsync will wait for writeback until it finishes, and last_trans will get the
real transid recorded during writeback, so we do not need an extra +1 to
ensure that fsync processes the file.

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/file.c |   13 -
 1 files changed, 0 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d19cf3a..73c46e2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1146,19 +1146,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
 	mutex_unlock(&inode->i_mutex);
 
-	/*
-	 * we want to make sure fsync finds this change
-	 * but we haven't joined a transaction running right now.
-	 *
-	 * Later on, someone is sure to update the inode and get the
-	 * real transid recorded.
-	 *
-	 * We set last_trans now to the fs_info generation + 1,
-	 * this will either be one more than the running transaction
-	 * or the generation used for the next transaction if there isn't
-	 * one running right now.
-	 */
-	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
 	if (num_written > 0 || num_written == -EIOCBQUEUED) {
 		err = generic_write_sync(file, pos, num_written);
 		if (err < 0 && num_written > 0)
-- 
1.6.5.2



Re: [PATCH 00/11 v2] Btrfs: improve write ahead log with sub transaction

2011-05-26 Thread liubo

This includes the two patches that we've discussed before.

I sent this as a whole just in case you have to patch the code by yourself. :)

thanks,
liubo

On 05/26/2011 04:19 PM, Liu Bo wrote:
 [...]



Re: [PATCH] Btrfs: don't commit the transaction if we dont have enough pinned bytes V2

2011-05-26 Thread Adrian Hunter

On 25/05/11 22:30, Josef Bacik wrote:

I noticed when running an enospc test that we would get stuck committing the
transaction in check_data_space even though we truly didn't have enough space.
So check to see if bytes_pinned is bigger than num_bytes, if it's not don't
commit the transaction.  Thanks,

Signed-off-by: Josef Bacik jo...@redhat.com
---
V1->V2: Make it so it actually compiles ;)
  fs/btrfs/extent-tree.c |7 +++
  1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c8c3184..b4f67e8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3199,6 +3199,13 @@ alloc:
 			}
 			goto again;
 		}
+
+		/*
+		 * If we have less pinned bytes than we want to allocate then
+		 * don't bother committing the transaction, it won't help us.
+		 */
+		if (data_sinfo->bytes_pinned < bytes)
+			committed = 1;
 		spin_unlock(&data_sinfo->lock);
 
 		/* commit the current transaction and try again */


I tried that patch on 2.6.39 with the following:

sudo modprobe brd rd_size=262144
sudo mkfs.btrfs /dev/ram0
sudo mkdir -p /mnt/test
sudo mount -t btrfs /dev/ram0 /mnt/test
sudo mkdir -p /mnt/test/test
sudo chown $USER /mnt/test/test
sudo chgrp $USER /mnt/test/test
sudo umount /mnt/test
i=0
while true; do
sudo mount -t btrfs /dev/ram0 /mnt/test
fsstress -c -r -d /mnt/test/test -p 3 -n 1000 -l 10
sudo umount /mnt/test
i=`expr $i \+ 1`
echo $i
done


After 3 iterations it got really slow and then after some minutes it
still seems to lock up:

[ 2059.881182] SysRq : Show Blocked State
[ 2059.881188]   task                        PC stack   pid father
[ 2059.881226] btrfs-transacti D 000100046afc 0  2212  2 0x0080
[ 2059.881233]  880181483d60 0046 0003 8801
[ 2059.881239]  00013a80 00013a80 00013a80 880184dac530
[ 2059.881244]  00013a80 880181483fd8 00013a80 00013a80
[ 2059.881249] Call Trace:
[ 2059.881273]  [a02e8780] wait_for_commit.clone.14+0x90/0xd5 [btrfs]
[ 2059.881280]  [810692ab] ? wake_up_bit+0x2a/0x2a
[ 2059.881297]  [a02e95b4] btrfs_commit_transaction+0x102/0x665 [btrfs]
[ 2059.881312]  [a02e8eae] ? join_transaction.clone.21+0x180/0x18b [btrfs]
[ 2059.881316]  [810692ab] ? wake_up_bit+0x2a/0x2a
[ 2059.881331]  [a02e402e] transaction_kthread+0x17a/0x22f [btrfs]
[ 2059.881345]  [a02e3eb4] ? btrfs_congested_fn+0x82/0x82 [btrfs]
[ 2059.881349]  [81068dce] kthread+0x82/0x8a
[ 2059.881355]  [8147db64] kernel_thread_helper+0x4/0x10
[ 2059.881359]  [81068d4c] ? kthread_worker_fn+0x14b/0x14b
[ 2059.881364]  [8147db60] ? gs_change+0x13/0x13
[ 2059.881366] flush-btrfs-5   D 0001000f78a6 0  2219  2 0x0080
[ 2059.881371]  8801828efb10 0046 0754 8801
[ 2059.881376]  00013a80 00013a80 00013a80 8801a53fc530
[ 2059.881382]  00013a80 8801828effd8 00013a80 00013a80
[ 2059.881386] Call Trace:
[ 2059.881391]  [814750a4] schedule_timeout+0x36/0xe3
[ 2059.881396]  [8107b6c2] ? arch_local_irq_save+0x18/0x1e
[ 2059.881400]  [814762bf] ? _raw_spin_unlock_irqrestore+0x17/0x19
[ 2059.881404]  [8106953e] ? prepare_to_wait+0x6c/0x79
[ 2059.881419]  [a02e9764] btrfs_commit_transaction+0x2b2/0x665 [btrfs]
[ 2059.881423]  [810692ab] ? wake_up_bit+0x2a/0x2a
[ 2059.881440]  [a02f2d76] btrfs_write_inode+0x9b/0xa3 [btrfs]
[ 2059.881445]  [81138d78] writeback_single_inode+0x125/0x1bf
[ 2059.881450]  [81139055] writeback_sb_inodes+0xce/0x160
[ 2059.881454]  [81139d68] wb_writeback+0x257/0x3a7
[ 2059.881459]  [81139f40] wb_do_writeback+0x88/0x1a0
[ 2059.881464]  [8105b0a1] ? run_timer_softirq+0x298/0x298
[ 2059.881468]  [8113a0e3] bdi_writeback_thread+0x8b/0x20c
[ 2059.881473]  [8113a058] ? wb_do_writeback+0x1a0/0x1a0
[ 2059.881476]  [81068dce] kthread+0x82/0x8a
[ 2059.881481]  [8147db64] kernel_thread_helper+0x4/0x10
[ 2059.881485]  [81068d4c] ? kthread_worker_fn+0x14b/0x14b
[ 2059.881489]  [8147db60] ? gs_change+0x13/0x13
[ 2059.881493] fsstress        D 8801a23f8000 0  2304  1 0x0084
[ 2059.881497]  880189c93d48 0082  0400
[ 2059.881502]  00013a80 00013a80 00013a80 88017bc59710
[ 2059.881507]  00013a80 880189c93fd8 00013a80 00013a80
[ 2059.881512] Call Trace:
[ 2059.881517]  [8103dadf] ? resched_task+0x48/0x72
[ 2059.881522]  [814750a4] schedule_timeout+0x36/0xe3
[ 

Re: [PATCH v1 3/5] btrfs: initial readahead code and prototypes

2011-05-26 Thread David Sterba
Hi,

On Mon, May 23, 2011 at 02:59:06PM +0200, Arne Jansen wrote:
 +static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
 +					  struct btrfs_device *dev, u64 logical,
 +					  struct btrfs_multi_bio *multi)
 +{
 +	int ret;
 +	int looped = 0;
 +	struct reada_zone *zone;
 +	struct btrfs_block_group_cache *cache = NULL;
 +	u64 start;
 +	u64 end;
 +	int i;
 +
 +again:
 +	zone = NULL;
 +	spin_lock(&fs_info->reada_lock);
 +	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
 +				     logical >> PAGE_CACHE_SHIFT, 1);
 +	if (ret == 1)
 +		kref_get(&zone->refcnt);
 +	spin_unlock(&fs_info->reada_lock);
 +
 +	if (ret == 1) {
 +		if (logical >= zone->start && logical < zone->end)
 +			return zone;
 +		spin_lock(&fs_info->reada_lock);
 +		reada_zone_put(zone);
 +		spin_unlock(&fs_info->reada_lock);
 +	}
 +
 +	if (looped)
 +		return NULL;
 +
 +	cache = btrfs_lookup_block_group(fs_info, logical);
 +	if (!cache)
 +		return NULL;
 +
 +	start = cache->key.objectid;
 +	end = start + cache->key.offset - 1;
 +	btrfs_put_block_group(cache);
 +
 +	zone = kzalloc(sizeof(*zone), GFP_NOFS);
 +	if (!zone)
 +		return NULL;
 +
 +	zone->start = start;
 +	zone->end = end;
 +	INIT_LIST_HEAD(&zone->list);
 +	spin_lock_init(&zone->lock);
 +	zone->locked = 0;
 +	kref_init(&zone->refcnt);
 +	zone->elems = 0;
 +	zone->device = dev; /* our device always sits at index 0 */
 +	for (i = 0; i < multi->num_stripes; ++i) {
 +		/* bounds have already been checked */
 +		zone->devs[i] = multi->stripes[i].dev;
 +	}
 +	zone->ndevs = multi->num_stripes;
 +
 +	spin_lock(&fs_info->reada_lock);
 +	ret = radix_tree_insert(&dev->reada_zones,
 +				(unsigned long)zone->end >> PAGE_CACHE_SHIFT,
 +				zone);

this can sleep inside a spinlock, you initialize the radix tree with
GFP_NOFS, which allows __GFP_WAIT.

Options:
1) use GFP_ATOMIC in radix tree init flags
2) do the radix_tree_preload/radix_tree_preload_end, GFP_NOFS outside of the
locked section is ok but __GFP_WAIT has to be masked out (else radix
tree insert will not use the preloaded node)
3) unmask __GFP_WAIT from radix tree init flags

I'd go for 3, as the atomic context is not required, and is easier
than 2 to implement.
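
For reference, option 2 would look roughly like this (a sketch against the
quoted code; it assumes dev->reada_zones was initialized with __GFP_WAIT
cleared, e.g. INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT),
otherwise the insert won't fall back to the preloaded node):

	ret = radix_tree_preload(GFP_NOFS); /* may sleep, so outside the lock */
	if (ret)
		return NULL;
	spin_lock(&fs_info->reada_lock);
	ret = radix_tree_insert(&dev->reada_zones,
				(unsigned long)zone->end >> PAGE_CACHE_SHIFT,
				zone);
	spin_unlock(&fs_info->reada_lock);
	radix_tree_preload_end();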

 +	spin_unlock(&fs_info->reada_lock);
 +
 +	if (ret) {
 +		kfree(zone);
 +		looped = 1;
 +		goto again;
 +	}
 +
 +	return zone;
 +}


Re: [PATCH 1/2] fs: add a DCACHE_NEED_LOOKUP flag for d_flags V2

2011-05-26 Thread Al Viro
On Mon, May 23, 2011 at 12:13:07PM -0400, Josef Bacik wrote:
 On 05/21/2011 10:11 PM, Al Viro wrote:
  On Fri, May 20, 2011 at 01:44:30PM -0400, Josef Bacik wrote:
  +	if (unlikely(d_need_lookup(dentry))) {
  +		if (nameidata_dentry_drop_rcu(nd, dentry))
  +			return -ECHILD;
  +		dput(dentry);
  +		dentry = NULL;
  +		goto retry;
  
  Yecchhh...  How about simple goto unlazy; here instead and doing the rest
  there?  Especially since you have the same kind of thing elsewhere in the
  same sucker.  It had been bloody painful to untangle that thing; let's not
  add to the rat's nest that still remains...
 
 This is where I was a little confused, which is why I added this code.  It
 seems that having goto unlazy; will mean that we will come down to this
 section
 
 if (unlikely(status <= 0)) {
 	if (status < 0) {
 		dput(dentry);
 		return status;
 	}
 	if (!d_invalidate(dentry)) {
 		dput(dentry);
 		dentry = NULL;
 		need_reval = 1;
 		goto retry;
 	}
 }
 
 and d_invalidate will unhash us so we won't find our new dentry in the cache
 which makes this whole exercise useless.  Is there a different way you'd
 like this cleaned up?  Thanks,

Not *into* the loop; just before the beginning of that loop.  IOW, put
	if (dentry && unlikely(dentry->d_flags & DCACHE_NEED_LOOKUP)) {
		dput(dentry);
		dentry = NULL;
	}
just before retry: instead of doing it in non-lazy branch.  Voila - your
code in the lazy branch becomes
	if (unlikely(dentry->d_flags & DCACHE_NEED_LOOKUP))
		goto unlazy;
and that's it.  Can you resend it with such modifications?

ObMemoryPressureIssues: I really hoped to get Dave's patch (per-sb shrinkers)
in that cycle, but it'll probably have to wait for the next one...


Re: [PATCH v1 3/5] btrfs: initial readahead code and prototypes

2011-05-26 Thread Miao Xie
On thu, 26 May 2011 12:14:21 +0200, David Sterba wrote:
 Hi,
 
 On Mon, May 23, 2011 at 02:59:06PM +0200, Arne Jansen wrote:
  [...]
 
 this can sleep inside a spinlock, you initialize the radix tree with
 GFP_NOFS, which allows __GFP_WAIT.
 
 Options:
 1) use GFP_ATOMIC in radix tree init flags
 2) do the radix_tree_preload/radix_tree_preload_end, GFP_NOFS outside of the
 locked section is ok but __GFP_WAIT has to be masked out (else radix
 tree insert will not use the preloaded node)
 3) unmask __GFP_WAIT from radix tree init flags
 
 I'd go for 3, as the atomic context is not required, and is easier
 than 2 to implement.

I like the second one, because it is the general way to fix this problem.

BTW: I think we can use RCU to protect the radix tree on the read side.
Arne, what do you think about it?

Thanks
Miao

 
 



Re: [PATCH v1 3/5] btrfs: initial readahead code and prototypes

2011-05-26 Thread Arne Jansen
On 26.05.2011 12:47, Miao Xie wrote:
 On thu, 26 May 2011 12:14:21 +0200, David Sterba wrote:
 Hi,

 On Mon, May 23, 2011 at 02:59:06PM +0200, Arne Jansen wrote:
  [...]

 this can sleep inside a spinlock: you initialize the radix tree with
 GFP_NOFS, which allows __GFP_WAIT.
 
 Options:
 1) use GFP_ATOMIC in the radix tree init flags
 2) use radix_tree_preload/radix_tree_preload_end; GFP_NOFS outside of the
 locked section is ok, but __GFP_WAIT has to be masked out of the init
 flags (else radix_tree_insert will not use the preloaded node)
 3) mask __GFP_WAIT out of the radix tree init flags
 
 I'd go for 3, as the atomic context is not required, and it is easier
 than 2 to implement.
 
 I like the second one, because it is the general way to fix this problem.

I can't use the second one, as I have code that inserts into 3 trees inside
one spin_lock, and preload can only preload for a single insertion. My
intention was to use GFP_ATOMIC; I don't know how I ended up with NOFS
instead. If masking out __GFP_WAIT instead is sufficient, I'd prefer that
solution, too.

 
 BTW: I think we can use RCU to protect the radix tree on the read side.
 Arne, what do you think?

I decided against RCU, because inside the same lock I also take a ref on
the structure.
The data structures and locking are already quite complex, so I tried to
keep it as simple as possible until profiling shows that this is a
problem.
Mainly it works with a single global (well, per-filesystem) lock, but
splitting it is hard and will normally lead to more lock/unlock cycles,
so I'd prefer to only do that if it turns out not to scale.
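
For reference, the read side being discussed might look roughly like this;
it is only a sketch, and it assumes zones would be freed via RCU and their
kref raced against with atomic_inc_not_zero(), neither of which the code
above does:

    rcu_read_lock();
    ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
                                 logical >> PAGE_CACHE_SHIFT, 1);
    /* a zone whose refcount already dropped to zero is being freed */
    if (ret == 1 && !atomic_inc_not_zero(&zone->refcnt.refcount))
            ret = 0;
    rcu_read_unlock();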

Thanks,
Arne

 
 Thanks
 Miao
 

 +   spin_unlock(&fs_info->reada_lock);
 +
 +   if (ret) {
 +      kfree(zone);
 +      looped = 1;
 +      goto again;
 +   }
 +
 +   return zone;
 +}


Re: [PATCH] Btrfs: don't commit the transaction if we dont have enough pinned bytes V2

2011-05-26 Thread Josef Bacik
On 05/26/2011 04:57 AM, Adrian Hunter wrote:
 On 25/05/11 22:30, Josef Bacik wrote:
 I noticed when running an enospc test that we would get stuck committing
 the transaction in check_data_space even though we truly didn't have
 enough space.  So check to see if bytes_pinned is bigger than num_bytes;
 if it's not, don't commit the transaction.  Thanks,

 Signed-off-by: Josef Bacik jo...@redhat.com
 ---
 V1->V2: Make it so it actually compiles ;)
   fs/btrfs/extent-tree.c |    7 +++
   1 files changed, 7 insertions(+), 0 deletions(-)

 diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
 index c8c3184..b4f67e8 100644
 --- a/fs/btrfs/extent-tree.c
 +++ b/fs/btrfs/extent-tree.c
 @@ -3199,6 +3199,13 @@ alloc:
   }
   goto again;
   }
 +
 +   /*
 +    * If we have less pinned bytes than we want to allocate then
 +    * don't bother committing the transaction, it won't help us.
 +    */
 +   if (data_sinfo->bytes_pinned < bytes)
 +      committed = 1;
     spin_unlock(&data_sinfo->lock);

   /* commit the current transaction and try again */
 
 I tried that patch on 2.6.39 with the following:
 
 sudo modprobe brd rd_size=262144
 sudo mkfs.btrfs /dev/ram0
 sudo mkdir -p /mnt/test
 sudo mount -t btrfs /dev/ram0 /mnt/test
 sudo mkdir -p /mnt/test/test
 sudo chown $USER /mnt/test/test
 sudo chgrp $USER /mnt/test/test
 sudo umount /mnt/test
 i=0
 while true; do
 sudo mount -t btrfs /dev/ram0 /mnt/test
 fsstress -c -r -d /mnt/test/test -p 3 -n 1000 -l 10
 sudo umount /mnt/test
 i=`expr $i \+ 1`
 echo $i
 done
 
 
 After 3 iterations it got really slow and then after some minutes it
 still seems to lock up:
 

Did you run without my patch?  I assume this will still happen even
without it.  The only possible negative side-effect of my patch is that
we could hit ENOSPC early.  Thanks,

Josef


[PATCH 1/2] fs: add a DCACHE_NEED_LOOKUP flag for d_flags

2011-05-26 Thread Josef Bacik
Btrfs (and I'd venture most other fs's) stores its indexes in nice disk order
for readdir, but unfortunately anything that stats the files in the order
readdir returns them (like, oh say, ls) still has to do the normal lookup of
the file, which means looking up our other index and then looking up the
inode.  What I want is a way to create dummy dentries when we find them in
readdir, so that when ls or anything else subsequently does a stat(), we
already have the location information in the dentry and can go straight to
the inode itself.  The lookup code just assumes that if it finds a dentry it
is done; it doesn't perform a lookup.  So add a DCACHE_NEED_LOOKUP flag so
that the lookup code knows it still needs to run i_op->lookup() on the
parent to get the inode for the dentry.  I have tested this with btrfs and I
went from something that looks like this

http://people.redhat.com/jwhiter/ls-noreada.png

To this

http://people.redhat.com/jwhiter/ls-good.png

That's a savings of 1300 seconds, or about 22 minutes.  That is a significant savings.
Thanks,

Signed-off-by: Josef Bacik jo...@redhat.com
---
 fs/dcache.c            |   34 ++-
 fs/namei.c             |   51 
 include/linux/dcache.h |    7 ++
 3 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 22a0ef4..7fc0e30 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -343,6 +343,24 @@ void d_drop(struct dentry *dentry)
 EXPORT_SYMBOL(d_drop);
 
 /*
+ * d_clear_need_lookup - drop a dentry from cache and clear the need lookup flag
+ * @dentry: dentry to drop
+ *
+ * This is called when we do a lookup on a placeholder dentry that needed to be
+ * looked up.  The dentry should have been hashed in order for it to be found by
+ * the lookup code, but now needs to be unhashed while we do the actual lookup
+ * and clear the DCACHE_NEED_LOOKUP flag.
+ */
+void d_clear_need_lookup(struct dentry *dentry)
+{
+   spin_lock(&dentry->d_lock);
+   __d_drop(dentry);
+   dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
+   spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL(d_clear_need_lookup);
+
+/*
  * Finish off a dentry we've decided to kill.
 * dentry->d_lock must be held, returns with it unlocked.
  * If ref is non-zero, then decrement the refcount too.
@@ -431,8 +449,13 @@ repeat:
if (d_unhashed(dentry))
goto kill_it;
 
-   /* Otherwise leave it cached and ensure it's on the LRU */
-   dentry->d_flags |= DCACHE_REFERENCED;
+   /*
+    * If this dentry needs lookup, don't set the referenced flag so that it
+    * is more likely to be cleaned up by the dcache shrinker in case of
+    * memory pressure.
+    */
+   if (!d_need_lookup(dentry))
+      dentry->d_flags |= DCACHE_REFERENCED;
dentry_lru_add(dentry);
 
   dentry->d_count--;
@@ -1703,6 +1726,13 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
}
 
/*
+    * We are going to instantiate this dentry, unhash it and clear the
+    * lookup flag so we can do that.
+    */
+   if (unlikely(d_need_lookup(found)))
+      d_clear_need_lookup(found);
+
+   /*
 * Negative dentry: instantiate it unless the inode is a directory and
 * already has a dentry.
 */
diff --git a/fs/namei.c b/fs/namei.c
index e3c4f11..fc8bc60 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1198,6 +1198,30 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent,
 }
 
 /*
+ * We already have a dentry, but require a lookup to be performed on the parent
+ * directory to fill in d_inode. Returns the new dentry, or ERR_PTR on error.
 * parent->d_inode->i_mutex must be held. d_lookup must have verified that no
+ * child exists while under i_mutex.
+ */
+static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentry,
+                                     struct nameidata *nd)
+{
+   struct inode *inode = parent->d_inode;
+   struct dentry *old;
+
+   /* Don't create child dentry for a dead directory. */
+   if (unlikely(IS_DEADDIR(inode)))
+      return ERR_PTR(-ENOENT);
+
+   old = inode->i_op->lookup(inode, dentry, nd);
+   if (unlikely(old)) {
+      dput(dentry);
+      dentry = old;
+   }
+   return dentry;
+}
+
+/*
  *  It's more convoluted than I'd like it to be, but... it's still fairly
  *  small and for now I'd prefer to have fast path as straight as possible.
  *  It _is_ time-critical.
@@ -1236,6 +1260,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
goto unlazy;
}
}
+   if (unlikely(d_need_lookup(dentry)))
+      goto unlazy;
   path->mnt = mnt;
   path->dentry = dentry;
if (likely(__follow_mount_rcu(nd, 

Re: [PATCH 2/2] Btrfs: load the key from the dir item in readdir into a fake dentry

2011-05-26 Thread Josef Bacik
On 05/26/2011 02:50 PM, Andi Kleen wrote:
 Josef Bacik jo...@redhat.com writes:
 +
 +         newkey = kzalloc(sizeof(struct btrfs_key),
 +                          GFP_NOFS);
 +         if (!newkey)
 +            goto no_dentry;
 +         tmp = d_alloc(filp->f_dentry, &q);
 
 This doesn't seem to address the "find / fills all memory with dentries"
 concerns brought up earlier at all.
 

Nope, this part of patch 1/2 does:

+   /*
+    * If this dentry needs lookup, don't set the referenced flag so that it
+    * is more likely to be cleaned up by the dcache shrinker in case of
+    * memory pressure.
+    */
+   if (!d_need_lookup(dentry))
+      dentry->d_flags |= DCACHE_REFERENCED;


 d_alloc uses a normal GFP_KERNEL, which is quite inappropriate for this.
 
 It should at least reclaim and probably more, but even then it's
 risky.
 

Ah yeah I guess I should have probably used GFP_KERNEL.  Sorry about that,

Josef


Re: [PATCH 2/2] Btrfs: load the key from the dir item in readdir into a fake dentry

2011-05-26 Thread Andi Kleen
On Thu, May 26, 2011 at 03:02:42PM -0400, Josef Bacik wrote:
 + /*
 +  * If this dentry needs lookup, don't set the referenced flag so that it
 +  * is more likely to be cleaned up by the dcache shrinker in case of
 +  * memory pressure.
 +  */
 + if (!d_need_lookup(dentry))
 +    dentry->d_flags |= DCACHE_REFERENCED;

No it doesn't at all. The allocation will just push everything else
out.

Really you cannot view this by only looking at the dcache.
You have to look at the complete VM behaviour. All the caches
and the other memory interact.
 
 
  d_alloc uses a normal GFP_KERNEL, which is quite inappropriate for this.
  
  It should at least reclaim and probably more, but even then it's
  risky.
  
 
 Ah yeah I guess I should have probably used GFP_KERNEL.  Sorry about that,

GFP_KERNEL is already used, but it's wrong. I'm not sure any
of the existing GFP_* flags will give you the semantics you
need in fact. The new flag Minchan added for readahead may come
near, but even that is probably not enough.

-Andi

-- 
a...@linux.intel.com -- Speaking for myself only.


Re: [PATCH 2/2] Btrfs: load the key from the dir item in readdir into a fake dentry

2011-05-26 Thread Josef Bacik
On 05/26/2011 04:03 PM, Andi Kleen wrote:
 On Thu, May 26, 2011 at 03:02:42PM -0400, Josef Bacik wrote:
 +/*
 + * If this dentry needs lookup, don't set the referenced flag so that it
 + * is more likely to be cleaned up by the dcache shrinker in case of
 + * memory pressure.
 + */
 +if (!d_need_lookup(dentry))
 +   dentry->d_flags |= DCACHE_REFERENCED;
 
 No it doesn't at all. The allocation will just push everything else
 out.
 
 Really you cannot view this by only looking at the dcache.
 You have to look at the complete VM behaviour. All the caches
 and the other memory interact.

Agreed, but this makes it monumentally easier to push these entries out
of the cache, which is the best I can do with what I have for now.



 d_alloc uses a normal GFP_KERNEL, which is quite in appropiate for this.

 It should at least reclaim and probably more, but even then it's
 risky.


 Ah yeah I guess I should have probably used GFP_KERNEL.  Sorry about that,
 
 GFP_KERNEL is already used, but it's wrong. I'm not sure any
 of the existing GFP_* flags will give you the semantics you
 need in fact. The new flag Minchan added for readahead may come
 near, but even that is probably not enough.

Yeah, if there were a GFP_DONTTRYTOHARD I would use it, but there isn't.
 Maybe I'll code that up.  Thanks,

Josef


Re: [PATCH 2/2] Btrfs: load the key from the dir item in readdir into a fake dentry

2011-05-26 Thread Andreas Dilger
On May 26, 2011, at 14:03, Andi Kleen wrote:
 On Thu, May 26, 2011 at 03:02:42PM -0400, Josef Bacik wrote:
 +/*
 + * If this dentry needs lookup, don't set the referenced flag so that it
 + * is more likely to be cleaned up by the dcache shrinker in case of
 + * memory pressure.
 + */
 +if (!d_need_lookup(dentry))
 +   dentry->d_flags |= DCACHE_REFERENCED;
 
 No it doesn't at all. The allocation will just push everything else
 out.
 
 Really you cannot view this by only looking at the dcache.
 You have to look at the complete VM behaviour. All the caches
 and the other memory interact.

Even without this patch, if you are doing "find /" there will be considerable
memory pressure from all of the pages that are accessed by readdir(), and to a
lesser extent from the directory dentry/inode entries.  I'm not sure whether
the issue you raise is going to be significant in the end or not.


Taking this development a bit further, I've long thought it would be quite
interesting if these dcache entries could be linked into a readdir-ordered
linked list, with the actual directory pages then discarded from cache
entirely.  This would potentially allow later readdir operations to work
just by walking the readdir-ordered linked list and not touching the page
cache at all.

Since normal operations like ls -l currently instantiate both a pagecache for
readdir and a dentry for each dirent, it could potentially even reduce memory
usage if there were no need to keep the directory pages in cache anymore.
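
As a rough sketch of that idea (struct and field names are invented for
illustration; nothing like this exists in the dcache today):

    #include <linux/list.h>
    #include <linux/spinlock.h>
    #include <linux/types.h>

    /* Hypothetical per-entry link: each cached child sits in its
     * parent's list at its on-disk readdir position. */
    struct readdir_link {
            struct list_head node;  /* position in readdir order */
            u64 offset;             /* readdir cookie to resume from */
    };

    /* Hypothetical per-directory head: a later readdir would walk this
     * list instead of re-reading the directory's page cache. */
    struct readdir_order {
            struct list_head entries;       /* readdir_link list */
            spinlock_t lock;                /* protects membership */
    };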

 d_alloc uses a normal GFP_KERNEL, which is quite inappropriate for this.
 
 It should at least reclaim and probably more, but even then it's
 risky.
 
 
 Ah yeah I guess I should have probably used GFP_KERNEL.  Sorry about that,
 
 GFP_KERNEL is already used, but it's wrong. I'm not sure any
 of the existing GFP_* flags will give you the semantics you
 need in fact. The new flag Minchan added for readahead may come
 near, but even that is probably not enough.
 
 -Andi
 
 -- 
 a...@linux.intel.com -- Speaking for myself only.


Cheers, Andreas







strange btrfs sub list output

2011-05-26 Thread Stephane Chazelas
Hiya,

I get a btrfs sub list output that I don't understand:

# btrfs sub list /backup/
ID 257 top level 5 path u1/linux/lvm+btrfs/storage/data/data
ID 260 top level 5 path u2/linux/lvm/linux/var/data
ID 262 top level 5 path u1/linux/lvm+btrfs/storage/data/snapshots/2010-10-11
ID 263 top level 5 path u2/linux/lvm/linux/home/snapshots/2011-04-07
ID 264 top level 5 path u2/linux/lvm/linux/root/snapshots/2011-04-07
ID 265 top level 5 path u2/linux/lvm/linux/var/snapshots/2011-04-07
ID 266 top level 5 path u1/linux/lvm+btrfs/storage/data/snapshots/2010-10-26
ID 267 top level 5 path u1/linux/lvm+btrfs/storage/data/snapshots/2010-11-08
ID 268 top level 5 path u1/linux/lvm+btrfs/storage/data/snapshots/2010-11-22
ID 269 top level 5 path u1/linux/lvm+btrfs/storage/data/snapshots/2010-12-15
ID 270 top level 5 path u2/linux/lvm/linux/home/snapshots/2011-04-14
ID 271 top level 5 path u2/linux/lvm/linux/root/snapshots/2011-04-14
ID 272 top level 5 path u2/linux/lvm/linux/var/snapshots/2011-04-14
ID 273 top level 5 path u1/linux/lvm+btrfs/storage/data/snapshots/2010-12-29
ID 274 top level 5 path u1/linux/lvm+btrfs/storage/data/snapshots/2011-01-26
ID 275 top level 5 path u1/linux/lvm+btrfs/storage/data/snapshots/2011-03-07
ID 276 top level 5 path u1/linux/lvm+btrfs/storage/data/snapshots/2011-04-01
ID 277 top level 5 path u2/linux/lvm/linux/home/data
ID 278 top level 5 path u2/linux/lvm/linux/home/snapshots/2011-04-27
ID 279 top level 5 path u2/linux/lvm/linux/root/snapshots/2011-04-27
ID 280 top level 5 path u2/linux/lvm/linux/var/snapshots/2011-04-27
ID 281 top level 5 path u3:10022/vm+xfs@u9/xvda1/g1/v4/data
ID 282 top level 5 path u3:10022/vm+xfs@u9/xvda1/g1/v4/snapshots/2011-05-19
ID 283 top level 5 path u5/vm+xfs@u9/xvda1/g1/v5/data
ID 284 top level 5 path u6:10022/vm+xfs@u8/xvda1/g8/v3/data
ID 286 top level 5 path u5/vm+xfs@u9/xvda1/g1/v5/snapshots/2011-05-24
ID 287 top level 285 path data
ID 288 top level 5 path u4/vm+xfs@u9/xvda1/g1/v1/data
ID 289 top level 5 path u4/vm+xfs@u9/xvda1/g1/v1/snapshots/2011-03-11
ID 290 top level 5 path u4/vm+xfs@u9/xvda1/g1/v2/data
ID 291 top level 5 path u4/vm+xfs@u9/xvda1/g1/v2/snapshots/2011-05-11
ID 292 top level 5 path u4/vm+xfs@u9/xvda1/g1/v1/snapshots/2011-05-11

See ID 287 above.

There is no /backup/data directory. There is, however, a
/backup/u6:10022/vm+xfs@u8/xvda1/g8/v3/snapshots/2011-03-30 that
contains the same thing as what I get if I mount the fs with
subvolid=287. And I did do a btrfs sub snap data
snapshots/2011-03-30 there.

What could be the cause of that? How to fix it?

In case that matters, there used to be more components in the
path of u6:10022/vm+xfs@u8/xvda1/g8/v3/data.

Thanks,
Stephane