[PATCH 1/3] Btrfs: fix btrfsck error 400 when truncating a compressed file extent

2012-01-05 Thread Miao Xie
Reproduce steps:
 # mkfs.btrfs /dev/sdb5
 # mount /dev/sdb5 -o compress=lzo /mnt
 # dd if=/dev/zero of=/mnt/tmpfile bs=128K count=1
 # sync
 # truncate -s 64K /mnt/tmpfile
 root 5 inode 257 errors 400

This is because of the wrong if condition, which is used to check if we should
subtract the bytes of the dropped range from i_blocks/i_bytes of i-node or not.
When we truncate a compressed extent, btrfs subtracts the bytes of the whole
extent, which is wrong. We should subtract the real size that we truncate, no
matter whether it is a compressed extent or not. Fix it.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/inode.c |8 +---
 1 files changed, 1 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 13b0542..85e2312 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3009,7 +3009,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle 
*trans,
int pending_del_nr = 0;
int pending_del_slot = 0;
int extent_type = -1;
-   int encoding;
int ret;
int err = 0;
u64 ino = btrfs_ino(inode);
@@ -3059,7 +3058,6 @@ search_again:
leaf = path-nodes[0];
btrfs_item_key_to_cpu(leaf, found_key, path-slots[0]);
found_type = btrfs_key_type(found_key);
-   encoding = 0;
 
if (found_key.objectid != ino)
break;
@@ -3072,10 +3070,6 @@ search_again:
fi = btrfs_item_ptr(leaf, path-slots[0],
struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
-   encoding = btrfs_file_extent_compression(leaf, fi);
-   encoding |= btrfs_file_extent_encryption(leaf, fi);
-   encoding |= btrfs_file_extent_other_encoding(leaf, fi);
-
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
item_end +=
btrfs_file_extent_num_bytes(leaf, fi);
@@ -3103,7 +3097,7 @@ search_again:
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
u64 num_dec;
extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
-   if (!del_item && !encoding) {
+   if (!del_item) {
u64 orig_num_bytes =
btrfs_file_extent_num_bytes(leaf, fi);
extent_num_bytes = new_size -
-- 
1.7.6.4
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] Btrfs: make btrfs_truncate_inode_items() more readable

2012-01-05 Thread Miao Xie
As the title said, this patch just make the functions of the truncation
more readable.

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/inode.c |  289 ++
 1 files changed, 159 insertions(+), 130 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 85e2312..df6060f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2977,10 +2977,142 @@ out:
return err;
 }
 
+static int btrfs_release_and_test_inline_data_extent(
+   struct btrfs_root *root,
+   struct inode *inode,
+   struct extent_buffer *leaf,
+   struct btrfs_file_extent_item *fi,
+   u64 offset,
+   u64 new_size)
+{
+   u64 item_end;
+
+   item_end = offset + btrfs_file_extent_inline_len(leaf, fi) - 1;
+
+   if (item_end < new_size)
+   return 0;
+
+   /*
+* Truncate inline items is special, we have done it by
+*   btrfs_truncate_page();
+*/
+   if (offset < new_size)
+   return 0;
+
+   if (root-ref_cows)
+   inode_sub_bytes(inode, item_end + 1 - offset);
+
+   return 1;
+}
+
 /*
- * this can truncate away extent items, csum items and directory items.
- * It starts at a high offset and removes keys until it can't find
- * any higher than new_size
+ * If this function return 1, it means this item can be dropped directly.
+ * If 0 is returned, the item can not be dropped.
+ */
+static int btrfs_release_and_test_data_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct inode *inode,
+ u64 offset,
+ u64 new_size)
+{
+   struct extent_buffer *leaf;
+   struct btrfs_file_extent_item *fi;
+   u64 extent_start;
+   u64 extent_offset;
+   u64 item_end;
+   u64 ino = btrfs_ino(inode);
+   u64 orig_nbytes;
+   u64 new_nbytes;
+   int extent_type;
+   int ret;
+
+   leaf = path-nodes[0];
+   fi = btrfs_item_ptr(leaf, path-slots[0],
+   struct btrfs_file_extent_item);
+
+   extent_type = btrfs_file_extent_type(leaf, fi);
+   if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+   return btrfs_release_and_test_inline_data_extent(root, inode,
+leaf, fi,
+offset,
+new_size);
+
+   item_end = offset + btrfs_file_extent_num_bytes(leaf, fi) - 1;
+
+   /*
+* If the new size is beyond the end of the extent:
+*   +--+
+*   |  |
+*   +--+
+*^ new size
+* so the extent should not be dropped or truncated.
+*/
+   if (item_end < new_size)
+   return 0;
+
+   extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
+   if (offset < new_size) {
+   /*
+* If the new size is in the extent:
+*   +--+
+*   |  |
+*   +--+
+*  ^ new size
+* so this extent should be truncated, not be dropped directly.
+*/
+   orig_nbytes = btrfs_file_extent_num_bytes(leaf, fi);
+   new_nbytes = round_up(new_size - offset, root-sectorsize);
+
+   btrfs_set_file_extent_num_bytes(leaf, fi, new_nbytes);
+
+   if (extent_start != 0  root-ref_cows)
+   inode_sub_bytes(inode, orig_nbytes - new_nbytes);
+
+   btrfs_mark_buffer_dirty(leaf);
+   return 0;
+   } else {
+   /*
+* If the new size is in front of the extent:
+*   +--+
+*   |  |
+*   +--+
+*  ^ new size
+* so this extent should be dropped.
+*/
+
+   /*
+* It is a dummy extent, or it is in log tree, we needn't do
+* anything, just drop it.
+*/
+   if (extent_start == 0 ||
+   !(root-ref_cows || root == root-fs_info-tree_root))
+   return 1;
+
+   /* If this file is not a free space management file... */
+   /* 

[RFC][PATCH 3/3] Btrfs: improve truncation of btrfs

2012-01-05 Thread Miao Xie
The original truncation of btrfs has a bug, that is the orphan item will not be
dropped when the truncation fails. This bug will trigger BUG() when unlink that
truncated file. And besides that, if the user does pre-allocation for the file
which is truncated unsuccessfully, after re-mount(umount-mount, not -o remount),
the pre-allocated extent will be dropped.

This patch modified the relative functions of the truncation, and makes the
truncation update i_size and disk_i_size of i-nodes every time we drop the file
extent successfully, and set them to the real value. By this way, we needn't
add orphan items to guarantee the consistency of the meta-data.

By this patch, it is possible that the file may not be truncated to the size
that the user expects (it may be <= the original size and >= the expected one), so I
think it is better that we shouldn't lose the data that lies within the range
[the expected size, the real size], because the user may take it for granted
that the data in that extent is not lost. In order to implement it, we just
write out all the dirty pages which are beyond the expected size of the file.
This operation will spend lots of time if there are many dirty pages. It is
also the only disadvantage of this patch. (Maybe I'm overcautious, we needn't
hold that data.)

Signed-off-by: Miao Xie mi...@cn.fujitsu.com
---
 fs/btrfs/inode.c |  159 +-
 1 files changed, 49 insertions(+), 110 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index df6060f..9ace01b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -88,7 +88,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT  S_SHIFT] = {
 };
 
 static int btrfs_setsize(struct inode *inode, loff_t newsize);
-static int btrfs_truncate(struct inode *inode);
+static int btrfs_truncate(struct inode *inode, loff_t newsize);
 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
 static noinline int cow_file_range(struct inode *inode,
   struct page *locked_page,
@@ -2230,7 +2230,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 * btrfs_delalloc_reserve_space to catch offenders.
 */
mutex_lock(inode-i_mutex);
-   ret = btrfs_truncate(inode);
+   ret = btrfs_truncate(inode, inode-i_size);
mutex_unlock(inode-i_mutex);
} else {
nr_unlink++;
@@ -2993,7 +2993,7 @@ static int btrfs_release_and_test_inline_data_extent(
return 0;
 
/*
-* Truncate inline items is special, we have done it by
+* Truncate inline items is special, we will do it by
 *   btrfs_truncate_page();
 */
if (offset  new_size)
@@ -3121,9 +3121,9 @@ static int btrfs_release_and_test_data_extent(struct 
btrfs_trans_handle *trans,
  * will kill all the items on this inode, including the INODE_ITEM_KEY.
  */
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
-   struct btrfs_root *root,
-   struct inode *inode,
-   u64 new_size, u32 min_type)
+  struct btrfs_root *root,
+  struct inode *inode,
+  u64 new_size, u32 min_type)
 {
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -3131,6 +3131,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle 
*trans,
struct btrfs_key found_key;
u64 mask = root-sectorsize - 1;
u64 ino = btrfs_ino(inode);
+   u64 old_size = i_size_read(inode);
u32 found_type;
int pending_del_nr = 0;
int pending_del_slot = 0;
@@ -3138,6 +3139,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle 
*trans,
int err = 0;
 
BUG_ON(new_size  0  min_type != BTRFS_EXTENT_DATA_KEY);
+   BUG_ON(new_size  mask);
 
path = btrfs_alloc_path();
if (!path)
@@ -3190,6 +3192,13 @@ search_again:
ret = btrfs_release_and_test_data_extent(trans, root,
path, inode, found_key.offset,
new_size);
+   if (root-ref_cows ||
+   root == root-fs_info-tree_root) {
+   if (ret  found_key.offset  old_size)
+   i_size_write(inode, found_key.offset);
+   else if (!ret)
+   i_size_write(inode, new_size);
+   }
if (!ret)
break;
}
@@ -3247,12 +3256,10 @@ out:
 static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 {
struct inode *inode = mapping-host;
-   

Why does Btrfs allow raid1 with mismatched drives? Also: How to look behind the curtain

2012-01-05 Thread Fabian Zeindl
Hi,

 the subject is pretty self-explanatory. I'm creating a btrfs using

sudo mkfs.btrfs -m raid1 -d raid1 smalldisk largedisk

it creates the fs, apparently with the size of the larger disk, no matter in 
which order i supply the disk-arguments.
How can this be correct?

Is there some way like cat /proc/mdstat to see what btrfs is doing and to 
assure myself my raid1 is secure? It's not
terribly important data, hence i'm trying btrfs, but i don't want to lose it 
either.

I posted this question on stackexchange as well: 
http://unix.stackexchange.com/questions/28357/why-does-btrfs-allow-to-create-a-raid1-with-mismatched-drives

Please CC me in any replies.

Regards
Fabian Zeindl--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Why does Btrfs allow raid1 with mismatched drives? Also: How to look behind the curtain

2012-01-05 Thread Fabian Zeindl
On Jan 5, 2012, at 10:21 , Fabian Zeindl wrote:
 it creates the fs, apparently with the size of the larger disk, no matter in 
 which order i supply the disk-arguments.
 How can this be correct?

Edit: wrong observation here. The fs is created with the sum of the sizes of 
the two disks, though btrfs fi df shows RAID1 for metadata, system and data.

Fabian Zeindl

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Why does Btrfs allow raid1 with mismatched drives? Also: How to look behind the curtain

2012-01-05 Thread Hugo Mills
On Thu, Jan 05, 2012 at 10:21:57AM +0100, Fabian Zeindl wrote:
 Hi,
 
  the subject is pretty selfexplanatory. Im creating a btrfs using
 
 sudo mkfs.btrfs -m raid1 -d raid1 smalldisk largedisk
 
 it creates the fs, apparently with the size of the larger disk, no matter in 
 which order i supply the disk-arguments.
 How can this be correct?

   Because btrfs doesn't actually do RAID-1 (in the sense that
blocks with the same address on the two disks have identical
contents). You should probably read the mis-named Sysadmin's Guide
on the wiki[1], which explains what btrfs actually does with its
replication.

   You should also probably read the FAQ entries on free space[2],
since using plain df for btrfs is usually misleading.

 Is there some way like cat /proc/mdstat to see what btrfs is doing and to 
 assure myself my raid1 is secure? It's not
 terribly important data, hence i'm trying btrfs, but i don't want to lose it 
 either.

   You could run a scrub, which will verify all of the data mirrors on
the volume, and fix anything that's not redundant.

   Hugo.

[1] http://btrfs.ipv5.de/index.php?title=SysadminGuide
[2] 
http://btrfs.ipv5.de/index.php?title=FAQ#Why_does_df_show_incorrect_free_space_for_my_RAID_volume.3F

-- 
=== Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ===
  PGP key: 515C238D from wwwkeys.eu.pgp.net or http://www.carfax.org.uk
  --- vi: The core of evil. ---  


signature.asc
Description: Digital signature


[RFC PATCH 0/3] apply the Probabilistic Skiplist on btrfs

2012-01-05 Thread Liu Bo
Since we are inclined to apply a lockless scheme on some objects of btrfs for
higher performance, we want to build a RCU version the Probabilistic Skiplist.

Here our skiplist algorithm is based on the skiplist experiments of
Con Kolivas ker...@kolivas.org for BFS cpu scheduler.
And more details about skiplist design are in patch 1.

Right now we have a plan to apply skiplist on extent_map and extent_state.

Here we choose extent_map firstly, since it is a read mostly thing,
and the change is quite direct, all we need to do is
a) to replace rbtree with skiplist,
b) to add rcu support.
And more details are in patch 2 and patch 3.

I've done some simple tests for performance on my 2-core box, there is no
obvious difference, but I want to focus on the design side and make sure
there is no more bug in it firstly.

For long term goals, we want to ship skiplist to lib, like lib/rbtree.c.

MORE TESTS ARE WELCOME!

Liu Bo (3):
  Btrfs: add the Probabilistic Skiplist
  Btrfs: rebuild extent_map based on skiplist
  Btrfs: convert rwlock to RCU for extent_map

 fs/btrfs/Makefile  |2 +-
 fs/btrfs/compression.c |8 +-
 fs/btrfs/disk-io.c |   15 ++-
 fs/btrfs/extent_io.c   |   13 +-
 fs/btrfs/extent_map.c  |  296 ++--
 fs/btrfs/extent_map.h  |   21 +++-
 fs/btrfs/file.c|   23 +++-
 fs/btrfs/inode.c   |   69 
 fs/btrfs/ioctl.c   |8 +-
 fs/btrfs/relocation.c  |9 +-
 fs/btrfs/scrub.c   |4 +-
 fs/btrfs/skiplist.c|   98 
 fs/btrfs/skiplist.h|  217 +++
 fs/btrfs/volumes.c |   68 ++-
 14 files changed, 651 insertions(+), 200 deletions(-)
 create mode 100644 fs/btrfs/skiplist.c
 create mode 100644 fs/btrfs/skiplist.h

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 2/3] Btrfs: rebuild extent_map based on skiplist

2012-01-05 Thread Liu Bo
extent_map fits a read-mostly scenario; since we want to build
an RCU-skiplist later, we build a new version of extent_map based on
skiplist first.

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/extent_map.c |  265 +++--
 fs/btrfs/extent_map.h |   14 +++-
 fs/btrfs/volumes.c|   22 ++--
 3 files changed, 190 insertions(+), 111 deletions(-)

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7c97b33..746084c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -9,6 +9,13 @@
 
 static struct kmem_cache *extent_map_cache;
 
+static LIST_HEAD(maps);
+
+#define MAP_LEAK_DEBUG 1
+#if MAP_LEAK_DEBUG
+static DEFINE_SPINLOCK(map_leak_lock);
+#endif
+
 int __init extent_map_init(void)
 {
extent_map_cache = kmem_cache_create(extent_map,
@@ -21,6 +28,30 @@ int __init extent_map_init(void)
 
 void extent_map_exit(void)
 {
+   struct extent_map *em;
+
+#if MAP_LEAK_DEBUG
+   struct list_head *tmp;
+   int count = 0;
+
+   list_for_each(tmp, maps)
+   count++;
+
+   printk(KERN_INFO %d em is left to free\n, count);
+
+   while (!list_empty(maps)) {
+   cond_resched();
+   em = list_entry(maps.next, struct extent_map, leak_list);
+   printk(KERN_ERR btrfs extent map: start %llu, len %llu 
+   refs %d block_start %llu, block_len %llu, in_tree 
%u\n,
+em-start, em-len, atomic_read(em-refs),
+em-block_start, em-block_len, em-in_tree);
+   WARN_ON(1);
+   list_del(em-leak_list);
+   kmem_cache_free(extent_map_cache, em);
+   }
+#endif
+
if (extent_map_cache)
kmem_cache_destroy(extent_map_cache);
 }
@@ -34,7 +65,8 @@ void extent_map_exit(void)
  */
 void extent_map_tree_init(struct extent_map_tree *tree)
 {
-   tree-map = RB_ROOT;
+   tree-head.start = (-1ULL);
+   sl_init_list(tree-map, tree-head.sl_node);
rwlock_init(tree-lock);
 }
 
@@ -48,16 +80,41 @@ void extent_map_tree_init(struct extent_map_tree *tree)
 struct extent_map *alloc_extent_map(void)
 {
struct extent_map *em;
+#if MAP_LEAK_DEBUG
+   unsigned long flags;
+#endif
+
em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
if (!em)
return NULL;
em-in_tree = 0;
em-flags = 0;
em-compress_type = BTRFS_COMPRESS_NONE;
+   sl_init_node(em-sl_node);
atomic_set(em-refs, 1);
+#if MAP_LEAK_DEBUG
+   spin_lock_irqsave(map_leak_lock, flags);
+   list_add(em-leak_list, maps);
+   spin_unlock_irqrestore(map_leak_lock, flags);
+#endif
return em;
 }
 
+static inline void __free_extent_map(struct extent_map *em)
+{
+#if MAP_LEAK_DEBUG
+   unsigned long flags;
+
+   spin_lock_irqsave(map_leak_lock, flags);
+   list_del(em-leak_list);
+   spin_unlock_irqrestore(map_leak_lock, flags);
+#endif
+
+   WARN_ON(em-in_tree);
+   sl_free_node(em-sl_node);
+   kmem_cache_free(extent_map_cache, em);
+}
+
 /**
  * free_extent_map - drop reference count of an extent_map
  * @em:extent map beeing releasead
@@ -69,91 +126,113 @@ void free_extent_map(struct extent_map *em)
 {
if (!em)
return;
+
WARN_ON(atomic_read(em-refs) == 0);
-   if (atomic_dec_and_test(em-refs)) {
-   WARN_ON(em-in_tree);
-   kmem_cache_free(extent_map_cache, em);
-   }
+   if (atomic_dec_and_test(em-refs))
+   __free_extent_map(em);
 }
 
-static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
-  struct rb_node *node)
+static inline int in_entry(struct sl_node *node, u64 offset)
 {
-   struct rb_node **p = root-rb_node;
-   struct rb_node *parent = NULL;
struct extent_map *entry;
 
-   while (*p) {
-   parent = *p;
-   entry = rb_entry(parent, struct extent_map, rb_node);
+   entry = sl_entry(node, struct extent_map, sl_node);
+   if (!node-head 
+   entry-start = offset  extent_map_end(entry) - 1 = offset)
+   return 1;
+   return 0;
+}
 
-   WARN_ON(!entry-in_tree);
+static inline struct extent_map *next_entry(struct sl_node *p, int l,
+   struct sl_node **q)
+{
+   struct extent_map *ret;
+   struct sl_node *next;
 
-   if (offset  entry-start)
-   p = (*p)-rb_left;
-   else if (offset = extent_map_end(entry))
-   p = (*p)-rb_right;
-   else
-   return parent;
-   }
+   next = __sl_next_with_level(p, l);
+   ret = sl_entry(next, struct extent_map, sl_node);
+   BUG_ON(!ret);
+   *q = next;
 
-   entry = rb_entry(node, struct extent_map, rb_node);
-   entry-in_tree = 1;
-   

[RFC PATCH 1/3] Btrfs: add the Probabilistic Skiplist

2012-01-05 Thread Liu Bo
The Probabilistic Skiplist is a O(lgn) data structure, and
we want to apply this for later use, mainly for RCU-skiplist.

Note:
a) The skiplist is probabilistic, and it is the distribution of node sizes
   that is maintained, but the strict order is not required[1].

b) This skiplist cannot be resized once it is created,
   so here is a level limit 16 and an associated (and fixed) probability 0.25
   that determines the distribution of nodes[1].

c) The level limit may need to be adjusted.
   I know it is a magic number, but now for simplicity we just keep it at 16,
   and then each skiplist is able to contain (2^32-1)/3 nodes at most.

[1] 
http://www.csee.umbc.edu/courses/undergraduate/341/fall01/Lectures/SkipLists/skip_lists/skip_lists.html

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/Makefile   |2 +-
 fs/btrfs/skiplist.c |   98 
 fs/btrfs/skiplist.h |  210 +++
 3 files changed, 309 insertions(+), 1 deletions(-)
 create mode 100644 fs/btrfs/skiplist.c
 create mode 100644 fs/btrfs/skiplist.h

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index c0ddfd2..3284462 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,6 +8,6 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o 
root-tree.o dir-item.o \
   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-  reada.o backref.o
+  reada.o backref.o skiplist.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/skiplist.c b/fs/btrfs/skiplist.c
new file mode 100644
index 000..c803478
--- /dev/null
+++ b/fs/btrfs/skiplist.c
@@ -0,0 +1,98 @@
+/*
+  The Probabilistic Skiplist
+  (C) 2011  Liu Bo liubo2...@cn.fujitsu.com
+
+  Based on the skiplist experiments of Con Kolivas ker...@kolivas.org
+  for BFS cpu scheduler.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include linux/random.h
+#include linux/slab.h
+#include skiplist.h
+
+inline int sl_fill_node(struct sl_node *node, int level, gfp_t mask)
+{
+   struct sl_node **p;
+   struct sl_node **q;
+   int num;
+
+   BUG_ON(level  MAXLEVEL);
+
+   num = level + 1;
+   p = kmalloc(sizeof(*p) * num, mask);
+   BUG_ON(!p);
+   if (!p)
+   return -ENOMEM;
+   q = kmalloc(sizeof(*q) * num, mask);
+   BUG_ON(!q);
+   if (!q) {
+   kfree(p);
+   return -ENOMEM;
+   }
+
+   node-next = p;
+   node-prev = q;
+   node-level = level;
+   return 0;
+}
+
+inline void sl_link_node(struct sl_node *node, struct sl_node **backlook,
+int level)
+{
+   struct sl_node *p, *q;
+   int i = 0;
+
+   do {
+   p = backlook[i];
+   q = p-next[i];
+
+   node-next[i] = q;
+   node-prev[i] = p;
+   p-next[i] = node;
+   q-prev[i] = node;
+
+   i++;
+   } while (i = level);
+}
+
+void sl_erase(struct sl_node *node, struct sl_list *list)
+{
+   struct sl_node *prev, *next;
+   struct sl_node *head;
+   int level;
+   int i;
+
+   level = node-level;
+
+   for (i = 0; i = level; i++) {
+   prev = node-prev[i];
+   next = node-next[i];
+
+   prev-next[i] = next;
+   next-prev[i] = prev;
+   node-next[i] = node;
+   node-prev[i] = node;
+   }
+
+   head = list-head;
+   if (level == list-level) {
+   while (head-next[level] == head 
+  head-prev[level] == head  level  0)
+   level--;
+   list-level = level;
+   }
+}
diff --git a/fs/btrfs/skiplist.h b/fs/btrfs/skiplist.h
new file mode 100644
index 000..3e414b5
--- /dev/null
+++ b/fs/btrfs/skiplist.h
@@ -0,0 +1,210 @@
+/*
+  The Probabilistic Skiplist
+  (C) 2011  Liu Bo liubo2...@cn.fujitsu.com
+
+  Based on the skiplist experiments of Con Kolivas ker...@kolivas.org
+  for BFS cpu scheduler.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by

[RFC PATCH 3/3] Btrfs: convert rwlock to RCU for extent_map

2012-01-05 Thread Liu Bo
In this patch, we make three things:

a) skiplist - rcu-skiplist
   This is quite direct, since in skiplist each level is a list,
   any modification to the skiplist refers to pointers change,
   which fits RCU's semantics.

b) use rcu lock to protect extent_map instead of rwlock.

c) make extent_map reclaim after dropping the updater side lock.

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/compression.c |8 +++---
 fs/btrfs/disk-io.c |   15 ++
 fs/btrfs/extent_io.c   |   13 -
 fs/btrfs/extent_map.c  |   39 +-
 fs/btrfs/extent_map.h  |7 +++--
 fs/btrfs/file.c|   23 +++-
 fs/btrfs/inode.c   |   69 ---
 fs/btrfs/ioctl.c   |8 +++---
 fs/btrfs/relocation.c  |9 --
 fs/btrfs/scrub.c   |4 +-
 fs/btrfs/skiplist.c|6 ++--
 fs/btrfs/skiplist.h|   25 +++--
 fs/btrfs/volumes.c |   46 ++--
 13 files changed, 168 insertions(+), 104 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 14f1c5a..bb4ac31 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -498,10 +498,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 */
set_page_extent_mapped(page);
lock_extent(tree, last_offset, end, GFP_NOFS);
-   read_lock(em_tree-lock);
+   rcu_read_lock();
em = lookup_extent_mapping(em_tree, last_offset,
   PAGE_CACHE_SIZE);
-   read_unlock(em_tree-lock);
+   rcu_read_unlock();
 
if (!em || last_offset  em-start ||
(last_offset + PAGE_CACHE_SIZE  extent_map_end(em)) ||
@@ -583,11 +583,11 @@ int btrfs_submit_compressed_read(struct inode *inode, 
struct bio *bio,
em_tree = BTRFS_I(inode)-extent_tree;
 
/* we need the actual starting offset of this extent in the file */
-   read_lock(em_tree-lock);
+   rcu_read_lock();
em = lookup_extent_mapping(em_tree,
   page_offset(bio-bi_io_vec-bv_page),
   PAGE_CACHE_SIZE);
-   read_unlock(em_tree-lock);
+   rcu_read_unlock();
 
compressed_len = em-block_len;
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3f9d555..2dbc969 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -189,17 +189,17 @@ static struct extent_map *btree_get_extent(struct inode 
*inode,
 {
struct extent_map_tree *em_tree = BTRFS_I(inode)-extent_tree;
struct extent_map *em;
+   struct extent_map *to_free1 = NULL, *to_free2 = NULL;
int ret;
 
-   read_lock(em_tree-lock);
+   rcu_read_lock();
em = lookup_extent_mapping(em_tree, start, len);
+   rcu_read_unlock();
if (em) {
em-bdev =
BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev;
-   read_unlock(em_tree-lock);
goto out;
}
-   read_unlock(em_tree-lock);
 
em = alloc_extent_map();
if (!em) {
@@ -212,8 +212,12 @@ static struct extent_map *btree_get_extent(struct inode 
*inode,
em-block_start = 0;
em-bdev = BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev;
 
-   write_lock(em_tree-lock);
-   ret = add_extent_mapping(em_tree, em);
+   spin_lock(em_tree-lock);
+   ret = add_extent_mapping(em_tree, em, to_free1, to_free2);
+   spin_unlock(em_tree-lock);
+   free_extent_map(to_free1);
+   free_extent_map(to_free2);
+
if (ret == -EEXIST) {
u64 failed_start = em-start;
u64 failed_len = em-len;
@@ -231,7 +235,6 @@ static struct extent_map *btree_get_extent(struct inode 
*inode,
free_extent_map(em);
em = NULL;
}
-   write_unlock(em_tree-lock);
 
if (ret)
em = ERR_PTR(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 49f3c9d..30a8270 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2013,10 +2013,10 @@ static int bio_readpage_error(struct bio *failed_bio, 
struct page *page,
failrec-bio_flags = 0;
failrec-in_validation = 0;
 
-   read_lock(em_tree-lock);
+   rcu_read_lock();
em = lookup_extent_mapping(em_tree, start, failrec-len);
+   rcu_read_unlock();
if (!em) {
-   read_unlock(em_tree-lock);
kfree(failrec);
return -EIO;
}
@@ -2025,7 +2025,6 @@ static int bio_readpage_error(struct bio *failed_bio, 
struct page *page,
free_extent_map(em);
em = NULL;
}
- 

Re: Why does Btrfs allow raid1 with mismatched drives? Also: How to look behind the curtain

2012-01-05 Thread Fabian Zeindl
On Thursday, January 5, 2012 at 10:44 , Hugo Mills wrote:
 You should probably read the mis-named Sysadmin's Guide 
 on the wiki[1], which explains what btrfs actually does with its
 replication.
 
 You should also probably read the FAQ entries on free space[2],
 since using plain df for btrfs is usually misleading.

I read both, but it doesn't answer my question on how btrfs behaves when it 
can't actually do a raid1, because there's not enough data on an other disk 
for a chunk-copy.

 You could run a scrub, which will verify all of the data mirrors on
 the volume, and fix anything that's not redundant.

Will this command fail then for example?

fabian--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 00/10] Btrfs: backref walking rewrite

2012-01-05 Thread Jan Schmidt
On 05.01.2012 06:38, Li Zefan wrote:
 Jan Schmidt wrote:
 This patch series is a major rewrite of the backref walking code. The patch
 series Arne sent some weeks ago for quota groups had a very interesting
 function, find_all_roots. I took this from him together with the bits needed
 for find_all_roots to work and replaced a major part of the code in backref.c
 with it.

 It can be pulled from
  git://git.jan-o-sch.net/btrfs-unstable for-chris
 There's also a gitweb for that repo on
  http://git.jan-o-sch.net/?p=btrfs-unstable

 
 Thanks for the work!
 
 I got a compile warning:
 
   CC [M]  fs/btrfs/backref.o
 fs/btrfs/backref.c: In function 'inode_to_path':
 fs/btrfs/backref.c:1312:3: warning: format '%ld' expects type 'long int', but 
 argument 3 has type 'int

Thanks for looking at this! Should now be fixed in my git repository
(involved another force-push). I won't resend by email unless requested.

-Jan
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 0/3] apply the Probabilistic Skiplist on btrfs

2012-01-05 Thread Liu Bo
Since we are inclined to apply a lockless scheme on some objects of btrfs for
higher performance, we want to build a RCU version the Probabilistic Skiplist.

Here our skiplist algorithm is based on the skiplist experiments of
Con Kolivas ker...@kolivas.org for BFS cpu scheduler.
And more details about skiplist design are in patch 1.

Right now we have a plan to apply skiplist on extent_map and extent_state.

Here we choose extent_map firstly, since it is a read mostly thing,
and the change is quite direct, all we need to do is
a) to replace rbtree with skiplist,
b) to add rcu support.
And more details are in patch 2 and patch 3.

I've done some simple tests for performance on my 2-core box, there is no
obvious difference, but I want to focus on the design side and make sure
there is no more bug in it firstly.

For long term goals, we want to ship skiplist to lib, like lib/rbtree.c.

MORE TESTS ARE WELCOME!

Liu Bo (3):
  Btrfs: add the Probabilistic Skiplist
  Btrfs: rebuild extent_map based on skiplist
  Btrfs: convert rwlock to RCU for extent_map

 fs/btrfs/Makefile  |2 +-
 fs/btrfs/compression.c |8 +-
 fs/btrfs/disk-io.c |   15 ++-
 fs/btrfs/extent_io.c   |   13 +-
 fs/btrfs/extent_map.c  |  296 ++--
 fs/btrfs/extent_map.h  |   21 +++-
 fs/btrfs/file.c|   23 +++-
 fs/btrfs/inode.c   |   69 
 fs/btrfs/ioctl.c   |8 +-
 fs/btrfs/relocation.c  |9 +-
 fs/btrfs/scrub.c   |4 +-
 fs/btrfs/skiplist.c|   98 
 fs/btrfs/skiplist.h|  217 +++
 fs/btrfs/volumes.c |   68 ++-
 14 files changed, 651 insertions(+), 200 deletions(-)
 create mode 100644 fs/btrfs/skiplist.c
 create mode 100644 fs/btrfs/skiplist.h

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 1/3] Btrfs: add the Probabilistic Skiplist

2012-01-05 Thread Liu Bo
The Probabilistic Skiplist is a O(lgn) data structure, and
we want to apply this for later use, mainly for RCU-skiplist.

Note:
a) The skiplist is probabilistic, and it is the distribution of node sizes
   that is maintained, but the strict order is not required[1].

b) This skiplist cannot be resized once it is created,
   so here is a level limit 16 and an associated (and fixed) probability 0.25
   that determines the distribution of nodes[1].

c) The level limit may need to be adjusted.
   I know it is a magic number, but now for simplicity we just keep it at 16,
   and then each skiplist is able to contain (2^32-1)/3 nodes at most.

[1] 
http://www.csee.umbc.edu/courses/undergraduate/341/fall01/Lectures/SkipLists/skip_lists/skip_lists.html

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/Makefile   |2 +-
 fs/btrfs/skiplist.c |   98 
 fs/btrfs/skiplist.h |  210 +++
 3 files changed, 309 insertions(+), 1 deletions(-)
 create mode 100644 fs/btrfs/skiplist.c
 create mode 100644 fs/btrfs/skiplist.h

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index c0ddfd2..3284462 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,6 +8,6 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o 
root-tree.o dir-item.o \
   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-  reada.o backref.o
+  reada.o backref.o skiplist.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/skiplist.c b/fs/btrfs/skiplist.c
new file mode 100644
index 000..c803478
--- /dev/null
+++ b/fs/btrfs/skiplist.c
@@ -0,0 +1,98 @@
+/*
+  The Probabilistic Skiplist
+  (C) 2011  Liu Bo liubo2...@cn.fujitsu.com
+
+  Based on the skiplist experiments of Con Kolivas ker...@kolivas.org
+  for BFS cpu scheduler.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include linux/random.h
+#include linux/slab.h
+#include skiplist.h
+
+inline int sl_fill_node(struct sl_node *node, int level, gfp_t mask)
+{
+   struct sl_node **p;
+   struct sl_node **q;
+   int num;
+
+   BUG_ON(level  MAXLEVEL);
+
+   num = level + 1;
+   p = kmalloc(sizeof(*p) * num, mask);
+   BUG_ON(!p);
+   if (!p)
+   return -ENOMEM;
+   q = kmalloc(sizeof(*q) * num, mask);
+   BUG_ON(!q);
+   if (!q) {
+   kfree(p);
+   return -ENOMEM;
+   }
+
+   node-next = p;
+   node-prev = q;
+   node-level = level;
+   return 0;
+}
+
+inline void sl_link_node(struct sl_node *node, struct sl_node **backlook,
+int level)
+{
+   struct sl_node *p, *q;
+   int i = 0;
+
+   do {
+   p = backlook[i];
+   q = p-next[i];
+
+   node-next[i] = q;
+   node-prev[i] = p;
+   p-next[i] = node;
+   q-prev[i] = node;
+
+   i++;
+   } while (i = level);
+}
+
+void sl_erase(struct sl_node *node, struct sl_list *list)
+{
+   struct sl_node *prev, *next;
+   struct sl_node *head;
+   int level;
+   int i;
+
+   level = node-level;
+
+   for (i = 0; i = level; i++) {
+   prev = node-prev[i];
+   next = node-next[i];
+
+   prev-next[i] = next;
+   next-prev[i] = prev;
+   node-next[i] = node;
+   node-prev[i] = node;
+   }
+
+   head = list-head;
+   if (level == list-level) {
+   while (head-next[level] == head 
+  head-prev[level] == head  level  0)
+   level--;
+   list-level = level;
+   }
+}
diff --git a/fs/btrfs/skiplist.h b/fs/btrfs/skiplist.h
new file mode 100644
index 000..3e414b5
--- /dev/null
+++ b/fs/btrfs/skiplist.h
@@ -0,0 +1,210 @@
+/*
+  The Probabilistic Skiplist
+  (C) 2011  Liu Bo liubo2...@cn.fujitsu.com
+
+  Based on the skiplist experiments of Con Kolivas ker...@kolivas.org
+  for BFS cpu scheduler.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by

[RFC PATCH 3/3] Btrfs: convert rwlock to RCU for extent_map

2012-01-05 Thread Liu Bo
In this patch, we do three things:

a) skiplist - rcu-skiplist
   This is quite direct, since in skiplist each level is a list,
   any modification to the skiplist amounts to pointer changes,
   which fits RCU's semantics.

b) use rcu lock to protect extent_map instead of rwlock.

c) make extent_map reclaim after dropping the updater side lock.

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/compression.c |8 +++---
 fs/btrfs/disk-io.c |   15 ++
 fs/btrfs/extent_io.c   |   13 -
 fs/btrfs/extent_map.c  |   39 +-
 fs/btrfs/extent_map.h  |7 +++--
 fs/btrfs/file.c|   23 +++-
 fs/btrfs/inode.c   |   69 ---
 fs/btrfs/ioctl.c   |8 +++---
 fs/btrfs/relocation.c  |9 --
 fs/btrfs/scrub.c   |4 +-
 fs/btrfs/skiplist.c|6 ++--
 fs/btrfs/skiplist.h|   25 +++--
 fs/btrfs/volumes.c |   46 ++--
 13 files changed, 168 insertions(+), 104 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 14f1c5a..bb4ac31 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -498,10 +498,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 */
set_page_extent_mapped(page);
lock_extent(tree, last_offset, end, GFP_NOFS);
-   read_lock(em_tree-lock);
+   rcu_read_lock();
em = lookup_extent_mapping(em_tree, last_offset,
   PAGE_CACHE_SIZE);
-   read_unlock(em_tree-lock);
+   rcu_read_unlock();
 
if (!em || last_offset  em-start ||
(last_offset + PAGE_CACHE_SIZE  extent_map_end(em)) ||
@@ -583,11 +583,11 @@ int btrfs_submit_compressed_read(struct inode *inode, 
struct bio *bio,
em_tree = BTRFS_I(inode)-extent_tree;
 
/* we need the actual starting offset of this extent in the file */
-   read_lock(em_tree-lock);
+   rcu_read_lock();
em = lookup_extent_mapping(em_tree,
   page_offset(bio-bi_io_vec-bv_page),
   PAGE_CACHE_SIZE);
-   read_unlock(em_tree-lock);
+   rcu_read_unlock();
 
compressed_len = em-block_len;
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3f9d555..2dbc969 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -189,17 +189,17 @@ static struct extent_map *btree_get_extent(struct inode 
*inode,
 {
struct extent_map_tree *em_tree = BTRFS_I(inode)-extent_tree;
struct extent_map *em;
+   struct extent_map *to_free1 = NULL, *to_free2 = NULL;
int ret;
 
-   read_lock(em_tree-lock);
+   rcu_read_lock();
em = lookup_extent_mapping(em_tree, start, len);
+   rcu_read_unlock();
if (em) {
em-bdev =
BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev;
-   read_unlock(em_tree-lock);
goto out;
}
-   read_unlock(em_tree-lock);
 
em = alloc_extent_map();
if (!em) {
@@ -212,8 +212,12 @@ static struct extent_map *btree_get_extent(struct inode 
*inode,
em-block_start = 0;
em-bdev = BTRFS_I(inode)-root-fs_info-fs_devices-latest_bdev;
 
-   write_lock(em_tree-lock);
-   ret = add_extent_mapping(em_tree, em);
+   spin_lock(em_tree-lock);
+   ret = add_extent_mapping(em_tree, em, to_free1, to_free2);
+   spin_unlock(em_tree-lock);
+   free_extent_map(to_free1);
+   free_extent_map(to_free2);
+
if (ret == -EEXIST) {
u64 failed_start = em-start;
u64 failed_len = em-len;
@@ -231,7 +235,6 @@ static struct extent_map *btree_get_extent(struct inode 
*inode,
free_extent_map(em);
em = NULL;
}
-   write_unlock(em_tree-lock);
 
if (ret)
em = ERR_PTR(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 49f3c9d..30a8270 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2013,10 +2013,10 @@ static int bio_readpage_error(struct bio *failed_bio, 
struct page *page,
failrec-bio_flags = 0;
failrec-in_validation = 0;
 
-   read_lock(em_tree-lock);
+   rcu_read_lock();
em = lookup_extent_mapping(em_tree, start, failrec-len);
+   rcu_read_unlock();
if (!em) {
-   read_unlock(em_tree-lock);
kfree(failrec);
return -EIO;
}
@@ -2025,7 +2025,6 @@ static int bio_readpage_error(struct bio *failed_bio, 
struct page *page,
free_extent_map(em);
em = NULL;
}
- 

[RFC PATCH 2/3] Btrfs: rebuild extent_map based on skiplist

2012-01-05 Thread Liu Bo
extent_map fits a read-mostly scenario; since we want to build
an RCU-skiplist later, we first rebuild extent_map based on
a skiplist.

Signed-off-by: Liu Bo liubo2...@cn.fujitsu.com
---
 fs/btrfs/extent_map.c |  265 +++--
 fs/btrfs/extent_map.h |   14 +++-
 fs/btrfs/volumes.c|   22 ++--
 3 files changed, 190 insertions(+), 111 deletions(-)

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7c97b33..746084c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -9,6 +9,13 @@
 
 static struct kmem_cache *extent_map_cache;
 
+static LIST_HEAD(maps);
+
+#define MAP_LEAK_DEBUG 1
+#if MAP_LEAK_DEBUG
+static DEFINE_SPINLOCK(map_leak_lock);
+#endif
+
 int __init extent_map_init(void)
 {
extent_map_cache = kmem_cache_create(extent_map,
@@ -21,6 +28,30 @@ int __init extent_map_init(void)
 
 void extent_map_exit(void)
 {
+   struct extent_map *em;
+
+#if MAP_LEAK_DEBUG
+   struct list_head *tmp;
+   int count = 0;
+
+   list_for_each(tmp, maps)
+   count++;
+
+   printk(KERN_INFO %d em is left to free\n, count);
+
+   while (!list_empty(maps)) {
+   cond_resched();
+   em = list_entry(maps.next, struct extent_map, leak_list);
+   printk(KERN_ERR btrfs extent map: start %llu, len %llu 
+   refs %d block_start %llu, block_len %llu, in_tree 
%u\n,
+em-start, em-len, atomic_read(em-refs),
+em-block_start, em-block_len, em-in_tree);
+   WARN_ON(1);
+   list_del(em-leak_list);
+   kmem_cache_free(extent_map_cache, em);
+   }
+#endif
+
if (extent_map_cache)
kmem_cache_destroy(extent_map_cache);
 }
@@ -34,7 +65,8 @@ void extent_map_exit(void)
  */
 void extent_map_tree_init(struct extent_map_tree *tree)
 {
-   tree-map = RB_ROOT;
+   tree-head.start = (-1ULL);
+   sl_init_list(tree-map, tree-head.sl_node);
rwlock_init(tree-lock);
 }
 
@@ -48,16 +80,41 @@ void extent_map_tree_init(struct extent_map_tree *tree)
 struct extent_map *alloc_extent_map(void)
 {
struct extent_map *em;
+#if MAP_LEAK_DEBUG
+   unsigned long flags;
+#endif
+
em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
if (!em)
return NULL;
em-in_tree = 0;
em-flags = 0;
em-compress_type = BTRFS_COMPRESS_NONE;
+   sl_init_node(em-sl_node);
atomic_set(em-refs, 1);
+#if MAP_LEAK_DEBUG
+   spin_lock_irqsave(map_leak_lock, flags);
+   list_add(em-leak_list, maps);
+   spin_unlock_irqrestore(map_leak_lock, flags);
+#endif
return em;
 }
 
+static inline void __free_extent_map(struct extent_map *em)
+{
+#if MAP_LEAK_DEBUG
+   unsigned long flags;
+
+   spin_lock_irqsave(map_leak_lock, flags);
+   list_del(em-leak_list);
+   spin_unlock_irqrestore(map_leak_lock, flags);
+#endif
+
+   WARN_ON(em-in_tree);
+   sl_free_node(em-sl_node);
+   kmem_cache_free(extent_map_cache, em);
+}
+
 /**
  * free_extent_map - drop reference count of an extent_map
  * @em:extent map beeing releasead
@@ -69,91 +126,113 @@ void free_extent_map(struct extent_map *em)
 {
if (!em)
return;
+
WARN_ON(atomic_read(em-refs) == 0);
-   if (atomic_dec_and_test(em-refs)) {
-   WARN_ON(em-in_tree);
-   kmem_cache_free(extent_map_cache, em);
-   }
+   if (atomic_dec_and_test(em-refs))
+   __free_extent_map(em);
 }
 
-static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
-  struct rb_node *node)
+static inline int in_entry(struct sl_node *node, u64 offset)
 {
-   struct rb_node **p = root-rb_node;
-   struct rb_node *parent = NULL;
struct extent_map *entry;
 
-   while (*p) {
-   parent = *p;
-   entry = rb_entry(parent, struct extent_map, rb_node);
+   entry = sl_entry(node, struct extent_map, sl_node);
+   if (!node-head 
+   entry-start = offset  extent_map_end(entry) - 1 = offset)
+   return 1;
+   return 0;
+}
 
-   WARN_ON(!entry-in_tree);
+static inline struct extent_map *next_entry(struct sl_node *p, int l,
+   struct sl_node **q)
+{
+   struct extent_map *ret;
+   struct sl_node *next;
 
-   if (offset  entry-start)
-   p = (*p)-rb_left;
-   else if (offset = extent_map_end(entry))
-   p = (*p)-rb_right;
-   else
-   return parent;
-   }
+   next = __sl_next_with_level(p, l);
+   ret = sl_entry(next, struct extent_map, sl_node);
+   BUG_ON(!ret);
+   *q = next;
 
-   entry = rb_entry(node, struct extent_map, rb_node);
-   entry-in_tree = 1;
-   

Re: (renamed thread) btrfs metrics

2012-01-05 Thread Daniel Pocock

 
 From there on, one could potentially create a matrix: (proportional
 font art, apologies):
 
   | subvol1  | subvol2  | subvol3  |
 --+--+--+--+
  subvol1  |   200M   | 20M  | 50M  |
 --+--+--+--+
  subvol2  |20M   |350M  | 22M  |
 --+--+--+--+
  subvol3  |50M   | 22M  |634M  |
 --+--+--+--+
 
 The diagonal obviously shows the unique blocks, subvol2 and subvol1
 share 20M data, etc. Missing from this plot would be how much is
 shared between subvol1, subvol2, and subvol3 together, but it's a
 start and not something that hard to understand. One might add a
 column for total size of each subvol, which may obviously not be an
 addition of the rest of the columns in this diagram.
 
 Anyway, something like this would be high on my list of `df` numbers
 I'd like to see - since I think they are useful numbers.
 

This is an interesting way to look at it

Ganglia typically records time series data, it is quite conceivable to
create a metric for every permutation in each and store that in rrdtool

The challenge would then be in reporting on the data: the rrdtool graphs
use time as an X-axis, and then it can display multiple Y values

However, now that I've started thinking about the type of data generated
from btrfs, I was wondering if some kind of rr3dtool is needed - a 3D
graphing solution - or potentially making graphs that do not include
time on any axis?

Has anyone seen anything similar for administering ZFS, for example?

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Why does Btrfs allow raid1 with mismatched drives? Also: How to look behind the curtain

2012-01-05 Thread Martin Steigerwald
Am Donnerstag, 5. Januar 2012 schrieb Fabian Zeindl:
 On Thursday, January 5, 2012 at 10:44 , Hugo Mills wrote:
  You should probably read the mis-named Sysadmin's Guide
  on the wiki[1], which explains what btrfs actually does with its
  replication.
  
  You should also probably read the FAQ entries on free space[2],
  since using plain df for btrfs is usually misleading.
 
 I read both, but it doesn't answer my question on how btrfs behaves
 when it can't actually do a raid1, because there's not enough data on
 an other disk for a chunk-copy.

From my reading that Sysadmin Guide answers your question:

BTRFS with RAID-1 will allocate chunks on two devices:

 Btrfs's RAID implementation bears only passing resemblance to
 traditional RAID implementations. Instead, btrfs replicates data on a 
 per-chunk basis. If the filesystem is configured to use RAID-1, for 
 example, chunks are allocated in pairs, with each chunk of the pair 
 being taken from a different block device. Data written to such a chunk 
 pair will be duplicated across both chunks.
 
 Stripe-based RAID levels (RAID-0, RAID-10) work in a similar way, 
 allocating as many chunks as can fit across the drives with free space, 
 and then perform striping of data at a level smaller than a chunk. So, 
 for a RAID-10 filesystem on 4 disks, data may be stored like this:

[… quoted from the Wiki page …]

Allocating as many chunks as can fit across the drives is also pretty 
clear to me. So if BTRFS can't allocate a new chunk on two devices, it's 
full. To me it seems obvious that BTRFS will not break the RAID-1 
redundancy guarantee unless a drive fails.

Thus when using a RAID-1 with two devices, the smaller one should define 
the maximum capacity of the device. But when you use a RAID-1 with one 500 
GB and two 250 GB drives, BTRFS can replicate each chunk on the 500 GB 
drive on *one* of the both 250 GB drives.

Thus it makes perfect sense to support differently sized drives in a BTRFS 
pool.

My own observations with a RAID-10 across 4 devices support this. I echo´d 
1  /sys/block/sdX/delete to remove one harddisk while a dd was running 
to the RAID. BTRFS used the remaining disks. On next reboot all disks 
where available again. While BTRFS didn´t start rebalancing the RAID 
automatically a btrfs filesystem balance made it fill up the previously 
failed device until all devices had the same usage. This is also described 
in the sysadmin guide: So this is what you have to care for manually. If a 
drive failed, you have to balance the filesystem so that it creates 
replicas where they are missing.

Now anyone deeper into BTRFS please check at whether my understanding 
matches what BTRFS is doing…

  You could run a scrub, which will verify all of the data mirrors on
  the volume, and fix anything that's not redundant.
 
 Will this command fail then for example?

No, unless more than the allowed number of disks are failing.

-- 
Martin 'Helios' Steigerwald - http://www.Lichtvoll.de
GPG: 03B0 0D6C 0040 0710 4AFA  B82F 991B EAAC A599 84C7
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [3.2-rc7] slowdown, warning + oops creating lots of files

2012-01-05 Thread Dave Chinner
On Thu, Jan 05, 2012 at 02:11:31PM -0500, Liu Bo wrote:
 On 01/04/2012 09:26 PM, Dave Chinner wrote:
  On Wed, Jan 04, 2012 at 09:23:18PM -0500, Liu Bo wrote:
  On 01/04/2012 06:01 PM, Dave Chinner wrote:
  On Thu, Jan 05, 2012 at 09:23:52AM +1100, Chris Samuel wrote:
  On 05/01/12 09:11, Dave Chinner wrote:
 
  Looks to be reproducable.
  Does this happen with rc6 ?
  I haven't tried. All I'm doing is running some benchmarks to get
  numbers for a talk I'm giving about improvements in XFS metadata
  scalability, so I wanted to update my last set of numbers from
  2.6.39.
 
  As it was, these benchmarks also failed on btrfs with oopsen and
  corruptions back in 2.6.39 time frame.  e.g. same VM, same
  test, different crashes, similar slowdowns as reported here:
  http://comments.gmane.org/gmane.comp.file-systems.btrfs/11062
 
  Given that there is now a history of this simple test uncovering
  problems, perhaps this is a test that should be run more regularly
  by btrfs developers?
 
  If not then it might be easy to track down as there are only
  2 modifications between rc6 and rc7..
  They don't look like they'd be responsible for fixing an extent tree
  corruption, and I don't really have the time to do an open-ended
  bisect to find where the problem fix arose.
 
  As it is, 3rd attempt failed at 22m inodes, without the warning this
  time:
  
  .
  
  It's hard to tell exactly what path gets to that BUG_ON(), so much
  code is inlined by the compiler into run_clustered_refs() that I
  can't tell exactly how it got to the BUG_ON() triggered in
  alloc_reserved_tree_block().
 
  This seems to be an oops led by ENOSPC.
  
  At the time of the oops, this is the space used on the filesystem:
  
  $ df -h /mnt/scratch
  Filesystem  Size  Used Avail Use% Mounted on
  /dev/vdc 17T   31G   17T   1% /mnt/scratch
  
  It's less than 0.2% full, so I think ENOSPC can be ruled out here.
  
 
 This bug has done something with our block reservation allocator, not the 
 real disk space.
 
 Can you try the below one and see what happens?

Still crashes, still has severe slowdowns.

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Why does Btrfs allow raid1 with mismatched drives? Also: How to look behind the curtain

2012-01-05 Thread Fabian Zeindl

On Jan 5, 2012, at 11:39 , Martin Steigerwald wrote:
 Allocating as many chunks as can fit across the drives is also pretty 
 clear to me. So if BTRFS can´t allocate a new chunk on two devices, its 
 full. To me it seems obvious that BTRFS will not break the RAID-1 
 redundancy guarentee unless a drive fails.

So (assuming 1GB chunksize):

if i create a raid-1, btrfs with a 3GB and a 7GB device, it will show me ~10GB 
free space,
after saving a 1GB file, i will have 8GB left (-1GB on each device)
after saving another 1GB, i will have 6GB left (---  )
after saving another 1GB, it's suddenly full?

Fabian--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Why does Btrfs allow raid1 with mismatched drives? Also: How to look behind the curtain

2012-01-05 Thread Martin Steigerwald
Am Donnerstag, 5. Januar 2012 schrieb Fabian Zeindl:
 On Jan 5, 2012, at 11:39 , Martin Steigerwald wrote:
  Allocating as many chunks as can fit across the drives is also
  pretty clear to me. So if BTRFS can´t allocate a new chunk on two
  devices, its full. To me it seems obvious that BTRFS will not break
  the RAID-1 redundancy guarentee unless a drive fails.
 
 So (assuming 1GB chunksize):
 
 if i create a raid-1, btrfs with a 3GB and a 7GB device, it will show
 me ~10GB free space, after saving a 1GB file, i will have 8GB left
 (-1GB on each device) after saving another 1GB, i will have 6GB left
 (---  )
 after saving another 1GB, it's suddenly full?

I would say yes, but suggest that you try this out or wait for confirmation 
of a BTRFS developer if you can to be sure about this. The other way of 
handling this would be to break the RAID-1 redundancy guarentee and I 
really hope that BTRFS is not doing this. I am not completely sure though, 
as I never tested it.

The output of df -h with BTRFS and RAID is bogus anyway.

Just consider df -h with two 10GB disks. df -H will display about 20GB 
free then. But when you write 100 MB it will show that 200 MB are 
allocated. So an application that assumes it will be able to write 12 GB 
easily will just fail doing that.

I don´t like this either, cause an application that writes something 
cannot even do a rough estimate. But then an application can never know 
whether a write will succeed, because another application could also write 
lots of data in the same time.

But even without RAID I cannot get exaxt figures from df. Just consider:

merkaba:~ btrfs filesystem show 
failed to read /dev/sr0
Label: 'debian'  uuid: dd52fea8-f6c3-4a60-bd4a-7650483655e5
Total devices 1 FS bytes used 11.35GB
devid1 size 18.62GB used 18.29GB path /dev/dm-0

Btrfs Btrfs v0.19
merkaba:~ btrfs filesystem df / 
Data: total=14.01GB, used=10.55GB
System, DUP: total=8.00MB, used=4.00KB
System: total=4.00MB, used=0.00
Metadata, DUP: total=2.12GB, used=814.32MB
Metadata: total=8.00MB, used=0.00
merkaba:~ df -hT /  
DateisystemTyp   Größe Benutzt Verf. Verw% Eingehängt auf
/dev/mapper/merkaba-debian btrfs   19G 13G  3,8G   77% /
merkaba:~ df -HT /
DateisystemTyp   Größe Benutzt Verf. Verw% Eingehängt auf
/dev/mapper/merkaba-debian btrfs   20G 14G  4,1G   77% /
merkaba:~

So how much space is free?

df just seems to reflect the size of the data, system and metadata b-trees, 
not their usage. Cause at least for me 10.55 GB, 4 KB and 814 KB just do 
not add up to 13 / 14 GB - I am not sure whether btrfs command uses 1024 
or 1000 as base.

So actually in this case I just be able to write more to the filesystem 
than df -hT tells me.

I prefer this over the other way around with RAID-1 where I just can write 
about half of the size that df reports.

So or so the current df output is bogus.

-- 
Martin 'Helios' Steigerwald - http://www.Lichtvoll.de
GPG: 03B0 0D6C 0040 0710 4AFA  B82F 991B EAAC A599 84C7
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Why does Btrfs allow raid1 with mismatched drives? Also: How to look behind the curtain

2012-01-05 Thread Roman Kapusta
On Thu, Jan 5, 2012 at 13:26, Fabian Zeindl fabian.zei...@gmail.com wrote:

 On Jan 5, 2012, at 11:39 , Martin Steigerwald wrote:
 Allocating as many chunks as can fit across the drives is also pretty
 clear to me. So if BTRFS can´t allocate a new chunk on two devices, its
 full. To me it seems obvious that BTRFS will not break the RAID-1
 redundancy guarentee unless a drive fails.

 So (assuming 1GB chunksize):

 if i create a raid-1, btrfs with a 3GB and a 7GB device, it will show me 
 ~10GB free space,
 after saving a 1GB file, i will have 8GB left (-1GB on each device)
 after saving another 1GB, i will have 6GB left (---  )
 after saving another 1GB, it's suddenly full?

you have still 4GB free of non RAID-1 (single) space, which is
currently unavailable, but it is planned that BTRFS will support mixed
storage:
some files can be RAID-1, some files can be RAID-0 and rest is basic
(single) storage


 Fabian--
 To unsubscribe from this list: send the line unsubscribe linux-btrfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


btrfs truncate() does not change inode times

2012-01-05 Thread idank
Hi all,
I was running fstest (http://www.tuxera.com/community/posix-test-suite/) on 
btrfs. Only one test failed, and I believe it to be a bug in btrfs. The 
scenario is as follows:
* create a file.
* note its times with stat.
* sleep a few seconds
* call truncate() on the file (not ftruncate(). ftruncate() works).
* sync
* note the file's times again with stat.
expected result: ctime and mtime are greater.
actual result: ctime and mtime remain unchanged.

Example:
[root@fedora-client pjd-fstest-20080816]# pwd
/test/pjd-fstest-20080816
[root@fedora-client pjd-fstest-20080816]# mount | grep /test
/dev/loop0 on /test type btrfs (rw,relatime)
[root@fedora-client pjd-fstest-20080816]# touch ctime_test
[root@fedora-client pjd-fstest-20080816]# stat ctime_test
  File: `ctime_test'
  Size: 0 Blocks: 0  IO Block: 4096   regular empty file
Device: 29h/41dInode: 1160Links: 1
Access: (0644/-rw-r--r--)  Uid: (0/root)   Gid: (0/root)
Access: 2012-01-05 14:42:21.067444155 +0200
Modify: 2012-01-05 14:42:21.067444155 +0200
Change: 2012-01-05 14:42:21.067444155 +0200
 Birth: -
[root@fedora-client pjd-fstest-20080816]# strace ./fstest truncate ctime_test 
200
execve(./fstest, [./fstest, truncate, ctime_test, 200], [/* 34 vars 
*/]) = 0
brk(0)  = 0x17c9000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x7f412616d000
access(/etc/ld.so.preload, R_OK)  = -1 ENOENT (No such file or directory)
open(/etc/ld.so.cache, O_RDONLY)  = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=47545, ...}) = 0
mmap(NULL, 47545, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f4126161000
close(3)= 0
open(/lib64/libc.so.6, O_RDONLY)  = 3
read(3, \177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0\0\1\0\0\0\260\24\342e2\0\0\0..., 
832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1951736, ...}) = 0
mmap(0x3265e0, 3773688, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 
0) = 0x3265e0
mprotect(0x3265f8f000, 2097152, PROT_NONE) = 0
mmap(0x326618f000, 20480, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x18f000) = 0x326618f000
mmap(0x3266194000, 21752, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3266194000
close(3)= 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x7f412616
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x7f412615e000
arch_prctl(ARCH_SET_FS, 0x7f412615e720) = 0
mprotect(0x326618f000, 16384, PROT_READ) = 0
mprotect(0x326581e000, 4096, PROT_READ) = 0
munmap(0x7f4126161000, 47545)   = 0
umask(0)= 022
truncate(ctime_test, 200) = 0
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 1), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x7f412616c000
write(1, 0\n, 20
)  = 2
exit_group(0)   = ?
[root@fedora-client pjd-fstest-20080816]# sleep 2
[root@fedora-client pjd-fstest-20080816]# stat ctime_test
  File: `ctime_test'
  Size: 200   Blocks: 0  IO Block: 4096   regular file
Device: 29h/41dInode: 1160Links: 1
Access: (0644/-rw-r--r--)  Uid: (0/root)   Gid: (0/root)
Access: 2012-01-05 14:42:21.067444155 +0200
Modify: 2012-01-05 14:42:21.067444155 +0200
Change: 2012-01-05 14:42:21.067444155 +0200
 Birth: -
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Why does Btrfs allow raid1 with mismatched drives? Also: How to look behind the curtain

2012-01-05 Thread Fabian Zeindl
On Jan 5, 2012, at 14:35 , Roman Kapusta wrote:
 you have still 4GB free of non RAID-1 (single) space, which is
 currently unavailable, but it is planned that BTRFS will support mixed
 storage:
 some files can be RAID-1, some files can be RAID-0 and rest is basic
 (single) storage

Understood. So to clarify things i think it would be good if btrfs could print 
out
more detailled information.

Available raw space: 10GB
7G on drive A
3G on drive B
Assignable space for raid1: 3GB
 3G on drive A
 3G on drive B

Or maybe the other way round: show which different raid configurations there 
are and how each of them uses the available space.

I understand that free space is a difficult concept, if you do per-file or 
per-chunk redundancy, but i think there are 
a lot of users out there who just want to do a standard replication with 
their whole disk.
Maybe with the special ability of the 2x500G +1TB, which mdadm, AFAIK, can't 
do. 

This would be just a subset of what btrfs can do, of course, but it's a 
frequently used subset, so maybe there could be some kind of 
saved profile on how the user intends to use the filesystem.
Output could then be clarified using that profile and it could also give 
warnings or prevent actions that make no sense.

fabian--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfsck status?

2012-01-05 Thread David Summers

http://lwn.net/Articles/465160/

On 04/11/11 23:52, Yo'av Moshe wrote:

Hello!

I read Chris was supposed to demonstrate btrfsck on LinuxCon[0]. Does
anybody knows if that ever happen?
Also, does btrfsck development moved to the new GIT repository, or is
it still not released?

I'm holding to my broken btrfs partition[0] for more than a year now :-/

Thanks.

[1] http://article.gmane.org/gmane.comp.file-systems.btrfs/9423
--
Yo'av Moshe
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html




--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Why does Btrfs allow raid1 with mismatched drives? Also: How to look behind the curtain

2012-01-05 Thread Martin Steigerwald
Am Donnerstag, 5. Januar 2012 schrieb Fabian Zeindl:
 On Jan 5, 2012, at 14:35 , Roman Kapusta wrote:
  you have still 4GB free of non RAID-1 (single) space, which is
  currently unavailable, but it is planned that BTRFS will support
  mixed storage:
  some files can be RAID-1, some files can be RAID-0 and rest is basic
  (single) storage
 
 Understood. So to clarify things i think it would be good if btrfs
 could print out more detailled information.
 
 Available raw space: 10GB
   7G on drive A
   3G on drive B
 Assignable space for raid1: 3GB
3G on drive A
3G on drive B
 
 Or maybe the other way round: show which different raid
 configurations there are and how the use which space.

As far as I see these informations can already be derived from btrfs 
filesystem df / show by combining values together. But it involves some 
manual calculations.

 I understand that free space is a difficult concept, if you do
 per-file or per-chunk redundancy, but i think there are a lot of users
 out there who just want to do a standard replication with their
 whole disk. Maybe with the special ability of the 2x500G +1TB, which
 mdadm, AFAIK, can't do.

It should be able to do that if you concatenate the two 2x500 GB via 
device mapper or the LVM layer above it.

 This would be just a subset of what btrfs can do, of course, but it's a
 frequently used subset, so maybe there could be some kind of saved
 profile on how the user intends to use the filesystem. Output
 could then be clarified using that profile and it could also give
 warnings or prevent actions that make no sense.

That makes sense to me.

As a default I would say that -d raid1 and -m raid1 just creates a RAID-1. 
And then BTRFS should put out the usable space for that RAID-1. I.e. when 
I have two 500 GB disks with 100 GB allocated it should return about 400 
GB free space. And when it uses one 1 TB disk as well as one 500 GB disk 
and two 250 GB disk with 100 GB allocated, it should return about 900GB 
free space. Only when it has one disk with 500 GB and one with 1 TB with 
100 GB allocated, it should return 400 GB free space.

IMHO this also should be what BTRFS reports to the regular df command.

This should only change if a mixed policy is in place. I do not know what 
to report to the OS then. Should it add the RAID-1 and the RAID-0 space? 
Should it only report the RAID-1 space? IMHO that depends on the 
allocation policy. If new files are enforced to be on RAID-1 space it 
should do the latter and if new files are created on RAID-0 space if RAID-1 
space is full it should report the former.

For more details, btrfs filesystem df / show still need to be used. Maybe 
with a revised and even more informative output like you suggested.

-- 
Martin 'Helios' Steigerwald - http://www.Lichtvoll.de
GPG: 03B0 0D6C 0040 0710 4AFA  B82F 991B EAAC A599 84C7
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Why does Btrfs allow raid1 with mismatched drives? Also: How to look behind the curtain

2012-01-05 Thread Martin Steigerwald
Sorry, accidentally dropped CC.

Am Donnerstag, 5. Januar 2012 schrieb Fabian Zeindl:
 On Jan 5, 2012, at 14:35 , Roman Kapusta wrote:
  you have still 4GB free of non RAID-1 (single) space, which is
  currently unavailable, but it is planned that BTRFS will support
  mixed storage:
  some files can be RAID-1, some files can be RAID-0 and rest is basic
  (single) storage
 
 Understood. So to clarify things i think it would be good if btrfs
 could print out more detailed information.
 
 Available raw space: 10GB
   7G on drive A
   3G on drive B
 Assignable space for raid1: 3GB
3G on drive A
3G on drive B
 
 Or maybe the other way round: show which different raid
 configurations there are and how they use which space.

As far as I see these informations can already be derived from btrfs 
filesystem df / show by combining values together. But it involves some 
manual calculations.

 I understand that free space is a difficult concept, if you do
 per-file or per-chunk redundancy, but i think there are a lot of users
 out there who just want to do a standard replication with their
 whole disk. Maybe with the special ability of the 2x500G +1TB, which
 mdadm, AFAIK, can't do.

It should be able to do that if you concatenate the two 2x500 GB via 
device mapper or the LVM layer above it.

 This would be just a subset of what btrfs can do, of course, but it's a
 frequently used subset, so maybe there could be some kind of saved
 profile on how the user intends to use the filesystem. Output
 could then be clarified using that profile and it could also give
 warnings or prevent actions that make no sense.

That makes sense to me.

As a default I would say that -d raid1 and -m raid1 just creates a RAID-1. 
And then BTRFS should put out the usable space for that RAID-1. I.e. when 
I have two 500 GB disks with 100 GB allocated it should return about 400 
GB free space. And when it uses one 1 TB disk as well as one 500 GB disk 
and two 250 GB disk with 100 GB allocated, it should return about 900GB 
free space. Only when it has one disk with 500 GB and one with 1 TB with 
100 GB allocated, it should return 400 GB free space.

IMHO this also should be what BTRFS reports to the regular df command.

This should only change if a mixed policy is in place. I do not know what 
to report to the OS then. Should it add the RAID-1 and the RAID-0 space? 
Should it only report the RAID-1 space? IMHO that depends on the 
allocation policy. If new files are enforced to be on RAID-1 space it 
should do the latter and if new files are created on RAID-0 space if RAID-1 
space is full it should report the former.

For more details, btrfs filesystem df / show still need to be used. Maybe 
with a revised and even more informative output like you suggested.

-- 
Martin 'Helios' Steigerwald - http://www.Lichtvoll.de
GPG: 03B0 0D6C 0040 0710 4AFA  B82F 991B EAAC A599 84C7
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] Btrfs: make btrfs_truncate_inode_items() more readable

2012-01-05 Thread Josef Bacik
On Thu, Jan 05, 2012 at 04:32:41PM +0800, Miao Xie wrote:
 As the title said, this patch just make the functions of the truncation
 more readable.
 
 Signed-off-by: Miao Xie mi...@cn.fujitsu.com
 ---
  fs/btrfs/inode.c |  289 
 ++
  1 files changed, 159 insertions(+), 130 deletions(-)
 
 diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
 index 85e2312..df6060f 100644
 --- a/fs/btrfs/inode.c
 +++ b/fs/btrfs/inode.c
 @@ -2977,10 +2977,142 @@ out:
   return err;
  }
  
 +static int btrfs_release_and_test_inline_data_extent(
 + struct btrfs_root *root,
 + struct inode *inode,
 + struct extent_buffer *leaf,
 + struct btrfs_file_extent_item *fi,
 + u64 offset,
 + u64 new_size)
 +{
 + u64 item_end;
 +
 + item_end = offset + btrfs_file_extent_inline_len(leaf, fi) - 1;
 +
 + if (item_end  new_size)
 + return 0;
 +
 + /*
 +  * Truncate inline items is special, we have done it by
 +  *   btrfs_truncate_page();
 +  */
 + if (offset  new_size)
 + return 0;
 +
 + if (root-ref_cows)
 + inode_sub_bytes(inode, item_end + 1 - offset);
 +
 + return 1;
 +}
 +
  /*
 - * this can truncate away extent items, csum items and directory items.
 - * It starts at a high offset and removes keys until it can't find
 - * any higher than new_size
 + * If this function return 1, it means this item can be dropped directly.
 + * If 0 is returned, the item can not be dropped.
 + */
 +static int btrfs_release_and_test_data_extent(struct btrfs_trans_handle 
 *trans,
 +   struct btrfs_root *root,
 +   struct btrfs_path *path,
 +   struct inode *inode,
 +   u64 offset,
 +   u64 new_size)
 +{
 + struct extent_buffer *leaf;
 + struct btrfs_file_extent_item *fi;
 + u64 extent_start;
 + u64 extent_offset;
 + u64 item_end;
 + u64 ino = btrfs_ino(inode);
 + u64 orig_nbytes;
 + u64 new_nbytes;
 + int extent_type;
 + int ret;
 +
 + leaf = path-nodes[0];
 + fi = btrfs_item_ptr(leaf, path-slots[0],
 + struct btrfs_file_extent_item);
 +
 + extent_type = btrfs_file_extent_type(leaf, fi);
 + if (extent_type == BTRFS_FILE_EXTENT_INLINE)
 + return btrfs_release_and_test_inline_data_extent(root, inode,
 +  leaf, fi,
 +  offset,
 +  new_size);
 +
 + item_end = offset + btrfs_file_extent_num_bytes(leaf, fi) - 1;
 +
 + /*
 +  * If the new size is beyond the end of the extent:
 +  *   +--+
 +  *   |  |
 +  *   +--+
 +  *^ new size
 +  * so the extent should not be dropped or truncated.
 +  */
 + if (item_end  new_size)
 + return 0;
 +
 + extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
 + if (offset  new_size) {
 + /*
 +  * If the new size is in the extent:
 +  *   +--+
 +  *   |  |
 +  *   +--+
 +  *  ^ new size
 +  * so this extent should be truncated, not be dropped directly.
 +  */
 + orig_nbytes = btrfs_file_extent_num_bytes(leaf, fi);
 + new_nbytes = round_up(new_size - offset, root-sectorsize);
 +
 + btrfs_set_file_extent_num_bytes(leaf, fi, new_nbytes);
 +
 + if (extent_start != 0  root-ref_cows)
 + inode_sub_bytes(inode, orig_nbytes - new_nbytes);
 +
 + btrfs_mark_buffer_dirty(leaf);
 + return 0;

Use ret = 0 here, and then further down...

 + } else {
 + /*
 +  * If the new size is in the front of the extent:
 +  *   +--+
 +  *   |  |
 +  *   +--+
 +  *  ^ new size
 +  * so this extent should be dropped.
 +  */
 +
 + /*
 +  * It is a dummy extent, or it is in log tree, we needn't do
 +  * anything, just drop it.
 +  */
 + if (extent_start == 0 ||
 + !(root-ref_cows || root == root-fs_info-tree_root))
 + return 1;
 +
 + /* If this file 

Re: [RFC][PATCH 3/3] Btrfs: improve truncation of btrfs

2012-01-05 Thread Josef Bacik
On Thu, Jan 05, 2012 at 04:32:46PM +0800, Miao Xie wrote:
 The original truncation of btrfs has a bug, that is the orphan item will not 
 be
 dropped when the truncation fails. This bug will trigger BUG() when unlink 
 that
 truncated file. And besides that, if the user does pre-allocation for the file
 which is truncated unsuccessfully, after re-mount(umount-mount, not -o 
 remount),
 the pre-allocated extent will be dropped.
 
 This patch modified the relative functions of the truncation, and makes the
 truncation update i_size and disk_i_size of i-nodes every time we drop the 
 file
 extent successfully, and set them to the real value. By this way, we needn't
 add orphan items to guarantee the consistency of the meta-data.
 
 By this patch, it is possible that the file may not be truncated to the size
 that the user expects (it may be &lt;= the original size and &gt;= the expected one), 
 so I
 think it is better that we shouldn't lose the data that lies within the range
 [the expected size, the real size], because the user may take it for granted
 that the data in that extent is not lost. In order to implement it, we just
 write out all the dirty pages which are beyond the expected size of the file.
 This operation will spend lots of time if there are many dirty pages. It is
 also the only disadvantage of this patch. (Maybe I'm overcautious, we needn't
 hold that data.)
 
 Signed-off-by: Miao Xie mi...@cn.fujitsu.com
 ---
  fs/btrfs/inode.c |  159 
 +-
  1 files changed, 49 insertions(+), 110 deletions(-)
 
 diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
 index df6060f..9ace01b 100644
 --- a/fs/btrfs/inode.c
 +++ b/fs/btrfs/inode.c
 @@ -88,7 +88,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT  S_SHIFT] 
 = {
  };
  
  static int btrfs_setsize(struct inode *inode, loff_t newsize);
 -static int btrfs_truncate(struct inode *inode);
 +static int btrfs_truncate(struct inode *inode, loff_t newsize);
  static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
  static noinline int cow_file_range(struct inode *inode,
  struct page *locked_page,
 @@ -2230,7 +2230,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* btrfs_delalloc_reserve_space to catch offenders.
*/
   mutex_lock(inode-i_mutex);
 - ret = btrfs_truncate(inode);
 + ret = btrfs_truncate(inode, inode-i_size);
   mutex_unlock(inode-i_mutex);
   } else {
   nr_unlink++;
 @@ -2993,7 +2993,7 @@ static int btrfs_release_and_test_inline_data_extent(
   return 0;
  
   /*
 -  * Truncate inline items is special, we have done it by
 +  * Truncate inline items is special, we will do it by
*   btrfs_truncate_page();
*/
   if (offset  new_size)
 @@ -3121,9 +3121,9 @@ static int btrfs_release_and_test_data_extent(struct 
 btrfs_trans_handle *trans,
   * will kill all the items on this inode, including the INODE_ITEM_KEY.
   */
  int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 - struct btrfs_root *root,
 - struct inode *inode,
 - u64 new_size, u32 min_type)
 +struct btrfs_root *root,
 +struct inode *inode,
 +u64 new_size, u32 min_type)
  {
   struct btrfs_path *path;
   struct extent_buffer *leaf;
 @@ -3131,6 +3131,7 @@ int btrfs_truncate_inode_items(struct 
 btrfs_trans_handle *trans,
   struct btrfs_key found_key;
   u64 mask = root-sectorsize - 1;
   u64 ino = btrfs_ino(inode);
 + u64 old_size = i_size_read(inode);
   u32 found_type;
   int pending_del_nr = 0;
   int pending_del_slot = 0;
 @@ -3138,6 +3139,7 @@ int btrfs_truncate_inode_items(struct 
 btrfs_trans_handle *trans,
   int err = 0;
  
   BUG_ON(new_size  0  min_type != BTRFS_EXTENT_DATA_KEY);
 + BUG_ON(new_size  mask);
  
   path = btrfs_alloc_path();
   if (!path)
 @@ -3190,6 +3192,13 @@ search_again:
   ret = btrfs_release_and_test_data_extent(trans, root,
   path, inode, found_key.offset,
   new_size);
 + if (root-ref_cows ||
 + root == root-fs_info-tree_root) {
 + if (ret  found_key.offset  old_size)
 + i_size_write(inode, found_key.offset);
 + else if (!ret)
 + i_size_write(inode, new_size);
 + }
   if (!ret)
   break;
   }
 @@ -3247,12 +3256,10 @@ out:
  static int btrfs_truncate_page(struct address_space 

Re: checkums when converting from ext[234] to Btrfs

2012-01-05 Thread David Sterba
On Wed, Jan 04, 2012 at 03:22:11PM +0100, pubny wrote:
 Could someone help me with a clarification whether the btrfs-convert
 tool creates checksums on blocks of the existing ext[234] filesystem?

yes, it does checksum by default and you can switch it off with the '-d'
option.

 Any experiences how the size and the filesystem utilization (used vs.
 total diskspace) impacts the time of conversion?

I don't have numbers but it needs to go through extX metadata, read and
checksum data blocks, then write btrfs metadata. Nothing where a
significant time could be saved (except switching off the checksums).


david
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Lockdep warning during snapshot deletion in 3.2-rc7+

2012-01-05 Thread David Sterba
Hi,

I saw the following lockep message in syslog, with v3.2-rc7-83-g115e8e7

[ 9612.860677] btrfs: unlinked 1 orphans
about 50 times repeated
[ 8241.264764] btrfs: truncated 1 orphans
interleaved with about 10 such messages
[ 9758.530822] ==
[ 9758.532018] [ INFO: possible circular locking dependency detected ]
[ 9758.532018] 3.2.0-rc7-default+ #94
[ 9758.544811] ---
[ 9758.544811] btrfs/13767 is trying to acquire lock:
[ 9758.544811]  (fs_info-subvol_sem){..}, at: [a0065d8c] 
btrfs_ioctl_snap_destroy+0x20c/0x4c0 [btrfs]
[ 9758.544811]
[ 9758.544811] but task is already holding lock:
[ 9758.544811]  (sb-s_type-i_mutex_key#16){+.+.+.}, at: [a0065d62] 
btrfs_ioctl_snap_destroy+0x1e2/0x4c0 [btrfs]
[ 9758.544811]
[ 9758.544811] which lock already depends on the new lock.
[ 9758.544811]
[ 9758.544811]
[ 9758.544811] the existing dependency chain (in reverse order) is:
[ 9758.544811]
[ 9758.544811] - #1 (sb-s_type-i_mutex_key#16){+.+.+.}:
[ 9758.544811][810915d4] lock_acquire+0x94/0x130
[ 9758.544811][8187c387] mutex_lock_nested+0x77/0x380
[ 9758.544811][a0042da7] btrfs_orphan_cleanup+0x2a7/0x4e0 
[btrfs]
[ 9758.544811][a0066956] btrfs_mksubvol+0x306/0x360 [btrfs]
[ 9758.544811][a0066ab0] 
btrfs_ioctl_snap_create_transid+0x100/0x160 [btrfs]
[ 9758.544811][a0066c8d] 
btrfs_ioctl_snap_create_v2.clone.0+0xfd/0x110 [btrfs]
[ 9758.544811][a0068598] btrfs_ioctl+0x588/0x1080 [btrfs]
[ 9758.544811][8114d5d8] do_vfs_ioctl+0x98/0x560
[ 9758.544811][8114daef] sys_ioctl+0x4f/0x80
[ 9758.544811][81887ac2] system_call_fastpath+0x16/0x1b
[ 9758.544811]
[ 9758.544811] - #0 (fs_info-subvol_sem){..}:
[ 9758.544811][810909a6] __lock_acquire+0x18f6/0x1e90
[ 9758.544811][810915d4] lock_acquire+0x94/0x130
[ 9758.544811][8187d6fc] down_write+0x5c/0xb0
[ 9758.544811][a0065d8c] btrfs_ioctl_snap_destroy+0x20c/0x4c0 
[btrfs]
[ 9758.544811][a0068466] btrfs_ioctl+0x456/0x1080 [btrfs]
[ 9758.544811][8114d5d8] do_vfs_ioctl+0x98/0x560
[ 9758.544811][8114daef] sys_ioctl+0x4f/0x80
[ 9758.544811][81887ac2] system_call_fastpath+0x16/0x1b
[ 9758.544811]
[ 9758.544811] other info that might help us debug this:
[ 9758.544811]
[ 9758.544811]  Possible unsafe locking scenario:
[ 9758.544811]
[ 9758.544811]CPU0CPU1
[ 9758.544811]
[ 9758.544811]   lock(sb-s_type-i_mutex_key);
[ 9758.544811]lock(fs_info-subvol_sem);
[ 9758.544811]lock(sb-s_type-i_mutex_key);
[ 9758.544811]   lock(fs_info-subvol_sem);
[ 9758.544811]
[ 9758.544811]  *** DEADLOCK ***
[ 9758.544811]
[ 9758.544811] 2 locks held by btrfs/13767:
[ 9758.544811]  #0:  (sb-s_type-i_mutex_key#16/1){+.+.+.}, at: 
[a0065c70] btrfs_ioctl_snap_destroy+0xf0/0x4c0 [btrfs]
[ 9758.544811]  #1:  (sb-s_type-i_mutex_key#16){+.+.+.}, at: 
[a0065d62] btrfs_ioctl_snap_destroy+0x1e2/0x4c0 [btrfs]
[ 9758.544811]
[ 9758.544811] stack backtrace:
[ 9758.544811] Pid: 13767, comm: btrfs Not tainted 3.2.0-rc7-default+ #94
[ 9758.544811] Call Trace:
[ 9758.544811]  [8108dbf0] print_circular_bug+0x210/0x2f0
[ 9758.544811]  [810909a6] __lock_acquire+0x18f6/0x1e90
[ 9758.544811]  [8108bb5d] ? lock_release_holdtime+0x3d/0x1c0
[ 9758.544811]  [81009c65] ? native_sched_clock+0x15/0x80
[ 9758.544811]  [a0065d8c] ? btrfs_ioctl_snap_destroy+0x20c/0x4c0 
[btrfs]
[ 9758.544811]  [810915d4] lock_acquire+0x94/0x130
[ 9758.544811]  [a0065d8c] ? btrfs_ioctl_snap_destroy+0x20c/0x4c0 
[btrfs]
[ 9758.544811]  [81152a61] ? d_invalidate+0x81/0xa0
[ 9758.544811]  [8187d6fc] down_write+0x5c/0xb0
[ 9758.544811]  [a0065d8c] ? btrfs_ioctl_snap_destroy+0x20c/0x4c0 
[btrfs]
[ 9758.544811]  [a0065d8c] btrfs_ioctl_snap_destroy+0x20c/0x4c0 
[btrfs]
[ 9758.544811]  [a0068466] btrfs_ioctl+0x456/0x1080 [btrfs]
[ 9758.544811]  [818832c0] ? do_page_fault+0x2d0/0x580
[ 9758.544811]  [8107e5df] ? local_clock+0x6f/0x80
[ 9758.544811]  [8114d5d8] do_vfs_ioctl+0x98/0x560
[ 9758.544811]  [8187f7d9] ? retint_swapgs+0x13/0x1b
[ 9758.544811]  [8114daef] sys_ioctl+0x4f/0x80
[ 9758.544811]  [81887ac2] system_call_fastpath+0x16/0x1b


there were about 2000 snapshots created during subtrans-tester (as described
here http://www.spinics.net/lists/linux-btrfs/msg13931.html / a)
and deleted in parallel
(command: btrfs subvol list .|awk '{print $7}'|parallel --jobs 16 btrfs subvol 
del)

There were a blocked processes for a few minutes but then finished and
all snapshots were 

Crash in io_ctl_drop_pages after mount with csum errors

2012-01-05 Thread David Sterba
I mounted a multi-volume fs created not-so-long ago in a 3.1 based
kernel and mounted with v3.2-rc7-83-g115e8e7 , it crashed immediately.
It's quite possible that the disk is to blame, it's an old 160G
SP1614C, but syslog does not contain any error messages. I'm not sure
whether the fs was cleanly unmounted, seems not, but anyway I do not
expect a crash.

Label: none  uuid: 5f06f9eb-9736-49f7-91a2-2f45522512ef
Total devices 4 FS bytes used 1.38GB
devid4 size 34.00GB used 34.00GB path /dev/sdg8
devid3 size 34.00GB used 34.00GB path /dev/sdg7
devid2 size 34.00GB used 34.00GB path /dev/sdg6
devid1 size 34.00GB used 34.00GB path /dev/sdg5

mount options: compress-force=lzo,space_cache,autodefrag,inode_cache

[ 1461.732855] btrfs: force lzo compression
[ 1461.732876] btrfs: enabling auto defrag
[ 1461.732893] btrfs: enabling inode map caching
[ 1461.732907] btrfs: disk space caching is enabled
[ 1499.796181] btrfs: csum mismatch on free space cache
[ 1499.796266] btrfs: failed to load free space cache for block group 29360128
[ 1499.888699] btrfs csum failed ino 18446744073709551604 off 65536 csum 
2566472073 private 1925235876
[ 1499.26] btrfs csum failed ino 18446744073709551604 off 327680 csum 
2566472073 private 1925235876
[ 1499.906229] btrfs csum failed ino 18446744073709551604 off 0 csum 1695430581 
private 1170642078
[ 1499.906345] btrfs csum failed ino 18446744073709551604 off 262144 csum 
2566472073 private 1925235876
[ 1499.906446] btrfs csum failed ino 18446744073709551604 off 524288 csum 
2566472073 private 1925235876
[ 1499.924469] btrfs csum failed ino 18446744073709551604 off 196608 csum 
2566472073 private 1925235876
[ 1499.924574] btrfs csum failed ino 18446744073709551604 off 458752 csum 
2566472073 private 1925235876
[ 1499.946076] btrfs csum failed ino 18446744073709551604 off 131072 csum 
2566472073 private 1925235876
[ 1499.946217] btrfs csum failed ino 18446744073709551604 off 393216 csum 
2566472073 private 1925235876
[ 1499.946318] btrfs csum failed ino 18446744073709551604 off 0 csum 1695430581 
private 1170642078
[ 1499.946362] btrfs: error reading free space cache
[ 1499.946409] BUG: unable to handle kernel NULL pointer dereference at 
0001
[ 1499.946437] IP: [a0456dd7] io_ctl_drop_pages+0x37/0x70 [btrfs]
[ 1499.946515] PGD 125ce4067 PUD 126941067 PMD 0
[ 1499.946539] Oops: 0002 [#1] PREEMPT SMP
[ 1499.946560] CPU 0
[ 1499.946569] Modules linked in: btrfs zlib_deflate aoe nfs lockd fscache 
auth_rpcgss nfs_acl sunrpc af_packet cpufreq_conservative cpufreq_userspace 
cpufreq_powersave powernow_k8 mperf snd_hda_codec_analog snd_hda_intel snd
_hda_codec sg sp5100_tco snd_hwdep snd_pcm amd64_edac_mod snd_timer pcspkr 
edac_core snd edac_mce_amd firewire_ohci firewire_core crc_itu_t i2c_piix4 
k8temp asus_atk0110 soundcore snd_page_alloc sky2 autofs4 nouveau ttm drm_k
ms_helper drm processor i2c_algo_bit mxm_wmi wmi video thermal_sys button 
pata_via sata_promise sata_via ata_generic sata_sil pata_atiixp
[ 1499.946832]
[ 1499.946843] Pid: 2799, comm: rm Not tainted 3.2.0-rc7-1-desktop #1
[ 1499.946880] RIP: 0010:[a0456dd7]  [a0456dd7] 
io_ctl_drop_pages+0x37/0x70 [btrfs]
[ 1499.946936] RSP: 0018:880127c6bc48  EFLAGS: 00010202
[ 1499.946951] RAX: 0001 RBX: 880127c6bcf0 RCX: 88012ffa3000
[ 1499.946971] RDX:  RSI: ea0003ec0c80 RDI: ea0003ec0c80
[ 1499.946989] RBP: 0001 R08: 6400 R09: a8000fb03200
[ 1499.947008] R10: 57ffda4fd1ec0c80 R11:  R12: 0001
[ 1499.947028] R13: 880126d519b0 R14: 0002005a R15: 0001
[ 1499.947052] FS:  7f6a9aa1c700() GS:88012fc0() 
knlGS:
[ 1499.947078] CS:  0010 DS:  ES:  CR0: 8005003b
[ 1499.947097] CR2: 0001 CR3: 0001275e5000 CR4: 06f0
[ 1499.947120] DR0:  DR1:  DR2: 
[ 1499.947143] DR3:  DR6: 0ff0 DR7: 0400
[ 1499.947167] Process rm (pid: 2799, threadinfo 880127c6a000, task 
880126378280)
[ 1499.947551] Stack:
[ 1499.947551]   880127c6bcf0  
a0457e2e
[ 1499.947551]  0020 ea0003ec0c80 880126d51980 
880127c6bd48
[ 1499.947551]  880126d51980 0de0 880125d13720 
8801267e6600
[ 1499.947551] Call Trace:
[ 1499.947551]  [a0457e2e] io_ctl_prepare_pages.isra.31+0x9e/0x150 
[btrfs]
[ 1499.947551]  [a0459d3f] __load_free_space_cache+0x1ff/0x610 [btrfs]
[ 1499.947551]  [a045b134] load_free_ino_cache+0xd4/0x100 [btrfs]
[ 1499.947551]  [a041a956] start_caching+0x86/0x130 [btrfs]
[ 1499.947551]  [a041aab5] btrfs_return_ino+0xb5/0x170 [btrfs]
[ 1499.947551]  [a042dc6b] btrfs_evict_inode+0x2cb/0x320 [btrfs]
[ 1499.947551]  [811745af] 

Bug(?): btrfs carries on working if part of a device disappears

2012-01-05 Thread Maik Zumstrull
Hello list,

I hit a funny BIOS bug the other day where the BIOS suddenly sets a
HPA on a random hard disk, leaving only the first 33 MB accessible.
That disk had one device of a multi-device btrfs on it in my case.
(With dm-crypt/LUKS in between, no partitioning or LVM.)

The reason I'm writing to you is that btrfs apparently didn't care at
all. It didn't complain, and it certainly didn't consider Uhm, maybe
I should stop writing to a file system that mostly doesn't exist
anymore. The only errors I saw in dmesg were from the lower block
device level: someone trying to read or write beyond the end of a
device. An error btrfs apparently didn't mind. It took me a while to
figure out what had happened, during which time btrfsck and the btrfs
kernel part worked together to pretty much totally trash the fs. (I'm
still trying a few things, but I'm not hopeful. Hold the default
backup rant, I can in fact recover anything that was on this from
elsewhere, I think.)

So, I think during mount, btrfs should check the reported size of the
block device, and if it's significantly smaller than fs metadata
implies it must be, mount degraded or read-only or not at all. And
mostly, complain. Loudly.

This was on Debian's linux-image-3.1.0-1-amd6 at version 3.1.6-1.
Other ways this could happen than HPA are LVM or partitioning.


Maik
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [3.2-rc7] slowdown, warning + oops creating lots of files

2012-01-05 Thread Chris Mason
On Thu, Jan 05, 2012 at 10:01:22AM +1100, Dave Chinner wrote:
 On Thu, Jan 05, 2012 at 09:23:52AM +1100, Chris Samuel wrote:
  On 05/01/12 09:11, Dave Chinner wrote:
  
   Looks to be reproducable.
  
  Does this happen with rc6 ?
 
 I haven't tried. All I'm doing is running some benchmarks to get
 numbers for a talk I'm giving about improvements in XFS metadata
 scalability, so I wanted to update my last set of numbers from
 2.6.39.
 
 As it was, these benchmarks also failed on btrfs with oopsen and
 corruptions back in 2.6.39 time frame.  e.g. same VM, same
 test, different crashes, similar slowdowns as reported here:
 http://comments.gmane.org/gmane.comp.file-systems.btrfs/11062
 
 Given that there is now a history of this simple test uncovering
 problems, perhaps this is a test that should be run more regularly
 by btrfs developers?

Unfortunately, this one works for me.  I'll try it again and see if I
can push harder.  If not, I'll see if I can trade beer for some
diagnostic runs.

Thanks Dave

-chris
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Status of dedupe in btrfs

2012-01-05 Thread Konstantinos Skarlatos

Hello everyone,

I was reading this article in Slashdot about dedupe [1] and i was 
wondering about the status of the (offline) dedupe patches in btrfs. Are 
they applicable to a recent kernel? do userspace tools support it?


Kind regards


[1] 
http://sk.slashdot.org/story/12/01/04/1955248/ask-slashdot-freeopen-deduplication-software

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [3.2-rc7] slowdown, warning + oops creating lots of files

2012-01-05 Thread Chris Mason
On Thu, Jan 05, 2012 at 01:46:57PM -0500, Chris Mason wrote:
 On Thu, Jan 05, 2012 at 10:01:22AM +1100, Dave Chinner wrote:
  On Thu, Jan 05, 2012 at 09:23:52AM +1100, Chris Samuel wrote:
   On 05/01/12 09:11, Dave Chinner wrote:
   
Looks to be reproducable.
   
   Does this happen with rc6 ?
  
  I haven't tried. All I'm doing is running some benchmarks to get
  numbers for a talk I'm giving about improvements in XFS metadata
  scalability, so I wanted to update my last set of numbers from
  2.6.39.
  
  As it was, these benchmarks also failed on btrfs with oopsen and
  corruptions back in 2.6.39 time frame.  e.g. same VM, same
  test, different crashes, similar slowdowns as reported here:
  http://comments.gmane.org/gmane.comp.file-systems.btrfs/11062
  
  Given that there is now a history of this simple test uncovering
  problems, perhaps this is a test that should be run more regularly
  by btrfs developers?
 
 Unfortunately, this one works for me.  I'll try it again and see if I
 can push harder.  If not, I'll see if I can trade beer for some
 diagnostic runs.

Aha, if I try it just on the ssd instead of on my full array it triggers
at 88M files.  Great.

-chris

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: checkums when converting from ext[234] to Btrfs

2012-01-05 Thread pubny
Thanks David. That's very helpful.

Regards,
Gábor


-Original Message-
From: David Sterba d...@jikos.cz
Reply-to: d...@jikos.cz
To: pubny pu...@xs4all.nl
Cc: linux-btrfs@vger.kernel.org
Subject: Re: checkums when converting from ext[234] to Btrfs
Date: Thu, 5 Jan 2012 16:45:41 +0100
Mailer: Mutt/1.4.2.2i

On Wed, Jan 04, 2012 at 03:22:11PM +0100, pubny wrote:
 Could someone help me with a clarification whether the btrfs-convert
 tool creates checksums on blocks of the existing ext[234] filesystem?

yes, it does checksum by default and you can switch it off with the '-d'
option.

 Any experiences how the size and the filesystem utilization (used vs.
 total diskspace) impacts the time of conversion?

I don't have numbers but it needs to go through extX metadata, read and
checksum data blocks, then write btrfs metadata. Nothing where a
significant time could be saved (except switching off the checksums).


david

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [3.2-rc7] slowdown, warning + oops creating lots of files

2012-01-05 Thread Dave Chinner
On Thu, Jan 05, 2012 at 02:45:00PM -0500, Chris Mason wrote:
 On Thu, Jan 05, 2012 at 01:46:57PM -0500, Chris Mason wrote:
  On Thu, Jan 05, 2012 at 10:01:22AM +1100, Dave Chinner wrote:
   On Thu, Jan 05, 2012 at 09:23:52AM +1100, Chris Samuel wrote:
On 05/01/12 09:11, Dave Chinner wrote:

 Looks to be reproducable.

Does this happen with rc6 ?
   
   I haven't tried. All I'm doing is running some benchmarks to get
   numbers for a talk I'm giving about improvements in XFS metadata
   scalability, so I wanted to update my last set of numbers from
   2.6.39.
   
   As it was, these benchmarks also failed on btrfs with oopsen and
   corruptions back in 2.6.39 time frame.  e.g. same VM, same
   test, different crashes, similar slowdowns as reported here:
   http://comments.gmane.org/gmane.comp.file-systems.btrfs/11062
   
   Given that there is now a history of this simple test uncovering
   problems, perhaps this is a test that should be run more regularly
   by btrfs developers?
  
  Unfortunately, this one works for me.  I'll try it again and see if I
  can push harder.  If not, I'll see if I can trade beer for some
  diagnostic runs.
 
 Aha, if I try it just on the ssd instead of on my full array it triggers
 at 88M files.  Great.

Good to know.  The error that is generating the BUG on my machine is
-28 (ENOSPC).  Given there's 17TB free on my filesystem

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [3.2-rc7] slowdown, warning + oops creating lots of files

2012-01-05 Thread Chris Mason
On Fri, Jan 06, 2012 at 07:12:16AM +1100, Dave Chinner wrote:
 On Thu, Jan 05, 2012 at 02:45:00PM -0500, Chris Mason wrote:
  On Thu, Jan 05, 2012 at 01:46:57PM -0500, Chris Mason wrote:
   On Thu, Jan 05, 2012 at 10:01:22AM +1100, Dave Chinner wrote:
On Thu, Jan 05, 2012 at 09:23:52AM +1100, Chris Samuel wrote:
 On 05/01/12 09:11, Dave Chinner wrote:
 
  Looks to be reproducable.
 
 Does this happen with rc6 ?

I haven't tried. All I'm doing is running some benchmarks to get
numbers for a talk I'm giving about improvements in XFS metadata
scalability, so I wanted to update my last set of numbers from
2.6.39.

As it was, these benchmarks also failed on btrfs with oopsen and
corruptions back in 2.6.39 time frame.  e.g. same VM, same
test, different crashes, similar slowdowns as reported here:
http://comments.gmane.org/gmane.comp.file-systems.btrfs/11062

Given that there is now a history of this simple test uncovering
problems, perhaps this is a test that should be run more regularly
by btrfs developers?
   
   Unfortunately, this one works for me.  I'll try it again and see if I
   can push harder.  If not, I'll see if I can trade beer for some
   diagnostic runs.
  
  Aha, if I try it just on the ssd instead of on my full array it triggers
  at 88M files.  Great.
 
 Good to know.  The error that is generating the BUG on my machine is
 -28 (ENOSPC).  Given there's 17TB free on my filesystem

Yeah, same thing here.  I'm testing a fix now, it's pretty dumb.  We're
not allocating more metadata chunks from the drive because of where the
allocation is happening, so it is just a check for "do we need a new
chunk" in the right place.

I'll make sure it can fill my ssd and then send to you.

-chris

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [3.2-rc7] slowdown, warning + oops creating lots of files

2012-01-05 Thread Chris Samuel
On Fri,   6 Jan 2012, 08:02:55 EST, Chris Mason chris.ma...@oracle.com wrote:

 Yeah, same thing here.   I'm testing a fix now, it's pretty dumb.   We're
 not allocating more metadata chunks from the drive because of where the
 allocation is happening, so it is just a check for "do we need a new
 chunk" in the right place.
 
 I'll make sure it can fill my ssd and then send to you.

*If* the fix works would this be a candidate for a
stable 3.2.1 release rather than having to wait for
3.3?

cheers,
Chris
-- 
Chris Samuel - http://www.csamuel.org/ - on mobile
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Filesystem not mountable after reset, bad tree block

2012-01-05 Thread Michael Andreen
Hi,

Today one of my machines (running kernel 3.0.6) locked up, sadly the monitor 
was stuck in power saving mode and no access over network so not sure exactly 
what happened. After restarting the machine, the root filesystem is no longer 
mountable the following output appears: 

device label SSD devid 1 transid 507930 /dev/sdb2
btrfs bad tree block start 0 188205998080
btrfs bad tree block start 0 188205998080
btrfs bad tree block start 0 188205998080
btrfs: open_ctree failed

I downloaded the latest btrfs-progs, but getting similar output from several 
of the programs:

# ./btrfs-debug-tree /dev/sdb2
Check tree block failed, want=188205998080, have=0
Check tree block failed, want=188205998080, have=0
Check tree block failed, want=188205998080, have=0
Check tree block failed, want=188205998080, have=0
Check tree block failed, want=188205998080, have=0
read block failed check_tree_block
btrfs-debug-tree: disk-io.c:447: find_and_setup_root: Assertion `!(!root->
node)' failed.
Aborted

The find-root program seems to think there is a root (and potentially some 
older roots?), but not sure how to use that information.

# ./find-root /dev/sdb2 
Super think's the tree root is at 131212083200, chunk root 20983808
Well block 13856997376 seems great, but generation doesn't match, have=321485, 
want=507930
Well block 16362573824 seems great, but generation doesn't match, have=317992, 
want=507930
Well block 16367947776 seems great, but generation doesn't match, have=321490, 
want=507930
Well block 36317024256 seems great, but generation doesn't match, have=195981, 
want=507930
Well block 39698722816 seems great, but generation doesn't match, have=273675, 
want=507930
Well block 39714439168 seems great, but generation doesn't match, have=320627, 
want=507930
Well block 43016519680 seems great, but generation doesn't match, have=272156, 
want=507930
Well block 43031748608 seems great, but generation doesn't match, have=302652, 
want=507930
Well block 47617982464 seems great, but generation doesn't match, have=274708, 
want=507930
Found tree root at 131212083200

Anything else I can do to debug this or potentially recover a few bits before 
reformatting?

/Michael
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] apply the Probabilistic Skiplist on btrfs

2012-01-05 Thread David Sterba
Hi, I've let it run through xfstests and ended at 091, patches applied
on top of 3.2, mount options
compress-force=lzo,discard,inode_cache,space_cache,autodefrag
fresh mkfs with defaults.

[ 1081.623819] btrfs: force lzo compression
[ 1081.629166] btrfs: enabling inode map caching
[ 1081.634853] btrfs: enabling auto defrag
[ 1081.638569] btrfs: disk space caching is enabled
[ 1119.693957] [ cut here ]
[ 1119.697876] kernel BUG at fs/btrfs/file.c:530!
[ 1119.697876] invalid opcode:  [#1] SMP
[ 1119.697876] CPU 1
[ 1119.697876] Modules linked in: loop btrfs aoe
[ 1119.697876]
[ 1119.697876] Pid: 25819, comm: fsx Not tainted 3.2.0-default+ #95 Intel 
Corporation Santa Rosa platform/Matanzas
[ 1119.697876] RIP: 0010:[a0048a18]  [a0048a18] 
btrfs_drop_extent_cache+0x3f8/0x400 [btrfs]
[ 1119.697876] RSP: 0018:88000c47f698  EFLAGS: 00010282
[ 1119.697876] RAX: ffef RBX: 88006ff01e48 RCX: 00026fff
[ 1119.697876] RDX: 88006ed5d830 RSI: 00022000 RDI: 
[ 1119.697876] RBP: 88000c47f738 R08:  R09: 00022000
[ 1119.697876] R10: fffe R11: 00026fff R12: 88001ada9e48
[ 1119.697876] R13: 0001f000 R14:  R15: 88000c47f708
[ 1119.697876] FS:  7f262e570700() GS:88007de0() 
knlGS:
[ 1119.697876] CS:  0010 DS:  ES:  CR0: 8005003b
[ 1119.697876] CR2: 7fc4364fc000 CR3: 79435000 CR4: 06e0
[ 1119.697876] DR0:  DR1:  DR2: 
[ 1119.697876] DR3:  DR6: 0ff0 DR7: 0400
[ 1119.697876] Process fsx (pid: 25819, threadinfo 88000c47e000, task 
880063640700)
[ 1119.697876] Stack:
[ 1119.697876]  8800 81092040 88000c47f6f0 
01000246
[ 1119.697876]  0001  3000 
88006e5c44f0
[ 1119.697876]  88006e5c43e0   

[ 1119.697876] Call Trace:
[ 1119.697876]  [81092040] ? trace_hardirqs_on_caller+0x20/0x1d0
[ 1119.697876]  [a003a0b0] ? csum_exist_in_range+0xa0/0xa0 [btrfs]
[ 1119.697876]  [a003f296] cow_file_range+0x136/0x3e0 [btrfs]
[ 1119.697876]  [810921fd] ? trace_hardirqs_on+0xd/0x10
[ 1119.697876]  [a003f8a7] run_delalloc_nocow+0x367/0x820 [btrfs]
[ 1119.697876]  [81357dae] ? do_raw_spin_unlock+0x5e/0xb0
[ 1119.697876]  [a00400c9] run_delalloc_range+0x369/0x370 [btrfs]
[ 1119.697876]  [a00582c0] __extent_writepage+0x5f0/0x750 [btrfs]
[ 1119.697876]  [81349f4d] ? radix_tree_gang_lookup_tag_slot+0x8d/0xd0
[ 1119.697876]  [810f30d1] ? find_get_pages_tag+0x111/0x1b0
[ 1119.697876]  [a0058692] 
extent_write_cache_pages.clone.0+0x272/0x3f0 [btrfs]
[ 1119.697876]  [81357dae] ? do_raw_spin_unlock+0x5e/0xb0
[ 1119.697876]  [81131604] ? kfree+0xd4/0x180
[ 1119.697876]  [81092040] ? trace_hardirqs_on_caller+0x20/0x1d0
[ 1119.697876]  [a0058a56] extent_writepages+0x46/0x60 [btrfs]
[ 1119.697876]  [a003b590] ? acls_after_inode_item+0xd0/0xd0 [btrfs]
[ 1119.697876]  [a003ad17] btrfs_writepages+0x27/0x30 [btrfs]
[ 1120.018734]  [810fdcc4] do_writepages+0x24/0x40
[ 1120.018734]  [810f3cdb] __filemap_fdatawrite_range+0x5b/0x60
[ 1120.018734]  [810f3d3a] filemap_write_and_wait_range+0x5a/0x80
[ 1120.018734]  [a004859a] btrfs_file_aio_write+0x4da/0x560 [btrfs]
[ 1120.018734]  [8113a852] do_sync_write+0xe2/0x120
[ 1120.018734]  [8187d2ad] ? __mutex_unlock_slowpath+0xdd/0x180
[ 1120.018734]  [8187d35e] ? mutex_unlock+0xe/0x10
[ 1120.018734]  [a004703f] ? btrfs_file_llseek+0x6f/0x390 [btrfs]
[ 1120.018734]  [8113b15e] vfs_write+0xce/0x190
[ 1120.018734]  [8113b4a4] sys_write+0x54/0xa0
[ 1120.018734]  [81887a82] system_call_fastpath+0x16/0x1b
[ 1120.018734] Code: 5e 41 5f c9 c3 0f 0b be bf 01 00 00 48 c7 c7 e6 02 09 a0 
48 89 95 68 ff ff ff e8 e4 a2 00 e1 48 8b 95 68 ff ff ff e9 3c fc ff ff 0f 0b 
0f 0b 0f 1f 40 00 55 48 89 e5 41 57 41 56 41 55 41 54 53
[ 1120.018734] RIP  [a0048a18] btrfs_drop_extent_cache+0x3f8/0x400 
[btrfs]
[ 1120.018734]  RSP 88000c47f698
[ 1120.047841] ---[ end trace ca0f509767e0195d ]---

xfstests/091 output:

091 57s ... [19:47:50] [19:48:28] [failed, exit status 1] - output mismatch 
(see 091.out.bad)
--- 091.out 2011-11-01 10:31:12.0 +0100
+++ 091.out.bad 2012-01-05 19:48:28.0 +0100
@@ -5,3 +5,41 @@
 fsx -N 1 -o 8192 -l 50 -r PSIZE -t BSIZE -w BSIZE -Z -R -W
 fsx -N 1 -o 32768 -l 50 -r PSIZE -t BSIZE -w BSIZE -Z -R -W
 fsx -N 1 -o 128000 -l 50 -r PSIZE -t BSIZE -w BSIZE -Z -W
+./091: line 46: 25819 Segmentation fault  $here/ltp/fsx $args 
$TEST_DIR/junk  $seq.full 21
+fsx -N 1 -l 50 -r 

Re: [PATCH 2/3] Btrfs: make btrfs_truncate_inode_items() more readable

2012-01-05 Thread Chris Mason
On Thu, Jan 05, 2012 at 04:32:41PM +0800, Miao Xie wrote:
 As the title said, this patch just make the functions of the truncation
 more readable.

This is a big improvement ;) Thanks.

-chris
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [3.2-rc7] slowdown, warning + oops creating lots of files

2012-01-05 Thread Chris Mason
On Fri, Jan 06, 2012 at 08:24:32AM +1100, Chris Samuel wrote:
 On Fri,   6 Jan 2012, 08:02:55 EST, Chris Mason chris.ma...@oracle.com 
 wrote:
 
  Yeah, same thing here.   I'm testing a fix now, it's pretty dumb.   We're
  not allocating more metadata chunks from the drive because of where the
  allocation is happening, so it is just a check for "do we need a new
  chunk" in the right place.
  
  I'll make sure it can fill my ssd and then send to you.
 
 *If* the fix works would this be a candidate for a
 stable 3.2.1 release rather than having to wait for
 3.3?

Yes, it's pretty safe.

-chris

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] Btrfs: make btrfs_truncate_inode_items() more readable

2012-01-05 Thread Miao Xie
On thu, 5 Jan 2012 10:11:43 -0500, Josef Bacik wrote:
 +extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
 +if (offset < new_size) {
 +/*
 + * If the new size is in the extent:
 + *   +--+
 + *   |  |
 + *   +--+
 + *  ^ new size
 + * so this extent should be truncated, not be dropped directly.
 + */
 +orig_nbytes = btrfs_file_extent_num_bytes(leaf, fi);
 +new_nbytes = round_up(new_size - offset, root->sectorsize);
 +
 +btrfs_set_file_extent_num_bytes(leaf, fi, new_nbytes);
 +
 +if (extent_start != 0 && root->ref_cows)
 +inode_sub_bytes(inode, orig_nbytes - new_nbytes);
 +
 +btrfs_mark_buffer_dirty(leaf);
 +return 0;
 
 Use ret = 0 here, and then further down...
 
 +} else {
 +/*
 + * If the new size is in front of the extent:
 + *   +--+
 + *   |  |
 + *   +--+
 + *  ^ new size
 + * so this extent should be dropped.
 + */
 +
 +/*
 + * It is a dummy extent, or it is in log tree, we needn't do
 + * anything, just drop it.
 + */
 +if (extent_start == 0 ||
 +!(root->ref_cows || root == root->fs_info->tree_root))
 +return 1;
 +
 +/* If this file is not a free space management file... */
 +/* FIXME blocksize != 4096 */
 +if (root != root->fs_info->tree_root) {
 +orig_nbytes = btrfs_file_extent_num_bytes(leaf, fi);
 +inode_sub_bytes(inode, orig_nbytes);
 +}
 +
 +orig_nbytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
 +extent_offset = offset - btrfs_file_extent_offset(leaf, fi);
 +btrfs_set_path_blocking(path);
 +ret = btrfs_free_extent(trans, root, extent_start,
 +orig_nbytes, 0,
 +btrfs_header_owner(leaf),
 +ino, extent_offset);
 +BUG_ON(ret);
 +btrfs_clear_path_blocking(path, NULL, 0);
 +
 +return 1;
 
 ret = 1
 +}
 
 return ret;

OK, I'll modify it. Thanks for your review.

Miao
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs truncate() does not change inode times

2012-01-05 Thread Li Zefan
idank wrote:
 Hi all,
 I was running fstest (http://www.tuxera.com/community/posix-test-suite/) on 
 btrfs. Only one test failed, and I believe it to be a bug in btrfs. The 
 scenario is as follows:
 * crate a file.
 * note its times with stat.
 * sleep a few seconds
 * call truncate() on the file (not ftruncate(). ftruncate() works).
 * sync
 * note the file's times again with stat.
 expected result: ctime and mtime are greater.
 actual result: ctime and mtime remain unchanged.
 
 Example:

I followed your example, but got the expected result:

[root@lizf pjd-fstest-20090130-RC]# stat ctime_test 
  File: ctime_test
  Size: 0   Blocks: 0  IO Block: 4096   普通空文件
Device: 1bh/27d Inode: 1589980 Links: 1
Access: (0644/-rw-r--r--)  Uid: (0/root)   Gid: (0/root)
Access: 2012-01-06 10:27:53.900570365 +0800
Modify: 2012-01-06 10:27:53.900570365 +0800
Change: 2012-01-06 10:27:53.900570365 +0800
[root@lizf pjd-fstest-20090130-RC]# ./fstest truncate ctime_test 200
0
[root@lizf pjd-fstest-20090130-RC]# sleep 2
[root@lizf pjd-fstest-20090130-RC]# stat ctime_test 
  File: ctime_test
  Size: 200 Blocks: 0  IO Block: 4096   普通文件
Device: 1bh/27d Inode: 1589980 Links: 1
Access: (0644/-rw-r--r--)  Uid: (0/root)   Gid: (0/root)
Access: 2012-01-06 10:27:53.900570365 +0800
Modify: 2012-01-06 10:28:12.238569720 +0800
Change: 2012-01-06 10:28:12.238569720 +0800
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH 3/3] Btrfs: improve truncation of btrfs

2012-01-05 Thread Miao Xie
On thu, 5 Jan 2012 10:15:50 -0500, Josef Bacik wrote:
 +trans = btrfs_start_transaction(root, 2);
 +if (IS_ERR(trans))
 +return PTR_ERR(trans);
  
  /*
   * setattr is responsible for setting the ordered_data_close flag,
 @@ -6621,26 +6585,12 @@ static int btrfs_truncate(struct inode *inode)
   * using truncate to replace the contents of the file will
   * end up with a zero length file after a crash.
   */
 -if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
 +if (newsize == 0 && BTRFS_I(inode)->ordered_data_close)
  btrfs_add_ordered_operation(trans, root, inode);

Since we have written out all the dirty pages, we can drop the following code 
which is
in front of the while loop, and move the first btrfs_start_transaction() into 
the loop,
the logic of btrfs_truncate() will become simpler.

  while (1) {
 -ret = btrfs_block_rsv_refill(root, rsv, min_size);
 -if (ret) {
 -/*
 - * This can only happen with the original transaction we
 - * started above, every other time we shouldn't have a
 - * transaction started yet.
 - */
 -if (ret == -EAGAIN)
 -goto end_trans;
 -err = ret;
 -break;
 -}
 -
 
 Taking this part out is wrong, we need to have this slack space to account for
 any COW that truncate does.  Other than that this looks pretty good.  Thanks,
 

I think we can take this part out, because we start a new transaction every 
time we
do a truncation, and reserve enough space at that time. See below:

Thanks
Miao
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Crash in io_ctl_drop_pages after mount with csum errors

2012-01-05 Thread Li Zefan
David Sterba wrote:
 I mounted a multi-volume fs created not-so-long ago in a 3.1 based
 kernel and mounted with v3.2-rc7-83-g115e8e7 , it crashed immediately.
 It's quite possible that the disk is to blame, it's an old 160G
 SP1614C, but syslog does not contain any error messages. I'm not sure
 whether the fs was cleanly unmounted, seems not, but anyway I do not
 expect a crash.
 
 Label: none  uuid: 5f06f9eb-9736-49f7-91a2-2f45522512ef
 Total devices 4 FS bytes used 1.38GB
 devid4 size 34.00GB used 34.00GB path /dev/sdg8
 devid3 size 34.00GB used 34.00GB path /dev/sdg7
 devid2 size 34.00GB used 34.00GB path /dev/sdg6
 devid1 size 34.00GB used 34.00GB path /dev/sdg5
 
 mount options: compress-force=lzo,space_cache,autodefrag,inode_cache
 
 [ 1461.732855] btrfs: force lzo compression
 [ 1461.732876] btrfs: enabling auto defrag
 [ 1461.732893] btrfs: enabling inode map caching
 [ 1461.732907] btrfs: disk space caching is enabled
 [ 1499.796181] btrfs: csum mismatch on free space cache
 [ 1499.796266] btrfs: failed to load free space cache for block group 29360128
 [ 1499.888699] btrfs csum failed ino 18446744073709551604 off 65536 csum 
 2566472073 private 1925235876
 [ 1499.26] btrfs csum failed ino 18446744073709551604 off 327680 csum 
 2566472073 private 1925235876
 [ 1499.906229] btrfs csum failed ino 18446744073709551604 off 0 csum 
 1695430581 private 1170642078
 [ 1499.906345] btrfs csum failed ino 18446744073709551604 off 262144 csum 
 2566472073 private 1925235876
 [ 1499.906446] btrfs csum failed ino 18446744073709551604 off 524288 csum 
 2566472073 private 1925235876
 [ 1499.924469] btrfs csum failed ino 18446744073709551604 off 196608 csum 
 2566472073 private 1925235876
 [ 1499.924574] btrfs csum failed ino 18446744073709551604 off 458752 csum 
 2566472073 private 1925235876
 [ 1499.946076] btrfs csum failed ino 18446744073709551604 off 131072 csum 
 2566472073 private 1925235876
 [ 1499.946217] btrfs csum failed ino 18446744073709551604 off 393216 csum 
 2566472073 private 1925235876
 [ 1499.946318] btrfs csum failed ino 18446744073709551604 off 0 csum 
 1695430581 private 1170642078
 [ 1499.946362] btrfs: error reading free space cache

We have inconsistent data on disk with both free space cache and free ino cache.

 [ 1499.946409] BUG: unable to handle kernel NULL pointer dereference at 
 0001
 [ 1499.946437] IP: [a0456dd7] io_ctl_drop_pages+0x37/0x70 [btrfs]

0x01 is weird, don't know how it occurred. Nevertheless we need this fix:

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ec23d43..81771ca 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
io_ctl_unmap_page(io_ctl);
 
for (i = 0; i < io_ctl->num_pages; i++) {
-   ClearPageChecked(io_ctl->pages[i]);
-   unlock_page(io_ctl->pages[i]);
-   page_cache_release(io_ctl->pages[i]);
+   if (io_ctl->pages[i]) {
+   ClearPageChecked(io_ctl->pages[i]);
+   unlock_page(io_ctl->pages[i]);
+   page_cache_release(io_ctl->pages[i]);
+   }
}
 }

I'll resend the patch along with my other pending patches for 3.3.
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html